Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 197 additions & 2 deletions warn/scrapers/mi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
Comment thread
palewire marked this conversation as resolved.
from pathlib import Path

import pdfplumber
from bs4 import BeautifulSoup

from .. import utils
Expand Down Expand Up @@ -30,11 +31,9 @@ def scrape(
# Grabs the main page with the current year's data
current_page = utils.get_url("https://milmi.org/warn/")
current_html = current_page.text

# Grabs the WARN archive years html page with previous data
archive_web_page = utils.get_url("https://milmi.org/warn/archive")
archive_web_html = archive_web_page.text

# Write the raw current year's file to the cache
cache = Cache(cache_dir)
cache.write("mi/current.html", current_html)
Expand Down Expand Up @@ -80,6 +79,23 @@ def scrape(
pdf_file = cache.download(cache_key, pdf_url)
pdf_list.append(pdf_file)

# parse pdf data
pdf_data = []
print(pdf_list)
# Parse the pdfs from 2007-2015
for file in range(len(pdf_list) - 9, len(pdf_list))[::-1]:
Comment thread
palewire marked this conversation as resolved.
Outdated
with pdfplumber.open(pdf_list[file]) as pdf:
for i in pdf.pages:
pdf_data += process_pdf_2(i.extract_text())
Comment thread
palewire marked this conversation as resolved.
Outdated

# Parse pdfs from 2000-2006, which use a different type of spacing
for file in range(len(pdf_list) - 9)[::-1]:
with pdfplumber.open(pdf_list[file]) as pdf:
for i in pdf.pages:
pdf_data += process_pdf_1(i.extract_text())

cleaned_data += pdf_data

# Set the path to the final CSV
# We should always use the lower-case state postal code, like nj.csv
output_csv = data_dir / "mi.csv"
Expand All @@ -91,6 +107,185 @@ def scrape(
return output_csv


def process_pdf_1(txt):
    """Process the 2000-2006 pdfs.

    Each data line is treated as fixed-width text: the company column ends
    at index 35 at the latest, and the city column ends at most 21
    characters after that. Both boundaries are snapped back to the nearest
    preceding space. Lines that cannot be split this way are considered
    damaged and are left out of the result.

    Args:
        txt: The text of one pdf page (the caller passes the result of
            ``extract_text()``). Empty or ``None`` text yields no rows.

    Returns:
        A list of rows. Each row is a list of stripped strings: company
        name, city, then the whitespace-separated remainder of the line
        (date, event type and number affected, in that order), after
        ``handle_edge_cases`` has been applied.
    """
    # A page without any extractable text has nothing to parse.
    if not txt:
        return []

    # Some of the pdfs use numeric codes instead of event type names.
    event_codes = {"1": "Plant Closing", "2": "Mass Layoff", "3": "", "4": ""}

    # split at newline, remove space placeholders, and drop the first and
    # last five lines, which aren't layoff data
    lines = [i.replace("\xa0", " ").replace("\xad", " ") for i in txt.split("\n")]
    lines = lines[5:-5]

    # contains all the data for the page
    ans = []

    for row in lines:
        # The company column can extend up to index 35; a line that does not
        # reach that far is damaged. Damaged lines are simply skipped: the
        # list being iterated must not be modified, otherwise the line
        # following each damaged one would be skipped as well.
        if len(row) < 36:
            continue

        # The column may end earlier, so fall back to the last space at or
        # before index 35. No space at all means the row is smushed together.
        compname = row.rfind(" ", 0, 36)
        if compname == -1:
            continue

        # Usually the city name ends 21 characters after the company name;
        # a line too short to reach that position is damaged.
        city_limit = compname + 21
        if len(row) <= city_limit:
            continue

        # As with the company column, back up to the nearest space. One is
        # always found because row[compname] is itself a space.
        cityname = row.rfind(" ", 0, city_limit + 1)

        # the rest of the string holds date, event type, and number affected
        temp = row[cityname + 1 :].split()
        if len(temp) < 2:
            # without an event type column the row cannot be interpreted
            continue
        temp[1] = event_codes.get(temp[1], temp[1])

        final = [row[: compname + 1], row[compname + 1 : cityname], *temp]

        # eliminate extra space
        final = [cell.strip() for cell in final]

        # fix the handful of rows with known one-off spacing problems
        ans.append(handle_edge_cases(final))
    return ans


def process_pdf_2(txt):
    """Process 2007-2015 pdfs.

    Every line of the page text is split on single spaces, which act as
    the column dividers; the ``\\xa0`` and ``\\xad`` placeholders inside a
    cell are then turned into ordinary spaces.

    Args:
        txt: The text of one pdf page.

    Returns:
        A list of five-cell rows. Header lines, blank lines and anything
        that does not split into the expected number of cells (such as the
        "Notes:" section) are left out.
    """
    header_names = ("Company", "Company Name")
    blank_row = [" "] * 5
    summary_label = "January 1 through December 31:"

    rows = []
    for line in txt.split("\n"):
        cells = [
            cell.replace("\xa0", " ").replace("\xad", " ")
            for cell in line.split(" ")
        ]
        width = len(cells)

        if width == 5:
            # skip the column headers and the blank placeholder lines
            is_header = cells[0] in header_names or cells[1] == "Received"
            if not is_header and cells != blank_row:
                rows.append(cells)
        elif width == 6:
            # the 2008-2009 files carry one extra trailing cell, which is
            # dropped; the yearly summary line is not a data row
            year = cells[2][-4:]
            if year in ("2008", "2009") and cells[0] != summary_label:
                rows.append(cells[:-1])

    return rows


# Around 15 lines in the 2000-2006 pdfs have unique spacing issues.
# There doesn't seem to be a pattern to them, so each one is corrected
# by hand through the lookup tables below.

# Rows where the company cell swallowed (part of) the city: maps the damaged
# company text to the corrected (company, city) pair. Company names that are
# truncated in the source text are kept as they appear there.
_COMPANY_FIXES = {
    "General Motors Nao Orion AsseOrion": ("General Motors Nao Orion Asse", "Orion"),
    "Standard Federal Bank #320 Troy": ("Standard Federal Bank #320", "Troy"),
    "Standard Federal Bank #340 Troy": ("Standard Federal Bank #340", "Troy"),
    "Michigan Machine And EngineerFenton": (
        "Michigan Machine And Engineer",
        "Fenton",
    ),
    "Asplundh Tree Company #93 Wixom": ("Asplundh Tree Company #93", "Wixom"),
    "Asplundh Tree Company #43 Mt.": ("Asplundh Tree Company #43", "Mt. Clemens"),
    "Asplundh Tree Company #843 Howell": ("Asplundh Tree Company #843", "Howell"),
    "Robert Bosch Corp. Chassis DivisSi": (
        "Robert Bosch Corp. Chassis Division",
        "St. Joseph",
    ),
    "Magna / Cosma Body & Chassis Troy": ("Magna / Cosma Body & Chassis", "Troy"),
}

# Rows where the city cell picked up the tail of the company name: maps the
# damaged city text to the real city.
_CITY_FIXES = {
    "LifDetroit": "Detroit",
    "ILapeer": "Lapeer",
    "IFlint": "Flint",
    "DivisioJackson": "Jackson",
    "IncWyoming": "Wyoming",
    "CompDearborn Heights": "Dearborn Heights",
    "InPetersburg": "Petersburg",
    "RegionaSaginaw": "Saginaw",
    "Corp.Flint": "Flint",
    "Inc.Howell": "Howell",
}


def handle_edge_cases(row):
    """Handle edge cases in 2000-2006 pdfs.

    Args:
        row: A parsed row whose first cell is the company name and whose
            second cell is the city. The list is corrected in place.

    Returns:
        The same ``row`` list. Rows that match none of the known damaged
        values, or that have fewer than two cells, are returned unchanged.
    """
    # Nothing to fix without both a company and a city cell.
    if len(row) < 2:
        return row

    company_fix = _COMPANY_FIXES.get(row[0])
    if company_fix is not None:
        # A damaged company cell determines both cells, so the city cell is
        # overwritten regardless of what it held.
        row[0], row[1] = company_fix
    else:
        row[1] = _CITY_FIXES.get(row[1], row[1])
    return row
Comment thread
palewire marked this conversation as resolved.
Outdated


def _parse_html_table(soup):
black_list = [
"TOTAL:",
Expand Down
90 changes: 90 additions & 0 deletions warn/scrapers/wv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pathlib import Path

import pdfplumber

from .. import utils
from ..cache import Cache

__authors__ = ["Ash1R"]
__tags__ = ["html", "pdf"]
__source__ = {
"name": "Workforce West Virginia",
"url": "https://workforcewv.org/public-information/warn-notices/current-warn-notices",
}


def scrape(
data_dir: Path = utils.WARN_DATA_DIR,
cache_dir: Path = utils.WARN_CACHE_DIR,
) -> Path:
"""
Scrape data from west virginia workforce site.

It was a big pdf with all historical data,
And I used pdfplumber to extract the tables
And put them into wv.csv
"""
cache = Cache(cache_dir)
headers = [
"Company",
"Address",
"Contact Information",
"Region",
"County",
"Date",
"Projected Date",
"Type",
"Number Affected",
]
final_data = [headers]

wv_pdf = cache.download(
"WV_WARN_Notices_3-1-11_to_3-22-22.",
"https://workforcewv.org/images/files/PublicInfo/WV_WARN_Notices_3-1-11_to_3-22-22.pdf",
)
with pdfplumber.open(wv_pdf) as pdf:
for i in pdf.pages:
tables = i.find_tables()
for j in tables:
data = j.extract(x_tolerance=3, y_tolerance=3)
company = ""
companydone = False
row = []
for k in range(len(data)):
if data[k][0] is not None:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it necessary to use range in the loop here? Could you not simply do something more like `for row in data`?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each company's data is contained in two consecutive rows, with some blank rows in between these company row-pairs. Alternative company names and addresses are stored on the second row. I used range so I can access the second row using an index of k + 1. I did unnecessarily use range later, so I removed that.

if (
(data[k][0].strip() == "Contact Information")
or (data[k][0].strip() == "Region")
or (data[k][0].strip() == "County")
or (data[k][0].strip() == "Date of Notice")
or (data[k][0].strip() == "Projected Date")
or (data[k][0].strip() == "Closure/Mass Layoff")
or (data[k][0].strip() == "Number Affected")
):
row.append(data[k][1].strip())

elif data[k][0].strip() == "Address":
if not companydone:
row.append(company)
companydone = True
row.append(data[k][1].strip())

elif data[k][0].strip() == "Company":
company = company + data[k][1].strip()

elif ((data[k][0] is None) and (k != 0)) or (
data[k][0] == "None" and k != 0
):
for p in range(1, len(data[k])):
if data[k][p] is not None:
company = company + ", " + data[k][p].strip()
Comment thread
palewire marked this conversation as resolved.
Outdated

final_data.append(row)

output_csv = data_dir / "wv.csv"
utils.write_rows_to_csv(output_csv, final_data)
return output_csv


if __name__ == "__main__":
scrape()