Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 148 additions & 4 deletions warn/scrapers/mi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import re
from pathlib import Path

import pdfplumber
import pdfplumber
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache

# Each contributor is a separate list entry, not one comma-joined string
__authors__ = ["anikasikka", "Ash1R"]
__tags__ = ["html", "pdf"]
__source__ = {
"name": "Michigan Department of Technology, Management and Budget",
Expand All @@ -24,17 +26,14 @@ def scrape(
Keyword arguments:
data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)

Returns: the Path where the file is written
"""
# Grabs the main page with the current year's data
current_page = utils.get_url("https://milmi.org/warn/")
current_html = current_page.text

# Grabs the WARN archive years html page with previous data
archive_web_page = utils.get_url("https://milmi.org/warn/archive")
archive_web_html = archive_web_page.text

# Write the raw current year's file to the cache
cache = Cache(cache_dir)
cache.write("mi/current.html", current_html)
Expand Down Expand Up @@ -80,6 +79,23 @@ def scrape(
pdf_file = cache.download(cache_key, pdf_url)
pdf_list.append(pdf_file)

# parse pdf data
pdf_data = []

# Parse the pdfs from 2007-2015
for file in range(len(pdf_list) - 9, len(pdf_list))[::-1]:
with pdfplumber.open(pdf_list[file]) as pdf:
for i in pdf.pages:
pdf_data += process_pdf_2007_2015(i.extract_text())

# Parse pdfs from 2000-2006, which use a different type of spacing
for file in range(len(pdf_list) - 9)[::-1]:
with pdfplumber.open(pdf_list[file]) as pdf:
for i in pdf.pages:
pdf_data += process_pdf_2000_2006(i.extract_text())

cleaned_data += pdf_data

# Set the path to the final CSV
# We should always use the lower-case state postal code, like nj.csv
output_csv = data_dir / "mi.csv"
Expand All @@ -91,6 +107,134 @@ def scrape(
return output_csv


def process_pdf_2000_2006(txt):
    """Parse the text of one page of a 2000-2006 PDF into rows.

    These pages are laid out in fixed-width columns. The company name column
    ends at or before index 35 and the city column usually ends 21 characters
    after that; each boundary is found by walking back to the nearest space.
    Whatever follows the city is split on whitespace (per the original
    comments: date, event type and number affected, in that order).

    Rows too short or too smushed together to locate the column boundaries
    are treated as damaged and skipped.

    Keyword arguments:
    txt -- the text extracted from a single PDF page; None or an empty
           string yields no rows

    Returns: a list of rows, each a list of whitespace-stripped strings
    """
    # A page with no extractable text may come back as None or ""
    if not txt:
        return []

    # Furthest index at which the company name column can end
    company_limit = 35
    # Usual distance between the end of the company and city columns
    city_width = 21

    # Split at newline and swap the space placeholders for real spaces
    lines = [
        line.replace("\xa0", " ").replace("\xad", " ") for line in txt.split("\n")
    ]

    ans = []
    # The first and last five lines of a page are not layoff data.
    # Damaged rows are skipped with `continue` only: removing them from the
    # list being iterated would make the loop jump over the following row.
    for row in lines[5:-5]:
        # Too short to even contain the company column: damaged row
        if len(row) <= company_limit:
            continue

        # The company column ends at the last space at or before the limit
        company_end = row.rfind(" ", 0, company_limit + 1)
        if company_end == -1:
            # No space found at all, so the row is smushed together
            continue

        city_limit = company_end + city_width
        if len(row) <= city_limit:
            continue

        # Always succeeds: row[company_end] is itself a space
        city_end = row.rfind(" ", 0, city_limit + 1)

        fields = [row[: company_end + 1], row[company_end + 1 : city_end]]

        # Remaining tokens are kept exactly as they appear in the PDF; the
        # numeric event-type codes are deliberately not translated, since
        # scrapers should not edit or reinterpret the source data.
        fields.extend(row[city_end + 1 :].split())

        # Eliminate the column padding
        ans.append([field.strip() for field in fields])
    return ans


def process_pdf_2007_2015(txt):
    """Parse the text of one page of a 2007-2015 PDF into rows.

    Each line is split on single spaces, which act as the column dividers;
    the placeholder characters used for spaces inside a column are then
    replaced with real spaces.

    Keyword arguments:
    txt -- the text extracted from a single PDF page; None or an empty
           string yields no rows

    Returns: a list of rows, each a list of five strings
    """
    # A page with no extractable text may come back as None or ""
    if not txt:
        return []

    header_labels = ("Company", "Company Name")
    blank_row = [" "] * 5

    ans = []
    for line in txt.split("\n"):
        # Split at the column dividers, then remove the space placeholders
        fields = [
            cell.replace("\xa0", " ").replace("\xad", " ") for cell in line.split(" ")
        ]

        if len(fields) == 5:
            # Drop header rows, the "Received" line and all-blank rows
            if (
                fields[0] not in header_labels
                and fields[1] != "Received"
                and fields != blank_row
            ):
                ans.append(fields)
        elif len(fields) == 6:
            # Accounts for the minor 2008-2009 format change: rows whose third
            # field ends in one of those years carry one extra trailing field,
            # which is discarded. The yearly range line is not a data row.
            if (
                fields[2][-4:] in ("2008", "2009")
                and fields[0] != "January 1 through December 31:"
            ):
                ans.append(fields[:-1])

    return ans


# there are around 15 lines with unique spacing issues
# there doesn't seem to be a pattern to them,
# so I just fixed them manually
# there may be a more elegant solution


def _parse_html_table(soup):
black_list = [
"TOTAL:",
Expand Down