biglocalnews · Ash1R · Aug 5, 2022 · Oct 22, 2022 · Oct 28, 2022 · Nov 24, 2022
diff --git a/warn/scrapers/mi.py b/warn/scrapers/mi.py
@@ -1,6 +1,7 @@
 import re
 from pathlib import Path
 
+import pdfplumber
 from bs4 import BeautifulSoup
 
 from .. import utils
@@ -30,11 +31,9 @@ def scrape(
     # Grabs the main page with the current year's data
     current_page = utils.get_url("https://milmi.org/warn/")
     current_html = current_page.text
-
     # Grabs the WARN archive years html page with previous data
     archive_web_page = utils.get_url("https://milmi.org/warn/archive")
     archive_web_html = archive_web_page.text
-
     # Write the raw current year's file to the cache
     cache = Cache(cache_dir)
     cache.write("mi/current.html", current_html)
@@ -80,6 +79,23 @@ def scrape(
             pdf_file = cache.download(cache_key, pdf_url)
         pdf_list.append(pdf_file)
 
+    # parse pdf data
+    pdf_data = []
+    print(pdf_list)
+    # Parse the pdfs from 2007-2015
+    for file in range(len(pdf_list) - 9, len(pdf_list))[::-1]:
+        with pdfplumber.open(pdf_list[file]) as pdf:
+            for i in pdf.pages:
+                pdf_data += process_pdf_2(i.extract_text())
+
+    # Parse pdfs from 2000-2006, which use a different type of spacing
+    for file in range(len(pdf_list) - 9)[::-1]:
+        with pdfplumber.open(pdf_list[file]) as pdf:
+            for i in pdf.pages:
+                pdf_data += process_pdf_1(i.extract_text())
+
+    cleaned_data += pdf_data
+
     # Set the path to the final CSV
     # We should always use the lower-case state postal code, like nj.csv
     output_csv = data_dir / "mi.csv"
@@ -91,6 +107,185 @@ def scrape(
     return output_csv
 
 
+def process_pdf_1(txt):
+    """Parse the text of one page of a 2000-2006 archive pdf into rows.
+
+    These pdfs lay their first two columns out by character position, so the
+    company and city cells are cut out by index and the rest of the line is
+    split on whitespace.
+
+    Args:
+        txt: The text of a single page, as produced by pdfplumber's
+            ``extract_text()``. ``None`` or an empty string yields no rows.
+
+    Returns:
+        A list of rows. Each row is a list of stripped strings: the company
+        name, the city, then the whitespace-separated remainder of the line
+        (date, event type and number affected), after being passed through
+        ``handle_edge_cases``. Lines that are too damaged to be cut by
+        position, or that have fewer than two trailing fields, are left out.
+    """
+    if not txt:
+        return []
+
+    # The furthest the company name column ends is at index 35.
+    company_limit = 35
+    # Usually, the city name ends 21 characters after the company name.
+    city_width = 21
+    # Some of the pdfs use numbers instead of words for the event type.
+    event_types = {"1": "Plant Closing", "2": "Mass Layoff", "3": "", "4": ""}
+
+    # Split at newlines, turn the space placeholders into real spaces and
+    # drop the first and last five lines, which aren't layoff data.
+    lines = [
+        line.replace("\xa0", " ").replace("\xad", " ") for line in txt.split("\n")
+    ][5:-5]
+
+    rows = []
+    for line in lines:
+        # NOTE: damaged lines are skipped with ``continue``. Removing them
+        # from the list while it is being iterated would make the loop
+        # silently skip the line that follows each damaged one.
+
+        # A line that doesn't even reach the company limit is damaged.
+        if len(line) <= company_limit:
+            continue
+
+        # The company column sometimes ends before the limit, so search
+        # backwards from the limit for the space that closes it.
+        company_end = line.rfind(" ", 0, company_limit + 1)
+        if company_end == -1:
+            # No space was found: the row is smushed together.
+            continue
+
+        # Same approach for the city column. The search cannot fail because
+        # the space at ``company_end`` is inside the searched range.
+        city_limit = company_end + city_width
+        if city_limit >= len(line):
+            continue
+        city_end = line.rfind(" ", company_end, city_limit + 1)
+
+        # The rest of the line has the date, event type and number affected,
+        # in that order.
+        fields = line[city_end + 1 :].split()
+        if len(fields) < 2:
+            # Without an event type column the line cannot be interpreted.
+            continue
+        fields[1] = event_types.get(fields[1], fields[1])
+
+        row = [line[: company_end + 1], line[company_end + 1 : city_end], *fields]
+
+        # Eliminate extra space, then repair the known one-off spacing issues.
+        rows.append(handle_edge_cases([cell.strip() for cell in row]))
+    return rows
+
+
+def process_pdf_2(txt):
+    """Parse the text of one page of a 2007-2015 archive pdf into rows.
+
+    In these pdfs a single space separates the columns of a line, while
+    spaces inside a cell are stored as placeholder characters.
+
+    Args:
+        txt: The text of a single page, as produced by pdfplumber's
+            ``extract_text()``. ``None`` or an empty string yields no rows.
+
+    Returns:
+        A list of rows, each a list of five strings. Lines that do not split
+        into five cells are dropped, except six-cell lines whose third cell
+        ends in 2008 or 2009, which are kept without their last cell.
+    """
+    if not txt:
+        return []
+
+    header_names = ("Company", "Company Name")
+    blank_row = [" "] * 5
+
+    rows = []
+    for line in txt.split("\n"):
+        # Split at spaces, which are used as column dividers, and only then
+        # turn the space placeholders into real spaces so that they do not
+        # create extra columns.
+        cells = [
+            cell.replace("\xa0", " ").replace("\xad", " ")
+            for cell in line.split(" ")
+        ]
+
+        if len(cells) == 5:
+            # Leave out the column headers, lines whose second cell is
+            # "Received" and lines made only of placeholders.
+            if (
+                cells[0] not in header_names
+                and cells[1] != "Received"
+                and cells != blank_row
+            ):
+                rows.append(cells)
+        elif len(cells) == 6:
+            # Accounting for the 2008-2009 minor format changes: those lines
+            # carry a sixth cell, which is dropped.
+            if cells[2][-4:] in ("2008", "2009") and (
+                cells[0] != "January 1 through December 31:"
+            ):
+                rows.append(cells[:-1])
+
+    return rows
+
+
+# Around 15 lines in the 2000-2006 pdfs have unique spacing issues that leave
+# the company and city cells fused together or cut in the wrong place.
+# There doesn't seem to be a pattern to them, so each one is listed by hand.
+
+# Damaged company cell -> (clean company name, city).
+_COMPANY_FIXES = {
+    "General Motors Nao Orion AsseOrion": (
+        "General Motors Nao Orion Asse",
+        "Orion",
+    ),
+    "Standard Federal Bank #320     Troy": ("Standard Federal Bank #320", "Troy"),
+    "Standard Federal Bank #340     Troy": ("Standard Federal Bank #340", "Troy"),
+    "Michigan Machine And EngineerFenton": (
+        "Michigan Machine And Engineer",
+        "Fenton",
+    ),
+    "Asplundh Tree Company #93     Wixom": ("Asplundh Tree Company #93", "Wixom"),
+    "Asplundh Tree Company #43     Mt.": (
+        "Asplundh Tree Company #43",
+        "Mt. Clemens",
+    ),
+    "Asplundh Tree Company #843   Howell": (
+        "Asplundh Tree Company #843",
+        "Howell",
+    ),
+    "Robert Bosch Corp. Chassis DivisSi": (
+        "Robert Bosch Corp. Chassis Division",
+        "St. Joseph",
+    ),
+    "Magna / Cosma Body & Chassis Troy": ("Magna / Cosma Body & Chassis", "Troy"),
+}
+
+# Damaged city cell (tail of the company name fused to the city) -> city.
+_CITY_FIXES = {
+    "LifDetroit": "Detroit",
+    "ILapeer": "Lapeer",
+    "IFlint": "Flint",
+    "DivisioJackson": "Jackson",
+    "IncWyoming": "Wyoming",
+    "CompDearborn Heights": "Dearborn Heights",
+    "InPetersburg": "Petersburg",
+    "RegionaSaginaw": "Saginaw",
+    "Corp.Flint": "Flint",
+    "Inc.Howell": "Howell",
+}
+
+
+def handle_edge_cases(row):
+    """Handle edge cases in 2000-2006 pdfs.
+
+    Args:
+        row: A parsed row whose first cell is the company name and whose
+            second cell is the city. The list is modified in place.
+
+    Returns:
+        The same list. When the company cell is a known damaged value, the
+        company and city cells are both replaced. Otherwise, when the city
+        cell is a known damaged value, only the city is replaced. Rows with
+        fewer than two cells are returned untouched.
+    """
+    if len(row) < 2:
+        return row
+
+    # Whenever the city ended up inside the company cell, the company keeps
+    # its own name and the fused city moves to the city cell.
+    company_fix = _COMPANY_FIXES.get(row[0])
+    if company_fix is not None:
+        row[0], row[1] = company_fix
+
+    row[1] = _CITY_FIXES.get(row[1], row[1])
+    return row
+
+
 def _parse_html_table(soup):
     black_list = [
         "TOTAL:",

diff --git a/warn/scrapers/wv.py b/warn/scrapers/wv.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+
+import pdfplumber
+
+from .. import utils
+from ..cache import Cache
+
+# Module-level metadata describing this scraper: its contributors, tags and
+# the agency page the data comes from.
+# NOTE(review): the scrape() function in this file only downloads and parses
+# a pdf; confirm whether the "html" tag is still wanted.
+__authors__ = ["Ash1R"]
+__tags__ = ["html", "pdf"]
+__source__ = {
+    "name": "Workforce West Virginia",
+    "url": "https://workforcewv.org/public-information/warn-notices/current-warn-notices",
+}
+
+
+# Labels whose value is copied into the output row as-is.
+_VALUE_LABELS = frozenset(
+    {
+        "Contact Information",
+        "Region",
+        "County",
+        "Date of Notice",
+        "Projected Date",
+        "Closure/Mass Layoff",
+        "Number Affected",
+    }
+)
+
+
+def _cell_text(cell):
+    """Return the stripped text of a table cell, or "" for an empty cell."""
+    return "" if cell is None else cell.strip()
+
+
+def _notice_row(table_data):
+    """Flatten one extracted label/value table into a single output row.
+
+    Each line of ``table_data`` is a list of cells whose first cell is the
+    label and whose second cell is the value. A line whose first cell is
+    ``None`` is treated as overflow of the company name.
+    """
+    company = ""
+    company_written = False
+    row = []
+    for index, cells in enumerate(table_data):
+        if not cells:
+            continue
+
+        label = cells[0]
+        if label is None:
+            # Overflow line: fold every non-empty cell into the company name.
+            # The first line of a table is never treated as overflow.
+            # NOTE(review): overflow lines found after the company has been
+            # written to the row are not used - confirm that is acceptable.
+            if index != 0:
+                for cell in cells[1:]:
+                    if cell is not None:
+                        company = company + ", " + cell.strip()
+            continue
+
+        label = label.strip()
+        # Empty (None) or missing value cells become "" so that the columns
+        # stay aligned instead of the scrape crashing.
+        value = _cell_text(cells[1]) if len(cells) > 1 else ""
+
+        if label in _VALUE_LABELS:
+            row.append(value)
+        elif label == "Address":
+            # The company name is complete once the address line is reached,
+            # so it is written right before the first address.
+            if not company_written:
+                row.append(company)
+                company_written = True
+            row.append(value)
+        elif label == "Company":
+            company = company + value
+    return row
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from west virginia workforce site.
+
+    The source is one big pdf with all historical data. pdfplumber extracts
+    its tables, each of which is flattened into one row of wv.csv.
+
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved
+    cache_dir -- the Path where the downloaded pdf will be saved
+
+    Returns: the Path where the file is written
+    """
+    cache = Cache(cache_dir)
+    headers = [
+        "Company",
+        "Address",
+        "Contact Information",
+        "Region",
+        "County",
+        "Date",
+        "Projected Date",
+        "Type",
+        "Number Affected",
+    ]
+    final_data = [headers]
+
+    # NOTE(review): the cache key ends in a dangling "." (no extension) and
+    # has no state sub-folder; confirm whether that is intended.
+    wv_pdf = cache.download(
+        "WV_WARN_Notices_3-1-11_to_3-22-22.",
+        "https://workforcewv.org/images/files/PublicInfo/WV_WARN_Notices_3-1-11_to_3-22-22.pdf",
+    )
+    with pdfplumber.open(wv_pdf) as pdf:
+        for page in pdf.pages:
+            for table in page.find_tables():
+                row = _notice_row(table.extract(x_tolerance=3, y_tolerance=3))
+                # A table without any recognised label yields nothing; don't
+                # write it out as an empty csv line.
+                if row:
+                    final_data.append(row)
+
+    output_csv = data_dir / "wv.csv"
+    utils.write_rows_to_csv(output_csv, final_data)
+    return output_csv
+
+
+# Run the scraper with its default directories when executed as a script.
+if __name__ == "__main__":
+    scrape()