Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion warn/scrapers/mi.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ def scrape(
Keyword arguments:
data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)

Returns: the Path where the file is written
"""
# Grabs the main page with the current year's data
Expand Down
90 changes: 90 additions & 0 deletions warn/scrapers/wv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pathlib import Path

import pdfplumber

from .. import utils
from ..cache import Cache

# Scraper metadata: who wrote it, which source formats it touches, and the
# public page the data is published on.
__authors__ = ["Ash1R"]
__tags__ = ["html", "pdf"]
__source__ = dict(
    name="Workforce West Virginia",
    url="https://workforcewv.org/public-information/warn-notices/current-warn-notices",
)


def scrape(
    data_dir: Path = utils.WARN_DATA_DIR,
    cache_dir: Path = utils.WARN_CACHE_DIR,
) -> Path:
    """
    Scrape data from the Workforce West Virginia site.

    The source is a single PDF holding all historical notices. pdfplumber
    finds the tables on every page, and each table is flattened into one
    row of wv.csv.

    Keyword arguments:
    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)

    Returns: the Path where the file is written
    """
    cache = Cache(cache_dir)
    headers = [
        "Company",
        "Address",
        "Contact Information",
        "Region",
        "County",
        "Date",
        "Projected Date",
        "Type",
        "Number Affected",
    ]
    final_data = [headers]

    # NOTE(review): the cache name ends in a bare "." with no "pdf" suffix.
    # It is kept byte-for-byte so previously cached downloads are still found.
    wv_pdf = cache.download(
        "WV_WARN_Notices_3-1-11_to_3-22-22.",
        "https://workforcewv.org/images/files/PublicInfo/WV_WARN_Notices_3-1-11_to_3-22-22.pdf",
    )
    with pdfplumber.open(wv_pdf) as pdf:
        for page in pdf.pages:
            for table in page.find_tables():
                cells = table.extract(x_tolerance=3, y_tolerance=3)
                row = _parse_notice_table(cells)
                # A table with no recognised labels yields nothing; skip it
                # rather than writing a blank line into the CSV.
                if row:
                    final_data.append(row)

    output_csv = data_dir / "wv.csv"
    utils.write_rows_to_csv(output_csv, final_data)
    return output_csv


# Labels whose value (second cell) is copied straight into the output row.
_VALUE_LABELS = frozenset(
    {
        "Contact Information",
        "Region",
        "County",
        "Date of Notice",
        "Projected Date",
        "Closure/Mass Layoff",
        "Number Affected",
    }
)


def _cell_text(cell) -> str:
    """Return the cell's text without surrounding whitespace ("" if None)."""
    return cell.strip() if cell is not None else ""


def _parse_notice_table(data: list) -> list:
    """
    Flatten one extracted table of label/value rows into a single CSV row.

    data -- the table as a list of rows, each a list of cells (str or None)

    Values are appended in the order their labels appear in the table. The
    company name is emitted just before the first "Address" value, so a
    table without an "Address" row never emits its company.

    Returns: the list of values for this table (empty if nothing matched)
    """
    company = ""
    company_done = False
    row = []
    for index, cells in enumerate(data):
        if not cells:
            # Nothing to read from an empty row.
            continue

        label = cells[0]
        if label is None or label == "None":
            # A row with no label continues the row above it: the second line
            # of a company entry holds alternative names and addresses. The
            # row index is needed because the very first row of a table has
            # nothing above it to continue.
            if index != 0:
                for extra in cells[1:]:
                    if extra is not None:
                        company = company + ", " + extra.strip()
            # NOTE(review): continuation text seen after the company has been
            # emitted (i.e. below "Address") never reaches the output.
            continue

        label = label.strip()
        # Empty (None) or missing value cells become "" instead of raising.
        value = _cell_text(cells[1]) if len(cells) > 1 else ""

        if label in _VALUE_LABELS:
            row.append(value)
        elif label == "Address":
            if not company_done:
                row.append(company)
                company_done = True
            row.append(value)
        elif label == "Company":
            company = company + value

    return row


# Run the scraper with its default data and cache directories when this
# module is executed directly as a script.
if __name__ == "__main__":
    scrape()