Build in SSL verification workaround; patch pre-commit; patch AZ #540

Merged 5 commits on Aug 14, 2023
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -26,12 +26,12 @@ repos:
additional_dependencies: [black]

- repo: https://github.com/timothycrosley/isort
rev: 5.10.1
rev: 5.12.0
hooks:
- id: isort

- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
rev: 6.1.0
hooks:
- id: flake8
additional_dependencies:
7 changes: 5 additions & 2 deletions warn/platforms/job_center/site.py
@@ -30,13 +30,16 @@ class Site:
state (str): State postal code
url (str): Search URL for the site (should end in '/warn_lookups')
cache_dir (str): Cache directory
verify (boolean, default True): SSL certificate verification
"""

def __init__(self, state, url, cache_dir):
def __init__(self, state, url, cache_dir, verify=True):
"""Initialize a new instance."""
self.state = state.upper()
self.url = url
self.cache = Cache(cache_dir)
self.verify = verify
print(f"Site init SSL verification status: {self.verify}")

def scrape(self, start_date=None, end_date=None, detail_pages=True, use_cache=True):
"""
@@ -110,7 +113,7 @@ def _get_page(self, url, params=None, use_cache=True):
return self.cache.fetch(url, params)
else:
logger.debug("Pulling from the web")
response = requests.get(url, params=params)
response = requests.get(url, params=params, verify=self.verify)
logger.debug(f"Response code: {response.status_code}")
html = response.text
self.cache.save(url, params, html)
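A usage sketch (not part of this diff) of the new keyword: a caller can now opt out of certificate checks per site. The import path and cache location below are assumptions for illustration; only the verify parameter itself comes from this change.

from pathlib import Path

from warn.platforms.job_center.site import Site  # import path assumed from this repo's layout

# Hypothetical construction that skips SSL verification; the flag is stored
# on self.verify and later forwarded to requests.get() in _get_page().
site = Site(
    "AZ",
    "https://www.azjobconnection.gov/search/warn_lookups",
    cache_dir=Path("~/.warn-scraper/cache/az").expanduser(),  # illustrative cache location
    verify=False,
)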
26 changes: 21 additions & 5 deletions warn/platforms/job_center/utils.py
@@ -10,7 +10,13 @@


def scrape_state(
state_postal, search_url, output_csv, stop_year, cache_dir, use_cache=True
state_postal,
search_url,
output_csv,
stop_year,
cache_dir,
use_cache=True,
verify=True,
):
"""Date-based scraper for Job Center states.

@@ -29,6 +35,7 @@ def scrape_state(
stop_year (int): First year that data is available for state (requires manual research)
cache_dir (str): The root directory for WARN's cache files (e.g. ~/.warn-scraper/cache)
use_cache (boolean, default True): Whether to use cached files for older years
verify (boolean, default True): Use SSL certificate verification

Returns:
Full path to exported csv (e.g. ~/.warn-scraper/exports/ks.csv)
@@ -41,7 +48,10 @@

# Set up scraper instance
state_cache_dir = cache_dir / state_postal.lower()
site = JobCenterSite(state_postal.upper(), search_url, cache_dir=state_cache_dir)
print(f"scrape_state verify: {verify}")
site = JobCenterSite(
state_postal.upper(), search_url, cache_dir=state_cache_dir, verify=verify
)

# Date-based searches produce search result pages that appear to have certain
# records duplicated over paged results. We'll initially write all data to a raw
@@ -66,16 +76,22 @@
# Execute the scrape in two batches
# 1. Current and prior year. Always scrape fresh (i.e. never use cached files)
# in case records have been updated.
_scrape_years(site, raw_csv, headers, no_cache_years, use_cache=False)
_scrape_years(
site, raw_csv, headers, no_cache_years, use_cache=False, verify=verify
)
# 2. Years before current & prior, going back to stop_year.
# We should generally use cached files for these older years,
# since data is less likely to be updated.
_scrape_years(site, raw_csv, headers, yearly_dates, use_cache=use_cache)
_scrape_years(
site, raw_csv, headers, yearly_dates, use_cache=use_cache, verify=verify
)
_dedupe(raw_csv, output_csv)
return output_csv


def _scrape_years(site, output_csv, headers, start_end_dates, use_cache=True):
def _scrape_years(
site, output_csv, headers, start_end_dates, use_cache=True, verify=True
):
"""Loop through years of data and write out to CSV."""
# NOTE: Scraping for Jan 1 - Dec 31 for current year works
# throughout the year. Additionally, it allows us to avoid
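For orientation, a hedged example of calling the expanded scrape_state() signature the way a state scraper would. The concrete paths are illustrative (they mirror the docstring examples above), and verify=False is only warranted while a state's certificate is broken.

from pathlib import Path

from warn.platforms.job_center.utils import scrape_state  # import path assumed

# Hypothetical call mirroring the new signature.
output_csv = scrape_state(
    "AZ",
    "https://www.azjobconnection.gov/search/warn_lookups",
    Path("~/.warn-scraper/exports/az.csv").expanduser(),  # output_csv
    2010,                                                 # stop_year
    Path("~/.warn-scraper/cache").expanduser(),           # cache_dir
    use_cache=True,
    verify=False,  # forwarded to JobCenterSite and then to requests.get()
)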
15 changes: 13 additions & 2 deletions warn/scrapers/az.py
@@ -4,7 +4,7 @@

from .. import utils

__authors__ = ["zstumgoren", "Dilcia19"]
__authors__ = ["zstumgoren", "Dilcia19", "stucka"]
__tags__ = [
"jobcenter",
]
@@ -32,12 +32,23 @@ def scrape(
output_csv = data_dir / "az.csv"
search_url = "https://www.azjobconnection.gov/search/warn_lookups"

# Disable SSL certificate verification: the site's certificate broke in August 2023
verify = False

# Date chosen based on manual research
stop_year = 2010

# Use cache for years before current and prior year
print(f"AZ cache status: {use_cache}")
print(f"AZ SSL verification: {verify}")
scrape_state(
"AZ", search_url, output_csv, stop_year, cache_dir, use_cache=use_cache
"AZ",
search_url,
output_csv,
stop_year,
cache_dir,
use_cache=use_cache,
verify=verify,
)

return output_csv
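One side effect worth flagging: with verify=False, requests (via urllib3) emits an InsecureRequestWarning for every unverified request. This PR does not address that, but if the warning noise matters, a minimal sketch for silencing it would be:

import urllib3

# Suppress the warning urllib3 raises for requests made with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)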