Skip to content

Commit

Permalink
clean code
Browse files Browse the repository at this point in the history
  • Loading branch information
jeisenman23 committed Jan 7, 2025
1 parent 4eac5dc commit 2ed61b9
Show file tree
Hide file tree
Showing 3 changed files with 456 additions and 21 deletions.
49 changes: 30 additions & 19 deletions elm/web/osti.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def download(self, fp):
fp : str
Filepath to download this record to, typically a .pdf
"""
# OSTI returns citation on first query and pdf on second (weird)
session = requests.Session()
response = session.get(self.url)
response = session.get(self.url)
Expand All @@ -172,8 +171,8 @@ def __init__(self, url, n_pages=1):
https://www.osti.gov/api/v1/docs
n_pages : int
Number of pages to get from the API. Typical response has 20
entries per page. Default of 1 ensures that this class doesnt hang
on a million responses.
entries per page. Default of 1 ensures that this class doesnt
hang on a million responses.
"""

self.url = url
Expand All @@ -187,7 +186,18 @@ def __init__(self, url, n_pages=1):
super().__init__(records)

def clean_escape_sequences(self, text: str) -> str:
"""Clean problematic escape sequences in text"""
"""Clean problematic escape sequences and formatting in JSON text.
Parameters
----------
text : str
Raw JSON text to be cleaned
Returns
-------
str
Cleaned JSON text with proper escape sequences and formatting
"""
# First fix any invalid escape sequences
text = re.sub(r'\\([^"\\/bfnrtu])', r'\1', text)

Expand All @@ -213,29 +223,26 @@ def clean_escape_sequences(self, text: str) -> str:
def parse_json_safely(self, text: str) -> List[Dict]:
"""Safely parse JSON with multiple fallback strategies"""
try:
# First attempt: direct parse after cleaning
cleaned_text = self.clean_escape_sequences(text)
return json.loads(cleaned_text)
except json.JSONDecodeError as e1:
logger.debug(f"First parse attempt failed: {e1}")
try:
# Second attempt: more aggressive cleaning
text = re.sub(r'[\x00-\x1F]+', '', text) # Remove control characters
text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text) # Remove unicode escapes
text = re.sub(r'\s+', ' ', text) # Normalize whitespace
text = re.sub(r'[\x00-\x1F]+', '', text)
text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
text = re.sub(r'\s+', ' ', text)
return json.loads(text)
except json.JSONDecodeError as e2:
logger.debug(f"Second parse attempt failed: {e2}")
try:
# Final attempt: extract what we can
matches = re.findall(r'{[^{}]*}', text)
if matches:
# Reconstruct array with valid objects
valid_json = f"[{','.join(matches)}]"
return json.loads(valid_json)
raise e2
except json.JSONDecodeError as e3:
logger.error(f"All parsing attempts failed. Final error: {e3}")
logger.error(f"""All parsing attempts
failed. Final error: {e3}""")
raise

def _get_first(self):
Expand All @@ -251,7 +258,8 @@ def _get_first(self):
raw_text = self._response.text
first_page = self.parse_json_safely(raw_text)
except (json.JSONDecodeError, UnicodeError) as e:
logger.error(f"JSON decode error: {str(e)}\nRaw text: {raw_text[:500]}...")
logger.error(f"""JSON decode error:
{str(e)}\nRaw text: {raw_text[:500]}...""")
raise
self._n_pages = 1
if 'last' in self._response.links:
Expand Down Expand Up @@ -284,7 +292,8 @@ def _get_pages(self, n_pages):
params={'page': page}
)
if not response.ok:
logger.error(f"Failed to get page {page}: {response.status_code}")
logger.error(f"""Failed to get page {page}:
{response.status_code}""")
continue
page_records = self.parse_json_safely(response.text)

Expand Down Expand Up @@ -317,15 +326,17 @@ def _get_all(self, n_pages):

def download(self, out_dir):
"""Download all PDFs from the records in this OSTI object into a
directory. PDFs will be given file names based on their OSTI record ID
directory. PDFs will be given file names based on their OSTI record
ID
Parameters
----------
out_dir : str
Directory to download PDFs to. This directory will be created if it
does not already exist.
Directory to download PDFs to. This directory will be created
if it does not already exist.
"""
logger.info('Downloading {} records to: {}'.format(len(self), out_dir))
logger.info('''Downloading {} records to: {}'''
.format(len(self), out_dir))
os.makedirs(out_dir, exist_ok=True)
for record in self:
fp_out = os.path.join(out_dir, record.osti_id + '.pdf')
Expand All @@ -334,7 +345,7 @@ def download(self, out_dir):
record.download(fp_out)
except Exception as e:
logger.exception('Could not download OSTI ID {} "{}": {}'
.format(record.osti_id, record.title, e))
.format(record.osti_id, record.title, e))
logger.info('Finished download!')

@property
Expand Down
Loading

0 comments on commit 2ed61b9

Please sign in to comment.