Skip to content

Commit

Permalink
Merge pull request #5215 from openstates/pr-fix-title
Browse files: browse the repository at this point in the history
PR: fix bill title
  • Loading branch information
jessemortenson authored Jan 14, 2025
2 parents 0ea7886 + 9217f2d commit 83865e3
Showing 1 changed file with 15 additions and 6 deletions.
21 changes: 15 additions & 6 deletions scrapers/pr/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,15 +333,24 @@ def scrape_bill(self, chamber, session, url):
}
html = self.s.get(url, headers=headers, verify=False).text
page = lxml.html.fromstring(html)
# search for Titulo, accent over i messes up lxml, so use 'tulo'
title = page.xpath('//main//div[contains(@class, "items-center")]/h1/text()')[
0
].strip()
if title:
bill_id = re.findall(r"[A-Z]{2}\d{4}", title)[0]

page_header_elems = page.xpath(
'//main//div[contains(@class, "items-center")]/h1/text()'
)
if len(page_header_elems) > 0:
page_header_text = page_header_elems[0].strip()
bill_id = re.findall(r"[A-Z]{2}\d{4}", page_header_text)[0]
else:
self.logger.error(f"Bill found with no bill identifier at {url}")

bill_title_elems = page.xpath(
'//span/strong[text()="Título:"]/../following-sibling::span'
)
if len(bill_title_elems) > 0:
title = bill_title_elems[0].text_content().strip()
else:
self.logger.error(f"Bill found with no title at {url}")

# PR occasionally repeats a bill at different URLs (????)
# example:
# PC0205 https://sutra.oslpr.org/medidas/152982
Expand Down

0 comments on commit 83865e3

Please sign in to comment.