From df373871460cfe5927c0a6f65a6024a0789e978f Mon Sep 17 00:00:00 2001 From: Weves Date: Sun, 19 Nov 2023 15:31:07 -0800 Subject: [PATCH] Fix a couple bugs with google sites link finding --- backend/danswer/connectors/google_site/connector.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/danswer/connectors/google_site/connector.py b/backend/danswer/connectors/google_site/connector.py index 6a11bee16f8..4af5b0016a9 100644 --- a/backend/danswer/connectors/google_site/connector.py +++ b/backend/danswer/connectors/google_site/connector.py @@ -21,14 +21,16 @@ logger = setup_logger() -def process_link(element: BeautifulSoup | Tag) -> str: +def process_link(element: BeautifulSoup | Tag) -> str | None: href = cast(str | None, element.get("href")) if not href: - raise RuntimeError(f"Invalid link - {element}") + return None # cleanup href href = urllib.parse.unquote(href) - href = href.rstrip(".html").lower() + href = href.lower() + if href.endswith(".html"): + href = href[:-5] href = href.replace("_", "") href = re.sub( r"([\s-]+)", "-", href @@ -44,8 +46,9 @@ def find_google_sites_page_path_from_navbar( if ul: if not is_initial: a = cast(Tag, element.find("a")) - new_path = f"{path}/{process_link(a)}" - if a.get("aria-selected") == "true": + href = process_link(a) + new_path = f"{path}/{href}" + if href and a.get("aria-selected") == "true": return new_path else: new_path = ""