Skip to content

Commit

Permalink
Fix a couple bugs with google sites link finding
Browse files Browse the repository at this point in the history
  • Loading branch information
Weves committed Nov 19, 2023
1 parent f72825c commit df37387
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions backend/danswer/connectors/google_site/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@
logger = setup_logger()


-def process_link(element: BeautifulSoup | Tag) -> str:
+def process_link(element: BeautifulSoup | Tag) -> str | None:
     href = cast(str | None, element.get("href"))
     if not href:
-        raise RuntimeError(f"Invalid link - {element}")
+        return None

     # cleanup href
     href = urllib.parse.unquote(href)
-    href = href.rstrip(".html").lower()
+    href = href.lower()
+    if href.endswith(".html"):
+        href = href[:-5]
     href = href.replace("_", "")
     href = re.sub(
         r"([\s-]+)", "-", href
Expand All @@ -44,8 +46,9 @@ def find_google_sites_page_path_from_navbar(
     if ul:
         if not is_initial:
             a = cast(Tag, element.find("a"))
-            new_path = f"{path}/{process_link(a)}"
-            if a.get("aria-selected") == "true":
+            href = process_link(a)
+            new_path = f"{path}/{href}"
+            if href and a.get("aria-selected") == "true":
                 return new_path
         else:
             new_path = ""
Expand Down

1 comment on commit df37387

@vercel
Copy link

@vercel vercel bot commented on df37387 Nov 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.