Showing 2 changed files with 64 additions and 26 deletions.
```diff
@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import List
+from typing import Set
+from urllib.parse import urljoin
 
-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls = set()
+    try:
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls
 
 
-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
+
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+
+    return urls
 
 
-def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
-    try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+def list_pages_for_site(site: str) -> List[str]:
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()
 
-    tree = sitemap_tree_for_homepage(site)
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    return pages
+    return list(all_urls)
```
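The subtlest part of the new parser is the namespace handling in `_extract_urls_from_sitemap`: sitemap files declare the `http://www.sitemaps.org/schemas/sitemap/0.9` namespace, so `ElementTree` reports tags as `{namespace}tag`, and the commit captures that `{...}` prefix with a regex and reuses it in `findall` queries. A minimal, self-contained sketch of the same trick, using an inline XML sample invented for illustration:

```python
import re
import xml.etree.ElementTree as ET

# A tiny sitemap with the standard namespace; this sample is invented
# for illustration and is not part of the commit.
SAMPLE = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs</loc></url>
</urlset>"""

root = ET.fromstring(SAMPLE)
# root.tag is "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset",
# so capture the "{...}" prefix and prepend it when querying for <loc>.
namespace = re.match(r"\{.*\}", root.tag)
ns = namespace.group(0) if namespace else ""

assert root.tag == f"{ns}urlset"
print([loc.text for loc in root.findall(f".//{ns}loc")])
# ['https://example.com/', 'https://example.com/docs']
```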
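Similarly, the robots.txt handling in `_get_sitemap_locations_from_robots` relies on splitting each `Sitemap:` line on the first `:` only (`split(":", 1)`), which is what keeps the `https://` scheme of the sitemap URL intact. A standalone sketch of that parsing step, with an invented robots.txt body (the committed code fetches the real file from `urljoin(base_url, "/robots.txt")`):

```python
# Invented robots.txt content for illustration only.
ROBOTS_TXT = """User-agent: *
Disallow: /private/
Sitemap: https://example.com/sitemap.xml
sitemap: https://example.com/news-sitemap.xml
"""

sitemap_urls = set()
for line in ROBOTS_TXT.splitlines():
    # Case-insensitive match, as in the committed code.
    if line.lower().startswith("sitemap:"):
        # Split on the first ":" only, so "https://..." survives intact.
        sitemap_urls.add(line.split(":", 1)[1].strip())

print(sorted(sitemap_urls))
# ['https://example.com/news-sitemap.xml', 'https://example.com/sitemap.xml']
```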