Commit

replace usp
pablonyx committed Oct 26, 2024
1 parent a3b2941 commit fe31324
Showing 2 changed files with 64 additions and 26 deletions.
89 changes: 64 additions & 25 deletions backend/danswer/utils/sitemap.py
@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import List
+from typing import Set
+from urllib.parse import urljoin
 
-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls = set()
+    try:
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls
 
 
-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
+
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+
+    return urls
 
 
-def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
-    try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+def list_pages_for_site(site: str) -> List[str]:
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()
 
-    tree = sitemap_tree_for_homepage(site)
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    return pages
+    return list(all_urls)
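
For context, a minimal sketch of how the rewritten helper might be exercised after this change; the example homepage URL and the printing logic are illustrative assumptions, not part of the commit.

from danswer.utils.sitemap import list_pages_for_site

# Hypothetical example site; any homepage URL would be handled the same way.
pages = list_pages_for_site("https://example.com")

# list_pages_for_site now probes /sitemap.xml and /sitemap_index.xml directly,
# follows any "Sitemap:" entries advertised in robots.txt, and returns a
# deduplicated list of page URLs instead of relying on ultimate_sitemap_parser.
print(f"Discovered {len(pages)} pages")
for url in pages[:10]:
    print(url)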
1 change: 0 additions & 1 deletion backend/requirements/default.txt
@@ -78,7 +78,6 @@ asana==5.0.8
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
-ultimate_sitemap_parser==0.5
 stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
