Showing 2 changed files with 64 additions and 26 deletions.
```diff
@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import List
+from typing import Set
+from urllib.parse import urljoin
 
-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls = set()
+    try:
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls
 
 
-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
+
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+
+    return urls
 
 
-def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
-    try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+def list_pages_for_site(site: str) -> List[str]:
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()
 
-    tree = sitemap_tree_for_homepage(site)
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
 
-    return pages
+    return list(all_urls)
```
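The subtlest part of the new parser is the namespace handling in `_extract_urls_from_sitemap`: sitemap files declare the `http://www.sitemaps.org/schemas/sitemap/0.9` namespace, so `ElementTree` reports tags as `{namespace}tag`, and the commit captures that `{...}` prefix with a regex and reuses it in `findall` queries. A minimal, self-contained sketch of the same trick, using an inline XML sample invented for illustration:

```python
import re
import xml.etree.ElementTree as ET

# A tiny sitemap with the standard namespace; this sample is invented
# for illustration and is not part of the commit.
SAMPLE = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs</loc></url>
</urlset>"""

root = ET.fromstring(SAMPLE)
# root.tag is "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset",
# so capture the "{...}" prefix and prepend it when querying for <loc>.
namespace = re.match(r"\{.*\}", root.tag)
ns = namespace.group(0) if namespace else ""

assert root.tag == f"{ns}urlset"
print([loc.text for loc in root.findall(f".//{ns}loc")])
# ['https://example.com/', 'https://example.com/docs']
```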
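Similarly, the robots.txt handling in `_get_sitemap_locations_from_robots` relies on splitting each `Sitemap:` line on the first `:` only (`split(":", 1)`), which is what keeps the `https://` scheme of the sitemap URL intact. A standalone sketch of that parsing step, with an invented robots.txt body (the committed code fetches the real file from `urljoin(base_url, "/robots.txt")`):

```python
# Invented robots.txt content for illustration only.
ROBOTS_TXT = """User-agent: *
Disallow: /private/
Sitemap: https://example.com/sitemap.xml
sitemap: https://example.com/news-sitemap.xml
"""

sitemap_urls = set()
for line in ROBOTS_TXT.splitlines():
    # Case-insensitive match, as in the committed code.
    if line.lower().startswith("sitemap:"):
        # Split on the first ":" only, so "https://..." survives intact.
        sitemap_urls.add(line.split(":", 1)[1].strip())

print(sorted(sitemap_urls))
# ['https://example.com/news-sitemap.xml', 'https://example.com/sitemap.xml']
```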