diff --git a/visual-retrieval-colpali/prepare_feed_deploy.py b/visual-retrieval-colpali/prepare_feed_deploy.py index 54290b5d0..6497b10b3 100644 --- a/visual-retrieval-colpali/prepare_feed_deploy.py +++ b/visual-retrieval-colpali/prepare_feed_deploy.py @@ -178,8 +178,10 @@ for a_tag in year_div.select("a.button.button--download-secondary[href]"): href = a_tag["href"] full_url = urljoin(url, href) - links.append(full_url) - url_to_year[full_url] = year + # exclude non-pdf links + if full_url.endswith(".pdf"): + links.append(full_url) + url_to_year[full_url] = year links, url_to_year # -