Skip to content

Commit

Permalink
[SCRAPER]: Initial implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
amadolid committed Nov 10, 2023
1 parent 36b9b89 commit 76c8c48
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 1 deletion.
2 changes: 1 addition & 1 deletion jaseci_ai_kit/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

JAC_NLP_MODULES=("bart_sum" "cl_summer" "ent_ext" "fast_enc" "sbert_sim" "t5_sum" "text_seg" "tfm_ner" "use_enc" "use_qa" "zs_classifier" "bi_enc" "topic_ext" "gpt2" "gpt3" "dolly" "llm")
JAC_SPEECH_MODULES=("stt" "vc_tts")
JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain")
JAC_MISC_MODULES=("pdf_ext" "translator" "cluster" "ph" "openai" "elastic_retrieval" "huggingface" "langchain" "scraper")
JAC_VISION_MODULES=("detr" "rftm" "yolos" "dpt")

install_modules() {
Expand Down
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .scraper import * # noqa
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
playwright>=1.39.0
71 changes: 71 additions & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from jaseci.jsorc.live_actions import jaseci_action
from playwright.sync_api import sync_playwright, Page


@jaseci_action(act_group=["ws"], allow_remote=True)
def scrape(urls: str, depth: int = 1):
    """Scrape the text of one or more pages, following links level by level.

    Args:
        urls: Starting URL. A list or set of URLs is accepted as well; the
            annotation is left as ``str`` so the declared action signature
            stays unchanged.
        depth: Number of crawl levels to process. Level 1 is the starting
            URL(s); each further level visits the links collected from the
            previous one. Values below 1 scrape nothing.

    Returns:
        The text of every visited page, concatenated in visiting order, with
        each run of whitespace collapsed to a single space. An empty string
        when there is nothing to scrape.
    """
    # ``scraping`` consumes its input with ``pop()``, which a plain string
    # does not support. Wrap a single URL, and copy any collection so the
    # caller's object is not emptied as a side effect.
    if not urls:
        pending = set()
    elif isinstance(urls, str):
        pending = {urls}
    else:
        pending = set(urls)

    # Nothing to visit: avoid launching a browser at all.
    if not pending or depth < 1:
        return ""

    visited = set()
    chunks = []

    with sync_playwright() as spw:
        browser = spw.chromium.launch()
        try:
            page = browser.new_page()
            # Stop early once a level yields no further links.
            while depth > 0 and pending:
                content, pending = scraping(page, pending, visited)
                chunks.append(content)
                depth -= 1
        finally:
            # Close the browser even when a crawl level raises.
            browser.close()

    # split()/join collapses all whitespace, including the chunk separators.
    return " ".join("\n".join(chunks).split())


def load_and_save(page: Page, target: str, scraped: set):
    """Navigate ``page`` to ``target`` and return the text content of its body.

    ``target`` is added to ``scraped`` before navigation starts, so it counts
    as visited even when loading fails. Any exception raised while loading or
    reading the page is printed, and an empty string is returned instead.
    """
    # Script evaluated in the page to read the text of the whole body.
    body_text_js = """() =>
    document.body.textContent;
    """

    print("#############################")
    try:
        scraped.add(target)
        print(f"loading {target} ...")
        page.goto(target, wait_until="networkidle")

        # print(f"capturing {target} ...")
        # page.screenshot(path="".join(x for x in target if x.isalnum()) + ".png", full_page=True)

        print(f"getting relevant content {target} ...")
        text = page.evaluate(body_text_js)
    except Exception as err:
        print(
            f"Error occurs when trying to load and save {target} ...\n{err}",
        )
        text = ""
    return text


def crawling(page: Page):
    """Return every anchor element carrying an ``href`` on the current page.

    If the query fails, the error is printed together with the page URL and
    an empty list is returned in its place.
    """
    try:
        anchors = page.query_selector_all("a[href]")
    except Exception as err:
        print(f"Error occurs when trying to crawl {page.url} !\n{err}")
        anchors = []
    return anchors


def scraping(page: Page, urls: set, scraped: set):
    """Load every not-yet-visited URL in ``urls`` and gather follow-up links.

    Args:
        page: Playwright page used for all navigation.
        urls: URLs to visit at this level. Consumed destructively: it is
            empty when the function returns.
        scraped: URLs already visited; ``load_and_save`` adds each URL it is
            asked to load.

    Returns:
        A ``(content, next_scrape)`` tuple: the concatenated text of the
        pages loaded by this call, and the set of absolute links found on
        them (candidates for the next level).
    """
    # Local import so this function does not depend on the module header.
    from urllib.parse import urljoin

    parts = []
    next_scrape = set()

    while urls:
        url: str = urls.pop()
        if url in scraped:
            continue

        parts.append(load_and_save(page, url, scraped))

        # Resolve relative links against the URL the page actually holds
        # (after redirects), not by appending to the requested URL, which
        # produced paths such as "https://host/a/b" + "/c".
        base = page.url
        for ahref in crawling(page):
            href = ahref.get_attribute("href")
            if not href:
                continue
            if href.startswith("http"):
                next_scrape.add(href)
            elif href.startswith("/"):
                # Handles root-relative ("/x") and protocol-relative
                # ("//host/x") links alike.
                next_scrape.add(urljoin(base, href))

    return "".join(parts), next_scrape
1 change: 1 addition & 0 deletions jaseci_ai_kit/jac_misc/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"huggingface",
"langchain",
"forecast",
"scraper",
]


Expand Down

0 comments on commit 76c8c48

Please sign in to comment.