[SCRAPER]: Version 2
amadolid committed Dec 4, 2023
1 parent 0d86551 commit 9a2411a
Showing 2 changed files with 382 additions and 1 deletion.
190 changes: 190 additions & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md
@@ -0,0 +1,190 @@
# **SCRAPER (`Playwright Python`)**

## wbs.**`url_to_filename`**
> **`Arguments`:** \
> **url**: str
>
> **`Return`:** \
> str
>
> **`Usage`:** \
> To convert a URL to a valid file name
>
##### **`HOW TO TRIGGER`**
```js
wbs.url_to_filename("https://google.com")
```
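
The implementation body is elided in this diff; as an illustration only, a conversion of this kind (assuming a simple character-replacement approach, which may differ from the actual code) could look like:

```python
import re


# Hypothetical sketch -- the real implementation in scraper.py may differ.
# Collapses every non-alphanumeric run into a single underscore.
def url_to_filename(url: str) -> str:
    # "https://google.com" -> "https_google_com"
    return re.sub(r"[^0-9a-zA-Z]+", "_", url).strip("_")
```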

## wbs.**`scrape`**
> **`Arguments`:** \
> **pages**: list (structure below) \
> **pre_configs**: list (structure below)\
> **detailed**: bool = False
>
> **`Return`:** \
> str or dict
>
> **`Usage`:** \
> To scrape the specified pages and return their aggregated text content
>
> **`Remarks`:** \
> When **detailed** is true, the return value is a dict that also includes the lists of scanned and scraped URLs
>
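
Based on the implementation, the detailed result has roughly the following shape (URLs shown are illustrative):

```python
# detailed = False -> a single whitespace-normalized string of page text
# detailed = True  -> a dict with the content plus url bookkeeping
{
    "content": "aggregated text content ...",
    "scanned": ["https://example.com/..."],  # every url visited
    "scraped": ["https://example.com/..."],  # urls whose content was extracted
}
```
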
##### **`STRUCTURE`**
```python
###########################################################################
# pages structure #
###########################################################################
[{
# required
"goto": {
#required
"url": "",
"wait_until": "networkidle",

# optional
        # all pre and post scripts share the same structure
"pre_scripts": [{
# methods from playwright.sync_api.Page
# https://playwright.dev/docs/api/class-page#methods
"method": "wait_for_selector",

            # all fields other than "method" are passed as **kwargs
"**": "value"
}],
# optional
"post_scripts": []
},

# optional
"getters": [{
# "selector" | "custom" | "none" | else default
"method": "default",

# optional
        # selector == css query selector targeting the elements whose textContent is extracted
        # custom == your custom js script that returns a string
        # none == empty string
        # anything else == textContent of the whole document.body
        # only used with the selector and custom methods
"expression": "",

# optional
# defaults to ["script", "style", "link", "noscript"]
        # elements to remove before extracting textContent
        # only used with the selector and default methods
"excluded_element": [],

# optional
"pre_scripts": [],
# optional
"post_scripts": []
}],

# optional
"crawler": {
# required
        # list of query selectors and the attribute that holds the url to crawl
"queries": [{
# css query selector
"selector": "",
            # element attribute that holds the url for crawling
"attribute": ""
}],

        # list of regex strings; only matching urls will be crawled
        # empty allows everything
"filters": [],

        # depth of crawl (defaults to 1 when omitted)
        # zero stops further crawling
"depth": 1,

"pre_scripts": [],
"post_scripts": []
}
}]

###########################################################################
# pre_configs structure #
###########################################################################
[{
    # if a crawled url matches this regex, the scraper field will be used as the config for that url
"regex": "",
"scraper": {

# similar to pages structure without goto.url
"goto": {
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": []
}
}
}]
```
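
Only `goto.url` is strictly required; the remaining fields are optional. A minimal call (with an illustrative URL) is simply:

```python
wbs.scrape(
    pages = [{
        "goto": { "url": "https://example.com" }
    }]
)
```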

##### **`HOW TO TRIGGER`**
```python
wbs.scrape(
pages = [{
"goto": {
"url": "http://google.com",
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": [{
"method": "evaluate",
"expression": """
try {
document.querySelector("textarea[id=APjFqb]").value = "speed test";
document.querySelector("form[action='/search']:has(input[type=submit][value='Google Search'])").submit();
} catch (err) { }
"""
},{
"method": "wait_for_selector",
"selector": "#result-stats",
"state": "visible"
}]
},
"getters": [{
"method": "default",
}],
"crawler": {
"filters": ["^((?!google\\.com).)*$"],
"depth": 1
}
}],
pre_configs = [{
"regex": "speedtest\\.net",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
},{
"regex": "fast\\.com",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
},{
"regex": "speedcheck\\.org",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
}],
detailed = True
)
```
193 changes: 192 additions & 1 deletion jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -2,6 +2,7 @@
from playwright.sync_api import sync_playwright, Page
from typing import Union
from re import search
from copy import deepcopy


@jaseci_action(act_group=["wbs"], allow_remote=True)
@@ -10,7 +11,197 @@ def url_to_filename(url: str):


@jaseci_action(act_group=["wbs"], allow_remote=True)
def scrape(pages: list, pre_configs: list = [], detailed: bool = False):
content = ""
urls = {"scanned": set(), "scraped": set(), "crawled": set()}
with sync_playwright() as spw:
browser = spw.chromium.launch()
page = browser.new_page()

while pages:
pg: dict = pages.pop(0)

goto(page, pg.get("goto") or {}, urls)
content += getters(page, pg.get("getters") or [], urls)
crawler(page, pg.get("crawler") or {}, urls, pages, pre_configs)

browser.close()

content = " ".join(content.split())

if detailed:
return {
"content": content,
"scanned": list(urls["scanned"]),
"scraped": list(urls["scraped"]),
}

return content


def goto(page: Page, specs: dict, urls: dict):
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

print(f'[goto]: loading {specs["url"]}')

page.goto(**specs)
add_url(page, urls)

run_scripts(page, post, urls)


def getters(page: Page, specss: list[dict], urls: dict):
content = ""
for specs in specss:
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

exel_str = ""
for exel in (
specs.get("excluded_element", ["script", "style", "link", "noscript"])
or []
):
exel_str += (
f'clone.querySelectorAll("{exel}").forEach(d => d.remove());\n'
)

method = specs.get("method")
if method == "selector":
expression = f"""
Array.prototype.map.call(
document.querySelectorAll("{specs.get("expression")}"),
d => {{
clone = d.cloneNode(true);
{exel_str}
return clone.textContent;
                    }}).join("\\n");
"""
elif method == "custom":
expression = f'{{{specs.get("expression")}}}'
elif method == "none":
expression = '""'
else:
expression = f"""{{
clone = document.body.cloneNode(true);
{exel_str}
return clone.textContent;
}}"""

if expression:
print(f"[getters]: getting content from {page.url}")
content += page.evaluate(f"() =>{expression}")
add_url(page, urls, expression)

run_scripts(page, post, urls)
return content


def crawler(page: Page, specs: dict, urls: dict, pages: list, pre_configs: list):
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

queries = specs.get("queries") or [{"selector": "a[href]", "attribute": "href"}]
filters = specs.get("filters") or []
depth = specs.get("depth", 1) or 0

if depth > 0:
for query in queries:
for node in page.query_selector_all(query.get("selector") or "a[href]"):
url = node.get_attribute(query.get("attribute") or "href")
c_url = get_hostname(page)

if url.startswith("/"):
url = f"{c_url}{url}"

if url.startswith("http") and url not in urls["crawled"]:
included = not bool(filters)

for filter in filters:
if search(filter, url):
included = True
break

if included:
add_crawl(
pages,
pre_configs,
urls,
url,
{
"queries": queries,
"depth": depth - 1,
"filters": filters,
},
)

run_scripts(page, post, urls)


def get_script(specs: dict, name: str):
return specs.pop(f"{name}_scripts", []) or []


def run_scripts(page: Page, scripts: list[dict], urls: dict):
for script in scripts:
        method = script.pop("method", "evaluate") or "evaluate"
print(f"[script]: running method {method}\n{str(script)}")
getattr(page, method)(**script)
add_url(page, urls)


def add_url(page: Page, urls: dict, scraped: bool = False):
url = page.url
if url:
if url not in urls["scanned"]:
urls["scanned"].add(url)

if scraped and url not in urls["scraped"]:
urls["scraped"].add(url)


def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict):
urls["crawled"].add(url)
scraper = {
"goto": {
"url": url,
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": [],
},
"getters": [{"method": "default"}],
"crawler": def_crawl,
}
for pconf in pre_configs:
if search(pconf["regex"], url):
scraper = deepcopy(pconf["scraper"])
            scraper.setdefault("goto", {})["url"] = url  # ensure a goto block exists before injecting the crawled url
scraper["crawler"] = scraper.get("crawler") or def_crawl
break

pages.append(scraper)


def get_hostname(page: Page):
url = page.url
if url:
splitter = url.split("//")
protocol = splitter[0]
hostname = splitter[1].split("/")[0]
return f"{protocol}//{hostname}"
return url


###########################################################################
# OLD SCRAPER #
###########################################################################


@jaseci_action(act_group=["wbs"], allow_remote=True)
def old_scrape(
urls: set,
scripts: dict = {},
url_filters: list = [],
