[SCRAPER]: Version 2
amadolid committed Dec 4, 2023
1 parent 0d86551 commit 9a2411a
Showing 2 changed files with 382 additions and 1 deletion.
190 changes: 190 additions & 0 deletions jaseci_ai_kit/jac_misc/jac_misc/scraper/README.md
@@ -0,0 +1,190 @@
# **SCRAPER (`Playwright Python`)**

## wbs.**`url_to_filename`**
> **`Arguments`:** \
> **url**: str
>
> **`Return`:** \
> str
>
> **`Usage`:** \
> To convert a URL to a valid file name
>
##### **`HOW TO TRIGGER`**
```js
wbs.url_to_filename("https://google.com")
```
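
The implementation body is elided in this diff; as an illustration only, a conversion of this kind (assuming a simple character-replacement approach, which may differ from the actual code) could look like:

```python
import re


# Hypothetical sketch -- the real implementation in scraper.py may differ.
# Collapses every non-alphanumeric run into a single underscore.
def url_to_filename(url: str) -> str:
    # "https://google.com" -> "https_google_com"
    return re.sub(r"[^0-9a-zA-Z]+", "_", url).strip("_")
```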

## wbs.**`scrape`**
> **`Arguments`:** \
> **pages**: list (structure below) \
> **pre_configs**: list (structure below)\
> **detailed**: bool = False
>
> **`Return`:** \
> str or dict
>
> **`Usage`:** \
> To scrape the specified pages and return their aggregated text content
>
> **`Remarks`:** \
> When **detailed** is true, the return value is a dict that also includes the lists of scanned and scraped URLs
>
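
Based on the implementation, the detailed result has roughly the following shape (URLs shown are illustrative):

```python
# detailed = False -> a single whitespace-normalized string of page text
# detailed = True  -> a dict with the content plus url bookkeeping
{
    "content": "aggregated text content ...",
    "scanned": ["https://example.com/..."],  # every url visited
    "scraped": ["https://example.com/..."],  # urls whose content was extracted
}
```
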
##### **`STRUCTURE`**
```python
###########################################################################
# pages structure #
###########################################################################
[{
# required
"goto": {
#required
"url": "",
"wait_until": "networkidle",

# optional
        # all pre and post scripts share the same structure
"pre_scripts": [{
# methods from playwright.sync_api.Page
# https://playwright.dev/docs/api/class-page#methods
"method": "wait_for_selector",

            # all fields other than "method" are passed as **kwargs
"**": "value"
}],
# optional
"post_scripts": []
},

# optional
"getters": [{
# "selector" | "custom" | "none" | else default
"method": "default",

# optional
        # selector == css query selector targeting the elements whose textContent is extracted
        # custom == your custom js script that returns a string
        # none == empty string
        # anything else == textContent of the whole document.body
        # only used with the selector and custom methods
"expression": "",

# optional
# defaults to ["script", "style", "link", "noscript"]
        # elements to remove before extracting textContent
        # only used with the selector and default methods
"excluded_element": [],

# optional
"pre_scripts": [],
# optional
"post_scripts": []
}],

# optional
"crawler": {
# required
        # list of query selectors and the attribute that holds the url to crawl
"queries": [{
# css query selector
"selector": "",
            # element attribute that holds the url for crawling
"attribute": ""
}],

        # list of regex strings; only matching urls will be crawled
        # empty allows everything
"filters": [],

        # depth of crawl (defaults to 1 when omitted)
        # zero stops further crawling
"depth": 1,

"pre_scripts": [],
"post_scripts": []
}
}]

###########################################################################
# pre_configs structure #
###########################################################################
[{
    # if a crawled url matches this regex, the scraper field will be used as the config for that url
"regex": "",
"scraper": {

# similar to pages structure without goto.url
"goto": {
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": []
}
}
}]
```
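
Only `goto.url` is strictly required; the remaining fields are optional. A minimal call (with an illustrative URL) is simply:

```python
wbs.scrape(
    pages = [{
        "goto": { "url": "https://example.com" }
    }]
)
```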

##### **`HOW TO TRIGGER`**
```python
wbs.scrape(
pages = [{
"goto": {
"url": "http://google.com",
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": [{
"method": "evaluate",
"expression": """
try {
document.querySelector("textarea[id=APjFqb]").value = "speed test";
document.querySelector("form[action='/search']:has(input[type=submit][value='Google Search'])").submit();
} catch (err) { }
"""
},{
"method": "wait_for_selector",
"selector": "#result-stats",
"state": "visible"
}]
},
"getters": [{
"method": "default",
}],
"crawler": {
"filters": ["^((?!google\\.com).)*$"],
"depth": 1
}
}],
pre_configs = [{
"regex": "speedtest\\.net",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
},{
"regex": "fast\\.com",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
},{
"regex": "speedcheck\\.org",
"scraper": {
"goto": {
"wait_until": "load"
},
"getters": [{
"method": "default",
}],
}
}],
detailed = True
)
```
193 changes: 192 additions & 1 deletion jaseci_ai_kit/jac_misc/jac_misc/scraper/scraper.py
@@ -2,6 +2,7 @@
from playwright.sync_api import sync_playwright, Page
from typing import Union
from re import search
from copy import deepcopy


@jaseci_action(act_group=["wbs"], allow_remote=True)
@@ -10,7 +11,197 @@ def url_to_filename(url: str):


@jaseci_action(act_group=["wbs"], allow_remote=True)
def scrape(pages: list, pre_configs: list = [], detailed: bool = False):
content = ""
urls = {"scanned": set(), "scraped": set(), "crawled": set()}
with sync_playwright() as spw:
browser = spw.chromium.launch()
page = browser.new_page()

while pages:
pg: dict = pages.pop(0)

goto(page, pg.get("goto") or {}, urls)
content += getters(page, pg.get("getters") or [], urls)
crawler(page, pg.get("crawler") or {}, urls, pages, pre_configs)

browser.close()

content = " ".join(content.split())

if detailed:
return {
"content": content,
"scanned": list(urls["scanned"]),
"scraped": list(urls["scraped"]),
}

return content


def goto(page: Page, specs: dict, urls: dict):
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

print(f'[goto]: loading {specs["url"]}')

page.goto(**specs)
add_url(page, urls)

run_scripts(page, post, urls)


def getters(page: Page, specss: list[dict], urls: dict):
content = ""
for specs in specss:
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

exel_str = ""
for exel in (
specs.get("excluded_element", ["script", "style", "link", "noscript"])
or []
):
exel_str += (
f'clone.querySelectorAll("{exel}").forEach(d => d.remove());\n'
)

method = specs.get("method")
if method == "selector":
expression = f"""
Array.prototype.map.call(
document.querySelectorAll("{specs.get("expression")}"),
d => {{
clone = d.cloneNode(true);
{exel_str}
return clone.textContent;
                    }}).join("\\n");
"""
elif method == "custom":
expression = f'{{{specs.get("expression")}}}'
elif method == "none":
expression = '""'
else:
expression = f"""{{
clone = document.body.cloneNode(true);
{exel_str}
return clone.textContent;
}}"""

if expression:
print(f"[getters]: getting content from {page.url}")
content += page.evaluate(f"() =>{expression}")
add_url(page, urls, expression)

run_scripts(page, post, urls)
return content


def crawler(page: Page, specs: dict, urls: dict, pages: list, pre_configs: list):
if specs:
post = get_script(specs, "post")
run_scripts(page, get_script(specs, "pre"), urls)

queries = specs.get("queries") or [{"selector": "a[href]", "attribute": "href"}]
filters = specs.get("filters") or []
depth = specs.get("depth", 1) or 0

if depth > 0:
for query in queries:
for node in page.query_selector_all(query.get("selector") or "a[href]"):
url = node.get_attribute(query.get("attribute") or "href")
c_url = get_hostname(page)

if url.startswith("/"):
url = f"{c_url}{url}"

if url.startswith("http") and url not in urls["crawled"]:
included = not bool(filters)

for filter in filters:
if search(filter, url):
included = True
break

if included:
add_crawl(
pages,
pre_configs,
urls,
url,
{
"queries": queries,
"depth": depth - 1,
"filters": filters,
},
)

run_scripts(page, post, urls)


def get_script(specs: dict, name: str):
return specs.pop(f"{name}_scripts", []) or []


def run_scripts(page: Page, scripts: list[dict], urls: dict):
for script in scripts:
        method = script.pop("method", "evaluate") or "evaluate"
print(f"[script]: running method {method}\n{str(script)}")
getattr(page, method)(**script)
add_url(page, urls)


def add_url(page: Page, urls: dict, scraped: bool = False):
url = page.url
if url:
if url not in urls["scanned"]:
urls["scanned"].add(url)

if scraped and url not in urls["scraped"]:
urls["scraped"].add(url)


def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict):
urls["crawled"].add(url)
scraper = {
"goto": {
"url": url,
"wait_until": "networkidle",
"pre_scripts": [],
"post_scripts": [],
},
"getters": [{"method": "default"}],
"crawler": def_crawl,
}
for pconf in pre_configs:
if search(pconf["regex"], url):
scraper = deepcopy(pconf["scraper"])
            scraper.setdefault("goto", {})["url"] = url  # ensure a goto block exists before injecting the crawled url
scraper["crawler"] = scraper.get("crawler") or def_crawl
break

pages.append(scraper)


def get_hostname(page: Page):
url = page.url
if url:
splitter = url.split("//")
protocol = splitter[0]
hostname = splitter[1].split("/")[0]
return f"{protocol}//{hostname}"
return url


###########################################################################
# OLD SCRAPER #
###########################################################################


@jaseci_action(act_group=["wbs"], allow_remote=True)
def old_scrape(
urls: set,
scripts: dict = {},
url_filters: list = [],
