From 794baaaa28a8b84ae32a4cd03b9347dd81875bf9 Mon Sep 17 00:00:00 2001 From: Yiping Kang Date: Tue, 18 Apr 2023 16:17:12 -0400 Subject: [PATCH 1/2] Replace metadataparser with a beautifulsoup4 solution --- .../extens/act_lib/tests/fixtures/webtool.jac | 34 +++---------------- .../extens/act_lib/tests/test_webtool.py | 23 +++++++------ jaseci_core/jaseci/extens/act_lib/webtool.py | 27 +++++++-------- jaseci_core/setup.py | 2 +- 4 files changed, 30 insertions(+), 56 deletions(-) diff --git a/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac b/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac index c2b8335e0c..f759185237 100644 --- a/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac +++ b/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac @@ -1,42 +1,18 @@ walker get_meta_valid { has url = "https://pypi.org/project/metadata-parser/"; can webtool.get_page_meta; - - root { - try { - report webtool.get_page_meta(url); - } else with error { - report error; - report:error = error; - } - } + report webtool.get_page_meta(url); } -walker get_meta_403_response { - has url = "https://www.wsj.com/articles/why-this-housing-downturn-isnt-like-the-last-one-11671273004?mod=hp_lead_pos1"; +walker get_meta_need_auth { + has url = "https://docs.google.com/presentation/d/1lIYEuzzhZZ9PJaG_u3XgrFXX5Y6xd0zHV-aB2F8bXXU/edit"; can webtool.get_page_meta; - - root { - try { - report webtool.get_page_meta(url); - } else with error { - report error; - report:error = error; - } - } + report webtool.get_page_meta(url); } walker get_meta_invalid { has url = ""; can webtool.get_page_meta; - - root { - try { - report webtool.get_page_meta(url); - } else with error { - report error; - report:error = error; - } - } + report webtool.get_page_meta(url); } diff --git a/jaseci_core/jaseci/extens/act_lib/tests/test_webtool.py b/jaseci_core/jaseci/extens/act_lib/tests/test_webtool.py index 8ace904628..fdeff7ee8d 100644 --- a/jaseci_core/jaseci/extens/act_lib/tests/test_webtool.py +++ b/jaseci_core/jaseci/extens/act_lib/tests/test_webtool.py @@ -9,19 +9,20 @@ class WebtoolTest(CoreTest): @jac_testcase("webtool.jac", "get_meta_valid") def test_get_meta_valid(self, ret): self.assertTrue(ret["success"]) - self.assertTrue("og" in ret["report"][0]) - self.assertTrue("meta" in ret["report"][0]) - self.assertTrue("dc" in ret["report"][0]) - self.assertTrue("page" in ret["report"][0]) + expected_tags = set(["og:image", "og:type", "og:title"]) + tags = set( + [ + meta["property"] if "property" in meta else "" + for meta in ret["report"][0] + ] + ) + self.assertTrue(tags.issuperset(expected_tags)) - @jac_testcase("webtool.jac", "get_meta_403_response") - def test_get_meta_403_response(self, ret): + @jac_testcase("webtool.jac", "get_meta_need_auth") + def test_get_meta_need_auth(self, ret): self.assertTrue(ret["success"]) - self.assertTrue("og" in ret["report"][0]) - self.assertTrue("meta" in ret["report"][0]) - self.assertTrue("dc" in ret["report"][0]) - self.assertTrue("page" in ret["report"][0]) + self.assertTrue(len(ret["report"][0]) > 0) @jac_testcase("webtool.jac", "get_meta_invalid") def test_get_meta_invalid(self, ret): - self.assertFalse(ret["success"]) + self.assertTrue("Failed at getting metadata" in ret["report"][0]) diff --git a/jaseci_core/jaseci/extens/act_lib/webtool.py b/jaseci_core/jaseci/extens/act_lib/webtool.py index 585a783e1c..3c67297d00 100644 --- a/jaseci_core/jaseci/extens/act_lib/webtool.py +++ b/jaseci_core/jaseci/extens/act_lib/webtool.py @@ -1,25 +1,22 @@ """Built in actions for Jaseci""" -from fastapi import HTTPException +import requests from jaseci.jsorc.live_actions import jaseci_action -import metadata_parser +from bs4 import BeautifulSoup @jaseci_action() -def get_page_meta(url: str = ""): +def get_page_meta(url: str): """ Util to parse metadata out of urls and html documents """ - - if url == "": - raise HTTPException(status_code=400, detail=str("No url provided")) - try: - page = metadata_parser.MetadataParser( - url=url, - url_headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0", - }, - ) - return page.metadata + webpage = requests.get(url) + soup = BeautifulSoup(webpage.content, features="lxml") + meta = soup.find_all("meta") + meta_list = [] + for tag in meta: + meta_list.append(dict(tag.attrs)) + return meta_list except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) + print("Failed") + return f"Failed at getting metadata for {url}: {str(e)}" diff --git a/jaseci_core/setup.py b/jaseci_core/setup.py index a4beb66d90..20a9db7a4b 100644 --- a/jaseci_core/setup.py +++ b/jaseci_core/setup.py @@ -32,11 +32,11 @@ def get_ver(): "pytest-xdist", "pytest-cov", "gprof2dot", - "metadata_parser", "validators", "psycopg2-binary==2.9.5", "pygls", "mock", + "beautifulsoup4>=4.12.2, <4.13.0", ], package_data={ "": ["*.ini", "*.yaml", "jac.g4", "VERSION"], From 99a933c340f34da011aa7ac01558d58e473ee9d5 Mon Sep 17 00:00:00 2001 From: Yiping Kang Date: Tue, 18 Apr 2023 16:22:42 -0400 Subject: [PATCH 2/2] Add lxml dependency --- jaseci_core/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jaseci_core/setup.py b/jaseci_core/setup.py index 20a9db7a4b..9fab03bfc3 100644 --- a/jaseci_core/setup.py +++ b/jaseci_core/setup.py @@ -37,6 +37,7 @@ def get_ver(): "pygls", "mock", "beautifulsoup4>=4.12.2, <4.13.0", + "lxml>=4.9.2, <4.10.0", ], package_data={ "": ["*.ini", "*.yaml", "jac.g4", "VERSION"],