Skip to content

Commit

Permalink
Merge pull request #1067 from Jaseci-Labs/replace_metadata_parser
Browse files: browse the repository at this point in the history
Replace metadataparser with a beautifulsoup4 solution
  • Loading branch information
marsninja authored Apr 18, 2023
2 parents 071f82b + 99a933c commit f2f3038
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 56 deletions.
34 changes: 5 additions & 29 deletions jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac
Original file line number | Diff line number | Diff line change
@@ -1,42 +1,18 @@
walker get_meta_valid {
has url = "https://pypi.org/project/metadata-parser/";
can webtool.get_page_meta;

root {
try {
report webtool.get_page_meta(url);
} else with error {
report error;
report:error = error;
}
}
report webtool.get_page_meta(url);
}

walker get_meta_403_response {
has url = "https://www.wsj.com/articles/why-this-housing-downturn-isnt-like-the-last-one-11671273004?mod=hp_lead_pos1";
walker get_meta_need_auth {
has url = "https://docs.google.com/presentation/d/1lIYEuzzhZZ9PJaG_u3XgrFXX5Y6xd0zHV-aB2F8bXXU/edit";
can webtool.get_page_meta;

root {
try {
report webtool.get_page_meta(url);
} else with error {
report error;
report:error = error;
}
}
report webtool.get_page_meta(url);
}


walker get_meta_invalid {
has url = "";
can webtool.get_page_meta;

root {
try {
report webtool.get_page_meta(url);
} else with error {
report error;
report:error = error;
}
}
report webtool.get_page_meta(url);
}
23 changes: 12 additions & 11 deletions jaseci_core/jaseci/extens/act_lib/tests/test_webtool.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,19 +9,20 @@ class WebtoolTest(CoreTest):
@jac_testcase("webtool.jac", "get_meta_valid")
def test_get_meta_valid(self, ret):
self.assertTrue(ret["success"])
self.assertTrue("og" in ret["report"][0])
self.assertTrue("meta" in ret["report"][0])
self.assertTrue("dc" in ret["report"][0])
self.assertTrue("page" in ret["report"][0])
expected_tags = set(["og:image", "og:type", "og:title"])
tags = set(
[
meta["property"] if "property" in meta else ""
for meta in ret["report"][0]
]
)
self.assertTrue(tags.issuperset(expected_tags))

@jac_testcase("webtool.jac", "get_meta_403_response")
def test_get_meta_403_response(self, ret):
@jac_testcase("webtool.jac", "get_meta_need_auth")
def test_get_meta_need_auth(self, ret):
self.assertTrue(ret["success"])
self.assertTrue("og" in ret["report"][0])
self.assertTrue("meta" in ret["report"][0])
self.assertTrue("dc" in ret["report"][0])
self.assertTrue("page" in ret["report"][0])
self.assertTrue(len(ret["report"][0]) > 0)

@jac_testcase("webtool.jac", "get_meta_invalid")
def test_get_meta_invalid(self, ret):
self.assertFalse(ret["success"])
self.assertTrue("Failed at getting metadata" in ret["report"][0])
27 changes: 12 additions & 15 deletions jaseci_core/jaseci/extens/act_lib/webtool.py
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,22 @@
"""Built in actions for Jaseci"""
from fastapi import HTTPException
import requests
from jaseci.jsorc.live_actions import jaseci_action
import metadata_parser
from bs4 import BeautifulSoup


@jaseci_action()
def get_page_meta(url: str = ""):
def get_page_meta(url: str):
"""
Util to parse metadata out of urls and html documents
"""

if url == "":
raise HTTPException(status_code=400, detail=str("No url provided"))

try:
page = metadata_parser.MetadataParser(
url=url,
url_headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0",
},
)
return page.metadata
webpage = requests.get(url)
soup = BeautifulSoup(webpage.content, features="lxml")
meta = soup.find_all("meta")
meta_list = []
for tag in meta:
meta_list.append(dict(tag.attrs))
return meta_list
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
print("Failed")
return f"Failed at getting metadata for {url}: {str(e)}"
3 changes: 2 additions & 1 deletion jaseci_core/setup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,11 +32,12 @@ def get_ver():
"pytest-xdist",
"pytest-cov",
"gprof2dot",
"metadata_parser",
"validators",
"psycopg2-binary==2.9.5",
"pygls",
"mock",
"beautifulsoup4>=4.12.2, <4.13.0",
"lxml>=4.9.2, <4.10.0",
],
package_data={
"": ["*.ini", "*.yaml", "jac.g4", "VERSION"],
Expand Down

0 comments on commit f2f3038

Please sign in to comment.