-
Notifications
You must be signed in to change notification settings - Fork 216
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1067 from Jaseci-Labs/replace_metadata_parser
Replace metadataparser with a beautifulsoup4 solution
- Loading branch information
Showing 4 changed files with 31 additions and 56 deletions.
There are no files selected for viewing
34 changes: 5 additions & 29 deletions
jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,18 @@ | ||
walker get_meta_valid { | ||
has url = "https://pypi.org/project/metadata-parser/"; | ||
can webtool.get_page_meta; | ||
|
||
root { | ||
try { | ||
report webtool.get_page_meta(url); | ||
} else with error { | ||
report error; | ||
report:error = error; | ||
} | ||
} | ||
report webtool.get_page_meta(url); | ||
} | ||
|
||
walker get_meta_403_response { | ||
has url = "https://www.wsj.com/articles/why-this-housing-downturn-isnt-like-the-last-one-11671273004?mod=hp_lead_pos1"; | ||
walker get_meta_need_auth { | ||
has url = "https://docs.google.com/presentation/d/1lIYEuzzhZZ9PJaG_u3XgrFXX5Y6xd0zHV-aB2F8bXXU/edit"; | ||
can webtool.get_page_meta; | ||
|
||
root { | ||
try { | ||
report webtool.get_page_meta(url); | ||
} else with error { | ||
report error; | ||
report:error = error; | ||
} | ||
} | ||
report webtool.get_page_meta(url); | ||
} | ||
|
||
|
||
walker get_meta_invalid { | ||
has url = ""; | ||
can webtool.get_page_meta; | ||
|
||
root { | ||
try { | ||
report webtool.get_page_meta(url); | ||
} else with error { | ||
report error; | ||
report:error = error; | ||
} | ||
} | ||
report webtool.get_page_meta(url); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,22 @@ | ||
"""Built in actions for Jaseci""" | ||
from fastapi import HTTPException | ||
import requests | ||
from jaseci.jsorc.live_actions import jaseci_action | ||
import metadata_parser | ||
from bs4 import BeautifulSoup | ||
|
||
|
||
@jaseci_action() | ||
def get_page_meta(url: str = ""): | ||
def get_page_meta(url: str): | ||
""" | ||
Util to parse metadata out of urls and html documents | ||
""" | ||
|
||
if url == "": | ||
raise HTTPException(status_code=400, detail=str("No url provided")) | ||
|
||
try: | ||
page = metadata_parser.MetadataParser( | ||
url=url, | ||
url_headers={ | ||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0", | ||
}, | ||
) | ||
return page.metadata | ||
webpage = requests.get(url) | ||
soup = BeautifulSoup(webpage.content, features="lxml") | ||
meta = soup.find_all("meta") | ||
meta_list = [] | ||
for tag in meta: | ||
meta_list.append(dict(tag.attrs)) | ||
return meta_list | ||
except Exception as e: | ||
raise HTTPException(status_code=500, detail=str(e)) | ||
print("Failed") | ||
return f"Failed at getting metadata for {url}: {str(e)}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters