From b3db6cdc5e98e55981c53cf6d1aa44e5393de17a Mon Sep 17 00:00:00 2001 From: "Alexie (Boyong) Madolid" Date: Tue, 24 Oct 2023 20:38:30 +0800 Subject: [PATCH] [BUGFIX]: Webtool optional headers MINOR: - Use a different URL for get_meta_need_auth, as the previous URL (https://docs.google.com/presentation/d/1lIYEuzzhZZ9PJaG_u3XgrFXX5Y6xd0zHV-aB2F8bXXU/edit) doesn't have meta tags when loaded without JavaScript --- .../jaseci/extens/act_lib/tests/fixtures/webtool.jac | 6 ++++-- jaseci_core/jaseci/extens/act_lib/webtool.py | 6 ++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac b/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac index 0a0fbd03bd..77af4ba2d5 100644 --- a/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac +++ b/jaseci_core/jaseci/extens/act_lib/tests/fixtures/webtool.jac @@ -5,7 +5,7 @@ walker get_meta_valid { } walker get_meta_need_auth { - has url = "https://docs.google.com/presentation/d/1lIYEuzzhZZ9PJaG_u3XgrFXX5Y6xd0zHV-aB2F8bXXU/edit"; + has url = "https://github.com/settings/profile"; can webtool.get_page_meta; report webtool.get_page_meta(url); } @@ -26,5 +26,7 @@ walker get_meta_timeout { walker get_meta_need_header { has url = "https://www.invaluable.com/blog/what-is-a-mandala/"; can webtool.get_page_meta; - report webtool.get_page_meta(url); + report webtool.get_page_meta(url, headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0" + }); } \ No newline at end of file diff --git a/jaseci_core/jaseci/extens/act_lib/webtool.py b/jaseci_core/jaseci/extens/act_lib/webtool.py index 9150a1fdf2..50a7893857 100644 --- a/jaseci_core/jaseci/extens/act_lib/webtool.py +++ b/jaseci_core/jaseci/extens/act_lib/webtool.py @@ -5,7 +5,7 @@ @jaseci_action() -def get_page_meta(url: str, timeout: int = 3, parser: str = "lxml"): +def get_page_meta(url: str, timeout: int = 3, parser: str = "lxml", headers: dict = {}): """ Util to 
parse metadata out of urls and html documents Parser option: lxml (default), html5lib, html.parser @@ -15,9 +15,7 @@ def get_page_meta(url: str, timeout: int = 3, parser: str = "lxml"): webpage = requests.get( url, timeout=timeout, - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0" - }, + headers=headers, ) soup = BeautifulSoup(webpage.content, features=parser) meta = soup.find_all("meta")