From a4513d03abe875101860ba98f37118526d1a8132 Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:05:19 -0700 Subject: [PATCH 1/6] Add logic for consent screen --- yahooquery/__init__.py | 8 +++---- yahooquery/base.py | 16 +++++++++++--- yahooquery/headless.py | 14 +++++------- yahooquery/misc.py | 6 +++++- yahooquery/utils/__init__.py | 42 ++++++++++++++++++++++++++++++------ 5 files changed, 63 insertions(+), 23 deletions(-) diff --git a/yahooquery/__init__.py b/yahooquery/__init__.py index e3c178f..3ef299b 100644 --- a/yahooquery/__init__.py +++ b/yahooquery/__init__.py @@ -1,11 +1,8 @@ """Python interface to unofficial Yahoo Finance API endpoints""" name = "yahooquery" -__version__ = "2.3.6" +__version__ = "2.3.7" -from .research import Research # noqa -from .ticker import Ticker # noqa -from .screener import Screener # noqa from .misc import ( # noqa get_currencies, get_exchanges, @@ -13,3 +10,6 @@ get_trending, search, ) +from .research import Research # noqa +from .screener import Screener # noqa +from .ticker import Ticker # noqa diff --git a/yahooquery/base.py b/yahooquery/base.py index 458dd07..0be3ca7 100644 --- a/yahooquery/base.py +++ b/yahooquery/base.py @@ -1,4 +1,5 @@ # stdlib +import logging import os import time from concurrent.futures import as_completed @@ -26,6 +27,9 @@ import urlparse as parse +logger = logging.getLogger(__name__) + + class _YahooFinance(object): CHUNK = 1500 @@ -938,11 +942,12 @@ def __init__(self, **kwargs): self.progress = kwargs.pop("progress", False) self.username = kwargs.pop("username", os.getenv("YF_USERNAME", None)) self.password = kwargs.pop("password", os.getenv("YF_PASSWORD", None)) + self._setup_url = kwargs.pop("setup_url", os.getenv("YF_SETUP_URL", None)) self.session = initialize_session(kwargs.pop("session", None), **kwargs) if self.username and self.password: self.login() else: - self.session = setup_session(self.session) + self.session = setup_session(self.session, self._setup_url) self.crumb = get_crumb(self.session) @property @@ -991,13 +996,18 @@ def default_query_params(self): params["crumb"] = self.crumb return params - def login(self): + def login(self) -> None: if _has_selenium: instance = YahooFinanceHeadless(self.username, self.password) instance.login() self.session.cookies = instance.cookies - return [] + else: + logger.warning( + "You do not have the required libraries to use this feature. Install " + "with the following: `pip install yahooquery[premium]`" + ) + self.session = setup_session(self.session, self._setup_url) def _chunk_symbols(self, key, params={}, chunk=None, **kwargs): current_symbols = self.symbols diff --git a/yahooquery/headless.py b/yahooquery/headless.py index 55c723d..4fd2db2 100644 --- a/yahooquery/headless.py +++ b/yahooquery/headless.py @@ -7,13 +7,11 @@ try: # third party from selenium import webdriver - from selenium.common.exceptions import NoSuchElementException, TimeoutException - from selenium.webdriver.chrome.options import Options - from selenium.webdriver.chrome.service import Service as ChromeService + from selenium.common.exceptions import TimeoutException + from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait - from webdriver_manager.chrome import ChromeDriverManager except ImportError: # Selenium was not installed _has_selenium = False @@ -28,16 +26,14 @@ def __init__(self, username: str, password: str): self.username = username self.password = password self.cookies = RequestsCookieJar() - chrome_options = Options() + chrome_options = webdriver.ChromeOptions() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--log-level=3") chrome_options.add_argument("--ignore-certificate-errors") chrome_options.add_argument("--ignore-ssl-errors") - self.driver = webdriver.Chrome( - service=ChromeService(ChromeDriverManager().install()), - options=chrome_options, - ) + service = Service() + self.driver = webdriver.Chrome(service=service, options=chrome_options) def login(self): try: diff --git a/yahooquery/misc.py b/yahooquery/misc.py index 49e96b3..25de24f 100644 --- a/yahooquery/misc.py +++ b/yahooquery/misc.py @@ -1,3 +1,6 @@ +# stdlib +import os + # third party import pandas as pd @@ -20,8 +23,9 @@ def _make_request( country, ", ".join(sorted(COUNTRIES.keys())) ) ) + setup_url = kwargs.pop("setup_url", os.getenv("YF_SETUP_URL", None)) session = initialize_session(**kwargs) - session = setup_session(session) + session = setup_session(session, setup_url) crumb = get_crumb(session) if crumb is not None: params["crumb"] = crumb diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py index 93c9a43..3347242 100644 --- a/yahooquery/utils/__init__.py +++ b/yahooquery/utils/__init__.py @@ -7,6 +7,7 @@ # third party import pandas as pd import requests +from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError, RetryError, SSLError from requests.packages.urllib3.util.retry import Retry @@ -17,7 +18,7 @@ DEFAULT_TIMEOUT = 5 - +DEFAULT_SESSION_URL = "https://finance.yahoo.com" CRUMB_FAILURE = ( "Failed to obtain crumb. Ability to retrieve data will be significantly limited." ) @@ -1366,8 +1367,8 @@ def initialize_session(session=None, **kwargs): return session -def setup_session(session: requests.Session): - url = "https://finance.yahoo.com" +def setup_session(session: requests.Session, url: str = None): + url = url or DEFAULT_SESSION_URL try: response = session.get(url, allow_redirects=True) except SSLError: @@ -1380,10 +1381,39 @@ def setup_session(session: requests.Session): except SSLError: counter += 1 - if not isinstance(session, FuturesSession): - return session + if isinstance(session, FuturesSession): + response = response.result() + + # check for and handle consent page:w + if response.url.find("consent"): + logger.debug(f'Redirected to consent page: "{response.url}"') + + soup = BeautifulSoup(response.content, "html.parser") + + params = {} + for param in ["csrfToken", "sessionId"]: + try: + params[param] = soup.find("input", attrs={"name": param})["value"] + except Exception as exc: + logger.critical( + f'Failed to find or extract "{param}" from response. Exception={exc}' + ) + return + + logger.debug(f"params: {params}") + + response = session.post( + "https://consent.yahoo.com/v2/collectConsent", + data={ + "agree": ["agree", "agree"], + "consentUUID": "default", + "sessionId": params["sessionId"], + "csrfToken": params["csrfToken"], + "originalDoneUrl": url, + "namespace": "yahoo", + }, + ) - _ = response.result() return session From 928bdb9dce83fa0ceb78a2bd1fc439fbbcdd05b9 Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:05:39 -0700 Subject: [PATCH 2/6] Add changelog for 2.3.7 --- CHANGELOG.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 079f8e5..878dac5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,23 @@ Change Log ========== +2.3.7 +----- +## Add +- Logic for handling setting up a session when a consent screen is encountered. This is primarily seen in European countries + and should allow for the continued use of this package. +- Keyword argument, `setup_url`, to the base `_YahooFinance` class that allows a user to override the url used in setting up the session. As a default + the Yahoo Finance home page is used (https://finance.yahoo.com). You can also create an environment variable, `YF_SETUP_URL` that will be used if set. + Example usage: + ```python + import yahooquery as yq + + t = yq.Ticker('aapl', setup_url='https://finance.yahoo.com/quote/AAPL') + ``` + +## Remove +- Webdriver manager is no longer used internally. Selenium Manager is now fully included with selenium `4.10.0`, so this package is no longer needed. + 2.3.6 ----- ## Fix From f2e4ee94f566a94624682fdcce3a4c25ecf8b060 Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:06:55 -0700 Subject: [PATCH 3/6] Fix trailing whitespace --- CHANGELOG.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 878dac5..53b7d99 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -127,7 +127,7 @@ Change Log to adjust the timezone (:code:`adj_timezone`) to the ticker's timezone. It defaults to :code:`True`. - Further documentation of acceptable keyword arguments to the :code:`Ticker` class. -- :code:`Ticker.news` is now a method. It accepts two arguments: :code:`count` - +- :code:`Ticker.news` is now a method. It accepts two arguments: :code:`count` - number of items to return; :code:`start` - start date to begin retrieving news items from - Bug fixes: :code:`Ticker.history` method no longer returns extra rows when retrieving intraday data. @@ -148,12 +148,12 @@ Change Log :code:`p_valuation_measures` and supply either :code:`a`, :code:`q`, or :code:`m` (annual, quarterly, monthly). The data returned with these can be seen in the `Statistics` tab through the Yahoo Finance front-end. - + .. image:: demo/valuation_measures.PNG 2.2.2 ----- -- Fix bug in retrieving cash flow / income statement data. Most recent month was +- Fix bug in retrieving cash flow / income statement data. Most recent month was combining with TTM. A new column was created in the dataframe called 'periodType'. Annual data will be shown as '12M', quarterly data will be shown as '3M', and trailing 12 month data will be shown as 'TTM'. From 3a06edb3d8f6304e6aab2b00f44065c41f97a4dd Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:07:09 -0700 Subject: [PATCH 4/6] Update package dependencies --- poetry.lock | 63 +++++++++++++++++++++++++------------------------- pyproject.toml | 6 ++--- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6c72f6f..7f2e1ff 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. [[package]] name = "appnope" @@ -76,6 +76,24 @@ files = [ {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, ] +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.10.1" @@ -1507,20 +1525,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-dotenv" -version = "1.0.0" -description = "Read key-value pairs from a .env file and set them as environment variables" -optional = true -python-versions = ">=3.8" -files = [ - {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, - {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, -] - -[package.extras] -cli = ["click (>=5.0)"] - [[package]] name = "pytz" version = "2023.3.post1" @@ -1806,6 +1810,17 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "stack-data" version = "0.6.3" @@ -2019,22 +2034,6 @@ files = [ {file = "wcwidth-0.2.9.tar.gz", hash = "sha256:a675d1a4a2d24ef67096a04b85b02deeecd8e226f57b5e3a72dbb9ed99d27da8"}, ] -[[package]] -name = "webdriver-manager" -version = "3.9.1" -description = "Library provides the way to automatically manage drivers for different browsers" -optional = true -python-versions = ">=3.7" -files = [ - {file = "webdriver_manager-3.9.1-py2.py3-none-any.whl", hash = "sha256:1dfc29a786abb97ba28076d4766d931064eeeac71a9685a3e8d46f5d363fcbe3"}, - {file = "webdriver_manager-3.9.1.tar.gz", hash = "sha256:cd1f49ebb325a98b4dc3c41056f5b645e82fff3f83e346607844ec0bdf561c0b"}, -] - -[package.dependencies] -packaging = "*" -python-dotenv = "*" -requests = "*" - [[package]] name = "wsproto" version = "1.2.0" @@ -2070,4 +2069,4 @@ premium = ["selenium", "webdriver-manager"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "5bb142dfc188dfb4ea5909389ba272201a2148343ae8d05e3d3e5f3530604663" +content-hash = "79b43190183d3827f506292ea737f1cd95cad8d73703b9cd098e1a0bc4452744" diff --git a/pyproject.toml b/pyproject.toml index 6509077..9d3fb29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "yahooquery" -version = "2.3.6" +version = "2.3.7" description = "Python wrapper for an unofficial Yahoo Finance API" authors = ["Doug Guthrie "] documentation = "https://yahooquery.dpguthrie.com" @@ -16,7 +16,7 @@ requests-futures = "^1.0.1" tqdm = "^4.65.0" lxml = "^4.9.3" selenium = {version = "^4.10.0", optional = true} -webdriver-manager = {version = "^3.8.6", optional = true} +beautifulsoup4 = "^4.12.2" [tool.poetry.dev-dependencies] pytest = "^7.4.0" @@ -34,4 +34,4 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.poetry.extras] -premium = ["selenium", "webdriver-manager"] +premium = ["selenium"] From 2047ddc3a7d0d027a4ddddf95434057b04014dc9 Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:32:59 -0700 Subject: [PATCH 5/6] Add logic to setup a session when logging in is unsuccessful --- yahooquery/base.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/yahooquery/base.py b/yahooquery/base.py index 0be3ca7..0162d29 100644 --- a/yahooquery/base.py +++ b/yahooquery/base.py @@ -1000,14 +1000,23 @@ def login(self) -> None: if _has_selenium: instance = YahooFinanceHeadless(self.username, self.password) instance.login() - self.session.cookies = instance.cookies + if instance.cookies: + self.session.cookies = instance.cookies + return + + else: + logger.warning( + "Unable to login and/or retrieve the appropriate cookies. This is " + "most likely due to Yahoo Finance instituting recaptcha, which " + "this package does not support." + ) else: logger.warning( "You do not have the required libraries to use this feature. Install " "with the following: `pip install yahooquery[premium]`" ) - self.session = setup_session(self.session, self._setup_url) + self.session = setup_session(self.session, self._setup_url) def _chunk_symbols(self, key, params={}, chunk=None, **kwargs): current_symbols = self.symbols From 399284b7dd0239736cef9d0a53c8bf01e034eb67 Mon Sep 17 00:00:00 2001 From: Doug Guthrie Date: Sat, 16 Dec 2023 11:33:55 -0700 Subject: [PATCH 6/6] Update logic for finding consent in url --- yahooquery/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py index 3347242..8fad8ac 100644 --- a/yahooquery/utils/__init__.py +++ b/yahooquery/utils/__init__.py @@ -1385,7 +1385,7 @@ def setup_session(session: requests.Session, url: str = None): response = response.result() # check for and handle consent page:w - if response.url.find("consent"): + if response.url.find("consent") >= 0: logger.debug(f'Redirected to consent page: "{response.url}"') soup = BeautifulSoup(response.content, "html.parser") @@ -1398,7 +1398,7 @@ def setup_session(session: requests.Session, url: str = None): logger.critical( f'Failed to find or extract "{param}" from response. Exception={exc}' ) - return + return session logger.debug(f"params: {params}")