diff --git a/.bandit.yml b/.bandit.yml index bb9aab2..4f60a02 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -1,4 +1,6 @@ skips: - B101 +- B311 - B320 - B410 +exclude_dirs: ['tests'] diff --git a/.flake8 b/.flake8 index d086822..7e5efc6 100644 --- a/.flake8 +++ b/.flake8 @@ -9,6 +9,7 @@ per-file-ignores = setup.py:E501 tests/test_selector.py:E501 tests/test_selector_csstranslator.py:E501 + tests/test_selector_jmespath.py:E501 tests/test_utils.py:E501 tests/test_xpathfuncs.py:E501 tests/typing/*.py:E,F diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..00d5546 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# applying pre-commit hooks to the project +a57c23e3b7be0f001595bd8767fe05e40a66e730 \ No newline at end of file diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 22576e0..0aa7558 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -10,10 +10,7 @@ jobs: include: - python-version: "3.12" env: - TOXENV: security - - python-version: "3.12" - env: - TOXENV: flake8 + TOXENV: pre-commit - python-version: "3.12" env: TOXENV: pylint @@ -23,9 +20,6 @@ jobs: - python-version: "3.12" env: TOXENV: typing - - python-version: "3.12" - env: - TOXENV: black - python-version: "3.12" env: TOXENV: twinecheck diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..6860bdb --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9f1a2f1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/PyCQA/bandit + rev: 1.7.7 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 +- repo: https://github.com/psf/black.git + rev: 24.1.1 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 689af48..4d7b0d6 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,7 +3,6 @@ import os import sys - # Get the project root dir, which is the parent dir of this cwd = os.getcwd() project_root = os.path.dirname(cwd) @@ -13,8 +12,7 @@ # version is used. sys.path.insert(0, project_root) -import parsel - +import parsel # noqa: E402 # -- General configuration --------------------------------------------- @@ -98,10 +96,9 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1) + ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1), ] - # -- Options for Texinfo output ---------------------------------------- # Grouping the document tree into Texinfo files. List of tuples diff --git a/parsel/__init__.py b/parsel/__init__.py index 955f3df..cd00fe6 100644 --- a/parsel/__init__.py +++ b/parsel/__init__.py @@ -13,8 +13,8 @@ "xpathfuncs", ] -from parsel.selector import Selector, SelectorList # NOQA -from parsel.csstranslator import css2xpath # NOQA from parsel import xpathfuncs # NOQA +from parsel.csstranslator import css2xpath # NOQA +from parsel.selector import Selector, SelectorList # NOQA xpathfuncs.setup() diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py index e1adc0a..ac6af32 100644 --- a/parsel/csstranslator.py +++ b/parsel/csstranslator.py @@ -3,10 +3,9 @@ from cssselect import GenericTranslator as OriginalGenericTranslator from cssselect import HTMLTranslator as OriginalHTMLTranslator -from cssselect.xpath import XPathExpr as OriginalXPathExpr -from cssselect.xpath import ExpressionError from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement - +from cssselect.xpath import ExpressionError +from cssselect.xpath import XPathExpr as OriginalXPathExpr if TYPE_CHECKING: # typing.Self requires Python 3.11 @@ -25,9 +24,7 @@ def from_xpath( textnode: bool = False, attribute: Optional[str] = None, ) -> "Self": - x = cls( - path=xpath.path, element=xpath.element, condition=xpath.condition - ) + x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) x.textnode = textnode x.attribute = attribute return x @@ -82,9 +79,7 @@ class TranslatorMixin: Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. """ - def xpath_element( - self: TranslatorProtocol, selector: Element - ) -> XPathExpr: + def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr: # https://github.com/python/mypy/issues/12344 xpath = super().xpath_element(selector) # type: ignore[safe-super] return XPathExpr.from_xpath(xpath) @@ -104,7 +99,9 @@ def xpath_pseudo_element( ) xpath = method(xpath, pseudo_element) else: - method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" + method_name = ( + f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" + ) method = getattr(self, method_name, None) if not method: raise ExpressionError( @@ -121,30 +118,22 @@ def xpath_attr_functional_pseudo_element( raise ExpressionError( f"Expected a single string or ident for ::attr(), got {function.arguments!r}" # noqa: E231 ) - return XPathExpr.from_xpath( - xpath, attribute=function.arguments[0].value - ) + return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) - def xpath_text_simple_pseudo_element( - self, xpath: OriginalXPathExpr - ) -> XPathExpr: + def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr: """Support selecting text nodes using ::text pseudo-element""" return XPathExpr.from_xpath(xpath, textnode=True) class GenericTranslator(TranslatorMixin, OriginalGenericTranslator): @lru_cache(maxsize=256) - def css_to_xpath( - self, css: str, prefix: str = "descendant-or-self::" - ) -> str: + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: return super().css_to_xpath(css, prefix) class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): @lru_cache(maxsize=256) - def css_to_xpath( - self, css: str, prefix: str = "descendant-or-self::" - ) -> str: + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: return super().css_to_xpath(css, prefix) diff --git a/parsel/selector.py b/parsel/selector.py index 11d7979..dd9c936 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -29,7 +29,6 @@ from .csstranslator import GenericTranslator, HTMLTranslator from .utils import extract_regex, flatten, iflatten, shorten - _SelectorType = TypeVar("_SelectorType", bound="Selector") _ParserType = Union[etree.XMLParser, etree.HTMLParser] # simplified _OutputMethodArg from types-lxml @@ -135,18 +134,14 @@ def __getitem__( ) -> Union[_SelectorType, "SelectorList[_SelectorType]"]: o = super().__getitem__(pos) if isinstance(pos, slice): - return self.__class__( - typing.cast("SelectorList[_SelectorType]", o) - ) + return self.__class__(typing.cast("SelectorList[_SelectorType]", o)) else: return typing.cast(_SelectorType, o) def __getstate__(self) -> None: raise TypeError("can't pickle SelectorList objects") - def jmespath( - self, query: str, **kwargs: Any - ) -> "SelectorList[_SelectorType]": + def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": """ Call the ``.jmespath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. @@ -158,9 +153,7 @@ def jmespath( selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) """ - return self.__class__( - flatten([x.jmespath(query, **kwargs) for x in self]) - ) + return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) def xpath( self, @@ -185,9 +178,7 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten( - [x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self] - ) + flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) ) def css(self, query: str) -> "SelectorList[_SelectorType]": @@ -211,9 +202,7 @@ def re( Passing ``replace_entities`` as ``False`` switches off these replacements. """ - return flatten( - [x.re(regex, replace_entities=replace_entities) for x in self] - ) + return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) @typing.overload def re_first( @@ -316,9 +305,7 @@ def drop(self) -> None: _NOT_SET = object() -def _get_root_from_text( - text: str, *, type: str, **lxml_kwargs: Any -) -> etree._Element: +def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element: return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs) @@ -583,9 +570,7 @@ def make_selector(x: Any) -> _SelectorType: # closure function return self.__class__(root=x, _expr=query) result = [make_selector(x) for x in result] - return typing.cast( - SelectorList[_SelectorType], self.selectorlist_cls(result) - ) + return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) def xpath( self: _SelectorType, @@ -611,9 +596,7 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ if self.type not in ("html", "xml", "text"): - raise ValueError( - f"Cannot use xpath on a Selector of type {self.type!r}" - ) + raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}") if self.type in ("html", "xml"): try: xpathev = self.root.xpath @@ -654,9 +637,7 @@ def xpath( ) for x in result ] - return typing.cast( - SelectorList[_SelectorType], self.selectorlist_cls(result) - ) + return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: """ @@ -670,9 +651,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: .. _cssselect: https://pypi.python.org/pypi/cssselect/ """ if self.type not in ("html", "xml", "text"): - raise ValueError( - f"Cannot use css on a Selector of type {self.type!r}" - ) + raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") return self.xpath(self._css2xpath(query)) def _css2xpath(self, query: str) -> str: diff --git a/parsel/utils.py b/parsel/utils.py index 2677f47..ec77d74 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -1,5 +1,6 @@ import re from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast + from w3lib.html import replace_entities as w3lib_replace_entities diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py index e8cea0a..7b984c5 100644 --- a/parsel/xpathfuncs.py +++ b/parsel/xpathfuncs.py @@ -2,10 +2,8 @@ from typing import Any, Callable, Optional from lxml import etree - from w3lib.html import HTML5_WHITESPACE - regex = f"[{HTML5_WHITESPACE}]+" replace_html5_whitespaces = re.compile(regex).sub @@ -43,14 +41,10 @@ def has_class(context: Any, *classes: str) -> bool: """ if not context.eval_context.get("args_checked"): if not classes: - raise ValueError( - "XPath error: has-class must have at least 1 argument" - ) + raise ValueError("XPath error: has-class must have at least 1 argument") for c in classes: if not isinstance(c, str): - raise ValueError( - "XPath error: has-class arguments must be strings" - ) + raise ValueError("XPath error: has-class arguments must be strings") context.eval_context["args_checked"] = True node_cls = context.context_node.get("class") diff --git a/setup.py b/setup.py index 8ba4b0c..1be8413 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ from setuptools import setup - with open("README.rst", encoding="utf-8") as readme_file: readme = readme_file.read() diff --git a/tests/test_selector.py b/tests/test_selector.py index 645b999..96713f9 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,11 +1,10 @@ +import pickle import re +import typing +import unittest import warnings import weakref -import unittest -import pickle - -import typing -from typing import cast, Any, Optional, Mapping +from typing import Any, Mapping, Optional, cast from lxml import etree from lxml.html import HtmlElement @@ -13,10 +12,10 @@ from parsel import Selector, SelectorList from parsel.selector import ( - CannotRemoveElementWithoutRoot, - CannotRemoveElementWithoutParent, - LXML_SUPPORTS_HUGE_TREE, _NOT_SET, + LXML_SUPPORTS_HUGE_TREE, + CannotRemoveElementWithoutParent, + CannotRemoveElementWithoutRoot, ) @@ -32,9 +31,7 @@ def assertIsSelectorList(self, value: Any) -> None: def test_pickle_selector(self) -> None: sel = self.sscls(text="
some text
") - self.assertRaises( - TypeError, lambda s: pickle.dumps(s, protocol=2), sel - ) + self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) def test_pickle_selector_list(self) -> None: sel = self.sscls( @@ -44,9 +41,7 @@ def test_pickle_selector_list(self) -> None: empty_sel_list = sel.css("p") self.assertIsSelectorList(sel_list) self.assertIsSelectorList(empty_sel_list) - self.assertRaises( - TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list - ) + self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list ) @@ -99,10 +94,7 @@ def test_simple_selection_with_variables(self) -> None: sel = self.sscls(text=body) self.assertEqual( - [ - x.extract() - for x in sel.xpath("//input[@value=$number]/@name", number=1) - ], + [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], ["a"], ) self.assertEqual( @@ -124,15 +116,11 @@ def test_simple_selection_with_variables(self) -> None: # you can also pass booleans self.assertEqual( - sel.xpath( - "boolean(count(//input)=$cnt)=$test", cnt=2, test=True - ).extract(), + sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), ["1"], ) self.assertEqual( - sel.xpath( - "boolean(count(//input)=$cnt)=$test", cnt=4, test=True - ).extract(), + sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), ["0"], ) self.assertEqual( @@ -162,16 +150,11 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: t = 'I say "Yeah!"' # naive string formatting with give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name - self.assertRaises( - ValueError, sel.xpath, f'//input[@value="{t}"]/@name' - ) + self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name') # with XPath variables, escaping is done for you self.assertEqual( - [ - x.extract() - for x in sel.xpath("//input[@value=$text]/@name", text=t) - ], + [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], ["a"], ) lt = """I'm mixing single and "double quotes" and I don't care :)""" @@ -184,9 +167,7 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: self.assertEqual( [ x.extract() - for x in sel.xpath( - "//p[normalize-space()=$lng]//@name", lng=lt - ) + for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt) ], ["a"], ) @@ -210,9 +191,7 @@ def test_accessing_attributes(self) -> None: ) # for a SelectorList, bring the attributes of first-element only - self.assertEqual( - {"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib - ) + self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib) self.assertEqual( {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib ) @@ -232,9 +211,7 @@ def test_representation_slice(self) -> None: body = f"" sel = self.sscls(text=body) - representation = ( - f"