Skip to content

Commit

Permalink
Merge pull request #743 from lindsay-stevens/performance
Browse files Browse the repository at this point in the history
More performance improvements
  • Loading branch information
lindsay-stevens authored Dec 24, 2024
2 parents 5b25fa2 + ef662d4 commit 854ffe6
Show file tree
Hide file tree
Showing 13 changed files with 283 additions and 202 deletions.
4 changes: 2 additions & 2 deletions pyxform/entities/entity_declaration.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, name: str, type: str, parameters: dict, **kwargs):
super().__init__(name=name, **kwargs)

def xml_instance(self, **kwargs):
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters

attributes = {
EC.DATASET.value: parameters.get(EC.DATASET, ""),
Expand Down Expand Up @@ -75,7 +75,7 @@ def xml_bindings(self, survey: "Survey"):
"""
See the class comment for an explanation of the logic for generating bindings.
"""
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters
entity_id_expression = parameters.get(EC.ENTITY_ID, None)
create_condition = parameters.get(EC.CREATE_IF, None)
update_condition = parameters.get(EC.UPDATE_IF, None)
Expand Down
60 changes: 29 additions & 31 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import re
from collections.abc import Iterable
from functools import lru_cache


def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"""
Get a expression lexer (scanner) for parsing.
"""
def get_lexer_rules():
# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
Expand All @@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
lexer_rules = {
return {
# https://www.w3.org/TR/xmlschema-2/#dateTime
"DATETIME": date_time_regex,
"DATE": date_regex,
Expand All @@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
Expand All @@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.
}


# Build the rule table once at import time so the derived regexes below are
# each compiled exactly once and shared by all callers.
LEXER_RULES = get_lexer_rules()
# Anchored: the whole string is a single XML NCName (element/tag name).
RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
# Anchored: the whole string is a single pyxform reference, e.g. ${name}.
RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
# Unanchored: a pyxform reference appearing anywhere within a larger string.
RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])


def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.

    Builds a ``re.Scanner`` from the shared ``LEXER_RULES`` table. Each rule's
    callback emits an ``ExpLexerToken`` carrying the rule name, the matched
    text, and the match's start/end positions in the scanned string.
    """
    # NOTE: the diff view interleaved removed `name_only` lines here; this is
    # the post-change implementation, which always emits full tokens.

    def get_tokenizer(name):
        # Close over the rule name so the callback can label its tokens.
        def tokenizer(scan, value) -> ExpLexerToken:
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)
Expand All @@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:
self.end: int = end


# Scanner takes a few 100ms to compile so use these shared instances.
# Scanner takes a few 100ms to compile so use the shared instance.
_EXPRESSION_LEXER = get_expression_lexer()
_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)


@lru_cache(maxsize=128)
Expand All @@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
return tokens, remainder


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
    """
    Does the expression contain a single token of one of the provided token types?

    :param expression: The expression text to scan (leading/trailing whitespace
      is ignored).
    :param token_types: Token type names (e.g. "PYXFORM_REF") to accept.
    """
    if not expression:
        return False
    # The name-only lexer yields token type names rather than full token objects.
    tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
    # Simplified from an `if ...: return True / else: return False` chain.
    return 1 == len(tokens) and tokens[0] in token_types


def is_pyxform_reference(value: str) -> bool:
    """
    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}

    :param value: The string to check.
    """
    # Needs more than 3 characters: "${}" plus at least one name character.
    # Guard first so an empty/short input returns an actual bool — the
    # one-line `value and ...` form leaked `""` despite the `-> bool` hint.
    if not value or len(value) <= 3:
        return False
    # Anchored pattern: the whole string must be exactly one reference.
    return RE_ONLY_PYXFORM_REF.match(value) is not None


def is_xml_tag(value: str) -> bool:
    """
    Does the input string contain only a valid XML tag / element name?

    :param value: The string to check.
    """
    # Guard first so an empty input returns an actual bool — the one-line
    # `value and ...` form leaked `""` despite the `-> bool` hint.
    if not value:
        return False
    # Anchored NCName pattern: the whole string must be exactly one name.
    return RE_ONLY_NCNAME.match(value) is not None


def has_last_saved(value: str) -> bool:
    """
    Does the input string contain a valid '#last-saved' Pyxform reference?
    e.g. ${last-saved#my_question}

    :param value: The expression text to check.
    """
    # Needs 14 characters for "${last-saved#}", plus a name inside.
    # The substring check is a cheap pre-filter before the regex search.
    # NOTE(review): the regex matches any pyxform reference, not specifically a
    # last-saved one — presumed acceptable alongside the substring pre-filter;
    # confirm against callers.
    # bool() ensures we return a bool, not a str/Match, per the annotation.
    return bool(
        value
        and len(value) > 14
        and "${last-saved#" in value
        and RE_ANY_PYXFORM_REF.search(value)
    )
11 changes: 8 additions & 3 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
:param xml_text: XML text that may contain an instance expression.
:return: Tokens in instance expression, and the string position boundaries.
"""
tokens, _ = parse_expression(xml_text)
if not tokens:
return []
instance_enter = False
path_enter = False
pred_enter = False
last_token = None
tokens, _ = parse_expression(xml_text)
boundaries = []

for t in tokens:
Expand Down Expand Up @@ -96,8 +98,11 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
:param survey: The Survey that the context is in.
:return: The possibly modified string.
"""
# 9 = len("instance(")
if 9 >= len(xml_text):
return xml_text
boundaries = find_boundaries(xml_text=xml_text)
if 0 < len(boundaries):
if boundaries:
new_strings = []
for start, end in boundaries:
old_str = xml_text[start:end]
Expand All @@ -116,6 +121,6 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
# expression positions due to incremental replacement.
offset = 0
for s, e, o, n in new_strings:
xml_text = xml_text[: s + offset] + n + xml_text[e + offset :]
xml_text = f"{xml_text[: s + offset]}{n}{xml_text[e + offset :]}"
offset += len(n) - len(o)
return xml_text
Loading

0 comments on commit 854ffe6

Please sign in to comment.