Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance enhancements #73

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
86b9451
Enable packratting for pyparser
ahankinson Apr 13, 2021
7fdf8dd
#37 update for Django 3.x compat
jacobcolyvan Jul 26, 2021
6e4a627
Minor updates
ahankinson Apr 26, 2024
80fdd60
Update dependency management
ahankinson Apr 26, 2024
c12d759
Deps
ahankinson Apr 26, 2024
6e508d0
Optimized regexes
ahankinson Jul 23, 2024
f2252f0
Package updates
ahankinson Jul 23, 2024
06ab934
Further optimizations
ahankinson Jul 24, 2024
c9cb56f
Update gitignore
ahankinson Aug 12, 2024
9e51373
Black formatting, updates
ahankinson Aug 13, 2024
1aa53cf
Update imports
ahankinson Aug 13, 2024
ddd8f7b
Merge branch 'main' into performance-enhancements
ahankinson Aug 13, 2024
8c4f968
Merge fixes
ahankinson Aug 13, 2024
6f08bce
ruff formatting
ahankinson Aug 13, 2024
973ccf4
Remove accidentally committed poetry file
ahankinson Aug 13, 2024
ee450a5
Fixed: f-string formatting
ahankinson Aug 14, 2024
46bdce6
Fixed: return type of statement
ahankinson Sep 12, 2024
656f8ad
Updated parser classes
ahankinson Sep 12, 2024
add79bd
Fixed: Remove SHORT_YEAR_RE
ahankinson Sep 12, 2024
fee0b64
Problem with f-string
ahankinson Sep 12, 2024
89f3692
Another f-string fix
ahankinson Sep 12, 2024
9da1d94
Fixed: pyproject errors
ahankinson Sep 12, 2024
95b83aa
Testing without lru_cache
ahankinson Jan 16, 2025
6262a38
Fixed: New ruff rules
ahankinson Jan 16, 2025
8fbce49
Fixed formatting
ahankinson Jan 16, 2025
adc1805
Bad formatting conversion
ahankinson Jan 16, 2025
ffbe2d4
Replace range len with enumerate
ahankinson Jan 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,5 @@ docs/_build/

# PyBuilder
target/
.idea
.DS_Store
8 changes: 4 additions & 4 deletions edtf/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time:
"""
if strip_time:
return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS)
else:
return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)
return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)


def struct_time_to_jd(st: struct_time) -> float:
Expand Down Expand Up @@ -116,7 +115,7 @@ def jd_to_struct_time(jd: float) -> struct_time:
return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS)


def _roll_negative_time_fields(year, month, day, hour, minute, second):
def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple:
"""
Fix date/time fields which have nonsense negative values for any field
except for year by rolling the overall date/time value backwards, treating
Expand Down Expand Up @@ -152,4 +151,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second):
year += int(month / 12.0) # Adjust by whole year in months
year -= 1 # Subtract 1 for negative minutes
month %= 12 # Convert negative month to positive remainder
return (year, month, day, hour, minute, second)

return year, month, day, hour, minute, second
145 changes: 81 additions & 64 deletions edtf/natlang/en.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Utilities to derive an EDTF string from an (English) natural language string."""

import functools
import re
from datetime import datetime
from typing import Optional

from dateutil.parser import ParserError, parse

Expand All @@ -13,19 +15,46 @@
DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)

SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"
SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])")
ahankinson marked this conversation as resolved.
Show resolved Hide resolved
LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)")
CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?")
CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]")
CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)")
ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b")
TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b")
PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$")
SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)")
BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b")
AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b")
APPROX_CHECK = re.compile(
r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)"
)
UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)")
UNCERTAIN_REPL = re.compile(r"(\d{4})\?")
MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s")
MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s")

APPROX_CENTURY_RE = re.compile(
r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
)
UNCERTAIN_CENTURY_RE = re.compile(
r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?"
)

APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)")
UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?")

MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b")
MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b")
MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b")

# Set of RE rules that will cause us to abort text processing, since we know
# the results will be wrong.
REJECT_RULES = (
r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23'
)
REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23'


def text_to_edtf(text):
@functools.lru_cache
def text_to_edtf(text: str) -> Optional[str]:
"""
Generate EDTF string equivalent of a given natural language date string.
"""
Expand All @@ -35,35 +64,37 @@ def text_to_edtf(text):
t = text.lower()

# try parsing the whole thing
result = text_to_edtf_date(t)
result: Optional[str] = text_to_edtf_date(t)

if not result:
# split by list delims and move fwd with the first thing that returns a non-empty string.
# TODO: assemble multiple dates into a {} or [] structure.
for split in [",", ";", "or"]:
for list_item in t.split(split):
# try parsing as an interval - split by '-'
toks = list_item.split("-")
toks: list[str] = list_item.split("-")

if len(toks) == 2:
d1 = toks[0].strip()
d2 = toks[1].strip()

# match looks from the beginning of the string, search
# looks anywhere.

if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 1868-9
if re.match(
ONE_DIGIT_PARTIAL_FIRST, d2
): # 1-digit year partial e.g. 1868-9
if re.search(
r"\b\d\d\d\d$", d1
PARTIAL_CHECK, d1
): # TODO: evaluate it and see if it's a year
d2 = d1[-4:-1] + d2
elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10
if re.search(r"\b\d\d\d\d$", d1):
elif re.match(
TWO_DIGIT_PARTIAL_FIRST, d2
): # 2-digit year partial e.g. 1809-10
if re.search(PARTIAL_CHECK, d1):
d2 = d1[-4:-2] + d2
else:
century_range_match = re.search(
r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]",
f"{d1}-{d2}",
)
century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}")
if century_range_match:
g = century_range_match.groups()
d1 = f"{g[0]}C"
Expand All @@ -73,7 +104,7 @@ def text_to_edtf(text):
r2 = text_to_edtf_date(d2)

if r1 and r2:
result = r1 + "/" + r2
result = f"{r1}/{r2}"
return result

# is it an either/or year "1838/1862" - that has a different
Expand All @@ -82,7 +113,7 @@ def text_to_edtf(text):
# This whole section could be more friendly.

else:
int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item)
int_match = re.search(SLASH_YEAR, list_item)
if int_match:
return f"[{int_match.group(1)}, {int_match.group(2)}]"

Expand All @@ -92,21 +123,19 @@ def text_to_edtf(text):
if result:
break

is_before = re.findall(r"\bbefore\b", t)
is_before = is_before or re.findall(r"\bearlier\b", t)

is_after = re.findall(r"\bafter\b", t)
is_after = is_after or re.findall(r"\bsince\b", t)
is_after = is_after or re.findall(r"\blater\b", t)
is_before = re.findall(BEFORE_CHECK, t)
is_after = re.findall(AFTER_CHECK, t)

if is_before:
result = f"/{result}" # unknown is replaced with null for intervals
result = f"/{result}"
elif is_after:
result = f"{result}/" # unknown is replaced with null for intervals
result = f"{result}/"

return result


def text_to_edtf_date(text):
@functools.lru_cache
def text_to_edtf_date(text: str) -> Optional[str]:
"""
Return EDTF string equivalent of a given natural language date string.

Expand All @@ -115,37 +144,28 @@ def text_to_edtf_date(text):
differ are undefined.
"""
if not text:
return
return None

t = text.lower()
result = ""
result: str = ""

for reject_re in REJECT_RULES:
if re.match(reject_re, t):
return
if re.match(REJECT_RULES, t):
return None

# matches on '1800s'. Needs to happen before is_decade.
could_be_century = re.findall(r"(\d{2}00)s", t)
could_be_century: list = re.findall(MIGHT_BE_CENTURY, t)
# matches on '1800s' and '1910s'. Removes the 's'.
# Needs to happen before is_uncertain because e.g. "1860s?"
t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t)
t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t)

# detect approximation signifiers
# a few 'circa' abbreviations just before the year
is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t)
is_approximate = re.findall(APPROX_CHECK, t)
# the word 'circa' anywhere
is_approximate = is_approximate or re.findall(r"\bcirca\b", t)
# the word 'approx'/'around'/'about' anywhere
is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t)
# a ~ before a year-ish number
is_approximate = is_approximate or re.findall(r"\b~\d{4}", t)
# a ~ at the beginning
is_approximate = is_approximate or re.findall(r"^~", t)

# detect uncertainty signifiers
t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t)
# the words uncertain/maybe/guess anywhere
is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t)
t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t)

# detect century forms
is_century = re.findall(CENTURY_RE, t)
Expand All @@ -154,31 +174,28 @@ def text_to_edtf_date(text):
is_ce = re.findall(CE_RE, t)
if is_century:
result = "%02dXX" % (int(is_century[0][0]) - 1,)
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)
is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t)

try:
is_bc = is_century[0][-1] in ("bc", "bce")
if is_bc:
if is_century[0][-1] in ("bc", "bce"):
result = f"-{result}"
except IndexError:
pass

elif is_ce:
result = "%04d" % (int(is_ce[0][0]))
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t)
is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t)
is_approximate = is_approximate or re.findall(APPROX_CE_RE, t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t)

try:
is_bc = is_ce[0][-1] in ("bc", "bce")
if is_bc:
if is_ce[0][-1] in ("bc", "bce"):
result = f"-{result}"
except IndexError:
pass

else:
# try dateutil.parse

try:
# parse twice, using different defaults to see what was
# parsed and what was guessed.
Expand All @@ -205,34 +222,34 @@ def text_to_edtf_date(text):

if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date():
# couldn't parse anything - defaults are untouched.
return
return None

date1 = dt1.isoformat()[:10]
date2 = dt2.isoformat()[:10]

# guess precision of 'unspecified' characters to use
mentions_year = re.findall(r"\byear\b.+(in|during)\b", t)
mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t)
mentions_day = re.findall(r"\bday\b.+(in|during)\b", t)
mentions_year = re.findall(MENTIONS_YEAR, t)
mentions_month = re.findall(MENTIONS_MONTH, t)
mentions_day = re.findall(MENTIONS_DAY, t)

for i in range(len(date1)):
# if the given year could be a century (e.g. '1800s') then use
# approximate/uncertain markers to decide whether we treat it as
# a century or a decade.
if i == 2 and could_be_century and not (is_approximate or is_uncertain):
result += "X"
elif i == 3 and is_decade > 0:
elif i == 3 and is_decade:
if mentions_year:
result += "X" # previously year precision - now just X
result += "X" # year precision
else:
result += "X" # previously decade precision - now just X
result += "X" # decade precision
elif date1[i] == date2[i]:
# since both attempts at parsing produced the same result
# it must be parsed value, not a default
result += date1[i]
else:
# different values were produced, meaning that it's likely
# a default. Use 'X'
# a default. Use 'unspecified'
result += "X"

# strip off unknown chars from end of string - except the first 4
Expand Down
Loading
Loading