Commit
Update MediaWiki db table and simplify X-Ray code
xxyzz committed May 25, 2024
1 parent feee439 commit ae4e869
Showing 7 changed files with 227 additions and 210 deletions.
data/deps.json: 4 changes (2 additions, 2 deletions)
@@ -1,11 +1,11 @@
 {
     "cupy": "12.3.0",
     "lxml": "5.2.2",
-    "rapidfuzz": "3.9.0",
+    "rapidfuzz": "3.9.1",
     "spacy": "3.7.4",
     "spacy_cpu_model": "3.7.0",
     "spacy_trf_model": "3.7.2",
     "thinc-apple-ops": "0.1.5",
     "torch": "2.2.2",
-    "typing-extensions": "4.11.0"
+    "typing-extensions": "4.12.0"
 }
database.py: 28 changes (26 additions, 2 deletions)
@@ -178,9 +178,18 @@ def create_x_indices(conn: sqlite3.Connection) -> None:
 
 
 def insert_x_book_metadata(
-    conn: sqlite3.Connection, data: tuple[int, int, int, int, int, str | None]
+    conn: sqlite3.Connection, erl: int, num_images: int, preview_images: str | None
 ) -> None:
-    conn.execute("INSERT INTO book_metadata VALUES(0, ?, ?, 0, 0, ?, ?, ?, ?)", data)
+    num_people = 0
+    num_terms = 0
+    for (num,) in conn.execute("SELECT count(*) FROM entity WHERE type = 1"):
+        num_people = num
+    for (num,) in conn.execute("SELECT count(*) FROM entity WHERE type = 2"):
+        num_terms = num
+    conn.execute(
+        "INSERT INTO book_metadata VALUES(0, ?, ?, 0, 0, ?, ?, ?, ?)",
+        (erl, num_images > 0, num_people, num_terms, num_images, preview_images),
+    )
 
 
 def insert_x_entities(
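
insert_x_book_metadata() now derives the people and term counts from the entity table instead of receiving them in a pre-built tuple. Below is a minimal, self-contained sketch of those counting queries against a toy in-memory database; the reduced entity schema (only the columns the queries touch) is assumed purely for illustration.

import sqlite3

conn = sqlite3.connect(":memory:")
# Assumed, reduced schema: just the columns the counting queries use.
conn.execute("CREATE TABLE entity (id INTEGER, type INTEGER, count INTEGER)")
conn.executemany(
    "INSERT INTO entity VALUES(?, ?, ?)",
    [(0, 1, 7), (1, 1, 3), (2, 2, 5)],  # two people (type 1), one term (type 2)
)

num_people = 0
num_terms = 0
for (num,) in conn.execute("SELECT count(*) FROM entity WHERE type = 1"):
    num_people = num
for (num,) in conn.execute("SELECT count(*) FROM entity WHERE type = 2"):
    num_terms = num
print(num_people, num_terms)  # prints: 2 1
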
@@ -207,6 +216,21 @@ def insert_x_occurrences(
     conn.executemany("INSERT INTO occurrence VALUES(?, ?, ?)", data)
 
 
+def get_top_ten_entities(conn: sqlite3.Connection, entity_type: int) -> str:
+    entity_ids = []
+    for (entity_id,) in conn.execute(
+        "SELECT id FROM entity WHERE type = ? ORDER BY count DESC LIMIT 10",
+        (entity_type,),
+    ):
+        entity_ids.append(entity_id)
+    return ",".join(map(str, entity_ids))
+
+
+def insert_x_types(conn: sqlite3.Connection) -> None:
+    insert_x_type(conn, (1, 14, 15, 1, get_top_ten_entities(conn, 1)))
+    insert_x_type(conn, (2, 16, 17, 2, get_top_ten_entities(conn, 2)))
+
+
 def insert_x_type(
     conn: sqlite3.Connection, data: tuple[int, int, int, int, str]
 ) -> None:
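
get_top_ten_entities() returns the ids of the ten most frequently occurring entities of a type as a comma-separated string, which insert_x_types() then passes to insert_x_type(). A short sketch of that query and join, again over an assumed, reduced entity table:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE entity (id INTEGER, type INTEGER, count INTEGER)")
conn.executemany(
    "INSERT INTO entity VALUES(?, ?, ?)", [(0, 1, 3), (1, 1, 9), (2, 1, 5)]
)

entity_ids = []
for (entity_id,) in conn.execute(
    "SELECT id FROM entity WHERE type = ? ORDER BY count DESC LIMIT 10", (1,)
):
    entity_ids.append(entity_id)
print(",".join(map(str, entity_ids)))  # prints: 1,2,0
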
epub.py: 76 changes (37 additions, 39 deletions)
@@ -23,7 +23,7 @@
     from .x_ray_share import (
         FUZZ_THRESHOLD,
         PERSON_LABELS,
-        CustomX,
+        CustomXDict,
         XRayEntity,
         is_full_name,
     )
@@ -39,7 +39,7 @@
     from x_ray_share import (
         FUZZ_THRESHOLD,
         PERSON_LABELS,
-        CustomX,
+        CustomXDict,
         XRayEntity,
         is_full_name,
     )
@@ -79,7 +79,7 @@ def __init__(
         mediawiki: MediaWiki | None,
         wiki_commons: Wikimedia_Commons | None,
         wikidata: Wikidata | None,
-        custom_x_ray: CustomX,
+        custom_x_ray: CustomXDict,
         lemmas_conn: sqlite3.Connection | None,
     ) -> None:
         self.book_path = Path(book_path_str)
@@ -172,7 +172,7 @@ def extract_epub(self) -> Iterator[tuple[str, tuple[int, int, Path]]]:
 
     def add_entity(
         self,
-        entity: str,
+        entity_name: str,
         ner_label: str,
         book_quote: str,
         paragraph_start: int,
@@ -185,32 +185,29 @@ def add_entity(
         from rapidfuzz.process import extractOne
         from rapidfuzz.utils import default_process
 
-        if entity_data := self.entities.get(entity):
-            entity_id = entity_data["id"]
-            entity_data["count"] += 1
-        elif entity not in self.custom_x_ray and (
+        if entity_data := self.entities.get(entity_name):
+            entity_id = entity_data.id
+            entity_data.count += 1
+        elif entity_name not in self.custom_x_ray and (
             r := extractOne(
-                entity,
+                entity_name,
                 self.entities.keys(),
                 score_cutoff=FUZZ_THRESHOLD,
                 scorer=partial(token_set_ratio, processor=default_process),
             )
         ):
             matched_name = r[0]
             matched_entity = self.entities[matched_name]
-            matched_entity["count"] += 1
-            entity_id = matched_entity["id"]
-            if is_full_name(matched_name, matched_entity["label"], entity, ner_label):
-                self.entities[entity] = matched_entity
+            matched_entity.count += 1
+            entity_id = matched_entity.id
+            if is_full_name(matched_name, matched_entity.label, entity_name, ner_label):
+                self.entities[entity_name] = matched_entity
                 del self.entities[matched_name]
         else:
             entity_id = self.entity_id
-            self.entities[entity] = {
-                "id": self.entity_id,
-                "label": ner_label,
-                "quote": book_quote,
-                "count": 1,
-            }
+            self.entities[entity_name] = XRayEntity(
+                self.entity_id, book_quote, ner_label, 1
+            )
             self.entity_id += 1
 
         self.entity_occurrences[xhtml_path].append(
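
For reference, a standalone sketch of the fuzzy lookup in the elif branch above, using the same rapidfuzz calls; the cutoff of 85 is a made-up stand-in for FUZZ_THRESHOLD, whose real value is defined in x_ray_share.

from functools import partial

from rapidfuzz.fuzz import token_set_ratio
from rapidfuzz.process import extractOne
from rapidfuzz.utils import default_process

known_entities = ["Sherlock Holmes", "John Watson"]
# extractOne returns (choice, score, index) or None when every score falls below the cutoff.
result = extractOne(
    "Holmes",
    known_entities,
    score_cutoff=85,  # stand-in for FUZZ_THRESHOLD
    scorer=partial(token_set_ratio, processor=default_process),
)
print(result)  # e.g. ('Sherlock Holmes', 100.0, 0)
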
@@ -254,15 +251,15 @@ def add_lemma(
         )
 
     def remove_entities(self, minimal_count: int) -> None:
-        for entity, data in self.entities.copy().items():
+        for entity_name, entity_data in self.entities.copy().items():
             if (
-                data["count"] < minimal_count
+                entity_data.count < minimal_count
                 and self.mediawiki is not None  # mypy
-                and self.mediawiki.get_cache(entity) is None
-                and entity not in self.custom_x_ray
+                and self.mediawiki.get_cache(entity_name) is None
+                and entity_name not in self.custom_x_ray
             ):
-                del self.entities[entity]
-                self.removed_entity_ids.add(data["id"])
+                del self.entities[entity_name]
+                self.removed_entity_ids.add(entity_data.id)
 
     def modify_epub(
         self, prefs: Prefs, lemma_lang: str, gloss_lang: str, gloss_source: str
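
The per-entity dicts are replaced by attribute access on XRayEntity. Its definition lives in x_ray_share.py and is not part of this diff; judging from the constructor call XRayEntity(self.entity_id, book_quote, ner_label, 1) and the attributes read above, it is presumably something along these lines:

from dataclasses import dataclass


@dataclass
class XRayEntity:
    id: int
    quote: str
    label: str
    count: int
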
@@ -401,29 +398,30 @@ def create_x_ray_footnotes(self) -> None:
 <head><title>X-Ray</title><meta charset="utf-8"/></head>
 <body>
 """
-        for entity, data in self.entities.items():
-            if custom_data := self.custom_x_ray.get(entity):
-                custom_desc, custom_source_id, _ = custom_data
+        for entity_name, entity_data in self.entities.items():
+            if entity_data.id in self.removed_entity_ids:
+                continue
+            elif custom_data := self.custom_x_ray.get(entity_name):
                 s += (
-                    f'<aside id="{data["id"]}" epub:type="footnote">'
-                    f"{create_p_tags(custom_desc)}"
+                    f'<aside id="{entity_data.id}" epub:type="footnote">'
+                    f"{create_p_tags(custom_data.desc)}"
                 )
-                if custom_source_id is not None:
+                if custom_data.source_id is not None:
                     s += "<p>Source: "
                     s += (
                         "Wikipedia"
-                        if custom_source_id == 1
+                        if custom_data.source_id == 1
                         else self.mediawiki.sitename
                     )
                     s += "</p>"
                 s += "</aside>"
             elif (
-                self.prefs["search_people"] or data["label"] not in PERSON_LABELS
-            ) and (intro_cache := self.mediawiki.get_cache(entity)):
-                s += f'<aside id="{data["id"]}" epub:type="footnote">'
+                self.prefs["search_people"] or entity_data.label not in PERSON_LABELS
+            ) and (intro_cache := self.mediawiki.get_cache(entity_name)):
+                s += f'<aside id="{entity_data.id}" epub:type="footnote">'
                 s += create_p_tags(intro_cache.intro)
                 s += f"<p>Source: {self.mediawiki.sitename}</p>"
-                if self.wikidata and (
+                if self.wikidata is not None and (
                     wikidata_cache := self.wikidata.get_cache(
                         intro_cache.wikidata_item_id
                     )
@@ -432,7 +430,7 @@ def create_x_ray_footnotes(self) -> None:
                     if inception := wikidata_cache.get("inception"):
                         s += f"<p>{inception_text(inception)}</p>"
                         add_wikidata_source = True
-                    if self.wiki_commons and (
+                    if self.wiki_commons is not None and (
                         filename := wikidata_cache.get("map_filename")
                     ):
                         file_path = self.wiki_commons.get_image(filename)
@@ -449,8 +447,8 @@ def create_x_ray_footnotes(self) -> None:
                 s += "</aside>"
             else:
                 s += (
-                    f'<aside id="{data["id"]}" epub:type="footnote"><p>'
-                    f'{escape(data["quote"])}</p></aside>'
+                    f'<aside id="{entity_data.id}" epub:type="footnote"><p>'
+                    f"{escape(entity_data.quote)}</p></aside>"
                 )
 
         s += "</body></html>"
