Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Threading #2

Merged
merged 4 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 91 additions & 40 deletions check_biblio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,68 @@
import argparse
import sqlite3
import difflib
from typing import Optional
from typing import Optional, List, Tuple
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import functools

Check failure on line 14 in check_biblio.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

check_biblio.py:14:8: F401 `functools` imported but unused
import tqdm
import logging

import bibtexparser

print_lock = threading.Lock()


class DataBase:
    """A simple thread-safe SQLite cache of original and final entry strings.

    Rows are stored in the ``entries(key, original, final)`` table.  Both the
    original and the final text are whitespace-stripped before being stored
    and before being looked up, so ``update`` and ``query`` always agree on
    the cache key.

    Attributes:
        filename: path of the SQLite file given to the constructor.
        substitutions: ``(original, final)`` pairs collected by ``update`` and
            ``query`` whenever the final text differs from the original.
    """

    def __init__(self, filename: str):
        self.filename = filename
        # Set up the plain attributes first so that close()/__del__ still work
        # if sqlite3.connect() raises below.
        self._lock = threading.Lock()
        self._nupdates = 0
        self.substitutions: List[Tuple[str, str]] = []
        self._con = None
        # check_same_thread=False: the single connection is shared between
        # threads; every access to it below is serialised through self._lock.
        self._con = sqlite3.connect(filename, check_same_thread=False)
        self._cur = self._con.cursor()
        self._cur.execute("CREATE TABLE IF NOT EXISTS entries(key, original, final)")

    def update(self, key: str, original: str, final: str) -> None:
        """Store the ``original -> final`` mapping for ``key``.

        The pair is also appended to ``substitutions`` when the two stripped
        strings differ.  The transaction is committed every 20 updates; call
        ``commit`` to flush the remaining ones.
        """
        original = original.strip()
        final = final.strip()
        with self._lock:
            self._cur.execute(
                "INSERT INTO entries VALUES (?, ?, ?)",
                (key, original, final),
            )
            self._nupdates += 1
            if self._nupdates % 20 == 0:
                self._con.commit()
            if original != final:
                self.substitutions.append((original, final))

    def query(self, original: str) -> Optional[str]:
        """Return the cached final string for ``original``, or None if unknown.

        On a cache hit whose final text differs from the (stripped) original,
        the pair is appended to ``substitutions``.

        Raises:
            sqlite3.OperationalError: re-raised after printing the failing
                query and its argument.
        """
        # Strip exactly as update() does, otherwise text with surrounding
        # whitespace could never match a stored row.
        lookup = original.strip()
        query = "SELECT final FROM entries WHERE original=?"
        try:
            with self._lock:
                row = self._cur.execute(query, (lookup,)).fetchone()
        except sqlite3.OperationalError as ex:
            print(f"problem executing query {query} with {original}")
            print("error: %s" % ex)
            raise
        if row is None:
            return None
        proposed = row[0]
        if lookup != proposed:
            with self._lock:
                self.substitutions.append((lookup, proposed))
        return proposed

    def commit(self) -> None:
        """Commit any pending inserts."""
        with self._lock:
            self._con.commit()

    def close(self) -> None:
        """Commit and close the connection; calling it again is a no-op."""
        if getattr(self, "_con", None) is None:
            return
        with self._lock:
            if self._con is None:
                return
            self._con.commit()
            self._con.close()
            self._con = None

    def __del__(self):
        print("closing database")
        self.close()


def diff_strings(a: str, b: str) -> str:
Expand All @@ -67,6 +94,8 @@


regex_unicode = re.compile("[^\x00-\x7F]")


def help_unicode(item: str) -> Optional[str]:
m = regex_unicode.search(item)
if m:
Expand All @@ -77,6 +106,7 @@
+ "*****UNICODE******"
+ item[m.end() :]
)
return None


def replace_unicode(item: str) -> str:
Expand All @@ -92,7 +122,7 @@

def replace_chars(match):
char = match.group(0)
print('unicode found, replacing "%s" with "%s"' % (char, chars[char]))
logging.debug('unicode found, replacing "%s" with "%s"', char, chars[char])
return chars[char]

return re.sub("(" + "|".join(list(chars.keys())) + ")", replace_chars, item)
Expand All @@ -117,18 +147,11 @@

def modify_item(item: str, error: str) -> str:
editor_command = os.environ.get("EDITOR")
if not editor_command:
print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)")
editor_command = input(
"enter the command to open an editor (e.g. emacs/atom -w/...): "
)
os.environ["EDITOR"] = editor_command
editor_command = editor_command.strip()

tmp_filename = next(tempfile._get_candidate_names())

with open(tmp_filename, "w", encoding="utf-8") as f:
preamble = "do not delete these lines\n" "error found:\n"
preamble = "do not delete these lines\n" + "error found:\n"
preamble += error
r = help_unicode(item)
if r is not None:
Expand Down Expand Up @@ -214,32 +237,32 @@
return error


def run_entry(entry, db, fix_unicode, use_bibtex=None) -> None:
    """Check a single bibliography entry and record the outcome in ``db``.

    If ``db`` already knows the entry's raw text, nothing else is done
    (``db.query`` is the only call made).  Otherwise the entry is checked with
    ``check_latex_entry`` and, while an error is reported, handed to
    ``modify_item`` for manual correction.  The final text is stored with
    ``db.update``.

    Args:
        entry: parsed entry; its ``raw`` and ``key`` attributes are read.
        db: cache object providing ``query(original)`` and
            ``update(key, original, final)``.
        fix_unicode: when true, run ``replace_unicode`` on the raw text before
            checking it.
        use_bibtex: passed to ``check_latex_entry``.  When None (the default)
            the value is taken from the module-level ``args.use_bibtex``, which
            is what this function always used before the parameter existed.
    """
    if use_bibtex is None:
        # Backward-compatible fallback to the parsed command-line arguments.
        use_bibtex = args.use_bibtex

    raw_original = entry.raw.strip()

    if db.query(raw_original) is not None:
        return

    raw_proposed = raw_original
    if fix_unicode:
        raw_proposed = replace_unicode(raw_original)
        if raw_proposed != raw_original:
            logging.debug("unicode found in %s, fixing", entry.key)

    while True:
        error = check_latex_entry(entry.key, raw_proposed, use_bibtex)
        if error is None:
            break

        # print_lock is held for the whole manual fix, so only one thread at a
        # time reports a problem and calls modify_item.
        with print_lock:
            print(f"problem running item {entry.key}")
            raw_proposed = modify_item(raw_proposed, error).strip()

    if raw_original != raw_proposed:
        with print_lock:
            print(diff_strings(raw_original, raw_proposed))
    db.update(entry.key, raw_original, raw_proposed)


Expand All @@ -251,13 +274,21 @@
)
parser.add_argument("bibtex", default="https://inspirehep.net/")
parser.add_argument("--fix-unicode", action="store_true")
parser.add_argument("--nthreads", type=int, default=5)
parser.add_argument(
"--use-bibtex", action="store_true", help="use bibtex instead of biblatex"
)
args = parser.parse_args()

editor_command = os.environ.get("EDITOR")
if not editor_command:
print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)")
editor_command = input(
"enter the command to open an editor (e.g. emacs/atom -w/...): "
)
os.environ["EDITOR"] = editor_command.strip()

try:
substitutions = []
biblio_parsed = bibtexparser.parse_file(args.bibtex)
db = DataBase("db.sqlite")

Expand All @@ -267,24 +298,44 @@
print("found %d entries" % len(biblio_parsed.entries))

nentries = len(biblio_parsed.entries)
# import multiprocessing
# import functools
# p = multiprocessing.Pool(4)
# p.map(functools.partial(run_entry, cur=cur, fix_unicode=args.fix_unicode, substitutions=substitutions), biblio_parsed.entries)
for ientry, entry in enumerate(biblio_parsed.entries, 1):
print("checking key %s %d/%d" % (entry.key, ientry, nentries))
run_entry(entry, db, args.fix_unicode, substitutions)

with ThreadPoolExecutor(max_workers=args.nthreads) as p:
with tqdm.tqdm(total=nentries) as pbar:
pbar.set_description("checking entries")
pbar.set_postfix_str(f"nthreads={args.nthreads}")

def partial_function(entry):
with print_lock:
pbar.set_description(entry.key)
run_entry(entry, db, args.fix_unicode)
with print_lock:
pbar.update()

futures = {}

for entry in biblio_parsed.entries:
future = p.submit(partial_function, entry)
futures[future] = entry.key
for future in as_completed(futures):
key = futures[future]
try:
future.result()
except Exception as ex:
with print_lock:
pbar.write(f"problem with entry: {key}")
raise ex
with print_lock:
pbar.set_description(key)
finally:

biblio = open(args.bibtex, encoding="utf-8").read()
substitutions = db.substitutions
print(f"applying {len(substitutions)} substitutions")
for old, new in substitutions:
if old == new:
print("BIG PROBLEM: old == new")
print(diff_strings(old, new))
if old not in biblio:
print("BIG PROBLEM: old not in biblio")
print("BIG PROBLEM: old not in biblio: %s" % old)
biblio = biblio.replace(old, new)
new_biblio_fn = args.bibtex.replace(".bib", "_new.bib")
with open(new_biblio_fn, "w", encoding="utf-8") as f:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
bibtexparser==2.0.0b4
bibtexparser==2.0.0b4
tqdm
Loading