From dddf1ce97dc6aeb2aa3490a17709048076ce7ae8 Mon Sep 17 00:00:00 2001
From: anthology assist
Date: Tue, 14 Mar 2023 14:42:58 -0500
Subject: [PATCH 1/4] test script update.

---
 bin/ingest.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bin/ingest.py b/bin/ingest.py
index 260aa4b150..632693cc12 100755
--- a/bin/ingest.py
+++ b/bin/ingest.py
@@ -423,6 +423,7 @@ def find_book():
         bibfile = paper["bib"]
         paper_node = bib2xml(bibfile, paper_id_full)
 
+        # front matter does exist
         if paper_node.attrib["id"] == "0":
             # create metadata subtree
             meta_node = make_simple_element("meta", parent=volume_node)
@@ -472,7 +473,9 @@ def find_book():
             # modify frontmatter tag
             paper_node.tag = "frontmatter"
             del paper_node.attrib["id"]
+        # not front matter pdf
         else:
+            make_simple_element("frontmatter", parent=volume_node)
             # remove unneeded fields
             for child in paper_node:
                 if child.tag in [

From b85bbc33657db6d20ce9c66907d2fb75561ff766 Mon Sep 17 00:00:00 2001
From: anthology assist
Date: Fri, 17 Mar 2023 14:07:59 -0500
Subject: [PATCH 2/4] script update.

---
 bin/anthology/venues.py |   2 +-
 bin/ingest.py           | 124 +++++++++++++++++++++++-----------------
 2 files changed, 71 insertions(+), 55 deletions(-)

diff --git a/bin/anthology/venues.py b/bin/anthology/venues.py
index 6f192a42ca..8acffdf0c8 100644
--- a/bin/anthology/venues.py
+++ b/bin/anthology/venues.py
@@ -85,7 +85,7 @@ def add_venue(self, directory, acronym, title, is_acl=False, url=None):
         """
         Adds a new venue.
 
-        Everytime a new venue is created, the corresponding yaml file is created as welll.
+        Every time a new venue is created, the corresponding yaml file is created as well.
         """
         slug = VenueIndex.get_slug_from_acronym(acronym)

diff --git a/bin/ingest.py b/bin/ingest.py
index 632693cc12..72eb6f0c27 100755
--- a/bin/ingest.py
+++ b/bin/ingest.py
@@ -59,7 +59,7 @@
 from anthology.venues import VenueIndex
 
 from itertools import chain
-from typing import Dict, Any
+from typing import Dict, Any, List, Tuple
 
 from slugify import slugify
 
@@ -190,53 +190,13 @@ def bib2xml(bibfilename, anthology_id):
     return paper
 
 
-def main(args):
-    volumes = {}
-
-    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
-    venue_index = VenueIndex(srcdir=anthology_datadir)
-    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]
-
-    sig_index = SIGIndex(srcdir=anthology_datadir)
-
-    people = AnthologyIndex(srcdir=anthology_datadir)
-    people.bibkeys = load_bibkeys(anthology_datadir)
-
-    def correct_caps(name):
-        """
-        Many people submit their names in "ALL CAPS" or "all lowercase".
-        Correct this with heuristics.
- """ - if name.islower() or name.isupper(): - # capitalize all parts - corrected = " ".join(list(map(lambda x: x.capitalize(), name.split()))) - print( - f"-> Correcting capitalization of '{name}' to '{corrected}'", - file=sys.stderr, - ) - name = corrected - - return name - - def disambiguate_name(node, anth_id): - name = PersonName.from_element(node) - ids = people.get_ids(name) - choice = -1 - if len(ids) > 1: - while choice < 0 or choice >= len(ids): - print( - f"({anth_id}): ambiguous author {name}; Please choose from the following:" - ) - for i, id_ in enumerate(ids): - print(f"[{i}] {id_} ({people.get_comment(id_)})") - choice = int(input("--> ")) - - return ids[choice], choice - - # Build list of volumes, confirm uniqueness +def build_volumes( + path: str, venue_index: VenueIndex, venue_keys: List[str] +) -> Tuple[List, Dict]: unseen_venues = [] + volumes = {} - for proceedings in args.proceedings: + for proceedings in path: # args.proceedings meta = read_meta(os.path.join(proceedings, "meta")) venue_abbrev = meta["abbrev"] venue_slug = venue_index.get_slug_from_acronym(venue_abbrev) @@ -249,6 +209,7 @@ def disambiguate_name(node, anth_id): print( f"WARNING: Venue {venue_abbrev} ends in a number, this is probably a mistake" ) + sys.exit(1) if venue_slug not in venue_keys: unseen_venues.append((venue_slug, venue_abbrev, meta["title"])) @@ -265,19 +226,74 @@ def disambiguate_name(node, anth_id): volumes[volume_full_id] = meta if "sig" in meta: - print( - f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:" - ) + print(f"Add this line to sigs/{meta['sig'].lower()}.yaml:") print(f" - {meta['year']}:") print(f" - {volume_full_id} # {meta['booktitle']}") + return unseen_venues, volumes + - # Make sure all venues exist +def create_venues(unseen_venues: List, venue_index: VenueIndex, anthology_datadir: str): + ''' + Create yaml file for new venues + ''' if len(unseen_venues) > 0: for venue in unseen_venues: slug, abbrev, title = venue print(f"Creating venue '{abbrev}' ({title}) slug {slug}") venue_index.add_venue(anthology_datadir, abbrev, title) + +def correct_caps(name): + """ + Many people submit their names in "ALL CAPS" or "all lowercase". + Correct this with heuristics. 
+ """ + if name.islower() or name.isupper(): + # capitalize all parts + corrected = " ".join(list(map(lambda x: x.capitalize(), name.split()))) + print( + f"-> Correcting capitalization of '{name}' to '{corrected}'", + file=sys.stderr, + ) + name = corrected + + return name + + +def disambiguate_name(node, anth_id, people): + name = PersonName.from_element(node) + ids = people.get_ids(name) + choice = -1 + if len(ids) > 1: + while choice < 0 or choice >= len(ids): + print( + f"({anth_id}): ambiguous author {name}; Please choose from the following:" + ) + for i, id_ in enumerate(ids): + print(f"[{i}] {id_} ({people.get_comment(id_)})") + choice = int(input("--> ")) + + return ids[choice], choice + + +def main(args): + volumes = {} + + anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") + venue_index = VenueIndex(srcdir=anthology_datadir) + venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()] + + print(f'{type(venue_index)}, {venue_index}, {venue_keys}') + + # sig_index = SIGIndex(srcdir=anthology_datadir) + + people = AnthologyIndex(srcdir=anthology_datadir) + people.bibkeys = load_bibkeys(anthology_datadir) + + # Build list of volumes, confirm uniqueness + unseen_venues, volumes = build_volumes(args.proceedings, venue_index, venue_keys) + create_venues(unseen_venues, venue_index, anthology_datadir) + # Copy over the PDFs and attachments for volume_full_id, meta in volumes.items(): root_path = os.path.join(meta["path"], "cdrom") @@ -423,7 +439,6 @@ def find_book(): bibfile = paper["bib"] paper_node = bib2xml(bibfile, paper_id_full) - # front matter do exist if paper_node.attrib["id"] == "0": # create metadata subtree meta_node = make_simple_element("meta", parent=volume_node) @@ -434,7 +449,7 @@ def find_book(): paper_node.findall("./author"), paper_node.findall("./editor") ): disamb_name, name_choice = disambiguate_name( - author_or_editor, paper_id_full + author_or_editor, paper_id_full, people ) if name_choice != -1: author_or_editor.attrib["id"] = disamb_name @@ -473,7 +488,6 @@ def find_book(): # modify frontmatter tag paper_node.tag = "frontmatter" del paper_node.attrib["id"] - # not front matter pdf else: make_simple_element("frontmatter", parent=volume_node) # remove unneeded fields @@ -523,7 +537,9 @@ def find_book(): for name_node in chain( paper_node.findall("./author"), paper_node.findall("./editor") ): - disamb_name, name_choice = disambiguate_name(name_node, paper_id_full) + disamb_name, name_choice = disambiguate_name( + name_node, paper_id_full, people + ) if name_choice != -1: name_node.attrib["id"] = disamb_name person = PersonName.from_element(name_node) From e78eb760233aabc803f383b87495701b096be35d Mon Sep 17 00:00:00 2001 From: anthology assist Date: Mon, 20 Mar 2023 14:04:18 -0500 Subject: [PATCH 3/4] refactor. 
---
 bin/ingest.py | 804 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 548 insertions(+), 256 deletions(-)

diff --git a/bin/ingest.py b/bin/ingest.py
index 72eb6f0c27..1ebe38f3eb 100755
--- a/bin/ingest.py
+++ b/bin/ingest.py
@@ -59,7 +59,7 @@
 from anthology.venues import VenueIndex
 
 from itertools import chain
-from typing import Dict, Any, List, Tuple
+from typing import Dict, Any, List, Tuple, Optional
 
 from slugify import slugify
 
@@ -243,7 +243,282 @@ def create_venues(unseen_venues: List, venue_index: VenueIndex, anthology_datadi
     venue_index.add_venue(anthology_datadir, abbrev, title)
 
 
-def correct_caps(name):
+def find_book(meta) -> Optional[str]:
+    """Book location has shifted a bit over the years"""
+    year = meta["year"]
+    venue_name = meta["abbrev"].lower()
+    volume_name = meta["volume"].lower()
+
+    potential_names = [
+        os.path.join(meta["path"], "book.pdf"),
+        os.path.join(
+            meta["path"],
+            "cdrom",
+            f"{year}-{venue_name.lower()}-{volume_name}.pdf",
+        ),
+        os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"),
+    ]
+
+    for book_rel_path in potential_names:
+        if os.path.exists(book_rel_path):
+            return book_rel_path
+
+    return None
+
+
+def copy_pdf_and_attachment(meta, pdfs_dir: str) -> Tuple[Dict, Optional[str]]:  # args.pdfs_dir
+    root_path = os.path.join(meta["path"], "cdrom")
+    collection_id = meta["collection_id"]
+    venue_name = meta["abbrev"].lower()
+    volume_name = meta["volume"].lower()
+    year = meta["year"]
+
+    pdfs_dest_dir = os.path.join(pdfs_dir, venue_name)
+    if not os.path.exists(pdfs_dest_dir):
+        os.makedirs(pdfs_dest_dir)
+
+    # handle proceedings.pdf
+    proceedings_src_path = find_book(meta)
+    proceedings_dest_path = None
+
+    if proceedings_src_path is not None and not args.dry_run:
+        proceedings_dest_path = (
+            os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
+        )
+        maybe_copy(proceedings_src_path, proceedings_dest_path)
+
+    # temp holder for volume
+    volume = dict()
+
+    # handle pdfs
+    pdfs_src_dir = os.path.join(root_path, "pdf")
+    for pdf_file in os.listdir(pdfs_src_dir):
+        # Skip . files
+        if os.path.basename(pdf_file).startswith("."):
+            continue
+
+        # names are {abbrev}{number}.pdf
+        match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
+
+        if match is not None:
+            paper_num = int(match[1])
+            paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"
+
+            bib_path = os.path.join(
+                root_path,
+                "bib",
+                pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
+            )
+
+            pdf_src_path = os.path.join(pdfs_src_dir, pdf_file)
+            pdf_dest_path = os.path.join(
+                pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
+            )
+            if not args.dry_run:
+                maybe_copy(pdf_src_path, pdf_dest_path)
+
+            volume[paper_num] = {
+                "anthology_id": paper_id_full,
+                "bib": bib_path,
+                "pdf": pdf_dest_path,
+                "attachments": [],
+            }
+
+    # handle attachments
+    if os.path.exists(os.path.join(root_path, "additional")):
+        attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
+        if not os.path.exists(attachments_dest_dir):
+            os.makedirs(attachments_dest_dir)
+        for attachment_file in os.listdir(os.path.join(root_path, "additional")):
+            if os.path.basename(attachment_file).startswith("."):
+                continue
+            attachment_file_path = os.path.join(root_path, "additional", attachment_file)
+            # Find the attachment file, using a bit of a fuzzy
+            # match. The fuzzy match is because sometimes people
+            # generate the proceedings with the wrong venue
+            # code. If we correct it, we still need to be able to
+            # find the file.
+ match = re.match(rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file) + if match is None: + print( + f"* Warning: no attachment match for {attachment_file}", + file=sys.stderr, + ) + sys.exit(2) + + paper_num, type_, ext = match.groups() + paper_num = int(paper_num) + + file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" + dest_path = os.path.join(attachments_dest_dir, file_name) + if not args.dry_run and not os.path.exists(dest_path): + log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) + shutil.copyfile(attachment_file_path, dest_path) + + volume[paper_num]["attachments"].append((dest_path, type_)) + + return volume, proceedings_dest_path + + +def check_volumes(volumes: Dict): + sorted_volumes = sorted(volumes.items()) + + return None + + +def create_xml( + volume: Dict, + meta: Dict, + prooceedings_dst_dir, + anthology_dir: str, + ingest_date: str, + people, +): # args.anthology_dir, args.ingest_date + collection_id = meta["collection_id"] + volume_name = meta["volume"].lower() + venue_name = meta["abbrev"].lower() + + collection_file = os.path.join(anthology_dir, "data", "xml", f"{collection_id}.xml") + if os.path.exists(collection_file): + root_node = etree.parse(collection_file).getroot() + else: + root_node = make_simple_element("collection", attrib={"id": collection_id}) + + volume_node = make_simple_element( + "volume", + attrib={"id": volume_name, "ingest-date": ingest_date}, + ) + + # Replace the existing one if present + existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']") + for i, child in enumerate(root_node): + if child.attrib["id"] == volume_name: + root_node[i] = volume_node + break + else: + root_node.append(volume_node) + + meta_node = None + + for _, paper in sorted(volume.items()): + paper_id_full = paper["anthology_id"] + bibfile = paper["bib"] + paper_node = bib2xml(bibfile, paper_id_full) + + if paper_node.attrib["id"] == "0": + # create metadata subtree + meta_node = make_simple_element("meta", parent=volume_node) + title_node = paper_node.find("title") + title_node.tag = "booktitle" + meta_node.append(title_node) + for author_or_editor in chain( + paper_node.findall("./author"), paper_node.findall("./editor") + ): + disamb_name, name_choice = disambiguate_name( + author_or_editor, paper_id_full, people + ) + if name_choice != -1: + author_or_editor.attrib["id"] = disamb_name + person = PersonName.from_element(author_or_editor) + for name_part in author_or_editor: + name_part.text = correct_caps(name_part.text) + meta_node.append(author_or_editor) + author_or_editor.tag = "editor" + + # Here, we grab the publisher from the meta file, in case it's not in the + # frontmatter paper. We don't handle the situation where it's in neither! 
+            publisher_node = paper_node.find("publisher")
+            if publisher_node is None:
+                publisher_node = make_simple_element("publisher", meta["publisher"])
+            meta_node.append(publisher_node)
+
+            # Look for the address in the bib file, then the meta file
+            address_node = paper_node.find("address")
+            if address_node is None:
+                address_node = make_simple_element("address", meta["location"])
+            meta_node.append(address_node)
+
+            meta_node.append(paper_node.find("month"))
+            meta_node.append(paper_node.find("year"))
+            if prooceedings_dst_dir is not None:
+                make_simple_element(
+                    "url",
+                    text=f"{collection_id}-{volume_name}",
+                    attrib={"hash": compute_hash_from_file(prooceedings_dst_dir)},
+                    parent=meta_node,
+                )
+
+            # Add the venue tag
+            make_simple_element("venue", venue_name, parent=meta_node)
+
+            # modify frontmatter tag
+            paper_node.tag = "frontmatter"
+            del paper_node.attrib["id"]
+        else:
+            # remove unneeded fields
+            for child in paper_node:
+                if child.tag in [
+                    "editor",
+                    "address",
+                    "booktitle",
+                    "publisher",
+                    "year",
+                    "month",
+                ]:
+                    paper_node.remove(child)
+
+        url = paper_node.find("./url")
+        if url is not None:
+            url.attrib["hash"] = compute_hash_from_file(paper["pdf"])
+
+        for path, type_ in paper["attachments"]:
+            make_simple_element(
+                "attachment",
+                text=os.path.basename(path),
+                attrib={
+                    "type": type_,
+                    "hash": compute_hash_from_file(path),
+                },
+                parent=paper_node,
+            )
+
+        if len(paper_node) > 0:
+            volume_node.append(paper_node)
+
+        # Normalize
+        for oldnode in paper_node:
+            normalize(oldnode, informat="latex")
+
+        # Adjust the language tag
+        language_node = paper_node.find("./language")
+        if language_node is not None:
+            try:
+                lang = iso639.languages.get(name=language_node.text)
+            except KeyError:
+                raise Exception(f"Can't find language '{language_node.text}'")
+            language_node.text = lang.part3
+
+        # Fix author names
+        for name_node in chain(
+            paper_node.findall("./author"), paper_node.findall("./editor")
+        ):
+            disamb_name, name_choice = disambiguate_name(name_node, paper_id_full, people)
+            if name_choice != -1:
+                name_node.attrib["id"] = disamb_name
+            person = PersonName.from_element(name_node)
+            for name_part in name_node:
+                name_part.text = correct_caps(name_part.text)
+
+        # Other data from the meta file
+        if "isbn" in meta:
+            make_simple_element("isbn", meta["isbn"], parent=meta_node)
+
+    indent(root_node)
+    tree = etree.ElementTree(root_node)
+    tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
+
+
+def correct_caps(name: str) -> str:
     """
     Many people submit their names in "ALL CAPS" or "all lowercase".
     Correct this with heuristics.
@@ -277,7 +552,7 @@ def disambiguate_name(node, anth_id, people): def main(args): - volumes = {} + # volumes = {} anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") venue_index = VenueIndex(srcdir=anthology_datadir) @@ -292,269 +567,286 @@ def main(args): # Build list of volumes, confirm uniqueness unseen_venues, volumes = build_volumes(args.proceedings, venue_index, venue_keys) - create_venues(unseen_venues, venue_index, anthology_datadir) - # Copy over the PDFs and attachments - for volume_full_id, meta in volumes.items(): - root_path = os.path.join(meta["path"], "cdrom") - collection_id = meta["collection_id"] - venue_name = meta["abbrev"].lower() - volume_name = meta["volume"].lower() - year = meta["year"] + for k, v in volumes.items(): + print(f'key is {k} value is {v}') + # check volumes - pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name) - if not os.path.exists(pdfs_dest_dir): - os.makedirs(pdfs_dest_dir) - - def find_book(): - """Book location has shifted a bit over the years""" - - potential_names = [ - os.path.join(meta["path"], "book.pdf"), - os.path.join( - meta["path"], - "cdrom", - f"{year}-{venue_name.lower()}-{volume_name}.pdf", - ), - os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"), - ] - - for book_rel_path in potential_names: - if os.path.exists(book_rel_path): - return book_rel_path + create_venues(unseen_venues, venue_index, anthology_datadir) - return None + # Copy over the PDFs and attachments and create xml + for _, meta in volumes.items(): + volume, prooceedings_dst_dir = copy_pdf_and_attachment(meta, args.pdfs_dir) + create_xml( + volume, + meta, + prooceedings_dst_dir, + args.anthology_dir, + args.ingest_date, + people, + ) + # root_path = os.path.join(meta["path"], "cdrom") + # collection_id = meta["collection_id"] + # venue_name = meta["abbrev"].lower() + # volume_name = meta["volume"].lower() + # year = meta["year"] + + # pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name) + # if not os.path.exists(pdfs_dest_dir): + # os.makedirs(pdfs_dest_dir) + + # def find_book(): + # """Book location has shifted a bit over the years""" + + # potential_names = [ + # os.path.join(meta["path"], "book.pdf"), + # os.path.join( + # meta["path"], + # "cdrom", + # f"{year}-{venue_name.lower()}-{volume_name}.pdf", + # ), + # os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"), + # ] + + # for book_rel_path in potential_names: + # if os.path.exists(book_rel_path): + # return book_rel_path + + # return None # copy the book from the top-level proceedings/ dir, named "VENUE-year.pdf", # or sometimes "book.pdf" - book_src_path = find_book() - book_dest_path = None - if book_src_path is not None and not args.dry_run: - book_dest_path = ( - os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf" - ) - maybe_copy(book_src_path, book_dest_path) + # book_src_path = find_book(meta) + # book_dest_path = None + # if book_src_path is not None and not args.dry_run: + # book_dest_path = ( + # os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf" + # ) + # maybe_copy(book_src_path, book_dest_path) # temp holder for papers in each volume - volume = dict() - - # copy the paper PDFs - pdf_src_dir = os.path.join(root_path, "pdf") - for pdf_file in os.listdir(pdf_src_dir): - # Skip . 
files - if os.path.basename(pdf_file).startswith("."): - continue - - # names are {abbrev}{number}.pdf - match = re.match(rf".*\.(\d+)\.pdf", pdf_file) - - if match is not None: - paper_num = int(match[1]) - paper_id_full = f"{collection_id}-{volume_name}.{paper_num}" - - bib_path = os.path.join( - root_path, - "bib", - pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"), - ) - - pdf_src_path = os.path.join(pdf_src_dir, pdf_file) - pdf_dest_path = os.path.join( - pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf" - ) - if not args.dry_run: - maybe_copy(pdf_src_path, pdf_dest_path) - - volume[paper_num] = { - "anthology_id": paper_id_full, - "bib": bib_path, - "pdf": pdf_dest_path, - "attachments": [], - } - - # copy the attachments - if os.path.exists(os.path.join(root_path, "additional")): - attachments_dest_dir = os.path.join(args.attachments_dir, venue_name) - if not os.path.exists(attachments_dest_dir): - os.makedirs(attachments_dest_dir) - for attachment_file in os.listdir(os.path.join(root_path, "additional")): - if os.path.basename(attachment_file).startswith("."): - continue - attachment_file_path = os.path.join( - root_path, "additional", attachment_file - ) - # Find the attachment file, using a bit of a fuzzy - # match. The fuzzy match is because sometimes people - # generate the proceedings with the wrong venue - # code. If we correct it, we still need to be able to - # find the file. - match = re.match( - rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file - ) - if match is None: - print( - f"* Warning: no attachment match for {attachment_file}", - file=sys.stderr, - ) - sys.exit(2) - - paper_num, type_, ext = match.groups() - paper_num = int(paper_num) - - file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" - dest_path = os.path.join(attachments_dest_dir, file_name) - if not args.dry_run and not os.path.exists(dest_path): - log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) - shutil.copyfile(attachment_file_path, dest_path) - - volume[paper_num]["attachments"].append((dest_path, type_)) + # volume = dict() + + # # copy the paper PDFs + # pdf_src_dir = os.path.join(root_path, "pdf") + # for pdf_file in os.listdir(pdf_src_dir): + # # Skip . 
files + # if os.path.basename(pdf_file).startswith("."): + # continue + + # # names are {abbrev}{number}.pdf + # match = re.match(rf".*\.(\d+)\.pdf", pdf_file) + + # if match is not None: + # paper_num = int(match[1]) + # paper_id_full = f"{collection_id}-{volume_name}.{paper_num}" + + # bib_path = os.path.join( + # root_path, + # "bib", + # pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"), + # ) + + # pdf_src_path = os.path.join(pdf_src_dir, pdf_file) + # pdf_dest_path = os.path.join( + # pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf" + # ) + # if not args.dry_run: + # maybe_copy(pdf_src_path, pdf_dest_path) + + # volume[paper_num] = { + # "anthology_id": paper_id_full, + # "bib": bib_path, + # "pdf": pdf_dest_path, + # "attachments": [], + # } + + # # copy the attachments + # if os.path.exists(os.path.join(root_path, "additional")): + # attachments_dest_dir = os.path.join(args.attachments_dir, venue_name) + # if not os.path.exists(attachments_dest_dir): + # os.makedirs(attachments_dest_dir) + # for attachment_file in os.listdir(os.path.join(root_path, "additional")): + # if os.path.basename(attachment_file).startswith("."): + # continue + # attachment_file_path = os.path.join( + # root_path, "additional", attachment_file + # ) + # # Find the attachment file, using a bit of a fuzzy + # # match. The fuzzy match is because sometimes people + # # generate the proceedings with the wrong venue + # # code. If we correct it, we still need to be able to + # # find the file. + # match = re.match( + # rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file + # ) + # if match is None: + # print( + # f"* Warning: no attachment match for {attachment_file}", + # file=sys.stderr, + # ) + # sys.exit(2) + + # paper_num, type_, ext = match.groups() + # paper_num = int(paper_num) + + # file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" + # dest_path = os.path.join(attachments_dest_dir, file_name) + # if not args.dry_run and not os.path.exists(dest_path): + # log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) + # shutil.copyfile(attachment_file_path, dest_path) + + # volume[paper_num]["attachments"].append((dest_path, type_)) # create xml - collection_file = os.path.join( - args.anthology_dir, "data", "xml", f"{collection_id}.xml" - ) - if os.path.exists(collection_file): - root_node = etree.parse(collection_file).getroot() - else: - root_node = make_simple_element("collection", attrib={"id": collection_id}) - - volume_node = make_simple_element( - "volume", - attrib={"id": volume_name, "ingest-date": args.ingest_date}, - ) - - # Replace the existing one if present - existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']") - for i, child in enumerate(root_node): - if child.attrib["id"] == volume_name: - root_node[i] = volume_node - break - else: - root_node.append(volume_node) - - meta_node = None - - for paper_num, paper in sorted(volume.items()): - paper_id_full = paper["anthology_id"] - bibfile = paper["bib"] - paper_node = bib2xml(bibfile, paper_id_full) - - if paper_node.attrib["id"] == "0": - # create metadata subtree - meta_node = make_simple_element("meta", parent=volume_node) - title_node = paper_node.find("title") - title_node.tag = "booktitle" - meta_node.append(title_node) - for author_or_editor in chain( - paper_node.findall("./author"), paper_node.findall("./editor") - ): - disamb_name, name_choice = disambiguate_name( - author_or_editor, paper_id_full, people - ) - if name_choice != -1: - author_or_editor.attrib["id"] = 
disamb_name - person = PersonName.from_element(author_or_editor) - for name_part in author_or_editor: - name_part.text = correct_caps(name_part.text) - meta_node.append(author_or_editor) - author_or_editor.tag = "editor" - - # Here, we grab the publisher from the meta file, in case it's not in the - # frontmatter paper. We don't handle the situation where it's in neither! - publisher_node = paper_node.find("publisher") - if publisher_node is None: - publisher_node = make_simple_element("publisher", meta["publisher"]) - meta_node.append(publisher_node) - - # Look for the address in the bib file, then the meta file - address_node = paper_node.find("address") - if address_node is None: - address_node = make_simple_element("address", meta["location"]) - meta_node.append(address_node) - - meta_node.append(paper_node.find("month")) - meta_node.append(paper_node.find("year")) - if book_dest_path is not None: - make_simple_element( - "url", - text=f"{collection_id}-{volume_name}", - attrib={"hash": compute_hash_from_file(book_dest_path)}, - parent=meta_node, - ) - - # Add the venue tag - make_simple_element("venue", venue_name, parent=meta_node) - - # modify frontmatter tag - paper_node.tag = "frontmatter" - del paper_node.attrib["id"] - else: - make_simple_element("frontmatter", parent=volume_node) - # remove unneeded fields - for child in paper_node: - if child.tag in [ - "editor", - "address", - "booktitle", - "publisher", - "year", - "month", - ]: - paper_node.remove(child) - - url = paper_node.find("./url") - if url is not None: - url.attrib["hash"] = compute_hash_from_file(paper["pdf"]) - - for path, type_ in paper["attachments"]: - make_simple_element( - "attachment", - text=os.path.basename(path), - attrib={ - "type": type_, - "hash": compute_hash_from_file(path), - }, - parent=paper_node, - ) - - if len(paper_node) > 0: - volume_node.append(paper_node) - - # Normalize - for oldnode in paper_node: - normalize(oldnode, informat="latex") - - # Adjust the language tag - language_node = paper_node.find("./language") - if language_node is not None: - try: - lang = iso639.languages.get(name=language_node.text) - except KeyError: - raise Exception(f"Can't find language '{language_node.text}'") - language_node.text = lang.part3 - - # Fix author names - for name_node in chain( - paper_node.findall("./author"), paper_node.findall("./editor") - ): - disamb_name, name_choice = disambiguate_name( - name_node, paper_id_full, people - ) - if name_choice != -1: - name_node.attrib["id"] = disamb_name - person = PersonName.from_element(name_node) - for name_part in name_node: - name_part.text = correct_caps(name_part.text) - - # Other data from the meta file - if "isbn" in meta: - make_simple_element("isbn", meta["isbn"], parent=meta_node) - - indent(root_node) - tree = etree.ElementTree(root_node) - tree.write( - collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True - ) + # collection_id = meta["collection_id"] + # volume_name = meta["volume"].lower() + + # collection_file = os.path.join( + # args.anthology_dir, "data", "xml", f"{collection_id}.xml" + # ) + # if os.path.exists(collection_file): + # root_node = etree.parse(collection_file).getroot() + # else: + # root_node = make_simple_element("collection", attrib={"id": collection_id}) + + # volume_node = make_simple_element( + # "volume", + # attrib={"id": volume_name, "ingest-date": args.ingest_date}, + # ) + + # # Replace the existing one if present + # existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']") + # for i, 
child in enumerate(root_node): + # if child.attrib["id"] == volume_name: + # root_node[i] = volume_node + # break + # else: + # root_node.append(volume_node) + + # meta_node = None + + # for paper_num, paper in sorted(volume.items()): + # paper_id_full = paper["anthology_id"] + # bibfile = paper["bib"] + # paper_node = bib2xml(bibfile, paper_id_full) + + # if paper_node.attrib["id"] == "0": + # # create metadata subtree + # meta_node = make_simple_element("meta", parent=volume_node) + # title_node = paper_node.find("title") + # title_node.tag = "booktitle" + # meta_node.append(title_node) + # for author_or_editor in chain( + # paper_node.findall("./author"), paper_node.findall("./editor") + # ): + # disamb_name, name_choice = disambiguate_name( + # author_or_editor, paper_id_full, people + # ) + # if name_choice != -1: + # author_or_editor.attrib["id"] = disamb_name + # person = PersonName.from_element(author_or_editor) + # for name_part in author_or_editor: + # name_part.text = correct_caps(name_part.text) + # meta_node.append(author_or_editor) + # author_or_editor.tag = "editor" + + # # Here, we grab the publisher from the meta file, in case it's not in the + # # frontmatter paper. We don't handle the situation where it's in neither! + # publisher_node = paper_node.find("publisher") + # if publisher_node is None: + # publisher_node = make_simple_element("publisher", meta["publisher"]) + # meta_node.append(publisher_node) + + # # Look for the address in the bib file, then the meta file + # address_node = paper_node.find("address") + # if address_node is None: + # address_node = make_simple_element("address", meta["location"]) + # meta_node.append(address_node) + + # meta_node.append(paper_node.find("month")) + # meta_node.append(paper_node.find("year")) + # if book_dest_path is not None: + # make_simple_element( + # "url", + # text=f"{collection_id}-{volume_name}", + # attrib={"hash": compute_hash_from_file(book_dest_path)}, + # parent=meta_node, + # ) + + # # Add the venue tag + # make_simple_element("venue", venue_name, parent=meta_node) + + # # modify frontmatter tag + # paper_node.tag = "frontmatter" + # del paper_node.attrib["id"] + # else: + # make_simple_element("frontmatter", parent=volume_node) + # # remove unneeded fields + # for child in paper_node: + # if child.tag in [ + # "editor", + # "address", + # "booktitle", + # "publisher", + # "year", + # "month", + # ]: + # paper_node.remove(child) + + # url = paper_node.find("./url") + # if url is not None: + # url.attrib["hash"] = compute_hash_from_file(paper["pdf"]) + + # for path, type_ in paper["attachments"]: + # make_simple_element( + # "attachment", + # text=os.path.basename(path), + # attrib={ + # "type": type_, + # "hash": compute_hash_from_file(path), + # }, + # parent=paper_node, + # ) + + # if len(paper_node) > 0: + # volume_node.append(paper_node) + + # # Normalize + # for oldnode in paper_node: + # normalize(oldnode, informat="latex") + + # # Adjust the language tag + # language_node = paper_node.find("./language") + # if language_node is not None: + # try: + # lang = iso639.languages.get(name=language_node.text) + # except KeyError: + # raise Exception(f"Can't find language '{language_node.text}'") + # language_node.text = lang.part3 + + # # Fix author names + # for name_node in chain( + # paper_node.findall("./author"), paper_node.findall("./editor") + # ): + # disamb_name, name_choice = disambiguate_name( + # name_node, paper_id_full, people + # ) + # if name_choice != -1: + # name_node.attrib["id"] = disamb_name + 
# person = PersonName.from_element(name_node) + # for name_part in name_node: + # name_part.text = correct_caps(name_part.text) + + # # Other data from the meta file + # if "isbn" in meta: + # make_simple_element("isbn", meta["isbn"], parent=meta_node) + + # indent(root_node) + # tree = etree.ElementTree(root_node) + # tree.write( + # collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True + # ) if __name__ == "__main__": From dda480dd1f0fe108fd022e6633e4392eb7e09154 Mon Sep 17 00:00:00 2001 From: anthology assist Date: Tue, 21 Mar 2023 21:31:36 -0500 Subject: [PATCH 4/4] ingestion script update, closes #2439. --- bin/ingest.py | 422 +++++++++++++------------------------------------- 1 file changed, 110 insertions(+), 312 deletions(-) diff --git a/bin/ingest.py b/bin/ingest.py index 1ebe38f3eb..b0680ad08f 100755 --- a/bin/ingest.py +++ b/bin/ingest.py @@ -61,8 +61,6 @@ from itertools import chain from typing import Dict, Any, List, Tuple, Optional -from slugify import slugify - def log(text: str, fake: bool = False): message = "[DRY RUN] " if fake else "" @@ -360,10 +358,14 @@ def copy_pdf_and_attachment(meta, pdfs_dir: str) -> Dict: # args.pdfs_dir return volume, proceedings_dest_path -def check_volumes(volumes: Dict): - sorted_volumes = sorted(volumes.items()) - - return None +def check_frontmatter(volume: Dict) -> bool: + ''' + Check if front matter pdf exists + ''' + for _, volume_content in volume.items(): + if volume_content['anthology_id'].split('.')[-1] == '0': + return True + return False def create_xml( @@ -373,7 +375,7 @@ def create_xml( anthology_dir: str, ingest_date: str, people, -): # args.anthology_dir, args.ingest_date +): collection_id = meta["collection_id"] volume_name = meta["volume"].lower() venue_name = meta["abbrev"].lower() @@ -400,46 +402,108 @@ def create_xml( meta_node = None + # Flag to make sure meta and frontmatter block only gets generated once + set_meta_frontmatter_block = check_frontmatter(volume) + for _, paper in sorted(volume.items()): paper_id_full = paper["anthology_id"] bibfile = paper["bib"] paper_node = bib2xml(bibfile, paper_id_full) - if paper_node.attrib["id"] == "0": + # 0 is the front matter pdf + if paper_node.attrib["id"] == "0" or set_meta_frontmatter_block is False: # create metadata subtree meta_node = make_simple_element("meta", parent=volume_node) - title_node = paper_node.find("title") - title_node.tag = "booktitle" + + if paper_node.attrib["id"] == "0": + title_node = paper_node.find("title") + title_node.tag = "booktitle" + else: + title_node = make_simple_element( + "booktitle", meta['booktitle'], parent=meta_node + ) meta_node.append(title_node) - for author_or_editor in chain( - paper_node.findall("./author"), paper_node.findall("./editor") - ): - disamb_name, name_choice = disambiguate_name( - author_or_editor, paper_id_full, people + + # editors + if paper_node.attrib["id"] == "0": + author_or_editors = chain( + paper_node.findall("./author"), paper_node.findall("./editor") ) - if name_choice != -1: - author_or_editor.attrib["id"] = disamb_name - person = PersonName.from_element(author_or_editor) - for name_part in author_or_editor: - name_part.text = correct_caps(name_part.text) - meta_node.append(author_or_editor) - author_or_editor.tag = "editor" - - # Here, we grab the publisher from the meta file, in case it's not in the - # frontmatter paper. We don't handle the situation where it's in neither! 
- publisher_node = paper_node.find("publisher") - if publisher_node is None: - publisher_node = make_simple_element("publisher", meta["publisher"]) - meta_node.append(publisher_node) - # Look for the address in the bib file, then the meta file - address_node = paper_node.find("address") - if address_node is None: - address_node = make_simple_element("address", meta["location"]) + for author_or_editor in author_or_editors: + disamb_name, name_choice = disambiguate_name( + author_or_editor, paper_id_full, people + ) + if name_choice != -1: + author_or_editor.attrib["id"] = disamb_name + for name_part in author_or_editor: + name_part.text = correct_caps(name_part.text) + meta_node.append(author_or_editor) + author_or_editor.tag = "editor" + else: + editors = meta.get('chairs') + if len(editors) == 0: + print(f'chairs are missing in meta file') + sys.exit(2) + for editor in editors: + name_node = make_simple_element('editor', parent=meta_node) + make_simple_element( + "first", ' '.join(editor.split(' ')[0:-1]), parent=name_node + ) + make_simple_element("last", editor.split(' ')[-1], parent=name_node) + + # publisher info + if meta.get('publisher') is None: + print('publisher is missing in meta') + sys.exit(2) + publisher_node = ( + paper_node.find("publisher") + if ( + paper_node.attrib["id"] == "0" + and paper_node.find("publisher") is not None + ) + else make_simple_element("publisher", meta.get("publisher")) + ) + meta_node.append(publisher_node) + # address info + if meta.get('location') is None: + print('location is missing in meta') + sys.exit(2) + address_node = ( + paper_node.find("address") + if ( + paper_node.attrib["id"] == "0" + and paper_node.find("address") is not None + ) + else make_simple_element("address", meta.get("location")) + ) meta_node.append(address_node) + # month info + if meta.get('month') is None: + print('month is missing in meta') + sys.exit(2) + month_node = ( + paper_node.find("month") + if ( + paper_node.attrib["id"] == "0" + and paper_node.find("month") is not None + ) + else make_simple_element("month", meta.get("month")) + ) + meta_node.append(month_node) + # year info + if meta.get('year') is None: + print('year is missing in meta') + sys.exit(2) + year_node = ( + paper_node.find("year") + if ( + paper_node.attrib["id"] == "0" and paper_node.find("year") is not None + ) + else make_simple_element("year", meta.get("year")) + ) + meta_node.append(year_node) - meta_node.append(paper_node.find("month")) - meta_node.append(paper_node.find("year")) if prooceedings_dst_dir is not None: make_simple_element( "url", @@ -451,11 +515,17 @@ def create_xml( # Add the venue tag make_simple_element("venue", venue_name, parent=meta_node) - # modify frontmatter tag - paper_node.tag = "frontmatter" - del paper_node.attrib["id"] - else: - # remove unneeded fields + # Front matter block + if paper_node.attrib["id"] == "0": + # modify frontmatter tag + paper_node.tag = "frontmatter" + del paper_node.attrib["id"] + else: + make_simple_element("frontmatter", parent=volume_node) + set_meta_frontmatter_block = True + + if paper_node.attrib["id"] != "0": + print(f'onto removing stuff for paper {paper_node.attrib["id"]}') for child in paper_node: if child.tag in [ "editor", @@ -552,26 +622,16 @@ def disambiguate_name(node, anth_id, people): def main(args): - # volumes = {} - anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") venue_index = VenueIndex(srcdir=anthology_datadir) venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()] - 
print(f'{type(venue_index)}, {venue_index}, {venue_keys}') - - # sig_index = SIGIndex(srcdir=anthology_datadir) - people = AnthologyIndex(srcdir=anthology_datadir) people.bibkeys = load_bibkeys(anthology_datadir) # Build list of volumes, confirm uniqueness unseen_venues, volumes = build_volumes(args.proceedings, venue_index, venue_keys) - for k, v in volumes.items(): - print(f'key is {k} value is {v}') - # check volumes - create_venues(unseen_venues, venue_index, anthology_datadir) # Copy over the PDFs and attachments and create xml @@ -585,268 +645,6 @@ def main(args): args.ingest_date, people, ) - # root_path = os.path.join(meta["path"], "cdrom") - # collection_id = meta["collection_id"] - # venue_name = meta["abbrev"].lower() - # volume_name = meta["volume"].lower() - # year = meta["year"] - - # pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name) - # if not os.path.exists(pdfs_dest_dir): - # os.makedirs(pdfs_dest_dir) - - # def find_book(): - # """Book location has shifted a bit over the years""" - - # potential_names = [ - # os.path.join(meta["path"], "book.pdf"), - # os.path.join( - # meta["path"], - # "cdrom", - # f"{year}-{venue_name.lower()}-{volume_name}.pdf", - # ), - # os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"), - # ] - - # for book_rel_path in potential_names: - # if os.path.exists(book_rel_path): - # return book_rel_path - - # return None - - # copy the book from the top-level proceedings/ dir, named "VENUE-year.pdf", - # or sometimes "book.pdf" - # book_src_path = find_book(meta) - # book_dest_path = None - # if book_src_path is not None and not args.dry_run: - # book_dest_path = ( - # os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf" - # ) - # maybe_copy(book_src_path, book_dest_path) - - # temp holder for papers in each volume - # volume = dict() - - # # copy the paper PDFs - # pdf_src_dir = os.path.join(root_path, "pdf") - # for pdf_file in os.listdir(pdf_src_dir): - # # Skip . files - # if os.path.basename(pdf_file).startswith("."): - # continue - - # # names are {abbrev}{number}.pdf - # match = re.match(rf".*\.(\d+)\.pdf", pdf_file) - - # if match is not None: - # paper_num = int(match[1]) - # paper_id_full = f"{collection_id}-{volume_name}.{paper_num}" - - # bib_path = os.path.join( - # root_path, - # "bib", - # pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"), - # ) - - # pdf_src_path = os.path.join(pdf_src_dir, pdf_file) - # pdf_dest_path = os.path.join( - # pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf" - # ) - # if not args.dry_run: - # maybe_copy(pdf_src_path, pdf_dest_path) - - # volume[paper_num] = { - # "anthology_id": paper_id_full, - # "bib": bib_path, - # "pdf": pdf_dest_path, - # "attachments": [], - # } - - # # copy the attachments - # if os.path.exists(os.path.join(root_path, "additional")): - # attachments_dest_dir = os.path.join(args.attachments_dir, venue_name) - # if not os.path.exists(attachments_dest_dir): - # os.makedirs(attachments_dest_dir) - # for attachment_file in os.listdir(os.path.join(root_path, "additional")): - # if os.path.basename(attachment_file).startswith("."): - # continue - # attachment_file_path = os.path.join( - # root_path, "additional", attachment_file - # ) - # # Find the attachment file, using a bit of a fuzzy - # # match. The fuzzy match is because sometimes people - # # generate the proceedings with the wrong venue - # # code. If we correct it, we still need to be able to - # # find the file. 
- # match = re.match( - # rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file - # ) - # if match is None: - # print( - # f"* Warning: no attachment match for {attachment_file}", - # file=sys.stderr, - # ) - # sys.exit(2) - - # paper_num, type_, ext = match.groups() - # paper_num = int(paper_num) - - # file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" - # dest_path = os.path.join(attachments_dest_dir, file_name) - # if not args.dry_run and not os.path.exists(dest_path): - # log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) - # shutil.copyfile(attachment_file_path, dest_path) - - # volume[paper_num]["attachments"].append((dest_path, type_)) - - # create xml - # collection_id = meta["collection_id"] - # volume_name = meta["volume"].lower() - - # collection_file = os.path.join( - # args.anthology_dir, "data", "xml", f"{collection_id}.xml" - # ) - # if os.path.exists(collection_file): - # root_node = etree.parse(collection_file).getroot() - # else: - # root_node = make_simple_element("collection", attrib={"id": collection_id}) - - # volume_node = make_simple_element( - # "volume", - # attrib={"id": volume_name, "ingest-date": args.ingest_date}, - # ) - - # # Replace the existing one if present - # existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']") - # for i, child in enumerate(root_node): - # if child.attrib["id"] == volume_name: - # root_node[i] = volume_node - # break - # else: - # root_node.append(volume_node) - - # meta_node = None - - # for paper_num, paper in sorted(volume.items()): - # paper_id_full = paper["anthology_id"] - # bibfile = paper["bib"] - # paper_node = bib2xml(bibfile, paper_id_full) - - # if paper_node.attrib["id"] == "0": - # # create metadata subtree - # meta_node = make_simple_element("meta", parent=volume_node) - # title_node = paper_node.find("title") - # title_node.tag = "booktitle" - # meta_node.append(title_node) - # for author_or_editor in chain( - # paper_node.findall("./author"), paper_node.findall("./editor") - # ): - # disamb_name, name_choice = disambiguate_name( - # author_or_editor, paper_id_full, people - # ) - # if name_choice != -1: - # author_or_editor.attrib["id"] = disamb_name - # person = PersonName.from_element(author_or_editor) - # for name_part in author_or_editor: - # name_part.text = correct_caps(name_part.text) - # meta_node.append(author_or_editor) - # author_or_editor.tag = "editor" - - # # Here, we grab the publisher from the meta file, in case it's not in the - # # frontmatter paper. We don't handle the situation where it's in neither! 
- # publisher_node = paper_node.find("publisher") - # if publisher_node is None: - # publisher_node = make_simple_element("publisher", meta["publisher"]) - # meta_node.append(publisher_node) - - # # Look for the address in the bib file, then the meta file - # address_node = paper_node.find("address") - # if address_node is None: - # address_node = make_simple_element("address", meta["location"]) - # meta_node.append(address_node) - - # meta_node.append(paper_node.find("month")) - # meta_node.append(paper_node.find("year")) - # if book_dest_path is not None: - # make_simple_element( - # "url", - # text=f"{collection_id}-{volume_name}", - # attrib={"hash": compute_hash_from_file(book_dest_path)}, - # parent=meta_node, - # ) - - # # Add the venue tag - # make_simple_element("venue", venue_name, parent=meta_node) - - # # modify frontmatter tag - # paper_node.tag = "frontmatter" - # del paper_node.attrib["id"] - # else: - # make_simple_element("frontmatter", parent=volume_node) - # # remove unneeded fields - # for child in paper_node: - # if child.tag in [ - # "editor", - # "address", - # "booktitle", - # "publisher", - # "year", - # "month", - # ]: - # paper_node.remove(child) - - # url = paper_node.find("./url") - # if url is not None: - # url.attrib["hash"] = compute_hash_from_file(paper["pdf"]) - - # for path, type_ in paper["attachments"]: - # make_simple_element( - # "attachment", - # text=os.path.basename(path), - # attrib={ - # "type": type_, - # "hash": compute_hash_from_file(path), - # }, - # parent=paper_node, - # ) - - # if len(paper_node) > 0: - # volume_node.append(paper_node) - - # # Normalize - # for oldnode in paper_node: - # normalize(oldnode, informat="latex") - - # # Adjust the language tag - # language_node = paper_node.find("./language") - # if language_node is not None: - # try: - # lang = iso639.languages.get(name=language_node.text) - # except KeyError: - # raise Exception(f"Can't find language '{language_node.text}'") - # language_node.text = lang.part3 - - # # Fix author names - # for name_node in chain( - # paper_node.findall("./author"), paper_node.findall("./editor") - # ): - # disamb_name, name_choice = disambiguate_name( - # name_node, paper_id_full, people - # ) - # if name_choice != -1: - # name_node.attrib["id"] = disamb_name - # person = PersonName.from_element(name_node) - # for name_part in name_node: - # name_part.text = correct_caps(name_part.text) - - # # Other data from the meta file - # if "isbn" in meta: - # make_simple_element("isbn", meta["isbn"], parent=meta_node) - - # indent(root_node) - # tree = etree.ElementTree(root_node) - # tree.write( - # collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True - # ) if __name__ == "__main__":
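
Reviewer note: with PATCH 4/4 applied, the ingestion flow in bin/ingest.py reduces to the four helpers introduced in this series. The sketch below is a condensed restatement of the final main() for review purposes only; every name in it is taken from the diffs above, while the argparse wiring (args.proceedings, args.pdfs_dir, args.anthology_dir, args.ingest_date), which this series does not touch, is assumed to exist as used.

    def main(args):
        anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
        venue_index = VenueIndex(srcdir=anthology_datadir)
        venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

        people = AnthologyIndex(srcdir=anthology_datadir)
        people.bibkeys = load_bibkeys(anthology_datadir)

        # Stage 1: read each proceedings dir's meta file, confirm volume uniqueness.
        unseen_venues, volumes = build_volumes(args.proceedings, venue_index, venue_keys)
        # Stage 2: create yaml entries for venues the anthology has not seen before.
        create_venues(unseen_venues, venue_index, anthology_datadir)
        # Stage 3: per volume, copy PDFs and attachments, then write the collection XML.
        for _, meta in volumes.items():
            volume, proceedings_pdf = copy_pdf_and_attachment(meta, args.pdfs_dir)
            create_xml(
                volume, meta, proceedings_pdf, args.anthology_dir, args.ingest_date, people
            )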