diff --git a/bin/anthology/venues.py b/bin/anthology/venues.py
index 6f192a42ca..8acffdf0c8 100644
--- a/bin/anthology/venues.py
+++ b/bin/anthology/venues.py
@@ -85,7 +85,7 @@ def add_venue(self, directory, acronym, title, is_acl=False, url=None):
         """
         Adds a new venue.
 
-        Everytime a new venue is created, the corresponding yaml file is created as welll.
+        Every time a new venue is created, the corresponding YAML file is created as well.
         """
         slug = VenueIndex.get_slug_from_acronym(acronym)
 
diff --git a/bin/ingest.py b/bin/ingest.py
index 260aa4b150..b0680ad08f 100755
--- a/bin/ingest.py
+++ b/bin/ingest.py
@@ -59,9 +59,7 @@
 from anthology.venues import VenueIndex
 from itertools import chain
 
-from typing import Dict, Any
-
-from slugify import slugify
+from typing import Any, Dict, List, Optional, Tuple
 
 
 def log(text: str, fake: bool = False):
@@ -190,53 +188,13 @@ def bib2xml(bibfilename, anthology_id):
     return paper
 
 
-def main(args):
-    volumes = {}
-
-    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
-    venue_index = VenueIndex(srcdir=anthology_datadir)
-    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]
-
-    sig_index = SIGIndex(srcdir=anthology_datadir)
-
-    people = AnthologyIndex(srcdir=anthology_datadir)
-    people.bibkeys = load_bibkeys(anthology_datadir)
-
-    def correct_caps(name):
-        """
-        Many people submit their names in "ALL CAPS" or "all lowercase".
-        Correct this with heuristics.
-        """
-        if name.islower() or name.isupper():
-            # capitalize all parts
-            corrected = " ".join(list(map(lambda x: x.capitalize(), name.split())))
-            print(
-                f"-> Correcting capitalization of '{name}' to '{corrected}'",
-                file=sys.stderr,
-            )
-            name = corrected
-
-        return name
-
-    def disambiguate_name(node, anth_id):
-        name = PersonName.from_element(node)
-        ids = people.get_ids(name)
-        choice = -1
-        if len(ids) > 1:
-            while choice < 0 or choice >= len(ids):
-                print(
-                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
-                )
-                for i, id_ in enumerate(ids):
-                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
-                choice = int(input("--> "))
-
-        return ids[choice], choice
-
-    # Build list of volumes, confirm uniqueness
+def build_volumes(
+    proceedings_paths: List[str], venue_index: VenueIndex, venue_keys: List[str]
+) -> Tuple[List[Tuple[str, str, str]], Dict[str, Any]]:
+    """Read each proceedings meta file, building the dict of volumes to
+    ingest and the list of venues that are not yet in the venue index."""
     unseen_venues = []
+    volumes = {}
 
-    for proceedings in args.proceedings:
+    for proceedings in proceedings_paths:
         meta = read_meta(os.path.join(proceedings, "meta"))
         venue_abbrev = meta["abbrev"]
         venue_slug = venue_index.get_slug_from_acronym(venue_abbrev)
@@ -249,6 +207,7 @@ def disambiguate_name(node, anth_id):
-            print(
-                f"WARNING: Venue {venue_abbrev} ends in a number, this is probably a mistake"
-            )
+            print(
+                f"FATAL: Venue {venue_abbrev} ends in a number, this is probably a mistake"
+            )
+            sys.exit(1)
 
         if venue_slug not in venue_keys:
             unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))
@@ -265,276 +224,426 @@ def disambiguate_name(node, anth_id):
         volumes[volume_full_id] = meta
 
         if "sig" in meta:
-            print(
-                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
-            )
+            print(f"Add this line to sigs/{meta['sig'].lower()}.yaml:")
             print(f"  - {meta['year']}:")
             print(f"    - {volume_full_id} # {meta['booktitle']}")
 
+    return unseen_venues, volumes
+
 
-    # Make sure all venues exist
+def create_venues(unseen_venues: List, venue_index: VenueIndex, anthology_datadir: str):
+    """Create the YAML file for each venue that is not yet in the venue index."""
     if len(unseen_venues) > 0:
         for venue in unseen_venues:
             slug, abbrev, title = venue
             print(f"Creating venue '{abbrev}' ({title}) slug {slug}")
             venue_index.add_venue(anthology_datadir, abbrev, title)
 
-    # Copy over the PDFs and attachments
-    for volume_full_id, meta in volumes.items():
-        root_path = os.path.join(meta["path"], "cdrom")
-        collection_id = meta["collection_id"]
-        venue_name = meta["abbrev"].lower()
-        volume_name = meta["volume"].lower()
-        year = meta["year"]
-
-        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
-        if not os.path.exists(pdfs_dest_dir):
-            os.makedirs(pdfs_dest_dir)
-
-        def find_book():
-            """Book location has shifted a bit over the years"""
-
-            potential_names = [
-                os.path.join(meta["path"], "book.pdf"),
-                os.path.join(
-                    meta["path"],
-                    "cdrom",
-                    f"{year}-{venue_name.lower()}-{volume_name}.pdf",
-                ),
-                os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"),
-            ]
-
-            for book_rel_path in potential_names:
-                if os.path.exists(book_rel_path):
-                    return book_rel_path
-
-            return None
-
-        # copy the book from the top-level proceedings/ dir, named "VENUE-year.pdf",
-        # or sometimes "book.pdf"
-        book_src_path = find_book()
-        book_dest_path = None
-        if book_src_path is not None and not args.dry_run:
-            book_dest_path = (
-                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
-            )
-            maybe_copy(book_src_path, book_dest_path)
-        # temp holder for papers in each volume
-        volume = dict()
+def find_book(meta) -> Optional[str]:
+    """The book location has shifted a bit over the years."""
+    year = meta["year"]
+    venue_name = meta["abbrev"].lower()
+    volume_name = meta["volume"].lower()
+
+    potential_names = [
+        os.path.join(meta["path"], "book.pdf"),
+        os.path.join(
+            meta["path"],
+            "cdrom",
+            f"{year}-{venue_name}-{volume_name}.pdf",
+        ),
+        os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"),
+    ]
 
-        # copy the paper PDFs
-        pdf_src_dir = os.path.join(root_path, "pdf")
-        for pdf_file in os.listdir(pdf_src_dir):
-            # Skip . files
-            if os.path.basename(pdf_file).startswith("."):
-                continue
+    for book_rel_path in potential_names:
+        if os.path.exists(book_rel_path):
+            return book_rel_path
 
-            # names are {abbrev}{number}.pdf
-            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
+    return None
 
-            if match is not None:
-                paper_num = int(match[1])
-                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"
 
-                bib_path = os.path.join(
-                    root_path,
-                    "bib",
-                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
-                )
+def copy_pdf_and_attachment(
+    meta, pdfs_dir: str, attachments_dir: str, dry_run: bool
+) -> Tuple[Dict, Optional[str]]:
+    """Copy a volume's PDFs and attachments into place, returning its papers
+    (keyed by paper number) and the destination path of the proceedings PDF,
+    if one was found."""
+    root_path = os.path.join(meta["path"], "cdrom")
+    collection_id = meta["collection_id"]
+    venue_name = meta["abbrev"].lower()
+    volume_name = meta["volume"].lower()
+    year = meta["year"]
 
-                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
-                pdf_dest_path = os.path.join(
-                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
-                )
-                if not args.dry_run:
-                    maybe_copy(pdf_src_path, pdf_dest_path)
-
-                volume[paper_num] = {
-                    "anthology_id": paper_id_full,
-                    "bib": bib_path,
-                    "pdf": pdf_dest_path,
-                    "attachments": [],
-                }
-
-        # copy the attachments
-        if os.path.exists(os.path.join(root_path, "additional")):
-            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
-            if not os.path.exists(attachments_dest_dir):
-                os.makedirs(attachments_dest_dir)
-            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
-                if os.path.basename(attachment_file).startswith("."):
-                    continue
-                attachment_file_path = os.path.join(
-                    root_path, "additional", attachment_file
-                )
-                # Find the attachment file, using a bit of a fuzzy
-                # match. The fuzzy match is because sometimes people
-                # generate the proceedings with the wrong venue
-                # code. If we correct it, we still need to be able to
-                # find the file.
-                match = re.match(
-                    rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
-                )
-                if match is None:
-                    print(
-                        f"* Warning: no attachment match for {attachment_file}",
-                        file=sys.stderr,
-                    )
-                    sys.exit(2)
+    pdfs_dest_dir = os.path.join(pdfs_dir, venue_name)
+    if not os.path.exists(pdfs_dest_dir):
+        os.makedirs(pdfs_dest_dir)
 
-                paper_num, type_, ext = match.groups()
-                paper_num = int(paper_num)
+    # handle the proceedings PDF
+    proceedings_src_path = find_book(meta)
+    proceedings_dest_path = None
 
-                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
-                dest_path = os.path.join(attachments_dest_dir, file_name)
-                if not args.dry_run and not os.path.exists(dest_path):
-                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
-                    shutil.copyfile(attachment_file_path, dest_path)
+    if proceedings_src_path is not None and not dry_run:
+        proceedings_dest_path = (
+            os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
+        )
+        maybe_copy(proceedings_src_path, proceedings_dest_path)
 
-                volume[paper_num]["attachments"].append((dest_path, type_))
+    # papers in this volume, keyed by paper number
+    volume = dict()
 
-        # create xml
-        collection_file = os.path.join(
-            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
-        )
-        if os.path.exists(collection_file):
-            root_node = etree.parse(collection_file).getroot()
-        else:
-            root_node = make_simple_element("collection", attrib={"id": collection_id})
+    # handle the paper PDFs
+    pdfs_src_dir = os.path.join(root_path, "pdf")
+    for pdf_file in os.listdir(pdfs_src_dir):
+        # Skip . files
+        if os.path.basename(pdf_file).startswith("."):
+            continue
 
-        volume_node = make_simple_element(
-            "volume",
-            attrib={"id": volume_name, "ingest-date": args.ingest_date},
-        )
+        # file names end in .{number}.pdf
+        match = re.match(r".*\.(\d+)\.pdf", pdf_file)
 
-        # Replace the existing one if present
-        existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']")
-        for i, child in enumerate(root_node):
-            if child.attrib["id"] == volume_name:
-                root_node[i] = volume_node
-                break
-        else:
-            root_node.append(volume_node)
+        if match is not None:
+            paper_num = int(match[1])
+            paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"
+
+            bib_path = os.path.join(
+                root_path,
+                "bib",
+                pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
+            )
+
+            pdf_src_path = os.path.join(pdfs_src_dir, pdf_file)
+            pdf_dest_path = os.path.join(
+                pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
+            )
+            if not dry_run:
+                maybe_copy(pdf_src_path, pdf_dest_path)
+
+            volume[paper_num] = {
+                "anthology_id": paper_id_full,
+                "bib": bib_path,
+                "pdf": pdf_dest_path,
+                "attachments": [],
+            }
+
+    # handle attachments
+    if os.path.exists(os.path.join(root_path, "additional")):
+        attachments_dest_dir = os.path.join(attachments_dir, venue_name)
+        if not os.path.exists(attachments_dest_dir):
+            os.makedirs(attachments_dest_dir)
+        for attachment_file in os.listdir(os.path.join(root_path, "additional")):
+            if os.path.basename(attachment_file).startswith("."):
+                continue
+            attachment_file_path = os.path.join(root_path, "additional", attachment_file)
+            # Find the attachment file, using a bit of a fuzzy
+            # match. The fuzzy match is because sometimes people
+            # generate the proceedings with the wrong venue
+            # code. If we correct it, we still need to be able to
+            # find the file.
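+            # A hypothetical example: with year=2022, an attachment named
+            # "2022.naacl-main.123_notes.pdf" would parse as paper_num=123,
+            # type_="notes", ext="pdf".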
+ match = re.match(rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file) + if match is None: + print( + f"* Warning: no attachment match for {attachment_file}", + file=sys.stderr, + ) + sys.exit(2) + + paper_num, type_, ext = match.groups() + paper_num = int(paper_num) + + file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" + dest_path = os.path.join(attachments_dest_dir, file_name) + if not args.dry_run and not os.path.exists(dest_path): + log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) + shutil.copyfile(attachment_file_path, dest_path) + + volume[paper_num]["attachments"].append((dest_path, type_)) + + return volume, proceedings_dest_path + + +def check_frontmatter(volume: Dict) -> bool: + ''' + Check if front matter pdf exists + ''' + for _, volume_content in volume.items(): + if volume_content['anthology_id'].split('.')[-1] == '0': + return True + return False + + +def create_xml( + volume: Dict, + meta: Dict, + prooceedings_dst_dir, + anthology_dir: str, + ingest_date: str, + people, +): + collection_id = meta["collection_id"] + volume_name = meta["volume"].lower() + venue_name = meta["abbrev"].lower() + + collection_file = os.path.join(anthology_dir, "data", "xml", f"{collection_id}.xml") + if os.path.exists(collection_file): + root_node = etree.parse(collection_file).getroot() + else: + root_node = make_simple_element("collection", attrib={"id": collection_id}) + + volume_node = make_simple_element( + "volume", + attrib={"id": volume_name, "ingest-date": ingest_date}, + ) + + # Replace the existing one if present + existing_volume_node = root_node.find(f"./volume[@id='{volume_name}']") + for i, child in enumerate(root_node): + if child.attrib["id"] == volume_name: + root_node[i] = volume_node + break else: root_node.append(volume_node) - meta_node = None + meta_node = None + + # Flag to make sure meta and frontmatter block only gets generated once + set_meta_frontmatter_block = check_frontmatter(volume) + + for _, paper in sorted(volume.items()): + paper_id_full = paper["anthology_id"] + bibfile = paper["bib"] + paper_node = bib2xml(bibfile, paper_id_full) - for paper_num, paper in sorted(volume.items()): - paper_id_full = paper["anthology_id"] - bibfile = paper["bib"] - paper_node = bib2xml(bibfile, paper_id_full) + # 0 is the front matter pdf + if paper_node.attrib["id"] == "0" or set_meta_frontmatter_block is False: + # create metadata subtree + meta_node = make_simple_element("meta", parent=volume_node) if paper_node.attrib["id"] == "0": - # create metadata subtree - meta_node = make_simple_element("meta", parent=volume_node) title_node = paper_node.find("title") title_node.tag = "booktitle" - meta_node.append(title_node) - for author_or_editor in chain( + else: + title_node = make_simple_element( + "booktitle", meta['booktitle'], parent=meta_node + ) + meta_node.append(title_node) + + # editors + if paper_node.attrib["id"] == "0": + author_or_editors = chain( paper_node.findall("./author"), paper_node.findall("./editor") - ): + ) + + for author_or_editor in author_or_editors: disamb_name, name_choice = disambiguate_name( - author_or_editor, paper_id_full + author_or_editor, paper_id_full, people ) if name_choice != -1: author_or_editor.attrib["id"] = disamb_name - person = PersonName.from_element(author_or_editor) for name_part in author_or_editor: name_part.text = correct_caps(name_part.text) meta_node.append(author_or_editor) author_or_editor.tag = "editor" - - # Here, we grab the publisher from the meta file, in case it's not in the - # 
-                # frontmatter paper. We don't handle the situation where it's in neither!
-                publisher_node = paper_node.find("publisher")
-                if publisher_node is None:
-                    publisher_node = make_simple_element("publisher", meta["publisher"])
-                meta_node.append(publisher_node)
-
-                # Look for the address in the bib file, then the meta file
-                address_node = paper_node.find("address")
-                if address_node is None:
-                    address_node = make_simple_element("address", meta["location"])
-                meta_node.append(address_node)
-
-                meta_node.append(paper_node.find("month"))
-                meta_node.append(paper_node.find("year"))
-                if book_dest_path is not None:
-                    make_simple_element(
-                        "url",
-                        text=f"{collection_id}-{volume_name}",
-                        attrib={"hash": compute_hash_from_file(book_dest_path)},
-                        parent=meta_node,
-                    )
-
-                # Add the venue tag
-                make_simple_element("venue", venue_name, parent=meta_node)
-
-                # modify frontmatter tag
-                paper_node.tag = "frontmatter"
-                del paper_node.attrib["id"]
-            else:
-                # remove unneeded fields
-                for child in paper_node:
-                    if child.tag in [
-                        "editor",
-                        "address",
-                        "booktitle",
-                        "publisher",
-                        "year",
-                        "month",
-                    ]:
-                        paper_node.remove(child)
-
-                url = paper_node.find("./url")
-                if url is not None:
-                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])
-
-                for path, type_ in paper["attachments"]:
-                    make_simple_element(
-                        "attachment",
-                        text=os.path.basename(path),
-                        attrib={
-                            "type": type_,
-                            "hash": compute_hash_from_file(path),
-                        },
-                        parent=paper_node,
-                    )
+            else:
+                editors = meta.get("chairs", [])
+                if not editors:
+                    print("FATAL: 'chairs' missing in meta file", file=sys.stderr)
+                    sys.exit(2)
+                for editor in editors:
+                    name_node = make_simple_element("editor", parent=meta_node)
+                    first, _, last = editor.rpartition(" ")
+                    make_simple_element("first", first, parent=name_node)
+                    make_simple_element("last", last, parent=name_node)
+
+            # publisher, address, month, and year come from the front matter
+            # when available, otherwise from the meta file
+            for field, meta_key in (
+                ("publisher", "publisher"),
+                ("address", "location"),
+                ("month", "month"),
+                ("year", "year"),
+            ):
+                if meta.get(meta_key) is None:
+                    print(f"FATAL: '{meta_key}' missing in meta file", file=sys.stderr)
+                    sys.exit(2)
+                field_node = (
+                    paper_node.find(field)
+                    if (
+                        paper_node.attrib["id"] == "0"
+                        and paper_node.find(field) is not None
+                    )
+                    else make_simple_element(field, meta[meta_key])
+                )
+                meta_node.append(field_node)
+
+            if proceedings_pdf_path is not None:
+                make_simple_element(
+                    "url",
+                    text=f"{collection_id}-{volume_name}",
+                    attrib={"hash": compute_hash_from_file(proceedings_pdf_path)},
+                    parent=meta_node,
+                )
+
+            # Add the venue tag
+            make_simple_element("venue", venue_name, parent=meta_node)
+
+            # Front matter block
+            if paper_node.attrib["id"] == "0":
+                # turn the paper node into the <frontmatter> element
+                paper_node.tag = "frontmatter"
+                del paper_node.attrib["id"]
+            else:
+                # no front matter PDF: emit an empty <frontmatter/> element
+                make_simple_element("frontmatter", parent=volume_node)
+                meta_node_created = True
+
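+        # Regular papers are cleaned up below; the front matter node was
+        # already folded into the volume <meta> block above.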
if paper_node.attrib["id"] != "0": + print(f'onto removing stuff for paper {paper_node.attrib["id"]}') + for child in paper_node: + if child.tag in [ + "editor", + "address", + "booktitle", + "publisher", + "year", + "month", + ]: + paper_node.remove(child) + + url = paper_node.find("./url") + if url is not None: + url.attrib["hash"] = compute_hash_from_file(paper["pdf"]) + + for path, type_ in paper["attachments"]: + make_simple_element( + "attachment", + text=os.path.basename(path), + attrib={ + "type": type_, + "hash": compute_hash_from_file(path), + }, + parent=paper_node, + ) + + if len(paper_node) > 0: + volume_node.append(paper_node) + + # Normalize + for oldnode in paper_node: + normalize(oldnode, informat="latex") + + # Adjust the language tag + language_node = paper_node.find("./language") + if language_node is not None: + try: + lang = iso639.languages.get(name=language_node.text) + except KeyError: + raise Exception(f"Can't find language '{language_node.text}'") + language_node.text = lang.part3 + + # Fix author names + for name_node in chain( + paper_node.findall("./author"), paper_node.findall("./editor") + ): + disamb_name, name_choice = disambiguate_name(name_node, paper_id_full, people) + if name_choice != -1: + name_node.attrib["id"] = disamb_name + person = PersonName.from_element(name_node) + for name_part in name_node: + name_part.text = correct_caps(name_part.text) + + # Other data from the meta file + if "isbn" in meta: + make_simple_element("isbn", meta["isbn"], parent=meta_node) + + indent(root_node) + tree = etree.ElementTree(root_node) + tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True) + + +def correct_caps(name: str) -> str: + """ + Many people submit their names in "ALL CAPS" or "all lowercase". + Correct this with heuristics. 
+ """ + if name.islower() or name.isupper(): + # capitalize all parts + corrected = " ".join(list(map(lambda x: x.capitalize(), name.split()))) + print( + f"-> Correcting capitalization of '{name}' to '{corrected}'", + file=sys.stderr, + ) + name = corrected + + return name + + +def disambiguate_name(node, anth_id, people): + name = PersonName.from_element(node) + ids = people.get_ids(name) + choice = -1 + if len(ids) > 1: + while choice < 0 or choice >= len(ids): + print( + f"({anth_id}): ambiguous author {name}; Please choose from the following:" + ) + for i, id_ in enumerate(ids): + print(f"[{i}] {id_} ({people.get_comment(id_)})") + choice = int(input("--> ")) + + return ids[choice], choice + - if len(paper_node) > 0: - volume_node.append(paper_node) - - # Normalize - for oldnode in paper_node: - normalize(oldnode, informat="latex") - - # Adjust the language tag - language_node = paper_node.find("./language") - if language_node is not None: - try: - lang = iso639.languages.get(name=language_node.text) - except KeyError: - raise Exception(f"Can't find language '{language_node.text}'") - language_node.text = lang.part3 - - # Fix author names - for name_node in chain( - paper_node.findall("./author"), paper_node.findall("./editor") - ): - disamb_name, name_choice = disambiguate_name(name_node, paper_id_full) - if name_choice != -1: - name_node.attrib["id"] = disamb_name - person = PersonName.from_element(name_node) - for name_part in name_node: - name_part.text = correct_caps(name_part.text) - - # Other data from the meta file - if "isbn" in meta: - make_simple_element("isbn", meta["isbn"], parent=meta_node) - - indent(root_node) - tree = etree.ElementTree(root_node) - tree.write( - collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True +def main(args): + anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") + venue_index = VenueIndex(srcdir=anthology_datadir) + venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()] + + people = AnthologyIndex(srcdir=anthology_datadir) + people.bibkeys = load_bibkeys(anthology_datadir) + + # Build list of volumes, confirm uniqueness + unseen_venues, volumes = build_volumes(args.proceedings, venue_index, venue_keys) + + create_venues(unseen_venues, venue_index, anthology_datadir) + + # Copy over the PDFs and attachments and create xml + for _, meta in volumes.items(): + volume, prooceedings_dst_dir = copy_pdf_and_attachment(meta, args.pdfs_dir) + create_xml( + volume, + meta, + prooceedings_dst_dir, + args.anthology_dir, + args.ingest_date, + people, )