Skip to content

Commit

Permalink
fix buggy XML parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
bhlieberman committed Aug 5, 2024
1 parent f4adf71 commit 1096ee1
Showing 1 changed file with 2 additions and 23 deletions.
25 changes: 2 additions & 23 deletions stages/03_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from queue import Full, Queue
import logging
from typing import Dict, List
from lxml import etree
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
Expand All @@ -14,14 +13,6 @@
biocxml_out = brick_dir / "BioCXML"


def parse_element(element):
    """Recursively convert an XML element into a nested dictionary.

    The returned dict has two entries:

    * ``element.tag`` mapped to ``element.text`` (``None`` when the element
      has no leading text), and
    * ``"children"`` mapped to a list holding the same structure for every
      direct child, in document order (an empty list for a leaf).

    NOTE(review): an element whose tag is literally ``"children"`` loses its
    text entry, because the children list is stored under that same key.
    The shape is kept as-is so existing consumers that index
    ``["children"]`` keep working.

    Args:
        element: An ElementTree-style element exposing ``.tag`` and
            ``.text`` and yielding its child elements when iterated.

    Returns:
        dict: The nested ``{tag: text, "children": [...]}`` representation.
    """
    element_dict = {
        element.tag: element.text,
    }
    # Iterating an element yields its direct children only; the recursion
    # is what walks the rest of the subtree.
    element_dict["children"] = [parse_element(child) for child in element]
    return element_dict


@dataclass(frozen=True)
class Pubtator:
logger: logging.Logger = field(default=logging.getLogger(__name__))
Expand All @@ -34,24 +25,12 @@ def create_out_dir(self) -> Path:
brick_dir.mkdir(parents=True)
return brick_dir

def create_parquet(self, file, elems) -> None:
def create_parquet(self, file, df) -> None:
name = file.with_suffix(".parquet").name
df = pd.DataFrame.from_records(elems[0]["children"])
df.to_parquet(biocxml_out / name)

def parse_xml(self, xml_file) -> List[Dict]:
with open(xml_file, "rb") as xml:
logging.info(f"opening {xml_file}")
parsed_elements = []
for event, element in etree.iterparse(xml, events=("end",)):
if element.tag == "collection":
parsed_elements.append(parse_element(element))
# Clear the element from memory
element.clear()
while element.getprevious() is not None:
del element.getparent()[0]
logging.info(f"finished parsing {xml_file}")
return parsed_elements, xml_file
return pd.read_xml(xml_file), xml_file

def take_from_xml_queue(self):
fut = self.xml_queue.get(timeout=10.0)
Expand Down

0 comments on commit 1096ee1

Please sign in to comment.