diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 2bf3a16f..d31da86c 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -1,5 +1,6 @@
 """SSSOM parsers."""
 
+import io
 import json
 import logging
 import re
@@ -7,19 +8,17 @@
 from collections import Counter
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union, cast
-from urllib.request import urlopen
 from xml.dom import Node, minidom
 from xml.dom.minidom import Document
 
 import numpy as np
 import pandas as pd
-import validators
+import requests
 import yaml
 from deprecation import deprecated
 from linkml_runtime.loaders.json_loader import JSONLoader
+from pandas.errors import EmptyDataError
 from rdflib import Graph, URIRef
-
-# from .sssom_datamodel import Mapping, MappingSet
 from sssom_schema import Mapping, MappingSet
 
 from sssom.constants import (
@@ -70,7 +69,6 @@
     get_file_extension,
     is_multivalued_slot,
     raise_for_bad_path,
-    read_pandas,
     to_mapping_set_dataframe,
 )
 
@@ -86,10 +84,9 @@ def read_sssom_table(
     file_path: Union[str, Path],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs,
 ) -> MappingSetDataFrame:
     """DEPRECATE."""
-    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
+    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)
 
 
 @deprecated(
@@ -134,22 +131,131 @@ def read_sssom_json(
 
 # Parsers (from file)
 
 
+def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
+    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
+
+    :param input: A string representing a URL, a filepath, or file contents,
+        or a Path object representing a filepath.
+    :return: A StringIO object containing the input data.
+    """
+    # If the input already is a StringIO, return it
+    if isinstance(input, io.StringIO):
+        return input
+    elif isinstance(input, Path):
+        input = str(input)
+
+    if isinstance(input, str):
+        if input.startswith("http://") or input.startswith("https://"):
+            # It's a URL
+            data = requests.get(input, timeout=30).content
+            return io.StringIO(data.decode("utf-8"))
+        elif "\n" in input or "\r" in input:
+            # It's string data
+            return io.StringIO(input)
+        else:
+            # It's a local file path
+            with open(input, "r") as file:
+                file_content = file.read()
+            return io.StringIO(file_content)
+
+    raise IOError(f"Could not determine the type of input {input}")
+
+
+def _separate_metadata_and_table_from_stream(s: io.StringIO):
+    """Separate the comment-embedded metadata from the table body of an SSSOM stream."""
+    s.seek(0)
+
+    # Create new StringIO objects for the filtered data
+    table_component = io.StringIO()
+    metadata_component = io.StringIO()
+
+    header_section = True
+
+    # Filter out lines starting with '#'
+    for line in s:
+        if not line.startswith("#"):
+            table_component.write(line)
+            if header_section:
+                header_section = False
+        elif header_section:
+            metadata_component.write(line)
+        else:
+            logging.info(
+                f"Line {line} starts with a hash symbol, but the header section has already passed. "
+                f"This line is skipped."
+            )
+
+    # Reset the cursors to the start of the new StringIO objects
+    table_component.seek(0)
+    metadata_component.seek(0)
+    return table_component, metadata_component
+
+
+def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
+    """Read a tabular data stream by wrapping :func:`pd.read_csv` to handle comment lines correctly.
+
+    :param input: The stream to read
+    :param sep: File separator for pandas
+    :return: A pandas dataframe and the metadata parsed from the commented header
+    """
+    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)
+
+    try:
+        df = pd.read_csv(table_stream, sep=sep)
+        df.fillna("", inplace=True)
+    except EmptyDataError as e:
+        logging.warning(f"Seems like the dataframe is empty: {e}")
+        df = pd.DataFrame(
+            columns=[
+                SUBJECT_ID,
+                SUBJECT_LABEL,
+                PREDICATE_ID,
+                OBJECT_ID,
+                MAPPING_JUSTIFICATION,
+            ]
+        )
+
+    if isinstance(df, pd.DataFrame):
+        sssom_metadata = _read_metadata_from_table(metadata_stream)
+        return df, sssom_metadata
+
+    return None, None
+
+
+def _get_separator_symbol_from_file_path(file):
+    r"""
+    Take a file path as input and return the separator symbol used, for example, by pandas.
+
+    :param file: the file path
+    :return: the separator symbol as a string, e.g. '\t'
+    """
+    if isinstance(file, Path) or isinstance(file, str):
+        extension = get_file_extension(file)
+        if extension == "tsv":
+            return "\t"
+        elif extension == "csv":
+            return ","
+    logging.warning(f"Could not guess file extension for {file}")
+    return None
+
+
 def parse_sssom_table(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path, TextIO],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs
-    # mapping_predicates: Optional[List[str]] = None,
+    **kwargs,
 ) -> MappingSetDataFrame:
     """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
-    raise_for_bad_path(file_path)
-    df = read_pandas(file_path)
+    if isinstance(file_path, Path) or isinstance(file_path, str):
+        raise_for_bad_path(file_path)
+    stream: io.StringIO = _open_input(file_path)
+    sep_new = _get_separator_symbol_from_file_path(file_path)
+    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
     # if mapping_predicates:
     #     # Filter rows based on presence of predicate_id list provided.
     #     df = df[df["predicate_id"].isin(mapping_predicates)]
     # If SSSOM external metadata is provided, merge it with the internal metadata
-    sssom_metadata = _read_metadata_from_table(file_path)
 
     if sssom_metadata:
         if meta:
@@ -733,24 +839,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
     return mapping
 
 
-def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
-    if isinstance(path, Path) or not validators.url(path):
-        with open(path) as file:
-            yamlstr = ""
-            for line in file:
-                if line.startswith("#"):
-                    yamlstr += re.sub("^#", "", line)
-                else:
-                    break
-    else:
-        response = urlopen(path)
-        yamlstr = ""
-        for lin in response:
-            line = lin.decode("utf-8")
-            if line.startswith("#"):
-                yamlstr += re.sub("^#", "", line)
-            else:
-                break
+def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
+    yamlstr = ""
+    for line in stream:
+        if line.startswith("#"):
+            yamlstr += re.sub("^#", "", line)
+        else:
+            break
 
     if yamlstr:
         meta = yaml.safe_load(yamlstr)
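With the stream-based helpers in place, a mapping set can be parsed straight from an in-memory string, and the `#`-commented metadata header is picked up along the way. A minimal sketch (prefixes, CURIEs, and values are illustrative only, not taken from this PR):

```python
import io

from sssom.parsers import parse_sssom_table

tsv = (
    "#curie_map:\n"
    "#  HP: http://purl.obolibrary.org/obo/HP_\n"
    "#  MP: http://purl.obolibrary.org/obo/MP_\n"
    "subject_id\tpredicate_id\tobject_id\tmapping_justification\n"
    "HP:0000001\tskos:exactMatch\tMP:0000001\tsemapv:ManualMappingCuration\n"
)

# _open_input() passes a StringIO through unchanged, so no temporary file is needed.
msdf = parse_sssom_table(io.StringIO(tsv))
print(len(msdf.df))  # expected: 1
```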
diff --git a/src/sssom/util.py b/src/sssom/util.py
index f066c4b2..25f5cb5c 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -9,6 +9,7 @@
 from functools import reduce
 from io import StringIO
 from pathlib import Path
+from string import punctuation
 from typing import (
     Any,
     ChainMap,
@@ -24,6 +25,7 @@
 )
 from urllib.request import urlopen
 
+import deprecation
 import numpy as np
 import pandas as pd
 import validators
@@ -852,23 +854,28 @@ def get_file_extension(file: Union[str, Path, TextIO]) -> str:
     """Get file extension.
 
     :param file: File path
-    :raises Exception: Cannot determine extension exception
-    :return: format of the file passed
+    :return: format of the file passed, default tsv
     """
-    if isinstance(file, str):
+    if isinstance(file, Path):
+        if file.suffix:
+            return file.suffix.strip(punctuation)
+        else:
+            logging.warning(
+                f"Cannot guess format from {file}, despite appearing to be a Path-like object."
+            )
+    elif isinstance(file, str):
         filename = file
-    elif isinstance(file, Path):
-        return file.suffix
-    else:
-        filename = file.name
-    parts = filename.split(".")
-    if len(parts) > 0:
-        f_format = parts[-1]
-        return f_format
-    else:
-        raise Exception(f"Cannot guess format from {filename}")
+        parts = filename.split(".")
+        if len(parts) > 1:
+            f_format = parts[-1]
+            return f_format.strip(punctuation)
+        else:
+            logging.warning(f"Cannot guess format from {filename}")
+    logging.info("Cannot guess format extension for this file, assuming TSV.")
+    return "tsv"
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_csv(
     filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
 ) -> pd.DataFrame:
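Under the reworked `get_file_extension`, unknown or suffix-less names degrade to the TSV default instead of raising. Roughly (assertions reflect the code above; file names hypothetical):

```python
from pathlib import Path

from sssom.util import get_file_extension

assert get_file_extension(Path("mappings.sssom.tsv")) == "tsv"  # Path input: uses .suffix
assert get_file_extension("mappings.csv") == "csv"              # str input: splits on "."
assert get_file_extension(Path("README")) == "tsv"              # no suffix: warns, defaults to tsv
```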
@@ -923,6 +930,7 @@ def read_metadata(filename: str) -> Metadata:
     return Metadata(prefix_map=prefix_map, metadata=metadata)
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
@@ -931,15 +939,15 @@ def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
     :return: A pandas dataframe
     """
     if sep is None:
-        extension = get_file_extension(file)
-        if extension == "tsv":
-            sep = "\t"
-        elif extension == "csv":
-            sep = ","
-        else:
-            sep = "\t"
-            logging.warning("Cannot automatically determine table format, trying tsv.")
-    df = read_csv(file, comment="#", sep=sep).fillna("")
+        if isinstance(file, Path) or isinstance(file, str):
+            extension = get_file_extension(file)
+            if extension == "tsv":
+                sep = "\t"
+            elif extension == "csv":
+                sep = ","
+            else:
+                logging.warning(f"Could not guess file extension for {file}")
+    df = read_csv(file, comment="#", sep=sep).fillna("")
     return sort_df_rows_columns(df)
 
 
@@ -1188,7 +1196,7 @@ def filter_prefixes(
     return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
 
 
-# TODO this is not used anywhere
+@deprecation.deprecated(details="This is no longer used and will be removed from the public API.")
 def guess_file_format(filename: Union[str, TextIO]) -> str:
     """Get file format.
 
@@ -1259,6 +1267,8 @@ def raise_for_bad_path(file_path: Union[str, Path]) -> None:
     if isinstance(file_path, Path):
         if not file_path.is_file():
             raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
+    elif not isinstance(file_path, str):
+        logging.info("Path provided to raise_for_bad_path() is neither a Path nor a str-like object.")
     elif not validators.url(file_path) and not os.path.exists(file_path):
         raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
 
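Because `read_csv` and `read_pandas` now carry `@deprecation.deprecated`, calling them should surface a `DeprecationWarning` (the `deprecation` package warns with a `DeprecatedWarning` subclass of it). A quick sanity check, assuming that behavior:

```python
import io
import warnings

from sssom.util import read_pandas

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # read_pandas still works on a stream, but warns at call time
    df = read_pandas(io.StringIO("subject_id\tobject_id\nHP:0000001\tMP:0000001\n"))

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```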
diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index c3acbe1f..a29b6ee2 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 import yaml
+from deprecation import deprecated
 from jsonasobj2 import JsonObj
 from linkml_runtime.dumpers import JSONDumper, rdflib_dumper
 from linkml_runtime.utils.schemaview import SchemaView
@@ -161,6 +162,9 @@ def write_ontoportal_json(
 # Converters convert a mappingsetdataframe to an object of the supportes types (json, pandas dataframe)
 
 
+@deprecated(
+    details="Use the df attribute of 'MappingSetDataFrame' instead (msdf.df).",
+)
 def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame:
     """Convert a mapping set dataframe to a dataframe."""
     data = []
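The deprecation note points call sites at the frame the `MappingSetDataFrame` already carries, so no conversion call is needed. A sketch (file name hypothetical):

```python
from sssom.parsers import parse_sssom_table

msdf = parse_sssom_table("mappings.sssom.tsv")

# Previously: from sssom.writers import to_dataframe; df = to_dataframe(msdf)
df = msdf.df
```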
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 8250166c..a0dab750 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -27,6 +27,7 @@
     split,
     validate,
 )
+from tests.constants import data_dir
 from tests.test_data import (
     RECON_YAML,
     SSSOMTestCase,
@@ -35,8 +36,6 @@
     test_out_dir,
 )
 
-from .constants import data_dir
-
 
 class SSSOMCLITestSuite(unittest.TestCase):
     """A test case for the dynamic CLI tests."""
diff --git a/tests/test_conversion.py b/tests/test_conversion.py
index 59dd8850..6ca1825b 100644
--- a/tests/test_conversion.py
+++ b/tests/test_conversion.py
@@ -9,11 +9,10 @@
 import yaml
 from rdflib import Graph
 
-from sssom.parsers import get_parsing_function, to_mapping_set_document
+from sssom.parsers import get_parsing_function, parse_sssom_table, to_mapping_set_document
 from sssom.sssom_document import MappingSetDocument
-from sssom.util import read_pandas, to_mapping_set_dataframe
+from sssom.util import to_mapping_set_dataframe
 from sssom.writers import (
-    to_dataframe,
     to_json,
     to_ontoportal_json,
     to_owl_graph,
@@ -23,9 +22,8 @@
     write_rdf,
     write_table,
 )
-
-from .constants import data_dir
-from .test_data import SSSOMTestCase, get_all_test_cases
+from tests.constants import data_dir
+from tests.test_data import SSSOMTestCase, get_all_test_cases
 
 
 class SSSOMReadWriteTestSuite(unittest.TestCase):
@@ -139,7 +137,7 @@ def _test_graph_size(self, graph: Graph, queries: list, file: str):
 
     def _test_to_dataframe(self, mdoc, test):
         msdf = to_mapping_set_dataframe(mdoc)
-        df = to_dataframe(msdf)
+        df = msdf.df
         self.assertEqual(
             len(df),
             test.ct_data_frame_rows,
@@ -147,7 +145,7 @@ def _test_to_dataframe(self, mdoc, test):
         )
         df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
         # data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
-        data = read_pandas(test.get_out_file("roundtrip.tsv"))
+        data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
         self.assertEqual(
             len(data),
             test.ct_data_frame_rows,
@@ -157,7 +155,7 @@ def _test_to_dataframe(self, mdoc, test):
         with open(path, "w") as file:
             write_table(msdf, file)
         # self._test_files_equal(test.get_out_file("tsv"), test.get_validate_file("tsv"))
-        df = read_pandas(path)
+        df = parse_sssom_table(path).df
         self.assertEqual(
             len(df),
             test.ct_data_frame_rows,
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 11538eb7..ffb58e91 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -1,5 +1,6 @@
 """Tests for parsers."""
 
+import io
 import json
 import math
 import os
@@ -63,7 +64,7 @@ def setUp(self) -> None:
         self.alignmentxml = minidom.parse(self.alignmentxml_file)
         self.metadata = get_default_metadata()
 
-    def test_parse_sssom_dataframe(self):
+    def test_parse_sssom_dataframe_from_file(self):
         """Test parsing a TSV."""
         input_path = f"{test_data_dir}/basic.tsv"
         msdf = parse_sssom_table(input_path)
@@ -76,7 +77,23 @@ def test_parse_sssom_dataframe(self):
             f"{input_path} has the wrong number of mappings.",
         )
 
-    def test_parse_sssom_dataframe_url(self):
+    def test_parse_sssom_dataframe_from_stringio(self):
+        """Test parsing a TSV from a StringIO stream."""
+        input_path = f"{test_data_dir}/basic.tsv"
+        with open(input_path, "r") as file:
+            input_string = file.read()
+        stream = io.StringIO(input_string)
+        msdf = parse_sssom_table(stream)
+        output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_stream.tsv")
+        with open(output_path, "w") as file:
+            write_table(msdf, file)
+        self.assertEqual(
+            len(msdf.df),
+            141,
+            f"{input_path} has the wrong number of mappings.",
+        )
+
+    def test_parse_sssom_dataframe_from_url(self):
         """Test parsing a TSV from a URL."""
         msdf = parse_sssom_table(self.df_url)
         output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_url.tsv")
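A condensed, standalone version of what the new StringIO test exercises, round-tripping through `write_table` (input path taken from the test suite, output path hypothetical):

```python
import io

from sssom.parsers import parse_sssom_table
from sssom.writers import write_table

# Read the test fixture into memory and parse it from a stream
with open("tests/data/basic.tsv", "r") as f:
    msdf = parse_sssom_table(io.StringIO(f.read()))

# Write the parsed mapping set back out as SSSOM TSV
with open("roundtrip.sssom.tsv", "w") as out:
    write_table(msdf, out)
```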