From badf527a0ac43a499f040087eaef294b9c29dec9 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu
Date: Mon, 26 Jun 2023 18:24:38 +0300
Subject: [PATCH] Change parse methods to support StringIO (#385)

* Change parse methods to support StringIO
* Fix flake8, black and mypy
* add test
* Update parsers.py
* Update util.py
* Handle comments correctly
* Fix issues
* Tox fixes
* Change test paths
* Update parsers.py
* Update util.py
* making get separator method private
* fix comment
* Fixing some more tests
* Update util.py
* Update util.py
---
 src/sssom/parsers.py     | 156 +++++++++++++++++++++++++++++++--------
 src/sssom/util.py        |  55 ++++++++------
 src/sssom/writers.py     |   4 +
 tests/test_cli.py        |   3 +-
 tests/test_conversion.py |  16 ++--
 tests/test_parsers.py    |  21 +++++-
 6 files changed, 188 insertions(+), 67 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 2bf3a16f..d31da86c 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -1,5 +1,6 @@
 """SSSOM parsers."""
 
+import io
 import json
 import logging
 import re
@@ -7,19 +8,17 @@
 from collections import Counter
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union, cast
-from urllib.request import urlopen
 from xml.dom import Node, minidom
 from xml.dom.minidom import Document
 
 import numpy as np
 import pandas as pd
-import validators
+import requests
 import yaml
 from deprecation import deprecated
 from linkml_runtime.loaders.json_loader import JSONLoader
+from pandas.errors import EmptyDataError
 from rdflib import Graph, URIRef
-
-# from .sssom_datamodel import Mapping, MappingSet
 from sssom_schema import Mapping, MappingSet
 
 from sssom.constants import (
@@ -70,7 +69,6 @@
     get_file_extension,
     is_multivalued_slot,
     raise_for_bad_path,
-    read_pandas,
     to_mapping_set_dataframe,
 )
 
@@ -86,10 +84,9 @@ def read_sssom_table(
     file_path: Union[str, Path],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs,
 ) -> MappingSetDataFrame:
     """DEPRECATE."""
-    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
+    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)
 
 
 @deprecated(
@@ -134,22 +131,130 @@ def read_sssom_json(
 # Parsers (from file)
 
 
+def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
+    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
+
+    :param input: A string representing a URL, a filepath, or file contents,
+        or a Path object representing a filepath.
+    :return: A StringIO object containing the input data.
+    """
+    # If the input is already a StringIO, return it
+    if isinstance(input, io.StringIO):
+        return input
+    elif isinstance(input, Path):
+        input = str(input)
+
+    if isinstance(input, str):
+        if input.startswith("http://") or input.startswith("https://"):
+            # It's a URL
+            data = requests.get(input, timeout=30).content
+            return io.StringIO(data.decode("utf-8"))
+        elif "\n" in input or "\r" in input:
+            # It's string data
+            return io.StringIO(input)
+        else:
+            # It's a local file path
+            with open(input, "r") as file:
+                file_content = file.read()
+            return io.StringIO(file_content)
+
+    raise IOError(f"Could not determine the type of input {input}")
+
+
+def _separate_metadata_and_table_from_stream(s: io.StringIO):
+    s.seek(0)
+
+    # Create new StringIO objects for the metadata and table components
+    table_component = io.StringIO()
+    metadata_component = io.StringIO()
+
+    header_section = True
+
+    # Lines starting with '#' go to the metadata component, all other lines to the table component
+    for line in s:
+        if not line.startswith("#"):
+            table_component.write(line)
+            if header_section:
+                header_section = False
+        elif header_section:
+            metadata_component.write(line)
+        else:
+            logging.info(
+                f"Line {line} starts with a hash symbol, but the header section has already passed. "
+                f"This line is skipped."
+            )
+
+    # Reset the cursors to the start of the new StringIO objects
+    table_component.seek(0)
+    metadata_component.seek(0)
+    return table_component, metadata_component
+
+
+def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
+    """Read a tabular data file by wrapping :func:`pd.read_csv` to handle comment lines correctly.
+
+    :param input: The stream to read.
+    :param sep: File separator for pandas
+    :return: A pandas dataframe and the metadata parsed from the comment lines
+    """
+    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)
+
+    try:
+        df = pd.read_csv(table_stream, sep=sep)
+        df.fillna("", inplace=True)
+    except EmptyDataError as e:
+        logging.warning(f"Seems like the dataframe is empty: {e}")
+        df = pd.DataFrame(
+            columns=[
+                SUBJECT_ID,
+                SUBJECT_LABEL,
+                PREDICATE_ID,
+                OBJECT_ID,
+                MAPPING_JUSTIFICATION,
+            ]
+        )
+
+    if isinstance(df, pd.DataFrame):
+        sssom_metadata = _read_metadata_from_table(metadata_stream)
+        return df, sssom_metadata
+
+    return None, None
+
+
+def _get_seperator_symbol_from_file_path(file):
+    r"""
+    Take a filepath as input and return the separator symbol used, for example, by pandas.
+
+    :param file: the file path
+    :return: the separator symbol as a string, e.g. '\t'
+    """
+    if isinstance(file, Path) or isinstance(file, str):
+        extension = get_file_extension(file)
+        if extension == "tsv":
+            return "\t"
+        elif extension == "csv":
+            return ","
+        logging.warning(f"Could not guess file extension for {file}")
+    return None
+
+
 def parse_sssom_table(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path, TextIO],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs
-    # mapping_predicates: Optional[List[str]] = None,
+    **kwargs,
 ) -> MappingSetDataFrame:
     """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
-    raise_for_bad_path(file_path)
-    df = read_pandas(file_path)
+    if isinstance(file_path, Path) or isinstance(file_path, str):
+        raise_for_bad_path(file_path)
+    stream: io.StringIO = _open_input(file_path)
+    sep_new = _get_seperator_symbol_from_file_path(file_path)
+    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
     # if mapping_predicates:
     #     # Filter rows based on presence of predicate_id list provided.
     #     df = df[df["predicate_id"].isin(mapping_predicates)]
 
     # If SSSOM external metadata is provided, merge it with the internal metadata
-    sssom_metadata = _read_metadata_from_table(file_path)
 
     if sssom_metadata:
         if meta:
@@ -733,24 +838,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
     return mapping
 
 
-def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
-    if isinstance(path, Path) or not validators.url(path):
-        with open(path) as file:
-            yamlstr = ""
-            for line in file:
-                if line.startswith("#"):
-                    yamlstr += re.sub("^#", "", line)
-                else:
-                    break
-    else:
-        response = urlopen(path)
-        yamlstr = ""
-        for lin in response:
-            line = lin.decode("utf-8")
-            if line.startswith("#"):
-                yamlstr += re.sub("^#", "", line)
-            else:
-                break
+def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
+    yamlstr = ""
+    for line in stream:
+        if line.startswith("#"):
+            yamlstr += re.sub("^#", "", line)
+        else:
+            break
 
     if yamlstr:
         meta = yaml.safe_load(yamlstr)
diff --git a/src/sssom/util.py b/src/sssom/util.py
index f066c4b2..25f5cb5c 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -9,6 +9,7 @@
 from functools import reduce
 from io import StringIO
 from pathlib import Path
+from string import punctuation
 from typing import (
     Any,
     ChainMap,
@@ -24,6 +25,7 @@
 )
 from urllib.request import urlopen
 
+import deprecation
 import numpy as np
 import pandas as pd
 import validators
@@ -852,23 +854,28 @@ def get_file_extension(file: Union[str, Path, TextIO]) -> str:
     """Get file extension.
 
     :param file: File path
-    :raises Exception: Cannot determine extension exception
-    :return: format of the file passed
+    :return: format of the file passed, defaults to tsv
     """
-    if isinstance(file, str):
+    if isinstance(file, Path):
+        if file.suffix:
+            return file.suffix.strip(punctuation)
+        else:
+            logging.warning(
+                f"Cannot guess format from {file}, despite appearing to be a Path-like object."
+            )
+    elif isinstance(file, str):
         filename = file
-    elif isinstance(file, Path):
-        return file.suffix
-    else:
-        filename = file.name
-    parts = filename.split(".")
-    if len(parts) > 0:
-        f_format = parts[-1]
-        return f_format
-    else:
-        raise Exception(f"Cannot guess format from {filename}")
+        parts = filename.split(".")
+        if len(parts) > 0:
+            f_format = parts[-1]
+            return f_format.strip(punctuation)
+        else:
+            logging.warning(f"Cannot guess format from {filename}")
+    logging.info("Cannot guess format extension for this file, assuming TSV.")
+    return "tsv"
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_csv(
     filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
 ) -> pd.DataFrame:
@@ -923,6 +930,7 @@ def read_metadata(filename: str) -> Metadata:
     return Metadata(prefix_map=prefix_map, metadata=metadata)
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
@@ -931,15 +939,15 @@ def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd
 
     :return: A pandas dataframe
     """
     if sep is None:
-        extension = get_file_extension(file)
-        if extension == "tsv":
-            sep = "\t"
-        elif extension == "csv":
-            sep = ","
-        else:
-            sep = "\t"
-            logging.warning("Cannot automatically determine table format, trying tsv.")
-    df = read_csv(file, comment="#", sep=sep).fillna("")
+        if isinstance(file, Path) or isinstance(file, str):
+            extension = get_file_extension(file)
+            if extension == "tsv":
+                sep = "\t"
+            elif extension == "csv":
+                sep = ","
+        if sep is None:
+            logging.warning(f"Could not guess file extension for {file}")
+    df = read_csv(file, comment="#", sep=sep).fillna("")
 
     return sort_df_rows_columns(df)
 
@@ -1188,7 +1195,7 @@ def filter_prefixes(
     return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
 
 
-# TODO this is not used anywhere
+@deprecation.deprecated(details="This is no longer used and will be removed from the public API.")
 def guess_file_format(filename: Union[str, TextIO]) -> str:
     """Get file format.
 
@@ -1259,6 +1266,8 @@ def raise_for_bad_path(file_path: Union[str, Path]) -> None:
     if isinstance(file_path, Path):
         if not file_path.is_file():
             raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
+    elif not isinstance(file_path, str):
+        logging.info("Path provided to raise_for_bad_path() is neither a Path nor a str-like object.")
     elif not validators.url(file_path) and not os.path.exists(file_path):
         raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index c3acbe1f..a29b6ee2 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 import yaml
+from deprecation import deprecated
 from jsonasobj2 import JsonObj
 from linkml_runtime.dumpers import JSONDumper, rdflib_dumper
 from linkml_runtime.utils.schemaview import SchemaView
@@ -161,6 +162,9 @@ def write_ontoportal_json(
 # Converters convert a mappingsetdataframe to an object of the supportes types (json, pandas dataframe)
 
 
+@deprecated(
+    details="Use df variable of 'MappingSetDataFrame' instead (msdf.df).",
+)
 def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame:
     """Convert a mapping set dataframe to a dataframe."""
     data = []
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 8250166c..a0dab750 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -27,6 +27,7 @@
     split,
     validate,
 )
+from tests.constants import data_dir
 from tests.test_data import (
     RECON_YAML,
     SSSOMTestCase,
@@ -35,8 +36,6 @@
     test_out_dir,
 )
 
-from .constants import data_dir
-
 
 class SSSOMCLITestSuite(unittest.TestCase):
     """A test case for the dynamic CLI tests."""
diff --git a/tests/test_conversion.py b/tests/test_conversion.py
index 59dd8850..6ca1825b 100644
--- a/tests/test_conversion.py
+++ b/tests/test_conversion.py
@@ -9,11 +9,10 @@
 import yaml
 from rdflib import Graph
 
-from sssom.parsers import get_parsing_function, to_mapping_set_document
+from sssom.parsers import get_parsing_function, parse_sssom_table, to_mapping_set_document
 from sssom.sssom_document import MappingSetDocument
-from sssom.util import read_pandas, to_mapping_set_dataframe
+from sssom.util import to_mapping_set_dataframe
 from sssom.writers import (
-    to_dataframe,
     to_json,
     to_ontoportal_json,
     to_owl_graph,
@@ -23,9 +22,8 @@
     write_rdf,
     write_table,
 )
-
-from .constants import data_dir
-from .test_data import SSSOMTestCase, get_all_test_cases
+from tests.constants import data_dir
+from tests.test_data import SSSOMTestCase, get_all_test_cases
 
 
 class SSSOMReadWriteTestSuite(unittest.TestCase):
@@ -139,7 +137,7 @@ def _test_graph_size(self, graph: Graph, queries: list, file: str):
 
     def _test_to_dataframe(self, mdoc, test):
         msdf = to_mapping_set_dataframe(mdoc)
-        df = to_dataframe(msdf)
+        df = msdf.df
         self.assertEqual(
             len(df),
             test.ct_data_frame_rows,
@@ -147,7 +145,7 @@ def _test_to_dataframe(self, mdoc, test):
         )
         df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
         # data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
-        data = read_pandas(test.get_out_file("roundtrip.tsv"))
+        data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
         self.assertEqual(
             len(data),
             test.ct_data_frame_rows,
@@ -157,7 +155,7 @@ def _test_to_dataframe(self, mdoc, test):
         with open(path, "w") as file:
             write_table(msdf, file)
         # self._test_files_equal(test.get_out_file("tsv"), test.get_validate_file("tsv"))
-        df = read_pandas(path)
+        df = parse_sssom_table(path).df
         self.assertEqual(
             len(df),
             test.ct_data_frame_rows,
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 11538eb7..ffb58e91 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -1,5 +1,6 @@
 """Tests for parsers."""
 
+import io
 import json
 import math
 import os
@@ -63,7 +64,7 @@ def setUp(self) -> None:
         self.alignmentxml = minidom.parse(self.alignmentxml_file)
         self.metadata = get_default_metadata()
 
-    def test_parse_sssom_dataframe(self):
+    def test_parse_sssom_dataframe_from_file(self):
         """Test parsing a TSV."""
         input_path = f"{test_data_dir}/basic.tsv"
         msdf = parse_sssom_table(input_path)
@@ -76,7 +77,23 @@
             f"{input_path} has the wrong number of mappings.",
         )
 
-    def test_parse_sssom_dataframe_url(self):
+    def test_parse_sssom_dataframe_from_stringio(self):
+        """Test parsing a TSV from a StringIO object."""
+        input_path = f"{test_data_dir}/basic.tsv"
+        with open(input_path, "r") as file:
+            input_string = file.read()
+        stream = io.StringIO(input_string)
+        msdf = parse_sssom_table(stream)
+        output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_stream.tsv")
+        with open(output_path, "w") as file:
+            write_table(msdf, file)
+        self.assertEqual(
+            len(msdf.df),
+            141,
+            f"{input_path} has the wrong number of mappings.",
+        )
+
+    def test_parse_sssom_dataframe_from_url(self):
         """Test parsing a TSV from a URL."""
         msdf = parse_sssom_table(self.df_url)
         output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_url.tsv")
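
---

Usage sketch (editor's illustration, not part of the patch): with this change, parse_sssom_table accepts a file path, a URL, raw TSV content, or an in-memory stream, and the deprecated read_pandas/to_dataframe helpers are replaced by reading the df attribute of the returned MappingSetDataFrame. A minimal example, assuming a build of sssom-py that includes this patch and a hypothetical local file mappings.tsv:

    import io

    from sssom.parsers import parse_sssom_table

    # From a local file path or a URL (both are plain strings)
    msdf = parse_sssom_table("mappings.tsv")

    # From an in-memory stream, e.g. after reading the contents yourself
    with open("mappings.tsv", "r") as f:
        stream = io.StringIO(f.read())
    msdf = parse_sssom_table(stream)

    # The mappings are exposed directly as a pandas DataFrame
    print(len(msdf.df))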