Skip to content

Commit

Permalink
Change parse methods to support StringIO (#385)
Browse files Browse the repository at this point in the history
* Change parse methods to support StringIO

* Fix flake8, black and mypy

* add test

* Update parsers.py

* Update util.py

* Handle comments correctly

* Fix issues

* Tox fixes

* Change test paths

* Update parsers.py

* Update util.py

* making get separator method private

* fix comment

* Fixing some more tests

* Update util.py

* Update util.py
  • Loading branch information
matentzn authored Jun 26, 2023
1 parent a983827 commit badf527
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 67 deletions.
156 changes: 125 additions & 31 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,24 @@
"""SSSOM parsers."""

import io
import json
import logging
import re
import typing
from collections import Counter
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union, cast
from urllib.request import urlopen
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import numpy as np
import pandas as pd
import validators
import requests
import yaml
from deprecation import deprecated
from linkml_runtime.loaders.json_loader import JSONLoader
from pandas.errors import EmptyDataError
from rdflib import Graph, URIRef

# from .sssom_datamodel import Mapping, MappingSet
from sssom_schema import Mapping, MappingSet

from sssom.constants import (
Expand Down Expand Up @@ -70,7 +69,6 @@
get_file_extension,
is_multivalued_slot,
raise_for_bad_path,
read_pandas,
to_mapping_set_dataframe,
)

Expand All @@ -86,10 +84,9 @@ def read_sssom_table(
file_path: Union[str, Path],
prefix_map: Optional[PrefixMap] = None,
meta: Optional[MetadataType] = None,
**kwargs,
) -> MappingSetDataFrame:
"""DEPRECATE."""
return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)


@deprecated(
Expand Down Expand Up @@ -134,22 +131,130 @@ def read_sssom_json(
# Parsers (from file)


def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
    """Transform a URL, a file path, raw file contents or a text stream to a StringIO object.

    :param input: A string representing a URL, a filepath, or file contents (recognised
        by the presence of a line break), a Path object representing a filepath,
        or an already opened stream.
    :return: A StringIO object containing the input data.
    :raises IOError: If the type of the input cannot be determined, or if the
        URL answers with an HTTP error status.
    """
    # If the input already is a StringIO, return it untouched
    if isinstance(input, io.StringIO):
        return input

    # Any other file-like object (e.g. a handle returned by open()) is drained
    # into a StringIO, so that every type the signature advertises is accepted.
    reader = getattr(input, "read", None)
    if callable(reader):
        content = reader()
        if isinstance(content, bytes):
            # Binary handles are decoded the same way as downloaded data below
            content = content.decode("utf-8")
        return io.StringIO(content)

    if isinstance(input, Path):
        input = str(input)

    if isinstance(input, str):
        if input.startswith(("http://", "https://")):
            # It's a URL
            response = requests.get(input, timeout=30)
            # Fail loudly instead of parsing an HTTP error page as if it were the file
            response.raise_for_status()
            return io.StringIO(response.content.decode("utf-8"))
        if "\n" in input or "\r" in input:
            # It's string data
            return io.StringIO(input)
        # It's a local file path
        with open(input, "r") as file:
            return io.StringIO(file.read())

    raise IOError(f"Could not determine the type of input {input}")


def _separate_metadata_and_table_from_stream(s: io.StringIO):
    """Split a stream into its table part and its leading ``#``-prefixed metadata part.

    :param s: The stream to split; it is rewound to its start before reading.
    :return: A tuple ``(table_component, metadata_component)`` of two new StringIO
        objects, both positioned at their start.
    """
    s.seek(0)

    metadata_component = io.StringIO()
    table_component = io.StringIO()

    # Stays True until the first line that does not start with '#' has been seen
    in_header = True

    for line in s:
        is_comment = line.startswith("#")
        if is_comment and in_header:
            # Still inside the leading comment block: this is metadata
            metadata_component.write(line)
            continue
        if is_comment:
            # Comment lines after the table has started are dropped
            logging.info(
                f"Line {line} is starting with hash symbol, but header section is already passed. "
                f"This line is skipped"
            )
            continue
        in_header = False
        table_component.write(line)

    # Rewind both streams so that callers can read them from the beginning
    for component in (table_component, metadata_component):
        component.seek(0)
    return table_component, metadata_component


def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
    """Read the table and the embedded metadata from a stream.

    The stream is split into its leading ``#``-prefixed comment block and the table.
    The table is parsed with :func:`pandas.read_csv` and missing values are replaced
    by empty strings.

    :param input: The stream holding the contents of the file to read.
    :param sep: File separator for pandas, passed through to :func:`pandas.read_csv`.
    :return: A tuple of the pandas dataframe and the metadata obtained from the
        comment block via ``_read_metadata_from_table``.
    """
    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)

    try:
        df = pd.read_csv(table_stream, sep=sep)
        df.fillna("", inplace=True)
    except EmptyDataError as e:
        # No table content at all: fall back to an empty frame with a fixed set of columns
        logging.warning(f"Seems like the dataframe is empty: {e}")
        df = pd.DataFrame(
            columns=[
                SUBJECT_ID,
                SUBJECT_LABEL,
                PREDICATE_ID,
                OBJECT_ID,
                MAPPING_JUSTIFICATION,
            ]
        )

    # Both branches above always yield a DataFrame, so no further type check is needed
    sssom_metadata = _read_metadata_from_table(metadata_stream)
    return df, sssom_metadata


def _get_seperator_symbol_from_file_path(file):
    r"""
    Derive the column separator, as used for example by pandas, from a file path.

    :param file: the file path
    :return: the separator symbol as a string, e.g. '\t', or None if the input is
        not a path or its extension is neither tsv nor csv
    """
    separator_by_extension = {"tsv": "\t", "csv": ","}
    if isinstance(file, (Path, str)):
        separator = separator_by_extension.get(get_file_extension(file))
        if separator is not None:
            return separator
    logging.warning(f"Could not guess file extension for {file}")
    return None


def parse_sssom_table(
file_path: Union[str, Path],
file_path: Union[str, Path, TextIO],
prefix_map: Optional[PrefixMap] = None,
meta: Optional[MetadataType] = None,
**kwargs
# mapping_predicates: Optional[List[str]] = None,
**kwargs,
) -> MappingSetDataFrame:
"""Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
raise_for_bad_path(file_path)
df = read_pandas(file_path)
if isinstance(file_path, Path) or isinstance(file_path, str):
raise_for_bad_path(file_path)
stream: io.StringIO = _open_input(file_path)
sep_new = _get_seperator_symbol_from_file_path(file_path)
df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
# if mapping_predicates:
# # Filter rows based on presence of predicate_id list provided.
# df = df[df["predicate_id"].isin(mapping_predicates)]

# If SSSOM external metadata is provided, merge it with the internal metadata
sssom_metadata = _read_metadata_from_table(file_path)

if sssom_metadata:
if meta:
Expand Down Expand Up @@ -733,24 +838,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
return mapping


def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
if isinstance(path, Path) or not validators.url(path):
with open(path) as file:
yamlstr = ""
for line in file:
if line.startswith("#"):
yamlstr += re.sub("^#", "", line)
else:
break
else:
response = urlopen(path)
yamlstr = ""
for lin in response:
line = lin.decode("utf-8")
if line.startswith("#"):
yamlstr += re.sub("^#", "", line)
else:
break
def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
yamlstr = ""
for line in stream:
if line.startswith("#"):
yamlstr += re.sub("^#", "", line)
else:
break

if yamlstr:
meta = yaml.safe_load(yamlstr)
Expand Down
55 changes: 32 additions & 23 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from functools import reduce
from io import StringIO
from pathlib import Path
from string import punctuation
from typing import (
Any,
ChainMap,
Expand All @@ -24,6 +25,7 @@
)
from urllib.request import urlopen

import deprecation
import numpy as np
import pandas as pd
import validators
Expand Down Expand Up @@ -852,23 +854,28 @@ def get_file_extension(file: Union[str, Path, TextIO]) -> str:
"""Get file extension.
:param file: File path
:raises Exception: Cannot determine extension exception
:return: format of the file passed
:return: format of the file passed, default tsv
"""
if isinstance(file, str):
if isinstance(file, Path):
if file.suffix:
return file.suffix.strip(punctuation)
else:
logging.warning(
f"Cannot guess format from {file}, despite appearing to be a Path-like object."
)
elif isinstance(file, str):
filename = file
elif isinstance(file, Path):
return file.suffix
else:
filename = file.name
parts = filename.split(".")
if len(parts) > 0:
f_format = parts[-1]
return f_format
else:
raise Exception(f"Cannot guess format from {filename}")
parts = filename.split(".")
if len(parts) > 0:
f_format = parts[-1]
return f_format.strip(punctuation)
else:
logging.warning(f"Cannot guess format from {filename}")
logging.info("Cannot guess format extension for this file, assuming TSV.")
return "tsv"


@deprecation.deprecated(details="Use pandas.read_csv() instead.")
def read_csv(
filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
) -> pd.DataFrame:
Expand Down Expand Up @@ -923,6 +930,7 @@ def read_metadata(filename: str) -> Metadata:
return Metadata(prefix_map=prefix_map, metadata=metadata)


@deprecation.deprecated(details="Use pandas.read_csv() instead.")
def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
"""Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
Expand All @@ -931,15 +939,14 @@ def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd
:return: A pandas dataframe
"""
if sep is None:
extension = get_file_extension(file)
if extension == "tsv":
sep = "\t"
elif extension == "csv":
sep = ","
else:
sep = "\t"
logging.warning("Cannot automatically determine table format, trying tsv.")
df = read_csv(file, comment="#", sep=sep).fillna("")
if isinstance(file, Path) or isinstance(file, str):
extension = get_file_extension(file)
if extension == "tsv":
sep = "\t"
elif extension == "csv":
sep = ","
logging.warning(f"Could not guess file extension for {file}")
df = read_csv(file, comment="#", sep=sep).fillna("")
return sort_df_rows_columns(df)


Expand Down Expand Up @@ -1188,7 +1195,7 @@ def filter_prefixes(
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)


# TODO this is not used anywhere
@deprecation.deprecated(details="This is no longer used and will be removed from the public API.")
def guess_file_format(filename: Union[str, TextIO]) -> str:
"""Get file format.
Expand Down Expand Up @@ -1259,6 +1266,8 @@ def raise_for_bad_path(file_path: Union[str, Path]) -> None:
if isinstance(file_path, Path):
if not file_path.is_file():
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
elif not isinstance(file_path, str):
logging.info("Path provided to raise_for_bad_path() is neither a Path nor str-like object.")
elif not validators.url(file_path) and not os.path.exists(file_path):
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")

Expand Down
4 changes: 4 additions & 0 deletions src/sssom/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd
import yaml
from deprecation import deprecated
from jsonasobj2 import JsonObj
from linkml_runtime.dumpers import JSONDumper, rdflib_dumper
from linkml_runtime.utils.schemaview import SchemaView
Expand Down Expand Up @@ -161,6 +162,9 @@ def write_ontoportal_json(
# Converters convert a mappingsetdataframe to an object of the supported types (json, pandas dataframe)


@deprecated(
details="Use df variable of 'MappingSetDataFrame' instead (msdf.df).",
)
def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame:
"""Convert a mapping set dataframe to a dataframe."""
data = []
Expand Down
3 changes: 1 addition & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
split,
validate,
)
from tests.constants import data_dir
from tests.test_data import (
RECON_YAML,
SSSOMTestCase,
Expand All @@ -35,8 +36,6 @@
test_out_dir,
)

from .constants import data_dir


class SSSOMCLITestSuite(unittest.TestCase):
"""A test case for the dynamic CLI tests."""
Expand Down
Loading

0 comments on commit badf527

Please sign in to comment.