Updates to read CSV method #1512

Open
wants to merge 16 commits into master
Changes from 12 commits
41 changes: 31 additions & 10 deletions src/tlo/util.py
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
from pandas import DataFrame, DateOffset
from pandas._typing import DtypeArg

from tlo import Population, Property, Types

@@ -474,7 +475,9 @@ def convert_excel_files_to_csv(folder: Path, files: Optional[list[str]] = None,
Path(folder/excel_file_path).unlink()


def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | dict[str, DataFrame]:
def read_csv_files(folder: Path,
dtype: DtypeArg | dict[str, DtypeArg] | None = None,
files: str | int | list[str] | None = 0) -> DataFrame | dict[str, DataFrame]:
"""
A function to read CSV files in a similar way pandas reads Excel files (:py:func:`pandas.read_excel`).

@@ -484,8 +487,20 @@ def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame
:py:func:`pandas.drop`.

:param folder: Path to folder containing CSV files to read.
:param dtype: data type to apply to all columns, or a dictionary mapping column names to data types when different columns need different types
:param files: preferred csv file name(s). This is the same as sheet names in an Excel file. Note that if None (no files
selected) then all files in the containing folder will be loaded
selected) then all csv files in the folder will be read

Please take note of the following behaviours:
-----------------------------------------------
- if the files argument is left at zero (the default) and the folder contains one or more csv files,
  this method will return a dataframe of the first file. If the folder contains multiple files, it is
  best to specify file names or set the files argument to None to ensure the correct files are selected
- if the files argument is set to None and the folder contains one or more csv files, this method
  will return a dictionary of dataframes
- if the folder contains multiple files and the files argument is given a single file name, this
  method will return a dataframe. It will return a dictionary of dataframes when the files argument
  is given a list of file names

"""
all_data: dict[str, DataFrame] = {} # dataframes dictionary
@@ -498,15 +513,21 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
for _key, dataframe in dataframes_dict.items():
all_data[_key] = dataframe.drop(dataframe.filter(like='Unnamed'), axis=1) # filter and drop Unnamed columns

if files is None:
for f_name in folder.rglob("*.csv"):
all_data[f_name.stem] = pd.read_csv(f_name)

return_dict = False # a flag that will determine whether the output should be a dictionary or a DataFrame
if isinstance(files, list):
return_dict = True
elif files is None:
files = [f_name.stem for f_name in folder.rglob("*.csv")]
return_dict = True
elif isinstance(files, str):
files = [files]
else:
Collaborator

At the moment this else clause will be run whenever files is not an instance of list or str and is not None. Looking at the type hint / default value I guess this is meant to cover the case when files is an instance of an int? If so it would be better to make this elif isinstance(files, int): and add a separate else: clause that raises an error saying files is not of an expected type if none of the previous clauses have run. At the moment, for example, if a user passes a non-list iterable data structure as the files argument - for example a tuple or generator (as returned for example by Path.glob and Path.rglob) - then the value passed will be silently overwritten, which is probably not what they would expect. Failing loudly if files is of an unexpected type (and/or allowing files to be of more general iterable types by, for example, checking if it is an instance of Iterable from collections.abc instead of list) would be better here.

Collaborator

You could also avoid the duplication of the list comprehension [f_name.stem for f_name in folder.glob("*.csv")] by having something like

elif isinstance(files, int) or files is None:
    return_dict = files is None
    files = [f_name.stem for f_name in folder.glob("*.csv")]
else:
    raise TypeError(f"Value passed for files argument {files} is not one of expected types.")

Collaborator Author

The thought of combining those two conditions crossed my mind but I was lacking a smart way of setting return_dict to either True or False. Thanks for this @matt-graham

Collaborator Author

I believe using isinstance(files, collections.abc.Iterable) is not the best approach here, as it also considers strings to be iterable. If a string is included, it could alter the expected behavior of the return type when modelers specify a file name by passing a string. This would cause the method to return a dictionary instead of a DataFrame, which is likely not the intended outcome.
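
To illustrate this point, here is a minimal sketch (not part of the PR; the helper name normalise_files is hypothetical) showing that a plain string already counts as an Iterable, so it would have to be excluded explicitly:

from collections.abc import Iterable

# A str is itself an Iterable (it iterates over its characters), so a naive
# Iterable check cannot tell a single file name apart from a list of names.
isinstance("sample_data", Iterable)   # True
isinstance(["a", "b"], Iterable)      # True

# One way to treat a string as a single file name while still accepting
# other iterables would be to check for str first:
def normalise_files(files):
    if isinstance(files, str):
        return [files]                # single file name
    if isinstance(files, Iterable):
        return list(files)            # list, tuple, generator, ...
    raise TypeError(f"Unexpected type for files: {type(files)}")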

for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"))
files = [f_name.stem for f_name in folder.rglob("*.csv")]
mnjowe marked this conversation as resolved.

for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"), dtype=dtype)
# clean and return the dataframe dictionary
clean_dataframe(all_data)
# If only one file loaded return dataframe directly rather than dict
return next(iter(all_data.values())) if len(all_data) == 1 else all_data
# return a dictionary if return_dict flag is set to True else return a dataframe
return all_data if return_dict else next(iter(all_data.values()))
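
Below is a minimal usage sketch of the updated read_csv_files, assuming the signature and behaviours documented in the diff above; the resource folder path and file names are hypothetical:

from pathlib import Path

from tlo.util import read_csv_files

folder = Path("resources/costing")  # hypothetical folder containing csv files

# default (files=0): a single DataFrame built from the first csv file found
first_df = read_csv_files(folder)

# files=None: a dict mapping every csv file stem in the folder to a DataFrame
all_dfs = read_csv_files(folder, files=None)

# a single file name (string): a DataFrame for that file
one_df = read_csv_files(folder, files="parameter_values")

# a list of file names: a dict of DataFrames, one entry per name
some_dfs = read_csv_files(folder, files=["parameter_values", "unit_costs"])

# column dtypes are passed through to pandas.read_csv
typed_df = read_csv_files(folder, files="parameter_values", dtype={"value": float})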

55 changes: 45 additions & 10 deletions tests/test_utils.py
@@ -332,30 +332,65 @@ def copy_files_to_temporal_directory_and_return_path(tmpdir):
return tmpdir_resource_filepath


def test_read_csv_method_with_no_file(tmpdir):
""" read csv method when no file name is supplied
i) should return dictionary.
ii) dictionary keys should match csv file names in resource folder
iii) all dictionary values should be dataframes
def test_pass_datatypes_to_read_csv_method(tmpdir):
""" test passing column datatypes to read csv method. Final column datatype should change to what has been passed """
# get the path of the temporary directory and write a small sample csv file into it
path_to_tmpdir = Path(tmpdir)
sample_data = pd.DataFrame(data={'numbers1': [5,6,8,4,9,6], 'numbers2': [19,27,53,49,75,56]}, dtype=int)
sample_data.to_csv(tmpdir/'sample_data.csv', index=False)
# read from the sample data file
read_sample_data = read_csv_files(path_to_tmpdir, files='sample_data')
# confirm column datatype is what was assigned
assert read_sample_data.numbers1.dtype == 'int' and read_sample_data.numbers2.dtype == 'int'
# define new datatypes
datatype = {'numbers1': int, 'numbers2': float}
# pass the new datatypes to the read csv method and confirm the column datatypes have changed to what is now declared
assign_dtype = read_csv_files(path_to_tmpdir, files='sample_data', dtype=datatype)
assert assign_dtype.numbers1.dtype == 'int' and assign_dtype.numbers2.dtype == 'float'


def test_read_csv_file_method_passing_none_to_files_argument(tmpdir):
""" test reading csv files with one file in the target resource file and setting to None the files argument

Expectations
1. should return a dictionary
2. the dictionary key name should match the file name
"""
# copy resource files to the temporary directory and get its path
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
# choose an Excel file with one sheet in it and convert it to a csv file
convert_excel_files_to_csv(tmpdir_resource_filepath, files=['ResourceFile_load-parameters.xlsx'])
# get the folder containing the newly converted csv file and check the expected behavior
this_csv_resource_folder = tmpdir_resource_filepath/"ResourceFile_load-parameters"
file_names = [csv_file_path.stem for csv_file_path in this_csv_resource_folder.rglob("*.csv")]
one_csv_file_in_folder_dict = read_csv_files(this_csv_resource_folder, files=None)
assert isinstance(one_csv_file_in_folder_dict, dict)
assert set(one_csv_file_in_folder_dict.keys()) == set(file_names)


def test_read_csv_method_with_default_value_for_files_argument(tmpdir):
""" read csv method when no file name(s) is supplied to the files argument
i) should return a dataframe of the first csv file in the folder. Similar to pd.read_excel returning
a dataframe of first sheet in the file.

:param tmpdir: path to a temporal directory

"""
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
file_names = [csv_file_path.stem for csv_file_path in tmpdir_resource_filepath.rglob("*.csv")]
df_no_files = read_csv_files(tmpdir_resource_filepath)
assert isinstance(df_no_files, dict)
assert set(df_no_files.keys()) == set(file_names)
assert all(isinstance(value, pd.DataFrame) for value in df_no_files.values())
first_file_in_folder_df = read_csv_files(tmpdir_resource_filepath, files=file_names[0])
assert isinstance(df_no_files, pd.DataFrame)
pd.testing.assert_frame_equal(first_file_in_folder_df, df_no_files)


def test_read_csv_method_with_one_file(tmpdir):
""" test read csv method when one file name is supplied. should return a dataframe
""" test read csv method when one file name is supplied to files argument. should return a dataframe
:param tmpdir: path to a temporal directory

"""
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
df = read_csv_files(tmpdir_resource_filepath, files=['df_at_healthcareseeking'])
df = read_csv_files(tmpdir_resource_filepath, files='df_at_healthcareseeking')
assert isinstance(df, pd.DataFrame)

