Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to read CSV method #1512

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/tlo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
from pandas import DataFrame, DateOffset
from pandas._typing import DtypeArg

from tlo import Population, Property, Types

Expand Down Expand Up @@ -474,7 +475,7 @@ def convert_excel_files_to_csv(folder: Path, files: Optional[list[str]] = None,
Path(folder/excel_file_path).unlink()


def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | dict[str, DataFrame]:
def read_csv_files(folder: Path, dtype: DtypeArg | None = None, files: Optional[list[str]] | None | int = 0) -> DataFrame | dict[str, DataFrame]:
mnjowe marked this conversation as resolved.
Show resolved Hide resolved
"""
A function to read CSV files in a similar way pandas reads Excel files (:py:func:`pandas.read_excel`).

Expand All @@ -484,6 +485,7 @@ def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame
:py:func:`pandas.drop`.

:param folder: Path to folder containing CSV files to read.
:param dtype: preferred data type(s) for the loaded data, passed through to :py:func:`pandas.read_csv`
mnjowe marked this conversation as resolved.
Show resolved Hide resolved
:param files: preferred csv file name(s). This is the same as sheet names in an Excel file. Note that if None (no
files selected), then all files in the containing folder will be loaded.

Expand All @@ -498,15 +500,15 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
for _key, dataframe in dataframes_dict.items():
all_data[_key] = dataframe.drop(dataframe.filter(like='Unnamed'), axis=1) # filter and drop Unnamed columns

if files is None:
if files == 0 or files is None:
matt-graham marked this conversation as resolved.
Show resolved Hide resolved
for f_name in folder.rglob("*.csv"):
all_data[f_name.stem] = pd.read_csv(f_name)
all_data[f_name.stem] = pd.read_csv(f_name, dtype=dtype)

else:
for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"))
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"), dtype=dtype)
# clean and return the dataframe dictionary
clean_dataframe(all_data)
# If only one file loaded return dataframe directly rather than dict
return next(iter(all_data.values())) if len(all_data) == 1 else all_data
return next(iter(all_data.values())) if len(all_data) == 1 and files is not None else all_data

42 changes: 39 additions & 3 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,44 @@ def copy_files_to_temporal_directory_and_return_path(tmpdir):
return tmpdir_resource_filepath


def test_read_csv_method_with_no_file(tmpdir):
""" read csv method when no file name is supplied
def test_pass_datatypes_to_read_csv_method(tmpdir):
    """Test passing column datatypes to the read csv method.

    A small two-column integer dataframe is written to a CSV file and read back twice:

    1. without a ``dtype`` argument, where both columns are expected to be read as ``int``;
    2. with a per-column ``dtype`` mapping, where each column is expected to take the
       datatype declared for it in the mapping.

    :param tmpdir: path to a temporal directory
    """
    # copy and get resource files path in the temporal directory
    tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
    sample_data = pd.DataFrame(
        data={'numbers1': [5, 6, 8, 4, 9, 6], 'numbers2': [19, 27, 53, 49, 75, 56]},
        dtype=int,
    )
    sample_data.to_csv(tmpdir_resource_filepath / 'sample_data.csv', index=False)

    # read from the sample data file without declaring any datatypes
    read_sample_data = read_csv_files(tmpdir_resource_filepath, files=['sample_data'])
    # confirm each column datatype is what was assigned. The columns are compared one by one:
    # a chained ``a.dtype and b.dtype == 'int'`` would only ever check the second column,
    # because a bare dtype object is always truthy.
    assert read_sample_data.numbers1.dtype == 'int'
    assert read_sample_data.numbers2.dtype == 'int'

    # define new datatypes
    datatype = {'numbers1': int, 'numbers2': float}
    # pass the new datatypes to read csv method and confirm datatype has changed to what has been declared now
    assign_dtype = read_csv_files(tmpdir_resource_filepath, files=['sample_data'], dtype=datatype)
    assert assign_dtype.numbers1.dtype == 'int'
    assert assign_dtype.numbers2.dtype == 'float'


def test_read_csv_file_method_passing_none_to_files_argument(tmpdir):
    """Test reading csv files from a single-file resource folder with ``files=None``.

    Expectations
        1. a dictionary is returned (not a bare dataframe)
        2. the dictionary key names match the csv file names found in the folder

    :param tmpdir: path to a temporal directory
    """
    # work on a copy of the resource files inside the temporal directory
    resource_dir = copy_files_to_temporal_directory_and_return_path(tmpdir)

    # convert an Excel workbook (chosen because it holds one sheet) into a folder of csv files
    convert_excel_files_to_csv(resource_dir, files=['ResourceFile_load-parameters.xlsx'])

    # the conversion output lives in a folder named after the workbook
    csv_folder = resource_dir / "ResourceFile_load-parameters"
    expected_names = {csv_path.stem for csv_path in csv_folder.rglob("*.csv")}

    # explicitly passing None for files should still give back a dictionary keyed by file name
    loaded = read_csv_files(csv_folder, files=None)
    assert isinstance(loaded, dict)
    assert set(loaded.keys()) == expected_names


def test_read_csv_method_with_default_value_for_files_argument(tmpdir):
""" read csv method when no file name(s) is supplied to the files argument
i) should return dictionary.
ii) dictionary keys should match csv file names in resource folder
iii) all dictionary values should be dataframes
Expand All @@ -350,7 +386,7 @@ def test_read_csv_method_with_no_file(tmpdir):


def test_read_csv_method_with_one_file(tmpdir):
""" test read csv method when one file name is supplied. should return a dataframe
""" test read csv method when one file name is supplied to files argument. should return a dataframe
:param tmpdir: path to a temporal directory

"""
Expand Down