Updates to read CSV method #1512

Open
wants to merge 16 commits into master
Changes from 12 commits
41 changes: 31 additions & 10 deletions src/tlo/util.py
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
from pandas import DataFrame, DateOffset
from pandas._typing import DtypeArg

from tlo import Population, Property, Types

@@ -474,7 +475,9 @@ def convert_excel_files_to_csv(folder: Path, files: Optional[list[str]] = None,
Path(folder/excel_file_path).unlink()


def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | dict[str, DataFrame]:
def read_csv_files(folder: Path,
dtype: DtypeArg | dict[str, DtypeArg] | None = None,
files: str | int | list[str] | None = 0) -> DataFrame | dict[str, DataFrame]:
"""
A function to read CSV files in a similar way pandas reads Excel files (:py:func:`pandas.read_excel`).

@@ -484,8 +487,20 @@ def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame
:py:func:`pandas.drop`.

:param folder: Path to folder containing CSV files to read.
:param dtype: data type to apply to all columns, or a dictionary mapping column names to data types when different columns need different types
:param files: preferred csv file name(s). This is the same as sheet names in an Excel file. Note that if None (no files
selected) then all files in the containing folder will be loaded
selected) then all csv files in the folder will be read

Please take note of the following behaviours:
-----------------------------------------------
- if the files argument is left at zero (the default) and the folder contains one or more csv files,
  this method will return a dataframe of the first file. If the folder contains multiple files, it is
  best to specify file names or set the files argument to None to ensure the correct files are selected
- if the files argument is set to None and the folder contains one or more csv files, this method
  will return a dictionary of dataframes
- if the folder contains multiple files and the files argument is given a single file name, this
  method will return a dataframe. It will return a dictionary of dataframes when the files argument
  is given a list of file names

"""
all_data: dict[str, DataFrame] = {} # dataframes dictionary
@@ -498,15 +513,21 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
for _key, dataframe in dataframes_dict.items():
all_data[_key] = dataframe.drop(dataframe.filter(like='Unnamed'), axis=1) # filter and drop Unnamed columns

if files is None:
for f_name in folder.rglob("*.csv"):
all_data[f_name.stem] = pd.read_csv(f_name)

return_dict = False # a flag that will determine whether the output should be a dictionary or a DataFrame
if isinstance(files, list):
return_dict = True
elif files is None:
files = [f_name.stem for f_name in folder.rglob("*.csv")]
return_dict = True
elif isinstance(files, str):
files = [files]
else:
Collaborator

At the moment this else clause will be run whenever files is not an instance of list or str and is not None. Looking at the type hint / default value I guess this is meant to cover the case when files is an instance of an int? If so it would be better to make this elif isinstance(files, int): and add a separate else: clause that raises an error saying files is not of an expected type if none of the previous clauses have run. At the moment, for example, if a user passes a non-list iterable data structure as the files argument - for example a tuple or generator (as returned for example by Path.glob and Path.rglob) - then the value passed will be silently overwritten, which is probably not what they would expect. Failing loudly if files is of an unexpected type (and/or allowing files to be of more general iterable types by, for example, checking if it is an instance of Iterable from collections.abc instead of list) would be better here.

Collaborator

You could also avoid the duplication of the list comprehension [f_name.stem for f_name in folder.glob("*.csv")] by having something like

elif isinstance(files, int) or files is None:
    return_dict = files is None
    files = [f_name.stem for f_name in folder.glob("*.csv")]
else:
    raise TypeError(f"Value passed for files argument {files} is not one of expected types.")

Collaborator Author

The thought of combining those two conditions crossed my mind but I was lacking a smart way of setting return_dict to either True or False. Thanks for this @matt-graham

Collaborator Author

I believe using isinstance(files, collections.abc.Iterable) is not the best approach here, as it also considers strings to be iterable. If a string is included, it could alter the expected behavior of the return type when modelers specify a file name by passing a string. This would cause the method to return a dictionary instead of a DataFrame, which is likely not the intended outcome.
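
To illustrate this point, here is a minimal sketch (not part of the PR; the helper name normalise_files is hypothetical) showing that a plain string already counts as an Iterable, so it would have to be excluded explicitly:

from collections.abc import Iterable

# A str is itself an Iterable (it iterates over its characters), so a naive
# Iterable check cannot tell a single file name apart from a list of names.
isinstance("sample_data", Iterable)   # True
isinstance(["a", "b"], Iterable)      # True

# One way to treat a string as a single file name while still accepting
# other iterables would be to check for str first:
def normalise_files(files):
    if isinstance(files, str):
        return [files]                # single file name
    if isinstance(files, Iterable):
        return list(files)            # list, tuple, generator, ...
    raise TypeError(f"Unexpected type for files: {type(files)}")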

for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"))
files = [f_name.stem for f_name in folder.rglob("*.csv")]
mnjowe marked this conversation as resolved.

for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"), dtype=dtype)
# clean and return the dataframe dictionary
clean_dataframe(all_data)
# If only one file loaded return dataframe directly rather than dict
return next(iter(all_data.values())) if len(all_data) == 1 else all_data
# return a dictionary if return_dict flag is set to True else return a dataframe
return all_data if return_dict else next(iter(all_data.values()))
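
Below is a minimal usage sketch of the updated read_csv_files, assuming the signature and behaviours documented in the diff above; the resource folder path and file names are hypothetical:

from pathlib import Path

from tlo.util import read_csv_files

folder = Path("resources/costing")  # hypothetical folder containing csv files

# default (files=0): a single DataFrame built from the first csv file found
first_df = read_csv_files(folder)

# files=None: a dict mapping every csv file stem in the folder to a DataFrame
all_dfs = read_csv_files(folder, files=None)

# a single file name (string): a DataFrame for that file
one_df = read_csv_files(folder, files="parameter_values")

# a list of file names: a dict of DataFrames, one entry per name
some_dfs = read_csv_files(folder, files=["parameter_values", "unit_costs"])

# column dtypes are passed through to pandas.read_csv
typed_df = read_csv_files(folder, files="parameter_values", dtype={"value": float})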

55 changes: 45 additions & 10 deletions tests/test_utils.py
@@ -332,30 +332,65 @@ def copy_files_to_temporal_directory_and_return_path(tmpdir):
return tmpdir_resource_filepath


def test_read_csv_method_with_no_file(tmpdir):
""" read csv method when no file name is supplied
i) should return dictionary.
ii) dictionary keys should match csv file names in resource folder
iii) all dictionary values should be dataframes
def test_pass_datatypes_to_read_csv_method(tmpdir):
""" test passing column datatypes to read csv method. Final column datatype should change to what has been passed """
# get the path of the temporary directory and write a small sample csv file into it
path_to_tmpdir = Path(tmpdir)
sample_data = pd.DataFrame(data={'numbers1': [5,6,8,4,9,6], 'numbers2': [19,27,53,49,75,56]}, dtype=int)
sample_data.to_csv(tmpdir/'sample_data.csv', index=False)
# read from the sample data file
read_sample_data = read_csv_files(path_to_tmpdir, files='sample_data')
# confirm column datatype is what was assigned
assert read_sample_data.numbers1.dtype == 'int' and read_sample_data.numbers2.dtype == 'int'
# define new datatypes
datatype = {'numbers1': int, 'numbers2': float}
# pass the new datatypes to the read csv method and confirm the column datatypes have changed to what is now declared
assign_dtype = read_csv_files(path_to_tmpdir, files='sample_data', dtype=datatype)
assert assign_dtype.numbers1.dtype == 'int' and assign_dtype.numbers2.dtype == 'float'


def test_read_csv_file_method_passing_none_to_files_argument(tmpdir):
""" test reading csv files with one file in the target resource file and setting to None the files argument

Expectations
1. should return a dictionary
2. the dictionary key name should match the file name
"""
# copy resource files to the temporary directory and get its path
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
# choose an Excel file with one sheet in it and convert it to a csv file
convert_excel_files_to_csv(tmpdir_resource_filepath, files=['ResourceFile_load-parameters.xlsx'])
# get the folder containing the newly converted csv file and check the expected behavior
this_csv_resource_folder = tmpdir_resource_filepath/"ResourceFile_load-parameters"
file_names = [csv_file_path.stem for csv_file_path in this_csv_resource_folder.rglob("*.csv")]
one_csv_file_in_folder_dict = read_csv_files(this_csv_resource_folder, files=None)
assert isinstance(one_csv_file_in_folder_dict, dict)
assert set(one_csv_file_in_folder_dict.keys()) == set(file_names)


def test_read_csv_method_with_default_value_for_files_argument(tmpdir):
""" read csv method when no file name(s) is supplied to the files argument
i) should return a dataframe of the first csv file in the folder. Similar to pd.read_excel returning
a dataframe of first sheet in the file.

:param tmpdir: path to a temporal directory

"""
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
file_names = [csv_file_path.stem for csv_file_path in tmpdir_resource_filepath.rglob("*.csv")]
df_no_files = read_csv_files(tmpdir_resource_filepath)
assert isinstance(df_no_files, dict)
assert set(df_no_files.keys()) == set(file_names)
assert all(isinstance(value, pd.DataFrame) for value in df_no_files.values())
first_file_in_folder_df = read_csv_files(tmpdir_resource_filepath, files=file_names[0])
assert isinstance(df_no_files, pd.DataFrame)
pd.testing.assert_frame_equal(first_file_in_folder_df, df_no_files)


def test_read_csv_method_with_one_file(tmpdir):
""" test read csv method when one file name is supplied. should return a dataframe
""" test read csv method when one file name is supplied to files argument. should return a dataframe
:param tmpdir: path to a temporal directory

"""
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir)
df = read_csv_files(tmpdir_resource_filepath, files=['df_at_healthcareseeking'])
df = read_csv_files(tmpdir_resource_filepath, files='df_at_healthcareseeking')
assert isinstance(df, pd.DataFrame)

