diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py
index 36137df..5290b80 100644
--- a/audbcards/core/dataset.py
+++ b/audbcards/core/dataset.py
@@ -22,7 +22,9 @@ class _Dataset:
     _table_related_cached_properties = [
         "segment_durations",
         "segments",
+        "tables_columns",
         "tables_preview",
+        "tables_rows",
     ]
     """Cached properties relying on table data.
 
@@ -510,6 +512,22 @@ def tables(self) -> typing.List[str]:
         tables = list(db)
         return tables
 
+    @functools.cached_property
+    def tables_columns(self) -> typing.Dict[str, int]:
+        """Number of columns for each table of the dataset.
+
+        Returns:
+            dictionary with table IDs as keys
+            and number of columns as values
+
+        Examples:
+            >>> ds = Dataset("emodb", "1.4.1")
+            >>> ds.tables_columns["speaker"]
+            3
+
+        """
+        return {table: stats["columns"] for table, stats in self._tables_stats.items()}
+
     @functools.cached_property
     def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
         """Table preview for each table of the dataset.
@@ -540,21 +558,32 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
 
         """
         preview = {}
-        for table in list(self.header):
-            df = audb.load_table(
-                self.name,
-                table,
-                version=self.version,
-                verbose=False,
-            )
+        for table, stats in self._tables_stats.items():
+            df = stats["preview"]
             df = df.reset_index()
             header = [df.columns.tolist()]
-            body = df.head(5).astype("string").values.tolist()
+            body = df.astype("string").values.tolist()
             # Remove unwanted chars and limit length of each entry
             body = [[self._parse_text(column) for column in row] for row in body]
             preview[table] = header + body
         return preview
 
+    @functools.cached_property
+    def tables_rows(self) -> typing.Dict[str, int]:
+        """Number of rows for each table of the dataset.
+
+        Returns:
+            dictionary with table IDs as keys
+            and number of rows as values
+
+        Examples:
+            >>> ds = Dataset("emodb", "1.4.1")
+            >>> ds.tables_rows["speaker"]
+            10
+
+        """
+        return {table: stats["rows"] for table, stats in self._tables_stats.items()}
+
     @functools.cached_property
     def tables_table(self) -> typing.List[str]:
         """Tables of the dataset."""
@@ -751,6 +780,39 @@ def _segments(self) -> pd.MultiIndex:
             index = audformat.utils.union([index, df.index])
         return index
 
+    @functools.cached_property
+    def _tables_stats(self) -> typing.Dict[str, dict]:
+        """Table information of tables in the dataset.
+
+        Caches table information to improve performance
+        of multiple table-related properties.
+        This property computes and stores statistics for all tables,
+        reducing repeated computations.
+        It significantly improves performance
+        when accessing multiple table properties frequently.
+
+        Returns:
+            A dictionary with table names as keys and dictionaries containing:
+            - "columns": number of columns
+            - "rows": number of rows
+            - "preview": dataframe preview (first 5 rows)
+
+        """
+        stats = {}
+        for table in list(self.header):
+            df = audb.load_table(
+                self.name,
+                table,
+                version=self.version,
+                verbose=False,
+            )
+            stats[table] = {
+                "columns": len(df.columns),
+                "rows": len(df),
+                "preview": df.head(5),
+            }
+        return stats
+
     @staticmethod
     def _map_iso_languages(languages: typing.List[str]) -> typing.List[str]:
         r"""Calculate ISO languages for a list of languages.
diff --git a/audbcards/core/templates/datacard_tables.j2 b/audbcards/core/templates/datacard_tables.j2
index 0b11de5..5fc9807 100644
--- a/audbcards/core/templates/datacard_tables.j2
+++ b/audbcards/core/templates/datacard_tables.j2
@@ -41,8 +41,10 @@ Tables
{% for column in row %}
{{ column }}
{% endfor %}
+
{% endif %}
{% endfor %}
+{{ tables_rows[row[0]] }} {% if tables_rows[row[0]] == 1 %}row{% else %}rows{% endif %} x {{ tables_columns[row[0]] }} {% if tables_columns[row[0]] == 1 %}column{% else %}columns{% endif %}
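The template hunk above pulls its new footer text from the two Dataset properties added in dataset.py. A minimal usage sketch, not part of the patch, assuming audbcards.Dataset is constructed as in the docstring examples above; the commented values are the ones quoted there for emodb 1.4.1:

    import audbcards

    # Construct the dataset as in the docstring examples
    # (metadata is fetched via audb; table data is loaded lazily
    # and only once, through the shared _tables_stats cache).
    ds = audbcards.Dataset("emodb", "1.4.1")

    print(ds.tables_columns["speaker"])  # 3, as quoted in the docstring example
    print(ds.tables_rows["speaker"])  # 10, as quoted in the docstring example

    # Rebuild the "N rows x M columns" footer the Jinja template renders per table
    for table in ds.tables:
        rows = ds.tables_rows[table]
        cols = ds.tables_columns[table]
        print(
            f"{rows} {'row' if rows == 1 else 'rows'} x "
            f"{cols} {'column' if cols == 1 else 'columns'}"
        )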
diff --git a/audbcards/sphinx/table-preview.css b/audbcards/sphinx/table-preview.css
index ce82007..ba4019f 100644
--- a/audbcards/sphinx/table-preview.css
+++ b/audbcards/sphinx/table-preview.css
@@ -34,6 +34,10 @@ table.preview td {
     border-top: none;
     border-bottom: none;
 }
+table.preview td p.table-statistic {
+    /* Make "N rows x M columns" smaller */
+    font-size: 90%;
+}
 table.clickable td:not(.expanded-row-content),
 table.clickable th {
     /* Allow to center cell copntent with `margin: auto` */
diff --git a/tests/test_data/rendered_templates/medium_db.rst b/tests/test_data/rendered_templates/medium_db.rst
index 5abaa7a..0dd4912 100644
--- a/tests/test_data/rendered_templates/medium_db.rst
+++ b/tests/test_data/rendered_templates/medium_db.rst
@@ -73,10 +73,13 @@ Tables

data/f0.wav
0
data/f1.wav
1
+2 rows x 1 column
@@ -104,22 +107,27 @@ Tables

0 days 00:00:00
0 days 00:00:00.500000
neutral
data/f0.wav
0 days 00:00:00.500000
0 days 00:00:01
neutral
data/f1.wav
0 days 00:00:00
0 days 00:02:30
happy
data/f1.wav
0 days 00:02:30
0 days 00:05:01
angry
+4 rows x 1 column
@@ -145,11 +153,14 @@ Tables

0
23
female
1
49
male
+2 rows x 2 columns
diff --git a/tests/test_data/rendered_templates/minimal_db.rst b/tests/test_data/rendered_templates/minimal_db.rst
index 97b956a..248991e 100644
--- a/tests/test_data/rendered_templates/minimal_db.rst
+++ b/tests/test_data/rendered_templates/minimal_db.rst
@@ -58,7 +58,9 @@ Tables

f0.wav
0
+1 row x 1 column
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 38d559c..d73bdfa 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -7,7 +7,6 @@
 
 import audb
 import audeer
-import audformat
 import audiofile
 
 import audbcards
@@ -50,12 +49,67 @@ def test_dataset_property_scope(tmpdir, db, request):
 
 
 @pytest.mark.parametrize(
-    "db",
+    "db, "
+    "expected_description, "
+    "expected_schemes_table, "
+    "expected_tables_table, "
+    "expected_tables_columns, "
+    "expected_tables_rows, "
+    "expected_segment_durations",
     [
-        "medium_db",
+        (
+            "bare_db",
+            "",
+            [[]],
+            [["ID", "Type", "Columns"]],
+            {},
+            {},
+            [],
+        ),
+        (
+            "minimal_db",
+            "Minimal database.",
+            [[]],
+            [["ID", "Type", "Columns"], ["files", "filewise", "speaker"]],
+            {"files": 1},
+            {"files": 1},
+            [],
+        ),
+        (
+            "medium_db",
+            "Medium database. | Some description |.",
+            [
+                ["ID", "Dtype", "Min", "Labels", "Mappings"],
+                ["age", "int", 0, "", ""],
+                ["emotion", "str", "", "angry, happy, neutral", ""],
+                ["gender", "str", "", "female, male", ""],
+                ["speaker", "int", "", "0, 1", "age, gender"],
+            ],
+            [
+                ["ID", "Type", "Columns"],
+                ["files", "filewise", "speaker"],
+                ["segments", "segmented", "emotion"],
+                ["speaker", "misc", "age, gender"],
+            ],
+            {"files": 1, "segments": 1, "speaker": 2},
+            {"files": 2, "segments": 4, "speaker": 2},
+            [0.5, 0.5, 150, 151],
+        ),
     ],
 )
-def test_dataset(audb_cache, tmpdir, repository, db, request):
+def test_dataset(
+    audb_cache,
+    tmpdir,
+    repository,
+    request,
+    db,
+    expected_description,
+    expected_schemes_table,
+    expected_tables_table,
+    expected_tables_columns,
+    expected_tables_rows,
+    expected_segment_durations,
+):
     r"""Test audbcards.Dataset object and all its properties."""
     db = request.getfixturevalue(db)
 
@@ -115,7 +169,7 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
 
     # duration
     expected_duration = db.files_duration(db.files).sum()
-    assert dataset.duration == expected_duration
+    assert dataset.duration == pd.to_timedelta(expected_duration)
 
     # files
     expected_files = len(db.files)
@@ -175,17 +229,9 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
     assert dataset.schemes == expected_schemes
 
     # schemes_table
-    expected_schemes_table = [
-        ["ID", "Dtype", "Min", "Labels", "Mappings"],
-        ["age", "int", 0, "", ""],
-        ["emotion", "str", "", "angry, happy, neutral", ""],
-        ["gender", "str", "", "female, male", ""],
-        ["speaker", "int", "", "0, 1", "age, gender"],
-    ]
     assert dataset.schemes_table == expected_schemes_table
 
     # segment_durations
-    expected_segment_durations = [0.5, 0.5, 150, 151]
     assert dataset.segment_durations == expected_segment_durations
 
     # segments
@@ -193,28 +239,19 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
     assert dataset.segments == expected_segments
 
     # short_description
-    max_desc_length = 150
-    expected_description = (
-        db.description
-        if (len(db.description) < max_desc_length)
-        else f"{db.description[:max_desc_length - 3]}..."
-    )
     assert dataset.short_description == expected_description
 
     # tables
     expected_tables = list(db)
     assert dataset.tables == expected_tables
 
+    # tables_columns
+    assert dataset.tables_columns == expected_tables_columns
+
+    # tables_rows
+    assert dataset.tables_rows == expected_tables_rows
+
     # tables_table
-    expected_tables_table = [["ID", "Type", "Columns"]]
-    for table_id in list(db):
-        table = db[table_id]
-        if isinstance(table, audformat.MiscTable):
-            table_type = "misc"
-        else:
-            table_type = table.type
-        columns = ", ".join(list(table.columns))
-        expected_tables_table.append([table_id, table_type, columns])
     assert dataset.tables_table == expected_tables_table
 
     # version
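The new parametrized expectations encode a simple relationship between each table and the added properties. A pandas-only sketch of that relationship, not part of the test suite; the DataFrame below is a stand-in for the files table of the medium_db fixture, which the real code obtains via audb.load_table:

    import pandas as pd

    # Stand-in for the "files" table of the medium_db fixture:
    # two files, one "speaker" column, file path as index.
    df = pd.DataFrame(
        {"speaker": [0, 1]},
        index=pd.Index(["data/f0.wav", "data/f1.wav"], name="file"),
    )

    # What _tables_stats stores per table ...
    stats = {
        "columns": len(df.columns),
        "rows": len(df),
        "preview": df.head(5),
    }

    # ... and what the parametrized test expects for this table.
    assert stats["columns"] == 1  # expected_tables_columns["files"]
    assert stats["rows"] == 2  # expected_tables_rows["files"]

    # tables_preview resets the index and prepends the column names;
    # the _parse_text() cleanup applied in dataset.py is omitted here.
    preview = [df.reset_index().columns.tolist()]
    preview += df.reset_index().astype("string").values.tolist()
    # [["file", "speaker"], ["data/f0.wav", "0"], ["data/f1.wav", "1"]]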