Add Dataset.tables_columns and tables_rows #113

Merged
13 commits merged on Oct 22, 2024
78 changes: 70 additions & 8 deletions audbcards/core/dataset.py
@@ -22,7 +22,9 @@ class _Dataset:
_table_related_cached_properties = [
"segment_durations",
"segments",
"tables_columns",
"tables_preview",
"tables_rows",
]
"""Cached properties relying on table data.

@@ -510,6 +512,22 @@ def tables(self) -> typing.List[str]:
tables = list(db)
return tables

@functools.cached_property
def tables_columns(self) -> typing.Dict[str, int]:
"""Number of columns for each table of the dataset.

Returns:
dictionary with table IDs as keys
and number of columns as values

Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_columns["speaker"]
3

"""
return {table: stats["columns"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
"""Table preview for each table of the dataset.
@@ -540,21 +558,32 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:

"""
preview = {}
for table in list(self.header):
df = audb.load_table(
self.name,
table,
version=self.version,
verbose=False,
)
for table, stats in self._tables_stats.items():
df = stats["preview"]
df = df.reset_index()
header = [df.columns.tolist()]
body = df.head(5).astype("string").values.tolist()
body = df.astype("string").values.tolist()
# Remove unwanted chars and limit length of each entry
body = [[self._parse_text(column) for column in row] for row in body]
preview[table] = header + body
return preview
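
For illustration, the list-of-lists structure that this loop produces can be reproduced with a small stand-in DataFrame (a sketch; the example data mimics the speaker table from the test fixtures rather than a table loaded via audb):

```python
# Sketch of the structure built by tables_preview: one header row followed
# by the table body, with all values converted to strings.
import pandas as pd

df = pd.DataFrame(
    {"age": [23, 49], "gender": ["female", "male"]},
    index=pd.Index([0, 1], name="speaker"),
)
df = df.reset_index()
header = [df.columns.tolist()]
body = df.astype("string").values.tolist()
print(header + body)
# [['speaker', 'age', 'gender'], ['0', '23', 'female'], ['1', '49', 'male']]
```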

@functools.cached_property
def tables_rows(self) -> typing.Dict[str, int]:
"""Number of rows for each table of the dataset.

Returns:
dictionary with table IDs as keys
and number of rows as values

Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_rows["speaker"]
10

"""
return {table: stats["rows"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_table(self) -> typing.List[str]:
"""Tables of the dataset."""
@@ -751,6 +780,39 @@ def _segments(self) -> pd.MultiIndex:
index = audformat.utils.union([index, df.index])
return index

@functools.cached_property
def _tables_stats(self) -> typing.Dict[str, dict]:
"""Table information of tables in the dataset.

Caches table information to improve performance
of multiple table-related properties.
This property computes and stores statistics for all tables,
reducing repeated computations.
It significantly improves performance
when accessing multiple table properties frequently.

Returns:
A dictionary with table names as keys and dictionaries containing:
- "columns": number of columns
- "rows": number of rows
- "preview": dataframe preview (first 5 rows)

"""
stats = {}
for table in list(self.header):
df = audb.load_table(
self.name,
table,
version=self.version,
verbose=False,
)
stats[table] = {
"columns": len(df.columns),
"rows": len(df),
"preview": df.head(5),
}
return stats
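
The underlying pattern, one expensive cached property feeding several cheap ones, can be sketched independently of audb (a minimal sketch; TableStats and its in-memory tables are hypothetical stand-ins for _Dataset and audb.load_table()):

```python
# Minimal sketch of the caching pattern: one expensive cached property
# (`_stats`) is computed once and reused by several cheap properties.
import functools

import pandas as pd


class TableStats:
    def __init__(self, tables: dict):
        # {table_id: DataFrame}, stands in for tables loaded via audb
        self.tables = tables

    @functools.cached_property
    def _stats(self) -> dict:
        # Scans every table exactly once
        return {
            name: {"rows": len(df), "columns": len(df.columns), "preview": df.head(5)}
            for name, df in self.tables.items()
        }

    @functools.cached_property
    def rows(self) -> dict:
        return {name: s["rows"] for name, s in self._stats.items()}

    @functools.cached_property
    def columns(self) -> dict:
        return {name: s["columns"] for name, s in self._stats.items()}


stats = TableStats(
    {"speaker": pd.DataFrame({"age": [23, 49], "gender": ["female", "male"]})}
)
print(stats.rows, stats.columns)  # {'speaker': 2} {'speaker': 2}
```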

@staticmethod
def _map_iso_languages(languages: typing.List[str]) -> typing.List[str]:
r"""Calculate ISO languages for a list of languages.
2 changes: 2 additions & 0 deletions audbcards/core/templates/datacard_tables.j2
@@ -41,8 +41,10 @@ Tables
{% for column in row %}
<td><p>{{ column }}</p></td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
<tr><td><p class="table-statistic">{{ tables_rows[row[0]] }} {% if tables_rows[row[0]] == 1 %}row{% else %}rows{% endif %} x {{ tables_columns[row[0]] }} {% if tables_columns[row[0]] == 1 %}column{% else %}columns{% endif %}</p></td></tr>
</tbody>
</table>
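
The pluralization logic added to the template can be tried in isolation (a sketch; the context variables rows and columns are simplified stand-ins for the real template context):

```python
# Standalone sketch of the "N rows x M columns" rendering used in
# datacard_tables.j2, including singular/plural handling.
import jinja2

template = jinja2.Template(
    "{{ rows }} {% if rows == 1 %}row{% else %}rows{% endif %} x "
    "{{ columns }} {% if columns == 1 %}column{% else %}columns{% endif %}"
)
print(template.render(rows=2, columns=1))  # 2 rows x 1 column
print(template.render(rows=1, columns=2))  # 1 row x 2 columns
```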

4 changes: 4 additions & 0 deletions audbcards/sphinx/table-preview.css
@@ -34,6 +34,10 @@ table.preview td {
border-top: none;
border-bottom: none;
}
table.preview td p.table-statistic {
/* Make "N rows x M columns" smaller */
font-size: 90%;
}
table.clickable td:not(.expanded-row-content),
table.clickable th {
/* Allow centering cell content with `margin: auto` */
27 changes: 19 additions & 8 deletions tests/test_data/rendered_templates/medium_db.rst
@@ -73,10 +73,13 @@ Tables
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>1</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 1 column</p></td></tr>
</tbody>
</table>


@@ -104,22 +107,27 @@ Tables
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>0 days 00:00:01</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>happy</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>0 days 00:05:01</p></td>
<td><p>angry</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">4 rows x 1 column</p></td></tr>
</tbody>
</table>


@@ -145,11 +153,14 @@ Tables
<td><p>0</p></td>
<td><p>23</p></td>
<td><p>female</p></td>
<tr>
</tr>
<tr>
<td><p>1</p></td>
<td><p>49</p></td>
<td><p>male</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 2 columns</p></td></tr>
</tbody>
</table>


4 changes: 3 additions & 1 deletion tests/test_data/rendered_templates/minimal_db.rst
@@ -58,7 +58,9 @@ Tables
<tr>
<td><p>f0.wav</p></td>
<td><p>0</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">1 row x 1 column</p></td></tr>
</tbody>
</table>


93 changes: 65 additions & 28 deletions tests/test_dataset.py
@@ -7,7 +7,6 @@

import audb
import audeer
import audformat
import audiofile

import audbcards
@@ -50,12 +49,67 @@ def test_dataset_property_scope(tmpdir, db, request):


@pytest.mark.parametrize(
"db",
"db, "
"expected_description, "
"expected_schemes_table, "
"expected_tables_table, "
"expected_tables_columns, "
"expected_tables_rows, "
"expected_segment_durations",
[
"medium_db",
(
"bare_db",
"",
[[]],
[["ID", "Type", "Columns"]],
{},
{},
[],
),
(
"minimal_db",
"Minimal database.",
[[]],
[["ID", "Type", "Columns"], ["files", "filewise", "speaker"]],
{"files": 1},
{"files": 1},
[],
),
(
"medium_db",
"Medium database. | Some description |.",
[
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
],
[
["ID", "Type", "Columns"],
["files", "filewise", "speaker"],
["segments", "segmented", "emotion"],
["speaker", "misc", "age, gender"],
],
{"files": 1, "segments": 1, "speaker": 2},
{"files": 2, "segments": 4, "speaker": 2},
[0.5, 0.5, 150, 151],
),
],
)
def test_dataset(audb_cache, tmpdir, repository, db, request):
def test_dataset(
audb_cache,
tmpdir,
repository,
request,
db,
expected_description,
expected_schemes_table,
expected_tables_table,
expected_tables_columns,
expected_tables_rows,
expected_segment_durations,
):
r"""Test audbcards.Dataset object and all its properties."""
db = request.getfixturevalue(db)
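
The pattern of parametrizing over fixture names and resolving them with request.getfixturevalue(), as done here, can be sketched in isolation (a minimal sketch; my_db and test_files are hypothetical):

```python
# Minimal sketch: parametrize a test over fixture *names* and resolve each
# name to the actual fixture value inside the test body.
import pytest


@pytest.fixture
def my_db():
    return {"files": ["f0.wav"]}


@pytest.mark.parametrize("db, expected_files", [("my_db", 1)])
def test_files(request, db, expected_files):
    db = request.getfixturevalue(db)  # resolve the fixture from its name
    assert len(db["files"]) == expected_files
```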

@@ -115,7 +169,7 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):

# duration
expected_duration = db.files_duration(db.files).sum()
assert dataset.duration == expected_duration
assert dataset.duration == pd.to_timedelta(expected_duration)

# files
expected_files = len(db.files)
@@ -175,46 +229,29 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
assert dataset.schemes == expected_schemes

# schemes_table
expected_schemes_table = [
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
]
assert dataset.schemes_table == expected_schemes_table

# segment_durations
expected_segment_durations = [0.5, 0.5, 150, 151]
assert dataset.segment_durations == expected_segment_durations

# segments
expected_segments = str(len(db.segments))
assert dataset.segments == expected_segments

# short_description
max_desc_length = 150
expected_description = (
db.description
if (len(db.description) < max_desc_length)
else f"{db.description[:max_desc_length - 3]}..."
)
assert dataset.short_description == expected_description

# tables
expected_tables = list(db)
assert dataset.tables == expected_tables

# tables_columns
assert dataset.tables_columns == expected_tables_columns

# tables_rows
assert dataset.tables_rows == expected_tables_rows

# tables_table
expected_tables_table = [["ID", "Type", "Columns"]]
for table_id in list(db):
table = db[table_id]
if isinstance(table, audformat.MiscTable):
table_type = "misc"
else:
table_type = table.type
columns = ", ".join(list(table.columns))
expected_tables_table.append([table_id, table_type, columns])
assert dataset.tables_table == expected_tables_table

# version
Expand Down