Skip to content

Commit

Permalink
Add audbackend.checksum() (#245)
Browse files Browse the repository at this point in the history
* Add audbackend.checksum()

* Add docstring example

* Add test for missing file

* Simplify tests

* Add test for folder

* Add raises section to docstring

* Update test for Windows

* Improve docstring
  • Loading branch information
hagenw authored Nov 14, 2024
1 parent f9934a6 commit 1d0c713
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 9 deletions.
1 change: 1 addition & 0 deletions audbackend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from audbackend.core.backend.base import Base as Backend # legacy
from audbackend.core.backend.filesystem import FileSystem # legacy
from audbackend.core.errors import BackendError
from audbackend.core.utils import checksum
from audbackend.core.repository import Repository

# Import optional backends (legacy)
Expand Down
8 changes: 4 additions & 4 deletions audbackend/core/backend/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,12 @@ def _assert_equal_checksum(
"""
if path_is_local:
checksum = audeer.md5(path)
checksum = utils.checksum(path)
else:
checksum = self.checksum(path)

if path_ref_is_local:
checksum_ref = audeer.md5(path_ref)
checksum_ref = utils.checksum(path_ref)
else:
checksum_ref = self.checksum(path_ref)

Expand Down Expand Up @@ -569,7 +569,7 @@ def get_file(
msg = f"Permission denied: '{dst_path}'"
raise PermissionError(msg)

if not os.path.exists(dst_path) or audeer.md5(dst_path) != self.checksum(
if not os.path.exists(dst_path) or utils.checksum(dst_path) != self.checksum(
src_path
):
# get file to a temporary directory first,
Expand Down Expand Up @@ -1042,7 +1042,7 @@ def put_file(
elif os.path.isdir(src_path):
raise utils.raise_is_a_directory(src_path)

checksum = audeer.md5(src_path)
checksum = utils.checksum(src_path)

# skip if file with same checksum already exists
if not self.exists(dst_path) or self.checksum(dst_path) != checksum:
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/backend/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _checksum(
) -> str:
r"""MD5 checksum of file on backend."""
path = self._expand(path)
return audeer.md5(path)
return utils.checksum(path)

def _collapse(
self,
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/interface/versioned.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def checksum(
Examples:
>>> file = "src.txt"
>>> import audeer
>>> audeer.md5(file)
>>> audbackend.checksum(file)
'd41d8cd98f00b204e9800998ecf8427e'
>>> interface.put_file(file, "/file.txt", "1.0.0")
>>> interface.checksum("/file.txt", "1.0.0")
Expand Down
57 changes: 57 additions & 0 deletions audbackend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import os
import re

import audeer

from audbackend.core.errors import BackendError


Expand Down Expand Up @@ -78,6 +80,61 @@ def check_version(version: str) -> str:
return version


def checksum(file: str) -> str:
    r"""Checksum of file.

    This function is used by backends
    to get the checksum of local files,
    using :func:`audeer.md5`.

    Parquet files are an exception:
    their ``"hash"`` metadata entry
    is used as checksum,
    if the entry is available
    and pyarrow_ is installed.

    .. _pyarrow: https://arrow.apache.org/docs/python/index.html

    Args:
        file: file path with extension

    Returns:
        MD5 checksum of file,
        or the value of the ``"hash"`` metadata entry
        for parquet files that provide it

    Raises:
        FileNotFoundError: if ``file`` does not exist

    Examples:
        >>> checksum("src.txt")
        'd41d8cd98f00b204e9800998ecf8427e'
        >>> import audformat
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> import pyarrow.parquet as pq
        >>> df = pd.DataFrame([0, 1], columns=["a"])
        >>> hash = audformat.utils.hash(df, strict=True)
        >>> hash
        '9021a9b6e1e696ba9de4fe29346319b2'
        >>> table = pa.Table.from_pandas(df)
        >>> table = table.replace_schema_metadata({"hash": hash})
        >>> pq.write_table(table, "file.parquet", compression="snappy")
        >>> checksum("file.parquet")
        '9021a9b6e1e696ba9de4fe29346319b2'

    """
    if audeer.file_extension(file) == "parquet":
        # pyarrow is an optional dependency,
        # so only the import itself is guarded.
        # Errors raised while reading the schema
        # are not swallowed by the fallback.
        try:
            import pyarrow.parquet as parquet
        except ModuleNotFoundError:
            pass
        else:
            # ``metadata`` is ``None`` if the schema has no metadata
            metadata = parquet.read_schema(file).metadata or {}
            if b"hash" in metadata:
                return metadata[b"hash"].decode()
    return audeer.md5(file)


def date_format(date: datetime.datetime) -> str:
    """Format a date as ``YYYY-MM-DD`` string.

    Args:
        date: date object

    Returns:
        string containing year, month and day,
        separated by ``-``

    """
    pattern = "%Y-%m-%d"
    formatted = date.strftime(pattern)
    return formatted

Expand Down
1 change: 1 addition & 0 deletions docs/api-src/audbackend.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ and functions are available.
BackendError
Repository
access
checksum
create
delete
register
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
autodoc_inherit_docstrings = False # disable docstring inheritance
intersphinx_mapping = {
"audeer": ("https://audeering.github.io/audeer/", None),
"audformat": ("https://audeering.github.io/audformat/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"python": ("https://docs.python.org/3/", None),
}
Expand Down
4 changes: 1 addition & 3 deletions tests/bad_file_system.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import audeer

import audbackend


Expand All @@ -16,7 +14,7 @@ def put_file(
verbose: bool = False,
):
r"""Put file on backend."""
checksum = audeer.md5(src_path)
checksum = audbackend.checksum(src_path)
audbackend.core.utils.call_function_on_backend(
self._put_file,
src_path,
Expand Down
1 change: 1 addition & 0 deletions tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
audformat
pytest<8.1.0 # required by doctestplus
pytest-cov
pytest-doctestplus
112 changes: 112 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import sys

import pyarrow
import pyarrow.parquet as parquet
import pytest

import audeer

import audbackend


@pytest.fixture
def pyarrow_installed(request):
    """Simulate missing pyarrow installation.

    If the fixture is requested with a false parameter,
    the ``"pyarrow"`` entry in ``sys.modules``
    is set to ``None`` for the duration of the test,
    which makes every ``import pyarrow...`` statement
    raise ``ModuleNotFoundError``.
    Afterwards the previous entry is restored.

    Args:
        request: request parameter for indirect call of fixture

    Yields:
        ``True`` if pyarrow is left untouched,
        ``False`` if it is hidden

    """
    if request.param:
        yield True
    else:
        # Remember the current entry,
        # so the very same module object can be put back
        # instead of forcing a re-import after the test
        not_imported = object()
        previous = sys.modules.get("pyarrow", not_imported)
        sys.modules["pyarrow"] = None
        try:
            yield False
        finally:
            if previous is not_imported:
                del sys.modules["pyarrow"]
            else:
                sys.modules["pyarrow"] = previous


class TestChecksum:
    """Test local checksum calculation."""

    @classmethod
    @pytest.fixture(autouse=True)
    def setup(cls, tmpdir):
        """Prepare files for tests.

        Creates a text file,
        a parquet file without metadata,
        a parquet file with a ``"hash"`` metadata entry,
        and a folder inside ``tmpdir``.
        Their paths are stored in ``cls.files``,
        using the file or folder name as key.

        Args:
            tmpdir: temporary directory fixture

        """
        cls.files = {}

        file = "file.txt"
        cls.files[file] = audeer.path(tmpdir, file)
        with open(cls.files[file], "w") as fp:
            fp.write("hello\n")

        file = "file.parquet"
        cls.files[file] = audeer.path(tmpdir, file)
        table = pyarrow.Table.from_pylist([{"a": 0, "b": 1}])
        parquet.write_table(table, cls.files[file], compression="snappy")

        file = "file-metadata.parquet"
        cls.files[file] = audeer.path(tmpdir, file)
        metadata = {"hash": "my-hash"}
        table = table.replace_schema_metadata(metadata)
        parquet.write_table(table, cls.files[file], compression="snappy")

        file = "folder"
        cls.files[file] = audeer.mkdir(tmpdir, file)

    @pytest.mark.parametrize("pyarrow_installed", [True, False], indirect=True)
    @pytest.mark.parametrize(
        "file",
        ["file.txt", "file.parquet", "file-metadata.parquet", "folder"],
    )
    def test_checksum(self, file, pyarrow_installed):
        """Test checksum of local file.

        Args:
            file: file name, see ``setup()``
            pyarrow_installed: if ``False``,
                it hides the ``pyarrow`` module

        """
        path = self.files[file]
        expected_checksum = self.determine_expected_checksum(file, pyarrow_installed)
        assert audbackend.checksum(path) == expected_checksum

    @pytest.mark.parametrize(
        "file, error, error_msg",
        [
            ("non-existing.txt", FileNotFoundError, "non-existing.txt"),
            ("non-existing.parquet", FileNotFoundError, "non-existing.parquet"),
        ],
    )
    def test_errors(self, file, error, error_msg):
        """Test expected errors.

        Args:
            file: file path
            error: expected error
            error_msg: expected error message.
                For ``FileNotFoundError``,
                we recommend to use only the file name
                as the rest of the error message differs under Windows

        """
        # Resolve the path outside of the ``raises`` context,
        # so only the call under test can trigger the expected error
        path = self.files.get(file, file)
        with pytest.raises(error, match=error_msg):
            audbackend.checksum(path)

    def determine_expected_checksum(self, file, pyarrow_installed):
        """Expected checksum for file and pyarrow installation.

        Args:
            file: file to calculate checksum for
            pyarrow_installed: if ``True`` it assumes ``pyarrow`` is installed

        Returns:
            ``"my-hash"`` for the parquet file with metadata
            when ``pyarrow`` is available,
            otherwise the result of ``audeer.md5()``

        """
        if file == "file-metadata.parquet" and pyarrow_installed:
            return "my-hash"
        return audeer.md5(self.files[file])

0 comments on commit 1d0c713

Please sign in to comment.