Skip to content

Commit

Permalink
Remove audbackend.checksum() and use MD5 sum (#255)
Browse files Browse the repository at this point in the history
* TST: add failing test for parquet on Artifactory

* Revert "Add audbackend.checksum() (#245)"

This reverts commit 1d0c713.

* Fix rebasing

* Add docstring, simplify code

* Fix typo
  • Loading branch information
hagenw authored Nov 26, 2024
1 parent 1b1177d commit 639d86c
Show file tree
Hide file tree
Showing 11 changed files with 84 additions and 190 deletions.
1 change: 0 additions & 1 deletion audbackend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from audbackend.core.backend.base import Base as Backend # legacy
from audbackend.core.backend.filesystem import FileSystem # legacy
from audbackend.core.errors import BackendError
from audbackend.core.utils import checksum
from audbackend.core.repository import Repository

# Import optional backends (legacy)
Expand Down
44 changes: 30 additions & 14 deletions audbackend/core/backend/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,38 @@ def _assert_equal_checksum(
):
r"""Assert checksums are equal.
Compare the MD5 sum of a file
(``path``)
to the MD5 sum of a reference file
(``path_ref``).
If check fails,
``path`` is removed
and an error is raised.
"""
if path_is_local:
checksum = utils.checksum(path)
else:
checksum = self.checksum(path)
Both ``path`` and ``path_ref``
can be local files,
or stored on any backend.
if path_ref_is_local:
checksum_ref = utils.checksum(path_ref)
else:
checksum_ref = self.checksum(path_ref)
Args:
path: path to a file.
Its MD5 sum is compared
to a reference one,
calculated from ``path_ref``
path_is_local: if ``True``,
assumes ``path`` is stored on local machine
path_ref: path to a file.
Its MD5 sum is used as reference
path_ref_is_local: if ``True``,
assumes ``path_ref`` is stored on local machine
Raises:
InterruptedError: if the MD5 sums do not match
"""
md5 = audeer.md5(path) if path_is_local else self.checksum(path)
md5_ref = audeer.md5(path_ref) if path_ref_is_local else self.checksum(path_ref)

if checksum != checksum_ref:
if md5 != md5_ref:
if path_is_local:
os.remove(path)
location = "local file system"
Expand All @@ -90,9 +106,9 @@ def _assert_equal_checksum(
f"Execution is interrupted because "
f"{path} "
f"has checksum "
f"'{checksum}' "
f"'{md5}' "
"when the expected checksum is "
f"'{checksum_ref}'. "
f"'{md5_ref}'. "
f"The file has been removed from the "
f"{location}."
)
Expand Down Expand Up @@ -569,7 +585,7 @@ def get_file(
msg = f"Permission denied: '{dst_path}'"
raise PermissionError(msg)

if not os.path.exists(dst_path) or utils.checksum(dst_path) != self.checksum(
if not os.path.exists(dst_path) or audeer.md5(dst_path) != self.checksum(
src_path
):
# get file to a temporary directory first,
Expand Down Expand Up @@ -1042,7 +1058,7 @@ def put_file(
elif os.path.isdir(src_path):
raise utils.raise_is_a_directory(src_path)

checksum = utils.checksum(src_path)
checksum = audeer.md5(src_path)

# skip if file with same checksum already exists
if not self.exists(dst_path) or self.checksum(dst_path) != checksum:
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/backend/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _checksum(
) -> str:
r"""MD5 checksum of file on backend."""
path = self._expand(path)
return utils.checksum(path)
return audeer.md5(path)

def _collapse(
self,
Expand Down
2 changes: 1 addition & 1 deletion audbackend/core/interface/versioned.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def checksum(
Examples:
>>> file = "src.txt"
>>> import audeer
>>> audbackend.checksum(file)
>>> audeer.md5(file)
'd41d8cd98f00b204e9800998ecf8427e'
>>> interface.put_file(file, "/file.txt", "1.0.0")
>>> interface.checksum("/file.txt", "1.0.0")
Expand Down
58 changes: 0 additions & 58 deletions audbackend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import re
import time

import audeer

from audbackend.core.errors import BackendError


Expand Down Expand Up @@ -106,62 +104,6 @@ def check_version(version: str) -> str:
return version


def checksum(file: str) -> str:
    r"""Checksum of file.

    Backends call this function
    to obtain the checksum of a local file.
    For most files
    the result is the MD5 sum
    returned by :func:`audeer.md5`.

    Parquet files are handled differently:
    if pyarrow_ is installed
    and the schema metadata of the file
    contains a ``"hash"`` entry,
    the value of that entry is returned
    instead of the MD5 sum.

    .. _pyarrow: https://arrow.apache.org/docs/python/index.html

    Args:
        file: file path with extension

    Returns:
        MD5 checksum of file,
        or the ``"hash"`` metadata entry
        of a parquet file

    Raises:
        FileNotFoundError: if ``file`` does not exist

    Examples:
        >>> checksum("src.txt")
        'd41d8cd98f00b204e9800998ecf8427e'
        >>> import audformat
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> import pyarrow.parquet as pq
        >>> df = pd.DataFrame([0, 1], columns=["a"])
        >>> hash = audformat.utils.hash(df, strict=True)
        >>> hash
        '9021a9b6e1e696ba9de4fe29346319b2'
        >>> parquet_file = audeer.path("file.parquet")
        >>> table = pa.Table.from_pandas(df)
        >>> table = table.replace_schema_metadata({"hash": hash})
        >>> pq.write_table(table, parquet_file, compression="snappy")
        >>> checksum(parquet_file)
        '9021a9b6e1e696ba9de4fe29346319b2'

    """
    # Everything that is not a parquet file is hashed by content
    if audeer.file_extension(file) != "parquet":
        return audeer.md5(file)

    try:
        import pyarrow.parquet as parquet

        schema_metadata = parquet.read_schema(file).metadata
        # Metadata keys and values are stored as bytes
        if schema_metadata and b"hash" in schema_metadata:
            return schema_metadata[b"hash"].decode()
    except ModuleNotFoundError:
        # pyarrow is optional,
        # without it fall back to the MD5 sum below
        pass

    return audeer.md5(file)


def date_format(date: datetime.datetime) -> str:
return date.strftime("%Y-%m-%d")

Expand Down
1 change: 0 additions & 1 deletion docs/api-src/audbackend.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,3 @@ and functions are available.

BackendError
Repository
checksum
1 change: 0 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
autodoc_inherit_docstrings = False # disable docstring inheritance
intersphinx_mapping = {
"audeer": ("https://audeering.github.io/audeer/", None),
"audformat": ("https://audeering.github.io/audformat/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"python": ("https://docs.python.org/3/", None),
}
Expand Down
4 changes: 3 additions & 1 deletion tests/bad_file_system.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import audeer

import audbackend


Expand All @@ -14,7 +16,7 @@ def put_file(
verbose: bool = False,
):
r"""Put file on backend."""
checksum = audbackend.checksum(src_path)
checksum = audeer.md5(src_path)
audbackend.core.utils.call_function_on_backend(
self._put_file,
src_path,
Expand Down
29 changes: 29 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import audeer
import audformat

import audbackend

Expand Down Expand Up @@ -83,6 +84,34 @@ def owner(request):
yield owner


@pytest.fixture(scope="function")
def parquet_file(tmpdir):
    r"""Provide a parquet file with checksum stored in metadata.

    The file is created by saving an ``audformat`` table
    in parquet format.
    ``audformat`` can store a checksum,
    derived from the content of the table,
    in the metadata of the parquet file.
    The motivation is
    that a parquet file cannot be written
    in a deterministic way,
    so the stored checksum is used
    to track whether the content has changed.

    Args:
        tmpdir: tmpdir fixture,
            the parquet file is written into this folder

    Yields:
        path to the parquet file

    """
    path = audeer.path(tmpdir, "files.parquet")

    # Minimal database holding a single filewise table
    # with one integer column
    db = audformat.Database("mydb")
    db.schemes["age"] = audformat.Scheme("int")
    db["files"] = audformat.Table(audformat.filewise_index(["f1"]))
    table = db["files"]
    table["age"] = audformat.Column(scheme_id="age")
    table["age"].set([40])

    # save() expects the path without file extension
    path_without_ext = audeer.replace_file_extension(path, "")
    table.save(path_without_ext, storage_format="parquet")

    yield path


@pytest.fixture(scope="function", autouse=False)
def interface(tmpdir_factory, request):
r"""Create a backend with interface.
Expand Down
20 changes: 20 additions & 0 deletions tests/test_backend_artifactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,23 @@ def test_open_close(host, repository):
audbackend.backend.Artifactory.create(host, repository)
backend.open()
backend.close()


@pytest.mark.parametrize(
    "interface",
    [(audbackend.backend.Artifactory, audbackend.interface.Maven)],
    indirect=True,
)
def test_parquet_file(interface, parquet_file):
    """Test uploading a parquet file with hash in metadata.

    The deploy method of Artifactory
    has to receive the MD5 sum of the file,
    not the checksum hash
    stored in the metadata of the parquet file.

    See https://github.com/audeering/audbackend/issues/254.

    Args:
        interface: interface fixture,
            here a Maven interface on an Artifactory backend
        parquet_file: parquet_file fixture,
            path to a parquet file with a hash in its metadata

    """
    version = "1.0.0"
    # Store the file under its own name in the backend root
    dst_file = "/" + os.path.basename(parquet_file)

    interface.put_file(parquet_file, dst_file, version)

    assert interface.exists(dst_file, version)
112 changes: 0 additions & 112 deletions tests/test_utils.py

This file was deleted.

0 comments on commit 639d86c

Please sign in to comment.