diff --git a/audbackend/__init__.py b/audbackend/__init__.py index c94d8468..b2113cdc 100644 --- a/audbackend/__init__.py +++ b/audbackend/__init__.py @@ -3,7 +3,6 @@ from audbackend.core.backend.base import Base as Backend # legacy from audbackend.core.backend.filesystem import FileSystem # legacy from audbackend.core.errors import BackendError -from audbackend.core.utils import checksum from audbackend.core.repository import Repository # Import optional backends (legacy) diff --git a/audbackend/core/backend/base.py b/audbackend/core/backend/base.py index 533f9cf4..299f8856 100644 --- a/audbackend/core/backend/base.py +++ b/audbackend/core/backend/base.py @@ -63,22 +63,38 @@ def _assert_equal_checksum( ): r"""Assert checksums are equal. + Compare the MD5 sum of a file + (``path``) + to the MD5 sum of a reference file + (``path_ref``). If check fails, ``path`` is removed and an error is raised. - """ - if path_is_local: - checksum = utils.checksum(path) - else: - checksum = self.checksum(path) + Both ``path`` and ``path_ref`` + can be local files, + or stored on any backend. - if path_ref_is_local: - checksum_ref = utils.checksum(path_ref) - else: - checksum_ref = self.checksum(path_ref) + Args: + path: path to a file. + Its MD5 sum is compared + to a reference one, + calculated from ``path_ref`` + path_is_local: if ``True``, + assumes ``path`` is stored on local machine + path_ref: path to a file. + Its MD5 sum is used as reference + path_ref_is_local: if ``True``, + assumes ``path_ref`` is stored on local machine + + Raises: + InterruptedError: if the MD5 sums do not match + + """ + md5 = audeer.md5(path) if path_is_local else self.checksum(path) + md5_ref = audeer.md5(path_ref) if path_ref_is_local else self.checksum(path_ref) - if checksum != checksum_ref: + if md5 != md5_ref: if path_is_local: os.remove(path) location = "local file system" @@ -90,9 +106,9 @@ def _assert_equal_checksum( f"Execution is interrupted because " f"{path} " f"has checksum " - f"'{checksum}' " + f"'{md5}' " "when the expected checksum is " - f"'{checksum_ref}'. " + f"'{md5_ref}'. " f"The file has been removed from the " f"{location}." ) @@ -569,7 +585,7 @@ def get_file( msg = f"Permission denied: '{dst_path}'" raise PermissionError(msg) - if not os.path.exists(dst_path) or utils.checksum(dst_path) != self.checksum( + if not os.path.exists(dst_path) or audeer.md5(dst_path) != self.checksum( src_path ): # get file to a temporary directory first, @@ -1042,7 +1058,7 @@ def put_file( elif os.path.isdir(src_path): raise utils.raise_is_a_directory(src_path) - checksum = utils.checksum(src_path) + checksum = audeer.md5(src_path) # skip if file with same checksum already exists if not self.exists(dst_path) or self.checksum(dst_path) != checksum: diff --git a/audbackend/core/backend/filesystem.py b/audbackend/core/backend/filesystem.py index c6b09e17..32f07258 100644 --- a/audbackend/core/backend/filesystem.py +++ b/audbackend/core/backend/filesystem.py @@ -32,7 +32,7 @@ def _checksum( ) -> str: r"""MD5 checksum of file on backend.""" path = self._expand(path) - return utils.checksum(path) + return audeer.md5(path) def _collapse( self, diff --git a/audbackend/core/interface/versioned.py b/audbackend/core/interface/versioned.py index 610fda47..02121707 100644 --- a/audbackend/core/interface/versioned.py +++ b/audbackend/core/interface/versioned.py @@ -78,7 +78,7 @@ def checksum( Examples: >>> file = "src.txt" >>> import audeer - >>> audbackend.checksum(file) + >>> audeer.md5(file) 'd41d8cd98f00b204e9800998ecf8427e' >>> interface.put_file(file, "/file.txt", "1.0.0") >>> interface.checksum("/file.txt", "1.0.0") diff --git a/audbackend/core/utils.py b/audbackend/core/utils.py index 76a430f9..4cf07584 100644 --- a/audbackend/core/utils.py +++ b/audbackend/core/utils.py @@ -5,8 +5,6 @@ import re import time -import audeer - from audbackend.core.errors import BackendError @@ -106,62 +104,6 @@ def check_version(version: str) -> str: return version -def checksum(file: str) -> str: - r"""Checksum of file. - - This function is used by backends - to get the checksum of local files, - using :func:`audeer.md5`. - - An exception are parquet files, - for which their ``"hash"`` metadata entry - is used as checksum, - if the entry is available - and pyarrow_ is installed. - - .. _pyarrow: https://arrow.apache.org/docs/python/index.html - - Args: - file: file path with extension - - Returns: - MD5 checksum of file - - Raises: - FileNotFoundError: if ``file`` does not exist - - Examples: - >>> checksum("src.txt") - 'd41d8cd98f00b204e9800998ecf8427e' - >>> import audformat - >>> import pandas as pd - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - >>> df = pd.DataFrame([0, 1], columns=["a"]) - >>> hash = audformat.utils.hash(df, strict=True) - >>> hash - '9021a9b6e1e696ba9de4fe29346319b2' - >>> parquet_file = audeer.path("file.parquet") - >>> table = pa.Table.from_pandas(df) - >>> table = table.replace_schema_metadata({"hash": hash}) - >>> pq.write_table(table, parquet_file, compression="snappy") - >>> checksum(parquet_file) - '9021a9b6e1e696ba9de4fe29346319b2' - - """ - ext = audeer.file_extension(file) - if ext == "parquet": - try: - import pyarrow.parquet as parquet - - metadata = parquet.read_schema(file).metadata or {} - if b"hash" in metadata: - return metadata[b"hash"].decode() - except ModuleNotFoundError: - pass - return audeer.md5(file) - - def date_format(date: datetime.datetime) -> str: return date.strftime("%Y-%m-%d") diff --git a/docs/api-src/audbackend.rst b/docs/api-src/audbackend.rst index 2fd8359c..8b2fdb5a 100644 --- a/docs/api-src/audbackend.rst +++ b/docs/api-src/audbackend.rst @@ -31,4 +31,3 @@ and functions are available. BackendError Repository - checksum diff --git a/docs/conf.py b/docs/conf.py index 948fda7b..1fa66c6f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,6 @@ autodoc_inherit_docstrings = False # disable docstring inheritance intersphinx_mapping = { "audeer": ("https://audeering.github.io/audeer/", None), - "audformat": ("https://audeering.github.io/audformat/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "python": ("https://docs.python.org/3/", None), } diff --git a/tests/bad_file_system.py b/tests/bad_file_system.py index f57bc30c..7e6a3e5a 100644 --- a/tests/bad_file_system.py +++ b/tests/bad_file_system.py @@ -1,3 +1,5 @@ +import audeer + import audbackend @@ -14,7 +16,7 @@ def put_file( verbose: bool = False, ): r"""Put file on backend.""" - checksum = audbackend.checksum(src_path) + checksum = audeer.md5(src_path) audbackend.core.utils.call_function_on_backend( self._put_file, src_path, diff --git a/tests/conftest.py b/tests/conftest.py index 8dd54aac..17042819 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest import audeer +import audformat import audbackend @@ -83,6 +84,34 @@ def owner(request): yield owner +@pytest.fixture(scope="function") +def parquet_file(tmpdir): + r"""Provide a parquet file with checksum stored in metadata. + + ``audformat`` provides the possibility + to store a checksum, + based on the content of a parquet file, + in the metadata of that file. + The motivation is that a parquet file + cannot be written in a deterministic way + and the checksum is a way to track, + if the content has changed. + + """ + db = audformat.Database("mydb") + db.schemes["age"] = audformat.Scheme("int") + db["files"] = audformat.Table(audformat.filewise_index(["f1"])) + db["files"]["age"] = audformat.Column(scheme_id="age") + db["files"]["age"].set([40]) + path = audeer.path(tmpdir, "files.parquet") + db["files"].save( + audeer.replace_file_extension(path, ""), + storage_format="parquet", + ) + + yield path + + @pytest.fixture(scope="function", autouse=False) def interface(tmpdir_factory, request): r"""Create a backend with interface. diff --git a/tests/test_backend_artifactory.py b/tests/test_backend_artifactory.py index 9926ff40..728f7312 100644 --- a/tests/test_backend_artifactory.py +++ b/tests/test_backend_artifactory.py @@ -228,3 +228,23 @@ def test_open_close(host, repository): audbackend.backend.Artifactory.create(host, repository) backend.open() backend.close() + + +@pytest.mark.parametrize( + "interface", + [(audbackend.backend.Artifactory, audbackend.interface.Maven)], + indirect=True, +) +def test_parquet_file(interface, parquet_file): + """Test uploading a parquet file with hash in metadata. + + We need to make sure to hand the MD5 sum + to the deploy method of Artifactory, + not the checksum hash of the parquet file metadata. + See https://github.com/audeering/audbackend/issues/254. + + """ + dst_file = f"/{os.path.basename(parquet_file)}" + version = "1.0.0" + interface.put_file(parquet_file, dst_file, version) + assert interface.exists(dst_file, version) diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 311d6d88..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,112 +0,0 @@ -import sys - -import pyarrow -import pyarrow.parquet as parquet -import pytest - -import audeer - -import audbackend - - -@pytest.fixture -def pyarrow_installed(request): - """Simulate missing pyarrow installation. - - Args: - request: request parameter for indirect call of fixture - - """ - if not request.param: - sys.modules["pyarrow"] = None - - yield False - - del sys.modules["pyarrow"] - - else: - yield True - - -class TestChecksum: - """Test local checksum calculation.""" - - @classmethod - @pytest.fixture(autouse=True) - def setup(cls, tmpdir): - """Prepare files for tests.""" - cls.files = {} - - file = "file.txt" - cls.files[file] = audeer.path(tmpdir, file) - with open(cls.files[file], "w") as fp: - fp.write("hello\n") - - file = "file.parquet" - cls.files[file] = audeer.path(tmpdir, file) - table = pyarrow.Table.from_pylist([{"a": 0, "b": 1}]) - parquet.write_table(table, cls.files[file], compression="snappy") - - file = "file-metadata.parquet" - cls.files[file] = audeer.path(tmpdir, file) - metadata = {"hash": "my-hash"} - table = table.replace_schema_metadata(metadata) - parquet.write_table(table, cls.files[file], compression="snappy") - - file = "folder" - cls.files[file] = audeer.mkdir(tmpdir, file) - - @pytest.mark.parametrize("pyarrow_installed", [True, False], indirect=True) - @pytest.mark.parametrize( - "file", - ["file.txt", "file.parquet", "file-metadata.parquet", "folder"], - ) - def test_checksum(self, file, pyarrow_installed): - """Test checksum of local file. - - Args: - file: file name, see ``setup()`` - pyarrow_installed: if ``False, - it hides the ``pyarrow`` module - expected_checksum_function: function executed - to generate expected checksum for ``file`` - - """ - path = self.files[file] - expected_checksum = self.determine_expected_checksum(file, pyarrow_installed) - assert audbackend.checksum(path) == expected_checksum - - @pytest.mark.parametrize( - "file, error, error_msg", - [ - ("non-existing.txt", FileNotFoundError, "non-existing.txt"), - ("non-existing.parquet", FileNotFoundError, "non-existing.parquet"), - ], - ) - def test_errors(self, file, error, error_msg): - """Test expected errors. - - Args: - file: file path - error: expected error - error_msg: expected error message. - For ``FileNotFoundError``, - we recommend to use only the file name - as the rest of the error message differs under Windows - - """ - with pytest.raises(error, match=error_msg): - file = self.files.get(file, file) - audbackend.checksum(file) - - def determine_expected_checksum(self, file, pyarrow_installed): - """Expected checksum for file and pyarrow installation. - - Args: - file: file to calculate checksum for - pyarrow_installed: if ``True`` it assumes ``pyarrow`` is installed - - """ - if file == "file-metadata.parquet" and pyarrow_installed: - return "my-hash" - return audeer.md5(self.files[file])