Skip to content

Commit

Permalink
Allow using {filename}.sha256 instead of recalculating a hash
Browse files Browse the repository at this point in the history
  • Loading branch information
TyberiusPrime committed Nov 21, 2024
1 parent 41a2e2f commit 5352c72
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 29 deletions.
11 changes: 8 additions & 3 deletions docs/content/docs/concepts/tracking_hashes/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@

For each and every job, ppg2 tracks multiple hashes.


* The output hash (what did this job produce last time).
* Input hashes for each incoming job.
* A hash on the job_ids of the input jobs, which allows detecting whether inputs have been added or removed


Depending on the job type, the hashes are different kinds of 'enhanced hashes', not just hexdigests,
allowing efficient comparison and recalculation of hashes.
Depending on the job type, the hashes are different kinds of 'enhanced hashes',
not just hexdigests, allowing efficient comparison and recalculation of hashes.

For files and bytes (as in Python `bytes` objects), we use xxh3_128.

Expand All @@ -21,6 +20,12 @@ Jobs that return python objects, such as [AttributeLoadingJobs](../../jobs/attri
FunctionInvariants store the byte-code keyed on python version as well as the source code. They are therefore
capable of ignoring no-op changes like comments or docstrings.

At times, it's much cheaper to have a tool calculate a hash while it's reading
the file anyway. For that purpose, if there is a {filename}.sha256 file next to
the file, we use that hash instead of recalculating it. The .sha256 file must be
at least as new as the file itself (same or later mtime), and it must contain a
64-character hexdigest. If either condition is violated, a JobContractError is
raised.



72 changes: 46 additions & 26 deletions python/pypipegraph2/hashers.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,57 @@
from pathlib import Path
from .exceptions import JobContractError
from xxhash import xxh3_128


def hash_file(path: Path):
    """Return hash, mtime and size for the file at ``path``.

    If a sidecar file named ``{filename}.sha256`` exists next to ``path``,
    its content is reused as the hash instead of reading and hashing the
    file. Otherwise the file content is hashed with xxh3_128.

    The sidecar must be at least as new as the file itself (same or later
    mtime) and must contain exactly one 64 character hexdigest
    (surrounding whitespace, such as a trailing newline, is ignored).

    Parameters
    ----------
    path:
        The file to hash.

    Returns
    -------
    dict
        ``{"hash": <hexdigest str>, "mtime": <int seconds>, "size": <bytes>}``
        where mtime and size are taken from ``path`` itself.

    Raises
    ------
    JobContractError
        If the sidecar is older than the file, or does not contain a
        64 character hexadecimal string.
    FileNotFoundError
        If ``path`` itself does not exist.
    """
    # stat the file outside of the try block: a missing *file* must propagate,
    # only a missing *sidecar* triggers the fallback below.
    stat = path.stat()
    sha256_path = path.with_name(path.name + ".sha256")
    try:
        sha256_stat = sha256_path.stat()
        if sha256_stat.st_mtime < stat.st_mtime:
            raise JobContractError(
                f"Found {sha256_path} but it's older than {path}. Have the generating code set correct mtimes, or not produce the file"
            )
        the_hash = sha256_path.read_text().strip()
        if len(the_hash) != 64:
            raise JobContractError(
                f"Expected a 64 character hash in {sha256_path}, but got {the_hash} with len {len(the_hash)}"
            )
        if not all(char in "0123456789abcdefABCDEF" for char in the_hash):
            raise JobContractError(
                f"Expected a 64 character hash in {sha256_path}, but got {the_hash} with invalid characters (not 0-9a-fA-F)"
            )
    except FileNotFoundError:
        # No sidecar available: calculate the hash from the file content.
        the_hash = _xxh3_128_of_file(path)

    return {
        "hash": the_hash,
        "mtime": int(stat.st_mtime),
        "size": stat.st_size,
    }


def _xxh3_128_of_file(path: Path) -> str:
    """Return the xxh3_128 hexdigest of the content of ``path``."""
    # I profiled a bunch of hash functions
    # and xx3 and spooky were the fastest 128bit hashers
    # (we want 128 bit to prevent collisions).
    # single core, spooky seemed a bit faster
    # but the xxhash implementation releases the gil
    # when passed more than 100kb (otherwise it's
    # faster *not* to acquire the lock!)
    hasher = xxh3_128()
    # we are not acquiring the core lock here.
    # why? because this is essentially always
    # limited by the read-bandwidth, not the
    # cpu.
    # (even on a very fast Samsung EVO equivalent SSD
    # (about 3gb/s), doing it from memory is 4 times faster)
    # so we shouldn't be stalling everything else much
    # (except for memory bandwidth. oh well, at least
    # it should not go into swap with the tiny buffer we use here)
    with open(path, "rb") as op:
        block = op.read(1024 * 512)
        while block:
            hasher.update(block)
            block = op.read(1024 * 512)
    return hasher.hexdigest()
Expand Down
27 changes: 27 additions & 0 deletions tests/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,3 +696,30 @@ def bad(files):
ppg.run()
error = ppg.global_pipegraph.last_run_result[tmfg.job_id].error
assert "changed output" in error and "ephemeral" in error.lower()


@pytest.mark.usefixtures("ppg2_per_test", "create_out_dir")
class TestHashersWithPremadeSha256:
    """Tests for hash_file's handling of a premade ``{filename}.sha256`` file."""

    def test_reads_dot_sha256(self):
        a = Path("a.txt")
        asha256 = Path("a.txt.sha256")
        a.write_text("hello")
        asha256.write_text("a" * 64)
        # a valid .sha256 that is at least as new as the file is used verbatim
        h = ppg.hashers.hash_file(a)
        assert h["hash"] == "a" * 64
        mtime = a.stat().st_mtime
        # now push sha256 into the past
        os.utime(asha256, (mtime - 100, mtime - 100))
        with pytest.raises(ppg.JobContractError):
            ppg.hashers.hash_file(a)
        # without the .sha256 the hash is calculated from the file content,
        # so it must no longer be the value the .sha256 used to supply
        asha256.unlink()
        h = ppg.hashers.hash_file(a)
        assert h["hash"] != "a" * 64

        # or have a non hash in there.
        asha256.write_text("a")  # too short
        with pytest.raises(ppg.JobContractError):
            ppg.hashers.hash_file(a)
        asha256.write_text("a" * 63 + "X")  # right length, non-hex character
        with pytest.raises(ppg.JobContractError):
            ppg.hashers.hash_file(a)

0 comments on commit 5352c72

Please sign in to comment.