Spaces:
Sleeping
Sleeping
"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes.""" | |
from typing import BinaryIO, Optional | |
from .insecure_hashlib import sha1, sha256 | |
def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Hash the content of a binary file object with sha256, reading it piece by piece.

    The stream is consumed from its current position until `read` returns an empty
    bytes object, so the whole content never has to be held in memory at once.

    Args:
        fileobj (file-like object):
            Binary stream to hash, typically the result of `open(path, "rb")`.
        chunk_size (`int`, *optional*):
            How many bytes to request from `fileobj` per read. Defaults to 1MB
            (`1024 * 1024`) when left to `None`.

    Returns:
        `bytes`: the raw (non-hexadecimal) sha256 digest of the data that was read.
    """
    if chunk_size is None:
        chunk_size = 1024 * 1024

    hasher = sha256()
    # Two-argument `iter` keeps calling `read` until it yields the empty-bytes sentinel (EOF).
    for block in iter(lambda: fileobj.read(chunk_size), b""):
        hasher.update(block)
    return hasher.digest()
def git_hash(data: bytes) -> str:
    """
    Return the git blob hash (sha1) of `data`, computed the same way git does.

    The result matches what `git hash-object` would print for a file holding `data`.
    See https://git-scm.com/docs/git-hash-object for more details.

    Note: this only applies to regular files. For an LFS file, the real git hash is meant to be computed
    on the pointer file content rather than on the file content itself. For simplicity, LFS files are
    compared through the sha256 of their content instead.

    Args:
        data (`bytes`):
            The payload to hash.

    Returns:
        `str`: the git hash of `data`, as a hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # Algorithm taken from https://gist.github.com/msabramo/763200
    # The payload is hashed in a single pass: inputs are not expected to be huge (5MB maximum),
    # so chunked reading would bring nothing here.
    # Git prefixes the content with a "blob <size>\0" header before hashing.
    header = b"blob %d\0" % len(data)

    hasher = sha1()
    hasher.update(header)
    hasher.update(data)
    return hasher.hexdigest()