|
""" |
|
Fast cryptographic hash of Python objects, with a special case for fast |
|
hashing of numpy arrays. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import decimal |
|
import hashlib |
|
import io |
|
import pickle |
|
import struct |
|
import sys |
|
import types |
|
|
|
Pickler = pickle._Pickler |
|
|
|
|
|
class _ConsistentSet(object):
    """Class used to ensure the hash of sets is preserved
    whatever the order of their items.
    """

    def __init__(self, set_sequence):
        # Force an order on the elements of the set so that equal sets
        # always produce the same pickle stream, hence the same hash.
        try:
            # Try to sort the elements directly, assuming they are of a
            # consistent, orderable type. This can fail (e.g. with mixed
            # types), but it is tried first because it is faster.
            self._sequence = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # Fall back to sorting the elements by their hash when they
            # are not orderable. Slower, but works in any case.
            self._sequence = sorted(hash(e) for e in set_sequence)
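
# Illustrative sketch (not from the original source): thanks to
# _ConsistentSet, two equal sets contribute identical bytes to the hash even
# when their iteration order differs (e.g. across processes with different
# string hash seeds). Using the module-level ``hash`` helper defined at the
# end of this file:
#
#     >>> hash(set(['a', 'b', 'c'])) == hash(set(['c', 'b', 'a']))
#     True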
|
|
|
|
|
class _MyHash(object):
    """Class used to hash objects that won't normally pickle"""

    def __init__(self, *args):
        self.args = args
|
|
|
|
|
class Hasher(Pickler):
    """A subclass of pickler, to do hashing rather than pickling.

    It produces a digest that identifies the given Python object by value,
    but is not meant to be cryptographically secure.
    """

    def __init__(self, hash_name="md5"):
        self.stream = io.BytesIO()
        # Use a fixed pickle protocol so that hashes stay comparable across
        # Python releases.
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash object; flag it as not used for security so
        # that md5 remains available on FIPS-enabled builds.
        self._hash = hashlib.new(hash_name, usedforsecurity=False)
|
|
|
    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
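
    # Minimal direct-use sketch (not from the original source; the public
    # entry point is the module-level ``hash`` function at the end of this
    # file). A fresh Hasher should be used per object, since the stream and
    # pickle memo are stateful:
    #
    #     >>> digest = Hasher(hash_name='sha1').hash(('some', 'picklable', 1))
    #     >>> isinstance(digest, str), len(digest)
    #     (True, 40)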
|
|
|
    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # Bound methods (including built-in bound methods such as an
            # instance's dict.pop) do not pickle; decompose them into
            # components that identify them uniquely.
            if hasattr(obj, "__func__"):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # Method bound to a module: identify the module by name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # No bound instance: hash the bare name.
                obj = _MyHash(func_name, inst)
            else:
                # Regular bound method: include the instance and its class.
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)
|
|
|
    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference:
        # two equal strings should contribute the same bytes to the stream
        # even when they are distinct objects, so memoization is disabled
        # for bytes and str.
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)
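
    # Sketch of why string memoization is disabled (example adapted from the
    # rationale above; uses the module-level ``hash`` helper defined at the
    # end of this file):
    #
    #     >>> hash(['aa', 'aa']) == hash(['aa', 'aaZ'[:2]])
    #     True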
|
|
|
|
|
|
|
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected into
        # __main__.
        kwargs = dict(name=name, pack=pack)
        # Python 3's save_global does not accept the 'pack' argument.
        del kwargs["pack"]
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == "__main__":
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython does not inject interactively-defined
                    # variables into __main__: do it ourselves so that the
                    # object can be found by name.
                    setattr(mod, my_name, obj)
|
|
|
    # Copy the parent dispatch table and route builtins, types, classes and
    # functions through save_global, so they are hashed by qualified name.
    dispatch = Pickler.dispatch.copy()
    # builtin functions (e.g. len)
    dispatch[type(len)] = save_global
    # types
    dispatch[type(object)] = save_global
    # classes
    dispatch[type(Pickler)] = save_global
    # Python functions
    dispatch[type(pickle.dump)] = save_global
|
|
|
|
|
|
|
    def _batch_setitems(self, items, *args):
        # Force the order of keys in dicts to ensure a consistent hash.
        # The *args passthrough keeps compatibility with Python versions
        # where _batch_setitems takes extra positional arguments.
        try:
            # Try sorting the items directly, assuming the keys are of a
            # consistent, orderable type. This can fail when keys are not
            # comparable, but it is tried first because it is faster.
            Pickler._batch_setitems(self, iter(sorted(items)), *args)
        except TypeError:
            # If keys are unorderable, sort them by their hash. Slower, but
            # works in any case.
            Pickler._batch_setitems(
                self, iter(sorted((hash(k), v) for k, v in items)), *args
            )
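
    # Sketch (not from the original source): with keys forced into a sorted
    # order, equal dicts hash identically regardless of insertion order.
    # Using the module-level ``hash`` helper defined at the end of this file:
    #
    #     >>> hash({'a': 1, 'b': 2}) == hash({'b': 2, 'a': 1})
    #     True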
|
|
|
    def save_set(self, set_items):
        # Force the order of items in sets to ensure a consistent hash.
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set
|
|
|
|
|
class NumpyHasher(Hasher):
    """Special case the hasher for when numpy is loaded."""

    def __init__(self, hash_name="md5", coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
            objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # Import numpy locally: this hasher is only instantiated when numpy
        # is already loaded.
        import numpy as np

        self.np = np
        if hasattr(np, "getbuffer"):
            # Old numpy versions expose a dedicated buffer accessor.
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview
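
    # Hypothetical sketch of what ``coerce_mmap`` buys (requires numpy; the
    # temporary file name is illustrative only; uses the module-level
    # ``hash`` helper defined at the end of this file):
    #
    #     >>> import numpy as np, tempfile, os
    #     >>> fname = os.path.join(tempfile.mkdtemp(), 'buf.dat')
    #     >>> np.arange(5, dtype=np.int64).tofile(fname)
    #     >>> m = np.memmap(fname, dtype=np.int64, shape=(5,), mode='r')
    #     >>> hash(m, coerce_mmap=True) == hash(np.arange(5, dtype=np.int64),
    #     ...                                   coerce_mmap=True)
    #     True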
|
|
|
    def save(self, obj):
        """Subclass the save method, to hash ndarray subclasses rather
        than pickling them. Of course, this is a total abuse of
        the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Hash the raw data buffer of the array directly, so that the
            # cost does not depend on pickling the data. The hash update
            # requires a contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as
                # bytes raises a ValueError.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                obj_c_contiguous = obj.T
            else:
                # Cater for non-contiguous arrays: flattening creates a
                # copy, whose buffer can be read in a single segment.
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes (e.g. datetime64);
            # viewing the array as uint8 gives a plain byte buffer in all
            # cases.
            self._hash.update(self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # Also hash the class and metadata of the array (dtype, shape,
            # strides), so that arrays with the same binary content but
            # different layouts or types do not collide.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # Use the ndarray class so that np.memmap and np.ndarray
                # objects with the same content hash identically.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # The "HASHED" marker records that the data itself went through
            # the raw-buffer path above; the metadata tuple is then pickled
            # by the regular machinery at the end of this method.
            obj = (klass, ("HASHED", obj.dtype, obj.shape, obj.strides))
|
        elif isinstance(obj, self.np.dtype):
            # Hashing numpy.dtype objects through the pickler is fragile:
            # atomic dtypes are interned (np.dtype('f8') is np.dtype('f8')),
            # which interacts badly with pickler memoization. Bypass the
            # pickler and feed a fixed marker plus the pickled dtype
            # directly into the running hash instead.
            self._hash.update("_HASHED_DTYPE".encode("utf-8"))
            self._hash.update(pickle.dumps(obj))
            return
        Hasher.save(self, obj)
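
    # Behaviour sketch (not from the original source; requires numpy and the
    # module-level ``hash`` helper defined at the end of this file):
    #
    #     >>> import numpy as np
    #     >>> a = np.arange(10)
    #     >>> hash(a) == hash(a.copy())               # same bytes, dtype, shape
    #     True
    #     >>> hash(a) == hash(a.astype(np.float64))   # dtype participates
    #     False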
|
|
|
|
|
def hash(obj, hash_name="md5", coerce_mmap=False): |
|
"""Quick calculation of a hash to identify uniquely Python objects |
|
containing numpy arrays. |
|
|
|
Parameters |
|
---------- |
|
hash_name: 'md5' or 'sha1' |
|
Hashing algorithm used. sha1 is supposedly safer, but md5 is |
|
faster. |
|
coerce_mmap: boolean |
|
Make no difference between np.memmap and np.ndarray |
|
""" |
|
valid_hash_names = ("md5", "sha1") |
|
if hash_name not in valid_hash_names: |
|
raise ValueError( |
|
"Valid options for 'hash_name' are {}. Got hash_name={!r} instead.".format( |
|
valid_hash_names, hash_name |
|
) |
|
) |
|
if "numpy" in sys.modules: |
|
hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) |
|
else: |
|
hasher = Hasher(hash_name=hash_name) |
|
return hasher.hash(obj) |
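
# Usage sketch for the helper above (digest values depend on the pickle
# protocol and are not reproduced here):
#
#     >>> hash({'x': (1, 2), 'y': [3.0]}) == hash({'y': [3.0], 'x': (1, 2)})
#     True
#     >>> hash([1, 2, 3], hash_name='sha1')  # doctest: +SKIP
#     '...'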
|
|