""" | |
Fast cryptographic hash of Python objects, with a special case for fast | |
hashing of numpy arrays. | |
""" | |
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> | |
# Copyright (c) 2009 Gael Varoquaux | |
# License: BSD Style, 3 clauses. | |

import pickle
import hashlib
import sys
import types
import struct
import io
import decimal

Pickler = pickle._Pickler


class _ConsistentSet(object):
    """ Class used to ensure the hash of sets is preserved
        whatever the order of their items.
    """
    def __init__(self, set_sequence):
        # Force the order of elements in the set to ensure a consistent hash.
        try:
            # Try first to sort the set, assuming the type of its elements
            # is consistent and orderable. This fails on Python 3 when
            # elements are unorderable, but we keep it in a try block as it
            # is faster.
            self._sequence = sorted(set_sequence)
        except (TypeError, decimal.InvalidOperation):
            # If elements are unorderable, sort them by their hash. This is
            # slower but works in any case.
            self._sequence = sorted(hash(e) for e in set_sequence)
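
# Illustrative note: set iteration order is not stable across interpreter
# runs (string hashing is randomized via PYTHONHASHSEED), so pickling a raw
# set would not yield a reproducible byte stream. Sorting first does, e.g.:
#
#     _ConsistentSet({'b', 'a', 'c'})._sequence == ['a', 'b', 'c']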


class _MyHash(object):
    """ Class used to hash objects that won't normally pickle. """

    def __init__(self, *args):
        self.args = args


class Hasher(Pickler):
    """ A subclass of pickler to do cryptographic hashing, rather than
        pickling.
    """
    def __init__(self, hash_name='md5'):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major Python version, not with the minor one.
        protocol = 3
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash object.
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
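
    # Minimal usage sketch: pickling of basic types is deterministic within
    # a process, so two fresh Hasher instances fed equal objects yield
    # equal digests:
    #
    #     Hasher().hash([1, 2, 3]) == Hasher().hash([1, 2, 3])  # True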

    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # The Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable.
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) is type(pickle):
                # A method bound to a module: identify it by the module name.
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle.
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example, we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] to hash
        # to the same value; that's why we disable memoization for strings.
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    # The dispatch table of the pickler is not accessible in Python 3;
    # as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__.
        kwargs = dict(name=name, pack=pack)
        del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables defined
                    # interactively in __main__.
                    setattr(mod, my_name, obj)
            # Retry now that the object is reachable from its module.
            Pickler.save_global(self, obj, **kwargs)
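
    # Hypothetical scenario this caters for: a function defined at an
    # IPython prompt reports ``__module__ == '__main__'`` but may not be an
    # attribute of ``sys.modules['__main__']``, which makes the first
    # ``save_global`` call fail; injecting it lets the retry succeed.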

    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # Force the order of keys in dicts to ensure a consistent hash.
        try:
            # Try first to compare the dict keys, assuming their type is
            # consistent and orderable. This fails on Python 3 when keys
            # are unorderable, but we keep it in a try block as it is
            # faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sort them by their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))

    def save_set(self, set_items):
        # Force the order of items in sets to ensure a consistent hash.
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set
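
    # How these overrides take effect (sketch): ``pickle._Pickler.save``
    # looks up ``type(obj)`` in ``self.dispatch`` and calls the registered
    # method, so e.g. ``Hasher().hash({1, 2})`` is routed to ``save_set``
    # above rather than to the stock set pickling code.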


class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded.
    """
    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used.
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # Delayed import of numpy, to avoid tight coupling.
        import numpy as np
        self.np = np
        if hasattr(np, 'getbuffer'):
            # Legacy (Python 2 era) numpy exposes np.getbuffer.
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview
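
    # Illustrative check of the buffer getter (assumes a modern numpy on
    # Python 3, where ndarrays expose the buffer protocol):
    #
    #     import hashlib, numpy as np
    #     hashlib.md5(memoryview(np.arange(3, dtype='u1'))).hexdigest()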

    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclasses rather
            than pickling them. Of course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object.
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as
                # bytes raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this.
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64,
            # see https://github.com/numpy/numpy/issues/4983. The workaround
            # is to view the array as bytes before taking the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between objects
            # with the same binary content, but different classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and normal
                # ndarrays, to be able to reload previously computed
                # results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also hash the dtype, shape and strides, to distinguish
            # different views on the same data. The tuple below is pickled
            # by the underlying pickler and hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # numpy.dtype consistent hashing is tricky to get right. This
            # comes from the fact that atomic np.dtype objects are interned:
            # ``np.dtype('f4') is np.dtype('f4')``. The situation is
            # complicated by the fact that this interning does not survive a
            # simple pickle.dump/load roundtrip:
            # ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
            # np.dtype('f4')``. Because pickle relies on memoization during
            # pickling, it is easy to produce different hashes for seemingly
            # identical objects, such as ``[np.dtype('f4'), np.dtype('f4')]``
            # and ``[np.dtype('f4'),
            # pickle.loads(pickle.dumps(np.dtype('f4')))]``.
            # To prevent memoization from interfering with hashing, we
            # isolate the serialization (and thus the pickle memoization) of
            # each dtype using a fresh ``pickle.dumps`` call each time,
            # unrelated to the current Hasher instance.
            self._hash.update("_HASHED_DTYPE".encode('utf-8'))
            self._hash.update(pickle.dumps(obj))
            return
        Hasher.save(self, obj)
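
    # Behavior sketch (assumes numpy is importable): identical content
    # hashes identically, while a different-dtype view over the same bytes
    # does not, because the dtype is hashed alongside the data:
    #
    #     import numpy as np
    #     a = np.arange(4, dtype='i8')
    #     NumpyHasher().hash(a) == NumpyHasher().hash(a.copy())      # True
    #     NumpyHasher().hash(a) == NumpyHasher().hash(a.view('f8'))  # False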


def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to uniquely identify Python objects
        containing numpy arrays.

        Parameters
        ----------
        obj: object
            The object to hash.
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray.
    """
    valid_hash_names = ('md5', 'sha1')
    if hash_name not in valid_hash_names:
        raise ValueError("Valid options for 'hash_name' are {}. "
                         "Got hash_name={!r} instead."
                         .format(valid_hash_names, hash_name))
    if 'numpy' in sys.modules:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    else:
        hasher = Hasher(hash_name=hash_name)
    return hasher.hash(obj)
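

if __name__ == '__main__':
    # Minimal usage sketch of the public entry point: hashing is
    # value-based, so equal values hash equal regardless of object
    # identity or of dict/set ordering.
    print(hash(['aa', 'aa']) == hash(['aa', 'aaZ'[:2]]))     # True
    print(hash({'a': 1, 'b': 2}) == hash({'b': 2, 'a': 1}))  # True
    print(hash((1, 2.0, 'three'), hash_name='sha1'))         # a hex digest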