|
""" |
|
Test the hashing module. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import collections |
|
import gc |
|
import hashlib |
|
import io |
|
import itertools |
|
import pickle |
|
import random |
|
import sys |
|
import time |
|
from concurrent.futures import ProcessPoolExecutor |
|
from decimal import Decimal |
|
|
|
from joblib.func_inspect import filter_args |
|
from joblib.hashing import hash |
|
from joblib.memory import Memory |
|
from joblib.test.common import np, with_numpy |
|
from joblib.testing import fixture, parametrize, raises, skipif |
|
|
|
|
|
def unicode(s):
    """Identity shim kept from the Python 2 era: return ``s`` unchanged."""
    return s
|
|
|
|
|
|
|
|
|
def time_func(func, *args):
    """Return the best-of-three wall-clock time of ``func(*args)``.

    Uses ``time.perf_counter`` rather than ``time.time``: the former is
    monotonic and has the highest available resolution, which is what
    PEP 418 recommends for benchmarking short durations.

    Parameters
    ----------
    func : callable
        Function to time.
    *args
        Positional arguments forwarded to ``func``.

    Returns
    -------
    float
        The minimum of three timed runs, in seconds.
    """
    timings = []
    for _ in range(3):
        start = time.perf_counter()
        func(*args)
        timings.append(time.perf_counter() - start)
    # The minimum is the least noisy estimate of the true cost.
    return min(timings)
|
|
|
|
|
def relative_time(func1, func2, *args):
    """Return the normalized timing gap between ``func1`` and ``func2``.

    Both callables are timed on the same ``*args``; the result is
    ``0.5 * |t1 - t2| / (t1 + t2)``, i.e. a value in ``[0, 0.5)`` where 0
    means identical timings.
    """
    t1 = time_func(func1, *args)
    t2 = time_func(func2, *args)
    return 0.5 * (abs(t1 - t2) / (t1 + t2))
|
|
|
|
|
class Klass(object):
    """Minimal class used to exercise hashing of bound methods."""

    def f(self, x):
        """Return ``x`` unchanged."""
        return x
|
|
|
|
|
class KlassWithCachedMethod(object):
    """Helper whose ``f`` method is wrapped by a joblib ``Memory`` cache."""

    def __init__(self, cachedir):
        # Replace the bound method with its memoized counterpart.
        memory = Memory(location=cachedir)
        self.f = memory.cache(self.f)

    def f(self, x):
        """Return ``x`` unchanged (cached through ``Memory``)."""
        return x
|
|
|
|
|
|
|
|
|
|
|
# Inputs covering the basic Python types — numbers, strings, containers,
# callables — used to smoke-test hashing of arbitrary objects.
input_list = [
    1,
    2,
    1.0,
    2.0,
    1 + 1j,
    2.0 + 1j,
    "a",
    "b",
    (1,),
    (1, 1),
    [1],
    [1, 1],
    {1: 1},
    {1: 2},
    {2: 1},
    None,
    gc.collect,
    [1].append,
    # Sets and dicts mixing key types.
    {"a", 1},
    {"a", 1, ("a", 1)},
    {"a": 1, 1: 2},
    {"a": 1, 1: 2, "d": {"a": 1}},
]
|
|
|
|
|
@parametrize("obj1", input_list)
@parametrize("obj2", input_list)
def test_trivial_hash(obj1, obj2):
    """Smoke test hash on various types.

    Hashes of two inputs must agree exactly when the inputs are the very
    same object.
    """
    assert (hash(obj1) == hash(obj2)) == (obj1 is obj2)
|
|
|
|
|
def test_hash_methods():
    """Bound methods hash deterministically and reflect instance state."""
    stream = io.StringIO(unicode("a"))
    # Same bound method, hashed twice: must be stable.
    assert hash(stream.flush) == hash(stream.flush)
    deque_long = collections.deque(range(10))
    deque_short = collections.deque(range(9))
    # Methods bound to instances with different content must differ.
    assert hash(deque_long.extend) != hash(deque_short.extend)
|
|
|
|
|
@fixture(scope="function")
@with_numpy
def three_np_arrays():
    """Fixture: two equal random arrays plus a third differing in row 0."""
    rng = np.random.RandomState(0)
    first = rng.random_sample((10, 10))
    second = first.copy()
    third = second.copy()
    third[0] += 1
    return first, second, third
|
|
|
|
|
def test_hash_numpy_arrays(three_np_arrays):
    """Array hashes agree exactly when the array contents agree."""
    arr1, arr2, arr3 = three_np_arrays

    for left, right in itertools.product(three_np_arrays, repeat=2):
        hashes_match = hash(left) == hash(right)
        contents_match = np.all(left == right)
        assert hashes_match == contents_match

    # The transpose shares the data buffer but has a different layout,
    # which must yield a distinct hash.
    assert hash(arr1) != hash(arr1.T)
|
|
|
|
|
def test_hash_numpy_dict_of_arrays(three_np_arrays):
    """Dict hashing depends on array values, not on array identity."""
    arr1, arr2, arr3 = three_np_arrays

    original = {1: arr1, 2: arr2}
    swapped = {1: arr2, 2: arr1}
    modified = {1: arr2, 2: arr3}

    # arr1 and arr2 hold equal data, so swapping them changes nothing.
    assert hash(original) == hash(swapped)
    assert hash(original) != hash(modified)
|
|
|
|
|
@with_numpy
@parametrize("dtype", ["datetime64[s]", "timedelta64[D]"])
def test_numpy_datetime_array(dtype):
    """datetime/timedelta arrays must not collide with integer arrays."""
    reference_hash = hash(np.arange(10))
    temporal = np.arange(0, 10, dtype=dtype)
    assert hash(temporal) != reference_hash
|
|
|
|
|
@with_numpy
def test_hash_numpy_noncontiguous():
    """A non-contiguous slice hashes differently from contiguous copies."""
    sliced = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order="F")[:, :1, :]

    c_copy = np.ascontiguousarray(sliced)
    assert hash(sliced) != hash(c_copy)

    f_copy = np.asfortranarray(sliced)
    assert hash(sliced) != hash(f_copy)
|
|
|
|
|
@with_numpy
@parametrize("coerce_mmap", [True, False])
def test_hash_memmap(tmpdir, coerce_mmap):
    """Check that memmap and arrays hash identically if coerce_mmap is True."""
    # Temporary file backing the memmap.
    filename = tmpdir.join("memmap_temp").strpath
    try:
        m = np.memmap(filename, shape=(10, 10), mode="w+")
        # Plain ndarray view over the same buffer as the memmap.
        a = np.asarray(m)
        # With coerce_mmap=True the two must hash the same; with False
        # they must differ — hence the equality with coerce_mmap below.
        are_hashes_equal = hash(a, coerce_mmap=coerce_mmap) == hash(
            m, coerce_mmap=coerce_mmap
        )
        assert are_hashes_equal == coerce_mmap
    finally:
        # Guard with locals(): np.memmap may have raised before binding m.
        if "m" in locals():
            del m
            # Force a collection so the memmap's file handle is released
            # promptly (NOTE(review): presumably needed on Windows, where
            # an open handle would block removing the temp file — confirm).
            gc.collect()
|
|
|
|
|
@with_numpy
@skipif(
    sys.platform == "win32",
    reason="This test is not stable under windows for some reason",
)
def test_hash_numpy_performance():
    """hash() on a large array should cost about as much as a raw md5 pass.

    Reference timings (older hardware, 1e6 doubles)::

        hashlib.md5(a).hexdigest()                         ~ 20.7 ms
        hashlib.md5(pickle.dumps(a, protocol=2)).hexdigest() ~ 73.1 ms
        hash(a)                                            ~ 20.8 ms
    """
    rng = np.random.RandomState(0)
    data = rng.random_sample(1000000)

    def md5_hash(x):
        # Digest the raw buffer without any copy.
        return hashlib.md5(memoryview(x)).hexdigest()

    # hash(data) and a direct md5 digest should take comparable time.
    assert relative_time(md5_hash, hash, data) < 0.3

    # Hashing a tuple of three arrays should cost about three md5 passes.
    baseline = 3 * time_func(md5_hash, data)
    tuple_time = time_func(hash, (data, data, data))
    normalized_gap = 0.5 * (abs(tuple_time - baseline) / (tuple_time + baseline))
    assert normalized_gap < 0.3
|
|
|
|
|
def test_bound_methods_hash():
    """Make sure that calling the same method on two different instances
    of the same class does resolve to the same hashes.
    """
    first = Klass()
    second = Klass()
    hash_first = hash(filter_args(first.f, [], (1,)))
    hash_second = hash(filter_args(second.f, [], (1,)))
    assert hash_first == hash_second
|
|
|
|
|
def test_bound_cached_methods_hash(tmpdir):
    """Make sure that calling the same _cached_ method on two different
    instances of the same class does resolve to the same hashes.
    """
    first = KlassWithCachedMethod(tmpdir.strpath)
    second = KlassWithCachedMethod(tmpdir.strpath)
    hash_first = hash(filter_args(first.f.func, [], (1,)))
    hash_second = hash(filter_args(second.f.func, [], (1,)))
    assert hash_first == hash_second
|
|
|
|
|
@with_numpy
def test_hash_object_dtype():
    """Make sure that ndarrays with dtype `object' hash correctly."""
    # Two independently-built but equal object arrays of ragged sub-arrays.
    first = np.array([np.arange(i) for i in range(6)], dtype=object)
    second = np.array([np.arange(i) for i in range(6)], dtype=object)
    assert hash(first) == hash(second)
|
|
|
|
|
@with_numpy
def test_numpy_scalar():
    """Distinct numpy scalar values must hash differently."""
    two = np.float64(2.0)
    three = np.float64(3.0)
    assert hash(two) != hash(three)
|
|
|
|
|
def test_dict_hash(tmpdir):
    """A large dict keeps a stable hash across a cached-method round-trip."""
    klass = KlassWithCachedMethod(tmpdir.strpath)

    # Many string keys, each mapped to a fresh one-element list.
    payload = {
        name: [33]
        for name in (
            "#s12069__c_maps.nii.gz",
            "#s12158__c_maps.nii.gz",
            "#s12258__c_maps.nii.gz",
            "#s12277__c_maps.nii.gz",
            "#s12300__c_maps.nii.gz",
            "#s12401__c_maps.nii.gz",
            "#s12430__c_maps.nii.gz",
            "#s13817__c_maps.nii.gz",
            "#s13903__c_maps.nii.gz",
            "#s13916__c_maps.nii.gz",
            "#s13981__c_maps.nii.gz",
            "#s13982__c_maps.nii.gz",
            "#s13983__c_maps.nii.gz",
        )
    }

    first = klass.f(payload)
    second = klass.f(first)

    assert hash(first) == hash(second)
|
|
|
|
|
def test_set_hash(tmpdir):
    """A large set keeps a stable hash across a cached-method round-trip."""
    klass = KlassWithCachedMethod(tmpdir.strpath)

    payload = {
        "#s12069__c_maps.nii.gz",
        "#s12158__c_maps.nii.gz",
        "#s12258__c_maps.nii.gz",
        "#s12277__c_maps.nii.gz",
        "#s12300__c_maps.nii.gz",
        "#s12401__c_maps.nii.gz",
        "#s12430__c_maps.nii.gz",
        "#s13817__c_maps.nii.gz",
        "#s13903__c_maps.nii.gz",
        "#s13916__c_maps.nii.gz",
        "#s13981__c_maps.nii.gz",
        "#s13982__c_maps.nii.gz",
        "#s13983__c_maps.nii.gz",
    }

    first = klass.f(payload)
    second = klass.f(first)

    assert hash(first) == hash(second)
|
|
|
|
|
def test_set_decimal_hash():
    """Set hashing must be insensitive to insertion order, even with NaN."""
    forwards = hash({Decimal(0), Decimal("NaN")})
    backwards = hash({Decimal("NaN"), Decimal(0)})
    assert forwards == backwards
|
|
|
|
|
def test_string():
    """Equal strings hash identically whether shared, copied or unpickled."""
    key = "foo"
    shared_a = {key: "bar"}
    shared_b = {key: "bar"}
    # Pickling breaks string interning; the hash must not care.
    round_tripped = pickle.loads(pickle.dumps(shared_b))
    assert hash([shared_a, shared_b]) == hash([shared_a, round_tripped])
|
|
|
|
|
@with_numpy
def test_numpy_dtype_pickling():
    """dtype hashes must be stable across pickle round-trips.

    Simple dtypes are cached by numpy (same object), round-tripped ones
    are fresh objects; both cases must hash identically, alone or inside
    containers.
    """
    simple_a = np.dtype("f4")
    simple_b = np.dtype("f4")

    # numpy hands back the very same object for simple dtypes.
    assert simple_a is simple_b
    assert hash(simple_a) == hash(simple_b)

    simple_copy = pickle.loads(pickle.dumps(simple_a))
    assert simple_a is not simple_copy
    assert hash(simple_a) == hash(simple_copy)

    assert hash([simple_a, simple_a]) == hash([simple_copy, simple_copy])
    assert hash([simple_a, simple_a]) == hash([simple_a, simple_copy])

    structured_a = np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,))])
    structured_b = np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,))])

    assert hash(structured_a) == hash(structured_b)

    structured_copy = pickle.loads(pickle.dumps(structured_a))
    assert structured_copy is not structured_a
    assert hash(structured_a) == hash(structured_copy)

    assert hash([structured_a, structured_a]) == hash(
        [structured_copy, structured_copy]
    )
    assert hash([structured_a, structured_a]) == hash(
        [structured_copy, structured_a]
    )
|
|
|
|
|
@parametrize(
    "to_hash,expected",
    [
        ("This is a string to hash", "71b3f47df22cb19431d85d92d0b230b2"),
        ("C'est l\xe9t\xe9", "2d8d189e9b2b0b2e384d93c868c0e576"),
        ((123456, 54321, -98765), "e205227dd82250871fa25aa0ec690aa3"),
        (
            [random.Random(42).random() for _ in range(5)],
            "a11ffad81f9682a7d901e6edc3d16c84",
        ),
        ({"abcde": 123, "sadfas": [-9999, 2, 3]}, "aeda150553d4bb5c69f0e69d51b0e2ef"),
    ],
)
def test_hashes_stay_the_same(to_hash, expected):
    """Pin known digests to guard against silent changes in the hashing
    scheme between joblib versions.
    """
    computed = hash(to_hash)
    assert computed == expected
|
|
|
|
|
@with_numpy
def test_hashes_are_different_between_c_and_fortran_contiguous_arrays():
    """Memory layout (C versus Fortran order) must be part of the hash."""
    rng = np.random.RandomState(0)
    c_ordered = rng.random_sample((10, 10))
    f_ordered = np.asfortranarray(c_ordered)
    assert hash(c_ordered) != hash(f_ordered)
|
|
|
|
|
@with_numpy
def test_0d_array():
    """Smoke test: hashing a 0-dimensional array must not raise."""
    zero_dim = np.array(0)
    hash(zero_dim)
|
|
|
|
|
@with_numpy
def test_0d_and_1d_array_hashing_is_different():
    """A scalar array and a length-one vector must hash differently."""
    scalar_like = np.array(0)
    vector = np.array([0])
    assert hash(scalar_like) != hash(vector)
|
|
|
|
|
@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
    """Hashes of numpy-bearing objects must agree across processes.

    Two identically-seeded copies of each object are hashed in two
    separate single-worker process pools; all hashes must coincide.
    """

    def create_objects_to_hash():
        # Fixed seed so both calls build element-wise equal objects.
        rng = np.random.RandomState(42)
        # A mix of containers holding little-endian int64/float32 arrays,
        # plus C-ordered, Fortran-ordered and non-contiguous layouts.
        to_hash_list = [
            rng.randint(-1000, high=1000, size=50).astype("<i8"),
            tuple(rng.randn(3).astype("<f4") for _ in range(5)),
            [rng.randn(3).astype("<f4") for _ in range(5)],
            {
                -3333: rng.randn(3, 5).astype("<f4"),
                0: [
                    rng.randint(10, size=20).astype("<i8"),
                    rng.randn(10).astype("<f4"),
                ],
            },
            np.arange(100, dtype="<i8").reshape((10, 10)),
            np.asfortranarray(np.arange(100, dtype="<i8").reshape((10, 10))),
            np.arange(100, dtype="<i8").reshape((10, 10))[:, :2],
        ]
        return to_hash_list

    to_hash_list_one = create_objects_to_hash()
    to_hash_list_two = create_objects_to_hash()

    # Two independent pools so the two hashes of obj_1 come from two
    # distinct worker processes.
    e1 = ProcessPoolExecutor(max_workers=1)
    e2 = ProcessPoolExecutor(max_workers=1)

    try:
        for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
            # Same object hashed in two different processes.
            hash_1 = e1.submit(hash, obj_1).result()
            hash_2 = e2.submit(hash, obj_1).result()
            assert hash_1 == hash_2

            # An equal-but-distinct copy must hash the same as well.
            hash_3 = e1.submit(hash, obj_2).result()
            assert hash_1 == hash_3

    finally:
        e1.shutdown()
        e2.shutdown()
|
|
|
|
|
def test_hashing_pickling_error():
    """Unpicklable objects raise PicklingError with a hashing context."""

    def local_function():
        return 42

    # A locally-defined function cannot be pickled, so hashing must fail.
    with raises(pickle.PicklingError) as err:
        hash(local_function)
    err.match("PicklingError while hashing")
|
|
|
|
|
def test_wrong_hash_name():
    """An unknown hash_name must raise a helpful ValueError."""
    expected_msg = "Valid options for 'hash_name' are"
    data = {"foo": "bar"}
    with raises(ValueError, match=expected_msg):
        hash(data, hash_name="invalid")
|
|