File size: 2,664 Bytes
2a0bc63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from typing import Callable, Dict, List, Tuple

import numpy as np

# A single embedding vector, expressed as a plain list of floats.
VectorType = List[float]


# Distance definitions. Each function is batched over its first argument.
def distance_dot_product(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Compute the dot product of each embedding vector with a reference vector.

    Given a list [emb_i] and a reference vector rEmb, return a list
    [score_i] where
        score_i = emb_i . rEmb
    For unit-norm vectors this coincides with the cosine similarity.

    Args:
        embedding_vectors: the batch of vectors to score; each must have
            the same length as the reference vector.
        reference_embedding_vector: the single vector to compare against.

    Returns:
        One score per input vector, in input order. An empty batch
        yields an empty list.

    Not particularly optimized.
    """
    # An empty batch would become a shape-(0,) array, which np.dot cannot
    # multiply with a non-empty reference vector: short-circuit instead.
    # len() (not truthiness) so that array-like batches are accepted as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    return list(np.dot(batch, reference.T))


def distance_cos_difference(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Compute the cosine similarity of each embedding vector to a reference.

    For every emb_i in the batch the result is
        (emb_i . rEmb) / (|emb_i| * |rEmb|)
    with |.| the Euclidean norm.

    Args:
        embedding_vectors: the batch of vectors to score; each must have
            the same length as the reference vector.
        reference_embedding_vector: the single vector to compare against.

    Returns:
        One similarity per input vector, in input order. An empty batch
        yields an empty list. Zero-norm vectors are not special-cased:
        the division is carried out as-is.
    """
    # An empty batch would become a 1-D array, for which neither the dot
    # product nor the per-row norm (axis=1) is defined: short-circuit.
    # len() (not truthiness) so that array-like batches are accepted as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    dot_products = np.dot(batch, reference.T)
    norm_products = np.linalg.norm(batch, axis=1) * np.linalg.norm(reference)
    return list(dot_products / norm_products)


def distance_l1(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Compute the L1 (sum of absolute differences) distance to a reference.

    Args:
        embedding_vectors: the batch of vectors to measure; each must have
            the same length as the reference vector.
        reference_embedding_vector: the single vector to compare against.

    Returns:
        One distance per input vector, in input order. An empty batch
        yields an empty list.
    """
    # An empty batch would become a 1-D array, which cannot be reduced
    # along axis=1 (nor broadcast against the reference): short-circuit.
    # len() (not truthiness) so that array-like batches are accepted as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    return list(np.linalg.norm(batch - reference, axis=1, ord=1))


def distance_l2(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Compute the L2 (Euclidean) distance of each vector to a reference.

    Args:
        embedding_vectors: the batch of vectors to measure; each must have
            the same length as the reference vector.
        reference_embedding_vector: the single vector to compare against.

    Returns:
        One distance per input vector, in input order. An empty batch
        yields an empty list.
    """
    # An empty batch would become a 1-D array, which cannot be reduced
    # along axis=1 (nor broadcast against the reference): short-circuit.
    # len() (not truthiness) so that array-like batches are accepted as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    return list(np.linalg.norm(batch - reference, axis=1, ord=2))


def distance_max(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Compute the max-norm (largest absolute component difference) distance
    of each vector to a reference.

    Args:
        embedding_vectors: the batch of vectors to measure; each must have
            the same length as the reference vector.
        reference_embedding_vector: the single vector to compare against.

    Returns:
        One distance per input vector, in input order. An empty batch
        yields an empty list.
    """
    # An empty batch would become a 1-D array, which cannot be reduced
    # along axis=1 (nor broadcast against the reference): short-circuit.
    # len() (not truthiness) so that array-like batches are accepted as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    return list(np.linalg.norm(batch - reference, axis=1, ord=np.inf))


# Registry of the available metrics, keyed by their short name.
# Each value is a pair:
#   (
#       function computing the metric (batched in its first argument),
#       'reverse' flag to use when sorting nearest-to-farthest
#   )
# A True flag means that, for that metric:
#     - a higher value is a closer match, and
#     - a cutoff should be expressed as metric > threshold.
distance_metrics: Dict[
    str, Tuple[Callable[[List[VectorType], VectorType], List[float]], bool]
] = {
    "cos": (distance_cos_difference, True),
    "dot": (distance_dot_product, True),
    "l1": (distance_l1, False),
    "l2": (distance_l2, False),
    "max": (distance_max, False),
}