Spaces:
Running
Running
File size: 2,664 Bytes
2a0bc63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
from typing import Callable, Dict, List, Tuple
import numpy as np
VectorType = List[float]
# Distance definitions. Each function is batched over its first argument.
def distance_dot_product(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Return the dot product of each embedding vector with a reference vector.

    Given a list [emb_i] and a reference vector rEmb, return the list
    [value_i] with value_i = emb_i . rEmb, in the same order as the input.
    For unit-norm vectors the dot product coincides with the cosine
    similarity.

    An empty ``embedding_vectors`` yields an empty list.

    Not particularly optimized.
    """
    # An empty batch would become a shape-(0,) array and make np.dot raise a
    # shape-mismatch error. len() is used instead of truthiness so that a
    # NumPy array passed as the batch is handled as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    # The transpose is a no-op for a 1-D reference vector; it is kept so that
    # any other input shape behaves exactly as before.
    return list(np.dot(batch, reference.T))
def distance_cos_difference(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Return the cosine similarity of each embedding vector to a reference.

    Given a list [emb_i] and a reference vector rEmb, return the list
    [value_i] with value_i = (emb_i . rEmb) / (|emb_i| * |rEmb|), in the same
    order as the input.

    An empty ``embedding_vectors`` yields an empty list.

    No guard is applied against zero-norm vectors: the division then follows
    NumPy floating-point rules and produces a non-finite value.
    """
    # An empty batch would become a shape-(0,) array, for which the per-row
    # norm (axis=1) is undefined. len() is used instead of truthiness so that
    # a NumPy array passed as the batch is handled as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    # The transpose is a no-op for a 1-D reference vector; it is kept so that
    # any other input shape behaves exactly as before.
    dot_products = np.dot(batch, reference.T)
    # One norm per row of the batch, scaled by the norm of the reference.
    norm_products = np.linalg.norm(batch, axis=1) * np.linalg.norm(reference)
    return list(dot_products / norm_products)
def distance_l1(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Return the L1 (sum of absolute differences) distance to a reference.

    Given a list [emb_i] and a reference vector rEmb, return the list
    [distance_i] with distance_i = sum_k |emb_i[k] - rEmb[k]|, in the same
    order as the input.

    An empty ``embedding_vectors`` yields an empty list.
    """
    # An empty batch would become a shape-(0,) array, which cannot be
    # reduced along axis=1. len() is used instead of truthiness so that a
    # NumPy array passed as the batch is handled as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    # Broadcasting subtracts the reference from every row of the batch.
    differences = batch - reference
    return list(np.linalg.norm(differences, axis=1, ord=1))
def distance_l2(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Return the L2 (Euclidean) distance of each embedding vector to a reference.

    Given a list [emb_i] and a reference vector rEmb, return the list
    [distance_i] with distance_i = sqrt(sum_k (emb_i[k] - rEmb[k])^2), in the
    same order as the input.

    An empty ``embedding_vectors`` yields an empty list.
    """
    # An empty batch would become a shape-(0,) array, which cannot be
    # reduced along axis=1. len() is used instead of truthiness so that a
    # NumPy array passed as the batch is handled as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    # Broadcasting subtracts the reference from every row of the batch.
    differences = batch - reference
    return list(np.linalg.norm(differences, axis=1, ord=2))
def distance_max(
    embedding_vectors: List[VectorType], reference_embedding_vector: VectorType
) -> List[float]:
    """
    Return the max-norm (L-infinity) distance of each vector to a reference.

    Given a list [emb_i] and a reference vector rEmb, return the list
    [distance_i] with distance_i = max_k |emb_i[k] - rEmb[k]|, in the same
    order as the input.

    An empty ``embedding_vectors`` yields an empty list.
    """
    # An empty batch would become a shape-(0,) array, which cannot be
    # reduced along axis=1. len() is used instead of truthiness so that a
    # NumPy array passed as the batch is handled as well.
    if len(embedding_vectors) == 0:
        return []
    batch = np.array(embedding_vectors, dtype=float)
    reference = np.array(reference_embedding_vector, dtype=float)
    # Broadcasting subtracts the reference from every row of the batch.
    differences = batch - reference
    return list(np.linalg.norm(differences, axis=1, ord=np.inf))
# Registry of the available metrics, keyed by their short name.
# Each value is the pair:
#   (
#       metric function,
#       'reverse' argument to use when sorting nearest-to-farthest
#   )
# A True flag means that:
#   - in that metric a higher value is closer, and
#   - a cutoff should be expressed as metric > threshold.
distance_metrics: Dict[
    str, Tuple[Callable[[List[VectorType], VectorType], List[float]], bool]
] = {
    "cos": (distance_cos_difference, True),
    "dot": (distance_dot_product, True),
    "l1": (distance_l1, False),
    "l2": (distance_l2, False),
    "max": (distance_max, False),
}
|