Update instruction_template_retriever.py
instruction_template_retriever.py CHANGED
@@ -11,6 +11,161 @@ from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 
 
+class GaussianCoveragePooling(torch.nn.Module):
+    def __init__(self, coverage_chunks, sigma, alpha):
+        """
+        Custom pooling layer that computes weighted mean pooling using Gaussian-based weights.
+        Args:
+            coverage_chunks (int): Number of weighted pooling operations (N).
+            sigma (float): Standard deviation for Gaussian weighting.
+            alpha (float): Weighting factor for merging with standard mean pooling.
+        """
+        super().__init__()
+        self.coverage_chunks = coverage_chunks
+        self.sigma = sigma  # Controls width of Gaussians
+        self.alpha = alpha  # Blends standard mean with weighted mean
+
+    def forward(self, features, chunk_indicators=None):
+        """
+        Computes weighted mean pooling using Gaussian-based weights.
+        Args:
+            self (SentenceTransformer): The model.
+            features (dict): The token embeddings and attention mask.
+            chunk_indicators (tensor[bz, 1]): Index indicators to return a specific chunk,
+                leave as None to return embeddings for all chunks. Mainly useful for training,
+                not inference. Leave as None for inference.
+        """
+
+        # Get token embeddings and attention mask
+        token_embeddings = features[
+            "token_embeddings"
+        ]  # (batch_size, seq_len, hidden_dim)
+        attention_mask = (
+            features["attention_mask"].float().unsqueeze(-1)
+        )  # (batch_size, seq_len, 1)
+
+        # Get shapes and devices
+        batch_size, seq_len, hidden_dim = token_embeddings.shape
+        device = token_embeddings.device
+
+        # Compute actual sequence lengths (ignoring padding)
+        # (batch_size, 1)
+        seq_lengths = attention_mask.squeeze(-1).sum(dim=1, keepdim=True)
+        max_seq_length = int(torch.max(seq_lengths).item())
+
+        # Standard mean pooling
+        sum_embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
+        sum_mask = torch.sum(attention_mask, dim=1).clamp(min=1e-9)
+        standard_mean = sum_embeddings / sum_mask  # (batch_size, hidden_dim)
+
+        # Compute chunk centers dynamically based on sequence length
+        chunk_positions = torch.linspace(0, 1, self.coverage_chunks + 2, device=device)[
+            1:-1
+        ]  # Excludes 0 and 1
+        chunk_centers = chunk_positions * seq_lengths  # (batch_size, N)
+
+        # Token positions per sequence (batch_size, seq_len)
+        token_positions = (
+            torch.arange(seq_len, device=device).float().unsqueeze(0)
+        )  # (1, seq_len)
+
+        # Compute Gaussian weights (batch_size, N, seq_len)
+        seq_lengths = seq_lengths.view(seq_lengths.shape[0], 1, 1).repeat(
+            1, self.coverage_chunks, max_seq_length
+        )
+        gaussians = torch.exp(
+            -0.5
+            * (
+                (token_positions.unsqueeze(1) - chunk_centers.unsqueeze(2))
+                / (self.sigma * seq_lengths)
+            )
+            ** 2
+        )
+
+        # Mask out padding and normalize Gaussian weights per sequence
+        # (batch_size, N, seq_len)
+        gaussians = gaussians * attention_mask.squeeze(-1).unsqueeze(1)
+
+        # Normalize against gaussian weights
+        gaussians /= gaussians.sum(dim=2, keepdim=True).clamp(min=1e-9)
+
+        # Compute weighted mean for each chunk (batch_size, N, hidden_dim)
+        weighted_means = torch.einsum(
+            "bns,bsh->bnh", gaussians.to(token_embeddings.dtype), token_embeddings
+        )
+
+        # Blend with standard mean pooling
+        # (batch_size, N, hidden_dim)
+        combined_embeddings = (1 - self.alpha) * standard_mean.unsqueeze(
+            1
+        ) + self.alpha * weighted_means
+
+        # Add an embedding for the entire document at index 0
+        # (batch_size, N+1, hidden_dim)
+        combined_embeddings = torch.cat(
+            [torch.zeros_like(combined_embeddings[:, :1]), combined_embeddings], 1
+        )
+        combined_embeddings[:, 0:1, :] = standard_mean.unsqueeze(1)
+
+        # Select the indicator if provided
+        if chunk_indicators is not None:
+            combined_embeddings = combined_embeddings[
+                torch.arange(combined_embeddings.size(0)), chunk_indicators
+            ]
+
+        # Normalize all the embeddings
+        combined_embeddings = torch.nn.functional.normalize(
+            combined_embeddings, p=2, dim=-1
+        )
+
+        # Flatten final embeddings (batch_size, hidden_dim * (N+1))
+        if chunk_indicators is None:
+            sentence_embedding = combined_embeddings.reshape(
+                batch_size, hidden_dim * (self.coverage_chunks + 1)
+            )
+        else:
+            sentence_embedding = combined_embeddings
+
+        # Return the final flattened sentence embedding
+        features["sentence_embedding"] = sentence_embedding
+        return features
+
+
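Taken together, the forward pass above returns one standard mean-pooled embedding for the whole document (index 0) plus N chunk embeddings, each a mean of the token embeddings weighted by a Gaussian centered at the fractional position k/(N+1) of the sequence; sigma sets the width of each Gaussian relative to sequence length and alpha blends each chunk mean with the plain mean. The following standalone sketch (toy sequence length and chunk count, not part of the committed file) mirrors just the weighting step:

import torch

seq_len, coverage_chunks, sigma = 12, 3, 0.05
positions = torch.arange(seq_len).float()                            # token positions 0..11
centers = torch.linspace(0, 1, coverage_chunks + 2)[1:-1] * seq_len  # centers at 3, 6, 9
weights = torch.exp(
    -0.5 * ((positions.unsqueeze(0) - centers.unsqueeze(1)) / (sigma * seq_len)) ** 2
)                                                                    # (3, 12)
weights /= weights.sum(dim=1, keepdim=True)                          # normalize per chunk
print(weights.argmax(dim=1))  # each chunk's weights peak near its center: tensor([3, 6, 9])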
+def use_gaussian_coverage_pooling(m, coverage_chunks=10, sigma=0.05, alpha=1.0):
+    """
+    Add custom pooling layer that computes weighted mean pooling using Gaussian-based weights.
+    Args:
+        m (SentenceTransformer): The model to add pooling layer to.
+        coverage_chunks (int): Number of weighted pooling operations (N).
+        sigma (float): Standard deviation for Gaussian weighting.
+        alpha (float): Weighting factor for merging with standard mean pooling.
+    """
+    if isinstance(m[1], GaussianCoveragePooling):
+        m = unuse_gaussian_coverage_pooling(m)
+    word_embedding_model = m[0]
+    custom_pooling = GaussianCoveragePooling(
+        coverage_chunks=coverage_chunks, sigma=sigma, alpha=alpha
+    )
+    old_pooling = m[1]
+    new_m = m.__class__(modules=[word_embedding_model, custom_pooling])
+    new_m.old_pooling = {"old_pooling": old_pooling}
+    return new_m
+
+
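use_gaussian_coverage_pooling swaps the pooling module (assumed to sit at index 1 of a standard two-module SentenceTransformer) for the Gaussian layer and stashes the original so it can be restored later. A rough usage sketch follows; it assumes this file is importable under its own name and that the matching model referenced by RETRIEVAL_EMBEDDING_NAME loads as an ordinary SentenceTransformer:

from sentence_transformers import SentenceTransformer

from instruction_template_retriever import (  # assumes this module is on the path
    use_gaussian_coverage_pooling,
    unuse_gaussian_coverage_pooling,
)

m = SentenceTransformer("fineinstructions/matching_embedding")
m = use_gaussian_coverage_pooling(m, coverage_chunks=10, sigma=0.05, alpha=1.0)

# Flattened output: (coverage_chunks + 1) * hidden_dim per text, chunk 0 = whole document.
emb = m.encode(["A long document to split into coverage chunks."])
hidden_dim = m[0].get_word_embedding_dimension()
chunks = emb.reshape(-1, 10 + 1, hidden_dim)  # (batch, N + 1, hidden_dim)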
+def unuse_gaussian_coverage_pooling(m):
+    """
+    Removes the custom pooling layer.
+    Args:
+        m (SentenceTransformer): The model to remove the pooling layer from.
+    """
+
+    if isinstance(m[1], GaussianCoveragePooling):
+        new_m = m.__class__(modules=[m[0], m.old_pooling["old_pooling"]])
+        return new_m
+    else:
+        return m
+
+
 class InstructionTemplateRetriever:
     FINETEMPLATES_REVISION = "831ab22c90f9da011bd972585afdf609f40fa54b"
     RETRIEVAL_EMBEDDING_NAME = "fineinstructions/matching_embedding"
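Because every row of the reshaped output is L2-normalized, a dot product against any other unit vector gives a cosine score per chunk, and unuse_gaussian_coverage_pooling rebuilds the model from the pooling module saved in old_pooling. Continuing the hypothetical sketch above (the query text and scoring step are illustrative, not taken from this commit):

# Score a query against the whole document (row 0) and each coverage chunk (rows 1..N).
query = m.encode(["Write a short biography of the subject."])
query_vec = query.reshape(-1, 10 + 1, hidden_dim)[0, 0]  # whole-query embedding
chunk_scores = chunks[0] @ query_vec                      # (N + 1,) cosine similarities

# Restore the original pooling layer once chunk-level embeddings are no longer needed.
m = unuse_gaussian_coverage_pooling(m)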