Spaces: Running on Zero
Commit · e7a2175
1 Parent(s): 7743187
try to fix ndcg bug
app.py CHANGED
@@ -103,13 +103,13 @@ class RepLlamaModel:
             batch_dict = create_batch_dict(self.tokenizer, batch_texts, always_add_eos="last")
             batch_dict = {key: value.cuda() for key, value in batch_dict.items()}
 
-            with torch.cuda.amp.autocast():
-                with torch.no_grad():
-                    outputs = self.model(**batch_dict)
-                    embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
-                    embeddings = F.normalize(embeddings, p=2, dim=-1)
-                    logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
-                    all_embeddings.append(embeddings.cpu().numpy())
+            # with torch.cuda.amp.autocast():
+            with torch.no_grad():
+                outputs = self.model(**batch_dict)
+                embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
+                embeddings = F.normalize(embeddings, p=2, dim=-1)
+                logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
+                all_embeddings.append(embeddings.cpu().numpy())
 
         self.model = self.model.cpu()
         return np.concatenate(all_embeddings, axis=0)
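This hunk disables torch.cuda.amp.autocast() around the forward pass, so the model now encodes in full fp32 rather than mixed precision. That is a plausible suspect for an NDCG bug: fp16 rounding before the F.normalize call can perturb dot-product scores enough to reorder closely scored passages. The hunk also relies on a pool helper defined elsewhere in app.py; below is a minimal sketch of last-token pooling consistent with the call site (the actual helper may differ, and right padding is assumed):

    import torch

    def pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor, pool_type: str) -> torch.Tensor:
        # Collapse [batch, seq_len, dim] hidden states to one [batch, dim] vector per sequence.
        if pool_type == "last":
            # Index of the final non-padding token (the appended EOS); assumes right padding.
            last_token_idx = attention_mask.sum(dim=1) - 1
            batch_idx = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
            return last_hidden_state[batch_idx, last_token_idx]
        raise ValueError(f"unsupported pool_type: {pool_type}")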
@@ -150,6 +150,7 @@ def load_corpus_lookups(dataset_name):
             _, p_lookup = pickle.load(f)
         corpus_lookups[dataset_name] += p_lookup
     logger.info(f"Loaded corpus lookups for {dataset_name}. Total entries: {len(corpus_lookups[dataset_name])}")
+    logger.info(f"Sample corpus lookup entry: {corpus_lookups[dataset_name][0]}")
 
 def load_queries(dataset_name):
     global queries, q_lookups, qrels
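The (_, p_lookup) unpacking implies each pickle shard stores an embedding matrix alongside a parallel list of passage ids, in the style of Tevatron-like corpus encoders. A hedged sketch of the surrounding loader, with the shard naming and glob pattern as assumptions:

    import glob
    import pickle

    corpus_lookups = {}

    def load_corpus_lookups(dataset_name):
        # Hypothetical shard layout: each pickle holds (embedding_matrix, passage_id_list)
        # for one slice of the corpus; lookups are concatenated in shard order.
        corpus_lookups[dataset_name] = []
        for path in sorted(glob.glob(f"{dataset_name}/corpus_emb.*.pkl")):
            with open(path, "rb") as f:
                _, p_lookup = pickle.load(f)
            corpus_lookups[dataset_name] += p_lookup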
@@ -181,6 +182,14 @@ def evaluate(qrels, results, k_values):
     for k in k_values:
         metrics[f"NDCG@{k}"] = round(np.mean([query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]), 3)
         metrics[f"Recall@{k}"] = round(np.mean([query_scores[f"recall_{k}"] for query_scores in scores.values()]), 3)
+        ndcg_scores = [query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]
+        recall_scores = [query_scores[f"recall_{k}"] for query_scores in scores.values()]
+        logger.info(f"NDCG@{k}: mean={metrics[f'NDCG@{k}']}, min={min(ndcg_scores)}, max={max(ndcg_scores)}")
+        logger.info(f"Recall@{k}: mean={metrics[f'Recall@{k}']}, min={min(recall_scores)}, max={max(recall_scores)}")
+
+    # Log how many queries were actually scored, plus one raw per-query entry
+    logger.info(f"Number of queries evaluated: {len(scores)}")
+    logger.info(f"Sample evaluation score: {list(scores.items())[0]}")
 
     return metrics
 
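The per-query keys ndcg_cut_{k} and recall_{k} are the measure names pytrec_eval emits, so scores is presumably produced earlier in evaluate() by a RelevanceEvaluator, BEIR-style. A sketch of that setup (assumed rather than shown in this diff):

    import pytrec_eval

    def build_scores(qrels, results, k_values):
        # qrels: {qid: {docid: relevance}}; results: {qid: {docid: score}}.
        ndcg_string = "ndcg_cut." + ",".join(str(k) for k in k_values)
        recall_string = "recall." + ",".join(str(k) for k in k_values)
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {ndcg_string, recall_string})
        # Returns {qid: {"ndcg_cut_10": ..., "recall_100": ...}} for each scored query.
        return evaluator.evaluate(results)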
@@ -190,19 +199,33 @@ def run_evaluation(dataset, postfix):
     current_dataset = dataset
 
     input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[current_dataset]]
+    logger.info(f"Number of input texts: {len(input_texts)}")
+    logger.info(f"Sample input text: {input_texts[0]}")
+
     q_reps = model.encode(input_texts)
+    logger.info(f"Encoded query representations shape: {q_reps.shape}")
+
     all_scores, psg_indices = search_queries(dataset, q_reps)
 
     results = {qid: dict(zip(doc_ids, map(float, scores)))
                for qid, scores, doc_ids in zip(q_lookups[dataset].keys(), all_scores, psg_indices)}
 
+    logger.info(f"Number of results: {len(results)}")
+    logger.info(f"Sample result: {list(results.items())[0]}")
+
+    # Compare what evaluate() will receive from each side
+    logger.info(f"Number of queries in qrels: {len(qrels[dataset])}")
+    logger.info(f"Sample qrel: {list(qrels[dataset].items())[0]}")
+
+    # Check for mismatches
+    qrels_keys = set(qrels[dataset].keys())
+    results_keys = set(results.keys())
+    logger.info(f"Queries in qrels but not in results: {qrels_keys - results_keys}")
+    logger.info(f"Queries in results but not in qrels: {results_keys - qrels_keys}")
+
     metrics = evaluate(qrels[dataset], results, k_values=[10, 100])
 
-    return {
-        "NDCG@10": metrics["NDCG@10"],
-        "Recall@100": metrics["Recall@100"]
-    }
-
+    return metrics
 
 @spaces.GPU
 def gradio_interface(dataset, postfix):
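The key-mismatch check is the crux of the bug hunt: pytrec_eval only scores query ids present in both mappings, so if q_lookups yields, say, integer ids while qrels uses strings, the NDCG average silently runs over the wrong (possibly empty) set of queries. If the new logs confirm a type skew, one possible follow-up, not part of this commit, is to normalize both sides before calling evaluate():

    def normalize_qids(qrels_for_dataset, results):
        # Hypothetical fix for an int-vs-str qid mismatch: cast every key to str
        # so pytrec_eval matches the two mappings query for query.
        qrels_str = {str(qid): rels for qid, rels in qrels_for_dataset.items()}
        results_str = {str(qid): doc_scores for qid, doc_scores in results.items()}
        return qrels_str, results_str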