orionweller committed
Commit e7a2175 · 1 Parent(s): 7743187

try to fix ndcg bug

Files changed (1): app.py (+35 -12)
app.py CHANGED
@@ -103,13 +103,13 @@ class RepLlamaModel:
             batch_dict = create_batch_dict(self.tokenizer, batch_texts, always_add_eos="last")
             batch_dict = {key: value.cuda() for key, value in batch_dict.items()}

-            with torch.cuda.amp.autocast():
-                with torch.no_grad():
-                    outputs = self.model(**batch_dict)
-                    embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
-                    embeddings = F.normalize(embeddings, p=2, dim=-1)
-                    logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
-                    all_embeddings.append(embeddings.cpu().numpy())
+            # with torch.cuda.amp.autocast():
+            with torch.no_grad():
+                outputs = self.model(**batch_dict)
+                embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
+                embeddings = F.normalize(embeddings, p=2, dim=-1)
+                logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
+                all_embeddings.append(embeddings.cpu().numpy())

         self.model = self.model.cpu()
         return np.concatenate(all_embeddings, axis=0)
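
Note: disabling torch.cuda.amp.autocast() makes the forward pass run in the model's native precision. If the corpus index was encoded at a different precision than the queries, the embeddings can drift enough to depress NDCG, which appears to be what this commit is probing. Since F.normalize runs before the log line, the logged "Norm of first embedding" should print ~1.0; anything else signals a numerics problem. For context, here is a minimal sketch of what a pool(..., 'last') helper typically does. This is an assumption about app.py's pool, not its actual code, and it presumes right-padded batches:

import torch

def pool_last(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for pool(..., 'last'): take the hidden state at the
    # final non-pad position, i.e. the EOS appended by always_add_eos="last".
    last_idx = attention_mask.sum(dim=1) - 1
    rows = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
    return last_hidden_state[rows, last_idx]  # (batch, hidden)
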
@@ -150,6 +150,7 @@ def load_corpus_lookups(dataset_name):
             _, p_lookup = pickle.load(f)
         corpus_lookups[dataset_name] += p_lookup
     logger.info(f"Loaded corpus lookups for {dataset_name}. Total entries: {len(corpus_lookups[dataset_name])}")
+    logger.info(f"Sample corpus lookup entry: {corpus_lookups[dataset_name][0]}")

 def load_queries(dataset_name):
     global queries, q_lookups, qrels
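
Note: the new sample-entry log is a quick check that the pickled lookup holds document ids in index order. Nearest-neighbor search returns integer row positions, and search_queries has to translate them back through this lookup; a misaligned or wrongly concatenated lookup silently yields wrong doc ids and near-zero NDCG. A self-contained sketch of that translation, assuming a FAISS-style flat index (names and shapes are illustrative, not taken from app.py):

import faiss
import numpy as np

corpus_lookup = ["doc0", "doc7", "doc42"]        # stand-in for corpus_lookups[dataset]
p_reps = np.random.rand(3, 8).astype("float32")  # stand-in passage embeddings

index = faiss.IndexFlatIP(8)  # inner product matches L2-normalized embeddings
index.add(p_reps)

q_reps = np.random.rand(2, 8).astype("float32")
scores, rows = index.search(q_reps, k=2)  # rows are positions, not doc ids
psg_indices = [[corpus_lookup[i] for i in row] for row in rows]
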
@@ -181,6 +182,12 @@ def evaluate(qrels, results, k_values):
     for k in k_values:
         metrics[f"NDCG@{k}"] = round(np.mean([query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]), 3)
         metrics[f"Recall@{k}"] = round(np.mean([query_scores[f"recall_{k}"] for query_scores in scores.values()]), 3)
+        logger.info(f"NDCG@{k}: mean={metrics[f'NDCG@{k}']}, min={min(ndcg_scores)}, max={max(ndcg_scores)}")
+        logger.info(f"Recall@{k}: mean={metrics[f'Recall@{k}']}, min={min(recall_scores)}, max={max(recall_scores)}")
+
+    # Add these lines
+    logger.info(f"Number of queries evaluated: {len(scores)}")
+    logger.info(f"Sample evaluation score: {list(scores.items())[0]}")

     return metrics
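
Note: ndcg_scores and recall_scores are referenced in the two per-k log lines but never defined in this hunk, so evaluate() would raise NameError on the first iteration. The ndcg_cut_{k} / recall_{k} key format suggests the per-query scores come from pytrec_eval; under that assumption, here is a sketch of the function with the missing lists defined (illustrative, not the committed code):

import numpy as np
import pytrec_eval

def evaluate(qrels: dict, results: dict, k_values=(10, 100)) -> dict:
    # qrels: {qid: {doc_id: int relevance}}, results: {qid: {doc_id: float score}}
    k_str = ",".join(str(k) for k in k_values)
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {f"ndcg_cut.{k_str}", f"recall.{k_str}"})
    scores = evaluator.evaluate(results)  # {qid: {"ndcg_cut_10": ..., "recall_100": ...}}
    metrics = {}
    for k in k_values:
        ndcg_scores = [s[f"ndcg_cut_{k}"] for s in scores.values()]
        recall_scores = [s[f"recall_{k}"] for s in scores.values()]
        metrics[f"NDCG@{k}"] = round(float(np.mean(ndcg_scores)), 3)
        metrics[f"Recall@{k}"] = round(float(np.mean(recall_scores)), 3)
    return metrics
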
 
@@ -190,19 +197,35 @@ def run_evaluation(dataset, postfix):
     current_dataset = dataset

     input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[current_dataset]]
+    logger.info(f"Number of input texts: {len(input_texts)}")
+    logger.info(f"Sample input text: {input_texts[0]}")
+
     q_reps = model.encode(input_texts)
+    logger.info(f"Encoded query representations shape: {q_reps.shape}")
+
     all_scores, psg_indices = search_queries(dataset, q_reps)

     results = {qid: dict(zip(doc_ids, map(float, scores)))
                for qid, scores, doc_ids in zip(q_lookups[dataset].keys(), all_scores, psg_indices)}

+    logger.info(f"Number of results: {len(results)}")
+    logger.info(f"Sample result: {list(results.items())[0]}")
+
+    # Add these lines
+    logger.info(f"Number of queries in qrels: {len(qrels[dataset])}")
+    logger.info(f"Sample qrel: {list(qrels[dataset].items())[0]}")
+    logger.info(f"Number of queries in results: {len(results)}")
+    logger.info(f"Sample result: {list(results.items())[0]}")
+
+    # Check for mismatches
+    qrels_keys = set(qrels[dataset].keys())
+    results_keys = set(results.keys())
+    logger.info(f"Queries in qrels but not in results: {qrels_keys - results_keys}")
+    logger.info(f"Queries in results but not in qrels: {results_keys - qrels_keys}")
+
     metrics = evaluate(qrels[dataset], results, k_values=[10, 100])

-    return {
-        "NDCG@10": metrics["NDCG@10"],
-        "Recall@100": metrics["Recall@100"]
-    }
-
+    return metrics

 @spaces.GPU
 def gradio_interface(dataset, postfix):
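
Note: the qrels/results key-set comparison is the right diagnostic for a suspiciously low NDCG. pytrec_eval-style evaluators silently score zero for any query whose id in results does not exactly match its id in qrels, and the classic cause is one side holding int ids and the other str. If the logs confirm that, a small normalizer along these lines would fix it (illustrative, not part of this commit):

def normalize_ids(run: dict) -> dict:
    # Cast query and doc ids to str so int-vs-str ids cannot zero the metrics.
    return {str(qid): {str(doc_id): v for doc_id, v in docs.items()}
            for qid, docs in run.items()}

# An int-keyed run and a str-keyed qrels would otherwise never intersect:
print(normalize_ids({1: {101: 3.2}}))  # {'1': {'101': 3.2}}

In app.py this would be applied to both qrels[dataset] and results before the evaluate(...) call.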