orionweller committed on
Commit 7743187 · 1 Parent(s): 9813925

try to fix ndcg bug

Files changed (1):
  app.py +11 -7
app.py CHANGED

@@ -94,7 +94,7 @@ class RepLlamaModel:
         model.eval()
         return model
 
-    def encode(self, texts, batch_size=32, **kwargs):
+    def encode(self, texts, batch_size=16, **kwargs):
         self.model = self.model.cuda()
         all_embeddings = []
         for i in range(0, len(texts), batch_size):
@@ -108,6 +108,7 @@ class RepLlamaModel:
             outputs = self.model(**batch_dict)
             embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
             embeddings = F.normalize(embeddings, p=2, dim=-1)
+            logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
             all_embeddings.append(embeddings.cpu().numpy())
 
         self.model = self.model.cpu()
@@ -118,7 +119,7 @@ def load_faiss_index(dataset_name):
     index_path = f"{dataset_name}/faiss_index.bin"
     if os.path.exists(index_path):
         logger.info(f"Loading existing FAISS index for {dataset_name} from {index_path}")
-        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
+        return faiss.read_index(index_path)
     return None
 
 def search_queries(dataset_name, q_reps, depth=1000):
@@ -126,16 +127,15 @@ def search_queries(dataset_name, q_reps, depth=1000):
     if faiss_index is None:
         raise ValueError(f"No FAISS index found for dataset {dataset_name}")
 
-    # Ensure q_reps is a 2D numpy array of the correct type
-    q_reps = np.ascontiguousarray(q_reps.astype('float16'))
+    logger.info(f"Searching queries. Shape of q_reps: {q_reps.shape}")
 
     # Perform the search
    all_scores, all_indices = faiss_index.search(q_reps, depth)
 
-    psg_indices = [[str(corpus_lookups[dataset_name][x]) for x in q_dd] for q_dd in all_indices]
+    logger.info(f"Search completed. Shape of all_scores: {all_scores.shape}, all_indices: {all_indices.shape}")
+    logger.info(f"Sample scores: {all_scores[0][:5]}, Sample indices: {all_indices[0][:5]}")
 
-    # Clean up
-    del faiss_index
+    psg_indices = [[str(corpus_lookups[dataset_name][x]) for x in q_dd] for q_dd in all_indices]
 
     return all_scores, np.array(psg_indices)
 
@@ -149,6 +149,7 @@ def load_corpus_lookups(dataset_name):
         with open(file, 'rb') as f:
             _, p_lookup = pickle.load(f)
             corpus_lookups[dataset_name] += p_lookup
+    logger.info(f"Loaded corpus lookups for {dataset_name}. Total entries: {len(corpus_lookups[dataset_name])}")
 
 def load_queries(dataset_name):
     global queries, q_lookups, qrels
@@ -166,6 +167,9 @@ def load_queries(dataset_name):
             qrels[dataset_name][qrel.query_id] = {}
         qrels[dataset_name][qrel.query_id][qrel.doc_id] = qrel.relevance
 
+    logger.info(f"Loaded queries for {dataset_name}. Total queries: {len(queries[dataset_name])}")
+    logger.info(f"Loaded qrels for {dataset_name}. Total query IDs: {len(qrels[dataset_name])}")
+
 
 def evaluate(qrels, results, k_values):
     evaluator = pytrec_eval.RelevanceEvaluator(
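
For context, a minimal end-to-end sketch of the retrieve-then-score path that search_queries and evaluate implement, assuming an IndexFlatIP index over L2-normalized embeddings and pytrec_eval for NDCG; the dimension, document ids, and relevance labels below are made up for illustration and are not the app's real data. FAISS's Python bindings operate on contiguous float32 arrays, which is why keeping q_reps in float32 (rather than the removed float16 cast) is one plausible angle on the NDCG bug this commit tries to fix.

    import numpy as np
    import faiss
    import pytrec_eval

    # Toy data -- dimension, ids, and qrels are illustrative only.
    d = 64
    doc_embs = np.random.rand(100, d).astype("float32")
    faiss.normalize_L2(doc_embs)                      # cosine similarity via inner product
    index = faiss.IndexFlatIP(d)
    index.add(doc_embs)

    q_reps = np.random.rand(2, d).astype("float32")   # queries stay float32, not float16
    faiss.normalize_L2(q_reps)
    all_scores, all_indices = index.search(np.ascontiguousarray(q_reps), 10)

    # Map FAISS row positions back to document ids (stand-in for corpus_lookups).
    doc_ids = [f"d{i}" for i in range(100)]
    run = {
        f"q{qi}": {doc_ids[di]: float(s) for di, s in zip(idx_row, score_row)}
        for qi, (score_row, idx_row) in enumerate(zip(all_scores, all_indices))
    }

    # NDCG with pytrec_eval (what evaluate() in app.py builds on).
    qrels = {"q0": {"d3": 1}, "q1": {"d7": 1}}
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10"})
    print(evaluator.evaluate(run))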