Spaces:

wakeupmh
/

ama-autism

Sleeping

wakeupmh committed on Feb 15

Commit

92c1c48

1 Parent(s): 42d1dd5

fix: normalize faiss

Files changed (1) hide show

faiss_index/index.py CHANGED Viewed

@@ -61,20 +61,21 @@ def build_faiss_index(papers, dataset_dir=DATASET_DIR):
                 torch.cuda.empty_cache()
     # Convert to numpy array and build FAISS index
-    embeddings = np.array(embeddings)
     dimension = embeddings.shape[1]
-    # Use more efficient index type
-    index = faiss.IndexFlatIP(dimension)  # Simple but efficient dot-product index
-    # Normalize vectors to use dot product as similarity
-    faiss.normalize_L2(embeddings)
     index.add(embeddings)
     # Create and save the dataset
     dataset = Dataset.from_dict({
         "text": texts,
-        "embeddings": embeddings,
         "title": [p["title"] for p in papers]
     })

                 torch.cuda.empty_cache()
     # Convert to numpy array and build FAISS index
+    embeddings = np.array(embeddings, dtype=np.float32)  # Ensure float32 type
     dimension = embeddings.shape[1]
+    # Normalize the vectors manually
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    embeddings = embeddings / norms
+    # Create FAISS index
+    index = faiss.IndexFlatIP(dimension)
     index.add(embeddings)
     # Create and save the dataset
     dataset = Dataset.from_dict({
         "text": texts,
+        "embeddings": embeddings.tolist(),  # Convert to list for storage
         "title": [p["title"] for p in papers]
     })