Hidayatmahar committed on
Commit
ee612b6
·
verified ·
1 Parent(s): 9aaee28

Create create_faiss.py

Browse files
Files changed (1) hide show
  1. create_faiss.py +25 -0
create_faiss.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS L2 index over legal documents taken from the Pile.

The script streams the Pile training split, keeps the text of every record
that belongs to the legal component, embeds those texts with a
SentenceTransformer model, and writes a flat (exact search) L2 index to disk.
"""

from collections.abc import Iterable, Iterator
from itertools import islice

import faiss
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

DATASET_NAME = "EleutherAI/the_pile"
# "FreeLaw" is the Pile component made of court opinions. The previous filter
# value, "Pile-CC", selects Common Crawl web text, which contradicts the stated
# goal of indexing legal documents.
PILE_SUBSET = "FreeLaw"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
INDEX_PATH = "faiss_index.bin"
# Optional cap on the number of documents to index; None keeps every match.
MAX_DOCUMENTS = None


def iter_subset_texts(records: Iterable[dict], subset: str) -> Iterator[str]:
    """Yield the ``text`` field of each record whose Pile component is *subset*.

    Args:
        records: Iterable of Pile records; each is read as
            ``record["meta"]["pile_set_name"]`` and ``record["text"]``.
        subset: Component name to keep, compared for exact equality.

    Yields:
        The text of every matching record, in input order.
    """
    for record in records:
        if record["meta"]["pile_set_name"] == subset:
            yield record["text"]


def main() -> None:
    """Embed the selected Pile documents and save them as a FAISS index.

    Raises:
        ValueError: If no record matches ``PILE_SUBSET``, since an index
            cannot be sized without at least one embedding.
    """
    # Stream the split so records are read lazily instead of downloading and
    # preparing the entire corpus before the first record is seen.
    dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

    # islice(..., None) imposes no limit, so the default keeps all matches.
    law_data = list(
        islice(iter_subset_texts(dataset, PILE_SUBSET), MAX_DOCUMENTS)
    )
    if not law_data:
        raise ValueError(
            f"No documents found for Pile subset {PILE_SUBSET!r}; "
            "nothing to index."
        )

    model = SentenceTransformer(EMBEDDING_MODEL)
    embeddings = model.encode(law_data, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, INDEX_PATH)
    print("✅ FAISS index saved successfully!")


if __name__ == "__main__":
    main()