Hidayatmahar committed on
Commit
c6b82b8
·
verified ·
1 Parent(s): ee612b6

Update create_faiss.py

Browse files
Files changed (1) hide show
  1. create_faiss.py +8 -7
create_faiss.py CHANGED
@@ -3,11 +3,11 @@ import faiss
3
  from sentence_transformers import SentenceTransformer
4
  import numpy as np
5
 
6
- # Load the Pile dataset (legal text)
7
- dataset = load_dataset("EleutherAI/the_pile", split="train")
8
 
9
- # Extract legal-related documents
10
- law_data = [item['text'] for item in dataset if item['meta']['pile_set_name'] == 'Pile-CC']
11
 
12
  # Load embedding model
13
  model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -16,10 +16,11 @@ model = SentenceTransformer("all-MiniLM-L6-v2")
16
  embeddings = model.encode(law_data, convert_to_numpy=True)
17
 
18
  # Create FAISS index
19
- index = faiss.IndexFlatL2(embeddings.shape[1])
20
- index.add(embeddings)
 
21
 
22
  # Save FAISS index
23
  faiss.write_index(index, "faiss_index.bin")
24
 
25
- print("✅ FAISS index saved successfully!")
 
3
  from sentence_transformers import SentenceTransformer
4
  import numpy as np
5
 
6
+ # Load the US-LegalKit dataset
7
+ dataset = load_dataset("macadeliccc/US-LegalKit", split="train")
8
 
9
+ # Extract legal text documents
10
+ law_data = [item['text'] for item in dataset if 'text' in item]
11
 
12
  # Load embedding model
13
  model = SentenceTransformer("all-MiniLM-L6-v2")
 
16
  embeddings = model.encode(law_data, convert_to_numpy=True)
17
 
18
  # Create FAISS index
19
+ dimension = embeddings.shape[1]
20
+ index = faiss.IndexFlatL2(dimension) # L2 Distance Index
21
+ index.add(embeddings) # Add vectors to FAISS index
22
 
23
  # Save FAISS index
24
  faiss.write_index(index, "faiss_index.bin")
25
 
26
+ print("✅ FAISS index saved successfully as 'faiss_index.bin'!")