Penality committed on
Commit
c8d88b2
·
verified ·
1 Parent(s): 5dc8c5b

Update app.py

Browse files

Updated FAISS indexing and the storage of files and metadata to avoid KeyError: '0'

Files changed (1) hide show
  1. app.py +32 -31
app.py CHANGED
@@ -30,58 +30,55 @@ embedding_dim = 768 # Adjust according to model
30
  index = faiss.IndexFlatL2(embedding_dim)
31
  documents = [] # Store raw text for reference
32
 
33
-
34
- # initialize the variables to store documents
35
  DOCUMENT_DIR = os.path.join(os.path.dirname(__file__), "documents")
36
- INDEX_FILE = "faiss_index.py" # stores embeddings
37
- METADATA_FILE = "metadata.json" # stores Document metadata
38
 
39
- # create the directory
40
  os.makedirs(DOCUMENT_DIR, exist_ok=True)
41
 
42
- # load the faiss indexes file
43
- if os.path.exists(INDEX_FILE): # check if index file exists
44
- stored_embeddings = np.load(INDEX_FILE) # load emeddings
45
- if stored_embeddings.shape[0] > 0:
46
- index.add(stored_embeddings)
47
 
48
- # load the document metadata
49
- if os.path.exists(METADATA_FILE): # check if metadata exists
50
  with open(METADATA_FILE, "r") as f:
51
  metadata = json.load(f)
52
  else:
53
  metadata = {}
54
 
55
  def store_document(text):
56
- print("storing document")
57
 
58
  # Generate a unique filename
59
- filename = os.path.join(DOCUMENT_DIR, f"doc_{len(metadata) + 1}.txt")
 
 
60
 
61
- print(filename)
62
-
63
- # Save document in a file
64
- with open(filename, "w") as f:
65
  f.write(text)
 
66
 
67
- print("document saved")
68
-
69
- # Generate and store embedding
70
  embedding = embedding_model.encode([text]).astype(np.float32)
71
- index.add(embedding)
 
72
 
73
- print("emeddings generated")
74
-
75
- # Update metadata
76
- metadata[len(metadata)] = filename
 
77
  with open(METADATA_FILE, "w") as f:
78
  json.dump(metadata, f)
79
 
80
- # Save FAISS index
81
- np.save(INDEX_FILE, index.reconstruct_n(0, index.ntotal))
82
-
83
- print(f"your document has been stored at: {filename}")
84
-
85
  return "Document stored!"
86
 
87
  def retrieve_document(query):
@@ -90,6 +87,10 @@ def retrieve_document(query):
90
  query_embedding = embedding_model.encode([query]).astype(np.float32)
91
  _, closest_idx = index.search(query_embedding, 1)
92
 
 
 
 
 
93
  if closest_idx[0][0] in metadata: # Ensure a valid match
94
  filename = metadata[str(closest_idx[0][0])]
95
  with open(filename, "r") as f:
 
30
  index = faiss.IndexFlatL2(embedding_dim)
31
  documents = [] # Store raw text for reference
32
 
33
+ # Initialize paths
 
34
  DOCUMENT_DIR = os.path.join(os.path.dirname(__file__), "documents")
35
+ INDEX_FILE = "faiss_index.bin" # FAISS index file (binary format)
36
+ METADATA_FILE = "metadata.json" # Document metadata
37
 
38
+ # Create the documents directory if it doesn’t exist
39
  os.makedirs(DOCUMENT_DIR, exist_ok=True)
40
 
41
+ # Load FAISS index if it exists
42
+ if os.path.exists(INDEX_FILE):
43
+ index = faiss.read_index(INDEX_FILE)
 
 
44
 
45
+ # Load metadata
46
+ if os.path.exists(METADATA_FILE):
47
  with open(METADATA_FILE, "r") as f:
48
  metadata = json.load(f)
49
  else:
50
  metadata = {}
51
 
52
  def store_document(text):
53
+ print(" Storing document...")
54
 
55
  # Generate a unique filename
56
+ doc_id = len(metadata) + 1
57
+ filename = os.path.join(DOCUMENT_DIR, f"doc_{doc_id}.txt")
58
+ print(f"Saving document at: {filename}")
59
 
60
+ # Save document to file
61
+ with open(filename, "w", encoding="utf-8") as f:
 
 
62
  f.write(text)
63
+ print(" Document saved")
64
 
65
+ # Generate and store embedding
 
 
66
  embedding = embedding_model.encode([text]).astype(np.float32)
67
+ index.add(embedding) # Add to FAISS index
68
+ print(" Embeddings generated")
69
 
70
+ # Get FAISS index for the new document
71
+ doc_index = index.ntotal - 1
72
+
73
+ # Update metadata with FAISS index
74
+ metadata[str(doc_index)] = filename
75
  with open(METADATA_FILE, "w") as f:
76
  json.dump(metadata, f)
77
 
78
+ # Save FAISS index properly
79
+ faiss.write_index(index, INDEX_FILE)
80
+
81
+ print(f" Document stored successfully at: {filename}")
 
82
  return "Document stored!"
83
 
84
  def retrieve_document(query):
 
87
  query_embedding = embedding_model.encode([query]).astype(np.float32)
88
  _, closest_idx = index.search(query_embedding, 1)
89
 
90
+ if not closest_idx or closest_idx[0][0] not in metadata:
91
+ return "No relevant document found."
92
+
93
+
94
  if closest_idx[0][0] in metadata: # Ensure a valid match
95
  filename = metadata[str(closest_idx[0][0])]
96
  with open(filename, "r") as f: