Penality committed on
Commit
1ed3cce
·
verified ·
1 Parent(s): c94d8ae

Update app.py

Browse files

Updated code to store files inside the Hugging Face dataset folder

Files changed (1) hide show
  1. app.py +19 -32
app.py CHANGED
@@ -24,17 +24,23 @@ embedding_model = SentenceTransformer(
24
  "togethercomputer/m2-bert-80M-8k-retrieval",
25
  trust_remote_code=True # Allow remote code execution
26
  )
 
 
 
 
 
 
 
 
 
27
  embedding_dim = 768 # Adjust according to model
28
 
29
  # Initialize FAISS index
30
  index = faiss.IndexFlatL2(embedding_dim)
31
 
32
- # Initialize paths
33
- INDEX_FILE = "faiss_index.bin" # FAISS index file (binary format)
34
- METADATA_FILE = "metadata.json" # Document metadata
35
-
36
- print(os.getcwd()) # This will print the current working directory
37
- print(os.listdir(".")) # This will show files in the current director
38
 
39
  # Load FAISS index if it exists
40
  if os.path.exists(INDEX_FILE):
@@ -46,7 +52,7 @@ else:
46
 
47
  # Load metadata
48
  if os.path.exists(METADATA_FILE):
49
- print("metadata exists")
50
  with open(METADATA_FILE, "r") as f:
51
  metadata = json.load(f)
52
  else:
@@ -55,9 +61,9 @@ else:
55
  def store_document(text):
56
  print(" Storing document...")
57
 
58
- # Generate a unique filename
59
  doc_id = len(metadata) + 1
60
- filename = f"doc_{doc_id}.txt"
61
  print(f"Saving document at: {filename}")
62
 
63
  # Save document to file
@@ -76,33 +82,14 @@ def store_document(text):
76
  # Update metadata with FAISS index
77
  metadata[str(doc_index)] = filename
78
  with open(METADATA_FILE, "w") as f:
79
- print(metadata)
80
  json.dump(metadata, f)
81
- print("saved Metadata")
82
 
83
- # Save FAISS index properly
84
  faiss.write_index(index, INDEX_FILE)
 
85
 
86
- print(f" Document stored successfully at: {filename}")
87
- return "Document stored!"
88
-
89
- def retrieve_document(query):
90
- print(f"retrieving doc based on: \n{query}")
91
-
92
- query_embedding = embedding_model.encode([query]).astype(np.float32)
93
- _, closest_idx = index.search(query_embedding, 1)
94
-
95
- if not closest_idx or closest_idx[0][0] not in metadata:
96
- print("No relevant Document found")
97
- return None
98
-
99
-
100
- if closest_idx[0][0] in metadata: # Ensure a valid match
101
- filename = metadata[str(closest_idx[0][0])]
102
- with open(filename, "r") as f:
103
- return f.read()
104
- else:
105
- return None
106
 
107
 
108
  def clean_text(text):
 
24
  "togethercomputer/m2-bert-80M-8k-retrieval",
25
  trust_remote_code=True # Allow remote code execution
26
  )
27
+
28
+ # Define dataset storage folder
29
+ DATASET_DIR = "/home/user/.cache/huggingface/datasets/my_documents"
30
+ os.makedirs(DATASET_DIR, exist_ok=True) # Ensure directory exists
31
+
32
+ # Define file paths inside dataset folder
33
+ INDEX_FILE = os.path.join(DATASET_DIR, "faiss_index.bin") # FAISS index file
34
+ METADATA_FILE = os.path.join(DATASET_DIR, "metadata.json") # Metadata file
35
+
36
  embedding_dim = 768 # Adjust according to model
37
 
38
  # Initialize FAISS index
39
  index = faiss.IndexFlatL2(embedding_dim)
40
 
41
+ # Debugging: Check working directory and available files
42
+ print("Current working directory:", os.getcwd())
43
+ print("Files in dataset directory:", os.listdir(DATASET_DIR))
 
 
 
44
 
45
  # Load FAISS index if it exists
46
  if os.path.exists(INDEX_FILE):
 
52
 
53
  # Load metadata
54
  if os.path.exists(METADATA_FILE):
55
+ print(" Metadata file exists")
56
  with open(METADATA_FILE, "r") as f:
57
  metadata = json.load(f)
58
  else:
 
61
  def store_document(text):
62
  print(" Storing document...")
63
 
64
+ # Generate a unique filename inside the dataset folder
65
  doc_id = len(metadata) + 1
66
+ filename = os.path.join(DATASET_DIR, f"doc_{doc_id}.txt")
67
  print(f"Saving document at: {filename}")
68
 
69
  # Save document to file
 
82
  # Update metadata with FAISS index
83
  metadata[str(doc_index)] = filename
84
  with open(METADATA_FILE, "w") as f:
 
85
  json.dump(metadata, f)
86
+ print(" Saved Metadata")
87
 
88
+ # Save FAISS index
89
  faiss.write_index(index, INDEX_FILE)
90
+ print(" FAISS index saved")
91
 
92
+ return f"Document stored at: {filename}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  def clean_text(text):