tstone87 committed on
Commit
75f0c72
·
verified ·
1 Parent(s): a51fb99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -106
app.py CHANGED
@@ -1,98 +1,82 @@
1
  import os
2
- import shutil
3
  import faiss
4
  import numpy as np
5
  import gradio as gr
6
  from sentence_transformers import SentenceTransformer
7
- from huggingface_hub import HfApi, hf_hub_download, login
8
-
9
- # πŸ”Ή Hugging Face Repository Details
10
- HF_REPO_ID = "tstone87/repo" # Your dataset repo
11
- HF_TOKEN = os.getenv("HF_TOKEN") # Secure API token
12
-
13
- if not HF_TOKEN:
14
- raise ValueError("❌ ERROR: Hugging Face token not found. Add it as a secret in the Hugging Face Space settings.")
15
-
16
- # πŸ”Ή Authenticate with Hugging Face
17
- login(token=HF_TOKEN)
18
-
19
- # πŸ”Ή File Paths
20
- EMBEDDINGS_FILE = "policy_embeddings.npy"
21
- INDEX_FILE = "faiss_index.bin"
22
- TEXT_FILE = "combined_text_documents.txt"
23
-
24
- # πŸ”Ή Load policy text from file
25
- if os.path.exists(TEXT_FILE):
26
- with open(TEXT_FILE, "r", encoding="utf-8") as f:
27
- POLICY_TEXT = f.read()
28
- print("βœ… Loaded policy text from combined_text_documents.txt")
29
- else:
30
- print("❌ ERROR: combined_text_documents.txt not found! Ensure it's uploaded.")
31
- POLICY_TEXT = ""
32
-
33
- # πŸ”Ή Sentence Embedding Model (Optimized for Speed)
34
- model = SentenceTransformer("all-MiniLM-L6-v2")
35
-
36
- # πŸ”Ή Split policy text into chunks for FAISS indexing
37
- chunk_size = 500
38
- chunks = [POLICY_TEXT[i:i+chunk_size] for i in range(0, len(POLICY_TEXT), chunk_size)] if POLICY_TEXT else []
39
-
40
- # πŸ”Ή Function to Download FAISS Files from Hugging Face Hub if Available
41
- def download_faiss_from_hf():
42
- try:
43
- if not os.path.exists(EMBEDDINGS_FILE):
44
- print("πŸ“₯ Downloading FAISS embeddings from Hugging Face...")
45
- hf_hub_download(repo_id=HF_REPO_ID, filename=EMBEDDINGS_FILE, local_dir=".", token=HF_TOKEN)
46
-
47
- if not os.path.exists(INDEX_FILE):
48
- print("πŸ“₯ Downloading FAISS index from Hugging Face...")
49
- hf_hub_download(repo_id=HF_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
50
-
51
- print("βœ… FAISS files downloaded from Hugging Face.")
52
- return True
53
- except Exception as e:
54
- print(f"⚠️ FAISS files not found in Hugging Face repo. Recomputing... ({e})")
55
- return False
56
-
57
- # πŸ”Ή Check if FAISS Files Exist, Otherwise Download or Generate
58
- if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(INDEX_FILE):
59
- print("βœ… FAISS files found locally. Loading from disk...")
60
- embeddings = np.load(EMBEDDINGS_FILE)
61
- index = faiss.read_index(INDEX_FILE)
62
- elif download_faiss_from_hf():
63
- embeddings = np.load(EMBEDDINGS_FILE)
64
- index = faiss.read_index(INDEX_FILE)
65
- else:
66
- print("πŸš€ No FAISS files found. Creating new index...")
67
- if chunks:
68
- embeddings = np.array([model.encode(chunk) for chunk in chunks])
69
-
70
- # Save embeddings for future use
71
- np.save(EMBEDDINGS_FILE, embeddings)
72
-
73
- # Use FAISS optimized index for faster lookup
74
- d = embeddings.shape[1]
75
- nlist = 10 # Number of clusters
76
- index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, nlist)
77
- index.train(embeddings)
78
- index.add(embeddings)
79
- index.nprobe = 2 # Speed optimization
80
-
81
- # Save FAISS index
82
- faiss.write_index(index, INDEX_FILE)
83
- print("βœ… FAISS index created and saved.")
84
- else:
85
- print("❌ ERROR: No text to index. Check combined_text_documents.txt.")
86
- index = None
87
 
88
  # 🔹 Function to Search FAISS
89
  def search_policy(query, top_k=3):
90
- if index is None:
91
- return "Error: FAISS index is not available."
92
-
93
  query_embedding = model.encode(query).reshape(1, -1)
94
  distances, indices = index.search(query_embedding, top_k)
95
-
96
  return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])
97
 
98
  # 🔹 Hugging Face LLM Client
@@ -113,10 +97,8 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
113
  policy_context = search_policy(message)
114
 
115
  if policy_context:
116
- # πŸ”Ή Display retrieved context in chat
117
  messages.append({"role": "assistant", "content": f"πŸ“„ **Relevant Policy Context:**\n\n{policy_context}"})
118
 
119
- # πŸ”Ή Force the LLM to use the retrieved policy text
120
  user_query_with_context = f"""
121
  The following is the most relevant policy information retrieved from the official Colorado public assistance policies:
122
 
@@ -127,7 +109,6 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
127
  """
128
  messages.append({"role": "user", "content": user_query_with_context})
129
  else:
130
- # If no relevant policy info is found, use the original message
131
  messages.append({"role": "user", "content": message})
132
 
133
  response = ""
@@ -147,7 +128,7 @@ demo = gr.ChatInterface(
147
  respond,
148
  additional_inputs=[
149
  gr.Textbox(
150
- value="You are a knowledgeable and professional chatbot designed to assist Colorado case workers in determining eligibility for public assistance programs.",
151
  label="System message"
152
  ),
153
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
@@ -156,22 +137,5 @@ demo = gr.ChatInterface(
156
  ],
157
  )
158
 
159
- # πŸ”Ή Function to Provide FAISS Files for Download
160
- def download_faiss_files():
161
- if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(INDEX_FILE):
162
- shutil.copy(EMBEDDINGS_FILE, "/mnt/data/policy_embeddings.npy")
163
- shutil.copy(INDEX_FILE, "/mnt/data/faiss_index.bin")
164
- return "βœ… FAISS files ready for download! Check the 'Files' tab in your Hugging Face Space."
165
- else:
166
- return "❌ FAISS files not found. Run the chatbot first to generate them."
167
-
168
- # Gradio button for downloading FAISS files
169
- with gr.Blocks() as file_download:
170
- gr.Markdown("### πŸ”½ Download FAISS Files to Your Computer")
171
- download_button = gr.Button("Prepare FAISS Files for Download")
172
- output_text = gr.Textbox()
173
- download_button.click(fn=download_faiss_files, outputs=output_text)
174
-
175
  if __name__ == "__main__":
176
  demo.launch()
177
- file_download.launch()
 
1
  import os
2
+ import fitz # PyMuPDF for PDF reading
3
  import faiss
4
  import numpy as np
5
  import gradio as gr
6
  from sentence_transformers import SentenceTransformer
7
+ from huggingface_hub import hf_hub_download
8
+
9
+ # 🔹 Hugging Face Space Repository Details
10
+ HF_REPO_ID = "tstone87/ccr-colorado"
11
+
12
+ # 🔹 Load Embedding Model (Better for QA Retrieval)
13
+ model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
14
+
15
+ # 🔹 Define PDF Directory and Chunk Size
16
+ PDF_DIR = "./pdfs" # Local folder for downloaded PDFs
17
+ CHUNK_SIZE = 2500 # Larger chunks for better context
18
+
19
+ # 🔹 Ensure Directory Exists
20
+ os.makedirs(PDF_DIR, exist_ok=True)
21
+
22
+ # 🔹 Function to Download PDFs from Hugging Face Space
23
def download_pdfs():
    """Make sure every required policy PDF is present in ``PDF_DIR``.

    Each listed file is looked up under ``PDF_DIR``; a file that is missing
    is fetched from the ``HF_REPO_ID`` repository with ``hf_hub_download``.
    Files already on disk are left untouched. Progress is reported with
    ``print``. Nothing is returned.
    """
    # NOTE(review): no repo_type is passed to hf_hub_download, so the library
    # default applies. The surrounding comments describe the repo as a Space;
    # confirm the default resolves it.
    required_pdfs = (
        "SNAP 10 CCR 2506-1 .pdf",
        "Med 10 CCR 2505-10 8.100.pdf",
        # Add other PDFs here if necessary
    )

    for name in required_pdfs:
        if os.path.exists(os.path.join(PDF_DIR, name)):
            continue  # Already present locally; skip the download.
        print(f"πŸ“₯ Downloading {name}...")
        hf_hub_download(repo_id=HF_REPO_ID, filename=name, local_dir=PDF_DIR)

    print("βœ… All PDFs downloaded.")
37
+
38
+ # 🔹 Function to Extract Text from PDFs
39
def extract_text_from_pdfs():
    """Return the text of every ``.pdf`` file in ``PDF_DIR`` as one string.

    The text of each page is followed by a newline. Files are visited in
    sorted name order so the result is the same on every run;
    ``os.listdir`` by itself guarantees no particular order.

    Returns:
        str: The concatenated page text, or ``""`` when the directory
        contains no PDF pages.
    """
    page_texts = []
    for pdf_file in sorted(os.listdir(PDF_DIR)):
        if not pdf_file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(PDF_DIR, pdf_file)
        # Open as a context manager so the document is closed even if text
        # extraction raises; the bare fitz.open() never released the file.
        with fitz.open(pdf_path) as doc:
            page_texts.extend(page.get_text("text") for page in doc)

    # Single join instead of repeated += on a growing string.
    return "".join(f"{text}\n" for text in page_texts)
49
+
50
+ # 🔹 Initialize FAISS and Embed Text
51
def initialize_faiss():
    """Download the PDFs, embed their text, and build the FAISS index.

    The extracted text is cut into consecutive ``CHUNK_SIZE``-character
    slices, each slice is embedded with the module-level ``model``, and the
    vectors are added to a flat L2 index in the same order as the slices.

    Returns:
        tuple: ``(index, chunks)`` where ``index`` is the populated FAISS
        index and ``chunks`` is the list of text slices; row ``i`` of the
        index corresponds to ``chunks[i]``.

    Raises:
        ValueError: If the PDFs produced no text other than whitespace.
    """
    download_pdfs()
    text_data = extract_text_from_pdfs()

    # Page text is joined with newlines, so PDFs with no extractable text
    # still give a non-empty string; test for real content, not just length.
    if not text_data.strip():
        raise ValueError("❌ No text extracted from PDFs!")

    # Split text into chunks
    chunks = [text_data[i:i + CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]

    # Encode all chunks in one call so the model can batch them, and hand
    # FAISS the contiguous float32 matrix it requires.
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)

    # NOTE(review): the index ranks by L2 distance while the model name
    # ("...-dot-v1") suggests dot-product scoring; confirm the intended metric.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    print("βœ… FAISS index initialized.")

    return index, chunks
71
+
72
+ # 🔹 Initialize FAISS on Startup
73
+ index, chunks = initialize_faiss()
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # 🔹 Function to Search FAISS
76
  def search_policy(query, top_k=3):
 
 
 
77
  query_embedding = model.encode(query).reshape(1, -1)
78
  distances, indices = index.search(query_embedding, top_k)
79
+
80
  return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])
81
 
82
  # 🔹 Hugging Face LLM Client
 
97
  policy_context = search_policy(message)
98
 
99
  if policy_context:
 
100
  messages.append({"role": "assistant", "content": f"πŸ“„ **Relevant Policy Context:**\n\n{policy_context}"})
101
 
 
102
  user_query_with_context = f"""
103
  The following is the most relevant policy information retrieved from the official Colorado public assistance policies:
104
 
 
109
  """
110
  messages.append({"role": "user", "content": user_query_with_context})
111
  else:
 
112
  messages.append({"role": "user", "content": message})
113
 
114
  response = ""
 
128
  respond,
129
  additional_inputs=[
130
  gr.Textbox(
131
+ value="You are a knowledgeable chatbot designed to assist Colorado case workers with Medicaid, SNAP, TANF, CHP+, and other programs.",
132
  label="System message"
133
  ),
134
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
 
137
  ],
138
  )
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  if __name__ == "__main__":
141
  demo.launch()