Update app.py
app.py
CHANGED
@@ -1,98 +1,82 @@
 import os
-import …
+import fitz  # PyMuPDF for PDF reading
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-from huggingface_hub import …
-
-# 🔹 Hugging Face Repository Details
-HF_REPO_ID = "tstone87/…
- … (old lines 11–73, also removed; their content is not shown in this view)
-    d = embeddings.shape[1]
-    nlist = 10  # Number of clusters
-    index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, nlist)
-    index.train(embeddings)
-    index.add(embeddings)
-    index.nprobe = 2  # Speed optimization
-
-    # Save FAISS index
-    faiss.write_index(index, INDEX_FILE)
-    print("✅ FAISS index created and saved.")
-else:
-    print("❌ ERROR: No text to index. Check combined_text_documents.txt.")
-    index = None
+from huggingface_hub import hf_hub_download
+
+# 🔹 Hugging Face Space Repository Details
+HF_REPO_ID = "tstone87/ccr-colorado"
+
+# 🔹 Load Embedding Model (Better for QA Retrieval)
+model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
+
+# 🔹 Define PDF Directory and Chunk Size
+PDF_DIR = "./pdfs"  # Local folder for downloaded PDFs
+CHUNK_SIZE = 2500  # Larger chunks for better context
+
+# 🔹 Ensure Directory Exists
+os.makedirs(PDF_DIR, exist_ok=True)
+
+# 🔹 Function to Download PDFs from Hugging Face Space
+def download_pdfs():
+    pdf_files = [
+        "SNAP 10 CCR 2506-1 .pdf",
+        "Med 10 CCR 2505-10 8.100.pdf",
+        # Add other PDFs here if necessary
+    ]
+
+    for pdf_file in pdf_files:
+        pdf_path = os.path.join(PDF_DIR, pdf_file)
+        if not os.path.exists(pdf_path):  # Download if not already present
+            print(f"📥 Downloading {pdf_file}...")
+            hf_hub_download(repo_id=HF_REPO_ID, filename=pdf_file, local_dir=PDF_DIR)
+
+    print("✅ All PDFs downloaded.")
+
+# 🔹 Function to Extract Text from PDFs
+def extract_text_from_pdfs():
+    all_text = ""
+    for pdf_file in os.listdir(PDF_DIR):
+        if pdf_file.endswith(".pdf"):
+            pdf_path = os.path.join(PDF_DIR, pdf_file)
+            doc = fitz.open(pdf_path)
+            for page in doc:
+                all_text += page.get_text("text") + "\n"
+
+    return all_text
+
+# 🔹 Initialize FAISS and Embed Text
+def initialize_faiss():
+    download_pdfs()
+    text_data = extract_text_from_pdfs()
+
+    if not text_data:
+        raise ValueError("❌ No text extracted from PDFs!")
+
+    # Split text into chunks
+    chunks = [text_data[i:i+CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
+
+    # Generate embeddings
+    embeddings = np.array([model.encode(chunk) for chunk in chunks])
+
+    # Create FAISS index
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
+
+    print("✅ FAISS index initialized.")
+
+    return index, chunks
+
+# 🔹 Initialize FAISS on Startup
+index, chunks = initialize_faiss()
 
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
-    if index is None:
-        return "Error: FAISS index is not available."
-
     query_embedding = model.encode(query).reshape(1, -1)
     distances, indices = index.search(query_embedding, top_k)
 
     return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])
 
 # 🔹 Hugging Face LLM Client
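The block added in the hunk above follows a standard retrieval pattern: split the extracted PDF text into fixed-size chunks, embed each chunk, add the embeddings to a FAISS index, and at query time embed the question and return the nearest chunks. Below is a minimal, self-contained sketch of that pattern. It is not part of the commit: the two chunk strings are made-up placeholders, and it assumes sentence-transformers, faiss-cpu, and numpy are installed.

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Two placeholder "chunks" standing in for slices of the extracted policy text.
chunks = [
    "Placeholder chunk about SNAP reporting requirements.",
    "Placeholder chunk about Medicaid eligibility reviews.",
]

# Encode each chunk; SentenceTransformer returns float32 vectors, which FAISS expects.
embeddings = np.array([model.encode(chunk) for chunk in chunks])

# Exact (brute-force) L2 index over the chunk embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Embed a question and return the single nearest chunk.
query_embedding = model.encode("How often is eligibility reviewed?").reshape(1, -1)
distances, indices = index.search(query_embedding, 1)
print(chunks[indices[0][0]])

Because IndexFlatL2 scans every stored vector, it needs no training step; that is the main practical difference from the IVF index the old version of the file built (see the note after the diff).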
@@ -113,10 +97,8 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     policy_context = search_policy(message)
 
     if policy_context:
-        # 🔹 Display retrieved context in chat
         messages.append({"role": "assistant", "content": f"📜 **Relevant Policy Context:**\n\n{policy_context}"})
 
-        # 🔹 Force the LLM to use the retrieved policy text
         user_query_with_context = f"""
         The following is the most relevant policy information retrieved from the official Colorado public assistance policies:
 
@@ -127,7 +109,6 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         """
         messages.append({"role": "user", "content": user_query_with_context})
     else:
-        # If no relevant policy info is found, use the original message
         messages.append({"role": "user", "content": message})
 
     response = ""
@@ -147,7 +128,7 @@ demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(
-            value="You are a knowledgeable …
+            value="You are a knowledgeable chatbot designed to assist Colorado case workers with Medicaid, SNAP, TANF, CHP+, and other programs.",
             label="System message"
         ),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
@@ -156,22 +137,5 @@ demo = gr.ChatInterface(
     ],
 )
 
-# 🔹 Function to Provide FAISS Files for Download
-def download_faiss_files():
-    if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(INDEX_FILE):
-        shutil.copy(EMBEDDINGS_FILE, "/mnt/data/policy_embeddings.npy")
-        shutil.copy(INDEX_FILE, "/mnt/data/faiss_index.bin")
-        return "✅ FAISS files ready for download! Check the 'Files' tab in your Hugging Face Space."
-    else:
-        return "❌ FAISS files not found. Run the chatbot first to generate them."
-
-# Gradio button for downloading FAISS files
-with gr.Blocks() as file_download:
-    gr.Markdown("### 💽 Download FAISS Files to Your Computer")
-    download_button = gr.Button("Prepare FAISS Files for Download")
-    output_text = gr.Textbox()
-    download_button.click(fn=download_faiss_files, outputs=output_text)
-
 if __name__ == "__main__":
     demo.launch()
-    file_download.launch()
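The code removed at the end of the first hunk built an IVF index (clustered, trained, with nprobe tuning) rather than the flat index the new version uses. For reference, here is a small illustrative sketch of that IVF pattern with random vectors; it is a stand-in and not the removed code itself, it assumes faiss-cpu and numpy are installed, and the 768 dimension is only an example value.

import faiss
import numpy as np

d = 768                               # example embedding dimension
embeddings = np.random.random((200, d)).astype("float32")  # random stand-in vectors

nlist = 10                            # number of clusters, as in the removed code
quantizer = faiss.IndexFlatL2(d)      # coarse quantizer that assigns vectors to clusters
index = faiss.IndexIVFFlat(quantizer, d, nlist)

index.train(embeddings)               # IVF indexes must be trained before vectors are added
index.add(embeddings)
index.nprobe = 2                      # scan only 2 of the 10 clusters per query (speed vs. recall)

distances, indices = index.search(embeddings[:1], 3)
print(indices)

The trade-off is speed against recall: the removed version scanned only a couple of clusters per query, while the new IndexFlatL2 scans every vector but is simpler, exact, and needs no training.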