tstone87 commited on
Commit
4e46bd2
·
verified ·
1 Parent(s): 8c0d33b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import functools
import os

import faiss
import fitz  # PyMuPDF for PDF reading
import gradio as gr
import numpy as np
import requests
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
9
+
10
# Directory layout and chunking configuration.
APP_DIR = "./"      # Root app folder (where the Space's own PDFs live)
PDF_DIR = "./pdfs"  # Local cache directory for downloaded PDFs
CHUNK_SIZE = 2500   # Characters per chunk; larger chunks keep more context

# Make sure the PDF cache directory exists before anything touches it.
os.makedirs(PDF_DIR, exist_ok=True)
18
# Auto-detect PDFs in the app folder and build a download link for each.
def get_pdf_links():
    """Map each *.pdf filename found in APP_DIR to its HF Space download URL."""
    base_url = "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/"
    links = {}

    for entry in os.listdir(APP_DIR):
        if not entry.endswith(".pdf"):
            continue
        # URL-encode the filename so spaces and special characters survive.
        links[entry] = f"{base_url}{requests.utils.quote(entry)}?download=true"

    return links
30
# Snapshot of available PDFs (filename -> download URL), taken at import time.
PDF_FILES = get_pdf_links()
33
# Download every PDF listed in PDF_FILES into PDF_DIR (skipping cached files).
def download_pdfs():
    """Fetch each PDF in PDF_FILES into PDF_DIR unless it already exists.

    A failure on one file is logged and does not abort the remaining
    downloads. Files are streamed to disk in 8 KiB chunks.
    """
    for filename, url in PDF_FILES.items():
        pdf_path = os.path.join(PDF_DIR, filename)
        if os.path.exists(pdf_path):
            continue  # already cached locally

        # BUG FIX: the messages printed a literal "(unknown)" because the
        # f-string placeholders were lost; report the actual filename.
        print(f"📥 Downloading {filename}...")
        try:
            # Timeout so a stalled server cannot hang app startup forever.
            response = requests.get(url, stream=True, timeout=60)
            response.raise_for_status()  # Ensure the request was successful

            with open(pdf_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"✅ Successfully downloaded {filename}")
        except Exception as e:
            print(f"❌ Error downloading {filename}: {e}")

    print("✅ All PDFs downloaded.")
53
# Extract the raw text of every PDF stored in PDF_DIR.
def extract_text_from_pdfs():
    """Return the concatenated text of all PDFs in PDF_DIR.

    Each page's text is followed by a newline, preserving the original
    "page text + \\n" layout. Returns an empty string if no PDFs exist.
    """
    pages = []
    for pdf_file in os.listdir(PDF_DIR):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(PDF_DIR, pdf_file)
            # Context manager closes the document handle — the original
            # called fitz.open() and never closed it, leaking handles.
            with fitz.open(pdf_path) as doc:
                pages.extend(page.get_text("text") + "\n" for page in doc)

    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(pages)
65
# Build the FAISS index over chunked PDF text (runs once at startup).
def initialize_faiss():
    """Download the PDFs, chunk their text, embed the chunks, and index them.

    Returns:
        (faiss.IndexFlatL2, list[str]): the populated L2 index and the text
        chunks, in the same order as the indexed vectors.

    Raises:
        ValueError: if no text could be extracted from the PDFs.
    """
    download_pdfs()
    text_data = extract_text_from_pdfs()

    if not text_data:
        raise ValueError("❌ No text extracted from PDFs!")

    # Fixed-size character chunks; the final chunk may be shorter.
    chunks = [text_data[i:i + CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]

    # Encode all chunks in one batched call — much faster than the original
    # per-chunk encode loop — and force float32, which FAISS requires.
    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    embeddings = np.asarray(model.encode(chunks), dtype=np.float32)

    # Flat (exact) L2 index sized to the embedding dimensionality.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    print("✅ FAISS index initialized.")

    return index, chunks
88
# Build the index once at import time so the first user query is fast.
index, chunks = initialize_faiss()
91
@functools.lru_cache(maxsize=1)
def _query_model():
    """Create the query-embedding model once and reuse it on every call.

    The original constructed a fresh SentenceTransformer per query, which
    reloads the model weights every time and is extremely slow.
    """
    return SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Retrieve the most relevant text chunks for a query via FAISS.
def search_policy(query, top_k=3):
    """Return the top_k chunks nearest to *query*, joined by blank lines.

    Args:
        query: free-text question to embed and search with.
        top_k: number of nearest chunks to retrieve (default 3).
    """
    query_embedding = _query_model().encode(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads missing results with -1; the original's `i < len(chunks)`
    # let -1 through (duplicating the last chunk via negative indexing).
    return "\n\n".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))
98
# Hosted LLM used to generate answers (Zephyr 7B via the HF Inference API).
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
101
# Streamed chat handler wired into gr.ChatInterface below.
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Yield a progressively longer answer string as tokens stream in.

    Args:
        message: the user's newest question.
        history: list of (user, assistant) message pairs from the UI.
        system_message: system prompt from the UI textbox.
        max_tokens, temperature, top_p: generation controls from the UI.

    Yields:
        The accumulated response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    # Retrieve relevant policy text from FAISS and fold it into the prompt.
    policy_context = search_policy(message)

    if policy_context:
        messages.append({"role": "assistant", "content": f"📄 **Relevant Policy Context:**\n\n{policy_context}"})

        user_query_with_context = f"""
The following is the most relevant policy information retrieved from the official Colorado public assistance policies:

{policy_context}

Based on this information, answer the following question:
{message}
"""
        messages.append({"role": "user", "content": user_query_with_context})
    else:
        messages.append({"role": "user", "content": message})

    response = ""
    # BUG FIX: the original reused `message` as the loop variable, shadowing
    # the user's question inside the streaming loop.
    for event in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = event.choices[0].delta.content
        if token:  # streamed deltas can carry None content (role-only chunks)
            response += token
        yield response
141
# Gradio chat UI: wires respond() to a chat window plus generation controls
# (system prompt, token budget, temperature, nucleus-sampling cutoff).
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a knowledgeable chatbot designed to assist Colorado case workers with Medicaid, SNAP, TANF, CHP+, and other programs.",
            label="System message"
        ),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

# Launch the app when run as a script (HF Spaces executes this module directly).
if __name__ == "__main__":
    demo.launch()