tstone87 committed on
Commit
0864cdf
·
verified ·
1 Parent(s): 8236a6f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import fitz # PyMuPDF for PDF reading
4
+ import faiss
5
+ import numpy as np
6
+ import gradio as gr
7
+ from sentence_transformers import SentenceTransformer
8
+ from huggingface_hub import InferenceClient
9
+
# 🔹 Define PDF Directory and Chunk Size
PDF_DIR = "./pdfs"  # local directory the policy PDFs are downloaded into
CHUNK_SIZE = 2500  # characters per text chunk; larger chunks for better context

# 🔹 Ensure Directory Exists
os.makedirs(PDF_DIR, exist_ok=True)

# 🔹 Direct URLs for PDF Downloads (Colorado Policy Documents)
# Maps the local file name (created inside PDF_DIR) to the URL it is fetched from.
PDF_FILES = {
    "SNAP 10 CCR 2506-1.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/SNAP%2010%20CCR%202506-1%20.pdf?download=true",
    "Med 10 CCR 2505-10 8.100.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/Med%2010%20CCR%202505-10%208.100.pdf?download=true",
}
# 🔹 Function to Download PDFs

def download_pdfs():
    """Download every PDF listed in ``PDF_FILES`` into ``PDF_DIR``.

    Files that already exist locally are skipped. The download is
    best-effort: a failure for one file is reported on stdout and the
    remaining files are still attempted.

    Each file is streamed to a temporary ``.part`` path and renamed into
    place only after the transfer completes, so an interrupted download
    never leaves a truncated PDF that a later run would mistake for a
    finished one.
    """
    for filename, url in PDF_FILES.items():
        pdf_path = os.path.join(PDF_DIR, filename)
        if os.path.exists(pdf_path):
            continue

        print(f"📥 Downloading {filename}...")
        tmp_path = pdf_path + ".part"
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block application startup indefinitely.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Atomic on the same filesystem: the final name only ever refers
            # to a complete file.
            os.replace(tmp_path, pdf_path)
            print(f"✅ Downloaded {filename}")
        except (requests.RequestException, OSError) as e:
            print(f"❌ Error downloading {filename}: {e}")
            # Drop the partial file so the next run retries from scratch.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
# 🔹 Function to Extract Text from PDFs

def extract_text_from_pdfs():
    """Return the concatenated plain text of every ``.pdf`` file in ``PDF_DIR``.

    Files are processed in sorted name order so the resulting text (and the
    chunks later cut from it) is the same on every run; ``os.listdir`` alone
    gives no ordering guarantee. Each page's text is followed by a newline.

    Returns:
        str: The text of all pages of all PDFs, or ``""`` when the directory
        holds no PDF files.
    """
    page_texts = []
    for pdf_file in sorted(os.listdir(PDF_DIR)):
        if not pdf_file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(PDF_DIR, pdf_file)
        # Context manager closes the document (and its file handle) even if
        # text extraction raises.
        with fitz.open(pdf_path) as doc:
            page_texts.extend(page.get_text("text") + "\n" for page in doc)
    # Single join instead of repeated string += in the page loop.
    return "".join(page_texts)
# 🔹 Initialize FAISS Index

def initialize_faiss():
    """Build the FAISS index over the policy PDFs.

    Downloads any missing PDFs, extracts their text, cuts it into
    fixed-size ``CHUNK_SIZE``-character chunks, embeds each chunk and adds
    the embeddings to a flat L2 index.

    Returns:
        tuple: ``(index, chunks)`` where ``index`` is the populated FAISS
        index and ``chunks`` is the list of text chunks; row ``i`` of the
        index corresponds to ``chunks[i]``.

    Raises:
        ValueError: If no text (or only whitespace) could be extracted.
    """
    download_pdfs()
    text_data = extract_text_from_pdfs()
    # Whitespace-only output (e.g. image-only PDFs yielding just the page
    # separators) is as useless as an empty string.
    if not text_data.strip():
        raise ValueError("❌ No text extracted from PDFs!")

    chunks = [text_data[i:i + CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    # Encode the whole list in one call so the model can batch internally,
    # and force float32, the dtype FAISS expects.
    embeddings = np.asarray(model.encode(chunks), dtype=np.float32)
    # NOTE(review): the model name suggests it is tuned for dot-product
    # scoring while this index ranks by L2 distance - confirm whether
    # faiss.IndexFlatIP would be the better match.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    print("✅ FAISS index initialized.")
    return index, chunks
# 🔹 Initialize FAISS on Startup
# Built once at import time; search_policy() reads these module-level globals.
index, chunks = initialize_faiss()
# 🔹 Function to Search FAISS

# Query encoder, created on first use and then reused for every query.
_query_model = None


def _get_query_model():
    """Return the shared query encoder, loading it on first use."""
    global _query_model
    if _query_model is None:
        _query_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    return _query_model


def search_policy(query, top_k=3):
    """Return the policy text chunks most similar to ``query``.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        str: The matching chunks joined by blank lines, best match first;
        ``""`` when nothing matched.
    """
    # Reuse one model instance; constructing a SentenceTransformer per query
    # reloads the weights on every chat message.
    query_embedding = np.asarray(
        _get_query_model().encode(query), dtype=np.float32
    ).reshape(1, -1)
    _distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors exist;
    # without the lower bound, chunks[-1] would silently return the last chunk.
    return "\n\n".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))
# 🔹 Hugging Face LLM Client
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# 🔹 Function to Handle Chat Responses

def respond(message, history):
    """Stream a chat reply to ``message``, grounded in retrieved policy text.

    Args:
        message: The latest user message.
        history: Earlier turns as a sequence of ``(user, assistant)`` pairs;
            empty entries in a pair are skipped.

    Yields:
        str: The reply accumulated so far, growing as tokens arrive.
    """
    messages = [{"role": "system", "content": "You are a chatbot specializing in Colorado public assistance programs."}]
    for turn in history:
        user_turn, assistant_turn = turn[0], turn[1]
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    # Retrieved policy excerpts are injected just before the new user message.
    policy_context = search_policy(message)
    if policy_context:
        messages.append({"role": "assistant", "content": f"📄 **Colorado Policy Info:**\n\n{policy_context}"})

    messages.append({"role": "user", "content": message})
    response = ""
    # Loop variable deliberately not named ``message`` so the parameter is
    # not shadowed.
    for chunk in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
        # Presumably some stream chunks can arrive without choices (e.g.
        # bookkeeping frames) - skip them rather than index into an empty list.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # delta.content can be None (typically on the final chunk);
        # concatenating it would raise TypeError mid-stream.
        if token:
            response += token
        yield response
# 🔹 Gradio Chat Interface (Colorado-Themed)
demo = gr.ChatInterface(
    respond,
    textbox=gr.Textbox(placeholder="Ask about Colorado public assistance programs...", interactive=True, show_label=False),
    submit_btn=gr.Button("Send"),
    chatbot=gr.Chatbot(),
)

demo.launch()