Spaces:

tstone87
/

ccr-colorado

Running

App Files Files Community

tstone87 committed on Feb 2

Commit

75f0c72

verified ·

1 Parent(s): a51fb99

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -106

app.py CHANGED Viewed

@@ -1,98 +1,82 @@
 import os
-import shutil
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-from huggingface_hub import HfApi, hf_hub_download, login
-# 🔹 Hugging Face Repository Details
-HF_REPO_ID = "tstone87/repo"  # Your dataset repo
-HF_TOKEN = os.getenv("HF_TOKEN")  # Secure API token
-if not HF_TOKEN:
-    raise ValueError("❌ ERROR: Hugging Face token not found. Add it as a secret in the Hugging Face Space settings.")
-# 🔹 Authenticate with Hugging Face
-login(token=HF_TOKEN)
-# 🔹 File Paths
-EMBEDDINGS_FILE = "policy_embeddings.npy"
-INDEX_FILE = "faiss_index.bin"
-TEXT_FILE = "combined_text_documents.txt"
-# 🔹 Load policy text from file
-if os.path.exists(TEXT_FILE):
-    with open(TEXT_FILE, "r", encoding="utf-8") as f:
-        POLICY_TEXT = f.read()
-    print("✅ Loaded policy text from combined_text_documents.txt")
-else:
-    print("❌ ERROR: combined_text_documents.txt not found! Ensure it's uploaded.")
-    POLICY_TEXT = ""
-# 🔹 Sentence Embedding Model (Optimized for Speed)
-model = SentenceTransformer("all-MiniLM-L6-v2")
-# 🔹 Split policy text into chunks for FAISS indexing
-chunk_size = 500
-chunks = [POLICY_TEXT[i:i+chunk_size] for i in range(0, len(POLICY_TEXT), chunk_size)] if POLICY_TEXT else []
-# 🔹 Function to Download FAISS Files from Hugging Face Hub if Available
-def download_faiss_from_hf():
-    """Download whichever FAISS artifacts are missing locally.
-
-    Returns True when every needed download succeeded (or nothing needed
-    downloading), and False when any exception was raised along the way.
-    """
-    try:
-        # Each artifact is paired with the progress line printed before its download.
-        artifacts = (
-            (EMBEDDINGS_FILE, "📥 Downloading FAISS embeddings from Hugging Face..."),
-            (INDEX_FILE, "📥 Downloading FAISS index from Hugging Face..."),
-        )
-        for artifact_name, progress_line in artifacts:
-            if os.path.exists(artifact_name):
-                continue
-            print(progress_line)
-            hf_hub_download(repo_id=HF_REPO_ID, filename=artifact_name, local_dir=".", token=HF_TOKEN)
-        print("✅ FAISS files downloaded from Hugging Face.")
-        return True
-    except Exception as e:
-        # Any failure is reported and signals the caller to rebuild the index.
-        print(f"⚠️ FAISS files not found in Hugging Face repo. Recomputing... ({e})")
-        return False
-# 🔹 Check if FAISS Files Exist, Otherwise Download or Generate
-if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(INDEX_FILE):
-    print("✅ FAISS files found locally. Loading from disk...")
-    embeddings = np.load(EMBEDDINGS_FILE)
-    index = faiss.read_index(INDEX_FILE)
-elif download_faiss_from_hf():
-    embeddings = np.load(EMBEDDINGS_FILE)
-    index = faiss.read_index(INDEX_FILE)
-else:
-    print("🚀 No FAISS files found. Creating new index...")
-    if chunks:
-        embeddings = np.array([model.encode(chunk) for chunk in chunks])
-        # Save embeddings for future use
-        np.save(EMBEDDINGS_FILE, embeddings)
-        # Use FAISS optimized index for faster lookup
-        d = embeddings.shape[1]
-        nlist = 10  # Number of clusters
-        index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, nlist)
-        index.train(embeddings)
-        index.add(embeddings)
-        index.nprobe = 2  # Speed optimization
-        # Save FAISS index
-        faiss.write_index(index, INDEX_FILE)
-        print("✅ FAISS index created and saved.")
-    else:
-        print("❌ ERROR: No text to index. Check combined_text_documents.txt.")
-        index = None
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
     """Return up to top_k policy chunks closest to query, joined by blank lines."""
-    if index is None:
-        return "Error: FAISS index is not available."
     query_embedding = model.encode(query).reshape(1, -1)
     _distances, indices = index.search(query_embedding, top_k)
     # FAISS fills unused result slots with -1; without the lower bound that
     # value would index chunks from the end and return the last chunk.
     return "\n\n".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))
 # 🔹 Hugging Face LLM Client
@@ -113,10 +97,8 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     policy_context = search_policy(message)
     if policy_context:
-        # 🔹 Display retrieved context in chat
         messages.append({"role": "assistant", "content": f"📄 **Relevant Policy Context:**\n\n{policy_context}"})
-        # 🔹 Force the LLM to use the retrieved policy text
         user_query_with_context = f"""
         The following is the most relevant policy information retrieved from the official Colorado public assistance policies:
@@ -127,7 +109,6 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         """
         messages.append({"role": "user", "content": user_query_with_context})
     else:
-        # If no relevant policy info is found, use the original message
         messages.append({"role": "user", "content": message})
     response = ""
@@ -147,7 +128,7 @@ demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(
-            value="You are a knowledgeable and professional chatbot designed to assist Colorado case workers in determining eligibility for public assistance programs.",
             label="System message"
         ),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
@@ -156,22 +137,5 @@ demo = gr.ChatInterface(
     ],
 )
-# 🔹 Function to Provide FAISS Files for Download
-def download_faiss_files():
-    """Copy both FAISS artifacts into /mnt/data and return a status message."""
-    both_present = os.path.exists(EMBEDDINGS_FILE) and os.path.exists(INDEX_FILE)
-    if not both_present:
-        return "❌ FAISS files not found. Run the chatbot first to generate them."
-    shutil.copy(EMBEDDINGS_FILE, "/mnt/data/policy_embeddings.npy")
-    shutil.copy(INDEX_FILE, "/mnt/data/faiss_index.bin")
-    return "✅ FAISS files ready for download! Check the 'Files' tab in your Hugging Face Space."
-# Gradio button for downloading FAISS files
-with gr.Blocks() as file_download:
-    gr.Markdown("### 🔽 Download FAISS Files to Your Computer")
-    download_button = gr.Button("Prepare FAISS Files for Download")
-    output_text = gr.Textbox()
-    download_button.click(fn=download_faiss_files, outputs=output_text)
 if __name__ == "__main__":
     demo.launch()
-    file_download.launch()

 import os
+import fitz  # PyMuPDF for PDF reading
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download
+# 🔹 Hugging Face Space Repository Details
+# Repository the policy PDFs are downloaded from.
+HF_REPO_ID = "tstone87/ccr-colorado"
+# 🔹 Load Embedding Model (Better for QA Retrieval)
+# Shared by index construction and by query encoding at search time.
+model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
+# 🔹 Define PDF Directory and Chunk Size
+PDF_DIR = "./pdfs"  # Local folder for downloaded PDFs
+CHUNK_SIZE = 2500  # Characters per chunk; larger chunks for better context
+# 🔹 Ensure Directory Exists
+# Runs at import time so later downloads and directory listings find the folder.
+os.makedirs(PDF_DIR, exist_ok=True)
+# 🔹 Function to Download PDFs from Hugging Face Space
+def download_pdfs():
+    """Fetch the policy PDFs from the Space repository into PDF_DIR.
+
+    Files that already exist under PDF_DIR are not downloaded again.
+    """
+    pdf_files = [
+        "SNAP 10 CCR 2506-1 .pdf",
+        "Med 10 CCR 2505-10 8.100.pdf",
+        # Add other PDFs here if necessary
+    ]
+    for pdf_file in pdf_files:
+        pdf_path = os.path.join(PDF_DIR, pdf_file)
+        if os.path.exists(pdf_path):
+            continue
+        print(f"📥 Downloading {pdf_file}...")
+        # HF_REPO_ID names a Space. hf_hub_download treats the repo as a model
+        # unless repo_type is given, so the file lookup would fail without it.
+        hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=pdf_file,
+            repo_type="space",
+            local_dir=PDF_DIR,
+        )
+    print("✅ All PDFs downloaded.")
+# 🔹 Function to Extract Text from PDFs
+def extract_text_from_pdfs():
+    """Return the text of every ".pdf" file in PDF_DIR as a single string.
+
+    Each page's text is followed by a newline. Files are read in sorted name
+    order so the result does not depend on directory listing order.
+    """
+    page_texts = []
+    for pdf_file in sorted(os.listdir(PDF_DIR)):
+        if not pdf_file.endswith(".pdf"):
+            continue
+        pdf_path = os.path.join(PDF_DIR, pdf_file)
+        # The context manager closes the document once its pages are read.
+        with fitz.open(pdf_path) as doc:
+            page_texts.extend(page.get_text("text") + "\n" for page in doc)
+    # Join once instead of growing a string page by page.
+    return "".join(page_texts)
+# 🔹 Initialize FAISS and Embed Text
+def initialize_faiss():
+    """Build the in-memory FAISS index over the policy PDFs.
+
+    Downloads the PDFs, extracts their text, cuts it into pieces of
+    CHUNK_SIZE characters, embeds each piece with the module-level model,
+    and adds the embeddings to a flat L2 index.
+
+    Returns:
+        An (index, chunks) tuple; row i of the index was built from chunks[i].
+
+    Raises:
+        ValueError: if the PDFs yield nothing but whitespace.
+    """
+    download_pdfs()
+    text_data = extract_text_from_pdfs()
+    # Extraction appends a newline per page, so image-only PDFs still give a
+    # non-empty string; strip before testing for "no text".
+    if not text_data.strip():
+        raise ValueError("❌ No text extracted from PDFs!")
+    # Split text into chunks
+    chunks = [text_data[i:i+CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
+    # Generate embeddings in one batched call; FAISS expects float32 rows.
+    embeddings = np.asarray(model.encode(chunks), dtype="float32")
+    # Create FAISS index
+    # NOTE(review): the model name ends in "dot-v1" — confirm whether an
+    # inner-product index would match it better than L2.
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
+    print("✅ FAISS index initialized.")
+    return index, chunks
+# 🔹 Initialize FAISS on Startup
+index, chunks = initialize_faiss()
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
     """Return up to top_k policy chunks closest to query, joined by blank lines."""
     query_embedding = model.encode(query).reshape(1, -1)
     _distances, indices = index.search(query_embedding, top_k)
     # FAISS fills unused result slots with -1; without the lower bound that
     # value would index chunks from the end and return the last chunk.
     return "\n\n".join(chunks[i] for i in indices[0] if 0 <= i < len(chunks))
 # 🔹 Hugging Face LLM Client
     policy_context = search_policy(message)
     if policy_context:
         messages.append({"role": "assistant", "content": f"📄 **Relevant Policy Context:**\n\n{policy_context}"})
         user_query_with_context = f"""
         The following is the most relevant policy information retrieved from the official Colorado public assistance policies:
         """
         messages.append({"role": "user", "content": user_query_with_context})
     else:
         messages.append({"role": "user", "content": message})
     response = ""
     respond,
     additional_inputs=[
         gr.Textbox(
+            value="You are a knowledgeable chatbot designed to assist Colorado case workers with Medicaid, SNAP, TANF, CHP+, and other programs.",
             label="System message"
         ),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
     ],
 )
 if __name__ == "__main__":
     demo.launch()