Spaces:

tstone87
/

ccr-colorado

Sleeping

App Files Files Community

tstone87 committed on Feb 2

Commit

a808742

verified ·

1 Parent(s): dd3a7d1

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -28

app.py CHANGED Viewed

@@ -1,46 +1,42 @@
 import os
-import urllib.parse
 import fitz  # PyMuPDF for PDF reading
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-from huggingface_hub import hf_hub_download, InferenceClient
-# 🔹 Hugging Face Space Repository Details
-HF_REPO_ID = "tstone87/ccr-colorado"
-# 🔹 Load Embedding Model (Optimized for QA Retrieval)
-model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
 # 🔹 Define PDF Directory and Chunk Size
-PDF_DIR = "./pdfs"  # Local folder for downloaded PDFs
 CHUNK_SIZE = 2500  # Larger chunks for better context
 # 🔹 Ensure Directory Exists
 os.makedirs(PDF_DIR, exist_ok=True)
-# 🔹 Function to Download PDFs from Hugging Face Space (Handles Spaces)
-def download_pdfs():
-    pdf_files = [
-        "SNAP 10 CCR 2506-1 .pdf",
-        "Med 10 CCR 2505-10 8.100.pdf",
-    ]
-    for pdf_file in pdf_files:
-        pdf_path = os.path.join(PDF_DIR, pdf_file)
-        if not os.path.exists(pdf_path):  # Download if not already present
-            print(f"📥 Downloading {pdf_file}...")
-            # URL encode spaces correctly
-            encoded_filename = urllib.parse.quote(pdf_file)
             try:
-                hf_hub_download(repo_id=HF_REPO_ID, filename=encoded_filename, local_dir=PDF_DIR, force_download=True)
-                print(f"✅ Successfully downloaded {pdf_file}")
             except Exception as e:
-                print(f"❌ Error downloading {pdf_file}: {e}")
     print("✅ All PDFs downloaded.")
@@ -68,6 +64,7 @@ def initialize_faiss():
     chunks = [text_data[i:i+CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
     # Generate embeddings
     embeddings = np.array([model.encode(chunk) for chunk in chunks])
     # Create FAISS index
@@ -83,7 +80,7 @@ index, chunks = initialize_faiss()
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
-    query_embedding = model.encode(query).reshape(1, -1)
     distances, indices = index.search(query_embedding, top_k)
     return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])

 import os
+import requests
 import fitz  # PyMuPDF for PDF reading
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
 # 🔹 Define PDF Directory and Chunk Size
+PDF_DIR = "./pdfs"
 CHUNK_SIZE = 2500  # Larger chunks for better context
 # 🔹 Ensure Directory Exists
 os.makedirs(PDF_DIR, exist_ok=True)
+# 🔹 List of PDFs with Direct Hugging Face URLs
+PDF_FILES = {
+    "SNAP 10 CCR 2506-1.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/SNAP%2010%20CCR%202506-1%20.pdf",
+    "Med 10 CCR 2505-10 8.100.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/Med%2010%20CCR%202505-10%208.100.pdf",
+}
+# 🔹 Function to Download PDFs Directly
+def download_pdfs():
+    for filename, url in PDF_FILES.items():
+        pdf_path = os.path.join(PDF_DIR, filename)
+        if not os.path.exists(pdf_path):
+            print(f"📥 Downloading {filename}...")
             try:
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+                with open(pdf_path, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+                print(f"✅ Successfully downloaded {filename}")
             except Exception as e:
+                print(f"❌ Error downloading {filename}: {e}")
     print("✅ All PDFs downloaded.")
     chunks = [text_data[i:i+CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
     # Generate embeddings
+    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
     embeddings = np.array([model.encode(chunk) for chunk in chunks])
     # Create FAISS index
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
+    query_embedding = SentenceTransformer("multi-qa-mpnet-base-dot-v1").encode(query).reshape(1, -1)
     distances, indices = index.search(query_embedding, top_k)
     return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])