tstone87 committed · Commit a808742 · verified · 1 Parent(s): dd3a7d1

Update app.py

Files changed (1): app.py (+25 -28)
app.py CHANGED
@@ -1,46 +1,42 @@
 import os
-import urllib.parse
+import requests
 import fitz  # PyMuPDF for PDF reading
 import faiss
 import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-from huggingface_hub import hf_hub_download, InferenceClient
-
-# 🔹 Hugging Face Space Repository Details
-HF_REPO_ID = "tstone87/ccr-colorado"
-
-# 🔹 Load Embedding Model (Optimized for QA Retrieval)
-model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
+from huggingface_hub import InferenceClient
 
 # 🔹 Define PDF Directory and Chunk Size
-PDF_DIR = "./pdfs"  # Local folder for downloaded PDFs
+PDF_DIR = "./pdfs"
 CHUNK_SIZE = 2500  # Larger chunks for better context
 
 # 🔹 Ensure Directory Exists
 os.makedirs(PDF_DIR, exist_ok=True)
 
-# 🔹 Function to Download PDFs from Hugging Face Space (Handles Spaces)
+# 🔹 List of PDFs with Direct Hugging Face URLs
+PDF_FILES = {
+    "SNAP 10 CCR 2506-1.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/SNAP%2010%20CCR%202506-1%20.pdf",
+    "Med 10 CCR 2505-10 8.100.pdf": "https://huggingface.co/spaces/tstone87/ccr-colorado/resolve/main/Med%2010%20CCR%202505-10%208.100.pdf",
+}
+
+# 🔹 Function to Download PDFs Directly
 def download_pdfs():
-    pdf_files = [
-        "SNAP 10 CCR 2506-1 .pdf",
-        "Med 10 CCR 2505-10 8.100.pdf",
-    ]
-
-    for pdf_file in pdf_files:
-        pdf_path = os.path.join(PDF_DIR, pdf_file)
-
-        if not os.path.exists(pdf_path):  # Download if not already present
-            print(f"📥 Downloading {pdf_file}...")
-
-            # URL encode spaces correctly
-            encoded_filename = urllib.parse.quote(pdf_file)
-
+    for filename, url in PDF_FILES.items():
+        pdf_path = os.path.join(PDF_DIR, filename)
+        if not os.path.exists(pdf_path):
+            print(f"📥 Downloading {filename}...")
             try:
-                hf_hub_download(repo_id=HF_REPO_ID, filename=encoded_filename, local_dir=PDF_DIR, force_download=True)
-                print(f"✅ Successfully downloaded {pdf_file}")
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+
+                with open(pdf_path, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+                print(f"✅ Successfully downloaded {filename}")
             except Exception as e:
-                print(f"❌ Error downloading {pdf_file}: {e}")
+                print(f"❌ Error downloading {filename}: {e}")
 
     print("✅ All PDFs downloaded.")
 
@@ -68,6 +64,7 @@ def initialize_faiss():
     chunks = [text_data[i:i+CHUNK_SIZE] for i in range(0, len(text_data), CHUNK_SIZE)]
 
     # Generate embeddings
+    model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
    embeddings = np.array([model.encode(chunk) for chunk in chunks])
 
     # Create FAISS index
@@ -83,7 +80,7 @@ index, chunks = initialize_faiss()
 
 # 🔹 Function to Search FAISS
 def search_policy(query, top_k=3):
-    query_embedding = model.encode(query).reshape(1, -1)
+    query_embedding = SentenceTransformer("multi-qa-mpnet-base-dot-v1").encode(query).reshape(1, -1)
     distances, indices = index.search(query_embedding, top_k)
 
     return "\n\n".join([chunks[i] for i in indices[0] if i < len(chunks)])