Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,12 @@ import faiss
|
|
7 |
import numpy as np
|
8 |
import time
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Function to process the uploaded PDF and save it temporarily
|
11 |
def process_pdf(file):
|
12 |
st.write("Processing uploaded PDF...")
|
@@ -39,8 +45,31 @@ def chunk_text(text, chunk_size=200):
|
|
39 |
st.error(f"Error chunking text: {e}")
|
40 |
return []
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
# Main function to run the Streamlit app
|
43 |
def main():
|
|
|
|
|
44 |
st.title("PDF Embedding and Query System")
|
45 |
|
46 |
# File uploader for the user to upload a PDF
|
@@ -57,31 +86,17 @@ def main():
|
|
57 |
st.error("No text extracted from the PDF. Please upload a valid file.")
|
58 |
return
|
59 |
|
60 |
-
# Initialize Sentence-Transformer model
|
61 |
-
|
62 |
-
|
63 |
|
64 |
# Chunk text into smaller sections for embedding generation
|
65 |
-
text_chunks = chunk_text(pdf_text, chunk_size=200)
|
66 |
-
|
67 |
if not text_chunks:
|
68 |
-
|
69 |
-
return
|
70 |
|
71 |
-
# Generate embeddings
|
72 |
-
|
73 |
-
|
74 |
-
embeddings = []
|
75 |
-
for i, chunk in enumerate(text_chunks):
|
76 |
-
embeddings.append(model.encode(chunk, convert_to_numpy=True))
|
77 |
-
progress_bar.progress((i + 1) / len(text_chunks))
|
78 |
-
embeddings = np.array(embeddings)
|
79 |
-
|
80 |
-
# Build FAISS index
|
81 |
-
st.write("Building FAISS index...")
|
82 |
-
dimension = embeddings.shape[-1]
|
83 |
-
index = faiss.IndexFlatL2(dimension)
|
84 |
-
index.add(embeddings)
|
85 |
|
86 |
# Query input field for users to enter their search queries
|
87 |
query = st.text_input("Enter a query to search:")
|
|
|
7 |
import numpy as np
|
8 |
import time
|
9 |
|
10 |
+
# Global variables for caching the model and embeddings.
# They are shared by load_model(), generate_embeddings() and main() through
# `global` declarations.
# NOTE(review): Streamlit re-executes the script on each interaction, so these
# are presumably re-initialised on every rerun -- confirm whether
# st.session_state / st.cache_resource is the intended persistence mechanism.
|
11 |
+
model = None  # SentenceTransformer instance, assigned by load_model()
|
12 |
+
index = None  # FAISS IndexFlatL2, assigned by generate_embeddings()
|
13 |
+
embeddings = None  # NumPy array of chunk embeddings, assigned by generate_embeddings()
|
14 |
+
text_chunks = []  # output of chunk_text(), assigned in main()
|
15 |
+
|
16 |
# Function to process the uploaded PDF and save it temporarily
|
17 |
def process_pdf(file):
|
18 |
st.write("Processing uploaded PDF...")
|
|
|
45 |
st.error(f"Error chunking text: {e}")
|
46 |
return []
|
47 |
|
48 |
+
# Function to load the embedding model
|
49 |
+
@st.cache_resource(show_spinner=False)
def _load_sentence_transformer(model_name):
    """Build the SentenceTransformer for *model_name* once and reuse it."""
    return SentenceTransformer(model_name)


def load_model():
    """Load the sentence-embedding model into the module-level ``model``.

    The model object is obtained through ``st.cache_resource`` so that it is
    constructed only once per server process. A plain module global is not
    enough for that: Streamlit re-executes the script on every user
    interaction, which resets ``model`` to ``None`` and would otherwise
    trigger a full reload of the weights on each rerun.

    Returns:
        None. The loaded model is published via the global ``model``.
    """
    global model
    st.write("Loading embedding model...")
    model = _load_sentence_transformer('all-MiniLM-L6-v2')
|
53 |
+
|
54 |
+
# Function to generate embeddings
|
55 |
+
def generate_embeddings():
    """Embed every entry of ``text_chunks`` and build a FAISS L2 index.

    Reads the module-level ``model`` and ``text_chunks``; on success assigns
    the module-level ``embeddings`` (2-D float32 array, one row per chunk)
    and ``index`` (``faiss.IndexFlatL2`` holding those rows).

    If ``text_chunks`` is empty nothing is embedded: an error is shown and
    ``embeddings`` / ``index`` are left untouched, instead of failing inside
    FAISS on an empty 1-D array.

    Returns:
        None. Results are published via the globals described above.
    """
    global embeddings, text_chunks, index

    if not text_chunks:
        # np.array([]) has shape (0,), which gives a zero dimension and makes
        # index.add() fail; stop early with a readable message instead.
        st.error("No text chunks available to embed.")
        return

    st.write("Generating embeddings...")
    # One batched encode call replaces the per-chunk loop; the result is
    # already a 2-D array with one row per chunk.
    encoded = model.encode(text_chunks, convert_to_numpy=True)
    # FAISS expects C-contiguous float32 input.
    embeddings = np.ascontiguousarray(encoded, dtype=np.float32)

    # Build FAISS index
    st.write("Building FAISS index...")
    dimension = embeddings.shape[-1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
|
68 |
+
|
69 |
# Main function to run the Streamlit app
|
70 |
def main():
|
71 |
+
global embeddings, text_chunks, index, model
|
72 |
+
|
73 |
st.title("PDF Embedding and Query System")
|
74 |
|
75 |
# File uploader for the user to upload a PDF
|
|
|
86 |
st.error("No text extracted from the PDF. Please upload a valid file.")
|
87 |
return
|
88 |
|
89 |
+
# Initialize Sentence-Transformer model and embeddings only once
|
90 |
+
if model is None:
|
91 |
+
load_model()
|
92 |
|
93 |
# Chunk text into smaller sections for embedding generation
|
|
|
|
|
94 |
if not text_chunks:
|
95 |
+
text_chunks = chunk_text(pdf_text, chunk_size=200)
|
|
|
96 |
|
97 |
+
# Generate embeddings only once
|
98 |
+
if embeddings is None:
|
99 |
+
generate_embeddings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
# Query input field for users to enter their search queries
|
102 |
query = st.text_input("Enter a query to search:")
|