Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
import faiss
|
7 |
import numpy as np
|
|
|
8 |
|
9 |
# Function to process the uploaded PDF and save it temporarily
|
10 |
def process_pdf(file):
|
@@ -21,6 +22,12 @@ def extract_text_from_pdf(pdf_path):
|
|
21 |
text += page.extract_text()
|
22 |
return text
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# Main function to run the Streamlit app
|
25 |
def main():
|
26 |
st.title("PDF Embedding and Query System")
|
@@ -33,17 +40,28 @@ def main():
|
|
33 |
tmp_file_path = process_pdf(uploaded_file)
|
34 |
|
35 |
# Extract text from the uploaded PDF
|
|
|
36 |
pdf_text = extract_text_from_pdf(tmp_file_path)
|
37 |
|
38 |
# Initialize Sentence-Transformer model for embeddings
|
39 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
# Build FAISS index
|
46 |
-
|
|
|
47 |
index = faiss.IndexFlatL2(dimension)
|
48 |
index.add(embeddings)
|
49 |
|
@@ -55,12 +73,9 @@ def main():
|
|
55 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
56 |
|
57 |
# Perform similarity search using FAISS
|
|
|
|
|
58 |
D, I = index.search(query_embedding, k=5)
|
|
|
59 |
|
60 |
-
# Display the
|
61 |
-
for i in range(len(I[0])):
|
62 |
-
st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")
|
63 |
-
|
64 |
-
# Run the app if this script is executed directly
|
65 |
-
if __name__ == "__main__":
|
66 |
-
main()
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
import faiss
|
7 |
import numpy as np
|
8 |
+
import time
|
9 |
|
10 |
# Function to process the uploaded PDF and save it temporarily
|
11 |
def process_pdf(file):
|
|
|
22 |
text += page.extract_text()
|
23 |
return text
|
24 |
|
25 |
+
def chunk_text(text, chunk_size=200):
    """Split *text* into sections of at most *chunk_size* whitespace-separated words.

    Each chunk is the original words re-joined with single spaces, so an empty
    or whitespace-only input yields an empty list.
    """
    tokens = text.split()
    sections = []
    for start in range(0, len(tokens), chunk_size):
        sections.append(" ".join(tokens[start:start + chunk_size]))
    return sections
|
30 |
+
|
31 |
# Main function to run the Streamlit app
|
32 |
def main():
|
33 |
st.title("PDF Embedding and Query System")
|
|
|
40 |
tmp_file_path = process_pdf(uploaded_file)
|
41 |
|
42 |
# Extract text from the uploaded PDF
|
43 |
+
st.write("Extracting text from the PDF...")
|
44 |
pdf_text = extract_text_from_pdf(tmp_file_path)
|
45 |
|
46 |
# Initialize Sentence-Transformer model for embeddings
|
47 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
48 |
|
49 |
+
# Chunk text into smaller sections for embedding generation
|
50 |
+
st.write("Chunking text for embedding generation...")
|
51 |
+
text_chunks = chunk_text(pdf_text, chunk_size=200)
|
52 |
+
|
53 |
+
# Generate embeddings with a progress bar
|
54 |
+
st.write("Generating embeddings...")
|
55 |
+
progress_bar = st.progress(0)
|
56 |
+
embeddings = []
|
57 |
+
for i, chunk in enumerate(text_chunks):
|
58 |
+
embeddings.append(model.encode(chunk, convert_to_numpy=True))
|
59 |
+
progress_bar.progress((i + 1) / len(text_chunks))
|
60 |
+
embeddings = np.array(embeddings)
|
61 |
|
62 |
# Build FAISS index
|
63 |
+
st.write("Building FAISS index...")
|
64 |
+
dimension = embeddings.shape[-1]
|
65 |
index = faiss.IndexFlatL2(dimension)
|
66 |
index.add(embeddings)
|
67 |
|
|
|
73 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
74 |
|
75 |
# Perform similarity search using FAISS
|
76 |
+
st.write("Searching...")
|
77 |
+
start_time = time.time()
|
78 |
D, I = index.search(query_embedding, k=5)
|
79 |
+
end_time = time.time()
|
80 |
|
81 |
+
# Display the res
|
|
|
|
|
|
|
|
|
|
|
|