import os
import tempfile
import time

import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Module-level caches for the model, FAISS index, embeddings and text chunks.
# NOTE(review): Streamlit re-executes the whole script on every interaction,
# so plain module globals are reset each rerun; for caching that survives
# reruns these belong in st.session_state / st.cache_resource — confirm the
# intended lifetime before relying on them.
model = None
index = None
embeddings = None
text_chunks = []


def process_pdf(file):
    """Persist an uploaded file-like object to a temporary .pdf file.

    Returns the path of the temporary file. The file is created with
    delete=False, so the caller is responsible for removing it when done.
    """
    st.write("Processing uploaded PDF...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        tmpfile.write(file.read())  # copy the uploaded content to disk
        return tmpfile.name


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page, or "" on failure."""
    try:
        st.write("Extracting text from the PDF...")
        reader = PdfReader(pdf_path)
        # BUG FIX: extract_text() may return None (e.g. image-only pages);
        # the original `text += page.extract_text()` raised TypeError then.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""


def chunk_text(text, chunk_size=200):
    """Split *text* into whitespace-delimited chunks of at most chunk_size words."""
    st.write("Chunking text into smaller sections...")
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def load_model():
    """Load the sentence-embedding model into the module-level cache."""
    global model
    st.write("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')


def generate_embeddings():
    """Embed all cached text chunks and build a FAISS L2 index over them."""
    global embeddings, index
    st.write("Generating embeddings...")
    # One batched encode call instead of the original per-chunk Python loop.
    embeddings = model.encode(text_chunks, convert_to_numpy=True)
    # FAISS expects contiguous float32 input.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    st.write("Building FAISS index...")
    index = faiss.IndexFlatL2(embeddings.shape[-1])
    index.add(embeddings)


def main():
    """Streamlit entry point: upload a PDF, embed it, answer similarity queries."""
    global embeddings, text_chunks, index, model
    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF.
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    tmp_file_path = process_pdf(uploaded_file)
    try:
        pdf_text = extract_text_from_pdf(tmp_file_path)
    finally:
        # BUG FIX: the temp file is created with delete=False and was never
        # removed, leaking one file per upload.
        try:
            os.unlink(tmp_file_path)
        except OSError:
            pass

    if not pdf_text:
        st.error("No text extracted from the PDF. Please upload a valid file.")
        return

    # Initialize the model / chunks / embeddings once per script run.
    if model is None:
        load_model()
    if not text_chunks:
        text_chunks = chunk_text(pdf_text, chunk_size=200)
    if not text_chunks:
        st.error("No text extracted from the PDF. Please upload a valid file.")
        return
    if embeddings is None:
        generate_embeddings()

    # Query input field for similarity search.
    query = st.text_input("Enter a query to search:")
    if query:
        query_embedding = model.encode([query], convert_to_numpy=True)
        query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

        st.write("Searching...")
        start_time = time.time()
        # BUG FIX: cap k at the number of chunks; with fewer than 5 chunks
        # FAISS pads results with index -1, which text_chunks[-1] would then
        # silently render as a bogus match.
        D, I = index.search(query_embedding, k=min(5, len(text_chunks)))
        end_time = time.time()

        st.write(f"Query processed in {end_time - start_time:.2f} seconds.")
        for i in range(len(I[0])):
            st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")


if __name__ == "__main__":
    main()