import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        tmpfile.write(file.read())  # Write the uploaded file's content to the temp file
        tmpfile_path = tmpfile.name  # Get the temporary file path
    return tmpfile_path

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
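
# Optional: a minimal sketch for caching the embedding model so it is loaded only
# once per session instead of on every Streamlit rerun. This assumes Streamlit >= 1.18
# (which provides st.cache_resource); it is not wired into main() below, but the direct
# SentenceTransformer(...) call there could be swapped for load_model() if model
# loading becomes a bottleneck.
@st.cache_resource
def load_model(model_name="all-MiniLM-L6-v2"):
    return SentenceTransformer(model_name)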

# Main function to run the Streamlit app
def main():
    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

    if uploaded_file is not None:
        # Process the uploaded PDF and get its file path
        tmp_file_path = process_pdf(uploaded_file)

        # Extract text from the uploaded PDF, then remove the temporary copy
        st.write("Extracting text from the PDF...")
        pdf_text = extract_text_from_pdf(tmp_file_path)
        os.unlink(tmp_file_path)  # clean up: the temp file was created with delete=False

        # Initialize Sentence-Transformer model for embeddings
        model = SentenceTransformer('all-MiniLM-L6-v2')

        # Chunk text into smaller sections for embedding generation
        st.write("Chunking text for embedding generation...")
        text_chunks = chunk_text(pdf_text, chunk_size=200)

        # Generate embeddings with a progress bar
        st.write("Generating embeddings...")
        progress_bar = st.progress(0)
        embeddings = []
        for i, chunk in enumerate(text_chunks):
            embeddings.append(model.encode(chunk, convert_to_numpy=True))
            progress_bar.progress((i + 1) / len(text_chunks))
        embeddings = np.array(embeddings)

        # Build FAISS index
        st.write("Building FAISS index...")
        dimension = embeddings.shape[-1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # Query input field for users to enter their search queries
        query = st.text_input("Enter a query to search:")

        if query:
            # Generate embedding for the query
            query_embedding = model.encode([query], convert_to_numpy=True)

            # Perform similarity search using FAISS
            st.write("Searching...")
            start_time = time.time()
            D, I = index.search(query_embedding, k=min(5, len(text_chunks)))  # cap k at the number of chunks
            end_time = time.time()

            # Display the results with their distances and the elapsed search time
            st.write(f"Search completed in {end_time - start_time:.2f} seconds.")
            for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
                st.write(f"Result {rank} (distance: {dist:.4f}):")
                st.write(text_chunks[idx])


# Run the app when the script is executed by Streamlit
if __name__ == "__main__":
    main()
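
# Usage note (a sketch, assuming this file is saved as app.py and that streamlit,
# PyPDF2, sentence-transformers, faiss-cpu, and numpy are installed):
#   streamlit run app.py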