File size: 2,295 Bytes
ba5f07e
41a527e
180125b
2559e80
 
 
 
ba5f07e
54146e4
ba5f07e
 
54146e4
 
ba5f07e
 
2559e80
 
 
 
 
 
 
 
54146e4
ba5f07e
 
54146e4
 
ba5f07e
54146e4
ba5f07e
54146e4
ba5f07e
54146e4
2559e80
 
 
 
 
54146e4
2559e80
 
 
54146e4
2559e80
 
 
 
54146e4
 
ba5f07e
2559e80
ba5f07e
2559e80
 
 
 
 
 
54146e4
2559e80
 
ba5f07e
54146e4
ba5f07e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    """Copy an uploaded file's bytes into a persistent temporary ``.pdf`` file.

    Args:
        file: A binary file-like object exposing ``read()``.

    Returns:
        The filesystem path of the temporary file. The file is created with
        ``delete=False``, so it is not removed automatically; the caller owns
        its cleanup.
    """
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        # Dump the whole upload into the temp file in a single write.
        handle.write(file.read())
    finally:
        # Closing flushes the data; the file stays on disk (delete=False).
        handle.close()
    return handle.name

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Location of the PDF; handed directly to ``PdfReader``.

    Returns:
        The extracted page texts joined in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    reader = PdfReader(pdf_path)
    # `or ""` keeps a page whose extraction yields None (or an empty result)
    # from raising TypeError; join builds the result in one pass instead of
    # repeatedly re-allocating a growing string.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Main function to run the Streamlit app
def main():
    """Run the Streamlit app: upload a PDF, index its lines, answer queries.

    Flow:
        1. Let the user upload a PDF and copy it to a temporary file.
        2. Extract the text and split it into non-blank lines ("chunks").
        3. Embed the chunks with a Sentence-Transformer model and store the
           vectors in a flat L2 FAISS index.
        4. Embed the user's query and display the nearest chunks together
           with their distances.

    The function returns early (showing a warning) when the PDF contains no
    extractable text, because there is nothing to index in that case.
    """
    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # process_pdf creates its temp file with delete=False, so it must be
    # removed here; otherwise every run of the script leaves a file behind.
    tmp_file_path = process_pdf(uploaded_file)
    try:
        pdf_text = extract_text_from_pdf(tmp_file_path)
    finally:
        os.remove(tmp_file_path)

    # One chunk per non-blank line: blank lines carry no meaning and would
    # otherwise be embedded and surface as empty "matches".
    text_chunks = [line for line in pdf_text.split("\n") if line.strip()]
    if not text_chunks:
        st.warning("No extractable text was found in the uploaded PDF.")
        return

    # Initialize Sentence-Transformer model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(text_chunks, convert_to_numpy=True)

    # Build FAISS index (exact L2 search over all chunk vectors)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Query input field for users to enter their search queries
    query = st.text_input("Enter a query to search:")
    if not query:
        return

    query_embedding = model.encode([query], convert_to_numpy=True)

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with label -1, which would index text_chunks from the end and
    # display an unrelated chunk.
    top_k = min(5, len(text_chunks))
    distances, indices = index.search(query_embedding, k=top_k)

    # Display the results
    for rank, (chunk_idx, distance) in enumerate(
        zip(indices[0], distances[0]), start=1
    ):
        if chunk_idx < 0:  # defensive: skip FAISS "no result" placeholders
            continue
        st.write(
            f"Match {rank}: {text_chunks[chunk_idx]} (Distance: {distance:.4f})"
        )

# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()