import tempfile
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# Function to process the uploaded PDF and save it temporarily
def process_pdf(file):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
        # getvalue() returns the full upload even after Streamlit reruns the script,
        # avoiding the exhausted read-pointer issue that file.read() can hit
        tmpfile.write(file.getvalue())
        tmpfile_path = tmpfile.name  # Get the temporary file path
    return tmpfile_path

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text

# Function to chunk text into smaller sections
def chunk_text(text, chunk_size=200):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Main function to run the Streamlit app
def main():
    st.title("PDF Embedding and Query System")

    # File uploader for the user to upload a PDF
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

    if uploaded_file is not None:
        # Process the uploaded PDF and get its file path
        tmp_file_path = process_pdf(uploaded_file)

        # Extract text from the uploaded PDF
        st.write("Extracting text from the PDF...")
        pdf_text = extract_text_from_pdf(tmp_file_path)
        os.unlink(tmp_file_path)  # Remove the temporary file once the text has been extracted

        # Initialize Sentence-Transformer model for embeddings
        model = SentenceTransformer('all-MiniLM-L6-v2')

        # Chunk text into smaller sections for embedding generation
        st.write("Chunking text for embedding generation...")
        text_chunks = chunk_text(pdf_text, chunk_size=200)
        if not text_chunks:
            st.error("No extractable text was found in the uploaded PDF.")
            return

        # Generate embeddings with a progress bar
        st.write("Generating embeddings...")
        progress_bar = st.progress(0)
        embeddings = []
        for i, chunk in enumerate(text_chunks):
            embeddings.append(model.encode(chunk, convert_to_numpy=True))
            progress_bar.progress((i + 1) / len(text_chunks))
        embeddings = np.array(embeddings)

        # Build FAISS index
        st.write("Building FAISS index...")
        dimension = embeddings.shape[-1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # Query input field for users to enter their search queries
        query = st.text_input("Enter a query to search:")

        if query:
            # Generate embedding for the query
            query_embedding = model.encode([query], convert_to_numpy=True)

            # Perform similarity search using FAISS
            st.write("Searching...")
            start_time = time.time()
            D, I = index.search(query_embedding, k=min(5, len(text_chunks)))
            end_time = time.time()

            # Display the results with their L2 distances and the search time
            st.write(f"Search completed in {end_time - start_time:.4f} seconds.")
            for rank, (distance, idx) in enumerate(zip(D[0], I[0]), start=1):
                st.write(f"Result {rank} (distance: {distance:.4f})")
                st.write(text_chunks[idx])


if __name__ == "__main__":
    main()
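
# Usage sketch (assumptions: the file is saved as app.py and dependencies come from
# PyPI; neither the filename nor the environment is stated in the original Space):
#     pip install streamlit PyPDF2 sentence-transformers faiss-cpu numpy
#     streamlit run app.py
#
# Design note: Streamlit reruns the whole script on every interaction, so the PDF is
# re-chunked and re-embedded each time the query changes; caching the model and the
# FAISS index (e.g. with st.cache_resource or st.session_state) would avoid that
# repeated work.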