Ahmadkhan12 committed on
Commit
b5b7646
·
verified ·
1 Parent(s): cc90cb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -11
app.py CHANGED
@@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
 
8
 
9
  # Function to process the uploaded PDF and save it temporarily
10
  def process_pdf(file):
@@ -21,6 +22,12 @@ def extract_text_from_pdf(pdf_path):
21
  text += page.extract_text()
22
  return text
23
 
 
 
 
 
 
 
24
  # Main function to run the Streamlit app
25
  def main():
26
  st.title("PDF Embedding and Query System")
@@ -33,17 +40,28 @@ def main():
33
  tmp_file_path = process_pdf(uploaded_file)
34
 
35
  # Extract text from the uploaded PDF
 
36
  pdf_text = extract_text_from_pdf(tmp_file_path)
37
 
38
  # Initialize Sentence-Transformer model for embeddings
39
  model = SentenceTransformer('all-MiniLM-L6-v2')
40
 
41
- # Generate embeddings for the text (split into chunks)
42
- text_chunks = pdf_text.split("\n") # Split text into lines or paragraphs
43
- embeddings = model.encode(text_chunks, convert_to_numpy=True)
 
 
 
 
 
 
 
 
 
44
 
45
  # Build FAISS index
46
- dimension = embeddings.shape[1]
 
47
  index = faiss.IndexFlatL2(dimension)
48
  index.add(embeddings)
49
 
@@ -55,12 +73,9 @@ def main():
55
  query_embedding = model.encode([query], convert_to_numpy=True)
56
 
57
  # Perform similarity search using FAISS
 
 
58
  D, I = index.search(query_embedding, k=5)
 
59
 
60
- # Display the results
61
- for i in range(len(I[0])):
62
- st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")
63
-
64
- # Run the app if this script is executed directly
65
- if __name__ == "__main__":
66
- main()
 
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
8
+ import time
9
 
10
  # Function to process the uploaded PDF and save it temporarily
11
  def process_pdf(file):
 
22
  text += page.extract_text()
23
  return text
24
 
25
+ # Function to chunk text into smaller sections
26
def chunk_text(text, chunk_size=200):
    """Split *text* into consecutive chunks of whitespace-separated words.

    The text is tokenized with ``str.split()`` (any run of whitespace is a
    separator, so original line breaks and spacing are not preserved), and
    the resulting words are re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of strings, each containing at most ``chunk_size`` words.
        The last chunk may be shorter. Empty or whitespace-only text
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently discard all the
    # text; a zero step raises an unrelated-looking range() error. Fail
    # loudly with a message that names the real problem instead.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
30
+
31
  # Main function to run the Streamlit app
32
  def main():
33
  st.title("PDF Embedding and Query System")
 
40
  tmp_file_path = process_pdf(uploaded_file)
41
 
42
  # Extract text from the uploaded PDF
43
+ st.write("Extracting text from the PDF...")
44
  pdf_text = extract_text_from_pdf(tmp_file_path)
45
 
46
  # Initialize Sentence-Transformer model for embeddings
47
  model = SentenceTransformer('all-MiniLM-L6-v2')
48
 
49
+ # Chunk text into smaller sections for embedding generation
50
+ st.write("Chunking text for embedding generation...")
51
+ text_chunks = chunk_text(pdf_text, chunk_size=200)
52
+
53
+ # Generate embeddings with a progress bar
54
+ st.write("Generating embeddings...")
55
+ progress_bar = st.progress(0)
56
+ embeddings = []
57
+ for i, chunk in enumerate(text_chunks):
58
+ embeddings.append(model.encode(chunk, convert_to_numpy=True))
59
+ progress_bar.progress((i + 1) / len(text_chunks))
60
+ embeddings = np.array(embeddings)
61
 
62
  # Build FAISS index
63
+ st.write("Building FAISS index...")
64
+ dimension = embeddings.shape[-1]
65
  index = faiss.IndexFlatL2(dimension)
66
  index.add(embeddings)
67
 
 
73
  query_embedding = model.encode([query], convert_to_numpy=True)
74
 
75
  # Perform similarity search using FAISS
76
+ st.write("Searching...")
77
+ start_time = time.time()
78
  D, I = index.search(query_embedding, k=5)
79
+ end_time = time.time()
80
 
81
+ # Display the res