Ahmadkhan12 committed on
Commit 61651bd · verified · 1 Parent(s): b5b7646

Update app.py

Files changed (1)
  1. app.py +29 -16
app.py CHANGED
@@ -9,6 +9,7 @@ import time
 
 # Function to process the uploaded PDF and save it temporarily
 def process_pdf(file):
+    st.write("Processing uploaded PDF...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
         tmpfile.write(file.read())  # Write the uploaded file's content to the temp file
         tmpfile_path = tmpfile.name  # Get the temporary file path
@@ -16,17 +17,27 @@ def process_pdf(file):
 
 # Function to extract text from the PDF
 def extract_text_from_pdf(pdf_path):
-    reader = PdfReader(pdf_path)
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
-    return text
+    try:
+        st.write("Extracting text from the PDF...")
+        reader = PdfReader(pdf_path)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+        return text
+    except Exception as e:
+        st.error(f"Error extracting text from PDF: {e}")
+        return ""
 
 # Function to chunk text into smaller sections
 def chunk_text(text, chunk_size=200):
-    words = text.split()
-    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
-    return chunks
+    try:
+        st.write("Chunking text into smaller sections...")
+        words = text.split()
+        chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+        return chunks
+    except Exception as e:
+        st.error(f"Error chunking text: {e}")
+        return []
 
 # Main function to run the Streamlit app
 def main():
@@ -40,16 +51,23 @@ def main():
     tmp_file_path = process_pdf(uploaded_file)
 
     # Extract text from the uploaded PDF
-    st.write("Extracting text from the PDF...")
     pdf_text = extract_text_from_pdf(tmp_file_path)
 
+    if not pdf_text:
+        st.error("No text extracted from the PDF. Please upload a valid file.")
+        return
+
     # Initialize Sentence-Transformer model for embeddings
+    st.write("Loading embedding model...")
     model = SentenceTransformer('all-MiniLM-L6-v2')
 
     # Chunk text into smaller sections for embedding generation
-    st.write("Chunking text for embedding generation...")
     text_chunks = chunk_text(pdf_text, chunk_size=200)
 
+    if not text_chunks:
+        st.error("Failed to split text into chunks. Exiting.")
+        return
+
     # Generate embeddings with a progress bar
     st.write("Generating embeddings...")
     progress_bar = st.progress(0)
@@ -73,9 +91,4 @@ def main():
     query_embedding = model.encode([query], convert_to_numpy=True)
 
     # Perform similarity search using FAISS
-    st.write("Searching...")
-    start_time = time.time()
-    D, I = index.search(query_embedding, k=5)
-    end_time = time.time()
-
-    # Display the res
+    st.write("Searching.
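
The diff above is cut off before the part of main() that builds the FAISS index and displays the search results. For context, here is a minimal sketch of how that flow is commonly wired up around the names the diff does use (model, text_chunks, query_embedding, index, progress_bar). The helper names build_faiss_index and search_chunks, the IndexFlatL2 choice, and the result formatting are assumptions for illustration, not code from this commit.

```python
# Illustrative sketch only -- assumes faiss-cpu and sentence-transformers are installed.
# build_faiss_index / search_chunks are hypothetical helpers, not functions in app.py.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

def build_faiss_index(model, text_chunks, progress_bar=None):
    # Encode each chunk and (optionally) advance a Streamlit progress bar as we go.
    embeddings = []
    for i, chunk in enumerate(text_chunks):
        embeddings.append(model.encode(chunk, convert_to_numpy=True))
        if progress_bar is not None:
            progress_bar.progress((i + 1) / len(text_chunks))
    embeddings = np.vstack(embeddings).astype("float32")

    # Exact L2 search is plenty for the few hundred chunks a single PDF yields.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

def search_chunks(index, model, text_chunks, query, k=5):
    # Embed the query, then return the k nearest chunks with their L2 distances.
    query_embedding = model.encode([query], convert_to_numpy=True).astype("float32")
    D, I = index.search(query_embedding, min(k, len(text_chunks)))
    return [(text_chunks[i], float(d)) for i, d in zip(I[0], D[0]) if i != -1]

if __name__ == "__main__":
    model = SentenceTransformer("all-MiniLM-L6-v2")
    chunks = ["FAISS does the similarity search.", "Streamlit renders the UI."]
    idx = build_faiss_index(model, chunks)
    print(search_chunks(idx, model, chunks, "How is the search done?", k=1))
```

If app.py follows this pattern, main() would call these helpers after chunk_text(), and the app itself would be launched with `streamlit run app.py`.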