Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
import faiss
|
7 |
import numpy as np
|
|
|
8 |
|
9 |
# Function to process the uploaded PDF and save it temporarily
|
10 |
def process_pdf(file):
|
@@ -21,6 +22,12 @@ def extract_text_from_pdf(pdf_path):
|
|
21 |
text += page.extract_text()
|
22 |
return text
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# Main function to run the Streamlit app
|
25 |
def main():
|
26 |
st.title("PDF Embedding and Query System")
|
@@ -33,17 +40,28 @@ def main():
|
|
33 |
tmp_file_path = process_pdf(uploaded_file)
|
34 |
|
35 |
# Extract text from the uploaded PDF
|
|
|
36 |
pdf_text = extract_text_from_pdf(tmp_file_path)
|
37 |
|
38 |
# Initialize Sentence-Transformer model for embeddings
|
39 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
# Build FAISS index
|
46 |
-
|
|
|
47 |
index = faiss.IndexFlatL2(dimension)
|
48 |
index.add(embeddings)
|
49 |
|
@@ -55,12 +73,9 @@ def main():
|
|
55 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
56 |
|
57 |
# Perform similarity search using FAISS
|
|
|
|
|
58 |
D, I = index.search(query_embedding, k=5)
|
|
|
59 |
|
60 |
-
# Display the
|
61 |
-
for i in range(len(I[0])):
|
62 |
-
st.write(f"Match {i + 1}: {text_chunks[I[0][i]]} (Distance: {D[0][i]:.4f})")
|
63 |
-
|
64 |
-
# Run the app if this script is executed directly
|
65 |
-
if __name__ == "__main__":
|
66 |
-
main()
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
import faiss
|
7 |
import numpy as np
|
8 |
+
import time
|
9 |
|
10 |
# Function to process the uploaded PDF and save it temporarily
|
11 |
def process_pdf(file):
|
|
|
22 |
text += page.extract_text()
|
23 |
return text
|
24 |
|
25 |
+
def chunk_text(text, chunk_size=200):
    """Split *text* into sections of at most *chunk_size* whitespace-separated words.

    Each chunk is the original words re-joined with single spaces, so an empty
    or whitespace-only input yields an empty list.
    """
    tokens = text.split()
    sections = []
    for start in range(0, len(tokens), chunk_size):
        sections.append(" ".join(tokens[start:start + chunk_size]))
    return sections
|
30 |
+
|
31 |
# Main function to run the Streamlit app
|
32 |
def main():
|
33 |
st.title("PDF Embedding and Query System")
|
|
|
40 |
tmp_file_path = process_pdf(uploaded_file)
|
41 |
|
42 |
# Extract text from the uploaded PDF
|
43 |
+
st.write("Extracting text from the PDF...")
|
44 |
pdf_text = extract_text_from_pdf(tmp_file_path)
|
45 |
|
46 |
# Initialize Sentence-Transformer model for embeddings
|
47 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
48 |
|
49 |
+
# Chunk text into smaller sections for embedding generation
|
50 |
+
st.write("Chunking text for embedding generation...")
|
51 |
+
text_chunks = chunk_text(pdf_text, chunk_size=200)
|
52 |
+
|
53 |
+
# Generate embeddings with a progress bar
|
54 |
+
st.write("Generating embeddings...")
|
55 |
+
progress_bar = st.progress(0)
|
56 |
+
embeddings = []
|
57 |
+
for i, chunk in enumerate(text_chunks):
|
58 |
+
embeddings.append(model.encode(chunk, convert_to_numpy=True))
|
59 |
+
progress_bar.progress((i + 1) / len(text_chunks))
|
60 |
+
embeddings = np.array(embeddings)
|
61 |
|
62 |
# Build FAISS index
|
63 |
+
st.write("Building FAISS index...")
|
64 |
+
dimension = embeddings.shape[-1]
|
65 |
index = faiss.IndexFlatL2(dimension)
|
66 |
index.add(embeddings)
|
67 |
|
|
|
73 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
74 |
|
75 |
# Perform similarity search using FAISS
|
76 |
+
st.write("Searching...")
|
77 |
+
start_time = time.time()
|
78 |
D, I = index.search(query_embedding, k=5)
|
79 |
+
end_time = time.time()
|
80 |
|
81 |
+
# Display the res
|
|
|
|
|
|
|
|
|
|
|
|