"""Streamlit app: answer questions about an uploaded PDF or DOCX file.

Flow: extract the file's text, split it into fixed-size character chunks,
embed the chunks with a sentence-transformer model, retrieve the chunks
closest to the user's question with a FAISS L2 index, and send the question
plus those chunks to a Groq-hosted chat model.
"""
import os

import faiss
import numpy as np
import streamlit as st
from docx import Document
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Retrieval settings.
CHUNK_SIZE = 512  # characters per chunk
TOP_K = 3  # maximum number of chunks passed to the chat model

# Initialize Groq API Client (key is read from the "Groq_Api" env variable)
client = Groq(api_key=os.environ.get("Groq_Api"))


@st.cache_resource
def load_embedding_model():
    """Return the sentence-embedding model, loaded once per server process.

    Streamlit re-executes this script on every interaction; caching avoids
    re-instantiating the model for every question.
    """
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: A file-like object with a ``name`` attribute (as returned by
            ``st.file_uploader``).

    Returns:
        The extracted text, with pages (PDF) or paragraphs (DOCX) joined by
        newlines. Returns ``""`` when the extension is neither ``.pdf`` nor
        ``.docx``.
    """
    # Compare case-insensitively so names such as "REPORT.PDF" are handled.
    name = file.name.lower()
    if name.endswith(".pdf"):
        reader = PdfReader(file)
        # Extract each page once; skip pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if name.endswith(".docx"):
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)
    return ""


# Title with Book Icon
st.title("📖 A&Q From a File")

# File Upload
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])

if uploaded_file:
    st.write(f"**File Name:** {uploaded_file.name}")  # Display file name

    file_text = extract_text(uploaded_file)

    # Whitespace-only output (e.g. a DOCX made of empty paragraphs) counts
    # as a failed extraction: there is nothing to answer questions about.
    if file_text.strip():
        st.success("File uploaded and text extracted successfully!")
        st.write("Ask a question about the file:")
        query = st.text_input("Enter your question")

        if query:
            # Load Sentence Transformer Model (cached across reruns)
            model = load_embedding_model()

            # Chunk & Embed Text
            chunks = [
                file_text[i:i + CHUNK_SIZE]
                for i in range(0, len(file_text), CHUNK_SIZE)
            ]
            # FAISS expects contiguous float32 arrays.
            embeddings = np.ascontiguousarray(
                model.encode(chunks, convert_to_numpy=True), dtype=np.float32
            )

            # Build FAISS Index for Fast Retrieval
            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings)

            # Query Embedding
            query_embedding = np.ascontiguousarray(
                model.encode([query], convert_to_numpy=True), dtype=np.float32
            )

            # Never ask for more neighbours than there are chunks: FAISS
            # pads missing results with -1, which as a Python index would
            # silently select the last chunk again.
            k = min(TOP_K, len(chunks))
            _, retrieved_idx = index.search(query_embedding, k=k)

            # Retrieve the top relevant chunks, ignoring any -1 padding.
            relevant_text = " ".join(
                chunks[i] for i in retrieved_idx[0] if i >= 0
            )

            # Query Groq API with relevant chunks only
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": f"Answer based on this document: {query}\n\n{relevant_text}",
                    },
                ],
                model="llama-3.3-70b-versatile",
            )

            # Display Answer
            answer = chat_completion.choices[0].message.content
            st.subheader("Answer:")
            st.write(answer)
    else:
        st.error("Failed to extract text from the file. Please check the format.")