import os
import tempfile

import streamlit as st
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq

# Read the Groq API key from the environment rather than hardcoding it;
# never commit secrets to source.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Define a custom embedding class for Groq.
# NOTE: the official Groq Python client does not expose
# embed_documents/embed_query methods; the calls below assume a
# hypothetical embeddings endpoint. Swap in a real embedding backend
# (e.g. HuggingFaceEmbeddings) if Groq embeddings are unavailable.
class GroqEmbedding:
    def __init__(self, model="groq-embedding-model"):
        self.model = model
        self.client = Groq(api_key=GROQ_API_KEY)

    def embed_documents(self, texts):
        # Assumed Groq API call to generate embeddings for documents
        return self.client.embed_documents(texts, model=self.model)

    def embed_query(self, query):
        # Assumed Groq API call to generate an embedding for a query
        return self.client.embed_query(query, model=self.model)

# Streamlit app UI
st.title("PDF Question-Answering with Groq Embeddings")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

# Process the uploaded PDF
if uploaded_file is not None:
    # PyPDFLoader expects a file path, not an in-memory buffer, so
    # write the uploaded bytes to a temporary file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    # Load the PDF file with PyPDFLoader
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Split documents into smaller chunks for better processing
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = text_splitter.split_documents(documents)

    # Create embeddings using Groq
    embeddings = GroqEmbedding(model="groq-embedding-model")  # use your preferred Groq model

    # Create a FAISS vector store from the embedded chunks
    vector_db = FAISS.from_documents(split_docs, embeddings)

    # Initialize the retrieval-based QA chain. RetrievalQA needs a
    # concrete LLM and a retriever; llm=None and a vectorstore kwarg
    # (as originally written) would fail. The model name below is an
    # example; use whichever Groq chat model you prefer.
    llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
    )

    # User input for querying the PDF content
    query = st.text_input("Ask a question about the PDF:")
    if query:
        result = qa.run(query)
        st.write("Answer:", result)
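
# Usage sketch (assumes this script is saved as app.py and that the
# streamlit, groq, langchain, langchain-groq, faiss-cpu, and pypdf
# packages are installed):
#
#   export GROQ_API_KEY="<your key>"
#   streamlit run app.py
#
# Streamlit serves the app locally (http://localhost:8501 by default);
# upload a PDF there and type a question into the text box.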