import os
import tempfile

import streamlit as st
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq

# Read the Groq API key from the environment rather than hardcoding a secret in the source
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Define a custom embedding class for Groq.
# NOTE: the Groq Python SDK does not currently expose embed_documents / embed_query
# methods; this class is a placeholder that assumes such an embedding endpoint is
# available. Swap in a real embedding backend if Groq embeddings are not available to you.
class GroqEmbedding:
    def __init__(self, model="groq-embedding-model"):
        self.model = model
        self.client = Groq(api_key=GROQ_API_KEY)

    def embed_documents(self, texts):
        # Generate embeddings for a list of document chunks (assumed Groq endpoint)
        return self.client.embed_documents(texts, model=self.model)

    def embed_query(self, query):
        # Generate an embedding for a single query string (assumed Groq endpoint)
        return self.client.embed_query(query, model=self.model)

# Streamlit App UI
st.title("PDF Question-Answering with Groq Embeddings")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

# Process the uploaded PDF
if uploaded_file is not None:
    # PyPDFLoader expects a file path, so write the upload to a temporary file first
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    # Load the PDF from the temporary file
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    
    # Split documents into smaller chunks for better processing
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = text_splitter.split_documents(documents)

    # Create embeddings using the custom Groq wrapper defined above
    embeddings = GroqEmbedding(model="groq-embedding-model")  # placeholder model name; substitute your preferred embedding model
    
    # Create a FAISS vector store with the embeddings
    vector_db = FAISS.from_documents(split_docs, embeddings)
    
    # Initialize the retrieval-based QA chain. RetrievalQA takes a retriever (not a
    # vector store) and needs a real LLM; a Groq-hosted chat model fills that role here
    # (the model name below is just an example).
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=GROQ_API_KEY)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(),
    )
    
    # User input for querying the PDF content
    query = st.text_input("Ask a question about the PDF:")

    if query:
        # invoke() replaces the deprecated run() and returns a dict with the answer
        result = qa.invoke({"query": query})
        st.write("Answer:", result["result"])