File size: 3,800 Bytes
e096a7f
080536a
d626451
080536a
d626451
 
 
 
080536a
d626451
080536a
e096a7f
 
 
d626451
 
 
080536a
 
d626451
080536a
d626451
 
 
 
 
 
080536a
d626451
080536a
 
 
d626451
7a8c2a1
d626451
080536a
 
d626451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
080536a
d626451
 
4bf1fe8
7a8c2a1
080536a
 
 
e096a7f
d626451
 
080536a
 
 
 
d626451
 
 
 
080536a
 
 
 
 
 
 
 
d626451
 
 
080536a
 
 
 
 
 
d626451
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Langchain imports
from langchain_groq import ChatGroq 
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_pinecone import PineconeVectorStore

# Embedding and model import
# Other
import streamlit as st
import os
import time
from PyPDF2 import PdfReader
import tempfile
import pdfplumber


# App header: user asks questions against uploaded PDFs or a scraped website.
st.title("Ask questions from your PDF(s) or website")

# Prompt user to choose between PDFs or website. index=None means nothing is
# pre-selected, so `option` is None until the user picks — the dead
# `option = None` assignment that preceded this was redundant and is removed.
option = st.radio("Choose input type:", ("PDF(s)", "Website"), index=None)

def get_pdf_processed(pdf_docs):
    """Extract and concatenate the plain text of every page of every PDF.

    Args:
        pdf_docs: iterable of file-like objects (e.g. Streamlit UploadedFile)
            that pdfplumber can open.

    Returns:
        One string containing the extracted text of all pages, in order.
    """
    parts = []
    for pdf in pdf_docs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # extract_text() returns None for pages with no extractable
                # text (e.g. scanned images); the original `text += ...`
                # raised TypeError in that case. Guard with `or ""`.
                parts.append(page.extract_text() or "")
    # join once instead of repeated += (avoids quadratic concatenation).
    return "".join(parts)

def llm_model():
    """Build the retrieval-augmented QA chain and answer a user question.

    Reads the vector store from ``st.session_state.vector`` (must be populated
    by the caller), wires it to a Groq-hosted LLM, then renders a question
    input and writes the answer plus the response time back to the page.
    """
    # The original referenced an undefined `groq_api_key` name (NameError at
    # runtime). Prefer Streamlit secrets, fall back to the environment so the
    # app works both on Streamlit Cloud and locally.
    groq_api_key = st.secrets.get("GROQ_API_KEY", os.environ.get("GROQ_API_KEY"))
    llm = ChatGroq(model="mixtral-8x7b-32768", groq_api_key=groq_api_key)

    # Renamed from `prompt`: the original reused that name for the user's
    # question below, shadowing the template.
    qa_prompt = ChatPromptTemplate.from_template(
    """
    Answer the question based on the provided context only.
    Please provide the most accurate response based on the question
    <context>
    {context}
    </context>
    Questions:{input}
    """
    )
    document_chain = create_stuff_documents_chain(llm, qa_prompt)

    # Guard: without a populated vector store there is nothing to retrieve
    # from, and create_retrieval_chain would fail on a None retriever.
    if not st.session_state.get("vector"):
        st.error("No documents indexed yet - load a PDF or website first.")
        return
    retriever = st.session_state.vector.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    question = st.text_input("Input your question here")
    if question:
        start = time.process_time()
        response = retrieval_chain.invoke({"input": question})
        st.write(response['answer'])
        st.write("Response time: ", time.process_time() - start)

# Embedding model: a small, fast sentence-transformers model that runs
# locally (no external API key needed for embeddings).
model_name = "all-MiniLM-L6-v2"
# The original defined `model_name` and then re-hard-coded the literal on the
# next line; use the variable so the model is configured in one place.
st.session_state.embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Split documents into overlapping chunks so retrieval returns focused context.
st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Connect to the existing Pinecone index; this handle is replaced with a
# freshly populated store once the user loads a PDF or website below.
index_name = "myindex"
st.session_state.vector = PineconeVectorStore(index_name=index_name, embedding=st.session_state.embeddings)


if option:
    if option == "Website":
        website_link = st.text_input("Enter the website link:")
        if website_link:
            with st.spinner("Loading website content..."):
                # Scrape the page, chunk it, and index the chunks in Pinecone.
                st.session_state.loader = WebBaseLoader(website_link)
                st.session_state.docs = st.session_state.loader.load()
                st.session_state.final_documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
                st.session_state.vector = PineconeVectorStore.from_documents(st.session_state.final_documents, index_name=index_name, embedding=st.session_state.embeddings)
            st.success("Done!")
            # Only show the question box once content is indexed. Previously
            # this ran even with no link entered, querying an unpopulated /
            # stale index; now it mirrors the PDF branch.
            llm_model()

    elif option == "PDF(s)":
        pdf_files = st.file_uploader("Upload your PDF files", type=["pdf"], accept_multiple_files=True)
        if pdf_files:
            with st.spinner("Loading pdf..."):
                # Extract raw text, chunk it, and index the chunks in Pinecone.
                st.session_state.docs = get_pdf_processed(pdf_files)
                st.session_state.final_documents = st.session_state.text_splitter.split_text(st.session_state.docs)
                st.session_state.vector = PineconeVectorStore.from_texts(st.session_state.final_documents, index_name=index_name, embedding=st.session_state.embeddings)
            st.success("Done!")
            llm_model()