import os
import tempfile
import time

import pdfplumber
import streamlit as st
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from PyPDF2 import PdfReader

# Page header and input-source selector. The radio is created with
# ``index=None`` so nothing is preselected; ``option`` stays falsy until the
# user picks an entry, which is what the dispatch further down relies on.
st.title("Ask questions from your PDF(s) or website")

option = st.radio("Choose input type:", ("PDF(s)", "Website"), index=None)

def get_pdf_processed(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources that ``pdfplumber.open`` accepts
            (the caller in this script passes Streamlit uploaded files).

    Returns:
        One string holding the text of all pages, in document order and then
        page order, concatenated without a separator. An empty iterable
        yields ``""``.
    """
    page_texts = []
    for pdf in pdf_docs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # NOTE(review): extract_text() can apparently return None for
                # a page without a text layer on some pdfplumber versions;
                # coerce to "" so concatenation never raises TypeError.
                page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

def llm_model():
    """Render the question box and answer questions over the indexed content.

    Builds a Groq chat model, a "stuff documents" chain around a context-only
    prompt, and a retrieval chain over the vector store kept in
    ``st.session_state.vector``. Once the user has entered a question, the
    chain is invoked and the answer and the elapsed time are written to the
    page.

    Takes no arguments and returns None; all output goes through Streamlit.
    """
    # The API key is read from the GROQ_API_KEY environment variable; report
    # a missing key on the page instead of failing inside the client.
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        st.error("GROQ_API_KEY environment variable is not set.")
        return

    # A retrieval chain cannot be built without a retriever, so stop early
    # (with a visible message) when no vector store is available.
    vector_store = st.session_state.get("vector")
    if not vector_store:
        st.warning("No content has been indexed yet.")
        return

    llm = ChatGroq(model="mixtral-8x7b-32768", groq_api_key=groq_api_key)
    answer_prompt = ChatPromptTemplate.from_template(
        """
        Answer the question based on the provided context only.
        Please provide the most accurate response based on the question
        <context>
        {context}
        </context>
        Questions:{input}
        """
    )
    document_chain = create_stuff_documents_chain(llm, answer_prompt)
    retrieval_chain = create_retrieval_chain(
        vector_store.as_retriever(), document_chain
    )

    question = st.text_input("Input your question here")
    if not question:
        return

    # perf_counter() measures wall-clock time. process_time() only counts CPU
    # time of this process and would leave out the time spent waiting on the
    # remote retrieval and model calls.
    start = time.perf_counter()
    response = retrieval_chain.invoke({"input": question})
    st.write(response["answer"])
    st.write("Response time: ", time.perf_counter() - start)

# Shared resources used by both input types. Streamlit re-executes this
# script on every widget interaction, so each object is created only when it
# is not already in the session state instead of being rebuilt on each rerun.
model_name = "all-MiniLM-L6-v2"
index_name = "myindex"

if "embeddings" not in st.session_state:
    st.session_state.embeddings = HuggingFaceEmbeddings(model_name=model_name)

if "text_splitter" not in st.session_state:
    st.session_state.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )

if "vector" not in st.session_state:
    st.session_state.vector = PineconeVectorStore(
        index_name=index_name, embedding=st.session_state.embeddings
    )

# Dispatch on the chosen input type: load the source, split it into chunks,
# index the chunks in Pinecone, then show the question box.
#
# Streamlit reruns the script on every interaction (including each typed
# question), so the key of the last indexed source is remembered in
# ``st.session_state.ingested_source`` and ingestion is skipped when the
# source has not changed. This avoids re-embedding and re-uploading the same
# content on every rerun.
#
# NOTE(review): documents are added to the existing index ``index_name``;
# content indexed earlier presumably remains searchable - confirm whether the
# index should be cleared when the source changes.
if option == "Website":
    website_link = st.text_input("Enter the website link:")
    if website_link:
        source_key = ("website", website_link)
        if st.session_state.get("ingested_source") != source_key:
            with st.spinner("Loading website content..."):
                st.session_state.loader = WebBaseLoader(website_link)
                st.session_state.docs = st.session_state.loader.load()
                chunks = st.session_state.text_splitter.split_documents(
                    st.session_state.docs
                )
                st.session_state.final_documents = chunks
                # Only touch the vector store when there is something to add.
                if chunks:
                    st.session_state.vector = PineconeVectorStore.from_documents(
                        chunks,
                        index_name=index_name,
                        embedding=st.session_state.embeddings,
                    )
            # Recorded only after a successful load so a failed attempt is
            # retried on the next rerun.
            st.session_state.ingested_source = source_key

        if st.session_state.get("final_documents"):
            st.success("Done!")
            llm_model()
        else:
            st.warning("No text could be extracted from this website.")

elif option == "PDF(s)":
    pdf_files = st.file_uploader(
        "Upload your PDF files", type=["pdf"], accept_multiple_files=True
    )
    if pdf_files:
        # Name and size identify the current upload set across reruns.
        source_key = ("pdf", tuple((pdf.name, pdf.size) for pdf in pdf_files))
        if st.session_state.get("ingested_source") != source_key:
            with st.spinner("Loading pdf..."):
                st.session_state.docs = get_pdf_processed(pdf_files)
                chunks = st.session_state.text_splitter.split_text(
                    st.session_state.docs
                )
                st.session_state.final_documents = chunks
                # Only touch the vector store when there is something to add.
                if chunks:
                    st.session_state.vector = PineconeVectorStore.from_texts(
                        chunks,
                        index_name=index_name,
                        embedding=st.session_state.embeddings,
                    )
            # Recorded only after a successful load so a failed attempt is
            # retried on the next rerun.
            st.session_state.ingested_source = source_key

        if st.session_state.get("final_documents"):
            st.success("Done!")
            llm_model()
        else:
            st.warning("No text could be extracted from the uploaded PDF(s).")