import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
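
# Usage (assuming this file is saved as app.py):
#   streamlit run app.py
# Expected dependencies (usual PyPI names): streamlit, PyPDF2, langchain,
# faiss-cpu, sentence-transformers, transformers, torch.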

# NOTE: the original model, "mixedbread-ai/mxbai-embed-2d-large-v1", is an
# embedding model and cannot drive AutoModelForCausalLM / generate(); a small
# chat-tuned causal LM is substituted here (swap in any causal LM you prefer).
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """Split raw text into overlapping chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vector_store(text_chunks):
    """Embed the chunks and persist a FAISS index to disk."""
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")
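
# save_local writes index.faiss and index.pkl into the "faiss_index" folder;
# user_input() below reloads that folder with the same embedding model.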


def chat_with_huggingface(context, query):
    """Generate an answer to the query grounded in the retrieved context."""
    prompt_template = """
    Answer the query as detailed as possible from the provided context.
    If the answer is not in the context, just say, "Answer is not available in the provided documents".

    Context: {context}

    Query: {query}

    Answer:
    """
    # Fill in the placeholders; the original tokenized the raw template,
    # so the model never saw the actual context or query.
    prompt = prompt_template.format(context=context, query=query)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,  # max_length would count the prompt tokens too
        do_sample=True,      # temperature has no effect under greedy decoding
        temperature=0.3,
    )
    # Decode only the newly generated tokens, not the echoed prompt
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
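
# Note: chat-tuned models generally expect their chat template
# (tokenizer.apply_chat_template); plain-string prompting as above works,
# but answer quality may suffer.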


def get_conversation_chain():
    """Return a chain-like callable with load_qa_chain's dict-in/dict-out shape."""
    def huggingface_chain(inputs):
        # Join all retrieved documents, not just the first one
        context = "\n\n".join(doc.page_content for doc in inputs["input_documents"])
        query = inputs["question"]
        return {"output_text": chat_with_huggingface(context, query)}

    return huggingface_chain


def user_input(user_question):
    """Retrieve the chunks most relevant to the question and answer from them."""
    # Must match the embedding model used in get_vector_store(); the original
    # loaded the index with GoogleGenerativeAIEmbeddings, which was never
    # imported and is incompatible with the stored vectors.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversation_chain()
    # huggingface_chain is a plain callable, so the original
    # return_only_outputs=True kwarg would have raised a TypeError.
    response = chain({"input_documents": docs, "question": user_question})

    print(response)
    st.write("Reply: ", response["output_text"])


def main():
    st.set_page_config(page_title="PDF Chatbot")
    st.header("PDF Chatbot made for Pooja")

    user_question = st.text_input("Ask something from your documents:")

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here, then click 'Submit & Process'", accept_multiple_files=True)
        if st.button("Submit & Process"):
            with st.spinner("Wait, I'm reading..."):
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                get_vector_store(text_chunks)
                st.success("I've read all the documents. Now ask your questions 😤")


if __name__ == '__main__':
    main()