# baserag_hf / app.py
import os
import time
import json

import streamlit as st  # Web App
import pandas as pd
import numpy as np  # Image Processing
import gradio as gr
import requests
import openai
import tiktoken
from io import StringIO
from PIL import Image  # Image Processing

# To perform OCR to extract text from images
import easyocr as ocr
import pytesseract

# To read the PDF
import PyPDF2
from PyPDF2 import PdfReader
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from pdf2image import convert_from_path

# LangChain / OpenAI
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

# RAGAS evaluation
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

from dotenv import load_dotenv, find_dotenv
from htmlTemplates import bot_template, user_template, css
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Load the persisted FAISS knowledge base built from the NJMVC manual
def load_knowledgeBase():
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    DB_FAISS_PATH = "../Ragas-LangChain-Evaluation/vectorstore/db_faiss/"
    db = FAISS.load_local(
        DB_FAISS_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
        index_name="njmvc_Index"
    )
    return db
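
# Illustrative usage sketch (not executed here, and not part of the original flow):
# the FAISS store returned by load_knowledgeBase() can be queried directly or
# wrapped as a LangChain retriever. The query text below is an arbitrary example.
#
#   kb = load_knowledgeBase()
#   docs = kb.similarity_search("right of way at a four-way stop", k=4)
#   retriever = kb.as_retriever(search_kwargs={"k": 4})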
# Build the chat prompt used to generate a practice question from retrieved context
def load_prompt():
    prompt = """You are helping students pass the NJMVC Knowledge Test. Provide a single multiple-choice question with 4 options to choose from.
Use the context to provide the question and answer choices.
context = {context}
question = {question}
If the answer is not in the PDF, answer "I do not know what you are asking about."
"""
    prompt = ChatPromptTemplate.from_template(prompt)
    return prompt
# Function to load the OpenAI LLM
def load_llm():
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key=OPENAI_API_KEY)
    return llm
knowledgeBase = load_knowledgeBase()
prompt = load_prompt()
llm = load_llm()
def get_conversation_chain(vectorstore, llm):
    #llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
    memory = ConversationBufferMemory(memory_key="chat_history")
    conversation_chain = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory(),
    )
    return conversation_chain
# Concatenate retrieved documents into a single context string
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
# Read every page of the uploaded PDFs and concatenate the extracted text
def get_pdf_text(pdf_files):
    text = ""
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    return text
def get_chunk_text(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks
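
# A minimal sketch (an assumption, not part of the original app) of how the FAISS
# index referenced by DB_FAISS_PATH could have been built from the NJMVC manual PDF,
# reusing get_pdf_text() and get_chunk_text() above. The folder path and index name
# mirror the ones used elsewhere in this file.
def build_knowledgeBase(pdf_files,
                        folder_path="../Ragas-LangChain-Evaluation/vectorstore/db_faiss/",
                        index_name="njmvc_Index"):
    # Extract raw text from the PDFs and split it into overlapping chunks
    raw_text = get_pdf_text(pdf_files)
    chunks = get_chunk_text(raw_text)
    # Embed the chunks and persist the FAISS index to disk
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    db = FAISS.from_texts(chunks, embedding=embeddings)
    db.save_local(folder_path, index_name=index_name)
    return db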
def handle_user_input(question):
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
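
# A minimal sketch (an assumption, not wired into the original app) showing how the
# RAGAS metrics imported at the top of this file could score a generated answer
# against its retrieved context. Column names follow the RAGAS dataset convention;
# the exact ground-truth column name ("ground_truth" vs "ground_truths") depends on
# the installed RAGAS version.
def evaluate_with_ragas(question, answer, contexts, ground_truth):
    # Build a single-row evaluation dataset; "contexts" is a list of retrieved
    # passages for the sample.
    data = Dataset.from_dict({
        "question": [question],
        "answer": [answer],
        "contexts": [contexts],
        "ground_truth": [ground_truth],
    })
    # Run the four metrics imported above and return the aggregated scores
    result = evaluate(
        data,
        metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    )
    return result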
def main():
    st.set_page_config(page_title='NJMVC Knowledge Test with RAGAS', page_icon=':cars:')
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header('NJMVC Knowledge Test with RAGAS :cars:')
    question = st.text_input("Input the topic you want to test your knowledge on: ")
    if question:
        #handle_user_input(question)
        with st.spinner("Get ready..."):
            text_chunks = get_chunk_text(question)
            db = FAISS.load_local(
                folder_path="../Ragas-LangChain-Evaluation/vectorstore/db_faiss/",
                embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
                allow_dangerous_deserialization=True,
                index_name="njmvc_Index",
            )
            searchDocs = db.similarity_search("what is the NJMVC driving test")
            similar_embeddings = FAISS.from_documents(documents=searchDocs, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY))
            # Creating the chain that wires the retriever, prompt, LLM and StrOutputParser together
            retriever = similar_embeddings.as_retriever()
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )
            #st.session_state.conversation = get_conversation_chain(vector_store)
            response = rag_chain.invoke(question)
            st.write(response)
if __name__ == '__main__':
    main()