Adding all basic files
- app.py +172 -0
- doc_loading.py +51 -0
- llm_functions.py +77 -0
- requirements.txt +11 -0
- utils.py +28 -0
app.py
ADDED
@@ -0,0 +1,172 @@
import streamlit as st
from doc_loading import get_article_text, read_pdf_text
from utils import get_topics
from llm_functions import generate_qa_pairs, evaluate_answer, get_conversational_chain, get_topics_from_chunk

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI

from dotenv import load_dotenv
load_dotenv()


# Setting up basics
st.set_page_config(page_title="LLM UC")
st.header("Let's check what you know")

# Greeting the user
st.write("Welcome!!")

# st.session_state  # debug: uncomment to inspect the session state

# Selectbox for choosing the input method; the widget key keeps the
# choice in st.session_state across reruns
option = st.selectbox("How are you going to input your document?",
                      ("Upload PDF", "Blog link", "YouTube Link", "Paste copied article"),
                      key="option")

# Conditionally show components based on the user's choice
file_name = ""
main_text = ""

if option == "Upload PDF":
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], key="uploading_pdf")
    if uploaded_file is not None:
        # Process the uploaded file
        st.write("PDF uploaded successfully! We will read only the first 5 pages.")
        main_text = read_pdf_text(uploaded_file)

elif option == "Paste copied article":
    main_text = st.text_area("Paste your article here", key="article_key")
    if st.button('Submit'):
        # Process the pasted text
        st.write("Text submitted successfully!")

else:
    link_input = st.text_input("Paste your blog/YouTube link here.", key='link_key')
    if st.button('Submit'):
        # Process the link
        st.write("Link submitted.")
        if option == "YouTube Link":
            st.write("This functionality is under construction.")
            main_text = ""
        else:
            article_text = get_article_text(link_input)
            if article_text:
                main_text = ""
                st.write("This functionality is under construction.")
            else:
                st.write("Unable to fetch text from the URL. Can you please check the link?")


# Show a warning if the uploaded file is not a PDF
if option == "Upload PDF":
    if uploaded_file is not None and uploaded_file.type != "application/pdf":
        st.error("Please choose a PDF file only.")

if "total_text" not in st.session_state:
    st.session_state['total_text'] = main_text

if len(main_text) > 0:
    # Split the article into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )
    texts = text_splitter.create_documents([main_text])

    # Embed the chunks and store them in a vector database
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(texts, embeddings)

    hash_name = f"{option.replace(' ', '-')}"
    db.save_local(f'faiss_{hash_name}_index')

    # Toggle between quiz mode and free-form Q&A mode
    toggle_button_state = st.checkbox("Check this if I should quiz you")

    if toggle_button_state:
        # Build the quiz
        st.write("Quiz incoming...")

        # Let the user pick the question toughness; the widget key keeps
        # the choice in st.session_state across reruns
        toughness_selection = st.selectbox("Select the question toughness",
                                           ("Easy", "Moderate", "Tough"),
                                           key="toughness_selection")

        # Extract candidate topics once and cache them in the session
        if "top_topics" not in st.session_state:
            topics_chain = get_topics_from_chunk()
            top_topics = get_topics(texts[:10], topics_chain)
            top_topics.append("Any")
            st.session_state['top_topics'] = top_topics

        top_topics = st.session_state['top_topics']
        topic_selection = st.selectbox("Select the topic I should quiz you on!",
                                       tuple(top_topics), key="topic_selection")

        # Generate one question/answer pair and cache it in the session
        if "response" not in st.session_state:
            ques_chain = generate_qa_pairs()
            # st.write(f"here we go, a {toughness_selection} level question from {topic_selection} topic.")
            docs_for_questions = db.similarity_search(topic_selection, k=5)
            response = ques_chain.invoke({"context": docs_for_questions,
                                          "topic": topic_selection,
                                          "toughness": toughness_selection})
            st.session_state['response'] = response[0]

        if "scoring" not in st.session_state:
            eval_chain = evaluate_answer()
            response = st.session_state['response']
            st.write(f"\n Question: {response['question']}")
            user_answer = st.text_input("Answer here: ", key="my_ans")

            if st.button("Evaluate"):
                score = eval_chain({"context": response['answer'],
                                    "answer": user_answer})
                st.write(f"You scored {score['score']}/10")
                if int(score['score']) < 6:
                    st.write(f"The correct answer would be: {response['answer']}")
                else:
                    st.write("Good job!!!")

                st.session_state['scoring'] = score['score']
            elif st.button("Don't know"):
                st.write(f"The correct answer would be: {response['answer']}")

    else:
        st.write("What's your question?")
        # Let the user ask a question against the document
        if "input_question" not in st.session_state or st.session_state['input_question'] == "":
            input_question = st.text_input("Here, input your question and click `Answer this`", key="Question")
            st.session_state['input_question'] = input_question
        answer_button = st.button('Answer this', key='my_question')

        if answer_button:
            input_question = st.session_state['input_question']
            docs = db.similarity_search(input_question, k=5)
            chain = get_conversational_chain()
            response = chain({"input_documents": docs,
                              "question": input_question,
                              })
            st.write(response['output_text'])
            del st.session_state['input_question']
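app.py persists the index with db.save_local but never reloads it, so every rerun rebuilds embeddings. A minimal sketch of how a later run could reuse the saved index, assuming the same embedding model; the folder name mirrors the hash_name scheme above, and newer langchain releases additionally require an allow_dangerous_deserialization=True argument:

from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Reload an index previously written by db.save_local(...).
# "faiss_Upload-PDF_index" is the name app.py would produce for the
# "Upload PDF" option; adjust to whichever option was used.
embeddings = OpenAIEmbeddings()
db = FAISS.load_local("faiss_Upload-PDF_index", embeddings)
docs = db.similarity_search("some topic", k=5)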
doc_loading.py
ADDED
@@ -0,0 +1,51 @@
import requests
from bs4 import BeautifulSoup

# Use 1: when people want to share a webpage and ask you to generate questions from it

def get_article_text(url):
    # Fetch the webpage content
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main article body
    article_body = soup.find('body')

    # Extract text from the body
    article_text = article_body.get_text()

    return article_text


# Use 2: when people want to upload a PDF and ask you to generate questions from it
import os
from PyPDF2 import PdfReader

# pdf_directory = os.getcwd()

# # Opening the pdf
# for filename in os.listdir(pdf_directory):
#     if filename.endswith(".pdf"):
#         pdf_file_name = filename
#         pdf_file = open(os.path.join(pdf_directory, filename), "rb")

def read_pdf_text(pdf_file):
    # Read the PDF, limited to the first 5 pages
    pdf_reader = PdfReader(pdf_file)
    all_text = ""
    for idx, page in enumerate(pdf_reader.pages):
        all_text += page.extract_text()
        if idx > 4:
            break
    return all_text

# print(all_text)

# Use 3: let the user paste a story to generate questions from

# Use 4: let the user share a video link to generate questions from
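A quick smoke test of the two loaders outside Streamlit; a sketch assuming a reachable URL and a local PDF (both the URL and "sample.pdf" are placeholders):

from doc_loading import get_article_text, read_pdf_text

# Webpage path: returns all visible body text, or None on a non-200 status.
text = get_article_text("https://example.com")
if text:
    print(text[:200])

# PDF path: reads at most the first five pages.
with open("sample.pdf", "rb") as f:
    print(read_pdf_text(f)[:200])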
llm_functions.py
ADDED
@@ -0,0 +1,77 @@
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import JsonOutputParser
import streamlit as st


def get_conversational_chain():
    prompt_template = """Answer the question in as much detail as possible from the provided context; make sure to provide all the details. If the answer is not present in the document,
    feel free to say "try asking something else, this information is not available". Don't provide a wrong answer no matter what is present in the question.\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    model = ChatOpenAI(temperature=0.7)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain


def get_topics_from_chunk():
    prompt_template = """
    I will give you a context, and you have to tell me the top 3 topics the text might belong to.
    If you are unable to find any, you can respond with <no_topic>, but don't output any rubbish topics.
    Do not write anything other than the topic names. Also, give the topics in a comma-separated way.\n\n
    context:\n{context}\n
    Answer:
    """
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context'])
    response = LLMChain(llm=model, prompt=prompt, output_key='top_topics')
    return response


def generate_qa_pairs():
    prompt_template = """
    Given a context, I want you to generate 1 question of toughness level {toughness} out of these three levels:
    Easy, Moderate and Tough. The question must belong to the topic: {topic}.
    Make sure the answer to the question you generate belongs to the context provided.
    Give me the list of question-answer pairs in JSON format, with each element of the list being a dict with keys "question" and "answer".\n\n
    {format_instructions}\n
    context:\n{context}\n
    """
    model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    parser = JsonOutputParser()
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context', 'topic', 'toughness'],
                            partial_variables={'format_instructions': parser.get_format_instructions()})

    chain = prompt | model | parser

    return chain


def evaluate_answer():
    prompt_template = """
    You are a good teacher. Suppose there are two texts: 1. real_answer and 2. user_answer. I want you to score the user_answer based on the real_answer.
    You have to give an integer score; the max score is 10 and the min score is 0. Be lenient in scoring,
    but if someone gives a rubbish/out-of-context answer, feel free to give a score of 0.
    Give the final output as an integer value only; we don't want an explanation from you.

    real_answer:\n{context}\n
    user_answer:\n{answer}\n

    score:
    """
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context', 'answer'])
    response = LLMChain(llm=model, prompt=prompt, output_key="score")
    return response
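generate_qa_pairs returns an LCEL pipeline (prompt | model | parser), so it can be exercised outside Streamlit. A minimal sketch, assuming OPENAI_API_KEY is set and that the model follows the prompt and returns the requested JSON list (the context string here is a placeholder):

from llm_functions import generate_qa_pairs

chain = generate_qa_pairs()
pairs = chain.invoke({
    "context": "The mitochondria is the powerhouse of the cell...",
    "topic": "biology",
    "toughness": "Easy",
})
# JsonOutputParser yields Python objects: per the prompt, a list of
# {"question": ..., "answer": ...} dicts, which is why app.py indexes [0].
print(pairs[0]["question"])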
requirements.txt
ADDED
@@ -0,0 +1,11 @@
langchain
openai
langchain_openai
PyPDF2
python-dotenv
requests
beautifulsoup4
faiss-cpu
streamlit
# random and collections.Counter are part of the Python standard
# library and need no pip entry.
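A quick way to confirm the dependencies resolve after pip install -r requirements.txt; a sketch, noting that a few import names differ from their PyPI package names:

# Import names corresponding to the requirements entries.
import langchain
import openai
import langchain_openai
import PyPDF2
import dotenv      # provided by python-dotenv
import requests
import bs4         # provided by beautifulsoup4
import faiss       # provided by faiss-cpu
import streamlit
print("all requirements import cleanly")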
utils.py
ADDED
@@ -0,0 +1,28 @@
from collections import Counter
import streamlit as st

def get_topics(texts, chain):
    # Ask the topics chain about each chunk and collect every topic it names
    all_topics = []
    for t in texts:
        response = chain(t.page_content)
        response = response['top_topics'].split(", ")
        all_topics.extend([x.strip() for x in response])
    # Keep the three most frequently suggested topics (case-insensitive)
    most_common_words = Counter([x.lower() for x in all_topics]).most_common(3)
    most_common_words_without_count = [word for word, _ in most_common_words]
    return most_common_words_without_count


# def question_answering(response, eval_chain):
#     for i in range(len(response)):
#         st.write(f"\n Question {i+1}: {response[i]['question']}")
#         user_answer = st.text_input(f"Answer {i+1} here: ")
#         if st.button(f"Evaluate {i+1}"):
#             score = eval_chain({"context": response[i]['answer'],
#                                 "answer": user_answer})
#             st.write(f"You scored {score['score']}/10")
#             if int(score['score']) < 6:
#                 st.write(f"The correct answer would be: {response[i]['answer']}")
#             else:
#                 st.write("Good Job!!!")
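get_topics only needs something callable on a chunk's text that returns a dict with a 'top_topics' string, so it can be unit-tested without a live LLMChain. A sketch with hypothetical stand-ins (the stub_chain and the fake chunks are not part of this repo):

from types import SimpleNamespace
from utils import get_topics

# Fake document chunks exposing the .page_content attribute utils expects.
chunks = [SimpleNamespace(page_content="text about the French Revolution"),
          SimpleNamespace(page_content="text about the Industrial Revolution")]

def stub_chain(text):
    # Mimics LLMChain's output dict, keyed by output_key='top_topics'.
    return {"top_topics": "history, Science, history"}

print(get_topics(chunks, stub_chain))  # -> ['history', 'science']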