Adding all basic files
- app.py +172 -0
- doc_loading.py +51 -0
- llm_functions.py +77 -0
- requirements.txt +11 -0
- utils.py +28 -0
app.py
ADDED
@@ -0,0 +1,172 @@
import streamlit as st
from doc_loading import get_article_text, read_pdf_text
from utils import get_topics
from llm_functions import generate_qa_pairs, evaluate_answer, get_conversational_chain, get_topics_from_chunk

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS

from dotenv import load_dotenv
load_dotenv()


# Setting up basics
st.set_page_config(page_title="LLM UC")
st.header("Let's check what you know")

# Greeting the user
st.write("Welcome!!")

st.session_state  # debug: Streamlit "magic" renders the current session state

# Selectbox for choosing how the document comes in; the choice drives the rest of the page
option = st.selectbox("How are you going to input your document?",
                      ("Upload PDF", "Blog link", "YouTube Link", "Paste copied article"))

# Conditionally show components based on the user's choice
file_name = ""
main_text = ""

if option == "Upload PDF":
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], key="uploading_pdf")
    if uploaded_file is not None:
        # Process the uploaded file
        st.write("PDF uploaded successfully! We will read only the first 5 pages.")
        main_text = read_pdf_text(uploaded_file)

elif option == "Paste copied article":
    main_text = st.text_area("Paste your article here", key="article_key")
    if st.button('Submit'):
        # Process the pasted text
        st.write("Text submitted successfully!")

else:
    link_input = st.text_input("Paste your blog/YouTube link here.", key='link_key')
    if st.button('Submit'):
        # Process the link
        st.write("Link submitted.")
        if option == "YouTube Link":
            st.write("This functionality is under construction.")
            main_text = ""
        else:
            article_text = get_article_text(link_input)
            if article_text:
                main_text = ""
                st.write("This functionality is under construction.")
            else:
                st.write("Unable to fetch text from the URL. Can you please check the link?")


# Show a warning if the uploaded file is not a PDF
if option == "Upload PDF":
    if uploaded_file is not None and uploaded_file.type != "application/pdf":
        st.error("Please choose a PDF file only.")

if "total_text" not in st.session_state:
    st.session_state['total_text'] = main_text

if len(main_text) > 0:
    # Creating chunks of the given article
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )
    texts = text_splitter.create_documents([main_text])

    # Building the vector database
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(texts, embeddings)

    hash_name = f"{option.replace(' ', '-')}"
    db.save_local(f'faiss_{hash_name}_index')

    # Toggle between quiz mode and free-form Q&A
    toggle_button_state = st.checkbox("Check this if I should quiz you")

    if toggle_button_state:
        # Build the quiz
        st.write("Quiz incoming...")

        # Let the user pick the question toughness (stored once in session state)
        if "toughness_selection" not in st.session_state:
            toughness_selection = st.selectbox("Select the question toughness",
                                               ("Easy", "Moderate", "Tough"))
            st.session_state['toughness_selection'] = toughness_selection

        # Extract candidate topics from the first few chunks (stored once in session state)
        if "top_topics" not in st.session_state:
            topics_chain = get_topics_from_chunk()
            top_topics = get_topics(texts[:10], topics_chain)
            top_topics.append("Any")
            st.session_state['top_topics'] = top_topics

        if "topic_selection" not in st.session_state:
            top_topics = st.session_state['top_topics']
            # The widget key already stores the selection in st.session_state['topic_selection']
            topic_selection = st.selectbox("Select the topic I should quiz you from!",
                                           tuple(top_topics), key="topic_selection")

        if "response" not in st.session_state:
            ques_chain = generate_qa_pairs()
            topic_selection = st.session_state['topic_selection']
            toughness_selection = st.session_state['toughness_selection']
            docs_for_questions = db.similarity_search(topic_selection, k=5)
            response = ques_chain.invoke({"context": docs_for_questions,
                                          "topic": topic_selection,
                                          "toughness": toughness_selection})
            st.session_state['response'] = response[0]

    if "scoring" not in st.session_state and "response" in st.session_state:
            eval_chain = evaluate_answer()
            response = st.session_state['response']
            st.write(f"\n Question: {response['question']}")
            user_answer = st.text_input("Answer here: ", key="my_ans")

            if st.button("Evaluate"):
                score = eval_chain({"context": response['answer'],
                                    "answer": user_answer})
                st.write(f"You scored {score['score']}/10")
                if int(score['score']) < 6:
                    st.write(f"The correct answer would be: {response['answer']}")
                else:
                    st.write("Good job!!!")

                st.session_state['scoring'] = score['score']
            elif st.button("Don't know"):
                st.write(f"The correct answer would be: {response['answer']}")

    if not toggle_button_state:
        # Let the user ask a question.
        st.write("What's your question?")

        if "input_question" not in st.session_state or st.session_state['input_question'] == "":
            input_question = st.text_input("Here, input your question and click `Answer this`", key="Question")
            st.session_state['input_question'] = input_question
        answer_button = st.button('Answer this', key='my_question')

        if answer_button:
            input_question = st.session_state['input_question']
            docs = db.similarity_search(input_question, k=5)
            chain = get_conversational_chain()
            response = chain({"input_documents": docs,
                              "question": input_question})
            st.write(response['output_text'])
            del st.session_state['input_question']
doc_loading.py
ADDED
@@ -0,0 +1,51 @@
import requests
from bs4 import BeautifulSoup

import os
from PyPDF2 import PdfReader

# Use 1: when people want to share a webpage and ask you to generate questions from it

def get_article_text(url):
    # Fetch the webpage content
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main article body
    article_body = soup.find('body')

    # Extract text from the body
    article_text = article_body.get_text()

    return article_text


# Use 2: when people want to upload a PDF and ask you to generate questions from it

# pdf_directory = os.getcwd()

# # Opening the pdf
# for filename in os.listdir(pdf_directory):
#     if filename.endswith(".pdf"):
#         pdf_file_name = filename
#         pdf_file = open(os.path.join(pdf_directory, filename), "rb")

def read_pdf_text(pdf_file):
    # Reading the pdf, limited to the first 5 pages
    pdf_reader = PdfReader(pdf_file)
    all_text = ""
    for idx, page in enumerate(pdf_reader.pages):
        all_text += page.extract_text() or ""  # extract_text() may return None for image-only pages
        if idx >= 4:  # stop after 5 pages
            break
    return all_text

# print(all_text)

# Use 3: let the user input a story to generate questions from

# Use 4: let the user share a video link to generate questions from
llm_functions.py
ADDED
@@ -0,0 +1,77 @@
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import JsonOutputParser
import streamlit as st


def get_conversational_chain():
    prompt_template = """Answer the question as detailed as possible from the provided context; make sure to provide all the details. If the answer is not present in the document,
    feel free to say, "Try asking something else, this information is not available". Don't provide a wrong answer no matter what is asked in the question.\n\n
    Context:\n {context}\n
    Question: \n{question}\n

    Answer:
    """
    model = ChatOpenAI(temperature=0.7)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain


def get_topics_from_chunk():
    prompt_template = """
    I will give you a context, and you have to tell me the top 3 topics the text might belong to.
    If you are unable to find any, you can respond with <no_topic>, but don't output any rubbish topics.
    Do not write anything other than the topic names. Also, give the topics in a comma-separated way.\n\n
    context:\n{context}\n
    Answer:
    """
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context'])
    chain = LLMChain(llm=model, prompt=prompt, output_key='top_topics')
    return chain


def generate_qa_pairs():
    prompt_template = """
    Given a context, I want you to generate 1 question of toughness level {toughness}, out of these three levels:
    Easy, Moderate and Tough. The question must belong to the topic: {topic}.
    Make sure the answer to the question you generate belongs to the context provided.
    Give me the list of question-answer pairs in JSON format, with each element of the list being a dict with the keys "question" and "answer".\n
    {format_instructions}\n
    context:\n{context}\n
    """
    model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    parser = JsonOutputParser()
    # {format_instructions} must appear in the template for the partial variable to take effect
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context', 'topic', 'toughness'],
                            partial_variables={'format_instructions': parser.get_format_instructions()})

    chain = prompt | model | parser

    return chain


def evaluate_answer():
    prompt_template = """
    You are a good teacher. Suppose there are two texts: 1. real_answer and 2. user_answer. I want you to score the user_answer against the real_answer.
    You have to give an integer score; the max score is 10 and the min score is 0. Be lenient in scoring,
    but if someone gives a rubbish or out-of-context answer, feel free to give a score of 0.
    Give the final output as an integer value only. We don't want an explanation from you.

    real_answer:\n{context}\n
    user_answer:\n{answer}\n

    score:
    """
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context', 'answer'])
    chain = LLMChain(llm=model, prompt=prompt, output_key="score")
    return chain
requirements.txt
ADDED
@@ -0,0 +1,11 @@
langchain
openai
langchain_openai
PyPDF2
python-dotenv
requests
beautifulsoup4
faiss-cpu
streamlit
# random and Counter come from the Python standard library
# (random, collections.Counter) and must not be listed as pip dependencies.
utils.py
ADDED
@@ -0,0 +1,28 @@
from collections import Counter
import streamlit as st

def get_topics(texts, chain):
    all_topics = []
    for t in texts:
        response = chain(t.page_content)
        topics = response['top_topics'].split(",")
        all_topics.extend([x.strip() for x in topics])
    # Keep the 3 most frequently suggested topics across chunks
    most_common_words = Counter([x.lower() for x in all_topics]).most_common(3)
    most_common_words_without_count = [word for word, _ in most_common_words]
    return most_common_words_without_count


# def question_answering(response, eval_chain):
#     for i in range(len(response)):
#         st.write(f"\n Question {i+1}: {response[i]['question']}")
#         user_answer = st.text_input(f"Answer {i+1} here: ")
#         if st.button(f"Evaluate {i+1}"):
#             score = eval_chain({"context": response[i]['answer'],
#                                 "answer": user_answer})
#             st.write(f"You scored {score['score']}/10")
#             if int(score['score']) < 6:
#                 st.write(f"The correct answer would be: {response[i]['answer']}")
#             else:
#                 st.write("Good Job!!!")