Atif20024 commited on
Commit
d67d72b
·
verified ·
1 Parent(s): be19fe3

Adding all basic files

Browse files
Files changed (5) hide show
  1. app.py +172 -0
  2. doc_loading.py +51 -0
  3. llm_functions.py +77 -0
  4. requirements.txt +11 -0
  5. utils.py +28 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py — Streamlit front end.
# Flow: the user supplies a document (PDF upload, pasted article, or a
# blog/YouTube link), the text is chunked and indexed into a FAISS vector
# store, and the user can then either be quizzed on the material or ask
# free-form questions answered from the retrieved chunks.
import streamlit as st
from doc_loading import get_article_text, read_pdf_text
from utils import get_topics
from llm_functions import generate_qa_pairs, evaluate_answer, get_conversational_chain, get_topics_from_chunk

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI  # NOTE(review): imported but never used in this file

# Load the OpenAI credentials (OPENAI_API_KEY) from a local .env file.
from dotenv import load_dotenv
load_dotenv()


# Setting up basics
st.set_page_config(page_title="LLM UC")
st.header("Let's check what you know")

# Greeting the user
st.write("Welcome!!")

# Bare expression: Streamlit "magic" renders the whole session_state on the
# page — presumably a leftover debugging aid; confirm before removing.
st.session_state
# Sidebar with selectbox

# NOTE(review): the selectbox below has no key="option", so session_state
# never gains an "option" entry and this guard is True on every rerun.
if "option" not in st.session_state:
    option = st.selectbox("How are you going to input your document?",
                          ("Upload PDF", "Blog link", "YouTube Link", "Paste copied article"))
# Conditionally show components based on user's choice
file_name = ""
main_text = ""

if option == "Upload PDF":
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], key="uploading_pdf")
    if uploaded_file is not None:
        # Process the uploaded file
        st.write("PDF uploaded successfully!, we will read only first 5 pages")
        main_text = read_pdf_text(uploaded_file)

elif option == "Paste copied article":
    main_text = st.text_area("Paste your article here", key="article_key")
    if st.button('Submit'):
        # Process the pasted text
        st.write("Text submitted successfully!")

else:
    # Remaining options are the two link-based inputs (blog / YouTube).
    link_input = st.text_input("Paste your blog/youtube link here.", key='link_key')
    if st.button('Submit'):
        # Process the link
        st.write("link submitted")
    if option == "YouTube Link":
        st.write("This functionality in under construction.")
        main_text = ""
    else:
        # lets try now
        article_text = get_article_text(link_input)
        if article_text:
            # NOTE(review): the fetched text is discarded — blog ingestion
            # appears intentionally disabled while under construction.
            main_text = ""
            st.write("This functionality in under construction.")
        else:
            st.write("Unable to fetch text from url. Can you please check the link?")


# Show a warning if the user hasn't selected an option or if the uploaded file is not a PDF

if option == "Upload PDF":
    if uploaded_file is not None and uploaded_file.type != "application/pdf":
        st.error("Please choose a PDF file only.")

# Cache the raw document text once per session.
if "total_text" not in st.session_state:
    st.session_state['total_text'] = main_text

if len(main_text) > 0:
    # creating chunks of the given article
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )
    texts = text_splitter.create_documents([main_text])

    # building vector database

    embeddings = OpenAIEmbeddings()
    # store in vector db
    db = FAISS.from_documents(texts, embeddings)

    # Persist the index locally, named after the chosen input method.
    hash_name = f"{option.replace(' ', '-')}"
    db.save_local(f'faiss_{hash_name}_index')

    # Create a toggle button
    toggle_button_state = st.checkbox("check this, if i should quiz you")

    # Display a message based on the toggle button state
    if toggle_button_state:
        # make the quiz
        st.write("Quiz incoming...")

        # give selection of toughness
        toughness_selection = None
        # Runs at most one iteration per rerun: storing the key ends the
        # loop, so a plain `if` would behave identically here.
        while "toughness_selection" not in st.session_state:
            toughness_selection = st.selectbox("Select the question toughness",
                                               ("Easy", "Moderate", "Tough"))
            st.session_state['toughness_selection'] = toughness_selection

        top_topics = None
        if "top_topics" not in st.session_state:
            # Ask the LLM for candidate topics from the first 10 chunks only,
            # presumably to bound cost/latency.
            topics_chain = get_topics_from_chunk()
            top_topics = get_topics(texts[:10], topics_chain)
            top_topics.append("Any")
            st.session_state['top_topics'] = top_topics

        topic_selection = None
        if "topic_selection" not in st.session_state:
            top_topics = st.session_state['top_topics']
            topic_selection = st.selectbox("Select the topic i should quiz from!!",
                                           tuple(top_topics), key="topic_selection")
            # NOTE(review): key typo — this stores 'topics_selection'
            # (plural) while the rest of the app reads 'topic_selection',
            # which only exists because the widget key above writes it.
            st.session_state['topics_selection'] = topic_selection
        toughness_selection = st.session_state['toughness_selection']



        # Generate one question/answer pair per session for the chosen
        # topic and toughness, grounded in the most similar chunks.
        if "response" not in st.session_state:
            ques_chain = generate_qa_pairs()
            topic_selection = st.session_state['topic_selection']
            toughness_selection = st.session_state['toughness_selection']
            # st.write(f"here we go, a {toughness_selection} level question from {topic_selection} topic.")
            docs_for_questions = db.similarity_search(topic_selection, k=5)
            response = ques_chain.invoke({"context": docs_for_questions,
                                          "topic": topic_selection,
                                          'toughness': toughness_selection})
            # The chain returns a list of Q/A dicts; only the first is kept.
            st.session_state['response'] = response[0]

        if "scoring" not in st.session_state:
            eval_chain = evaluate_answer()
            response = st.session_state['response']
            st.write(f"\n Question: {response['question']}")
            user_answer = st.text_input(f"Answer here: ", key="my_ans")

            if st.button(f"Evaluate"):
                # Score the user's answer (0-10) against the reference answer.
                score = eval_chain({"context": response['answer'],
                                    "answer": user_answer})
                st.write(f"You scored {score['score']}/10")
                if int(score['score'])<6:
                    st.write(f"The correct answer would be: {response['answer']}")
                else:
                    st.write("Good Job!!!")

                st.session_state['scoring'] = score['score']
            elif st.button("Don't know"):
                st.write(f"The correct answer would be: {response['answer']}")

    else:
        # Quiz toggle is off: free-form Q&A over the indexed document.
        st.write("What's your question?")
        # let the user ask question.


        if "input_question" not in st.session_state or st.session_state['input_question'] == "":
            input_question = st.text_input("Here, input your question and click `Answer this`", key="Question")
            st.session_state['input_question'] = input_question
        answer_button = st.button('Answer this', key='my_question')

        if answer_button:
            input_question = st.session_state['input_question']
            docs = db.similarity_search(input_question, k=5)
            chain = get_conversational_chain()
            response = chain({"input_documents" : docs,
                              "question": input_question,
                              })
            st.write(response['output_text'])
            # Clear the stored question so the next rerun shows a fresh box.
            del st.session_state['input_question']
172
+
doc_loading.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
+ # Use 1: when people want to share a webpage and ask you to generate question from here
5
+
6
def get_article_text(url, timeout=10):
    """Fetch *url* and return the visible text of its <body>, or None on failure.

    Args:
        url: Address of the web page to download.
        timeout: Seconds to wait for the HTTP response. Added (with a safe
            default) so a dead link cannot hang the Streamlit app forever.

    Returns:
        The page's body text as a single string, or None when the request
        errors out, returns a non-200 status, or the page has no <body>.
    """
    # Fetch the webpage content; network failures used to propagate as
    # uncaught exceptions into the UI.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch the webpage: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main article body. Fixed: a body-less document (e.g. an XML
    # feed) previously crashed with AttributeError on .get_text().
    article_body = soup.find('body')
    if article_body is None:
        print("No <body> element found in the fetched page.")
        return None

    # Extract text from the body
    return article_body.get_text()
23
+
24
+
25
+ # Use 2. When people want to upload a pdf and ask you to generate question from it
26
+ import os
27
+ from PyPDF2 import PdfReader
28
+
29
+ # pdf_directory = os.getcwd()
30
+
31
+ # # Opening the pdf
32
+ # for filename in os.listdir(pdf_directory):
33
+ # if filename.endswith(".pdf"):
34
+ # pdf_file_name = filename
35
+ # pdf_file = open(os.path.join(pdf_directory, filename), "rb")
36
def read_pdf_text(pdf_file, max_pages=5):
    """Return the concatenated text of the first *max_pages* pages of a PDF.

    Args:
        pdf_file: A file path or binary file-like object accepted by
            PyPDF2's PdfReader (e.g. a Streamlit UploadedFile).
        max_pages: Page cap; defaults to 5 to match the UI's promise of
            "first 5 pages".

    Returns:
        The extracted text of the capped pages as one string.
    """
    # Reading the pdf
    pdf_reader = PdfReader(pdf_file)
    parts = []
    for idx, page in enumerate(pdf_reader.pages):
        # Fixed off-by-one: the old loop broke only AFTER extracting page
        # index 5 (`if idx > 4: break` post-append), so it read six pages.
        if idx >= max_pages:
            break
        parts.append(page.extract_text())
    # Join once instead of repeated string concatenation.
    return "".join(parts)
46
+
47
+ # print(all_text)
48
+
49
+ # Use 3. let the user input an story to generate questions from
50
+
51
+ # Use 4. let the use share a video link to generate questions from
llm_functions.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains.question_answering import load_qa_chain
2
+ from langchain_openai import ChatOpenAI
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain.chains import LLMChain
5
+ from langchain_core.output_parsers import JsonOutputParser
6
+ import streamlit as st
7
+
8
+
9
def get_conversational_chain():
    """Build a "stuff"-type QA chain that answers strictly from given docs.

    Returns:
        A load_qa_chain result; callers invoke it with
        {"input_documents": docs, "question": ...} and read 'output_text'.
    """
    # Fixed prompt: the instruction was inverted ("if the answer IS present
    # ... say the information is not available") and contained the typo
    # "ansking", both of which the model saw verbatim.
    prompt_template = """Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not present in the document
    feel free to say, "try asking something else, this information is not available", don't provide the wrong answer no matter what is present in the question\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    model = ChatOpenAI(temperature=0.7)
    # Fixed: input_variables previously listed "Question" (capital Q), which
    # does not match the {question} placeholder used in the template.
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain
23
+
24
+
25
def get_topics_from_chunk():
    """Build an LLMChain that names the top 3 topics of a text chunk.

    The chain takes a single ``context`` input and exposes the model's
    comma-separated topic list under the ``top_topics`` output key
    (``<no_topic>`` when the model finds none).
    """
    prompt_template = """
    I will give a context, and you have to tell me what top 3 topics the text might belong to.
    if you unable to find any, you can respond with <no_topic>, but dont output any rubbish topics.
    do not write anything other than the topics names. also, give the topics in a comma separted way.\n\n
    context:\n{context}\n
    Answer:
    """
    # Build the prompt first, then attach the model to it.
    topic_prompt = PromptTemplate(template=prompt_template,
                                  input_variables=['context'])
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    topics_chain = LLMChain(llm=llm, prompt=topic_prompt, output_key='top_topics')
    return topics_chain
38
+
39
+
40
+
41
+
42
def generate_qa_pairs():
    """Build a runnable chain (prompt | model | JSON parser) that emits one
    question/answer pair for a given context, topic and toughness level.

    Returns:
        A chain invoked with {"context": ..., "topic": ..., "toughness": ...};
        the JSON parser yields a list of {"question", "answer"} dicts.
    """
    # Fixed: 'format_instructions' was supplied as a partial variable but the
    # template had no {format_instructions} placeholder, so the JSON parser's
    # formatting guidance never reached the model.
    prompt_template = """
    Given a context, i want you to generate 1 question of toughness level: {toughness} out of these three levels
    Easy, Moderate and Tough. The question must belong to topic: {topic}.
    Make sure the answer to the question you generate belong to the context provided.
    give me the list of question answer pair in json format, with each element of list containing a dict with keys "question" and "answer".\n\n
    {format_instructions}\n
    context:\n{context}\n
    """
    model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    parser = JsonOutputParser()
    prompt = PromptTemplate(template=prompt_template,
                            input_variables=['context', 'topic', 'toughness'],
                            partial_variables={'format_instructions': parser.get_format_instructions()})

    # LCEL pipeline: prompt -> chat model -> JSON parser.
    chain = prompt | model | parser

    return chain
59
+
60
+
61
def evaluate_answer():
    """Build an LLMChain that grades a user's answer against the reference.

    Inputs are ``context`` (the reference answer) and ``answer`` (the user's
    attempt); the model's 0-10 integer grade is exposed under ``score``.
    """
    prompt_template = """
    You are good teacher, suppose there are two texts, 1. real_answer and 2. user_answer. i want you to score the user_answer based on real_answer
    you have to give integer score. max score can be 10, and min score can be 0. Be lenient in scoring,
    but if someone give rubbish answer/out of context answer, feel free to give 0 score.
    give final output as an integer value only. we dont want an explanation from you.

    real_answer:\n{context}\n
    user_answer:\n{answer}\n

    score:
    """
    # Prompt first, then a low-temperature model for stable grading.
    scoring_prompt = PromptTemplate(template=prompt_template,
                                    input_variables=['context', 'answer'])
    grader_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    return LLMChain(llm=grader_llm, prompt=scoring_prompt, output_key="score")
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Core app
streamlit
python-dotenv

# LLM / retrieval stack
langchain
langchain_openai
openai
faiss-cpu

# Document loading
PyPDF2
requests
beautifulsoup4

# NOTE: the former entries `random` and `counter` were removed — `random`
# and `collections.Counter` ship with the Python standard library, and the
# PyPI packages by those names are unrelated projects.
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ import streamlit as st
3
+
4
def get_topics(texts, chain):
    """Return the 3 most frequent topics across a sample of document chunks.

    Args:
        texts: Iterable of chunk objects exposing ``page_content``
            (e.g. LangChain Documents).
        chain: Callable taking a chunk's text and returning a mapping whose
            'top_topics' key holds a comma-separated topic string (see
            llm_functions.get_topics_from_chunk).

    Returns:
        Up to three lowercase topic names, most frequent first. The model's
        explicit "<no_topic>" sentinel is now ignored instead of being
        counted as a real topic (previously it could leak into the quiz
        topic list shown to the user).
    """
    all_topics = []
    for chunk in texts:
        response = chain(chunk.page_content)
        topics = [t.strip() for t in response['top_topics'].split(", ")]
        # Drop the sentinel the prompt tells the model to emit when it
        # cannot find any topic.
        all_topics.extend(t for t in topics if t.lower() != "<no_topic>")
    counts = Counter(t.lower() for t in all_topics)
    return [topic for topic, _ in counts.most_common(3)]
13
+
14
+
15
+
16
+ # def question_answering(response, eval_chain):
17
+ # for i in range(len(response)):
18
+ # st.write(f"\n Question {i+1}: {response[i]['question']}")
19
+ # user_answer = st.text_input(f"Answer {i+1} here: ")
20
+ # if st.button(f"Evaluate {i+1}"):
21
+ # score = eval_chain({"context": response[i]['answer'],
22
+ # "answer": user_answer})
23
+ # st.write(f"You scored {score['score']}/10")
24
+ # if int(score['score'])<6:
25
+ # st.write(f"The correct answer would be: {response[i]['answer']}")
26
+ # else:
27
+ # st.write("Good Job!!!")
28
+