Commit 7697a39 (parent: fae8909): Update app.py
app.py
CHANGED
@@ -1,3 +1,9 @@
+"""
+This module provides functions for working with PDF files and URLs. It uses the urllib.request library
+to download files from URLs, and the fitz library to extract text from PDF files. And GPT3 modules to generate
+text completions.
+"""
+
import urllib.request
import fitz
import re
@@ -7,18 +13,55 @@ import openai
import gradio as gr
import os
from sklearn.neighbors import NearestNeighbors
+from typing import Optional, Union, IO
+

-def download_pdf(url, output_path):
+def download_pdf(url: str, output_path: str) -> None:
+    """
+    Downloads a PDF file from the given URL and saves it to the specified output path.
+
+    Args:
+        url (str): The URL of the PDF file to be downloaded.
+        output_path (str): The file path where the downloaded PDF file will be saved.
+
+    Returns:
+        None
+    """
    urllib.request.urlretrieve(url, output_path)


-def preprocess(text):
+def preprocess(text: str) -> str:
+    """
+    Preprocesses the given text by replacing newline characters with spaces and removing extra whitespaces.
+
+    Args:
+        text (str): The input text to be preprocessed.
+
+    Returns:
+        str: The preprocessed text with newline characters replaced by spaces and extra whitespaces removed.
+
+    Example:
+        >>> preprocess("Hello\\n world!")
+        'Hello world!'
+    """
    text = text.replace('\n', ' ')
-    text = re.sub('\s+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
    return text


-def pdf_to_text(path, start_page=1, end_page=None):
+def pdf_to_text(path: str, start_page: int = 1, end_page: Optional[int] = None) -> list[str]:
+    """
+    Converts a PDF file to a list of text strings.
+
+    Args:
+        path (str): The path to the PDF file.
+        start_page (int): The page number to start extracting text from (default is 1).
+        end_page (int): Page number to stop extracting text at (default is None, which means extract text from all
+            pages).
+
+    Returns:
+        list: A list of text strings extracted from the PDF file.
+    """
    doc = fitz.open(path)
    total_pages = doc.page_count

@@ -36,16 +79,33 @@ def pdf_to_text(path, start_page=1, end_page=None):
    return text_list


-def text_to_chunks(texts, word_length=150, start_page=1):
+def text_to_chunks(texts: list[str], word_length: int = 150, start_page: int = 1) -> list[str]:
+    """
+    Splits a list of texts into chunks of specified length and formats them as strings.
+
+    Args:
+        - texts: A list of strings to be split into chunks.
+        - word_length: An integer representing the maximum number of words in each chunk. Default is 150.
+        - start_page: An integer representing the starting page number. Default is 1.
+
+    Returns:
+        - A list of formatted string chunks, where each chunk contains a page number, enclosed in square brackets,
+          followed by the chunk of text enclosed in double quotes.
+
+    Example:
+        >>> texts = ['This is a sample text for testing the function.', 'It should split the text into chunks of 5 words.']
+        >>> text_to_chunks(texts, word_length=5, start_page=3)
+        ['[3] "This is a sample text for"', '[3] "testing the function. It should"',
+         '[4] "split the text into chunks of"', '[4] "5 words."']
+    """
    text_toks = [t.split(' ') for t in texts]
-    page_nums = []
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                    len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
@@ -54,33 +114,78 @@ def text_to_chunks(texts, word_length=150, start_page=1):
    return chunks


-class SemanticSearch:
-
-    def __init__(self):
-        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+class SemanticSearch(object):
+    """
+    This class provides functionality for semantic search.
+    """
+
+    def __init__(self) -> None:
+        """
+        Initializes an instance of the class.
+
+        Attributes:
+        -----------
+        use : tensorflow_hub.KerasLayer
+            A pre-trained Universal Sentence Encoder model from TensorFlow Hub.
+        fitted : bool
+            A flag indicating whether the model has been fitted to data or not.
+        """
+        self.use = hub.load(
+            'https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False
-
-
-    def fit(self, data, batch=1000, n_neighbors=5):
+
+    def fit(self, data: list[str], batch: int = 1000, n_neighbors: int = 5) -> None:
+        """
+        Fits the nearest neighbor model to the given data.
+
+        Args:
+            data (list[str]): A list of strings to fit the model on.
+            batch (int): The batch size to use when computing text embeddings. Defaults to 1000.
+            n_neighbors (int): The number of nearest neighbors to find for each query. Defaults to 5.
+
+        Returns:
+            None
+        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True
-
-
-    def __call__(self, text, return_data=True):
+
+    def __call__(self, text: str, return_data: bool = True) -> Union[list[str], np.ndarray]:
+        """
+        Finds nearest neighbors to a given text in the embedding space.
+
+        Args:
+            text (str): The input text to find nearest neighbors for.
+            return_data (bool): Whether to return the actual data points corresponding to the nearest neighbors.
+                If False, returns only the indices of the nearest neighbors. Defaults to True.
+
+        Returns:
+            Union[List[str], np.ndarray]: If return_data is True, returns a list of strings representing the
+                nearest neighbors. If return_data is False, returns a numpy array of shape (n_neighbors,)
+                containing the indices of the nearest neighbors.
+        """
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors
-
-
-    def get_text_embedding(self, texts, batch=1000):
+
+    def get_text_embedding(self, texts: list[str], batch: int = 1000) -> np.ndarray:
+        """
+        Generates embeddings for a list of texts using the Universal Sentence Encoder.
+
+        Args:
+            texts (List[str]): A list of strings to generate embeddings for.
+            batch (int): The batch size to use when generating embeddings. Defaults to 1000.
+
+        Returns:
+            np.ndarray: An array of shape (n_texts, embedding_size) containing the embeddings for each text.
+        """
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
@@ -90,16 +195,47 @@ class SemanticSearch:
        return embeddings


-
-def load_recommender(path, start_page=1):
+def load_recommender(path: str, start_page: int = 1) -> str:
+    """
+    Loads embeddings from file if available, otherwise generates embeddings and saves them to file.
+
+    Args:
+        path (str): The path of the PDF file.
+        start_page (int): The page number to start generating embeddings from. Default is 1.
+
+    Returns:
+        str: A message indicating whether embeddings were loaded from file or generated and saved to file.
+    """
    global recommender
+    pdf_file = os.path.basename(path)
+    embeddings_file = f"{pdf_file}_{start_page}.npy"
+
+    if os.path.isfile(embeddings_file):
+        embeddings = np.load(embeddings_file)
+        recommender.embeddings = embeddings
+        recommender.fitted = True
+        return "Embeddings loaded from file"
+
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
+    np.save(embeddings_file, recommender.embeddings)
    return 'Corpus Loaded.'

-
-def generate_text(openAI_key,prompt, engine="text-davinci-003"):
+
+def generate_text(openai_key: str, prompt: str, engine: str = "text-davinci-003") -> str:
+    """
+    Generates text using OpenAI's GPT-3 language model.
+
+    Parameters:
+        openai_key (str): The API key for accessing OpenAI's API.
+        prompt (str): The starting text prompt to generate the text from.
+        engine (str): The ID of the language model to use. Defaults to "text-davinci-003".
+
+    Returns:
+        str: The generated text based on the given prompt.
+    """
+    openai.api_key = openai_key
    completions = openai.Completion.create(
        engine=engine,
        prompt=prompt,
@@ -112,60 +248,100 @@ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
    return message


-def generate_answer(question,openAI_key):
+def generate_text2(openai_key: str, prompt: str, engine: str = "gpt-3.5-turbo-0301") -> str:
+    """
+    Generates text using OpenAI's GPT-3 language model.
+
+    Args:
+        openai_key (str): The API key for accessing OpenAI's GPT-3 language model.
+        prompt (str): The user's prompt to generate a response to.
+        engine (str, optional): The name of the GPT-3 engine to use. Defaults to "gpt-3.5-turbo-0301".
+
+    Returns:
+        str: The generated text response from the GPT-3 language model.
+    """
+    openai.api_key = openai_key
+    messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
+                {'role': 'user', 'content': prompt}]
+
+    completions = openai.ChatCompletion.create(
+        model=engine,
+        messages=messages,
+        max_tokens=512,
+        n=1,
+        stop=None,
+        temperature=0.7,
+    )
+    message = completions.choices[0].message['content']
+    return message
+
+
+def generate_answer(question: str, openai_key: str) -> str:
+    """
+    Generates an answer to the given question using OpenAI's GPT-3 language model.
+
+    Args:
+        question (str): The question to answer.
+        openai_key (str): The API key for accessing OpenAI's GPT-3 API.
+
+    Returns:
+        str: The generated answer to the question.
+    """
    topn_chunks = recommender(question)
    prompt = ""
    prompt += 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'

    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
-
-
-
-
-
-
-
-
+              "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
+              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
+              "with the same name, create separate answers for each. Only include information found in the results and "\
+              "don't add any additional information. Make sure the answer is correct and don't output false content. "\
+              "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
+              "search results which has nothing to do with the question. Only answer what is asked. The "\
+              "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
+
    prompt += f"Query: {question}\nAnswer:"
-    answer = generate_text(openAI_key, prompt,"text-davinci-003")
+    answer = generate_text(openai_key, prompt, "text-davinci-003")
    return answer


-def question_answer(url, file, question,openAI_key):
-    if openAI_key.strip()=='':
+def question_answer(url: str, file: IO[str], question: str, openai_key: str) -> str:
+    """
+    Generates an answer to a given question using OpenAI's GPT-3 model.
+
+    Parameters:
+    -----------
+    url : str
+        The URL of a webpage to extract text from. If provided, the text will be saved as a PDF and used
+        as input for the model.
+    file : file-like object
+        A file object containing a PDF document to use as input for the model. If provided, the text will
+        be extracted from the PDF and used as input for the model.
+    question : str
+        The question to generate an answer for.
+    openai_key : str
+        An API key for accessing OpenAI's GPT-3 model.
+
+    Returns:
+    --------
+    str
+        The generated answer to the given question.
+
+    Raises:
+    -------
+    ValueError
+        If both `url` and `file` are empty or if both are provided.
+        If `question` is empty.
+    """
+    if openai_key.strip() == '':
        return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
    if url.strip() == '' and file == None:
-        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'
-
-    if url.strip() != '' and file != None:
-        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
-
-    if url.strip() != '':
-        glob_url = url
-        download_pdf(glob_url, 'corpus.pdf')
-        load_recommender('corpus.pdf')
-
-    else:
-        old_file_name = file.name
-        file_name = file.name
-        file_name = file_name[:-12] + file_name[-4:]
-        os.rename(old_file_name, file_name)
-        load_recommender(file_name)
-
-    if question.strip() == '':
-        return '[ERROR]: Question field is empty'
+        return '[ERROR]: Both URL and PDF is empty. Provide at least one.'

-    return generate_answer(question,openAI_key)
-def question_answer(url, file, question,openAI_key):
-    if openAI_key.strip()=='':
-        return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
-    if url.strip() == '' and file == None:
-        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'
-
    if url.strip() != '' and file != None:
-        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
+        return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'

    if url.strip() != '':
        glob_url = url
@@ -182,13 +358,22 @@ def question_answer(url, file, question,openAI_key):
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

-    return generate_answer(question,openAI_key)
+    return generate_answer(question, openai_key)


recommender = SemanticSearch()

title = 'PDF GPT'
-description = """
+description = """ What is PDF GPT ?
+1. The problem is that Open AI has a 4K token limit and cannot take an entire PDF file as input. Additionally,
+it sometimes returns irrelevant responses due to poor embeddings. ChatGPT cannot directly talk to external data.
+The solution is PDF GPT, which allows you to chat with an uploaded PDF file using GPT functionalities.
+The application breaks the document into smaller chunks and generates embeddings using a powerful Deep Averaging
+Network Encoder. A semantic search is performed on your query, and the top relevant chunks are used to generate a
+response.
+2. The returned response can even cite the page number in square brackets([]) where the information is located,
+adding credibility to the responses and helping to locate pertinent information quickly. The Responses are much
+better than the naive responses by Open AI."""

with gr.Blocks() as demo:

@@ -196,13 +381,18 @@ with gr.Blocks() as demo:
    gr.Markdown(description)

    with gr.Row():

        with gr.Group():
-            gr.Markdown(
-
+            gr.Markdown(
+                '<p style="text-align:center">'
+                'Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a>'
+                '</p>'
+            )
+            openAI_key = gr.Textbox(label='Enter your OpenAI API key here')
            url = gr.Textbox(label='Enter PDF URL here')
            gr.Markdown("<center><h4>OR<h4></center>")
-            file = gr.File(
+            file = gr.File(
+                label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
            question = gr.Textbox(label='Enter your question here')
            btn = gr.Button(value='Submit')
            btn.style(full_width=True)
@@ -210,6 +400,7 @@
        with gr.Group():
            answer = gr.Textbox(label='The answer to your question is :')

-        btn.click(question_answer, inputs=[
-
-demo.launch()
+        btn.click(question_answer, inputs=[
+                  url, file, question, openAI_key], outputs=[answer])
+
+demo.launch()
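
For reference, the retrieval step that the new description string summarizes (chunk the PDF text, embed the chunks with the Universal Sentence Encoder, then nearest-neighbour search) can be exercised on its own. A minimal sketch, assuming tensorflow_hub and scikit-learn are installed and the TF Hub model can be downloaded; the chunk strings here are invented stand-ins, not output of the app:

import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors

# The same Deep Averaging Network encoder SemanticSearch.__init__ loads.
use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

chunks = [
    '[1] "Open AI has a 4K token limit."',
    '[2] "Embeddings are generated per chunk of the document."',
    '[3] "Semantic search returns the most relevant chunks."',
]
emb = use(chunks).numpy()            # shape: (3, 512)

nn = NearestNeighbors(n_neighbors=2)
nn.fit(emb)

query = use(['How long can the input be?']).numpy()
idx = nn.kneighbors(query, return_distance=False)[0]
print([chunks[i] for i in idx])      # the top chunks that would go into the prompt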
|
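
The other behavioural change in this commit is the on-disk embeddings cache added to load_recommender. A minimal sketch of the same save/load pattern, where compute_embeddings is a hypothetical stand-in for the app's pdf_to_text / text_to_chunks / SemanticSearch.fit path:

import os
import numpy as np

def compute_embeddings(path: str, start_page: int) -> np.ndarray:
    # Hypothetical stand-in for the real encoding pipeline.
    return np.random.rand(10, 512)

def cached_embeddings(path: str, start_page: int = 1) -> np.ndarray:
    # Cache key mirrors the commit: "<pdf file name>_<start page>.npy".
    embeddings_file = f"{os.path.basename(path)}_{start_page}.npy"
    if os.path.isfile(embeddings_file):
        return np.load(embeddings_file)        # cache hit: skip re-encoding
    emb = compute_embeddings(path, start_page)
    np.save(embeddings_file, emb)              # cache miss: persist for next run
    return emb

Note that the key uses only the base file name and start page, so two different PDFs sharing a name would collide on the same cache entry.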