bhaskartripathi commited on
Commit
1f19bae
·
1 Parent(s): 85846d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -81
app.py CHANGED
@@ -7,10 +7,6 @@ import openai
7
  import gradio as gr
8
  import os
9
  from sklearn.neighbors import NearestNeighbors
10
- from jina import Document, DocumentArray
11
-
12
- # Create a new DocumentArray for file storage
13
- doc_array = DocumentArray()
14
 
15
  def download_pdf(url, output_path):
16
  urllib.request.urlretrieve(url, output_path)
@@ -21,17 +17,6 @@ def preprocess(text):
21
  text = re.sub('\s+', ' ', text)
22
  return text
23
 
24
- # Store a file in the DocumentArray
25
- def store_file_in_docarray(file_name, file_content):
26
- doc = Document(id=file_name, content=file_content)
27
- doc_array.append(doc)
28
-
29
- # Retrieve a file from the DocumentArray
30
- def get_file_from_docarray(file_name):
31
- for doc in doc_array:
32
- if doc.id == file_name:
33
- return doc.content
34
- return None
35
 
36
  def pdf_to_text(path, start_page=1, end_page=None):
37
  doc = fitz.open(path)
@@ -50,24 +35,13 @@ def pdf_to_text(path, start_page=1, end_page=None):
50
  doc.close()
51
  return text_list
52
 
53
- def text_to_chunks(texts, file_names, word_length=150, start_page=1):
 
54
  text_toks = [t.split(' ') for t in texts]
55
  page_nums = []
56
  chunks = []
57
 
58
- total_pages = len(texts)
59
-
60
  for idx, words in enumerate(text_toks):
61
- current_file_idx = 0
62
- current_page = idx + start_page
63
-
64
- for i, num_pages in enumerate(file_names.values()):
65
- if current_page > num_pages:
66
- current_page -= num_pages
67
- current_file_idx += 1
68
- else:
69
- break
70
-
71
  for i in range(0, len(words), word_length):
72
  chunk = words[i:i+word_length]
73
  if (i+word_length) > len(words) and (len(chunk) < word_length) and (
@@ -75,18 +49,18 @@ def text_to_chunks(texts, file_names, word_length=150, start_page=1):
75
  text_toks[idx+1] = chunk + text_toks[idx+1]
76
  continue
77
  chunk = ' '.join(chunk).strip()
78
- chunk = f'[{list(file_names.keys())[current_file_idx]}, Page no. {current_page}]' + ' ' + '"' + chunk + '"'
79
  chunks.append(chunk)
80
  return chunks
81
 
82
 
83
-
84
  class SemanticSearch:
85
 
86
  def __init__(self):
87
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
88
  self.fitted = False
89
 
 
90
  def fit(self, data, batch=1000, n_neighbors=5):
91
  self.data = data
92
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -95,6 +69,7 @@ class SemanticSearch:
95
  self.nn.fit(self.embeddings)
96
  self.fitted = True
97
 
 
98
  def __call__(self, text, return_data=True):
99
  inp_emb = self.use([text])
100
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
@@ -104,6 +79,7 @@ class SemanticSearch:
104
  else:
105
  return neighbors
106
 
 
107
  def get_text_embedding(self, texts, batch=1000):
108
  embeddings = []
109
  for i in range(0, len(texts), batch):
@@ -113,21 +89,17 @@ class SemanticSearch:
113
  embeddings = np.vstack(embeddings)
114
  return embeddings
115
 
116
- def load_recommender(paths, start_page=1):
117
- global recommender
118
- all_texts = []
119
- file_names = {}
120
 
121
- for path in paths:
122
- texts = pdf_to_text(path, start_page=start_page)
123
- all_texts.extend(texts)
124
- file_names[os.path.basename(path)] = len(texts)
125
 
126
- chunks = text_to_chunks(all_texts, file_names, start_page=start_page)
 
 
 
127
  recommender.fit(chunks)
128
  return 'Corpus Loaded.'
129
 
130
- def generate_text(openAI_key, prompt, engine="text-davinci-003"):
 
131
  openai.api_key = openAI_key
132
  completions = openai.Completion.create(
133
  engine=engine,
@@ -140,7 +112,8 @@ def generate_text(openAI_key, prompt, engine="text-davinci-003"):
140
  message = completions.choices[0].text
141
  return message
142
 
143
- def generate_answer(question, openAI_key):
 
144
  topn_chunks = recommender(question)
145
  prompt = ""
146
  prompt += 'search results:\n\n'
@@ -148,61 +121,51 @@ def generate_answer(question, openAI_key):
148
  prompt += c + '\n\n'
149
 
150
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
151
- "Cite each reference using [PDF FILE NAME, PAGE NUMBER:] notation (every result has this number at the beginning). "\
152
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
153
  "with the same name, create separate answers for each. Only include information found in the results and "\
154
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
155
- "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
156
- "search results which have nothing to do with the question. Only answer what is asked. The "\
157
- "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
 
158
  prompt += f"Query: {question}\nAnswer:"
159
- answer = generate_text(openAI_key, prompt, "text-davinci-003")
160
  return answer
161
 
162
- def question_answer(url, files, question, openAI_key):
163
- if openAI_key.strip() == '':
164
- return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
165
- if url.strip() == '' and not files:
166
- return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
167
-
168
- if url.strip() != '' and files:
169
- return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'
170
 
171
- pdf_paths = []
 
 
 
 
 
 
 
172
 
173
  if url.strip() != '':
174
  glob_url = url
175
- output_path = 'corpus.pdf'
176
- download_pdf(glob_url, output_path)
177
- pdf_paths.append(output_path)
178
 
179
  else:
180
- for file in files:
181
- old_file_name = file.name
182
- file_name = file.name
183
- file_name = file_name[:-12] + file_name[-4:]
184
- os.rename(old_file_name, file_name)
185
- pdf_paths.append(file_name)
186
 
187
  if question.strip() == '':
188
  return '[ERROR]: Question field is empty'
189
 
190
- # Store the PDF content in the DocumentArray
191
- for pdf_path in pdf_paths:
192
- with open(pdf_path, "rb") as f:
193
- content = f.read()
194
- store_file_in_docarray(pdf_path, content)
195
 
196
- # Load the recommender
197
- load_recommender(pdf_paths)
198
-
199
- # Generate an answer
200
- return generate_answer(question, openAI_key)
201
 
202
  recommender = SemanticSearch()
203
- title = 'PDF GPT (Sandbox)'
204
 
205
- description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number and file name in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
 
 
206
 
207
  with gr.Blocks() as demo:
208
 
@@ -213,10 +176,10 @@ with gr.Blocks() as demo:
213
 
214
  with gr.Group():
215
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
216
- openAI_key = gr.Textbox(label='Enter your OpenAI API key here')
217
  url = gr.Textbox(label='Enter PDF URL here')
218
  gr.Markdown("<center><h4>OR<h4></center>")
219
- files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count='multiple')
220
  question = gr.Textbox(label='Enter your question here')
221
  btn = gr.Button(value='Submit')
222
  btn.style(full_width=True)
@@ -224,7 +187,6 @@ with gr.Blocks() as demo:
224
  with gr.Group():
225
  answer = gr.Textbox(label='The answer to your question is :')
226
 
227
- btn.click(question_answer, inputs=[url, files, question, openAI_key], outputs=[answer])
228
-
229
  demo.launch()
230
-
 
7
  import gradio as gr
8
  import os
9
  from sklearn.neighbors import NearestNeighbors
 
 
 
 
10
 
11
def download_pdf(url, output_path):
    """Fetch the document at *url* and store it at *output_path*.

    Thin wrapper over urllib's retrieve helper; any network failure
    (e.g. urllib.error.URLError) propagates to the caller.
    """
    urllib.request.urlretrieve(url, output_path)
 
17
  text = re.sub('\s+', ' ', text)
18
  return text
19
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def pdf_to_text(path, start_page=1, end_page=None):
22
  doc = fitz.open(path)
 
35
  doc.close()
36
  return text_list
37
 
38
+
39
+ def text_to_chunks(texts, word_length=150, start_page=1):
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
 
 
 
44
  for idx, words in enumerate(text_toks):
 
 
 
 
 
 
 
 
 
 
45
  for i in range(0, len(words), word_length):
46
  chunk = words[i:i+word_length]
47
  if (i+word_length) > len(words) and (len(chunk) < word_length) and (
 
49
  text_toks[idx+1] = chunk + text_toks[idx+1]
50
  continue
51
  chunk = ' '.join(chunk).strip()
52
+ chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
53
  chunks.append(chunk)
54
  return chunks
55
 
56
 
 
57
  class SemanticSearch:
58
 
59
  def __init__(self):
60
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
61
  self.fitted = False
62
 
63
+
64
  def fit(self, data, batch=1000, n_neighbors=5):
65
  self.data = data
66
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
69
  self.nn.fit(self.embeddings)
70
  self.fitted = True
71
 
72
+
73
  def __call__(self, text, return_data=True):
74
  inp_emb = self.use([text])
75
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
 
79
  else:
80
  return neighbors
81
 
82
+
83
  def get_text_embedding(self, texts, batch=1000):
84
  embeddings = []
85
  for i in range(0, len(texts), batch):
 
89
  embeddings = np.vstack(embeddings)
90
  return embeddings
91
 
 
 
 
 
92
 
 
 
 
 
93
 
94
def load_recommender(path, start_page=1):
    """Build the global semantic-search index from the PDF at *path*.

    Extracts per-page text starting at *start_page*, splits it into
    word chunks, and fits the module-level ``recommender`` on them.
    Returns a short status string for the UI.
    """
    global recommender
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
100
 
101
+
102
+ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
103
  openai.api_key = openAI_key
104
  completions = openai.Completion.create(
105
  engine=engine,
 
112
  message = completions.choices[0].text
113
  return message
114
 
115
+
116
+ def generate_answer(question,openAI_key):
117
  topn_chunks = recommender(question)
118
  prompt = ""
119
  prompt += 'search results:\n\n'
 
121
  prompt += c + '\n\n'
122
 
123
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
124
+ "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
125
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
126
  "with the same name, create separate answers for each. Only include information found in the results and "\
127
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
128
+ "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
129
+ "search results which has nothing to do with the question. Only answer what is asked. The "\
130
+ "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
131
+
132
  prompt += f"Query: {question}\nAnswer:"
133
+ answer = generate_text(openAI_key, prompt,"text-davinci-003")
134
  return answer
135
 
 
 
 
 
 
 
 
 
136
 
137
def question_answer(url, file, question, openAI_key):
    """Gradio handler: answer *question* from a PDF given by URL or upload.

    Exactly one of *url* / *file* must be provided. On invalid input an
    '[ERROR]: ...' string is returned; otherwise the PDF is downloaded or
    renamed locally, indexed via load_recommender, and the answer from
    generate_answer is returned.
    """
    if openAI_key.strip() == '':
        # Fixed typo: "enter you" -> "enter your".
        return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
    if url.strip() == '' and file is None:
        # Fixed grammar: "is empty ... atleast" -> "are empty ... at least".
        return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
    if url.strip() != '' and file is not None:
        # Fixed grammar/typo: "is provided ... eiter" -> "are provided ... either".
        return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'

    if url.strip() != '':
        # Fetch the remote PDF to a local scratch file, then index it.
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
    else:
        # Gradio's upload adds a 12-character suffix before the extension;
        # strip it so citations show the original file name.
        old_file_name = file.name
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        load_recommender(file_name)

    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    return generate_answer(question, openAI_key)
 
 
 
 
162
 
 
 
 
 
 
163
 
164
  recommender = SemanticSearch()
 
165
 
166
+ title = 'PDF GPT'
167
+ description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
168
+
169
 
170
  with gr.Blocks() as demo:
171
 
 
176
 
177
  with gr.Group():
178
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
179
+ openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
180
  url = gr.Textbox(label='Enter PDF URL here')
181
  gr.Markdown("<center><h4>OR<h4></center>")
182
+ file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
183
  question = gr.Textbox(label='Enter your question here')
184
  btn = gr.Button(value='Submit')
185
  btn.style(full_width=True)
 
187
  with gr.Group():
188
  answer = gr.Textbox(label='The answer to your question is :')
189
 
190
+ btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
191
+ #openai.api_key = os.getenv('Your_Key_Here')
192
  demo.launch()