bhaskartripathi commited on
Commit
e96b7ee
·
1 Parent(s): 8468a99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -49
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import glob
2
  import urllib.request
3
  import fitz
4
  import re
@@ -12,17 +11,14 @@ from sklearn.neighbors import NearestNeighbors
12
  def download_pdf(url, output_path):
13
  urllib.request.urlretrieve(url, output_path)
14
 
15
-
16
  def preprocess(text):
17
  text = text.replace('\n', ' ')
18
  text = re.sub('\s+', ' ', text)
19
  return text
20
 
21
-
22
- def pdf_to_text(file_path, start_page=1, end_page=None):
23
- doc = fitz.open(file_path)
24
  total_pages = doc.page_count
25
- file_name = os.path.basename(file_path)
26
 
27
  if end_page is None:
28
  end_page = total_pages
@@ -32,26 +28,25 @@ def pdf_to_text(file_path, start_page=1, end_page=None):
32
  for i in range(start_page-1, end_page):
33
  text = doc.load_page(i).get_text("text")
34
  text = preprocess(text)
35
- text_list.append((file_name, text))
36
 
37
  doc.close()
38
  return text_list
39
-
40
 
41
  def text_to_chunks(texts, word_length=150, start_page=1):
42
- text_toks = [(file_name, t.split(' ')) for file_name, t in texts]
43
  page_nums = []
44
  chunks = []
45
 
46
- for idx, (file_name, words) in enumerate(text_toks):
47
  for i in range(0, len(words), word_length):
48
  chunk = words[i:i+word_length]
49
  if (i+word_length) > len(words) and (len(chunk) < word_length) and (
50
  len(text_toks) != (idx+1)):
51
- text_toks[idx+1] = (file_name, chunk + text_toks[idx+1][1])
52
  continue
53
  chunk = ' '.join(chunk).strip()
54
- chunk = f'[{file_name}, Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
55
  chunks.append(chunk)
56
  return chunks
57
 
@@ -61,7 +56,6 @@ class SemanticSearch:
61
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
62
  self.fitted = False
63
 
64
-
65
  def fit(self, data, batch=1000, n_neighbors=5):
66
  self.data = data
67
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -70,7 +64,6 @@ class SemanticSearch:
70
  self.nn.fit(self.embeddings)
71
  self.fitted = True
72
 
73
-
74
  def __call__(self, text, return_data=True):
75
  inp_emb = self.use([text])
76
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
@@ -80,7 +73,6 @@ class SemanticSearch:
80
  else:
81
  return neighbors
82
 
83
-
84
  def get_text_embedding(self, texts, batch=1000):
85
  embeddings = []
86
  for i in range(0, len(texts), batch):
@@ -90,19 +82,22 @@ class SemanticSearch:
90
  embeddings = np.vstack(embeddings)
91
  return embeddings
92
 
 
 
 
93
 
 
 
 
94
 
95
- def load_recommender(directory_path, start_page=1):
96
- global recommender
97
-
98
- texts = []
99
- for file_path in glob.glob(os.path.join(directory_path, '*.pdf')):
100
- texts.extend(pdf_to_text(file_path, start_page=start_page))
101
 
102
- chunks = text_to_chunks(texts, start_page=start_page)
103
  recommender.fit(chunks)
104
  return 'Corpus Loaded.'
105
-
106
  def generate_text(openAI_key, prompt, engine="text-davinci-003"):
107
  openai.api_key = openAI_key
108
  completions = openai.Completion.create(
@@ -124,47 +119,52 @@ def generate_answer(question, openAI_key):
124
  prompt += c + '\n\n'
125
 
126
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
127
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
128
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
129
  "with the same name, create separate answers for each. Only include information found in the results and "\
130
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
131
  "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
132
- "search results which has nothing to do with the question. Only answer what is asked. The "\
133
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
134
 
135
  prompt += f"Query: {question}\nAnswer:"
136
  answer = generate_text(openAI_key, prompt, "text-davinci-003")
137
  return answer
138
 
139
-
140
- def question_answer(files, question, openAI_key):
141
  if openAI_key.strip() == '':
142
- return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
143
-
144
- if len(files) == 0:
145
- return '[ERROR]: No PDF files uploaded.'
 
 
146
 
147
- directory_path = os.path.join(os.getcwd(), 'uploaded_files')
148
- if not os.path.exists(directory_path):
149
- os.makedirs(directory_path)
 
150
 
151
- for file in files:
152
- with open(os.path.join(directory_path, file.name), 'wb') as f:
153
- f.write(file.read())
 
 
 
 
 
154
 
155
- load_recommender(directory_path)
156
 
157
  if question.strip() == '':
158
  return '[ERROR]: Question field is empty'
159
 
160
  return generate_answer(question, openAI_key)
161
 
162
-
163
-
164
  recommender = SemanticSearch()
165
-
166
  title = 'PDF GPT'
167
- description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
 
168
 
169
  with gr.Blocks() as demo:
170
 
@@ -172,18 +172,21 @@ with gr.Blocks() as demo:
172
  gr.Markdown(description)
173
 
174
  with gr.Row():
175
-
176
  with gr.Group():
177
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
178
- openAI_key = gr.inputs.Textbox(label='Enter your OpenAI API key here')
179
- #files = gr.inputs.File(file_count="multiple", accept=".pdf", label="Upload PDF files")
180
- files = gr.File(label = "Upload .pdf documents.", file_count="multiple")
181
- question = gr.inputs.Textbox(label='Enter your question here')
 
182
  btn = gr.Button(value='Submit')
 
183
 
184
  with gr.Group():
185
- answer = gr.outputs.Textbox(label='The answer to your question is :')
186
 
187
- btn.click(question_answer, inputs=[files, question, openAI_key], outputs=[answer])
188
 
189
  demo.launch()
 
 
 
1
  import urllib.request
2
  import fitz
3
  import re
 
11
def download_pdf(url, output_path):
    """Fetch the PDF at *url* and write it to *output_path* on disk."""
    urllib.request.urlretrieve(url, output_path)
13
 
 
14
def preprocess(text):
    """Normalize extracted PDF text: newlines and whitespace runs become
    single spaces.

    Args:
        text: Raw page text as returned by PyMuPDF's ``get_text``.

    Returns:
        The cleaned text. Leading/trailing whitespace collapses to a single
        space (it is NOT stripped — downstream chunking relies on plain
        space-splitting, so this behavior is kept as-is).
    """
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # raises SyntaxWarning/DeprecationWarning on modern Python.
    text = re.sub(r'\s+', ' ', text)
    return text
18
 
19
+ def pdf_to_text(path, start_page=1, end_page=None):
20
+ doc = fitz.open(path)
 
21
  total_pages = doc.page_count
 
22
 
23
  if end_page is None:
24
  end_page = total_pages
 
28
  for i in range(start_page-1, end_page):
29
  text = doc.load_page(i).get_text("text")
30
  text = preprocess(text)
31
+ text_list.append(text)
32
 
33
  doc.close()
34
  return text_list
 
35
 
36
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into chunks of roughly *word_length* words, each
    prefixed with a page label.

    Args:
        texts: List of page texts (one string per page).
        word_length: Target number of words per chunk.
        start_page: 1-based number of the first page in *texts*.

    Returns:
        List of strings shaped like ``[Page no. N] "chunk text"``.

    NOTE(review): the label uses the text's index within *texts*, not a true
    PDF page number — when callers pass one page per call, idx is always 0
    and every label equals start_page.
    """
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            # A short trailing chunk is rolled into the next page's words so
            # chunks stay roughly uniform in size (skipped on the last page).
            if (i + word_length) > len(words) and (len(chunk) < word_length) and (
                    len(text_toks) != (idx + 1)):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx + start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
52
 
 
56
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
57
  self.fitted = False
58
 
 
59
  def fit(self, data, batch=1000, n_neighbors=5):
60
  self.data = data
61
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
64
  self.nn.fit(self.embeddings)
65
  self.fitted = True
66
 
 
67
  def __call__(self, text, return_data=True):
68
  inp_emb = self.use([text])
69
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
 
73
  else:
74
  return neighbors
75
 
 
76
  def get_text_embedding(self, texts, batch=1000):
77
  embeddings = []
78
  for i in range(0, len(texts), batch):
 
82
  embeddings = np.vstack(embeddings)
83
  return embeddings
84
 
85
def load_recommender(paths, start_page=1):
    """Build the search corpus from the PDFs at *paths* and fit the
    module-level SemanticSearch recommender.

    Args:
        paths: Iterable of PDF file paths.
        start_page: 1-based page number to start extraction from.

    Returns:
        Status string ``'Corpus Loaded.'``.
    """
    global recommender
    chunks = []
    for path in paths:
        file_label = os.path.basename(path)
        # pdf_to_text returns one string per page, so the real page number
        # of element i is start_page + i.  The previous code passed
        # start_page unchanged for every page, labelling EVERY chunk of a
        # PDF with the same page number.
        for offset, page_text in enumerate(pdf_to_text(path, start_page=start_page)):
            page_chunks = text_to_chunks([page_text], start_page=start_page + offset)
            chunks.extend(f'[PDF {file_label}, {chunk}]' for chunk in page_chunks)

    recommender.fit(chunks)
    return 'Corpus Loaded.'
 
101
  def generate_text(openAI_key, prompt, engine="text-davinci-003"):
102
  openai.api_key = openAI_key
103
  completions = openai.Completion.create(
 
119
  prompt += c + '\n\n'
120
 
121
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
122
+ "Cite each reference using [PDF FILE NAME, PAGE NUMBER:] notation (every result has this number at the beginning). "\
123
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
124
  "with the same name, create separate answers for each. Only include information found in the results and "\
125
  "don't add any additional information. Make sure the answer is correct and don't output false content. "\
126
  "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
127
+ "search results which have nothing to do with the question. Only answer what is asked. The "\
128
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
129
 
130
  prompt += f"Query: {question}\nAnswer:"
131
  answer = generate_text(openAI_key, prompt, "text-davinci-003")
132
  return answer
133
 
134
def question_answer(url, files, question, openAI_key):
    """Gradio handler: answer *question* from a PDF given by URL or upload.

    Exactly one of *url* / *files* must be provided.  Returns either an
    '[ERROR]: ...' string for invalid input or the generated answer.
    """
    if openAI_key.strip() == '':
        return '[ERROR]: Please enter your Open AI Key. Get your key here: https://platform.openai.com/account/api-keys'
    if url.strip() == '' and not files:
        return '[ERROR]: Both URL and PDF are empty. Provide at least one.'

    if url.strip() != '' and files:
        return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'

    # Validate the question BEFORE the expensive download/embedding work;
    # the original only checked it after loading the whole corpus.
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    if url.strip() != '':
        download_pdf(url, 'corpus.pdf')
        load_recommender(['corpus.pdf'])
    else:
        file_paths = []
        for file in files:
            old_file_name = file.name
            # Strip the 12-char random suffix Gradio inserts before the
            # extension so citations show the original file name.
            # NOTE(review): fixed-offset slicing is fragile — confirm it
            # matches the Gradio temp-file naming scheme in use.
            file_name = old_file_name[:-12] + old_file_name[-4:]
            os.rename(old_file_name, file_name)
            file_paths.append(file_name)

        load_recommender(file_paths)

    return generate_answer(question, openAI_key)
163
 
 
 
164
  recommender = SemanticSearch()
 
165
  title = 'PDF GPT'
166
+
167
+ description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number and file name in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
168
 
169
  with gr.Blocks() as demo:
170
 
 
172
  gr.Markdown(description)
173
 
174
  with gr.Row():
175
+
176
  with gr.Group():
177
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
178
+ openAI_key = gr.Textbox(label='Enter your OpenAI API key here')
179
+ url = gr.Textbox(label='Enter PDF URL here')
180
+ gr.Markdown("<center><h4>OR<h4></center>")
181
+ files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count='multiple')
182
+ question = gr.Textbox(label='Enter your question here')
183
  btn = gr.Button(value='Submit')
184
+ btn.style(full_width=True)
185
 
186
  with gr.Group():
187
+ answer = gr.Textbox(label='The answer to your question is :')
188
 
189
+ btn.click(question_answer, inputs=[url, files, question, openAI_key], outputs=[answer])
190
 
191
  demo.launch()
192
+