alan commited on
Commit
e0214ec
·
1 Parent(s): 3f73cea
Files changed (3) hide show
  1. app.py +618 -0
  2. requirements.txt +15 -0
  3. utils/openai_utils.py +161 -0
app.py ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ from io import BytesIO
5
+ import datetime
6
+ import time
7
+ import openai, tenacity
8
+ import argparse
9
+ import configparser
10
+ import json
11
+ import fitz
12
+ import PyPDF2
13
+ import gradio
14
+ import sys
15
+ from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk, OCRResponse
16
+ from pathlib import Path
17
+ utils_dir = Path(__file__).parent / 'utils'
18
+ sys.path.append(str(utils_dir))
19
+ from openai_utils import *
20
+ import base64
21
+ from pdf2image import convert_from_bytes
22
+ import requests
23
+ import bibtexparser
24
+ from pybtex.database import parse_string
25
+ from pybtex.plugin import find_plugin
26
+
27
# Credentials are injected via environment variables (never hard-coded).
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')    # key for the internal OpenAI-compatible endpoint (query gen / rerank / extraction)
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')  # base URL of that internal endpoint
MISTRAL_API = os.getenv('MISTRAL_API')            # Mistral API key, used only for OCR
30
+
31
def insert_sentence(text, sentence, interval):
    """Insert *sentence* after every *interval* words on each line of *text*.

    Line structure is preserved; word counting restarts on every line.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            # Every `interval`-th word is followed by the inserted sentence.
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
52
+
53
+
54
def format_bibtex(paper, style='apa'):
    """Render a Semantic Scholar paper's BibTeX entry as a human-readable citation.

    Args:
        paper: Semantic Scholar paper dict with a ``citationStyles.bibtex`` field.
        style: pybtex formatting style plugin name (e.g. 'apa', 'unsrt').

    Returns:
        The formatted citation string, or the raw (stripped) BibTeX record if
        pybtex fails to parse or format the entry.
    """
    bibtex_entry = paper["citationStyles"]["bibtex"]
    try:
        bib_data = parse_string(bibtex_entry, 'bibtex')
        formatter = find_plugin('pybtex.style.formatting', style)()
        entries = list(bib_data.entries.values())
        formatted = formatter.format_entries(entries)
        return '\n'.join(e.text.render_as('text') for e in formatted)
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Fallback: return the raw BibTeX record.
        return bibtex_entry.strip()
65
+
66
def search_paper(query, max_retries=10):
    """Search Semantic Scholar for up to 3 Computer Science papers matching *query*.

    Args:
        query: free-text search string.
        max_retries: retry budget for non-200 responses (new, defaults keep the
            old call signature working). Bug fix: the original looped forever on
            any non-200 status, so a permanent 4xx error hung the whole request.

    Returns:
        Parsed JSON response from the Graph API.

    Raises:
        requests.HTTPError: if the API still fails after all retries.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple fixed backoff (API rate limits are common)
        response = requests.get(url)
    response.raise_for_status()
    return response.json()
77
+
78
def get_combined_markdown(pdf_response: OCRResponse) -> str:
    """Concatenate the markdown of every OCR page, separated by blank lines."""
    return "\n\n".join(page.markdown for page in pdf_response.pages)
84
+
85
def split_text_into_chunks(pdf_response: OCRResponse) -> list:
    """Return the OCR result as a list of per-page markdown chunks.

    Bug fix: the return annotation previously said ``-> str`` although the
    function returns a list of strings (one markdown string per page); dead
    commented-out word-chunking code was removed.
    """
    return [page.markdown for page in pdf_response.pages]
93
+
94
def download_pdf(paper):
    """Download a paper's open-access PDF and OCR it into markdown chunks.

    Best-effort: returns ``[]`` on any failure (missing/invalid URL, HTTP
    error, OCR error) so callers can skip the paper.

    Bug fixes: the ``paper["openAccessPdf"]["url"]`` lookup used to sit outside
    the try-block, so a missing key crashed instead of returning []; the bare
    ``except:`` also swallowed SystemExit/KeyboardInterrupt.
    """
    try:
        pdf_url = paper["openAccessPdf"]["url"]
        response = requests.get(pdf_url)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        return extract_chapter(file_object)
    except Exception:
        return []
106
+
107
+
108
def recommendation(s2_id, limit=500, max_retries=10):
    """Fetch up to *limit* recommended papers for a Semantic Scholar paper id.

    Args:
        s2_id: Semantic Scholar paper id.
        limit: maximum number of recommendations to request.
        max_retries: retry budget for non-200 responses (new parameter with a
            default, so existing callers are unaffected). Bug fix: the original
            retried forever, so a permanent 4xx error hung the request.

    Returns:
        Parsed JSON response (contains a ``recommendedPapers`` list).

    Raises:
        requests.HTTPError: if the API still fails after all retries.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf,citationStyles"

    response = requests.get(url)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple fixed backoff before retrying
        response = requests.get(url)
    response.raise_for_status()
    return response.json()
120
+
121
+
122
def extract_chapter(file_object):
    # OCR a PDF file-like object with Mistral OCR; returns per-page markdown chunks.
    client = Mistral(api_key=MISTRAL_API)
    uploaded_file = client.files.upload(
        file={
            "file_name": "retrieve.pdf",
            "content": file_object.read(),
        },
        purpose="ocr",
    )

    # Short-lived signed URL for the uploaded file so the OCR service can fetch it
    # (expiry unit is defined by the Mistral API -- TODO confirm it is hours).
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
    # response_dict = json.loads(pdf_response.json())
    chunks = split_text_into_chunks(pdf_response)
    return chunks
137
+
138
+
139
+
140
class Reviewer:
    """Reviews a paper PDF with an LLM, optionally augmented with related
    literature retrieved from Semantic Scholar (RAG).

    Pipeline: extract text/title/abstract from the PDF -> (optionally) retrieve
    and filter related papers -> generate limitations -> refine each limitation
    into an actionable suggestion.
    """

    def __init__(self, api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
        # User-supplied OpenAI-compatible credentials used for the review calls.
        self.api = api
        self.api_base = api_base
        # Review aspect: one of "Methodology", "Experimental Design",
        # "Result Analysis", "Literature Review".
        self.aspect = aspect
        # Raw PDF bytes from the Gradio binary file widget.
        self.paper_pdf = paper_pdf
        self.model_name = model_name
        # Number of limitations to generate (validated as positive by main()).
        self.limit_num = int(limit_num)
        self.enable_rag = enable_rag
        # self.max_token_num = 50000
        # self.encoding = tiktoken.get_encoding("gpt2")


    def review_by_chatgpt(self, paper_list):
        """Run the full review and return (review_text, retrieved_papers_text).

        *paper_list* is unused here; the PDF comes from self.paper_pdf.
        """
        text, title, abstract = self.extract_from_paper(self.paper_pdf)
        content = f"Paper to review: \nTitle: {title}\n" + text

        if self.enable_rag:
            papers = self.retrieve_papers(title, abstract)
            if papers != None:
                # Build both the prompt context and the user-facing citation list.
                retrieval_content = ""
                retrieved_papers = ""
                cnt = 1
                for paper in papers:
                    retrieval_content += f"Relevant Paper {str(cnt)}:\n"
                    retrieval_content += f"Author and Title: {format_bibtex(paper, 'unsrt')}\n{paper['content']}\n\n"
                    formatted_citation = format_bibtex(paper, 'unsrt')
                    retrieved_papers += f"{str(cnt)}. {formatted_citation}\n({paper['url']})\n\n"
                    cnt += 1
                text = retrieval_content + content
                chat_review_limitations = self.chat_review(text=text)
                chat_review_text = self.chat_refine(text=text, limitations=chat_review_limitations)
            else:
                # Retrieval found nothing usable: fall back to review-only mode.
                text = content
                chat_review_limitations = self.chat_review(text=text)
                retrieved_papers = ""
                chat_review_text = self.chat_refine(text=text, limitations=chat_review_limitations)
        else:
            text = content
            chat_review_limitations = self.chat_review(text=text)
            retrieved_papers = ""
            chat_review_text = self.chat_refine(text=text, limitations=chat_review_limitations)

        # text = f"Paper:\n{paper['content']}\n\n"
        # chat_review_text = self.chat_refine(text=text, limitations=chat_review_limitations)
        return chat_review_text, retrieved_papers

    def query_gen(self, abstract):
        """Generate a 5-word TLDR of *abstract* to use as a fallback search query."""
        # These calls always go through the private endpoint, not the user's key.
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        messages=[
            {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."} ,
            {"role": "user", "content": abstract},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4.1-mini", # gpt-3.5-turbo
                max_tokens=1000, # 32
                requests_per_minute = 20,
                # response_format={"type":"json_object"},
            )
        )
        return responses[0]


    def rerank(self, paper_list, title, abstract):
        """Ask an LLM to rank candidate papers by relevance to (title, abstract)
        and return the top 5 paper dicts."""
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        rec_content = ""
        rec_paper_cnt = 1

        for rec_paper in paper_list:
            rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
            rec_paper_cnt += 1

        rec_content += f"Reference Paper: {title}\n"
        rec_content += f"Abstract: {abstract}\n"

        messages=[
            {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."} ,
            {"role": "user", "content": rec_content},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4.1-mini", # gpt-3.5-turbo
                max_tokens=1000, # 32
                requests_per_minute = 20,
                response_format={"type":"json_object"},
            )
        )
        response_data = json.loads(responses[0])
        rec_papers = []
        # Ranking entries are 1-based paper indices into paper_list.
        for rec_num in response_data["ranking"][:5]:
            num = int(rec_num)
            rec_papers.append(paper_list[num-1])

        return rec_papers

    def extract_related_content(self, papers, aspect):
        """Download each paper, keep only the chunks an LLM judges relevant to
        *aspect*, then condense them into one cleaned text per paper.

        Returns a list of dicts: {title, content, citationStyles, url}.
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY

        messages = []
        chunk_index_map = []
        paper_data_list = []
        paper_chunk_list = []
        for paper_idx, paper in enumerate(papers):
            paper_chunks = download_pdf(paper)
            paper_chunk_list.append(paper_chunks)

            SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

            for chunk_idx, paper_chunk in enumerate(paper_chunks):
                message = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": paper_chunk},
                ]
                messages.append(message)
                chunk_index_map.append((paper_idx, chunk_idx))  # record which paper each chunk belongs to

        client = AsyncOpenAI()
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4.1-mini",
                max_tokens=1000,
                requests_per_minute=100,
            )
        )

        paper_data_list = [{"title": paper["title"], "content": "", "citationStyles": paper["citationStyles"], "url": paper["url"]} for paper in papers]

        # Keep only the chunks classified as relevant ("yes").
        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

        # Papers with no relevant chunks fall back to their abstract.
        for idx, paper_data in enumerate(paper_data_list):
            if not paper_data["content"].strip():
                paper_data["content"] = papers[idx]["abstract"]


        # Aspect-specific condensation prompt; all return {"revised_text": str}.
        if aspect == "Methodology":
            SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
            Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
            Organize the result in JSON format as follows:
            {
                "revised_text": str, not dict, not a summary
            }
            """
        elif aspect == "Result Analysis":
            SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
            Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
            Organize the result in JSON format as follows:
            {
                "revised_text": str, not dict, not a summary
            }
            """
        elif aspect == "Experimental Design":
            SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
            Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
            Organize the result in JSON format as follows:
            {
                "revised_text": str, not dict, not a summary
            }
            """
        elif aspect == "Literature Review":
            SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
            Remove sentences that are irrelevant to the literature review, and keep details about the related works.
            Organize the result in JSON format as follows:
            {
                "revised_text": str, not dict, not a summary
            }
            """
        messages = []
        for paper_data in paper_data_list:
            message=[
                {"role": "system", "content": SYSTEM_INPUT} ,
                {"role": "user", "content": paper_data["content"]},
            ]
            messages.append(message)

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini", # gpt-3.5-turbo
                max_tokens=5000, # 32
                requests_per_minute = 20,
                response_format={"type":"json_object"},
            )
        )

        results = []
        for paper_data, response in zip(paper_data_list, responses):
            # print(response)
            response = json.loads(response)
            results.append({"title": paper_data["title"], "content": response["revised_text"], "citationStyles": paper_data["citationStyles"], "url": paper_data["url"]})
        return results



    def chat_review(self, text):
        """Generate self.limit_num limitations about self.aspect for *text*.

        Returns the parsed list of limitation strings. If the model's output is
        not valid JSON, a second LLM call (via the private endpoint) extracts
        the limitation list from the raw output.
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        # Aspect-specific guidance appended to the system prompt.
        if self.aspect == "Methodology":
            hint = "focusing on the fundamental approaches and techniques employed in the research. These include issues such as inappropriate choice of methods, unstated assumptions that may not hold, and problems with data quality or preprocessing that could introduce bias."
        elif self.aspect == "Experimental Design":
            hint = "focusing on weaknesses in how the research validates its claims. These include issues such as insufficient baseline comparisons, limited datasets that may not represent the full problem space, and lack of ablation studies to isolate the contribution of different components."
        elif self.aspect == "Result Analysis":
            hint = "focusing on how findings are evaluated and interpreted. This includes using inadequate evaluation metrics that may not capture important aspects of performance, insufficient error analysis, and lack of statistical significance testing."
        elif self.aspect == "Literature Review":
            hint = "focusing on how the research connects to and builds upon existing work. This includes missing citations of relevant prior work, mischaracterization of existing methods, and failure to properly contextualize contributions within the broader research landscape."
        if self.enable_rag:
            messages=[
                {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate {str(self.limit_num)} major limitations related to its {self.aspect} in this paper, {hint} Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
                {"role": "user", "content": text},
            ]
        else:
            messages=[
                {"role": "system", "content": f"Read the following scientific paper and generate {str(self.limit_num)} major limitations in this paper about its {self.aspect}, {hint} Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
                {"role": "user", "content": text},
            ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name=self.model_name, # gpt-3.5-turbo
                max_tokens=1000, # 32
                requests_per_minute = 20,
                # response_format={"type":"json_object"},
            )
        )
        try:
            limitations = json.loads(responses[0])["limitations"][:self.limit_num]
            result = ""
            limit_cnt = 1
            for limitation in limitations:
                result += f"{str(limit_cnt)}. {limitation}\n"
                limit_cnt += 1
        except:
            # Model output was not valid JSON: ask a second model to extract the
            # limitation list verbatim from the raw output.
            SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
            messages=[
                {"role": "system", "content": SYSTEM_INPUT},
                {"role": "user", "content": responses[0]},
            ]
            os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
            os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
            client = AsyncOpenAI()
            responses = asyncio.run(
                generate_from_openai_chat_completion(
                    client,
                    messages=[messages],
                    engine_name="gpt-4.1-mini", # gpt-3.5-turbo
                    max_tokens=1000, # 32
                    requests_per_minute = 20,
                    response_format={"type":"json_object"},
                )
            )
            limitations = json.loads(responses[0])["limitations"][:self.limit_num]


        return limitations

    def chat_refine(self, text, limitations):
        """For each limitation, ask for a concrete actionable suggestion and
        build a numbered 'limitation + suggestion' report string."""
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        messages = []
        if self.enable_rag:
            SYSTEM_INPUT = "Read the following scientific paper, its limitation, and several relevant papers to gain knowledge of the relevant field. Using insights from the relevant papers, provide a highly specific and actionable suggestion to address the limitation in the paper to review. You need to cite the related paper when giving advice. If suggesting an additional dataset, specify the exact dataset(s) by name. If proposing a methodological change, describe the specific modification. Keep the response within 50 words."
        else:
            SYSTEM_INPUT = "Read the following scientific paper and its limitation, and provide a highly specific and actionable suggestion to address the limitation. If suggesting an additional dataset, specify the exact dataset(s) by name. If proposing a methodological change, describe the specific modification. Keep the response within 50 words."

        # One request per limitation, all sharing the same paper text.
        for limitation in limitations:
            message=[
                {"role": "system", "content": SYSTEM_INPUT},
                {"role": "user", "content": f"{text}\nLimitation: {limitation}"},
            ]
            messages.append(message)

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name=self.model_name, # gpt-3.5-turbo
                max_tokens=1000, # 32
                requests_per_minute = 20,
                # response_format={"type":"json_object"},
            )
        )

        result = ""
        limit_cnt = 1
        for limitation, response in zip(limitations, responses):
            result += f"{str(limit_cnt)}. {limitation} {response}\n\n"
            limit_cnt += 1
        print("********"*10)
        print(result)
        print("********"*10)
        return result


    def retrieve_papers(self, title, abstract):
        """Retrieve related open-access papers for (title, abstract).

        Strategy: search by exact title first; on a miss, fall back to an
        LLM-generated TLDR query. Candidates are filtered to open-access papers
        with abstracts, reranked, then reduced to aspect-relevant content.
        Returns None if nothing usable is found.
        """
        query = title
        search_results = search_paper(query)
        if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
            # Exact title match: use the recommendation API for that paper.
            search_result = search_results["data"][0]
            retrieval = recommendation(search_result["paperId"])
            recommended_paper_list = []
            for recommended_paper in retrieval["recommendedPapers"]:
                if recommended_paper["abstract"] is None:
                    continue
                if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
                    recommended_paper_list.append(recommended_paper)

                if len(recommended_paper_list) >= 20:
                    break

        else:
            # No exact match: search with a generated TLDR query and pool
            # recommendations for each hit (up to 5 per hit).
            query = self.query_gen(abstract)
            search_results = search_paper(query)
            recommended_paper_list = []
            if search_results["data"] == []:
                return None
            for search_result in search_results["data"]:
                retrieval = recommendation(search_result["paperId"])
                recommended_papers = []
                for recommended_paper in retrieval["recommendedPapers"]:
                    if recommended_paper["abstract"] is None:
                        continue
                    if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
                        recommended_papers.append(recommended_paper)

                    if len(recommended_papers) >= 5:
                        break
                recommended_paper_list.extend(recommended_papers)

        if recommended_paper_list == []:
            return None
        final_papers = self.rerank(recommended_paper_list, title, abstract)
        retrieved_papers = self.extract_related_content(final_papers, self.aspect)

        return retrieved_papers




    def extract_from_paper(self, pdf_path):
        """Extract (full markdown text, title, abstract) from the uploaded PDF.

        NOTE(review): despite the name, *pdf_path* holds raw PDF bytes from the
        Gradio binary widget (see the TODOs below). Title/abstract come from a
        vision model reading a rendered image of page 1; body text from Mistral OCR.
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        # with open(pdf_path, 'rb') as f: # TODO
        #     pdf_bytes = f.read()
        # file_object = BytesIO(pdf_bytes)

        file_object = BytesIO(pdf_path) # TODO
        pdf_reader = PyPDF2.PdfReader(file_object)  # NOTE(review): unused; presumably validates the PDF -- confirm

        # Render page 1 to PNG for the vision model.
        doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO path/bytes
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages=[
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."} ,
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4.1-mini", # gpt-3.5-turbo
                max_tokens=1000, # 32
                requests_per_minute = 20,
                response_format={"type":"json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]

        # OCR the full document with Mistral for the body text.
        client = Mistral(api_key=MISTRAL_API)
        file_object.seek(0)  # PyPDF2 already consumed the stream above
        uploaded_file = client.files.upload(
            file={
                "file_name": "upload.pdf",
                "content": file_object.read(),
            },
            purpose="ocr",
        )

        signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
        pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
        # response_dict = json.loads(pdf_response.json())
        extracted_text = get_combined_markdown(pdf_response)

        return extracted_text, title, abstract
558
+
559
def main(api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
    """Gradio entry point: validate inputs, run the review, report timing.

    Args:
        api: user's API key.
        api_base: OpenAI-compatible base URL.
        paper_pdf: raw PDF bytes from the binary file widget.
        aspect: review aspect radio selection.
        model_name: chat model dropdown selection.
        limit_num: number of limitations to generate, as a string.
        enable_rag: whether to retrieve related literature first.

    Returns:
        (retrieved_content, comments, output2) matching the three output boxes.
    """
    start_time = time.time()
    comments = ''
    output2 = ''
    retrieved_content = ''
    if not api or not paper_pdf or not api_base:
        # Bug fix: this guard used to be an independent `if`, so with missing
        # credentials but a valid limit_num the review still ran and
        # overwrote these messages. Chained with elif so validation short-circuits.
        comments = "It looks like there's a missing API key/base URL or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        output2 = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
    elif not limit_num.isdigit() or int(limit_num) <= 0:
        comments = "The input number is not a positive integer."
        output2 = "The input number is not a positive integer."
    else:
        try:
            reviewer1 = Reviewer(api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag)
            comments, retrieved_content = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
            time_used = time.time() - start_time
            output2 = "Processing Time:" + str(round(time_used, 2)) + "seconds"
        except Exception as e:
            comments = "Error: " + str(e)
            output2 = "Error: " + str(e)
    return retrieved_content, comments, output2
581
+
582
+
583
########################################################################################################
# Gradio UI definition: widgets map positionally onto main()'s parameters.

title = "Critique Generation with Actionable Feedback"


# HTML blurb shown above the form.
description = '''<div align='left'>
<strong>We present a demo for a SciMentor feature. Upload the PDF of the text you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

# Input widgets, in the same order as main(api, api_base, paper_pdf, aspect,
# model_name, limit_num, enable_rag).
inp = [gradio.Textbox(label="Enter your API-key",
                      value="",
                      type='password'),

       gradio.Textbox(label="Enter the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                      value="https://api.openai.com/v1"),

       # type="binary" hands main() the raw PDF bytes, not a path.
       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                    value="Methodology",
                    label="Select the aspect"),
       gradio.Dropdown(["gpt-4.1-mini","gpt-4.1"],
                       label="Select the model name",
                       value="gpt-4.1"),
       gradio.Textbox(label="Enter the number of limitations to generate.",
                      value="3"),
       gradio.Checkbox(label="Enable RAG", value=False),
      ]

# Outputs map onto main()'s (retrieved_content, comments, output2) return tuple.
chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui.launch(quiet=True, show_api=False)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.21.1
2
+ tenacity==8.2.2
3
+ pybase64==1.2.3
4
+ Pillow==9.4.0
5
+ openai==1.33.0
6
+ markdown
7
+ PyPDF2
8
+ aiolimiter
9
+ pdf2image
10
+ mistralai
11
+ bibtexparser
12
+ pybtex
13
+ httpx==0.27.2
14
+ pydantic<2.11
15
+ gradio==3.20.1
utils/openai_utils.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ from typing import Any
5
+ from aiohttp import ClientSession
6
+ from tqdm.asyncio import tqdm_asyncio
7
+ import random
8
+ from time import sleep
9
+ import sys
10
+ import aiolimiter
11
+
12
+ import openai
13
+ from openai import AsyncOpenAI, OpenAIError
14
+
15
+
16
def prepare_message(SYSTEM_INPUT, USER_INPUT):
    """Build a two-message chat payload: system prompt followed by user content."""
    return [
        {"role": "system", "content": SYSTEM_INPUT},
        {"role": "user", "content": USER_INPUT},
    ]
28
+
29
def prepare_remove_message(USER_INPUT):
    """Build a chat payload whose fixed system prompt asks the model to strip
    experimental-design/result sentences from the given text."""
    system_prompt = "Remove sentences about experimental design and results: "
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": USER_INPUT},
    ]
41
+
42
def prepare_generation_input(title, abstract, sections, filepath):
    """Read the system prompt from *filepath* and format the paper fields as
    the user prompt. Returns (system_prompt, user_prompt)."""
    with open(filepath, 'r', encoding='utf-8') as prompt_file:
        system_prompt = prompt_file.read()
    user_prompt = f"Paper title: {title}\n\nPaper abstract: {abstract}\n\nPaper Sections: {sections}"
    return system_prompt, user_prompt
46
+
47
def prepare_remove_input(title, abstract, introduction, filepath):
    """Read the removal system prompt from *filepath* and format the paper
    fields as the user prompt.

    Returns (system_prompt, user_prompt). Fix: removed a leftover debug
    ``print(SYSTEM_INPUT)`` that dumped the whole prompt to stdout on every call.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        SYSTEM_INPUT = file.read()
    return SYSTEM_INPUT, f"Paper title: {title}\n\nPaper abstract: {abstract}\n\nIntroduction: {introduction}\n\n"
52
+
53
+
54
async def _throttled_openai_chat_completion_acreate(
    client: AsyncOpenAI,
    model: str,
    messages,
    temperature: float,
    max_tokens: int,
    top_p: float,
    limiter: aiolimiter.AsyncLimiter,
    response_format: dict = None,
):
    """Issue one chat-completion call under *limiter*, retrying transient errors.

    Returns the API response object, or None on a permanent bad request or
    after exhausting 10 attempts.

    Fixes vs. original:
    - default ``response_format={}`` was both a mutable default argument and a
      crash (``response_format["type"]`` raised KeyError); now defaults to text.
    - ``sleep()`` (time.sleep) blocked the event loop inside a coroutine,
      stalling every concurrent request; replaced with ``asyncio.sleep``.
    - ``return None`` inside the OpenAIError handler made the 10-iteration
      retry loop give up on the first transient error; now it retries.
    """
    if not response_format:
        response_format = {"type": "text"}
    async with limiter:
        for _ in range(10):
            try:
                if response_format.get("type") == "text":
                    return await client.chat.completions.create(
                        model=model,
                        messages=messages,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        top_p=top_p,
                    )
                else:
                    return await client.chat.completions.create(
                        model=model,
                        messages=messages,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        top_p=top_p,
                        response_format=response_format,
                    )
            except openai.BadRequestError as e:
                # Malformed request will never succeed on retry.
                print(e)
                return None
            except OpenAIError as e:
                # Transient (rate limit / server) error: back off and retry.
                print(e)
                await asyncio.sleep(random.randint(5, 10))
        return None
91
+
92
+
93
async def generate_from_openai_chat_completion(
    client,
    messages,
    engine_name: str,
    temperature: float = 1.0,
    max_tokens: int = 512,
    top_p: float = 1.0,
    requests_per_minute: int = 100,
    response_format: dict = {"type":"text"},
):
    """Generate from OpenAI Chat Completion API.
    Args:
        messages: List of messages to proceed.
        engine_name: Engine name to use, see https://platform.openai.com/docs/models
        temperature: Temperature to use.
        max_tokens: Maximum number of tokens to generate.
        top_p: Top p to use.
        requests_per_minute: Number of requests per minute to allow.
    Returns:
        List of generated responses; failed calls yield "Invalid Message".
    """
    # One shared limiter throttles all the concurrent calls below.
    limiter = aiolimiter.AsyncLimiter(requests_per_minute)

    pending = []
    for message in messages:
        pending.append(
            _throttled_openai_chat_completion_acreate(
                client,
                model=engine_name,
                messages=message,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                limiter=limiter,
                response_format=response_format,
            )
        )

    # Gather with a progress bar on stdout.
    completions = await tqdm_asyncio.gather(*pending, file=sys.stdout)

    return [
        completion.choices[0].message.content if completion else "Invalid Message"
        for completion in completions
    ]
139
+
140
+
141
# Example usage
if __name__ == "__main__":
    os.environ["OPENAI_API_KEY"] = "xxx" # Set your OpenAI API key here

    # The client reads OPENAI_API_KEY from the environment at construction.
    client = AsyncOpenAI()
    # NOTE(review): this sets a *class* attribute on AsyncOpenAI, not the
    # client instance's key -- presumably a no-op; confirm before relying on it.
    AsyncOpenAI.api_key = os.getenv('OPENAI_API_KEY')

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the purpose of life? Output result in json format."},
    ]
    # Fan out 50 identical requests through the rate-limited helper.
    responses = asyncio.run(
        generate_from_openai_chat_completion(
            client,
            messages=[messages]*50,
            engine_name="gpt-3.5-turbo-0125",
            max_tokens=256,
            response_format={"type":"json_object"},
        )
    )
    print(responses)