C2MV committed
Commit b86b9af · verified · 1 Parent(s): 5d5b6d2

Update app.py

Files changed (1)
  1. app.py +205 -206
app.py CHANGED
@@ -99,171 +99,170 @@ class PaperDownloader:
 
         logger.debug(f"Too many redirects or retries {url}, not following this link further")
         return None
-
     async def download_paper_direct_doi_async(self, session, doi):
-        """Attempt to download the pdf from the landing page of the doi"""
-        if not doi:
-            return None
-
-        try:
-            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-
-            # First, let's try to download the URL directly in case it is already the pdf.
-            pdf_content = await self.fetch_pdf_content(session, doi_url)
-            if pdf_content:
-                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
-                return pdf_content
-
-            # If direct DOI link was not a pdf, fetch landing page and extract links
-            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-            if not text:
-                return None
-
-            pdf_patterns = [
-                r'(https?://[^\s<>"]+?\.pdf)',
-                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-            ]
-
-            pdf_urls = []
-            for pattern in pdf_patterns:
-                pdf_urls.extend(re.findall(pattern, text))
-
-            # Attempt each pdf url and break when you find a PDF content.
-            for pdf_url in pdf_urls:
-                pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-
-        except Exception as e:
-            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-        return None
-
-    async def download_paper_scihub_async(self, session, doi):
-        """Improved method to download paper from Sci-Hub using async requests"""
         if not doi:
-            logger.warning("DOI not provided")
             return None
-
-        for base_url in self.download_sources:
-            try:
-                scihub_url = f"{base_url}{self.clean_doi(doi)}"
-                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
-                if not text:
-                    continue
-
-                # Search for multiple PDF URL patterns
-                pdf_patterns = [
-                    r'(https?://[^\s<>"]+?\.pdf)',
-                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
                 ]
-
-                pdf_urls = []
-                for pattern in pdf_patterns:
-                    pdf_urls.extend(re.findall(pattern, text))
-
-                # Try downloading from found URLs, but iterate over ALL
-                for pdf_url in pdf_urls:
-                    pdf_content = await self.fetch_pdf_content(session,pdf_url)
-                    if pdf_content:
-                        logger.debug(f"Found PDF from: {pdf_url}")
-                        return pdf_content
-
-            except Exception as e:
-                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-
-        return None
-
-    async def download_paper_libgen_async(self, session, doi):
-        """Download from Libgen, handles the query and the redirection"""
         if not doi:
             return None
 
-        base_url = 'https://libgen.rs/scimag/'
-        try:
-            search_url = f"{base_url}?q={self.clean_doi(doi)}"
-            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
 
-            if not text or "No results" in text:
-                logger.debug(f"No results for DOI: {doi} on libgen")
-                return None
 
-            soup = BeautifulSoup(text, 'html.parser')
 
-            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
 
-            if links:
-                link = links[0]
-                pdf_url = link['href']
-                pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-        except Exception as e:
-            logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
 
-    async def download_paper_google_scholar_async(self, session, doi):
-        """Search google scholar to find an article with the given doi, try to get the pdf"""
         if not doi:
             return None
 
         try:
-            query = f'doi:"{doi}"'
-            params = {'q': query}
-            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
 
-            text, headers = await self.fetch_with_headers(session, url, timeout=10)
-            if not text:
                 return None
 
             soup = BeautifulSoup(text, 'html.parser')
 
-            # Find any links with [PDF]
-            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
 
             if links:
-                pdf_url = links[0]['href']
-                pdf_content = await self.fetch_pdf_content(session,pdf_url)
                 if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-
         except Exception as e:
-            logger.debug(f"Google Scholar error for {doi}: {e}")
-
         return None
 
-    async def download_paper_crossref_async(self, session, doi):
-        """Alternative search method using Crossref"""
-        if not doi:
             return None
 
-        try:
-            # Search for open access link
-            url = f"https://api.crossref.org/works/{doi}"
-            response = await session.get(url, headers=self.headers, timeout=10)
-
-            if response.status == 200:
-                data = await response.json()
-                work = data.get('message', {})
-
-                # Search for open access links
-                links = work.get('link', [])
-                for link in links:
-                    if link.get('content-type') == 'application/pdf':
-                        pdf_url = link.get('URL')
-                        if pdf_url:
-                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                            if pdf_content:
-                                logger.debug(f"Found PDF from: {pdf_url}")
-                                return pdf_content
-        except Exception as e:
-            logger.debug(f"Crossref error for {doi}: {e}")
         return None
-
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
@@ -284,14 +283,15 @@ class PaperDownloader:
                 return pdf_content
             except Exception as e:
                 logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
-
             if not pdf_content:
                 retries += 1
                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                 await asyncio.sleep(delay)
                 delay *= 2 # Exponential backoff
         return None
-
     async def download_single_doi_async(self, doi):
         """Downloads a single paper using a DOI"""
         if not doi:
@@ -302,7 +302,7 @@ class PaperDownloader:
 
             if pdf_content:
                 if doi is None:
-                    return None, "Error: DOI not provided", "Error: DOI not provided"
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                 filepath = os.path.join(self.output_dir, filename)
 
@@ -310,106 +310,102 @@ class PaperDownloader:
                 loop = asyncio.get_running_loop()
                 await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
 
-
                 logger.info(f"Successfully downloaded: {filename}")
                 return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
             else:
-                logger.warning(f"Could not download: {doi}")
-                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
 
         except Exception as e:
-            logger.error(f"Error processing {doi}: {e}")
-            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
     async def download_multiple_dois_async(self, dois_text):
-        """Downloads multiple papers from a list of DOIs"""
-        if not dois_text:
-            return None, "Error: No DOIs provided", "Error: No DOIs provided"
-
-        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
-        if not dois:
-            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
-
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
-
-
-        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
-            if filepath:
                 # Unique filename for zip
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-            else:
                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
-
-        if downloaded_files:
-            zip_filename = 'papers.zip'
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
-            logger.info(f"ZIP file created: {zip_filename}")
-
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
     def create_zip(self, zip_filename, downloaded_files):
         with zipfile.ZipFile(zip_filename, 'w') as zipf:
             for file_path in downloaded_files:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
 
     async def process_bibtex_async(self, bib_file):
-        """Process BibTeX file and download papers with multiple strategies"""
-        # Read BibTeX file content from the uploaded object
-        try:
-            with open(bib_file.name, 'r', encoding='utf-8') as f:
-                bib_content = f.read()
-        except Exception as e:
-            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
-
-        # Parse BibTeX data
-        try:
-            bib_database = bibtexparser.loads(bib_content)
-        except Exception as e:
-            logger.error(f"Error parsing BibTeX data: {e}")
-            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
 
-        # Extract DOIs
-        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-        logger.info(f"Found {len(dois)} DOIs to download")
 
-        # Result lists
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
 
-        # Use asyncio.gather to run all downloads concurrently and show propert progress
-        tasks = [self.download_single_doi_async(doi) for doi in dois]
-        results = await asyncio.gather(*tasks)
 
-        for i, (filepath, success_message, fail_message) in enumerate(results):
-            if filepath:
                 # Unique filename for zip
                 filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
-            else:
                 failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
 
-
-        if downloaded_files:
-            zip_filename = 'papers.zip'
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
-            logger.info(f"ZIP file created: {zip_filename}")
-
-        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
@@ -420,7 +416,7 @@ def create_gradio_interface():
             # Check file type
             if not bib_file.name.lower().endswith('.bib'):
                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
-
             zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file)
             return zip_path, downloaded_dois, failed_dois, None
         elif doi_input:
@@ -432,6 +428,7 @@ def create_gradio_interface():
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
@@ -487,6 +484,7 @@ def create_gradio_interface():
         """,
         cache_examples=False,
     )
     # Add Javascript to update HTML
     interface.load = """
     function(downloaded_dois, failed_dois){
@@ -506,6 +504,7 @@ def create_gradio_interface():
     """
     return interface
 
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)

 
         logger.debug(f"Too many redirects or retries {url}, not following this link further")
         return None
+
     async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the pdf from the landing page of the doi"""
         if not doi:
             return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            # First, let's try to download the URL directly in case it is already the pdf.
+            pdf_content = await self.fetch_pdf_content(session, doi_url)
+            if pdf_content:
+                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
+                return pdf_content
+
+            # If direct DOI link was not a pdf, fetch landing page and extract links
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
             ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            # Attempt each pdf url and break when you find a PDF content.
+            for pdf_url in pdf_urls:
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None
+
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download paper from Sci-Hub using async requests"""
         if not doi:
+            logger.warning("DOI not provided")
             return None
 
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
 
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
 
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
 
+                # Try downloading from found URLs, but iterate over ALL
+                for pdf_url in pdf_urls:
+                    pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                    if pdf_content:
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return pdf_content
 
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
         return None
 
+    async def download_paper_libgen_async(self, session, doi):
+        """Download from Libgen, handles the query and the redirection"""
         if not doi:
             return None
 
+        base_url = 'https://libgen.rs/scimag/'
         try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
 
+            if not text or "No results" in text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
                 return None
 
             soup = BeautifulSoup(text, 'html.parser')
 
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
 
             if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
                 if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
         except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
 
+    async def download_paper_google_scholar_async(self, session, doi):
+        """Search google scholar to find an article with the given doi, try to get the pdf"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            text, headers = await self.fetch_with_headers(session, url, timeout=10)
+            if not text:
                 return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None
 
+    async def download_paper_crossref_async(self, session, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
             return None
+
+        try:
+            # Search for open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                            if pdf_content:
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return pdf_content
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+        return None
+
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None

                 return pdf_content
             except Exception as e:
                 logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
             if not pdf_content:
                 retries += 1
                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                 await asyncio.sleep(delay)
                 delay *= 2 # Exponential backoff
+
         return None
+
     async def download_single_doi_async(self, doi):
         """Downloads a single paper using a DOI"""
         if not doi:

 
             if pdf_content:
                 if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                 filepath = os.path.join(self.output_dir, filename)
 
                 loop = asyncio.get_running_loop()
                 await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
 
                 logger.info(f"Successfully downloaded: {filename}")
                 return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
             else:
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
 
         except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
     async def download_multiple_dois_async(self, dois_text):
+        """Downloads multiple papers from a list of DOIs"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
+            if filepath:
                 # Unique filename for zip
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+            else:
                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
     def create_zip(self, zip_filename, downloaded_files):
         with zipfile.ZipFile(zip_filename, 'w') as zipf:
             for file_path in downloaded_files:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
 
     async def process_bibtex_async(self, bib_file):
+        """Process BibTeX file and download papers with multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
 
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
 
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
 
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        tasks = [self.download_single_doi_async(doi) for doi in dois]
+        results = await asyncio.gather(*tasks)
 
+
+        for i, (filepath, success_message, fail_message) in enumerate(results):
+            if filepath:
                 # Unique filename for zip
                 filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+            else:
                 failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
+            logger.info(f"ZIP file created: {zip_filename}")
 
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""

             # Check file type
             if not bib_file.name.lower().endswith('.bib'):
                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
             zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file)
             return zip_path, downloaded_dois, failed_dois, None
         elif doi_input:

         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
+
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,

         """,
         cache_examples=False,
     )
+
     # Add Javascript to update HTML
     interface.load = """
     function(downloaded_dois, failed_dois){

     """
     return interface
 
+
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)
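
For context, a minimal, hypothetical sketch of how the async entry points touched by this commit could be driven from a script. The module path `app`, the no-argument constructor, and the example DOIs are assumptions not shown in this diff; the return shapes follow the method bodies above.

import asyncio

from app import PaperDownloader  # assumed import path; the class is defined in app.py


async def demo():
    # Assumption: PaperDownloader() can be constructed without arguments.
    downloader = PaperDownloader()

    # download_single_doi_async returns (filepath, success_html, failure_html),
    # as seen in the diff; "10.1000/xyz123" is a placeholder DOI.
    filepath, ok_link, fail_link = await downloader.download_single_doi_async("10.1000/xyz123")
    print(filepath, ok_link, fail_link)

    # download_multiple_dois_async takes newline-separated DOIs and returns
    # (zip_path_or_None, downloaded_links, failed_links).
    zip_path, downloaded, failed = await downloader.download_multiple_dois_async(
        "10.1000/xyz123\n10.1000/abc456"
    )
    print(zip_path)


if __name__ == "__main__":
    asyncio.run(demo())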