C2MV committed on
Commit
b1121bf
·
verified ·
1 Parent(s): bc25c79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -63
app.py CHANGED
@@ -263,7 +263,7 @@ class PaperDownloader:
263
  except Exception as e:
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
-
267
  async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
268
  """Downloads a paper using multiple strategies with exponential backoff and async requests"""
269
  pdf_content = None
@@ -294,36 +294,37 @@ class PaperDownloader:
294
  return None
295
 
296
  async def download_single_doi_async(self, doi, progress_callback):
297
- """Downloads a single paper using a DOI"""
298
- if not doi:
299
- return None, "Error: DOI not provided", "Error: DOI not provided"
300
-
301
- try:
302
- pdf_content = await self.download_with_retry_async(doi)
303
-
304
- if pdf_content:
305
- if doi is None:
306
- return None, "Error: DOI not provided", "Error: DOI not provided"
307
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
308
- filepath = os.path.join(self.output_dir, filename)
309
- loop = asyncio.get_running_loop()
310
- await loop.run_in_executor(self.executor, lambda: open(filepath, 'wb').write(pdf_content))
311
 
312
- logger.info(f"Successfully downloaded: {filename}")
313
- progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
314
- return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
315
- else:
316
- logger.warning(f"Could not download: {doi}")
317
- progress_callback(f"Could not download: <a href='https://doi.org/{doi}'>{doi}</a>")
318
- return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
319
-
320
- except Exception as e:
 
 
 
321
  logger.error(f"Error processing {doi}: {e}")
322
  progress_callback(f"Error processing {doi}: <a href='https://doi.org/{doi}'>{doi}</a> {e}")
323
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
324
 
325
  async def download_multiple_dois_async(self, dois_text, progress_callback):
326
- """Downloads multiple papers from a list of DOIs and updates the UI using the progress_callback"""
327
  if not dois_text:
328
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
329
 
@@ -334,32 +335,32 @@ class PaperDownloader:
334
  downloaded_files = []
335
  failed_dois = []
336
  downloaded_links = []
337
-
338
  for i, doi in enumerate(dois):
339
  filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
340
  if filepath:
341
- # Unique filename for zip
342
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
343
- filepath_unique = os.path.join(self.output_dir, filename)
344
- os.rename(filepath, filepath_unique)
345
- downloaded_files.append(filepath_unique)
346
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
347
  else:
348
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
349
-
350
  if downloaded_files:
351
- zip_filename = 'papers.zip'
352
- loop = asyncio.get_running_loop()
353
- await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files) )
354
- logger.info(f"ZIP file created: {zip_filename}")
355
-
356
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
357
-
358
  def create_zip(self, zip_filename, downloaded_files):
359
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
360
  for file_path in downloaded_files:
361
  zipf.write(file_path, arcname=os.path.basename(file_path))
362
-
363
  async def process_bibtex_async(self, bib_file, progress_callback):
364
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
365
  # Read BibTeX file content from the uploaded object
@@ -385,25 +386,27 @@ class PaperDownloader:
385
  downloaded_files = []
386
  failed_dois = []
387
  downloaded_links = []
 
 
 
388
 
389
- for i, doi in enumerate(dois):
390
- filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
391
- if filepath:
392
- # Unique filename for zip
393
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
394
- filepath_unique = os.path.join(self.output_dir, filename)
395
- os.rename(filepath, filepath_unique)
396
- downloaded_files.append(filepath_unique)
397
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
398
- else:
399
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
400
-
401
  if downloaded_files:
402
- zip_filename = 'papers.zip'
403
- loop = asyncio.get_running_loop()
404
- await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
405
- logger.info(f"ZIP file created: {zip_filename}")
406
-
407
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
408
 
409
  def create_gradio_interface():
@@ -415,17 +418,19 @@ def create_gradio_interface():
415
  # Check file type
416
  if not bib_file.name.lower().endswith('.bib'):
417
  return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
418
-
 
419
  zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file, progress.update)
 
420
  return zip_path, downloaded_dois, failed_dois, None
421
  elif doi_input:
422
  filepath, message, failed_doi = await downloader.download_single_doi_async(doi_input,progress.update)
423
  return None, message, failed_doi, filepath
424
  elif dois_input:
425
- zip_path, downloaded_dois, failed_dois = await downloader.download_multiple_dois_async(dois_input, progress.update)
426
- return zip_path, downloaded_dois, failed_dois, None
427
  else:
428
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
429
 
430
 
431
  # Gradio Interface
@@ -435,6 +440,7 @@ def create_gradio_interface():
435
  gr.File(file_types=['.bib'], label="Upload BibTeX File"),
436
  gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
437
  gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
 
438
  ],
439
  outputs=[
440
  gr.File(label="Download Papers (ZIP) or Single PDF"),
 
263
  except Exception as e:
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
+
267
  async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
268
  """Downloads a paper using multiple strategies with exponential backoff and async requests"""
269
  pdf_content = None
 
294
  return None
295
 
296
  async def download_single_doi_async(self, doi, progress_callback):
297
+ """Downloads a single paper using a DOI, and updates the given progress_callback"""
298
+ if not doi:
299
+ return None, "Error: DOI not provided", "Error: DOI not provided"
300
+
301
+ try:
302
+ pdf_content = await self.download_with_retry_async(doi)
303
+
304
+ if pdf_content:
305
+ if doi is None:
306
+ return None, "Error: DOI not provided", "Error: DOI not provided"
307
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
308
+ filepath = os.path.join(self.output_dir, filename)
 
 
309
 
310
+ loop = asyncio.get_running_loop()
311
+ await loop.run_in_executor(self.executor, lambda: open(filepath, 'wb').write(pdf_content))
312
+
313
+ logger.info(f"Successfully downloaded: {filename}")
314
+ progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
315
+ return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
316
+ else:
317
+ logger.warning(f"Could not download: {doi}")
318
+ progress_callback(f"Could not download: <a href='https://doi.org/{doi}'>{doi}</a>")
319
+ return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
320
+
321
+ except Exception as e:
322
  logger.error(f"Error processing {doi}: {e}")
323
  progress_callback(f"Error processing {doi}: <a href='https://doi.org/{doi}'>{doi}</a> {e}")
324
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
325
 
326
  async def download_multiple_dois_async(self, dois_text, progress_callback):
327
+ """Downloads multiple papers from a list of DOIs and uses a callback for UI"""
328
  if not dois_text:
329
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
330
 
 
335
  downloaded_files = []
336
  failed_dois = []
337
  downloaded_links = []
338
+
339
  for i, doi in enumerate(dois):
340
  filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
341
  if filepath:
342
+ # Unique filename for zip
343
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
344
+ filepath_unique = os.path.join(self.output_dir, filename)
345
+ os.rename(filepath, filepath_unique)
346
+ downloaded_files.append(filepath_unique)
347
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
348
  else:
349
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
350
+
351
  if downloaded_files:
352
+ zip_filename = 'papers.zip'
353
+ loop = asyncio.get_running_loop()
354
+ await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
355
+ logger.info(f"ZIP file created: {zip_filename}")
356
+
357
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
358
+
359
  def create_zip(self, zip_filename, downloaded_files):
360
+ with zipfile.ZipFile(zip_filename, 'w') as zipf:
361
  for file_path in downloaded_files:
362
  zipf.write(file_path, arcname=os.path.basename(file_path))
363
+
364
  async def process_bibtex_async(self, bib_file, progress_callback):
365
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
366
  # Read BibTeX file content from the uploaded object
 
386
  downloaded_files = []
387
  failed_dois = []
388
  downloaded_links = []
389
+
390
+ tasks = [self.download_single_doi_async(doi, progress_callback) for doi in dois]
391
+ results = await asyncio.gather(*tasks)
392
 
393
+ for i, (filepath, success_message, fail_message) in enumerate(results):
394
+ if filepath:
395
+ # Unique filename for zip
396
+ filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
397
+ filepath_unique = os.path.join(self.output_dir, filename)
398
+ os.rename(filepath, filepath_unique)
399
+ downloaded_files.append(filepath_unique)
400
+ downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
401
+ else:
402
+ failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
403
+
 
404
  if downloaded_files:
405
+ zip_filename = 'papers.zip'
406
+ loop = asyncio.get_running_loop()
407
+ await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
408
+ logger.info(f"ZIP file created: {zip_filename}")
409
+
410
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
411
 
412
  def create_gradio_interface():
 
418
  # Check file type
419
  if not bib_file.name.lower().endswith('.bib'):
420
  return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
421
+
422
+
423
  zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file, progress.update)
424
+
425
  return zip_path, downloaded_dois, failed_dois, None
426
  elif doi_input:
427
  filepath, message, failed_doi = await downloader.download_single_doi_async(doi_input,progress.update)
428
  return None, message, failed_doi, filepath
429
  elif dois_input:
430
+ zip_path, downloaded_dois, failed_dois = await downloader.download_multiple_dois_async(dois_input, progress.update)
431
+ return zip_path, downloaded_dois, failed_dois, None
432
  else:
433
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
434
 
435
 
436
  # Gradio Interface
 
440
  gr.File(file_types=['.bib'], label="Upload BibTeX File"),
441
  gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
442
  gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
443
+
444
  ],
445
  outputs=[
446
  gr.File(label="Download Papers (ZIP) or Single PDF"),