C2MV committed on
Commit
bc356d0
verified
1 Parent(s): 434b119

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -130
app.py CHANGED
@@ -43,7 +43,7 @@ class PaperDownloader:
43
  }
44
  self.executor = ThreadPoolExecutor(max_workers=4)
45
  self.download_task = None # Added attribute
46
-
47
  def clean_doi(self, doi):
48
  """Clean and encode DOI for URL"""
49
  if not isinstance(doi, str):
@@ -331,11 +331,11 @@ class PaperDownloader:
331
  # Si se agotan todos los reintentos
332
  return None
333
 
334
- def _download_single_doi(self, doi, progress_callback, cancel_event): # removed async keyword
335
  """Descargar un único DOI con retroalimentación de progreso"""
336
  if not doi:
337
- progress_callback(None, "Error: DOI no proporcionado", "Error: DOI no proporcionado","" , None)
338
- return None
339
  logger.info(f"Starting download process for DOI: {doi}")
340
 
341
  try:
@@ -363,24 +363,23 @@ class PaperDownloader:
363
  logger.warning(f"No se pudo descargar: {doi}")
364
  return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
365
 
366
- filepath, message, error = asyncio.run(call_async()) #added the loop event here
367
- progress_callback(filepath, message, error, None ) # call this once the callback is made
368
-
369
  except CancelledError:
370
  logger.info(f"Download Cancelled DOI: {doi}")
371
- progress_callback(None, f"Download cancelled {doi}","Download Cancelled", None ) #send proper types with null values
372
 
373
  except Exception as e:
374
  logger.error(f"Error processing {doi}: {e}")
375
- progress_callback(None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}", None ) #send proper type of results
376
 
377
 
378
- def download_multiple_dois(self, dois_text, progress_callback, cancel_event): #removed async here
379
  """Download multiple DOIs"""
380
  # Validar entrada
381
  if not dois_text:
382
- progress_callback(None, "Error: No DOIs provided", "Error: No DOIs provided","" , None)
383
- return None
384
 
385
  # Sanitizar y filtrar DOIs
386
  # Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
@@ -388,60 +387,56 @@ class PaperDownloader:
388
 
389
  # Validar lista de DOIs
390
  if not dois:
391
- progress_callback(None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", "" , None)
392
- return None
393
 
394
  # Listas para rastrear resultados
395
  downloaded_files = [] # Rutas de archivos descargados
396
  failed_dois = [] # DOIs que no se pudieron descargar
397
  downloaded_links = [] # Links de DOIs descargados
398
-
399
-
400
  for doi in dois:
401
- filepath, message, error= self._download_single_doi(doi, lambda a,b,c,d : progress_callback(a,b,c,d), cancel_event )
402
- if cancel_event.is_set():
403
- logger.info("Downloads cancelled on multiple dois download")
404
- progress_callback(None, "Downloads cancelled","Downloads cancelled", None) # early return on cancelled
405
- return None #break here when is cancelled
406
-
407
- result = self.results_dict.get(doi, (None,None,"", None)) # obtain from self.results
408
 
409
- if result is None: #when errors happen results are none
410
- continue;
411
-
412
- # Manejar diferentes tipos de resultados
413
- if isinstance(result, Exception):
414
- # Excepción inesperada
 
 
 
 
415
  error_msg = f"Unexpected error: {str(result)}"
416
  logger.error(f"Error downloading {doi}: {error_msg}")
417
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
418
 
419
- elif result[0] is None:
420
- # Descarga fallida (resultado de download_single_doi_async)
421
  error_msg = result[1]
422
  logger.warning(f"Failed to download {doi}: {error_msg}")
423
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
424
-
425
- else:
426
- # Descarga exitosa
427
- filepath = result[0]
428
-
429
- # Generar nombre de archivo único
430
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
431
- filepath_unique = os.path.join(self.output_dir, filename)
432
-
433
- try:
434
- # Renombrar archivo
435
- os.rename(filepath, filepath_unique)
436
 
437
- # Añadir a lista de archivos descargados
438
- downloaded_files.append(filepath_unique)
439
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
440
 
441
- except Exception as rename_error:
442
- logger.error(f"Error renaming file for {doi}: {rename_error}")
443
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
444
-
445
 
446
 
447
  # Crear archivo ZIP si hay archivos descargados
@@ -456,13 +451,11 @@ class PaperDownloader:
456
  lambda: self.create_zip(zip_filename, downloaded_files)
457
  )
458
  logger.info(f"ZIP file created: {zip_filename}")
459
-
 
460
 
461
- # Devolver resultados
462
- progress_callback( zip_filename if downloaded_files else None, "\n".join(downloaded_links),"\n".join(failed_dois),"" , None) # send empty values on callback to not break it.
463
- return
464
-
465
- def process_bibtex(self, bib_file, progress_callback, cancel_event):# removed async here
466
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
467
  # Read BibTeX file content from the uploaded object
468
  try:
@@ -470,16 +463,14 @@ class PaperDownloader:
470
  bib_content = f.read()
471
  except Exception as e:
472
  logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
473
- progress_callback(None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None )
474
- return None
475
 
476
  # Parse BibTeX data
477
  try:
478
  bib_database = bibtexparser.loads(bib_content)
479
  except Exception as e:
480
  logger.error(f"Error parsing BibTeX data: {e}")
481
- progress_callback(None,f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None)
482
- return None
483
 
484
  # Extract DOIs
485
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
@@ -489,24 +480,22 @@ class PaperDownloader:
489
  downloaded_files = []
490
  failed_dois = []
491
  downloaded_links = []
 
492
  for doi in dois:
 
493
 
494
- filepath, message, error= self._download_single_doi(doi, lambda a,b,c,d: progress_callback(a,b,c,d), cancel_event ) # added lambda for params handling.
495
- if cancel_event.is_set():
496
- logger.info("Download Cancelled in bibtex mode")
497
- progress_callback(None, "Download Cancelled", "Download Cancelled", None)
498
- return None #cancel if requested
499
-
500
- result = self.results_dict.get(doi, (None,None,"",None)) # obtain from self.results
501
-
502
- if result is None:
503
- continue # skips for a None type results when callback fails
504
 
505
  if isinstance(result, Exception):
506
  # Excepción inesperada
507
- error_msg = f"Unexpected error: {str(result)}"
508
- logger.error(f"Error downloading {doi}: {error_msg}")
509
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
510
 
511
  elif result[0] is None:
512
  # Descarga fallida (resultado de download_single_doi_async)
@@ -515,7 +504,7 @@ class PaperDownloader:
515
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
516
 
517
  else:
518
- # Descarga exitosa
519
  filepath = result[0]
520
 
521
  # Unique filename for zip
@@ -524,16 +513,15 @@ class PaperDownloader:
524
  os.rename(filepath, filepath_unique)
525
  downloaded_files.append(filepath_unique)
526
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
527
-
528
-
529
  if downloaded_files:
530
  zip_filename = 'papers.zip'
531
  loop = asyncio.get_running_loop()
532
  loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
533
  logger.info(f"ZIP file created: {zip_filename}")
534
-
535
- progress_callback(zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois),"") # send "", None to conform output
536
- return
537
 
538
  def create_zip(self, zip_filename, files):
539
  """Crea un archivo zip con los pdfs descargados"""
@@ -551,66 +539,50 @@ def create_gradio_interface():
551
  """Create Gradio interface for Paper Downloader"""
552
  downloader = PaperDownloader()
553
 
554
- downloader.results_dict = {} #shared results dict, since it runs on different threads
555
-
556
  def update_progress( message="", logs=""):
557
  return gr.Textbox.update(value=f"{message}"),gr.Textbox.update(value=f"<pre>{logs}</pre>")
558
 
559
 
560
  def download_papers(bib_file, doi_input, dois_input, output_file, downloaded_dois_textbox,failed_dois_textbox,logs, single_file):
561
- cancel_event = asyncio.Event() # Create cancellation event for every submission.
562
- downloader.cancel_event = cancel_event # store the event so that it is available to stop the process
563
 
564
- def custom_progress_callback(filepath, message, fail_message, doi=None): #new callback to send to the execution function
565
-
566
- logger.info(f"Callback message: {message}") # log each callback msg
567
-
568
- #store data for single or multiple mode on download_papers execution.
569
- if doi:
570
- downloader.results_dict[doi] = (filepath, message,fail_message, "")
571
-
572
-
573
- return update_progress(message, fail_message) # send return values only with results
574
 
575
- if bib_file:
576
- # Check file type
577
- if not bib_file.name.lower().endswith('.bib'):
578
- return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", "", None # must return all 5 results at each possibility
579
-
580
 
581
- downloader.download_task = downloader.executor.submit(
582
- downloader.process_bibtex,
583
- bib_file,
584
- lambda a,b,c, d: update_progress(a,f"{b}<br>{c}"), #convert for ui output, the return data will contain the HTML
585
- cancel_event # Added cancellation event.
586
  )
587
-
588
- return None, "","", "", None #must be None ,str , str, str, None ( five params)
589
-
590
 
591
- elif doi_input:
592
- downloader.download_task = downloader.executor.submit( #changed async execution method
593
- downloader._download_single_doi,
594
- doi_input,
595
- lambda a,b,c,d: update_progress(a,f"{b}<br>{c}"), #callback function, format output and send html info, removed lambda from executor calls
596
- cancel_event # Add cancellation event.
597
- )
598
-
599
- return None, "","", "", None #must be None ,str , str, str, None ( five params)
600
 
601
- elif dois_input:
602
- downloader.download_task = downloader.executor.submit( #changed async execution method
603
- downloader.download_multiple_dois,
604
- dois_input,
605
- lambda a,b,c,d: update_progress(a,f"{b}<br>{c}"), #callback function, return simple values
606
- cancel_event #Add cancellation event.
 
 
 
 
 
 
 
 
 
607
  )
608
-
609
- return None, "","", "", None #must be None ,str , str, str, None ( five params)
610
 
611
- else:
612
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", "", None #must be None ,str , str, str, None ( five params)
613
-
614
 
615
  with gr.Blocks(theme="Hev832/Applio", css="""
616
  .gradio-container {
@@ -670,14 +642,14 @@ def create_gradio_interface():
670
 
671
  submit_button.click(
672
  download_papers,
673
- inputs=[bib_file, doi_input, dois_input],
674
  outputs=[output_file, downloaded_dois_textbox, failed_dois_textbox,logs, single_file ], # the new output should be a tuple and we output logs too for debugging.
675
-
676
  )
677
-
678
  interface.title="🔬 Academic Paper Batch Downloader"
679
  interface.description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment."
680
-
681
  return interface
682
 
683
 
 
43
  }
44
  self.executor = ThreadPoolExecutor(max_workers=4)
45
  self.download_task = None # Added attribute
46
+ self.results_dict = {}
47
  def clean_doi(self, doi):
48
  """Clean and encode DOI for URL"""
49
  if not isinstance(doi, str):
 
331
  # Si se agotan todos los reintentos
332
  return None
333
 
334
+ def _download_single_doi(self, doi, cancel_event): # removed async keyword
335
  """Descargar un único DOI con retroalimentación de progreso"""
336
  if not doi:
337
+
338
+ return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado"
339
  logger.info(f"Starting download process for DOI: {doi}")
340
 
341
  try:
 
363
  logger.warning(f"No se pudo descargar: {doi}")
364
  return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
365
 
366
+ filepath, message, error = asyncio.run(call_async()) #added the loop event here
367
+ return filepath, message, error
 
368
  except CancelledError:
369
  logger.info(f"Download Cancelled DOI: {doi}")
370
+ return None, f"Download cancelled {doi}","Download Cancelled"
371
 
372
  except Exception as e:
373
  logger.error(f"Error processing {doi}: {e}")
374
+ return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
375
 
376
 
377
+ def download_multiple_dois(self, dois_text, cancel_event): #removed async here
378
  """Download multiple DOIs"""
379
  # Validar entrada
380
  if not dois_text:
381
+
382
+ return None, "Error: No DOIs provided", "Error: No DOIs provided",""
383
 
384
  # Sanitizar y filtrar DOIs
385
  # Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
 
387
 
388
  # Validar lista de DOIs
389
  if not dois:
390
+
391
+ return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", ""
392
 
393
  # Listas para rastrear resultados
394
  downloaded_files = [] # Rutas de archivos descargados
395
  failed_dois = [] # DOIs que no se pudieron descargar
396
  downloaded_links = [] # Links de DOIs descargados
 
 
397
  for doi in dois:
398
+
399
+ result = self._download_single_doi(doi,cancel_event)
 
 
 
 
 
400
 
401
+ if cancel_event.is_set():
402
+
403
+ logger.info("Downloads cancelled on multiple dois download")
404
+ return None,"Downloads cancelled","Downloads cancelled","" # early return on cancelled
405
+
406
+ if result is None: #when errors happen results are none
407
+ continue
408
+ # Manejar diferentes tipos de resultados
409
+ if isinstance(result, Exception):
410
+ # Excepción inesperada
411
  error_msg = f"Unexpected error: {str(result)}"
412
  logger.error(f"Error downloading {doi}: {error_msg}")
413
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
414
 
415
+ elif result[0] is None:
416
+ # Descarga fallida (resultado de download_single_doi_async)
417
  error_msg = result[1]
418
  logger.warning(f"Failed to download {doi}: {error_msg}")
419
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
420
+
421
+ else:
422
+ # Descarga exitosa
423
+ filepath = result[0]
424
+
425
+ # Generar nombre de archivo único
426
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
427
+ filepath_unique = os.path.join(self.output_dir, filename)
428
+
429
+ try:
430
+ # Renombrar archivo
431
+ os.rename(filepath, filepath_unique)
432
 
433
+ # Añadir a lista de archivos descargados
434
+ downloaded_files.append(filepath_unique)
435
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
436
 
437
+ except Exception as rename_error:
438
+ logger.error(f"Error renaming file for {doi}: {rename_error}")
439
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
 
440
 
441
 
442
  # Crear archivo ZIP si hay archivos descargados
 
451
  lambda: self.create_zip(zip_filename, downloaded_files)
452
  )
453
  logger.info(f"ZIP file created: {zip_filename}")
454
+
455
+ return zip_filename if downloaded_files else None, "\n".join(downloaded_links),"\n".join(failed_dois), "" # returning only results here and not in lambda
456
 
457
+
458
+ def process_bibtex(self, bib_file, cancel_event):# removed async here
 
 
 
459
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
460
  # Read BibTeX file content from the uploaded object
461
  try:
 
463
  bib_content = f.read()
464
  except Exception as e:
465
  logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
466
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}",""
 
467
 
468
  # Parse BibTeX data
469
  try:
470
  bib_database = bibtexparser.loads(bib_content)
471
  except Exception as e:
472
  logger.error(f"Error parsing BibTeX data: {e}")
473
+ return None,f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}",""
 
474
 
475
  # Extract DOIs
476
  dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
 
480
  downloaded_files = []
481
  failed_dois = []
482
  downloaded_links = []
483
+
484
  for doi in dois:
485
+ result = self._download_single_doi(doi, cancel_event) # removed lambda call from executor
486
 
487
+ if cancel_event.is_set():
488
+ logger.info("Download Cancelled in bibtex mode")
489
+ return None, "Download Cancelled", "Download Cancelled" ,"" #cancel if requested
490
+
491
+ if result is None: #skips if it contains null values as a results.
492
+ continue
 
 
 
 
493
 
494
  if isinstance(result, Exception):
495
  # Excepción inesperada
496
+ error_msg = f"Unexpected error: {str(result)}"
497
+ logger.error(f"Error downloading {doi}: {error_msg}")
498
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
499
 
500
  elif result[0] is None:
501
  # Descarga fallida (resultado de download_single_doi_async)
 
504
  failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
505
 
506
  else:
507
+ # Descarga exitosa
508
  filepath = result[0]
509
 
510
  # Unique filename for zip
 
513
  os.rename(filepath, filepath_unique)
514
  downloaded_files.append(filepath_unique)
515
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
516
+
 
517
  if downloaded_files:
518
  zip_filename = 'papers.zip'
519
  loop = asyncio.get_running_loop()
520
  loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
521
  logger.info(f"ZIP file created: {zip_filename}")
522
+
523
+ return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois),"" #send all results
524
+
525
 
526
  def create_zip(self, zip_filename, files):
527
  """Crea un archivo zip con los pdfs descargados"""
 
539
  """Create Gradio interface for Paper Downloader"""
540
  downloader = PaperDownloader()
541
 
 
 
542
  def update_progress( message="", logs=""):
543
  return gr.Textbox.update(value=f"{message}"),gr.Textbox.update(value=f"<pre>{logs}</pre>")
544
 
545
 
546
  def download_papers(bib_file, doi_input, dois_input, output_file, downloaded_dois_textbox,failed_dois_textbox,logs, single_file):
547
+ cancel_event = asyncio.Event() # Create cancellation event for every submission.
548
+ downloader.cancel_event = cancel_event # store the event so that it is available to stop the process
549
 
 
 
 
 
 
 
 
 
 
 
550
 
551
+ if bib_file:
552
+ # Check file type
553
+ if not bib_file.name.lower().endswith('.bib'):
554
+ return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", "", None
 
555
 
556
+ downloader.download_task = downloader.executor.submit(
557
+ downloader.process_bibtex,
558
+ bib_file,
559
+ cancel_event # Added cancellation event.
 
560
  )
561
+ zip_file, downloaded_dois, failed_dois, logs_text = downloader.download_task.result() # gets results from method
562
+ return zip_file, downloaded_dois, failed_dois, logs_text, None # we use this method because all outputs values were already done inside the callback
 
563
 
 
 
 
 
 
 
 
 
 
564
 
565
+ elif doi_input:
566
+ downloader.download_task = downloader.executor.submit( #changed async execution method
567
+ downloader._download_single_doi,
568
+ doi_input,
569
+ cancel_event
570
+ )
571
+ filepath, message, error= downloader.download_task.result()
572
+
573
+ return None, message, error, "", filepath
574
+
575
+ elif dois_input:
576
+ downloader.download_task = downloader.executor.submit( #changed async execution method
577
+ downloader.download_multiple_dois,
578
+ dois_input,
579
+ cancel_event
580
  )
581
+ zip_file, downloaded_dois, failed_dois, logs_text= downloader.download_task.result()
582
+ return zip_file, downloaded_dois, failed_dois, logs_text, None #we use the direct assignments with returns as before on multi.
583
 
584
+ else:
585
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", "", None # all parameters must have return data, to prevent gradio to crash on incomplete data for block
 
586
 
587
  with gr.Blocks(theme="Hev832/Applio", css="""
588
  .gradio-container {
 
642
 
643
  submit_button.click(
644
  download_papers,
645
+ inputs=[bib_file, doi_input, dois_input],
646
  outputs=[output_file, downloaded_dois_textbox, failed_dois_textbox,logs, single_file ], # the new output should be a tuple and we output logs too for debugging.
647
+
648
  )
649
+
650
  interface.title="🔬 Academic Paper Batch Downloader"
651
  interface.description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment."
652
+
653
  return interface
654
 
655