C2MV committed on
Commit 7f39ca5 · verified · 1 Parent(s): 706bc66

Update app.py

Files changed (1)
  1. app.py +305 -214

app.py CHANGED
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
  import io
  import asyncio
  import aiohttp
- from concurrent.futures import ThreadPoolExecutor

  # Configure logging
  logging.basicConfig(level=logging.INFO,
@@ -42,6 +42,7 @@ class PaperDownloader:
  'Accept-Language': 'en-US,en;q=0.9',
  }
  self.executor = ThreadPoolExecutor(max_workers=4)

  def clean_doi(self, doi):
  """Clean and encode DOI for URL"""
@@ -67,11 +68,12 @@ class PaperDownloader:
  retry_count = 0

  while redirect_count <= max_redirects:
  try:
  while retry_count <= max_retries:
  try:
- logger.debug(f"Fetching PDF from {current_url} - Retry {retry_count + 1}")#ADDED
- async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
  if response.status in [301, 302, 307, 308]:
  current_url = response.headers['Location']
  redirect_count += 1
@@ -90,13 +92,18 @@ class PaperDownloader:
  logger.debug(f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
  retry_count += 1
  await asyncio.sleep(retry_delay)
-
  retry_count = 0 # Reset the retry count, in case there's a next redirect attempt

  except Exception as e:
  logger.debug(f"Error getting PDF from {current_url}: {e}")
  return None

  logger.debug(f"Too many redirects or retries {url}, not following this link further")
  return None
@@ -139,19 +146,22 @@ class PaperDownloader:
  except Exception as e:
  logger.debug(f"Error trying to get the PDF from {doi}: {e}")
  return None
-
  async def download_paper_scihub_async(self, session, doi):
  """Improved method to download paper from Sci-Hub using async requests"""
  if not doi:
  logger.warning("DOI not provided")
  return None
-
  for base_url in self.download_sources:
  try:
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
  if not text:
  continue

  # Search for multiple PDF URL patterns
  pdf_patterns = [
@@ -159,10 +169,12 @@ class PaperDownloader:
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
  ]

  pdf_urls = []
  for pattern in pdf_patterns:
  pdf_urls.extend(re.findall(pattern, text))

  # Try downloading from found URLs, but iterate over ALL
  for pdf_url in pdf_urls:
@@ -170,10 +182,11 @@ class PaperDownloader:
  if pdf_content:
  logger.debug(f"Found PDF from: {pdf_url}")
  return pdf_content
-
  except Exception as e:
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-
  return None

  async def download_paper_libgen_async(self, session, doi):
@@ -300,223 +313,300 @@ class PaperDownloader:
  if pdf_content:
  logger.info(f"Descarga exitosa de {doi} usando {strategy.__name__}")
  return pdf_content
  except Exception as e:
  logger.debug(f"Error en estrategia {strategy.__name__} para {doi}: {e}") #ADDED
-
  # Si ninguna estrategia funcionó, esperar un poco antes de reintentar
  await asyncio.sleep(1) # Pequeña pausa entre reintentos
-
  # Si se agotan todos los reintentos
- logger.warning(f"FALLO FINAL: No se pudo descargar DOI {doi} después de {max_retries} intentos")
  return None
-
- async def download_single_doi_async(self, doi, progress_callback):
  """Descargar un único DOI con retroalimentación de progreso"""
  if not doi:
- return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado", ""
  logger.info(f"Starting download process for DOI: {doi}")
  try:
- pdf_content = await self.download_with_retry_async(doi)
- if pdf_content:
- logger.info(f"Downloaded PDF for DOI: {doi}")
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
- filepath = os.path.join(self.output_dir, filename)
-
- # Escribir contenido del PDF
- loop = asyncio.get_running_loop()
- await loop.run_in_executor(
- self.executor,
- lambda: open(filepath, 'wb').write(pdf_content)
- )
- logger.info(f"Saved PDF to file: {filepath}")
- await asyncio.sleep(0.1) #force wait here
- progress_callback(f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>")
- return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', "", ""
- else:
- logger.warning(f"No se pudo descargar: {doi}")
- await asyncio.sleep(0.1) #force wait here
- progress_callback(f"No se pudo descargar: <a href='https://doi.org/{doi}'>{doi}</a>")
- return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>', ""
-
  except Exception as e:
- logger.error(f"Error processing {doi}: {e}")
- await asyncio.sleep(0.1) #force wait here
- progress_callback(f"Error procesando {doi}: <a href='https://doi.org/{doi}'>{e}")
- return None, f"Error procesando {doi}: {e}", f"Error processing {doi}: {e}", ""
-
-
- async def download_multiple_dois_async(self, dois_text, progress_callback):
- # Validar entrada
- if not dois_text:
- return None, "Error: No DOIs provided", "Error: No DOIs provided", ""

- # Sanitizar y filtrar DOIs
- # Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
- dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
-
- # Validar lista de DOIs
- if not dois:
- return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", ""
-
- # Listas para rastrear resultados
- downloaded_files = [] # Rutas de archivos descargados
- failed_dois = [] # DOIs que no se pudieron descargar
- downloaded_links = [] # Links de DOIs descargados

- # Generar tareas de descarga concurrentes
- download_tasks = []
- for doi in dois:
- task = self.download_single_doi_async(doi, progress_callback)
- download_tasks.append(task)
-
- # Ejecutar todas las descargas concurrentemente
- results = await asyncio.gather(*download_tasks, return_exceptions=True)
-
- # Procesar resultados de cada DOI
- for i, result in enumerate(results):
- doi = dois[i]

- # Manejar diferentes tipos de resultados
- if isinstance(result, Exception):
- # Excepción inesperada
- error_msg = f"Unexpected error: {str(result)}"
- logger.error(f"Error downloading {doi}: {error_msg}")
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')

- elif result[0] is None:
- # Descarga fallida (resultado de download_single_doi_async)
- error_msg = result[1]
- logger.warning(f"Failed to download {doi}: {error_msg}")
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')

- else:
- # Descarga exitosa
- filepath = result[0]

- # Generar nombre de archivo único
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
- filepath_unique = os.path.join(self.output_dir, filename)

- try:
- # Renombrar archivo
- os.rename(filepath, filepath_unique)

- # Añadir a lista de archivos descargados
  downloaded_files.append(filepath_unique)
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
- except Exception as rename_error:
- logger.error(f"Error renaming file for {doi}: {rename_error}")
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
- await asyncio.sleep(0.1)
- # Crear archivo ZIP si hay archivos descargados
- zip_filename = None
- if downloaded_files:
- zip_filename = 'papers.zip'
- loop = asyncio.get_running_loop()
-
- # Ejecutar creación de ZIP en un executor para no bloquear
- await loop.run_in_executor(
- self.executor,
- lambda: self.create_zip(zip_filename, downloaded_files)
- )
- logger.info(f"ZIP file created: {zip_filename}")
-
- # Devolver resultados
- return (
- zip_filename if downloaded_files else None, # Archivo ZIP o None
- "\n".join(downloaded_links), # DOIs descargados
- "\n".join(failed_dois), # DOIs fallidos
- ""
- )
-
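For reference, a standalone sketch of the gather-based pattern this removed method relied on; fetch_one and the sample DOIs are illustrative stand-ins, not part of app.py.

# Standalone sketch (not part of app.py): gather with return_exceptions=True,
# which is why the removed code checked isinstance(result, Exception).
import asyncio

async def fetch_one(doi):
    await asyncio.sleep(0.1)            # stands in for a real download
    if doi.endswith("bad"):
        raise RuntimeError(f"no source found for {doi}")
    return f"{doi}.pdf"

async def main():
    dois = ["10.1/a", "10.2/bad", "10.3/c"]
    tasks = [fetch_one(doi) for doi in dois]
    # One failure does not cancel the other downloads.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for doi, result in zip(dois, results):
        if isinstance(result, Exception):
            print(f"{doi}: failed ({result})")
        else:
            print(f"{doi}: saved as {result}")

asyncio.run(main())
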
- async def process_bibtex_async(self, bib_file, progress_callback):
- """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
- # Read BibTeX file content from the uploaded object
- try:
- with open(bib_file.name, 'r', encoding='utf-8') as f:
- bib_content = f.read()
- except Exception as e:
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", ""
-
- # Parse BibTeX data
- try:
- bib_database = bibtexparser.loads(bib_content)
- except Exception as e:
- logger.error(f"Error parsing BibTeX data: {e}")
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", ""
-
- # Extract DOIs
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
- logger.info(f"Found {len(dois)} DOIs to download")
-
- # Result lists
- downloaded_files = []
- failed_dois = []
- downloaded_links = []
-
- tasks = [self.download_single_doi_async(doi, progress_callback) for doi in dois]
- results = await asyncio.gather(*tasks)
-
- for i, (filepath, success_message, fail_message,_) in enumerate(results):
- if filepath:
- # Unique filename for zip
- filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
- filepath_unique = os.path.join(self.output_dir, filename)
- os.rename(filepath, filepath_unique)
- downloaded_files.append(filepath_unique)
- downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
- else:
- failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
- await asyncio.sleep(0.1) #force wait
- if downloaded_files:
- zip_filename = 'papers.zip'
- loop = asyncio.get_running_loop()
- await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
- logger.info(f"ZIP file created: {zip_filename}")
-
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), ""

  def create_zip(self, zip_filename, files):
  """Crea un archivo zip con los pdfs descargados"""
  with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
  for file in files:
  zf.write(file, os.path.basename(file))

  def create_gradio_interface():
  """Create Gradio interface for Paper Downloader"""
  downloader = PaperDownloader()

- def update_progress(message, log_message=""):
- return gr.HTML(value=f"{message}"), gr.HTML(value=f"<pre>{log_message}</pre>")

- async def download_papers(bib_file, doi_input, dois_input):

- def custom_progress_callback(message):
- logger.info(f"Callback message: {message}")

- return update_progress(message)
-
- if bib_file:
- # Check file type
- if not bib_file.name.lower().endswith('.bib'):
- return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", "", None

- zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file, custom_progress_callback)
- await asyncio.sleep(0.1) #force wait
- return zip_path, downloaded_dois, failed_dois, "", None
- elif doi_input:
- filepath, message, failed_doi, _ = await downloader.download_single_doi_async(doi_input,custom_progress_callback)
- await asyncio.sleep(0.1) #force wait
- return None, message, failed_doi, "", filepath
- elif dois_input:
- zip_path, downloaded_dois, failed_dois, _ = await downloader.download_multiple_dois_async(dois_input, custom_progress_callback)
- await asyncio.sleep(0.1) #force wait
- return zip_path, downloaded_dois, failed_dois, "", None
- else:
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", "", None
-

  # Gradio Interface
@@ -525,37 +615,23 @@ def create_gradio_interface():
  inputs=[
  gr.File(file_types=['.bib'], label="Upload BibTeX File"),
  gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
- gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
  ],
  outputs=[
  gr.File(label="Download Papers (ZIP) or Single PDF"),
- gr.HTML(label="""
- <div style='padding-bottom: 5px; font-weight: bold;'>
- Found DOIs
- </div>
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
- <div id="downloaded-dois"></div>
- </div>
  """),
- gr.HTML(label="""
- <div style='padding-bottom: 5px; font-weight: bold;'>
- Missed DOIs
- </div>
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
- <div id="failed-dois"></div>
- </div>
  """),
- gr.HTML(label="""
- <div style='padding-bottom: 5px; font-weight: bold;'>
- Logs
- </div>
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px; max-height: 150px; overflow-y: auto;white-space: pre-line;'>
-
- </div>
-
- """,),
  gr.File(label="Downloaded Single PDF")
  ],

  title="🔬 Academic Paper Batch Downloader",
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
  theme="Hev832/Applio",
@@ -584,9 +660,24 @@ def create_gradio_interface():

  }
  """,
  cache_examples=False,
  )

  return interface

  def main():
 
  import io
  import asyncio
  import aiohttp
+ from concurrent.futures import ThreadPoolExecutor, CancelledError

  # Configure logging
  logging.basicConfig(level=logging.INFO,
 
  'Accept-Language': 'en-US,en;q=0.9',
  }
  self.executor = ThreadPoolExecutor(max_workers=4)
+ self.download_task = None # Added attribute

  def clean_doi(self, doi):
  """Clean and encode DOI for URL"""
 
  retry_count = 0

  while redirect_count <= max_redirects:
+
  try:
  while retry_count <= max_retries:
  try:
+ logger.debug(f"Fetching PDF from {current_url} - Retry {retry_count + 1}")#ADDED
+ async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
  if response.status in [301, 302, 307, 308]:
  current_url = response.headers['Location']
  redirect_count += 1
 
  logger.debug(f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
  retry_count += 1
  await asyncio.sleep(retry_delay)
+
  retry_count = 0 # Reset the retry count, in case there's a next redirect attempt
+ except CancelledError:
+ logger.info(f"Fetch PDF cancelled from: {url}")
+ return None

  except Exception as e:
  logger.debug(f"Error getting PDF from {current_url}: {e}")
  return None

+
+
  logger.debug(f"Too many redirects or retries {url}, not following this link further")
  return None
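A standalone sketch of how the new except CancelledError branch behaves; on Python 3.8+ concurrent.futures.CancelledError is an alias of asyncio.CancelledError, so it also catches task cancellation inside a coroutine. fetch_pdf and the URL are illustrative.

# Standalone sketch (not part of app.py): a cancelled coroutine that catches
# CancelledError and returns None instead of propagating the cancellation.
import asyncio
from concurrent.futures import CancelledError

async def fetch_pdf(url):
    try:
        await asyncio.sleep(10)  # stands in for the aiohttp request
        return b"%PDF..."
    except CancelledError:
        print(f"Fetch PDF cancelled from: {url}")
        return None  # mirrors the early return in the patch

async def main():
    task = asyncio.create_task(fetch_pdf("https://example.org/paper.pdf"))
    await asyncio.sleep(0.1)
    task.cancel()
    print(await task)  # -> None: the coroutine swallowed the cancellation

asyncio.run(main())
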
 
 
  except Exception as e:
  logger.debug(f"Error trying to get the PDF from {doi}: {e}")
  return None
+
  async def download_paper_scihub_async(self, session, doi):
  """Improved method to download paper from Sci-Hub using async requests"""
  if not doi:
  logger.warning("DOI not provided")
  return None
+
+
  for base_url in self.download_sources:
+
  try:
  scihub_url = f"{base_url}{self.clean_doi(doi)}"
  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
  if not text:
  continue
+

  # Search for multiple PDF URL patterns
  pdf_patterns = [
 
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
  ]
+

  pdf_urls = []
  for pattern in pdf_patterns:
  pdf_urls.extend(re.findall(pattern, text))
+

  # Try downloading from found URLs, but iterate over ALL
  for pdf_url in pdf_urls:
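A standalone sketch of what the pdf_patterns above extract from a fetched page; the sample HTML is made up.

# Standalone sketch (not part of app.py): pulling candidate PDF links out of HTML text.
import re

sample_html = '''
<div id="article">
  <embed src="https://sci-hub.example/download/10.1234/abcd.pdf#view=FitH"></embed>
  <a href="https://mirror.example/pdf/10.1234/abcd.pdf">direct link</a>
</div>
'''

pdf_patterns = [
    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
]

pdf_urls = []
for pattern in pdf_patterns:
    pdf_urls.extend(re.findall(pattern, sample_html))

print(pdf_urls)
# ['https://sci-hub.example/download/10.1234/abcd.pdf#view=FitH',
#  'https://mirror.example/pdf/10.1234/abcd.pdf']
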
 
  if pdf_content:
  logger.debug(f"Found PDF from: {pdf_url}")
  return pdf_content
+
  except Exception as e:
+ logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+
  return None

  async def download_paper_libgen_async(self, session, doi):
 
  if pdf_content:
  logger.info(f"Descarga exitosa de {doi} usando {strategy.__name__}")
  return pdf_content
+ except CancelledError:
+ logger.info(f"Download cancelled on strategy: {strategy.__name__} with DOI {doi}")
+ return None # return here in order to stop retry
+
  except Exception as e:
  logger.debug(f"Error en estrategia {strategy.__name__} para {doi}: {e}") #ADDED
+
+
  # Si ninguna estrategia funcionó, esperar un poco antes de reintentar
  await asyncio.sleep(1) # Pequeña pausa entre reintentos
+
+ if retry == max_retries -1: #log all if it does not works on max retries.
+ logger.warning(f"FALLO FINAL: No se pudo descargar DOI {doi} después de {max_retries} intentos")
+
+
  # Si se agotan todos los reintentos
  return None
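A standalone sketch of the retry-over-strategies loop that download_with_retry_async appears to implement around the lines shown above; the surrounding loop structure and the two dummy strategies are guesses for illustration, not the committed code.

# Standalone sketch (not part of app.py): try each strategy per attempt, pause between
# attempts, and log a final failure after the last retry.
import asyncio

async def download_with_retry(doi, strategies, max_retries=3):
    for retry in range(max_retries):
        for strategy in strategies:
            try:
                pdf_content = await strategy(doi)
                if pdf_content:
                    print(f"Descarga exitosa de {doi} usando {strategy.__name__}")
                    return pdf_content
            except asyncio.CancelledError:
                return None  # stop immediately if the surrounding task was cancelled
            except Exception as e:
                print(f"Error en estrategia {strategy.__name__} para {doi}: {e}")
        await asyncio.sleep(1)  # small pause before the next round
        if retry == max_retries - 1:
            print(f"FALLO FINAL: no se pudo descargar {doi} después de {max_retries} intentos")
    return None

async def always_fails(doi):
    raise RuntimeError("mirror unreachable")

async def never_finds(doi):
    return None

asyncio.run(download_with_retry("10.1234/demo", [always_fails, never_finds]))
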
+
+ def _download_single_doi(self, doi, progress_callback, cancel_event): # removed async keyword
  """Descargar un único DOI con retroalimentación de progreso"""
  if not doi:
+ progress_callback(None, "Error: DOI no proporcionado", "Error: DOI no proporcionado")
+ return None
  logger.info(f"Starting download process for DOI: {doi}")
+
  try:
+
+ async def call_async():# Added this in order to execute correctly on executor
+
+ pdf_content = await self.download_with_retry_async(doi)
+ if pdf_content:
+ logger.info(f"Downloaded PDF for DOI: {doi}")
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+ filepath = os.path.join(self.output_dir, filename)
+
+ # Escribir contenido del PDF
+
+ open(filepath, 'wb').write(pdf_content)
+
+ logger.info(f"Saved PDF to file: {filepath}")
+
+ logger.info(f"Descarga exitosa: {filename}")
+
+ progress_callback(filepath, f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>", "")
+
+
+ else:
+ logger.warning(f"No se pudo descargar: {doi}")
+ progress_callback(None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+ asyncio.run(call_async()) #added the loop event here
+
+ except CancelledError:
+ logger.info(f"Download Cancelled DOI: {doi}")
+ progress_callback(None, f"Download cancelled {doi}","Download Cancelled" )
+
  except Exception as e:
+ logger.error(f"Error processing {doi}: {e}")
+ progress_callback(None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}" )
+

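A standalone sketch of the new execution model for _download_single_doi: a synchronous worker submitted to the ThreadPoolExecutor that drives its own event loop with asyncio.run(); fake_fetch, the DOI and the callback here are illustrative.

# Standalone sketch (not part of app.py): run a coroutine from a plain function
# that executes on an executor thread.
import asyncio
from concurrent.futures import ThreadPoolExecutor

async def fake_fetch(doi):
    await asyncio.sleep(0.1)          # stands in for download_with_retry_async
    return f"pdf-bytes-for-{doi}"

def download_worker(doi, progress_callback):
    async def call_async():
        content = await fake_fetch(doi)
        progress_callback(doi, f"Downloaded {doi}" if content else f"Failed {doi}")
    # Each worker thread gets its own short-lived event loop.
    asyncio.run(call_async())

with ThreadPoolExecutor(max_workers=4) as executor:
    future = executor.submit(download_worker, "10.1234/demo", lambda doi, msg: print(msg))
    future.result()  # wait so the example exits cleanly
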
+ def download_multiple_dois(self, dois_text, progress_callback, cancel_event): #removed async here
+ """Download multiple DOIs"""
+ # Validar entrada
+ if not dois_text:
+ progress_callback(None, "Error: No DOIs provided", "Error: No DOIs provided",)
+ return None
+
+ # Sanitizar y filtrar DOIs
+ # Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
+ dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
+
+ # Validar lista de DOIs
+ if not dois:
+ progress_callback(None, "Error: No valid DOIs provided", "Error: No valid DOIs provided")
+ return None

+ # Listas para rastrear resultados
+ downloaded_files = [] # Rutas de archivos descargados
+ failed_dois = [] # DOIs que no se pudieron descargar
+ downloaded_links = [] # Links de DOIs descargados
+
+
+ for doi in dois:
+ self._download_single_doi(doi, lambda a,b,c: progress_callback(a,b,c, doi) , cancel_event )
+ if cancel_event.is_set():
+ logger.info("Downloads cancelled on multiple dois download")
+ progress_callback(None, "Downloads cancelled","Downloads cancelled") # early return on cancelled
+ return None #break here when is cancelled
+
+ result = self.results_dict.get(doi, (None,None,"")) # obtain from self.results
+
+ # Manejar diferentes tipos de resultados
+ if isinstance(result, Exception):
+ # Excepción inesperada
+ error_msg = f"Unexpected error: {str(result)}"
+ logger.error(f"Error downloading {doi}: {error_msg}")
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
+
+ elif result[0] is None:
+ # Descarga fallida (resultado de download_single_doi_async)
+ error_msg = result[1]
+ logger.warning(f"Failed to download {doi}: {error_msg}")
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
+
+ else:
+ # Descarga exitosa
+ filepath = result[0]
+
+ # Generar nombre de archivo único
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+ filepath_unique = os.path.join(self.output_dir, filename)
+
+ try:
+ # Renombrar archivo
+ os.rename(filepath, filepath_unique)
+
+ # Añadir a lista de archivos descargados
+ downloaded_files.append(filepath_unique)
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+ except Exception as rename_error:
+ logger.error(f"Error renaming file for {doi}: {rename_error}")
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
+
+
+
+
+ # Crear archivo ZIP si hay archivos descargados
+ zip_filename = None
+ if downloaded_files:
+ zip_filename = 'papers.zip'
+ loop = asyncio.get_running_loop()

+ # Ejecutar creación de ZIP en un executor para no bloquear
+ loop.run_in_executor(
+ self.executor,
+ lambda: self.create_zip(zip_filename, downloaded_files)
+ )
+ logger.info(f"ZIP file created: {zip_filename}")
+
+
+ # Devolver resultados
+ progress_callback( zip_filename if downloaded_files else None, "\n".join(downloaded_links),"\n".join(failed_dois))
+ return

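A standalone sketch of the cooperative cancellation loop used above. The commit creates an asyncio.Event, but since the worker runs on an executor thread this sketch uses threading.Event, the thread-safe equivalent; process_one and the sample DOIs are illustrative.

# Standalone sketch (not part of app.py): check a cancellation flag between items
# and keep per-DOI results in a shared dict.
import threading
import time

def process_one(doi, results):
    time.sleep(0.2)                      # stands in for one download
    results[doi] = f"{doi}.pdf"

def download_many(dois, cancel_event):
    results = {}
    for doi in dois:
        if cancel_event.is_set():
            print("Downloads cancelled")
            return results               # stop early, keep what we have
        process_one(doi, results)
    return results

cancel_event = threading.Event()
worker = threading.Thread(target=lambda: print(download_many(["10.1/a", "10.2/b", "10.3/c"], cancel_event)))
worker.start()
time.sleep(0.3)
cancel_event.set()                        # ask the worker to stop after the current DOI
worker.join()
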
+ def process_bibtex(self, bib_file, progress_callback, cancel_event):# removed async here
+ """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
+ # Read BibTeX file content from the uploaded object
+ try:
+ with open(bib_file.name, 'r', encoding='utf-8') as f:
+ bib_content = f.read()
+ except Exception as e:
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+ progress_callback(None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}" )
+ return None
+
+ # Parse BibTeX data
+ try:
+ bib_database = bibtexparser.loads(bib_content)
+ except Exception as e:
+ logger.error(f"Error parsing BibTeX data: {e}")
+ progress_callback(None,f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}")
+ return None
+
+ # Extract DOIs
+ dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+ logger.info(f"Found {len(dois)} DOIs to download")
+
+ # Result lists
+ downloaded_files = []
+ failed_dois = []
+ downloaded_links = []
+
+ for doi in dois:
+
+ self._download_single_doi(doi, lambda a,b,c: progress_callback(a,b,c, doi), cancel_event )
+ if cancel_event.is_set():
+ logger.info("Download Cancelled in bibtex mode")
+ progress_callback(None, "Download Cancelled", "Download Cancelled")
+ return None #cancel if requested
+
+ result = self.results_dict.get(doi, (None,None,"")) # obtain from self.results

+ if isinstance(result, Exception):
+ # Excepción inesperada
+ error_msg = f"Unexpected error: {str(result)}"
+ logger.error(f"Error downloading {doi}: {error_msg}")
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')

+ elif result[0] is None:
+ # Descarga fallida (resultado de download_single_doi_async)
+ error_msg = result[1]
+ logger.warning(f"Failed to download {doi}: {error_msg}")
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')

+ else:
+ # Descarga exitosa
+ filepath = result[0]

+ # Unique filename for zip
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+ filepath_unique = os.path.join(self.output_dir, filename)
+ os.rename(filepath, filepath_unique)
  downloaded_files.append(filepath_unique)
  downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+
+ if downloaded_files:
+ zip_filename = 'papers.zip'
+ loop = asyncio.get_running_loop()
+ loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
+ logger.info(f"ZIP file created: {zip_filename}")

+ progress_callback(zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)) #after process finishes
+ return
+
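A standalone sketch of the DOI-extraction step above, using the bibtexparser v1 API; the sample entry is made up.

# Standalone sketch (not part of app.py): pulling DOIs out of a BibTeX string.
# Requires the bibtexparser package.
import bibtexparser

bib_content = """
@article{smith2020,
  title  = {An Example Paper},
  author = {Smith, Jane},
  year   = {2020},
  doi    = {10.1234/example.2020.001}
}
"""

bib_database = bibtexparser.loads(bib_content)
dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
print(dois)  # ['10.1234/example.2020.001']
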
  def create_zip(self, zip_filename, files):
  """Crea un archivo zip con los pdfs descargados"""
  with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
  for file in files:
  zf.write(file, os.path.basename(file))
+ def cancel_download(self):
+ if self.download_task:
+ self.cancel_event.set()
+ # Cancel the download task if it exists and it is cancelable
+
+ self.download_task.cancel()
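A standalone sketch of why cancel_download both sets the event and calls Future.cancel(): cancel() only prevents a job that has not started yet, while already-running work has to notice the event itself. Names here are illustrative.

# Standalone sketch (not part of app.py): Future.cancel() vs. a cooperative flag.
import threading
import time
from concurrent.futures import ThreadPoolExecutor

cancel_event = threading.Event()

def long_job():
    for _ in range(50):
        if cancel_event.is_set():
            return "stopped cooperatively"
        time.sleep(0.1)
    return "finished"

executor = ThreadPoolExecutor(max_workers=1)
task = executor.submit(long_job)
time.sleep(0.2)              # the job is already running...
print(task.cancel())         # ...so cancel() returns False
cancel_event.set()           # the cooperative flag is what actually stops it
print(task.result())         # -> "stopped cooperatively"
executor.shutdown()
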
 
  def create_gradio_interface():
  """Create Gradio interface for Paper Downloader"""
  downloader = PaperDownloader()

+ downloader.results_dict = {} #shared results dict, since it runs on different threads

+ def update_progress( message="", logs=""):
+ return gr.Textbox.update(value=f"{message}"),gr.Textbox.update(value=f"<pre>{logs}</pre>")
+

+ def download_papers(bib_file, doi_input, dois_input):
+ cancel_event = asyncio.Event() # Create cancellation event for every submission.
+ downloader.cancel_event = cancel_event # store the event so that it is available to stop the process

+ def custom_progress_callback(filepath, message, fail_message, doi=None): #new callback to send to the execution function
+
+ logger.info(f"Callback message: {message}") # log each callback msg
+
+ #store data for single or multiple mode on download_papers execution.
+ if doi:
+ downloader.results_dict[doi] = (filepath, message,fail_message)
+
+ updates = update_progress(message)
+
+ return updates
+
+
+ if bib_file:
+ # Check file type
+ if not bib_file.name.lower().endswith('.bib'):
+ return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
+ downloader.download_task = downloader.executor.submit(
+ downloader.process_bibtex,
+ bib_file,
+ lambda a,b,c: update_progress(a,f"{b}<br>{c}"), #convert for ui output, the return data will contain the HTML
+ cancel_event # Added cancelllation event.
+ )
+
+ return None,"","",None
+
+ elif doi_input:
+
+ downloader.download_task = downloader.executor.submit( #changed async execution method
+ downloader._download_single_doi,
+ doi_input,
+ lambda a,b,c: update_progress(a,f"{b}<br>{c}") , #callback function, format output and send html info, removed lambda from executor calls
+ cancel_event # Add cancellation event.
+ )
+
+ return None, "","", None

+ elif dois_input:
+ downloader.download_task = downloader.executor.submit( #changed async execution method
+ downloader.download_multiple_dois,
+ dois_input,
+ lambda a,b,c: update_progress(a,f"{b}<br>{c}") ,#callback function
+ cancel_event #Add cancellation event.
+ )
+
+ return None, "","", None
+
+ else:
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs","", None
+
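A standalone sketch of the submit-and-return wiring download_papers now uses: the handler pushes the work onto the executor, stores the Future, and returns immediately so a separate stop action can signal the event. The Downloader class here is a stripped-down stand-in, not the real one.

# Standalone sketch (not part of app.py): fire-and-forget submission with a stored
# Future and a cooperative cancellation event.
import threading
import time
from concurrent.futures import ThreadPoolExecutor

class Downloader:
    def __init__(self):
        self.executor = ThreadPoolExecutor(max_workers=4)
        self.download_task = None
        self.cancel_event = threading.Event()

    def run(self, dois, cancel_event):
        for doi in dois:
            if cancel_event.is_set():
                return
            time.sleep(0.1)  # stands in for one download

downloader = Downloader()

def on_submit(dois):
    downloader.cancel_event = threading.Event()
    downloader.download_task = downloader.executor.submit(downloader.run, dois, downloader.cancel_event)
    return "started"         # the UI handler returns immediately

def on_stop():
    downloader.cancel_event.set()
    downloader.download_task.cancel()

print(on_submit(["10.1/a", "10.2/b"]))
on_stop()
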
 
  # Gradio Interface
 
  inputs=[
  gr.File(file_types=['.bib'], label="Upload BibTeX File"),
  gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
+ gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n..."),
+
  ],
  outputs=[
  gr.File(label="Download Papers (ZIP) or Single PDF"),
+ gr.Textbox(label="""
+ Found DOIs
  """),
+ gr.Textbox(label="""
+ Missed DOIs
  """),
+ gr.Textbox(label="""
+ Logs
+ """, lines = 10),
  gr.File(label="Downloaded Single PDF")
  ],
+
  title="🔬 Academic Paper Batch Downloader",
  description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
  theme="Hev832/Applio",
 
  }
  """,
+
+
  cache_examples=False,
  )

+ interface.load= """
+ () => {
+
+ return []
+ }
+ """
+ # Adding clear button
+ with gr.Row():
+
+ interface.stop_btn= gr.Button(value="Stop Downloads")
+
+ interface.stop_btn.click(lambda: downloader.cancel_download(), outputs=None) #added function in object downloader
+
  return interface

  def main():