Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -307,57 +307,48 @@ class PaperDownloader:
|
|
307 |
delay *= 2 # Exponential backoff
|
308 |
|
309 |
return None
|
310 |
-
|
311 |
-
|
312 |
async def _download_single_doi(self, doi):
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
if pdf_content:
|
322 |
logger.info(f"Downloaded PDF for DOI: {doi}")
|
323 |
filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
|
324 |
filepath = os.path.join(self.output_dir, filename)
|
325 |
-
|
326 |
-
# Escribir contenido del PDF
|
327 |
-
|
328 |
with open(filepath, 'wb') as f:
|
329 |
f.write(pdf_content)
|
330 |
-
|
331 |
logger.info(f"Saved PDF to file: {filepath}")
|
332 |
-
|
333 |
logger.info(f"Descarga exitosa: {filename}")
|
334 |
-
|
335 |
return filepath, f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>", ""
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
|
|
|
|
|
|
|
|
340 |
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
except Exception as e:
|
345 |
-
logger.error(f"Error processing {doi}: {e}")
|
346 |
-
return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
|
347 |
|
348 |
-
|
349 |
async def download_multiple_dois(self, dois_text, cancel_event):
|
350 |
"""Download multiple DOIs"""
|
351 |
if not dois_text:
|
352 |
return None, "Error: No DOIs provided", "Error: No DOIs provided", ""
|
353 |
-
|
354 |
# Sanitizar y filtrar DOIs
|
355 |
# Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
|
356 |
dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
|
357 |
|
358 |
# Validar lista de DOIs
|
359 |
if not dois:
|
360 |
-
|
361 |
|
362 |
# Listas para rastrear resultados
|
363 |
downloaded_files = [] # Rutas de archivos descargados
|
@@ -365,46 +356,44 @@ class PaperDownloader:
|
|
365 |
downloaded_links = [] # Links de DOIs descargados
|
366 |
|
367 |
for i, doi in enumerate(dois):
|
368 |
-
result = await self._download_single_doi(doi
|
369 |
-
|
370 |
if cancel_event.is_set():
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
if result is None:
|
375 |
-
|
376 |
-
|
377 |
if isinstance(result, Exception):
|
378 |
# Excepción inesperada
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
elif result[0] is None:
|
384 |
-
|
385 |
error_msg = result[1]
|
386 |
logger.warning(f"Failed to download {doi}: {error_msg}")
|
387 |
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
388 |
-
|
389 |
else:
|
390 |
-
|
391 |
filepath = result[0]
|
392 |
-
|
393 |
# Generar nombre de archivo único
|
394 |
-
filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
|
395 |
-
filepath_unique = os.path.join(self.output_dir, filename)
|
396 |
-
|
397 |
try:
|
398 |
-
|
399 |
-
os.rename(filepath, filepath_unique)
|
400 |
|
401 |
-
|
402 |
-
downloaded_files.append(filepath_unique) #Fixed
|
403 |
-
downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')#Fixed
|
404 |
|
405 |
except Exception as rename_error:
|
406 |
-
|
407 |
-
|
408 |
|
409 |
|
410 |
# Crear archivo ZIP si hay archivos descargados
|
@@ -419,75 +408,78 @@ class PaperDownloader:
|
|
419 |
lambda: self.create_zip(zip_filename, downloaded_files)
|
420 |
)
|
421 |
logger.info(f"ZIP file created: {zip_filename}")
|
|
|
|
|
422 |
|
423 |
-
return zip_filename if downloaded_files else None, "\n".join(downloaded_links),"\n".join(failed_dois), ""
|
424 |
-
|
425 |
-
async def process_bibtex(self, bib_file, cancel_event):
|
426 |
-
"""Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
|
427 |
-
# Read BibTeX file content from the uploaded object
|
428 |
-
try:
|
429 |
-
with open(bib_file.name, 'r', encoding='utf-8') as f:
|
430 |
-
bib_content = f.read()
|
431 |
-
except Exception as e:
|
432 |
-
logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
|
433 |
-
return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", ""
|
434 |
-
|
435 |
-
# Parse BibTeX data
|
436 |
-
try:
|
437 |
-
bib_database = bibtexparser.loads(bib_content)
|
438 |
-
except Exception as e:
|
439 |
-
logger.error(f"Error parsing BibTeX data: {e}")
|
440 |
-
return None,f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}",""
|
441 |
-
|
442 |
-
# Extract DOIs
|
443 |
-
dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
|
444 |
-
logger.info(f"Found {len(dois)} DOIs to download")
|
445 |
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
|
|
|
|
|
|
452 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
453 |
if cancel_event.is_set():
|
454 |
-
|
455 |
-
|
|
|
456 |
if result is None:
|
457 |
-
|
458 |
|
459 |
if isinstance(result, Exception):
|
460 |
-
|
461 |
error_msg = f"Unexpected error: {str(result)}"
|
462 |
logger.error(f"Error downloading {doi}: {error_msg}")
|
463 |
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
|
475 |
# Unique filename for zip
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
481 |
|
482 |
|
483 |
-
if downloaded_files:
|
484 |
-
zip_filename = 'papers.zip'
|
485 |
-
loop = asyncio.get_running_loop()
|
486 |
-
loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
|
487 |
-
logger.info(f"ZIP file created: {zip_filename}")
|
488 |
-
|
489 |
-
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), ""
|
490 |
-
|
491 |
def create_zip(self, zip_filename, files):
|
492 |
"""Crea un archivo zip con los pdfs descargados"""
|
493 |
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
|
@@ -504,30 +496,32 @@ def create_gradio_interface():
|
|
504 |
"""Create Gradio interface for Paper Downloader"""
|
505 |
downloader = PaperDownloader()
|
506 |
|
507 |
-
|
508 |
def update_progress( message="", logs=""):
|
509 |
-
|
510 |
|
511 |
|
512 |
async def download_papers(bib_file, doi_input, dois_input):
|
513 |
cancel_event = asyncio.Event() # Create cancellation event for every submission.
|
514 |
downloader.cancel_event = cancel_event # store the event so that it is available to stop the process
|
|
|
515 |
if bib_file:
|
516 |
# Check file type
|
517 |
-
|
518 |
-
|
|
|
|
|
|
|
519 |
|
520 |
-
zip_file, downloaded_dois, failed_dois, logs_text= await downloader.process_bibtex(bib_file, cancel_event)
|
521 |
-
return zip_file, downloaded_dois, failed_dois, logs_text, None
|
522 |
elif doi_input:
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
elif dois_input:
|
527 |
-
|
528 |
-
|
529 |
else:
|
530 |
-
return
|
|
|
531 |
|
532 |
with gr.Blocks(theme="Hev832/Applio", css="""
|
533 |
.gradio-container {
|
@@ -569,10 +563,10 @@ def create_gradio_interface():
|
|
569 |
|
570 |
|
571 |
output_file = gr.File(label="Download Papers (ZIP) or Single PDF")
|
572 |
-
downloaded_dois_textbox = gr.
|
573 |
Found DOIs
|
574 |
""",)
|
575 |
-
failed_dois_textbox=gr.
|
576 |
Missed DOIs
|
577 |
""",)
|
578 |
logs = gr.Textbox(label="""
|
@@ -590,14 +584,15 @@ def create_gradio_interface():
|
|
590 |
inputs=[bib_file, doi_input, dois_input],
|
591 |
outputs=[output_file, downloaded_dois_textbox, failed_dois_textbox,logs, single_file ], # the new output should be a tuple and we output logs too for debugging.
|
592 |
)
|
593 |
-
|
594 |
interface.title="🔬 Academic Paper Batch Downloader"
|
595 |
interface.description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment."
|
|
|
596 |
return interface
|
597 |
|
598 |
def main():
|
599 |
interface = create_gradio_interface()
|
600 |
-
interface.launch()
|
601 |
|
602 |
if __name__ == "__main__":
|
603 |
main()
|
|
|
307 |
delay *= 2 # Exponential backoff
|
308 |
|
309 |
return None
|
|
|
|
|
310 |
async def _download_single_doi(self, doi):
|
311 |
+
"""Descargar un único DOI con retroalimentación de progreso"""
|
312 |
+
if not doi:
|
313 |
+
return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado"
|
314 |
+
logger.info(f"Starting download process for DOI: {doi}")
|
315 |
+
|
316 |
+
try:
|
317 |
+
pdf_content = await self.download_with_retry_async(doi)
|
318 |
+
if pdf_content:
|
|
|
319 |
logger.info(f"Downloaded PDF for DOI: {doi}")
|
320 |
filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
|
321 |
filepath = os.path.join(self.output_dir, filename)
|
|
|
|
|
|
|
322 |
with open(filepath, 'wb') as f:
|
323 |
f.write(pdf_content)
|
|
|
324 |
logger.info(f"Saved PDF to file: {filepath}")
|
|
|
325 |
logger.info(f"Descarga exitosa: {filename}")
|
|
|
326 |
return filepath, f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>", ""
|
327 |
+
|
328 |
+
else:
|
329 |
+
logger.warning(f"No se pudo descargar: {doi}")
|
330 |
+
return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
|
331 |
+
|
332 |
+
except CancelledError:
|
333 |
+
logger.info(f"Download Cancelled DOI: {doi}")
|
334 |
+
return None, f"Download cancelled {doi}","Download Cancelled"
|
335 |
|
336 |
+
except Exception as e:
|
337 |
+
logger.error(f"Error processing {doi}: {e}")
|
338 |
+
return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
|
|
|
|
|
|
|
339 |
|
|
|
340 |
async def download_multiple_dois(self, dois_text, cancel_event):
|
341 |
"""Download multiple DOIs"""
|
342 |
if not dois_text:
|
343 |
return None, "Error: No DOIs provided", "Error: No DOIs provided", ""
|
344 |
+
|
345 |
# Sanitizar y filtrar DOIs
|
346 |
# Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
|
347 |
dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
|
348 |
|
349 |
# Validar lista de DOIs
|
350 |
if not dois:
|
351 |
+
return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", ""
|
352 |
|
353 |
# Listas para rastrear resultados
|
354 |
downloaded_files = [] # Rutas de archivos descargados
|
|
|
356 |
downloaded_links = [] # Links de DOIs descargados
|
357 |
|
358 |
for i, doi in enumerate(dois):
|
359 |
+
result = await self._download_single_doi(doi)
|
360 |
+
|
361 |
if cancel_event.is_set():
|
362 |
+
logger.info("Downloads cancelled on multiple dois download")
|
363 |
+
return None,"Downloads cancelled","Downloads cancelled", ""
|
|
|
364 |
if result is None:
|
365 |
+
continue
|
|
|
366 |
if isinstance(result, Exception):
|
367 |
# Excepción inesperada
|
368 |
+
error_msg = f"Unexpected error: {str(result)}"
|
369 |
+
logger.error(f"Error downloading {doi}: {error_msg}")
|
370 |
+
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
371 |
+
|
372 |
elif result[0] is None:
|
373 |
+
# Descarga fallida (resultado de download_single_doi_async)
|
374 |
error_msg = result[1]
|
375 |
logger.warning(f"Failed to download {doi}: {error_msg}")
|
376 |
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
377 |
+
|
378 |
else:
|
379 |
+
# Descarga exitosa
|
380 |
filepath = result[0]
|
381 |
+
|
382 |
# Generar nombre de archivo único
|
383 |
+
filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf" # indent fix.
|
384 |
+
filepath_unique = os.path.join(self.output_dir, filename)
|
385 |
+
|
386 |
try:
|
387 |
+
# Renombrar archivo
|
388 |
+
os.rename(filepath, filepath_unique) #Fixed ident
|
389 |
|
390 |
+
# Añadir a lista de archivos descargados
|
391 |
+
downloaded_files.append(filepath_unique) #Fixed ident
|
392 |
+
downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')#Fixed ident
|
393 |
|
394 |
except Exception as rename_error:
|
395 |
+
logger.error(f"Error renaming file for {doi}: {rename_error}")
|
396 |
+
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')#Fixed ident
|
397 |
|
398 |
|
399 |
# Crear archivo ZIP si hay archivos descargados
|
|
|
408 |
lambda: self.create_zip(zip_filename, downloaded_files)
|
409 |
)
|
410 |
logger.info(f"ZIP file created: {zip_filename}")
|
411 |
+
|
412 |
+
return zip_filename if downloaded_files else None, "\n".join(downloaded_links),"\n".join(failed_dois),""
|
413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
|
415 |
+
async def process_bibtex(self, bib_file, cancel_event):
|
416 |
+
"""Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
|
417 |
+
# Read BibTeX file content from the uploaded object
|
418 |
+
try:
|
419 |
+
with open(bib_file.name, 'r', encoding='utf-8') as f:
|
420 |
+
bib_content = f.read()
|
421 |
+
except Exception as e:
|
422 |
+
logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
|
423 |
+
return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", ""
|
424 |
|
425 |
+
# Parse BibTeX data
|
426 |
+
try:
|
427 |
+
bib_database = bibtexparser.loads(bib_content)
|
428 |
+
except Exception as e:
|
429 |
+
logger.error(f"Error parsing BibTeX data: {e}")
|
430 |
+
return None,f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}",""
|
431 |
+
|
432 |
+
# Extract DOIs
|
433 |
+
dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
|
434 |
+
logger.info(f"Found {len(dois)} DOIs to download")
|
435 |
+
|
436 |
+
# Result lists
|
437 |
+
downloaded_files = []
|
438 |
+
failed_dois = []
|
439 |
+
downloaded_links = []
|
440 |
+
|
441 |
+
for i, doi in enumerate(dois):
|
442 |
+
result = await self._download_single_doi(doi, cancel_event) # now its async directly here
|
443 |
+
|
444 |
if cancel_event.is_set():
|
445 |
+
logger.info("Download Cancelled in bibtex mode")
|
446 |
+
return None, "Download Cancelled", "Download Cancelled", ""
|
447 |
+
|
448 |
if result is None:
|
449 |
+
continue
|
450 |
|
451 |
if isinstance(result, Exception):
|
452 |
+
# Excepción inesperada
|
453 |
error_msg = f"Unexpected error: {str(result)}"
|
454 |
logger.error(f"Error downloading {doi}: {error_msg}")
|
455 |
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
456 |
+
|
457 |
+
elif result[0] is None:
|
458 |
+
# Descarga fallida (resultado de download_single_doi_async)
|
459 |
+
error_msg = result[1]
|
460 |
+
logger.warning(f"Failed to download {doi}: {error_msg}")
|
461 |
+
failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
|
462 |
+
|
463 |
+
else:
|
464 |
+
# Descarga exitosa
|
465 |
+
filepath = result[0]
|
466 |
|
467 |
# Unique filename for zip
|
468 |
+
filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
|
469 |
+
filepath_unique = os.path.join(self.output_dir, filename)
|
470 |
+
os.rename(filepath, filepath_unique)
|
471 |
+
downloaded_files.append(filepath_unique)
|
472 |
+
downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
|
473 |
+
|
474 |
+
if downloaded_files:
|
475 |
+
zip_filename = 'papers.zip'
|
476 |
+
loop = asyncio.get_running_loop()
|
477 |
+
loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
|
478 |
+
logger.info(f"ZIP file created: {zip_filename}")
|
479 |
+
|
480 |
+
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois),""
|
481 |
|
482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
483 |
def create_zip(self, zip_filename, files):
|
484 |
"""Crea un archivo zip con los pdfs descargados"""
|
485 |
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
|
|
|
496 |
"""Create Gradio interface for Paper Downloader"""
|
497 |
downloader = PaperDownloader()
|
498 |
|
|
|
499 |
def update_progress( message="", logs=""):
|
500 |
+
return gr.Textbox.update(value=f"{message}"),gr.Textbox.update(value=f"<pre>{logs}</pre>")
|
501 |
|
502 |
|
503 |
async def download_papers(bib_file, doi_input, dois_input):
|
504 |
cancel_event = asyncio.Event() # Create cancellation event for every submission.
|
505 |
downloader.cancel_event = cancel_event # store the event so that it is available to stop the process
|
506 |
+
|
507 |
if bib_file:
|
508 |
# Check file type
|
509 |
+
if not bib_file.name.lower().endswith('.bib'):
|
510 |
+
return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", "", None
|
511 |
+
zip_file, downloaded_dois, failed_dois, logs_text= await downloader.process_bibtex(bib_file, cancel_event)
|
512 |
+
|
513 |
+
return zip_file, downloaded_dois, failed_dois, logs_text, None #all outputs at return.
|
514 |
|
|
|
|
|
515 |
elif doi_input:
|
516 |
+
filepath, message, error = await downloader._download_single_doi(doi_input,cancel_event)
|
517 |
+
return None, message, error, "", filepath
|
518 |
+
|
519 |
elif dois_input:
|
520 |
+
zip_file, downloaded_dois, failed_dois, logs_text= await downloader.download_multiple_dois(dois_input, cancel_event)
|
521 |
+
return zip_file, downloaded_dois, failed_dois, logs_text, None
|
522 |
else:
|
523 |
+
return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs","", None #all output data returned
|
524 |
+
|
525 |
|
526 |
with gr.Blocks(theme="Hev832/Applio", css="""
|
527 |
.gradio-container {
|
|
|
563 |
|
564 |
|
565 |
output_file = gr.File(label="Download Papers (ZIP) or Single PDF")
|
566 |
+
downloaded_dois_textbox = gr.HTML(label="""
|
567 |
Found DOIs
|
568 |
""",)
|
569 |
+
failed_dois_textbox=gr.HTML(label="""
|
570 |
Missed DOIs
|
571 |
""",)
|
572 |
logs = gr.Textbox(label="""
|
|
|
584 |
inputs=[bib_file, doi_input, dois_input],
|
585 |
outputs=[output_file, downloaded_dois_textbox, failed_dois_textbox,logs, single_file ], # the new output should be a tuple and we output logs too for debugging.
|
586 |
)
|
587 |
+
|
588 |
interface.title="🔬 Academic Paper Batch Downloader"
|
589 |
interface.description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment."
|
590 |
+
|
591 |
return interface
|
592 |
|
593 |
def main():
|
594 |
interface = create_gradio_interface()
|
595 |
+
interface.launch(share=True)
|
596 |
|
597 |
if __name__ == "__main__":
|
598 |
main()
|