C2MV committed on
Commit
cffe4a4
·
verified ·
1 Parent(s): 4cb7c88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -57
app.py CHANGED
@@ -264,75 +264,82 @@ class PaperDownloader:
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
 
267
- async def download_with_retry_async(self, doi, max_retries=5, initial_delay=2):
268
- pdf_content = None
269
- retries = 0
270
- delay = initial_delay
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  async with aiohttp.ClientSession() as session:
273
- while retries < max_retries and not pdf_content:
274
- try:
275
- logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
276
-
277
- # Strategies in order
278
- download_strategies = [
279
- self.download_paper_direct_doi_async,
280
- self.download_paper_scihub_async,
281
- self.download_paper_libgen_async,
282
- self.download_paper_google_scholar_async,
283
- self.download_paper_crossref_async
284
- ]
285
-
286
- for strategy in download_strategies:
287
  pdf_content = await strategy(session, doi)
288
  if pdf_content:
289
- logger.info(f"Successfully downloaded {doi} using {strategy.__name__}")
290
  return pdf_content
291
-
292
- # If no strategy worked, skip this iteration
293
- retries += 1
294
- logger.warning(f"No successful strategy for DOI: {doi}. Retry {retries}")
295
- await asyncio.sleep(delay)
296
- delay *= 2 # Exponential backoff
297
 
298
- except Exception as e:
299
- logger.error(f"Unexpected error in download attempt {retries + 1} for DOI {doi}: {e}")
300
- retries += 1
301
- await asyncio.sleep(delay)
302
- delay *= 2
303
-
304
- logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
305
  return None
306
 
307
  async def download_single_doi_async(self, doi, progress_callback):
308
- """Downloads a single paper using a DOI, and updates the given progress_callback"""
309
- if not doi:
310
- return None, "Error: DOI not provided", "Error: DOI not provided"
311
 
312
- try:
313
- pdf_content = await self.download_with_retry_async(doi)
314
 
315
- if pdf_content:
316
- if doi is None:
317
- return None, "Error: DOI not provided", "Error: DOI not provided"
318
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
319
- filepath = os.path.join(self.output_dir, filename)
320
-
321
- loop = asyncio.get_running_loop()
322
- await loop.run_in_executor(self.executor, lambda: open(filepath, 'wb').write(pdf_content))
323
-
324
- logger.info(f"Successfully downloaded: {filename}")
325
- progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
326
- return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
327
- else:
328
- logger.warning(f"Could not download: {doi}")
329
- progress_callback(f"Could not download: <a href='https://doi.org/{doi}'>{doi}</a>")
330
- return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
331
 
332
- except Exception as e:
333
- logger.error(f"Error processing {doi}: {e}")
334
- progress_callback(f"Error processing {doi}: <a href='https://doi.org/{doi}'>{doi}</a> {e}")
335
- return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
  async def download_multiple_dois_async(self, dois_text, progress_callback):
338
  # Validar entrada
 
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
 
267
+ async def download_with_retry_async(self, doi, max_retries=3):
268
+ """
269
+ Intenta descargar un paper con múltiples estrategias y un número limitado de reintentos.
270
+
271
+ Args:
272
+ doi (str): DOI del paper a descargar
273
+ max_retries (int): Número máximo de reintentos
274
+
275
+ Returns:
276
+ bytes or None: Contenido del PDF o None si no se puede descargar
277
+ """
278
+ if not doi:
279
+ logger.warning("DOI no proporcionado")
280
+ return None
281
+
282
+ # Estrategias de descarga en orden de preferencia
283
+ download_strategies = [
284
+ self.download_paper_direct_doi_async,
285
+ self.download_paper_scihub_async,
286
+ self.download_paper_libgen_async,
287
+ self.download_paper_google_scholar_async,
288
+ self.download_paper_crossref_async
289
+ ]
290
 
291
  async with aiohttp.ClientSession() as session:
292
+ for retry in range(max_retries):
293
+ logger.info(f"Intento de descarga {retry + 1} para DOI: {doi}")
294
+
295
+ # Probar cada estrategia de descarga
296
+ for strategy in download_strategies:
297
+ try:
 
 
 
 
 
 
 
 
298
  pdf_content = await strategy(session, doi)
299
  if pdf_content:
300
+ logger.info(f"Descarga exitosa de {doi} usando {strategy.__name__}")
301
  return pdf_content
302
+ except Exception as e:
303
+ logger.debug(f"Error en estrategia {strategy.__name__} para {doi}: {e}")
304
+
305
+ # Si ninguna estrategia funcionó, esperar un poco antes de reintentar
306
+ await asyncio.sleep(1) # Pequeña pausa entre reintentos
 
307
 
308
+ # Si se agotan todos los reintentos
309
+ logger.warning(f"FALLO FINAL: No se pudo descargar DOI {doi} después de {max_retries} intentos")
 
 
 
 
 
310
  return None
311
 
312
  async def download_single_doi_async(self, doi, progress_callback):
313
+ """Descargar un único DOI con retroalimentación de progreso"""
314
+ if not doi:
315
+ return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado"
316
 
317
+ try:
318
+ pdf_content = await self.download_with_retry_async(doi)
319
 
320
+ if pdf_content:
321
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
322
+ filepath = os.path.join(self.output_dir, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
+ # Escribir contenido del PDF
325
+ loop = asyncio.get_running_loop()
326
+ await loop.run_in_executor(
327
+ self.executor,
328
+ lambda: open(filepath, 'wb').write(pdf_content)
329
+ )
330
+
331
+ logger.info(f"Descarga exitosa: {filename}")
332
+ progress_callback(f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>")
333
+ return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
334
+ else:
335
+ logger.warning(f"No se pudo descargar: {doi}")
336
+ progress_callback(f"No se pudo descargar: <a href='https://doi.org/{doi}'>{doi}</a>")
337
+ return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
338
+
339
+ except Exception as e:
340
+ logger.error(f"Error procesando {doi}: {e}")
341
+ progress_callback(f"Error procesando {doi}: <a href='https://doi.org/{doi}'>{doi}</a> {e}")
342
+ return None, f"Error procesando {doi}: {e}", f"Error procesando {doi}: {e}"
343
 
344
  async def download_multiple_dois_async(self, dois_text, progress_callback):
345
  # Validar entrada