Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -264,75 +264,82 @@ class PaperDownloader:
|
|
264 |
logger.debug(f"Crossref error for {doi}: {e}")
|
265 |
return None
|
266 |
|
267 |
-
async def download_with_retry_async(self, doi, max_retries=
|
268 |
-
|
269 |
-
|
270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
|
272 |
async with aiohttp.ClientSession() as session:
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
self.download_paper_direct_doi_async,
|
280 |
-
self.download_paper_scihub_async,
|
281 |
-
self.download_paper_libgen_async,
|
282 |
-
self.download_paper_google_scholar_async,
|
283 |
-
self.download_paper_crossref_async
|
284 |
-
]
|
285 |
-
|
286 |
-
for strategy in download_strategies:
|
287 |
pdf_content = await strategy(session, doi)
|
288 |
if pdf_content:
|
289 |
-
logger.info(f"
|
290 |
return pdf_content
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
delay *= 2 # Exponential backoff
|
297 |
|
298 |
-
|
299 |
-
|
300 |
-
retries += 1
|
301 |
-
await asyncio.sleep(delay)
|
302 |
-
delay *= 2
|
303 |
-
|
304 |
-
logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
|
305 |
return None
|
306 |
|
307 |
async def download_single_doi_async(self, doi, progress_callback):
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
|
312 |
-
|
313 |
-
|
314 |
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
|
319 |
-
filepath = os.path.join(self.output_dir, filename)
|
320 |
-
|
321 |
-
loop = asyncio.get_running_loop()
|
322 |
-
await loop.run_in_executor(self.executor, lambda: open(filepath, 'wb').write(pdf_content))
|
323 |
-
|
324 |
-
logger.info(f"Successfully downloaded: {filename}")
|
325 |
-
progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
|
326 |
-
return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
|
327 |
-
else:
|
328 |
-
logger.warning(f"Could not download: {doi}")
|
329 |
-
progress_callback(f"Could not download: <a href='https://doi.org/{doi}'>{doi}</a>")
|
330 |
-
return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
|
331 |
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
|
337 |
async def download_multiple_dois_async(self, dois_text, progress_callback):
|
338 |
# Validar entrada
|
|
|
264 |
logger.debug(f"Crossref error for {doi}: {e}")
|
265 |
return None
|
266 |
|
267 |
+
async def download_with_retry_async(self, doi, max_retries=3):
    """
    Try to download a paper using several strategies and a bounded number of attempts.

    Every attempt walks the download strategies in order of preference and
    returns as soon as one of them yields a truthy result. A strategy that
    raises is logged at debug level and treated as a miss, so one failing
    source never aborts the whole attempt.

    Args:
        doi (str): DOI of the paper to download.
        max_retries (int): Maximum number of passes over the strategy list.

    Returns:
        bytes or None: Whatever the first successful strategy returned
        (expected to be the PDF content), or None when no DOI was given or
        every attempt failed.
    """
    if not doi:
        logger.warning("DOI no proporcionado")
        return None

    # Download strategies in order of preference.
    download_strategies = [
        self.download_paper_direct_doi_async,
        self.download_paper_scihub_async,
        self.download_paper_libgen_async,
        self.download_paper_google_scholar_async,
        self.download_paper_crossref_async,
    ]

    async with aiohttp.ClientSession() as session:
        for retry in range(max_retries):
            logger.info(f"Intento de descarga {retry + 1} para DOI: {doi}")

            # Try each download strategy in turn.
            for strategy in download_strategies:
                try:
                    pdf_content = await strategy(session, doi)
                except Exception as e:
                    logger.debug(f"Error en estrategia {strategy.__name__} para {doi}: {e}")
                    continue

                if pdf_content:
                    logger.info(f"Descarga exitosa de {doi} usando {strategy.__name__}")
                    return pdf_content

            # No strategy worked: pause briefly, but only when another attempt
            # will actually follow. Sleeping after the last attempt would just
            # delay the failure report.
            if retry < max_retries - 1:
                await asyncio.sleep(1)

    # All attempts exhausted.
    logger.warning(f"FALLO FINAL: No se pudo descargar DOI {doi} después de {max_retries} intentos")
    return None
|
311 |
|
312 |
async def download_single_doi_async(self, doi, progress_callback):
    """
    Download a single DOI, save it to disk and report progress through a callback.

    Args:
        doi (str): DOI of the paper to download.
        progress_callback (callable): Called with one HTML status message once
            the download has succeeded, failed, or raised.

    Returns:
        tuple: A 3-tuple whose shape depends on the outcome:
            - success: (filepath, HTML link to the DOI, "")
            - not downloadable: (None, plain failure message, HTML link to the DOI)
            - empty DOI or unexpected error: (None, error message, error message)
    """
    if not doi:
        return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado"

    def _write_pdf(path, content):
        # The context manager guarantees the handle is flushed and closed,
        # instead of leaving that to garbage collection.
        with open(path, 'wb') as pdf_file:
            pdf_file.write(content)

    try:
        pdf_content = await self.download_with_retry_async(doi)

        if pdf_content:
            # Slashes and dots in the DOI are replaced to build the file name.
            filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
            filepath = os.path.join(self.output_dir, filename)

            # Write the PDF content through the executor so the blocking file
            # write does not run on the event loop.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(self.executor, _write_pdf, filepath, pdf_content)

            logger.info(f"Descarga exitosa: {filename}")
            progress_callback(f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>")
            return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
        else:
            logger.warning(f"No se pudo descargar: {doi}")
            progress_callback(f"No se pudo descargar: <a href='https://doi.org/{doi}'>{doi}</a>")
            return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'

    except Exception as e:
        # Boundary handler: any failure for this DOI is reported to the caller
        # as an error tuple rather than raised.
        logger.error(f"Error procesando {doi}: {e}")
        progress_callback(f"Error procesando {doi}: <a href='https://doi.org/{doi}'>{doi}</a> {e}")
        return None, f"Error procesando {doi}: {e}", f"Error procesando {doi}: {e}"
|
343 |
|
344 |
async def download_multiple_dois_async(self, dois_text, progress_callback):
|
345 |
# Validar entrada
|