C2MV committed on
Commit
4cb7c88
verified
1 Parent(s): 8555a57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -74
app.py CHANGED
@@ -265,27 +265,16 @@ class PaperDownloader:
265
  return None
266
 
267
  async def download_with_retry_async(self, doi, max_retries=5, initial_delay=2):
268
- """Downloads a paper using multiple strategies with exponential backoff and async requests"""
269
  pdf_content = None
270
  retries = 0
271
  delay = initial_delay
272
 
273
- # Additional Sci-Hub and alternative sources
274
- additional_sources = [
275
- 'https://sci-hub.ren/',
276
- 'https://sci-hub.se/',
277
- 'https://sci-hub.mksa.top/',
278
- 'https://sci-hub.ru/',
279
- 'https://sci-hub.st/',
280
- 'https://libgen.rs/scimag/'
281
- ]
282
-
283
  async with aiohttp.ClientSession() as session:
284
  while retries < max_retries and not pdf_content:
285
  try:
286
  logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
287
 
288
- # Try primary sources
289
  download_strategies = [
290
  self.download_paper_direct_doi_async,
291
  self.download_paper_scihub_async,
@@ -300,47 +289,20 @@ class PaperDownloader:
300
  logger.info(f"Successfully downloaded {doi} using {strategy.__name__}")
301
  return pdf_content
302
 
303
- # If not found, try additional sources
304
- if not pdf_content and retries > 1:
305
- for source in additional_sources:
306
- try:
307
- scihub_url = f"{source}{self.clean_doi(doi)}"
308
- logger.info(f"Trying alternative source: {scihub_url}")
309
-
310
- text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
311
- if text:
312
- # Extract potential PDF links
313
- pdf_patterns = [
314
- r'(https?://[^\s<>"]+?\.pdf)',
315
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
316
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
317
- ]
318
- pdf_urls = []
319
- for pattern in pdf_patterns:
320
- pdf_urls.extend(re.findall(pattern, text))
321
-
322
- # Try downloading from found URLs
323
- for pdf_url in pdf_urls:
324
- pdf_content = await self.fetch_pdf_content(session, pdf_url)
325
- if pdf_content:
326
- logger.info(f"Found PDF from alternative source: {pdf_url}")
327
- return pdf_content
328
- except Exception as e:
329
- logger.debug(f"Error with alternative source {source}: {e}")
330
 
331
  except Exception as e:
332
  logger.error(f"Unexpected error in download attempt {retries + 1} for DOI {doi}: {e}")
333
-
334
- # Prepare for next retry
335
- if not pdf_content:
336
  retries += 1
337
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
338
  await asyncio.sleep(delay)
339
- delay *= 2 # Exponential backoff
340
-
341
- # Log final failure
342
- logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
343
- return None
344
 
345
  async def download_single_doi_async(self, doi, progress_callback):
346
  """Downloads a single paper using a DOI, and updates the given progress_callback"""
@@ -373,60 +335,88 @@ class PaperDownloader:
373
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
374
 
375
  async def download_multiple_dois_async(self, dois_text, progress_callback):
 
376
  if not dois_text:
377
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
378
 
379
- # Sanitize and filter DOIs
380
- dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
 
 
 
381
  if not dois:
382
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
383
 
384
- downloaded_files = []
385
- failed_dois = []
386
- downloaded_links = []
387
-
388
- # Use asyncio.gather to process all DOIs concurrently
 
389
  download_tasks = []
390
  for doi in dois:
391
  task = self.download_single_doi_async(doi, progress_callback)
392
  download_tasks.append(task)
393
 
394
- # Wait for all downloads to complete
395
  results = await asyncio.gather(*download_tasks, return_exceptions=True)
396
 
 
397
  for i, result in enumerate(results):
398
  doi = dois[i]
399
 
400
- # Handle different result types
401
  if isinstance(result, Exception):
402
- # Unexpected error
403
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Unexpected error: {str(result)}')
 
 
 
404
  elif result[0] is None:
405
- # Download failed
406
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {result[1]}')
 
 
 
407
  else:
408
- # Successful download
409
  filepath = result[0]
410
 
411
- # Create unique filename for zip
412
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
413
  filepath_unique = os.path.join(self.output_dir, filename)
414
 
415
- # Rename and add to downloaded files
416
- os.rename(filepath, filepath_unique)
417
- downloaded_files.append(filepath_unique)
418
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
419
-
420
- # Create zip if any files were downloaded
 
 
 
 
 
 
 
 
421
  if downloaded_files:
422
  zip_filename = 'papers.zip'
423
  loop = asyncio.get_running_loop()
424
- await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename, downloaded_files))
 
 
 
 
 
425
  logger.info(f"ZIP file created: {zip_filename}")
426
-
427
- return (zip_filename if downloaded_files else None,
428
- "\n".join(downloaded_links),
429
- "\n".join(failed_dois))
 
 
 
430
 
431
  async def process_bibtex_async(self, bib_file, progress_callback):
432
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
 
265
  return None
266
 
267
  async def download_with_retry_async(self, doi, max_retries=5, initial_delay=2):
 
268
  pdf_content = None
269
  retries = 0
270
  delay = initial_delay
271
 
 
 
 
 
 
 
 
 
 
 
272
  async with aiohttp.ClientSession() as session:
273
  while retries < max_retries and not pdf_content:
274
  try:
275
  logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
276
 
277
+ # Strategies in order
278
  download_strategies = [
279
  self.download_paper_direct_doi_async,
280
  self.download_paper_scihub_async,
 
289
  logger.info(f"Successfully downloaded {doi} using {strategy.__name__}")
290
  return pdf_content
291
 
292
+ # If no strategy worked, skip this iteration
293
+ retries += 1
294
+ logger.warning(f"No successful strategy for DOI: {doi}. Retry {retries}")
295
+ await asyncio.sleep(delay)
296
+ delay *= 2 # Exponential backoff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
  except Exception as e:
299
  logger.error(f"Unexpected error in download attempt {retries + 1} for DOI {doi}: {e}")
 
 
 
300
  retries += 1
 
301
  await asyncio.sleep(delay)
302
+ delay *= 2
303
+
304
+ logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
305
+ return None
 
306
 
307
  async def download_single_doi_async(self, doi, progress_callback):
308
  """Downloads a single paper using a DOI, and updates the given progress_callback"""
 
335
  return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
336
 
337
  async def download_multiple_dois_async(self, dois_text, progress_callback):
338
+ # Validar entrada
339
  if not dois_text:
340
  return None, "Error: No DOIs provided", "Error: No DOIs provided"
341
 
342
+ # Sanitizar y filtrar DOIs
343
+ # Eliminar líneas vacías, espacios en blanco, y DOIs duplicados
344
+ dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
345
+
346
+ # Validar lista de DOIs
347
  if not dois:
348
  return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
349
 
350
+ # Listas para rastrear resultados
351
+ downloaded_files = [] # Rutas de archivos descargados
352
+ failed_dois = [] # DOIs que no se pudieron descargar
353
+ downloaded_links = [] # Links de DOIs descargados
354
+
355
+ # Generar tareas de descarga concurrentes
356
  download_tasks = []
357
  for doi in dois:
358
  task = self.download_single_doi_async(doi, progress_callback)
359
  download_tasks.append(task)
360
 
361
+ # Ejecutar todas las descargas concurrentemente
362
  results = await asyncio.gather(*download_tasks, return_exceptions=True)
363
 
364
+ # Procesar resultados de cada DOI
365
  for i, result in enumerate(results):
366
  doi = dois[i]
367
 
368
+ # Manejar diferentes tipos de resultados
369
  if isinstance(result, Exception):
370
+ # Excepción inesperada
371
+ error_msg = f"Unexpected error: {str(result)}"
372
+ logger.error(f"Error downloading {doi}: {error_msg}")
373
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
374
+
375
  elif result[0] is None:
376
+ # Descarga fallida (resultado de download_single_doi_async)
377
+ error_msg = result[1]
378
+ logger.warning(f"Failed to download {doi}: {error_msg}")
379
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
380
+
381
  else:
382
+ # Descarga exitosa
383
  filepath = result[0]
384
 
385
+ # Generar nombre de archivo único
386
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
387
  filepath_unique = os.path.join(self.output_dir, filename)
388
 
389
+ try:
390
+ # Renombrar archivo
391
+ os.rename(filepath, filepath_unique)
392
+
393
+ # Añadir a lista de archivos descargados
394
+ downloaded_files.append(filepath_unique)
395
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
396
+
397
+ except Exception as rename_error:
398
+ logger.error(f"Error renaming file for {doi}: {rename_error}")
399
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
400
+
401
+ # Crear archivo ZIP si hay archivos descargados
402
+ zip_filename = None
403
  if downloaded_files:
404
  zip_filename = 'papers.zip'
405
  loop = asyncio.get_running_loop()
406
+
407
+ # Ejecutar creación de ZIP en un executor para no bloquear
408
+ await loop.run_in_executor(
409
+ self.executor,
410
+ lambda: self.create_zip(zip_filename, downloaded_files)
411
+ )
412
  logger.info(f"ZIP file created: {zip_filename}")
413
+
414
+ # Devolver resultados
415
+ return (
416
+ zip_filename if downloaded_files else None, # Archivo ZIP o None
417
+ "\n".join(downloaded_links), # DOIs descargados
418
+ "\n".join(failed_dois) # DOIs fallidos
419
+ )
420
 
421
  async def process_bibtex_async(self, bib_file, progress_callback):
422
  """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""