C2MV committed
Commit 8555a57 · verified · 1 Parent(s): b0f1670

Update app.py

Files changed (1)
  1. app.py +85 -62
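
In summary, the patch swaps the old chained-`or` download attempt for an explicit strategy list, widens the pool of fallback Sci-Hub/LibGen mirrors, and replaces the sequential per-DOI loop with concurrent downloads via asyncio.gather(..., return_exceptions=True), while keeping exponential backoff between retries. A minimal, self-contained sketch of that retry-plus-gather pattern follows; the helper names (fetch_with_retry, download_all) and the example URLs are illustrative only and do not come from app.py.

import asyncio
import aiohttp


async def fetch_with_retry(session, url, max_retries=3, initial_delay=1.0):
    """Fetch one URL, retrying with exponential backoff on any failure."""
    delay = initial_delay
    for attempt in range(1, max_retries + 1):
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
                response.raise_for_status()
                return await response.read()
        except Exception:
            if attempt == max_retries:
                raise  # surfaces to asyncio.gather() as this task's result
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff, as in the patch


async def download_all(urls):
    """Download all URLs concurrently; one failure does not cancel the rest."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_with_retry(session, url) for url in urls]
        # return_exceptions=True stores either the payload or the raised
        # exception in the corresponding slot of results.
        results = await asyncio.gather(*tasks, return_exceptions=True)
    succeeded, failed = [], []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            failed.append((url, result))
        else:
            succeeded.append((url, result))
    return succeeded, failed


# Example usage (hypothetical URLs):
# ok, bad = asyncio.run(download_all(["https://example.org/a.pdf", "https://example.org/b.pdf"]))
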
app.py CHANGED
@@ -270,12 +270,14 @@ class PaperDownloader:
         retries = 0
         delay = initial_delay
 
-        # Additional download sources
+        # Additional Sci-Hub and alternative sources
         additional_sources = [
-            f"https://sci-hub.ren/{doi}",
-            f"https://sci-hub.se/{doi}",
-            f"https://sci-hub.mksa.top/{doi}",
-            f"https://sci-hub.ru/{doi}"
+            'https://sci-hub.ren/',
+            'https://sci-hub.se/',
+            'https://sci-hub.mksa.top/',
+            'https://sci-hub.ru/',
+            'https://sci-hub.st/',
+            'https://libgen.rs/scimag/'
         ]
 
         async with aiohttp.ClientSession() as session:
@@ -283,54 +285,60 @@ class PaperDownloader:
                 try:
                     logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
 
-                    # Try primary sources first
-                    pdf_content = (
-                        await self.download_paper_direct_doi_async(session, doi) or
-                        await self.download_paper_scihub_async(session, doi) or
-                        await self.download_paper_libgen_async(session, doi) or
-                        await self.download_paper_google_scholar_async(session, doi) or
-                        await self.download_paper_crossref_async(session, doi)
-                    )
-
-                    # If not found, try additional Sci-Hub sources
+                    # Try primary sources
+                    download_strategies = [
+                        self.download_paper_direct_doi_async,
+                        self.download_paper_scihub_async,
+                        self.download_paper_libgen_async,
+                        self.download_paper_google_scholar_async,
+                        self.download_paper_crossref_async
+                    ]
+
+                    for strategy in download_strategies:
+                        pdf_content = await strategy(session, doi)
+                        if pdf_content:
+                            logger.info(f"Successfully downloaded {doi} using {strategy.__name__}")
+                            return pdf_content
+
+                    # If not found, try additional sources
                     if not pdf_content and retries > 1:
                         for source in additional_sources:
                             try:
-                                custom_scihub = f"{source}{self.clean_doi(doi)}"
-                                logger.info(f"Trying custom source: {custom_scihub}")
-                                async with session.get(custom_scihub, headers=self.headers, timeout=15) as response:
-                                    if response.status == 200:
-                                        text = await response.text()
-                                        pdf_patterns = [
-                                            r'(https?://[^\s<>"]+?\.pdf)',
-                                            r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                                            r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                                        ]
-                                        pdf_urls = []
-                                        for pattern in pdf_patterns:
-                                            pdf_urls.extend(re.findall(pattern, text))
-
-                                        for pdf_url in pdf_urls:
-                                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                                            if pdf_content:
-                                                logger.info(f"Found PDF from custom source: {pdf_url}")
-                                                break
+                                scihub_url = f"{source}{self.clean_doi(doi)}"
+                                logger.info(f"Trying alternative source: {scihub_url}")
+
+                                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                                if text:
+                                    # Extract potential PDF links
+                                    pdf_patterns = [
+                                        r'(https?://[^\s<>"]+?\.pdf)',
+                                        r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                                        r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                                    ]
+                                    pdf_urls = []
+                                    for pattern in pdf_patterns:
+                                        pdf_urls.extend(re.findall(pattern, text))
+
+                                    # Try downloading from found URLs
+                                    for pdf_url in pdf_urls:
+                                        pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                                        if pdf_content:
+                                            logger.info(f"Found PDF from alternative source: {pdf_url}")
+                                            return pdf_content
                             except Exception as e:
-                                logger.debug(f"Error with custom source {source}: {e}")
+                                logger.debug(f"Error with alternative source {source}: {e}")
 
-                    if pdf_content:
-                        return pdf_content
-
                 except Exception as e:
-                    logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+                    logger.error(f"Unexpected error in download attempt {retries + 1} for DOI {doi}: {e}")
 
+                # Prepare for next retry
                 if not pdf_content:
                     retries += 1
                     logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                     await asyncio.sleep(delay)
                     delay *= 2  # Exponential backoff
 
-        # Log detailed failure information
+        # Log final failure
         logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
         return None
 
@@ -368,6 +376,7 @@ class PaperDownloader:
         if not dois_text:
             return None, "Error: No DOIs provided", "Error: No DOIs provided"
 
+        # Sanitize and filter DOIs
        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
         if not dois:
             return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
@@ -376,34 +385,48 @@ class PaperDownloader:
         failed_dois = []
         downloaded_links = []
 
-        for i, doi in enumerate(dois):
-            try:
-                filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
-                if filepath:
-                    # Unique filename for zip
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                    filepath_unique = os.path.join(self.output_dir, filename)
-                    os.rename(filepath, filepath_unique)
-                    downloaded_files.append(filepath_unique)
-                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                else:
-                    failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {fail_message}')
-            except Exception as e:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Unexpected error: {str(e)}')
-                continue  # Continue to next DOI even if this one fails
+        # Use asyncio.gather to process all DOIs concurrently
+        download_tasks = []
+        for doi in dois:
+            task = self.download_single_doi_async(doi, progress_callback)
+            download_tasks.append(task)
+
+        # Wait for all downloads to complete
+        results = await asyncio.gather(*download_tasks, return_exceptions=True)
 
+        for i, result in enumerate(results):
+            doi = dois[i]
+
+            # Handle different result types
+            if isinstance(result, Exception):
+                # Unexpected error
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Unexpected error: {str(result)}')
+            elif result[0] is None:
+                # Download failed
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {result[1]}')
+            else:
+                # Successful download
+                filepath = result[0]
+
+                # Create unique filename for zip
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+
+                # Rename and add to downloaded files
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+        # Create zip if any files were downloaded
         if downloaded_files:
             zip_filename = 'papers.zip'
             loop = asyncio.get_running_loop()
-            await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
+            await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename, downloaded_files))
             logger.info(f"ZIP file created: {zip_filename}")
 
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
-    def create_zip(self, zip_filename, downloaded_files):
-        with zipfile.ZipFile(zip_filename, 'w') as zipf:
-            for file_path in downloaded_files:
-                zipf.write(file_path, arcname=os.path.basename(file_path))
+        return (zip_filename if downloaded_files else None,
+                "\n".join(downloaded_links),
+                "\n".join(failed_dois))
 
     async def process_bibtex_async(self, bib_file, progress_callback):
         """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""