C2MV committed on
Commit
b0f1670
·
verified ·
1 Parent(s): 7550ff7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -5
app.py CHANGED
@@ -264,15 +264,26 @@ class PaperDownloader:
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
 
267
- async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
268
  """Downloads a paper using multiple strategies with exponential backoff and async requests"""
269
  pdf_content = None
270
  retries = 0
271
  delay = initial_delay
272
-
 
 
 
 
 
 
 
 
273
  async with aiohttp.ClientSession() as session:
274
  while retries < max_retries and not pdf_content:
275
  try:
 
 
 
276
  pdf_content = (
277
  await self.download_paper_direct_doi_async(session, doi) or
278
  await self.download_paper_scihub_async(session, doi) or
@@ -280,17 +291,47 @@ class PaperDownloader:
280
  await self.download_paper_google_scholar_async(session, doi) or
281
  await self.download_paper_crossref_async(session, doi)
282
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  if pdf_content:
284
  return pdf_content
 
285
  except Exception as e:
286
  logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
287
-
288
  if not pdf_content:
289
  retries += 1
290
  logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
291
  await asyncio.sleep(delay)
292
  delay *= 2 # Exponential backoff
293
-
 
 
294
  return None
295
 
296
  async def download_single_doi_async(self, doi, progress_callback):
@@ -515,7 +556,7 @@ def create_gradio_interface():
515
 
516
  def main():
517
  interface = create_gradio_interface()
518
- interface.launch(share=True)
519
 
520
  if __name__ == "__main__":
521
  main()
 
264
  logger.debug(f"Crossref error for {doi}: {e}")
265
  return None
266
 
267
+ async def download_with_retry_async(self, doi, max_retries=5, initial_delay=2):
268
  """Downloads a paper using multiple strategies with exponential backoff and async requests"""
269
  pdf_content = None
270
  retries = 0
271
  delay = initial_delay
272
+
273
+ # Additional download sources
274
+ additional_sources = [
275
+ f"https://sci-hub.ren/{doi}",
276
+ f"https://sci-hub.se/{doi}",
277
+ f"https://sci-hub.mksa.top/{doi}",
278
+ f"https://sci-hub.ru/{doi}"
279
+ ]
280
+
281
  async with aiohttp.ClientSession() as session:
282
  while retries < max_retries and not pdf_content:
283
  try:
284
+ logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
285
+
286
+ # Try primary sources first
287
  pdf_content = (
288
  await self.download_paper_direct_doi_async(session, doi) or
289
  await self.download_paper_scihub_async(session, doi) or
 
291
  await self.download_paper_google_scholar_async(session, doi) or
292
  await self.download_paper_crossref_async(session, doi)
293
  )
294
+
295
+ # If not found, try additional Sci-Hub sources
296
+ if not pdf_content and retries > 1:
297
+ for source in additional_sources:
298
+ try:
299
+ custom_scihub = f"{source}{self.clean_doi(doi)}"
300
+ logger.info(f"Trying custom source: {custom_scihub}")
301
+ async with session.get(custom_scihub, headers=self.headers, timeout=15) as response:
302
+ if response.status == 200:
303
+ text = await response.text()
304
+ pdf_patterns = [
305
+ r'(https?://[^\s<>"]+?\.pdf)',
306
+ r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
307
+ r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
308
+ ]
309
+ pdf_urls = []
310
+ for pattern in pdf_patterns:
311
+ pdf_urls.extend(re.findall(pattern, text))
312
+
313
+ for pdf_url in pdf_urls:
314
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
315
+ if pdf_content:
316
+ logger.info(f"Found PDF from custom source: {pdf_url}")
317
+ break
318
+ except Exception as e:
319
+ logger.debug(f"Error with custom source {source}: {e}")
320
+
321
  if pdf_content:
322
  return pdf_content
323
+
324
  except Exception as e:
325
  logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
326
+
327
  if not pdf_content:
328
  retries += 1
329
  logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
330
  await asyncio.sleep(delay)
331
  delay *= 2 # Exponential backoff
332
+
333
+ # Log detailed failure information
334
+ logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
335
  return None
336
 
337
  async def download_single_doi_async(self, doi, progress_callback):
 
556
 
557
  def main():
558
  interface = create_gradio_interface()
559
+ interface.launch()
560
 
561
  if __name__ == "__main__":
562
  main()