C2MV committed
Commit 8555a57 · verified · 1 Parent(s): b0f1670

Update app.py

Files changed (1)
  1. app.py +85 -62
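
In summary, the patch swaps the old chained-`or` download attempt for an explicit strategy list, widens the pool of fallback Sci-Hub/LibGen mirrors, and replaces the sequential per-DOI loop with concurrent downloads via asyncio.gather(..., return_exceptions=True), while keeping exponential backoff between retries. A minimal, self-contained sketch of that retry-plus-gather pattern follows; the helper names (fetch_with_retry, download_all) and the example URLs are illustrative only and do not come from app.py.

import asyncio
import aiohttp


async def fetch_with_retry(session, url, max_retries=3, initial_delay=1.0):
    """Fetch one URL, retrying with exponential backoff on any failure."""
    delay = initial_delay
    for attempt in range(1, max_retries + 1):
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
                response.raise_for_status()
                return await response.read()
        except Exception:
            if attempt == max_retries:
                raise  # surfaces to asyncio.gather() as this task's result
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff, as in the patch


async def download_all(urls):
    """Download all URLs concurrently; one failure does not cancel the rest."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_with_retry(session, url) for url in urls]
        # return_exceptions=True stores either the payload or the raised
        # exception in the corresponding slot of results.
        results = await asyncio.gather(*tasks, return_exceptions=True)
    succeeded, failed = [], []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            failed.append((url, result))
        else:
            succeeded.append((url, result))
    return succeeded, failed


# Example usage (hypothetical URLs):
# ok, bad = asyncio.run(download_all(["https://example.org/a.pdf", "https://example.org/b.pdf"]))
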
app.py CHANGED
@@ -270,12 +270,14 @@ class PaperDownloader:
         retries = 0
         delay = initial_delay
 
-        # Additional download sources
+        # Additional Sci-Hub and alternative sources
         additional_sources = [
-            f"https://sci-hub.ren/{doi}",
-            f"https://sci-hub.se/{doi}",
-            f"https://sci-hub.mksa.top/{doi}",
-            f"https://sci-hub.ru/{doi}"
+            'https://sci-hub.ren/',
+            'https://sci-hub.se/',
+            'https://sci-hub.mksa.top/',
+            'https://sci-hub.ru/',
+            'https://sci-hub.st/',
+            'https://libgen.rs/scimag/'
         ]
 
         async with aiohttp.ClientSession() as session:
@@ -283,54 +285,60 @@ class PaperDownloader:
                 try:
                     logger.info(f"Attempt {retries + 1} to download DOI: {doi}")
 
-                    # Try primary sources first
-                    pdf_content = (
-                        await self.download_paper_direct_doi_async(session, doi) or
-                        await self.download_paper_scihub_async(session, doi) or
-                        await self.download_paper_libgen_async(session, doi) or
-                        await self.download_paper_google_scholar_async(session, doi) or
-                        await self.download_paper_crossref_async(session, doi)
-                    )
-
-                    # If not found, try additional Sci-Hub sources
+                    # Try primary sources
+                    download_strategies = [
+                        self.download_paper_direct_doi_async,
+                        self.download_paper_scihub_async,
+                        self.download_paper_libgen_async,
+                        self.download_paper_google_scholar_async,
+                        self.download_paper_crossref_async
+                    ]
+
+                    for strategy in download_strategies:
+                        pdf_content = await strategy(session, doi)
+                        if pdf_content:
+                            logger.info(f"Successfully downloaded {doi} using {strategy.__name__}")
+                            return pdf_content
+
+                    # If not found, try additional sources
                     if not pdf_content and retries > 1:
                         for source in additional_sources:
                             try:
-                                custom_scihub = f"{source}{self.clean_doi(doi)}"
-                                logger.info(f"Trying custom source: {custom_scihub}")
-                                async with session.get(custom_scihub, headers=self.headers, timeout=15) as response:
-                                    if response.status == 200:
-                                        text = await response.text()
-                                        pdf_patterns = [
-                                            r'(https?://[^\s<>"]+?\.pdf)',
-                                            r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                                            r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                                        ]
-                                        pdf_urls = []
-                                        for pattern in pdf_patterns:
-                                            pdf_urls.extend(re.findall(pattern, text))
-
-                                        for pdf_url in pdf_urls:
-                                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                                            if pdf_content:
-                                                logger.info(f"Found PDF from custom source: {pdf_url}")
-                                                break
+                                scihub_url = f"{source}{self.clean_doi(doi)}"
+                                logger.info(f"Trying alternative source: {scihub_url}")
+
+                                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                                if text:
+                                    # Extract potential PDF links
+                                    pdf_patterns = [
+                                        r'(https?://[^\s<>"]+?\.pdf)',
+                                        r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                                        r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                                    ]
+                                    pdf_urls = []
+                                    for pattern in pdf_patterns:
+                                        pdf_urls.extend(re.findall(pattern, text))
+
+                                    # Try downloading from found URLs
+                                    for pdf_url in pdf_urls:
+                                        pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                                        if pdf_content:
+                                            logger.info(f"Found PDF from alternative source: {pdf_url}")
+                                            return pdf_content
                             except Exception as e:
-                                logger.debug(f"Error with custom source {source}: {e}")
+                                logger.debug(f"Error with alternative source {source}: {e}")
 
-                    if pdf_content:
-                        return pdf_content
-
                 except Exception as e:
-                    logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+                    logger.error(f"Unexpected error in download attempt {retries + 1} for DOI {doi}: {e}")
 
+                # Prepare for next retry
                 if not pdf_content:
                     retries += 1
                     logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                     await asyncio.sleep(delay)
                     delay *= 2  # Exponential backoff
 
-        # Log detailed failure information
+        # Log final failure
         logger.warning(f"FINAL FAILURE: Could not download DOI {doi} after {max_retries} attempts")
         return None
 
@@ -368,6 +376,7 @@ class PaperDownloader:
         if not dois_text:
             return None, "Error: No DOIs provided", "Error: No DOIs provided"
 
+        # Sanitize and filter DOIs
        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
         if not dois:
             return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
@@ -376,34 +385,48 @@ class PaperDownloader:
         failed_dois = []
         downloaded_links = []
 
-        for i, doi in enumerate(dois):
-            try:
-                filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
-                if filepath:
-                    # Unique filename for zip
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                    filepath_unique = os.path.join(self.output_dir, filename)
-                    os.rename(filepath, filepath_unique)
-                    downloaded_files.append(filepath_unique)
-                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                else:
-                    failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {fail_message}')
-            except Exception as e:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Unexpected error: {str(e)}')
-                continue  # Continue to next DOI even if this one fails
+        # Use asyncio.gather to process all DOIs concurrently
+        download_tasks = []
+        for doi in dois:
+            task = self.download_single_doi_async(doi, progress_callback)
+            download_tasks.append(task)
+
+        # Wait for all downloads to complete
+        results = await asyncio.gather(*download_tasks, return_exceptions=True)
 
+        for i, result in enumerate(results):
+            doi = dois[i]
+
+            # Handle different result types
+            if isinstance(result, Exception):
+                # Unexpected error
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Unexpected error: {str(result)}')
+            elif result[0] is None:
+                # Download failed
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {result[1]}')
+            else:
+                # Successful download
+                filepath = result[0]
+
+                # Create unique filename for zip
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+
+                # Rename and add to downloaded files
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+        # Create zip if any files were downloaded
         if downloaded_files:
             zip_filename = 'papers.zip'
             loop = asyncio.get_running_loop()
-            await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename,downloaded_files))
+            await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename, downloaded_files))
             logger.info(f"ZIP file created: {zip_filename}")
 
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
-    def create_zip(self, zip_filename, downloaded_files):
-        with zipfile.ZipFile(zip_filename, 'w') as zipf:
-            for file_path in downloaded_files:
-                zipf.write(file_path, arcname=os.path.basename(file_path))
+        return (zip_filename if downloaded_files else None,
+                "\n".join(downloaded_links),
+                "\n".join(failed_dois))
 
     async def process_bibtex_async(self, bib_file, progress_callback):
         """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""