Update app.py
app.py CHANGED
@@ -59,47 +59,47 @@ class PaperDownloader:
         return None, None
 
     async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
+        """Fetch content and validate if response is PDF, following up to max_redirects redirections with retries."""
+
+        current_url = url
+        redirect_count = 0
+        retry_count = 0
+
+        while redirect_count <= max_redirects:
+            try:
+                while retry_count <= max_retries:
+                    try:
+                        async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
+
+                            if response.status in [301, 302, 307, 308]:
+                                current_url = response.headers['Location']
+                                redirect_count += 1
+                                logger.debug(f"Following redirect from {url} to {current_url}")
+                                break # Break out of the retry loop for a redirect
+
+                            response.raise_for_status()
+
+                            if 'application/pdf' in response.headers.get('Content-Type', ''):
+                                return await response.read()
+                            else:
+                                logger.debug(
+                                    f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
+                                return None
+                    except Exception as e:
+                        logger.debug(
+                            f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
+                        retry_count += 1
+                        await asyncio.sleep(retry_delay)
+
+                retry_count = 0 # Reset the retry count, in case there's a next redirect attempt
+
+            except Exception as e:
+                logger.debug(f"Error getting PDF from {current_url}: {e}")
+                return None
+
+        logger.debug(f"Too many redirects or retries {url}, not following this link further")
+        return None
 
     async def download_paper_direct_doi_async(self, session, doi):
         """Attempt to download the pdf from the landing page of the doi"""
         if not doi:
@@ -107,18 +107,18 @@ class PaperDownloader:
 
         try:
             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
             # First, let's try to download the URL directly in case it is already the pdf.
             pdf_content = await self.fetch_pdf_content(session, doi_url)
             if pdf_content:
                 logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
                 return pdf_content
+
             # If direct DOI link was not a pdf, fetch landing page and extract links
             text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
             if not text:
                 return None
+
             pdf_patterns = [
                 r'(https?://[^\s<>"]+?\.pdf)',
                 r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
@@ -128,72 +128,72 @@ class PaperDownloader:
             pdf_urls = []
             for pattern in pdf_patterns:
                 pdf_urls.extend(re.findall(pattern, text))
+
             # Attempt each pdf url and break when you find a PDF content.
             for pdf_url in pdf_urls:
                 pdf_content = await self.fetch_pdf_content(session, pdf_url)
                 if pdf_content:
                     logger.debug(f"Found PDF from: {pdf_url}")
                     return pdf_content
+
         except Exception as e:
             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
         return None
+
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""
         if not doi:
             logger.warning("DOI not provided")
             return None
+
         for base_url in self.download_sources:
             try:
                 scihub_url = f"{base_url}{self.clean_doi(doi)}"
                 text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
                 if not text:
                     continue
+
                 # Search for multiple PDF URL patterns
                 pdf_patterns = [
                     r'(https?://[^\s<>"]+?\.pdf)',
                     r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
                     r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
                 ]
+
                 pdf_urls = []
                 for pattern in pdf_patterns:
                     pdf_urls.extend(re.findall(pattern, text))
+
                 # Try downloading from found URLs, but iterate over ALL
                 for pdf_url in pdf_urls:
                     pdf_content = await self.fetch_pdf_content(session,pdf_url)
                     if pdf_content:
                         logger.debug(f"Found PDF from: {pdf_url}")
                         return pdf_content
+
             except Exception as e:
                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
         return None
+
     async def download_paper_libgen_async(self, session, doi):
         """Download from Libgen, handles the query and the redirection"""
         if not doi:
             return None
+
         base_url = 'https://libgen.rs/scimag/'
         try:
             search_url = f"{base_url}?q={self.clean_doi(doi)}"
             text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
+
             if not text or "No results" in text:
                 logger.debug(f"No results for DOI: {doi} on libgen")
                 return None
+
             soup = BeautifulSoup(text, 'html.parser')
+
             links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+
             if links:
                 link = links[0]
                 pdf_url = link['href']
@@ -235,7 +235,7 @@ class PaperDownloader:
             logger.debug(f"Google Scholar error for {doi}: {e}")
 
         return None
+
     async def download_paper_crossref_async(self, session, doi):
         """Alternative search method using Crossref"""
         if not doi:
@@ -263,7 +263,7 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Crossref error for {doi}: {e}")
         return None
+
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
@@ -279,7 +279,6 @@ class PaperDownloader:
                     await self.download_paper_libgen_async(session, doi) or
                     await self.download_paper_google_scholar_async(session, doi) or
                     await self.download_paper_crossref_async(session, doi)
-
                 )
                 if pdf_content:
                     return pdf_content
@@ -291,136 +290,126 @@ class PaperDownloader:
                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                 await asyncio.sleep(delay)
                 delay *= 2 # Exponential backoff
-
         return None
 
     async def download_single_doi_async(self, doi):
+        """Downloads a single paper using a DOI"""
+        if not doi:
+            return None, "Error: DOI not provided", "Error: DOI not provided"
+
+        try:
+            pdf_content = await self.download_with_retry_async(doi)
+
+            if pdf_content:
+                if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath = os.path.join(self.output_dir, filename)
+
+                # Write file asynchronously
+                loop = asyncio.get_running_loop()
+                await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
+
+
+                logger.info(f"Successfully downloaded: {filename}")
+                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
+            else:
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
+
+        except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
+
     async def download_multiple_dois_async(self, dois_text):
+        """Downloads multiple papers from a list of DOIs"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
+            if filepath:
                 # Unique filename for zip
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+            else:
                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-        if downloaded_files:
-            zip_filename = 'papers.zip'
-            # Zip asynchronously
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
-            logger.info(f"ZIP file created: {zip_filename}")
-
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
 
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
     def create_zip(self, zip_filename, downloaded_files):
+        with zipfile.ZipFile(zip_filename, 'w') as zipf:
+            for file_path in downloaded_files:
+                zipf.write(file_path, arcname=os.path.basename(file_path))
 
     async def process_bibtex_async(self, bib_file):
-        # Parse BibTeX data
-        try:
-            bib_database = bibtexparser.loads(bib_content)
-        except Exception as e:
-            logger.error(f"Error parsing BibTeX data: {e}")
-            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
-
-        # Extract DOIs
-        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-        logger.info(f"Found {len(dois)} DOIs to download")
-
-        # Result lists
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
+        """Process BibTeX file and download papers with multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
 
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
 
-                    return None, "Error: DOI not provided", "Error: DOI not provided"
-                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                filepath = os.path.join(self.output_dir, filename)
-
-                #Write the file asynchronously so it doesn't block the ui.
-                loop = asyncio.get_running_loop()
-                await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
-
-                downloaded_files.append(filepath)
-                downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                logger.info(f"Successfully downloaded: {filename}")
-            else:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
 
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
 
+        # Use asyncio.gather to run all downloads concurrently and show propert progress
+        tasks = [self.download_single_doi_async(doi) for doi in dois]
+        results = await asyncio.gather(*tasks)
+
+        for i, (filepath, success_message, fail_message) in enumerate(results):
+            if filepath:
+                # Unique filename for zip
+                filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+            else:
+                failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+
+
+        if downloaded_files:
             zip_filename = 'papers.zip'
-            # Zip asynchronously so the main loop is not blocked.
             loop = asyncio.get_running_loop()
             await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
             logger.info(f"ZIP file created: {zip_filename}")
 
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
@@ -431,7 +420,7 @@ def create_gradio_interface():
             # Check file type
             if not bib_file.name.lower().endswith('.bib'):
                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
             zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file)
             return zip_path, downloaded_dois, failed_dois, None
         elif doi_input:
@@ -498,7 +487,6 @@ def create_gradio_interface():
         """,
         cache_examples=False,
     )
-
     # Add Javascript to update HTML
     interface.load = """
     function(downloaded_dois, failed_dois){