FreeBibTec2

Sleeping

App Files Files Community

C2MV committed on Dec 14, 2024

Commit

ba3a95f

verified ·

1 Parent(s): 6f7150a

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -38

app.py CHANGED Viewed

@@ -59,37 +59,46 @@ class PaperDownloader:
                 return None, None
-    async def fetch_pdf_content(self, session, url, max_redirects=5):
-            """Fetch content and validate if response is PDF, following up to max_redirects redirections."""
             current_url = url
             redirect_count = 0
             while redirect_count <= max_redirects:
                 try:
-                    async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
-                         if response.status in [301,302, 307,308]:
-                             current_url = response.headers['Location']
-                             redirect_count+=1
-                             logger.debug(f"Following redirect from {url} to {current_url}")
-                             continue
-                         response.raise_for_status()
-                         if 'application/pdf' in response.headers.get('Content-Type', ''):
-                            return await response.read()
-                         else:
-                           logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
-                           return None
                 except Exception as e:
-                  logger.debug(f"Error getting PDF from {current_url}: {e}")
-                  return None
-            logger.debug(f"Too many redirects {url}, not following this link further")
             return None
     async def download_paper_direct_doi_async(self, session, doi):
             """Attempt to download the pdf from the landing page of the doi"""
             if not doi:
@@ -97,6 +106,14 @@ class PaperDownloader:
             try:
                  doi_url = f"https://doi.org/{self.clean_doi(doi)}"
                  text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
                  if not text:
                     return None
@@ -111,11 +128,12 @@ class PaperDownloader:
                  for pattern in pdf_patterns:
                     pdf_urls.extend(re.findall(pattern, text))
                  for pdf_url in pdf_urls:
-                     pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                     if pdf_content:
-                        logger.debug(f"Found PDF from: {pdf_url}")
-                        return pdf_content
             except Exception as e:
                 logger.debug(f"Error trying to get the PDF from {doi}: {e}")
@@ -145,18 +163,18 @@ class PaperDownloader:
                     for pattern in pdf_patterns:
                         pdf_urls.extend(re.findall(pattern, text))
-                    # Try downloading from found URLs
                     for pdf_url in pdf_urls:
                          pdf_content = await self.fetch_pdf_content(session,pdf_url)
                          if pdf_content:
                             logger.debug(f"Found PDF from: {pdf_url}")
                             return pdf_content
                 except Exception as e:
                     logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
             return None
     async def download_paper_libgen_async(self, session, doi):
         """Download from Libgen, handles the query and the redirection"""
         if not doi:
@@ -185,7 +203,7 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
     async def download_paper_google_scholar_async(self, session, doi):
         """Search google scholar to find an article with the given doi, try to get the pdf"""
         if not doi:
@@ -216,7 +234,7 @@ class PaperDownloader:
             logger.debug(f"Google Scholar error for {doi}: {e}")
         return None
     async def download_paper_crossref_async(self, session, doi):
             """Alternative search method using Crossref"""
             if not doi:
@@ -244,7 +262,7 @@ class PaperDownloader:
             except Exception as e:
                 logger.debug(f"Crossref error for {doi}: {e}")
             return None
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
@@ -299,7 +317,7 @@ class PaperDownloader:
             except Exception as e:
                 logger.error(f"Error processing {doi}: {e}")
                 return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
     async def download_multiple_dois_async(self, dois_text):
             """Downloads multiple papers from a list of DOIs"""
             if not dois_text:
@@ -395,6 +413,7 @@ class PaperDownloader:
             return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
@@ -415,8 +434,6 @@ def create_gradio_interface():
             return zip_path, downloaded_dois, failed_dois, None
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
@@ -492,11 +509,9 @@ def create_gradio_interface():
     """
     return interface
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)
 if __name__ == "__main__":
     main()

                 return None, None
+    async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
+            """Fetch ``url`` and return its body only if the response is a PDF.
+
+            Redirects are followed manually (requests use ``allow_redirects=False``),
+            at most ``max_redirects`` times. Each hop is attempted up to
+            ``max_retries + 1`` times, sleeping ``retry_delay`` seconds between
+            failed attempts.
+
+            Returns the result of ``response.read()`` when the Content-Type
+            contains ``application/pdf``. Returns ``None`` when the content is not
+            a PDF, when every attempt for a hop fails, when a redirect carries no
+            ``Location`` header, or when the redirect limit is exceeded.
+            """
+            # Function-scope import so this method does not rely on a file-level import.
+            from urllib.parse import urljoin
+
             current_url = url
+            # One request per hop: the initial URL plus at most max_redirects redirects.
+            for _ in range(max_redirects + 1):
+                next_url = None
+                for attempt in range(max_retries + 1):
+                    try:
+                        async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
+                            # 303 is included: for a GET request it is followed with another GET.
+                            if response.status in (301, 302, 303, 307, 308):
+                                location = response.headers.get('Location')
+                                if not location:
+                                    # Nothing to follow; retrying cannot help.
+                                    logger.debug(f"Redirect without Location header from {current_url}")
+                                    return None
+                                # Location may be relative; resolve it against the current URL.
+                                next_url = urljoin(current_url, location)
+                                break  # Leave the retry loop and follow the redirect
+                            response.raise_for_status()
+                            content_type = response.headers.get('Content-Type', '')
+                            if 'application/pdf' in content_type:
+                                return await response.read()
+                            logger.debug(f"Content type not PDF for {current_url}: {content_type}")
+                            return None
+                    except Exception as e:
+                        logger.debug(f"Error getting PDF, retrying ({attempt}/{max_retries}) from {current_url}: {e}")
+                        # Do not sleep after the final attempt; there is nothing left to wait for.
+                        if attempt < max_retries:
+                            await asyncio.sleep(retry_delay)
+                if next_url is None:
+                    # Every attempt for this hop failed. Stop here: looping again
+                    # would retry the same URL forever, since no redirect was taken.
+                    logger.debug(f"Giving up on {current_url} after {max_retries + 1} failed attempts")
+                    return None
+                current_url = next_url
+                logger.debug(f"Following redirect from {url} to {current_url}")
+            logger.debug(f"Too many redirects or retries {url}, not following this link further")
             return None
     async def download_paper_direct_doi_async(self, session, doi):
             """Attempt to download the pdf from the landing page of the doi"""
             if not doi:
             try:
                  doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+                 # First, let's try to download the URL directly in case it is already the pdf.
+                 pdf_content = await self.fetch_pdf_content(session, doi_url)
+                 if pdf_content:
+                   logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
+                   return pdf_content
+                 # If direct DOI link was not a pdf, fetch landing page and extract links
                  text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
                  if not text:
                     return None
                  for pattern in pdf_patterns:
                     pdf_urls.extend(re.findall(pattern, text))
+                 # Try each candidate PDF URL and return as soon as one yields PDF content.
                  for pdf_url in pdf_urls:
+                    pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                    if pdf_content:
+                      logger.debug(f"Found PDF from: {pdf_url}")
+                      return pdf_content
             except Exception as e:
                 logger.debug(f"Error trying to get the PDF from {doi}: {e}")
                     for pattern in pdf_patterns:
                         pdf_urls.extend(re.findall(pattern, text))
+                    # Try every found URL in turn; return the first one that yields a PDF
                     for pdf_url in pdf_urls:
                          pdf_content = await self.fetch_pdf_content(session,pdf_url)
                          if pdf_content:
                             logger.debug(f"Found PDF from: {pdf_url}")
                             return pdf_content
                 except Exception as e:
                     logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
             return None
     async def download_paper_libgen_async(self, session, doi):
         """Download from Libgen, handles the query and the redirection"""
         if not doi:
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
     async def download_paper_google_scholar_async(self, session, doi):
         """Search google scholar to find an article with the given doi, try to get the pdf"""
         if not doi:
             logger.debug(f"Google Scholar error for {doi}: {e}")
         return None
     async def download_paper_crossref_async(self, session, doi):
             """Alternative search method using Crossref"""
             if not doi:
             except Exception as e:
                 logger.debug(f"Crossref error for {doi}: {e}")
             return None
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
             except Exception as e:
                 logger.error(f"Error processing {doi}: {e}")
                 return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
     async def download_multiple_dois_async(self, dois_text):
             """Downloads multiple papers from a list of DOIs"""
             if not dois_text:
             return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
             return zip_path, downloaded_dois, failed_dois, None
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
     """
     return interface
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)
 if __name__ == "__main__":
     main()