C2MV committed on
Commit
4b923db
·
verified ·
1 Parent(s): 0e74018

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -31
app.py CHANGED
@@ -57,6 +57,19 @@ class PaperDownloader:
57
  except Exception as e:
58
  logger.debug(f"Error fetching {url}: {e}")
59
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  async def download_paper_direct_doi_async(self, session, doi):
@@ -81,13 +94,10 @@ class PaperDownloader:
81
  pdf_urls.extend(re.findall(pattern, text))
82
 
83
  for pdf_url in pdf_urls:
84
- try:
85
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
- logger.debug(f"Found PDF from: {pdf_url}")
88
- return await pdf_response.read()
89
- except Exception as e:
90
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
 
92
 
93
  except Exception as e:
@@ -121,14 +131,11 @@ class PaperDownloader:
121
 
122
  # Try downloading from found URLs
123
  for pdf_url in pdf_urls:
124
- try:
125
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
- # Verify if it's a PDF
127
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
- logger.debug(f"Found PDF from: {pdf_url}")
129
- return await pdf_response.read()
130
- except Exception as e:
131
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
 
133
  except Exception as e:
134
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
@@ -156,10 +163,10 @@ class PaperDownloader:
156
  if links:
157
  link = links[0]
158
  pdf_url = link['href']
159
- pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
  logger.debug(f"Found PDF from: {pdf_url}")
162
- return await pdf_response.read()
163
  except Exception as e:
164
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
  return None
@@ -185,10 +192,10 @@ class PaperDownloader:
185
 
186
  if links:
187
  pdf_url = links[0]['href']
188
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
- logger.debug(f"Found PDF from: {pdf_url}")
191
- return await pdf_response.read()
192
  except Exception as e:
193
  logger.debug(f"Google Scholar error for {doi}: {e}")
194
 
@@ -214,13 +221,11 @@ class PaperDownloader:
214
  if link.get('content-type') == 'application/pdf':
215
  pdf_url = link.get('URL')
216
  if pdf_url:
217
- try:
218
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
219
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
220
- logger.debug(f"Found PDF from: {pdf_url}")
221
- return await pdf_response.read()
222
- except Exception as e:
223
- logger.debug(f"Error fetching from {pdf_url}")
224
 
225
  except Exception as e:
226
  logger.debug(f"Crossref error for {doi}: {e}")
@@ -315,7 +320,7 @@ class PaperDownloader:
315
  logger.info(f"ZIP file created: {zip_filename}")
316
 
317
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
318
-
319
  async def process_bibtex_async(self, bib_file):
320
  """Process BibTeX file and download papers with multiple strategies"""
321
  # Read BibTeX file content from the uploaded object
@@ -378,6 +383,7 @@ class PaperDownloader:
378
 
379
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
380
 
 
381
  def create_gradio_interface():
382
  """Create Gradio interface for Paper Downloader"""
383
  downloader = PaperDownloader()
@@ -399,7 +405,6 @@ def create_gradio_interface():
399
  else:
400
  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
401
 
402
-
403
  # Gradio Interface
404
  interface = gr.Interface(
405
  fn=download_papers,
 
57
  except Exception as e:
58
  logger.debug(f"Error fetching {url}: {e}")
59
  return None, None
60
+
61
+ async def fetch_pdf_content(self, session, url):
62
+ """Fetch and validate if the content of a request is actually PDF."""
63
+ try:
64
+ async with session.get(url, headers=self.headers, timeout=10) as pdf_response:
65
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
66
+ return await pdf_response.read()
67
+ else:
68
+ logger.debug(f"Content type not PDF for {url}: {pdf_response.headers.get('Content-Type', '')}")
69
+ return None
70
+ except Exception as e:
71
+ logger.debug(f"Error getting PDF {url}: {e}")
72
+ return None
73
 
74
 
75
  async def download_paper_direct_doi_async(self, session, doi):
 
94
  pdf_urls.extend(re.findall(pattern, text))
95
 
96
  for pdf_url in pdf_urls:
97
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
98
+ if pdf_content:
99
+ logger.debug(f"Found PDF from: {pdf_url}")
100
+ return pdf_content
 
 
 
101
 
102
 
103
  except Exception as e:
 
131
 
132
  # Try downloading from found URLs
133
  for pdf_url in pdf_urls:
134
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
135
+ if pdf_content:
136
+ logger.debug(f"Found PDF from: {pdf_url}")
137
+ return pdf_content
138
+
 
 
 
139
 
140
  except Exception as e:
141
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
 
163
  if links:
164
  link = links[0]
165
  pdf_url = link['href']
166
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
167
+ if pdf_content:
168
  logger.debug(f"Found PDF from: {pdf_url}")
169
+ return pdf_content
170
  except Exception as e:
171
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
172
  return None
 
192
 
193
  if links:
194
  pdf_url = links[0]['href']
195
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
196
+ if pdf_content:
197
+ logger.debug(f"Found PDF from: {pdf_url}")
198
+ return pdf_content
199
  except Exception as e:
200
  logger.debug(f"Google Scholar error for {doi}: {e}")
201
 
 
221
  if link.get('content-type') == 'application/pdf':
222
  pdf_url = link.get('URL')
223
  if pdf_url:
224
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
225
+ if pdf_content:
226
+ logger.debug(f"Found PDF from: {pdf_url}")
227
+ return pdf_content
228
+
 
 
229
 
230
  except Exception as e:
231
  logger.debug(f"Crossref error for {doi}: {e}")
 
320
  logger.info(f"ZIP file created: {zip_filename}")
321
 
322
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
323
+
324
  async def process_bibtex_async(self, bib_file):
325
  """Process BibTeX file and download papers with multiple strategies"""
326
  # Read BibTeX file content from the uploaded object
 
383
 
384
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
385
 
386
+
387
  def create_gradio_interface():
388
  """Create Gradio interface for Paper Downloader"""
389
  downloader = PaperDownloader()
 
405
  else:
406
  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
407
 
 
408
  # Gradio Interface
409
  interface = gr.Interface(
410
  fn=download_papers,