FreeBibTec2

Sleeping

App Files Community

C2MV committed on Dec 14, 2024

Commit

64cb1ff

verified ·

1 Parent(s): 7b9d802

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -52

app.py CHANGED Viewed

@@ -14,7 +14,6 @@ import asyncio
 import aiohttp
 from playwright.async_api import async_playwright
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s: %(message)s')
@@ -43,7 +42,51 @@ class PaperDownloader:
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
         }
     def clean_doi(self, doi):
         """Clean and encode DOI for URL"""
         if not isinstance(doi, str):
@@ -59,55 +102,43 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
     async def download_paper_direct_doi_async(self, session, doi):
-        """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
-        if not doi:
-           return None
-        try:
-             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-             # Use Playwright to render JavaScript content
-             async with async_playwright() as p:
-                browser = await p.chromium.launch() # You can use different browsers
-                page = await browser.new_page()
-                try:
-                     await page.goto(doi_url, timeout=30000)
-                     html_content = await page.content()
-                except Exception as e:
-                      logger.debug(f"Error trying to navigate {doi}: {e}")
-                      await browser.close()
-                      return None
-                soup = BeautifulSoup(html_content, 'html.parser')
-                await browser.close()
-             pdf_patterns = [
               r'(https?://[^\s<>"]+?\.pdf)',
               r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
               r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-             ]
-             pdf_urls = []
-             for pattern in pdf_patterns:
-               pdf_urls.extend(re.findall(pattern, html_content))
-             for pdf_url in pdf_urls:
-                 try:
-                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                       logger.debug(f"Found PDF from: {pdf_url}")
-                       return await pdf_response.read()
-                 except Exception as e:
-                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-        except Exception as e:
-           logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-        return None
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""
@@ -248,13 +279,12 @@ class PaperDownloader:
             while retries < max_retries and not pdf_content:
                 try:
                     pdf_content = (
-                        await self.download_paper_direct_doi_async(session, doi) or
                         await self.download_paper_scihub_async(session, doi) or
                         await self.download_paper_libgen_async(session, doi) or
                         await self.download_paper_google_scholar_async(session, doi) or
                         await self.download_paper_crossref_async(session, doi)
-                    )
                     if pdf_content:
                         return pdf_content
                 except Exception as e:
@@ -418,8 +448,7 @@ class PaperDownloader:
                      self.download_paper_libgen(doi) or
                      self.download_paper_google_scholar(doi) or
                      self.download_paper_crossref(doi)
-                  )
                  if pdf_content:
                      return pdf_content
@@ -614,6 +643,8 @@ class PaperDownloader:
                 for file_path in downloaded_files:
                     zipf.write(file_path, arcname=os.path.basename(file_path))
             logger.info(f"ZIP file created: {zip_filename}")
         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None

 import aiohttp
 from playwright.async_api import async_playwright
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s: %(message)s')
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
         }
+        self.playwright_browser = None
+        self.playwright_lock = asyncio.Lock() # Added lock
+    async def initialize_playwright(self):
+      """Initialize the playwright browser instance to be used by the tool."""
+      async with self.playwright_lock:
+          if not self.playwright_browser:
+              try:
+                playwright = await async_playwright().start()
+                self.playwright_browser = await playwright.chromium.launch()
+              except Exception as e:
+                logger.error(f"Error initializing Playwright browser: {e}")
+    async def close_playwright(self):
+      """Closes the playwright browser, must be called at the end of the execution."""
+      async with self.playwright_lock:
+        if self.playwright_browser:
+            try:
+                  await self.playwright_browser.close()
+                  self.playwright_browser = None
+            except Exception as e:
+                  logger.error(f"Error closing Playwright browser: {e}")
+    async def get_html_with_playwright(self, doi_url):
+      """Utility function to fetch content with playwright with try-catch."""
+      if not self.playwright_browser:
+           await self.initialize_playwright()
+      if not self.playwright_browser:
+            logger.error(f"Playwright browser is not initialized for url: {doi_url}")
+            return None
+      page = None
+      try:
+        page = await self.playwright_browser.new_page()
+        await page.goto(doi_url, timeout=30000)
+        return await page.content()
+      except Exception as e:
+           logger.debug(f"Error navigating or getting content for url: {doi_url}: {e}")
+           return None
+      finally:
+        if page:
+          await page.close()
     def clean_doi(self, doi):
         """Clean and encode DOI for URL"""
         if not isinstance(doi, str):
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
     async def download_paper_direct_doi_async(self, session, doi):
+       """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
+       if not doi:
+         return None
+       try:
+          doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+          html_content = await self.get_html_with_playwright(doi_url)
+          if not html_content:
+             return None
+          pdf_patterns = [
               r'(https?://[^\s<>"]+?\.pdf)',
               r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
               r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+          ]
+          pdf_urls = []
+          for pattern in pdf_patterns:
+            pdf_urls.extend(re.findall(pattern, html_content))
+          for pdf_url in pdf_urls:
+              try:
+                  pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                  if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                     logger.debug(f"Found PDF from: {pdf_url}")
+                     return await pdf_response.read()
+              except Exception as e:
+                   logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+       except Exception as e:
+          logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+       return None
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""
             while retries < max_retries and not pdf_content:
                 try:
                     pdf_content = (
+                       await self.download_paper_direct_doi_async(session, doi) or
                         await self.download_paper_scihub_async(session, doi) or
                         await self.download_paper_libgen_async(session, doi) or
                         await self.download_paper_google_scholar_async(session, doi) or
                         await self.download_paper_crossref_async(session, doi)
+                     )
                     if pdf_content:
                         return pdf_content
                 except Exception as e:
                      self.download_paper_libgen(doi) or
                      self.download_paper_google_scholar(doi) or
                      self.download_paper_crossref(doi)
+                   )
                  if pdf_content:
                      return pdf_content
                 for file_path in downloaded_files:
                     zipf.write(file_path, arcname=os.path.basename(file_path))
             logger.info(f"ZIP file created: {zip_filename}")
+        await self.close_playwright()
         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None