Update app.py

app.py CHANGED
@@ -12,7 +12,6 @@ from bs4 import BeautifulSoup
import io
import asyncio
import aiohttp
-from playwright.async_api import async_playwright

# Configure logging
logging.basicConfig(level=logging.INFO,
@@ -42,51 +41,7 @@ class PaperDownloader:
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }
-
-        self.playwright_browser = None
-        self.playwright_lock = asyncio.Lock()  # Added lock
-
-
-    async def initialize_playwright(self):
-        """Initialize the playwright browser instance to be used by the tool."""
-        async with self.playwright_lock:
-            if not self.playwright_browser:
-                try:
-                    playwright = await async_playwright().start()
-                    self.playwright_browser = await playwright.chromium.launch()
-                except Exception as e:
-                    logger.error(f"Error initializing Playwright browser: {e}")
-
-    async def close_playwright(self):
-        """Closes the playwright browser, must be called at the end of the execution."""
-        async with self.playwright_lock:
-            if self.playwright_browser:
-                try:
-                    await self.playwright_browser.close()
-                    self.playwright_browser = None
-                except Exception as e:
-                    logger.error(f"Error closing Playwright browser: {e}")
-
-    async def get_html_with_playwright(self, doi_url):
-        """Utility function to fetch content with playwright with try-catch."""
-        if not self.playwright_browser:
-            await self.initialize_playwright()
-
-        if not self.playwright_browser:
-            logger.error(f"Playwright browser is not initialized for url: {doi_url}")
-            return None
-        page = None
-        try:
-            page = await self.playwright_browser.new_page()
-            await page.goto(doi_url, timeout=30000)
-            return await page.content()
-        except Exception as e:
-            logger.debug(f"Error navigating or getting content for url: {doi_url}: {e}")
-            return None
-        finally:
-            if page:
-                await page.close()
-
+
    def clean_doi(self, doi):
        """Clean and encode DOI for URL"""
        if not isinstance(doi, str):
@@ -102,43 +57,43 @@ class PaperDownloader:
        except Exception as e:
            logger.debug(f"Error fetching {url}: {e}")
            return None, None
-
+

    async def download_paper_direct_doi_async(self, session, doi):
-
-
+        """Attempt to download the pdf from the landing page of the doi"""
+        if not doi:
            return None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            for pdf_url in pdf_urls:
+                try:
+                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return await pdf_response.read()
+                except Exception as e:
+                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+
+        return None

    async def download_paper_scihub_async(self, session, doi):
        """Improved method to download paper from Sci-Hub using async requests"""
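
Note: the pdf_patterns added above are plain regular expressions run over the landing-page HTML. A minimal sketch for checking them offline; the sample HTML snippet is invented for illustration:

import re

pdf_patterns = [
    r'(https?://[^\s<>"]+?\.pdf)',
    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
]

sample = '<a href="https://publisher.example/article/10.1234/demo.pdf">Full text</a>'
found = [m for p in pdf_patterns for m in re.findall(p, sample)]
print(found)  # ['https://publisher.example/article/10.1234/demo.pdf']

The lazy quantifiers stop each match at the first ".pdf" instead of swallowing the rest of the attribute value.
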
@@ -259,10 +214,13 @@ class PaperDownloader:
                    if link.get('content-type') == 'application/pdf':
                        pdf_url = link.get('URL')
                        if pdf_url:
-
-
-
-
+                            try:
+                                pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                    logger.debug(f"Found PDF from: {pdf_url}")
+                                    return await pdf_response.read()
+                            except Exception as e:
+                                logger.debug(f"Error fetching from {pdf_url}")

        except Exception as e:
            logger.debug(f"Crossref error for {doi}: {e}")
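
Note: await session.get(...) hands back a live aiohttp ClientResponse whose connection is only released once the body is read or the response is closed. A standalone sketch of the same Content-Type gate using the context-manager form, which releases the connection on every path (fetch_pdf is a hypothetical helper, not part of app.py):

from typing import Optional
import aiohttp

async def fetch_pdf(session: aiohttp.ClientSession, pdf_url: str, headers: dict) -> Optional[bytes]:
    # async-with releases the connection even when we bail out
    # before reading the body.
    async with session.get(pdf_url, headers=headers, timeout=10) as resp:
        if 'application/pdf' in resp.headers.get('Content-Type', ''):
            return await resp.read()
    return None
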
@@ -279,12 +237,13 @@ class PaperDownloader:
            while retries < max_retries and not pdf_content:
                try:
                    pdf_content = (
-
+                        await self.download_paper_direct_doi_async(session, doi) or
                        await self.download_paper_scihub_async(session, doi) or
                        await self.download_paper_libgen_async(session, doi) or
                        await self.download_paper_google_scholar_async(session, doi) or
                        await self.download_paper_crossref_async(session, doi)
-
+
+                    )
                    if pdf_content:
                        return pdf_content
                except Exception as e:
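
Note: the or-chain works as an ordered fallback because each source helper returns None on failure, and the surrounding loop retries the whole chain with a growing delay. A self-contained sketch of the pattern with stand-in sources (none of these names are from app.py):

import asyncio
import random

# Stand-in for a downloader that sometimes fails.
async def flaky_source(tag: str):
    return tag.encode() if random.random() < 0.3 else None

async def download_with_retry(max_retries: int = 3, initial_delay: float = 2.0):
    delay = initial_delay
    for attempt in range(max_retries):
        # Each source returns None on failure, so `or` falls through in order.
        content = (await flaky_source("a") or
                   await flaky_source("b") or
                   await flaky_source("c"))
        if content:
            return content
        await asyncio.sleep(delay)
        delay *= 2  # exponential backoff between rounds
    return None

print(asyncio.run(download_with_retry(initial_delay=0.1)))
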
@@ -298,178 +257,14 @@ class PaperDownloader:

        return None

-
-
-        if not doi:
-            logger.warning("DOI not provided")
-            return None
-
-        for base_url in self.download_sources:
-            try:
-                scihub_url = f"{base_url}{self.clean_doi(doi)}"
-
-                # Request with more tolerance
-                response = requests.get(scihub_url,
-                                        headers=self.headers,
-                                        allow_redirects=True,
-                                        timeout=15)
-
-                # Search for multiple PDF URL patterns
-                pdf_patterns = [
-                    r'(https?://[^\s<>"]+?\.pdf)',
-                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                ]
-
-                pdf_urls = []
-                for pattern in pdf_patterns:
-                    pdf_urls.extend(re.findall(pattern, response.text))
-
-                # Try downloading from found URLs
-                for pdf_url in pdf_urls:
-                    try:
-                        pdf_response = requests.get(pdf_url,
-                                                    headers=self.headers,
-                                                    timeout=10)
-
-                        # Verify if it's a PDF
-                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                            logger.debug(f"Found PDF from: {pdf_url}")
-                            return pdf_response.content
-                    except Exception as e:
-                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-
-            except Exception as e:
-                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-
-        return None
-
-    def download_paper_libgen(self, doi):
-        """Download from Libgen, handles the query and the redirection"""
-        if not doi:
-            return None
-
-        base_url = 'https://libgen.rs/scimag/'
-        try:
-            search_url = f"{base_url}?q={self.clean_doi(doi)}"
-            response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
-            response.raise_for_status()
-
-            if "No results" in response.text:
-                logger.debug(f"No results for DOI: {doi} on libgen")
-                return None
-
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Find the link using a specific selector
-            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
-
-            if links:
-                link = links[0]
-                pdf_url = link['href']
-                pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
-                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_response.content
-
-        except Exception as e:
-            logger.debug(f"Error trying to download {doi} from libgen: {e}")
-        return None
-
-    def download_paper_google_scholar(self, doi):
-        """Search google scholar to find an article with the given doi, try to get the pdf"""
-        if not doi:
-            return None
-
-        try:
-            query = f'doi:"{doi}"'
-            params = {'q': query}
-            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
-
-            response = requests.get(url, headers=self.headers, timeout=10)
-            response.raise_for_status()
-
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Find any links with [PDF]
-            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
-
-            if links:
-                pdf_url = links[0]['href']
-                pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
-                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_response.content
-        except Exception as e:
-            logger.debug(f"Google Scholar error for {doi}: {e}")
-
-        return None
-
-    def download_paper_crossref(self, doi):
-        """Alternative search method using Crossref"""
-        if not doi:
-            return None
-
-        try:
-            # Search for open access link
-            url = f"https://api.crossref.org/works/{doi}"
-            response = requests.get(url, headers=self.headers, timeout=10)
-
-            if response.status_code == 200:
-                data = response.json()
-                work = data.get('message', {})
-
-                # Search for open access links
-                links = work.get('link', [])
-                for link in links:
-                    if link.get('content-type') == 'application/pdf':
-                        pdf_url = link.get('URL')
-                        if pdf_url:
-                            pdf_response = requests.get(pdf_url, headers=self.headers)
-                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                                logger.debug(f"Found PDF from: {pdf_url}")
-                                return pdf_response.content
-
-        except Exception as e:
-            logger.debug(f"Crossref error for {doi}: {e}")
-
-        return None
-
-    def download_with_retry(self, doi, max_retries=3, initial_delay=2):
-        """Downloads a paper using multiple strategies with exponential backoff"""
-        pdf_content = None
-        retries = 0
-        delay = initial_delay
-
-        while retries < max_retries and not pdf_content:
-            try:
-                pdf_content = (
-                    self.download_paper_scihub(doi) or
-                    self.download_paper_libgen(doi) or
-                    self.download_paper_google_scholar(doi) or
-                    self.download_paper_crossref(doi)
-                )
-
-                if pdf_content:
-                    return pdf_content
-            except Exception as e:
-                logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
-
-            if not pdf_content:
-                retries += 1
-                logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
-                time.sleep(delay)
-                delay *= 2  # Exponential backoff
-
-        return None
-
-    def download_single_doi(self, doi):
+
+    async def download_single_doi_async(self, doi):
        """Downloads a single paper using a DOI"""
        if not doi:
            return None, "Error: DOI not provided", "Error: DOI not provided"

        try:
-            pdf_content = self.
+            pdf_content = await self.download_with_retry_async(doi)

            if pdf_content:
                if doi is None:
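
Note: with the requests-based variants gone, every download path is a coroutine, so callers outside an event loop need asyncio.run. A usage sketch, assuming PaperDownloader and download_single_doi_async as defined in app.py after this change:

import asyncio

from app import PaperDownloader  # app.py, the module shown in this diff

async def main():
    downloader = PaperDownloader()
    # The DOI is just an example input.
    filepath, ok_msg, fail_msg = await downloader.download_single_doi_async("10.1000/demo.doi")
    print(filepath or fail_msg)

asyncio.run(main())
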
@@ -488,7 +283,8 @@ class PaperDownloader:
            logger.error(f"Error processing {doi}: {e}")
            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"

-
+
+    async def download_multiple_dois_async(self, dois_text):
        """Downloads multiple papers from a list of DOIs"""
        if not dois_text:
            return None, "Error: No DOIs provided", "Error: No DOIs provided"
@@ -501,9 +297,9 @@ class PaperDownloader:
        failed_dois = []
        downloaded_links = []
        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-            filepath, success_message, fail_message = self.
+            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
            if filepath:
-
+                # Unique filename for zip
                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                filepath_unique = os.path.join(self.output_dir, filename)
                os.rename(filepath, filepath_unique)
@@ -522,67 +318,6 @@ class PaperDownloader:

        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)

-    def process_bibtex(self, bib_file):
-        """Process BibTeX file and download papers with multiple strategies"""
-        # Read BibTeX file content from the uploaded object
-        try:
-            with open(bib_file.name, 'r', encoding='utf-8') as f:
-                bib_content = f.read()
-        except Exception as e:
-            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
-
-        # Parse BibTeX data
-        try:
-            bib_database = bibtexparser.loads(bib_content)
-        except Exception as e:
-            logger.error(f"Error parsing BibTeX data: {e}")
-            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
-
-        # Extract DOIs
-        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-        logger.info(f"Found {len(dois)} DOIs to download")
-
-        # Result lists
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
-
-        # Download PDFs
-        for doi in tqdm(dois, desc="Downloading papers"):
-            try:
-                # Try to download with multiple methods with retries
-                pdf_content = self.download_with_retry(doi)
-
-                # Save PDF
-                if pdf_content:
-                    if doi is None:
-                        return None, "Error: DOI not provided", "Error: DOI not provided", None
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                    filepath = os.path.join(self.output_dir, filename)
-
-                    with open(filepath, 'wb') as f:
-                        f.write(pdf_content)
-
-                    downloaded_files.append(filepath)
-                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                    logger.info(f"Successfully downloaded: {filename}")
-                else:
-                    failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
-            except Exception as e:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                logger.error(f"Error processing {doi}: {e}")
-
-        # Create ZIP of downloaded papers
-        if downloaded_files:
-            zip_filename = 'papers.zip'
-            with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                for file_path in downloaded_files:
-                    zipf.write(file_path, arcname=os.path.basename(file_path))
-            logger.info(f"ZIP file created: {zip_filename}")
-
-        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None

    async def process_bibtex_async(self, bib_file):
        """Process BibTeX file and download papers with multiple strategies"""
@@ -611,7 +346,7 @@ class PaperDownloader:
        downloaded_links = []

        # Download PDFs
-        for doi in tqdm(dois, desc="Downloading papers"):
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
            try:
                # Try to download with multiple methods with retries
                pdf_content = await self.download_with_retry_async(doi)
@@ -620,7 +355,7 @@ class PaperDownloader:
                if pdf_content:
                    if doi is None:
                        return None, "Error: DOI not provided", "Error: DOI not provided", None
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                    filepath = os.path.join(self.output_dir, filename)

                    with open(filepath, 'wb') as f:
@@ -643,11 +378,10 @@ class PaperDownloader:
            for file_path in downloaded_files:
                zipf.write(file_path, arcname=os.path.basename(file_path))
            logger.info(f"ZIP file created: {zip_filename}")
-
-        await self.close_playwright()

        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None

+
def create_gradio_interface():
    """Create Gradio interface for Paper Downloader"""
    downloader = PaperDownloader()
@@ -658,13 +392,13 @@ def create_gradio_interface():
            if not bib_file.name.lower().endswith('.bib'):
                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None

-            zip_path, downloaded_dois, failed_dois,
+            zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
            return zip_path, downloaded_dois, failed_dois, None
        elif doi_input:
-            filepath, message, failed_doi = downloader.
+            filepath, message, failed_doi = await downloader.download_single_doi_async(doi_input)
            return None, message, failed_doi, filepath
        elif dois_input:
-            zip_path, downloaded_dois, failed_dois = downloader.
+            zip_path, downloaded_dois, failed_dois = await downloader.download_multiple_dois_async(dois_input)
            return zip_path, downloaded_dois, failed_dois, None
        else:
            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
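
Note: the branches above can await the downloader because Gradio accepts coroutine functions as event handlers. A minimal wiring sketch; the component layout is illustrative, not the app's exact UI:

import gradio as gr

# Gradio runs async callbacks on its own event loop, so the handler
# can await PaperDownloader's coroutines directly.
async def download_papers(bib_file, doi_input, dois_input):
    return None, "", "", None  # placeholder for the branching shown above

demo = gr.Interface(
    fn=download_papers,
    inputs=[gr.File(label="BibTeX file"), gr.Textbox(label="DOI"), gr.Textbox(label="DOIs, one per line")],
    outputs=[gr.File(label="Papers (ZIP)"), gr.HTML(label="Downloaded"), gr.HTML(label="Failed"), gr.File(label="Single PDF")],
)

if __name__ == "__main__":
    demo.launch()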