Update app.py

app.py (CHANGED)
@@ -99,171 +99,170 @@ class PaperDownloader:
 
             logger.debug(f"Too many redirects or retries {url}, not following this link further")
             return None
+
     async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the pdf from the landing page of the doi"""
         if not doi:
             return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            # First, try to download the URL directly in case it is already the pdf.
+            pdf_content = await self.fetch_pdf_content(session, doi_url)
+            if pdf_content:
+                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
+                return pdf_content
+
+            # If the direct DOI link was not a pdf, fetch the landing page and extract links
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            # Attempt each pdf url and stop at the first one that returns PDF content.
+            for pdf_url in pdf_urls:
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None
+
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download paper from Sci-Hub using async requests"""
         if not doi:
+            logger.warning("DOI not provided")
             return None
 
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
+
+                # Try downloading from the found URLs, iterating over ALL of them
+                for pdf_url in pdf_urls:
+                    pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                    if pdf_content:
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return pdf_content
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
         return None
 
+    async def download_paper_libgen_async(self, session, doi):
+        """Download from Libgen, handles the query and the redirection"""
         if not doi:
             return None
 
+        base_url = 'https://libgen.rs/scimag/'
         try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
 
+            if not text or "No results" in text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
                 return None
 
             soup = BeautifulSoup(text, 'html.parser')
 
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
 
             if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
                 if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
         except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
 
+    async def download_paper_google_scholar_async(self, session, doi):
+        """Search google scholar to find an article with the given doi, try to get the pdf"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            text, headers = await self.fetch_with_headers(session, url, timeout=10)
+            if not text:
                 return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None
 
+    async def download_paper_crossref_async(self, session, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
             return None
+
+        try:
+            # Search for an open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                            if pdf_content:
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return pdf_content
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+        return None
+
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
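Taken together, the five helpers above form a fallback chain: direct DOI resolution, then the Sci-Hub mirrors, Libgen, Google Scholar, and finally the Crossref API. A minimal sketch of how a caller could drive that chain, assuming an aiohttp.ClientSession (the response.status / await response.json() usage implies aiohttp) and a PaperDownloader instance named downloader; the real ordering lives in download_with_retry_async, which the diff only shows in part:

    # Illustrative only: tries each strategy from the diff in order and
    # stops at the first one that returns PDF bytes.
    import aiohttp

    async def try_all_strategies(downloader, doi):
        async with aiohttp.ClientSession() as session:
            strategies = [
                downloader.download_paper_direct_doi_async,
                downloader.download_paper_scihub_async,
                downloader.download_paper_libgen_async,
                downloader.download_paper_google_scholar_async,
                downloader.download_paper_crossref_async,
            ]
            for strategy in strategies:
                pdf_content = await strategy(session, doi)
                if pdf_content:
                    return pdf_content
        return None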
@@ -284,14 +283,15 @@ class PaperDownloader:
                 return pdf_content
             except Exception as e:
                 logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
             if not pdf_content:
                 retries += 1
                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                 await asyncio.sleep(delay)
                 delay *= 2  # Exponential backoff
+
         return None
+
     async def download_single_doi_async(self, doi):
         """Downloads a single paper using a DOI"""
         if not doi:
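With the defaults max_retries=3 and initial_delay=2, the doubling above produces waits of 2 s, 4 s, and 8 s between attempts. A tiny standalone check of that schedule (the helper name is ours, not the file's):

    # Hypothetical helper: reproduces the 'delay *= 2' schedule from the diff.
    def backoff_schedule(max_retries=3, initial_delay=2):
        delay, schedule = initial_delay, []
        for _ in range(max_retries):
            schedule.append(delay)
            delay *= 2  # mirrors the exponential backoff line above
        return schedule

    print(backoff_schedule())  # [2, 4, 8]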
@@ -302,7 +302,7 @@ class PaperDownloader:
 
             if pdf_content:
                 if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                 filepath = os.path.join(self.output_dir, filename)
 
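Worth noting for callers: download_single_doi_async always resolves to a 3-tuple of (filepath or None, success HTML link, failure message). A hypothetical caller under that assumption (the DOI is made up):

    async def demo(downloader):
        # 'downloader' is a PaperDownloader instance; the DOI is a made-up example.
        filepath, success_html, fail_html = await downloader.download_single_doi_async("10.1000/xyz123")
        print(filepath or fail_html)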
@@ -310,106 +310,102 @@ class PaperDownloader:
                 loop = asyncio.get_running_loop()
                 await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
 
                 logger.info(f"Successfully downloaded: {filename}")
                 return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
             else:
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
 
         except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
     async def download_multiple_dois_async(self, dois_text):
+        """Downloads multiple papers from a list of DOIs"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
+            if filepath:
                 # Unique filename for zip
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+            else:
                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename, downloaded_files))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
     def create_zip(self, zip_filename, downloaded_files):
         with zipfile.ZipFile(zip_filename, 'w') as zipf:
             for file_path in downloaded_files:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
 
     async def process_bibtex_async(self, bib_file):
+        """Process BibTeX file and download papers with multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
 
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
 
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
 
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        tasks = [self.download_single_doi_async(doi) for doi in dois]
+        results = await asyncio.gather(*tasks)
 
+        for i, (filepath, success_message, fail_message) in enumerate(results):
+            if filepath:
                 # Unique filename for zip
                 filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
                 filepath_unique = os.path.join(self.output_dir, filename)
                 os.rename(filepath, filepath_unique)
                 downloaded_files.append(filepath_unique)
                 downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+            else:
                 failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename, downloaded_files))
+            logger.info(f"ZIP file created: {zip_filename}")
 
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
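Two details worth flagging in process_bibtex_async: DOIs come straight out of the parsed bibtexparser entries, and unlike download_multiple_dois_async (which loops sequentially under tqdm), the BibTeX path fires all downloads concurrently via asyncio.gather. A standalone sketch of just the extraction step, with an invented entry:

    # Standalone sketch of the DOI-extraction step; the .bib content is made up.
    import bibtexparser

    bib_content = """
    @article{sample2020,
      title = {A Sample Paper},
      doi   = {10.1000/xyz123}
    }
    """
    bib_database = bibtexparser.loads(bib_content)
    dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
    print(dois)  # ['10.1000/xyz123']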
@@ -420,7 +416,7 @@ def create_gradio_interface():
             # Check file type
             if not bib_file.name.lower().endswith('.bib'):
                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
             zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file)
             return zip_path, downloaded_dois, failed_dois, None
         elif doi_input:
@@ -432,6 +428,7 @@ def create_gradio_interface():
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
+
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
@@ -487,6 +484,7 @@ def create_gradio_interface():
         """,
         cache_examples=False,
     )
+
     # Add Javascript to update HTML
     interface.load = """
     function(downloaded_dois, failed_dois){
@@ -506,6 +504,7 @@ def create_gradio_interface():
     """
     return interface
 
+
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)
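The diff ends inside main(); assuming the usual module guard elsewhere in app.py (it is not shown in these hunks), the Space would start with:

    # Standard entry point; the presence of this exact guard in app.py is assumed.
    if __name__ == "__main__":
        main()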