FreeBibTec2

Sleeping

App Files Files Community

C2MV committed on Dec 14, 2024

Commit

4e2e145

verified ·

1 Parent(s): ff022ac

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -147

app.py CHANGED Viewed

@@ -141,100 +141,100 @@ class PaperDownloader:
                 return None
     async def download_paper_scihub_async(self, session, doi):
-        """Improved method to download paper from Sci-Hub using async requests"""
-        if not doi:
-            logger.warning("DOI not provided")
             return None
-        for base_url in self.download_sources:
             try:
-                scihub_url = f"{base_url}{self.clean_doi(doi)}"
-                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
                 if not text:
-                    continue
-                # Search for multiple PDF URL patterns
-                pdf_patterns = [
-                    r'(https?://[^\s<>"]+?\.pdf)',
-                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                ]
-                pdf_urls = []
-                for pattern in pdf_patterns:
-                    pdf_urls.extend(re.findall(pattern, text))
-                # Try downloading from found URLs, but iterate over ALL
-                for pdf_url in pdf_urls:
                     pdf_content = await self.fetch_pdf_content(session,pdf_url)
                     if pdf_content:
                         logger.debug(f"Found PDF from: {pdf_url}")
                         return pdf_content
             except Exception as e:
-                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-        return None
-    async def download_paper_libgen_async(self, session, doi):
-        """Download from Libgen, handles the query and the redirection"""
-        if not doi:
-            return None
-        base_url = 'https://libgen.rs/scimag/'
-        try:
-            search_url = f"{base_url}?q={self.clean_doi(doi)}"
-            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
-            if not text or "No results" in text:
-                logger.debug(f"No results for DOI: {doi} on libgen")
-                return None
-            soup = BeautifulSoup(text, 'html.parser')
-            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
-            if links:
-                link = links[0]
-                pdf_url = link['href']
-                pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-        except Exception as e:
-            logger.debug(f"Error trying to download {doi} from libgen: {e}")
-        return None
-    async def download_paper_google_scholar_async(self, session, doi):
-        """Search google scholar to find an article with the given doi, try to get the pdf"""
-        if not doi:
             return None
-        try:
-            query = f'doi:"{doi}"'
-            params = {'q': query}
-            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
-            text, headers = await self.fetch_with_headers(session, url, timeout=10)
-            if not text:
-                return None
-            soup = BeautifulSoup(text, 'html.parser')
-            # Find any links with [PDF]
-            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
-            if links:
-                pdf_url = links[0]['href']
-                pdf_content = await self.fetch_pdf_content(session,pdf_url)
-                if pdf_content:
-                  logger.debug(f"Found PDF from: {pdf_url}")
-                  return pdf_content
-        except Exception as e:
-            logger.debug(f"Google Scholar error for {doi}: {e}")
-        return None
     async def download_paper_crossref_async(self, session, doi):
             """Alternative search method using Crossref"""
@@ -258,8 +258,8 @@ class PaperDownloader:
                             if pdf_url:
                                 pdf_content = await self.fetch_pdf_content(session, pdf_url)
                                 if pdf_content:
-                                    logger.debug(f"Found PDF from: {pdf_url}")
-                                    return pdf_content
             except Exception as e:
                 logger.debug(f"Crossref error for {doi}: {e}")
             return None
@@ -294,35 +294,33 @@ class PaperDownloader:
         return None
     async def download_single_doi_async(self, doi, progress_callback):
-            """Downloads a single paper using a DOI, and updates the given progress_callback"""
-            if not doi:
-                return None, "Error: DOI not provided", "Error: DOI not provided"
-            try:
-                pdf_content = await self.download_with_retry_async(doi)
-                if pdf_content:
-                    if doi is None:
-                        return None, "Error: DOI not provided", "Error: DOI not provided"
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                    filepath = os.path.join(self.output_dir, filename)
-                    loop = asyncio.get_running_loop()
-                    await loop.run_in_executor(self.executor, lambda: open(filepath, 'wb').write(pdf_content))
-                    logger.info(f"Successfully downloaded: {filename}")
-                    progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
-                    return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
-                else:
-                    logger.warning(f"Could not download: {doi}")
-                    progress_callback(f"Could not download:  <a href='https://doi.org/{doi}'>{doi}</a>")
-                    return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
-            except Exception as e:
-                logger.error(f"Error processing {doi}: {e}")
-                progress_callback(f"Error processing {doi}:  <a href='https://doi.org/{doi}'>{doi}</a> {e}")
-                return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
     async def download_multiple_dois_async(self, dois_text, progress_callback):
             """Downloads multiple papers from a list of DOIs and uses a callback for UI"""
             if not dois_text:
@@ -335,34 +333,35 @@ class PaperDownloader:
             downloaded_files = []
             failed_dois = []
             downloaded_links = []
             for i, doi in enumerate(dois):
-                 filepath, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
-                 if filepath:
-                    # Unique filename for zip
                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                     filepath_unique = os.path.join(self.output_dir, filename)
-                    os.rename(filepath, filepath_unique)
                     downloaded_files.append(filepath_unique)
-                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                 else:
-                     failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
             if downloaded_files:
                 zip_filename = 'papers.zip'
                 loop = asyncio.get_running_loop()
                 await loop.run_in_executor(self.executor, lambda:  self.create_zip(zip_filename,downloaded_files))
                 logger.info(f"ZIP file created: {zip_filename}")
-            return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
     def create_zip(self, zip_filename, downloaded_files):
-        with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                for file_path in downloaded_files:
-                    zipf.write(file_path, arcname=os.path.basename(file_path))
     async def process_bibtex_async(self, bib_file, progress_callback):
-            """Process BibTeX file and download papers with multiple strategies and reports UI updates using a callback"""
             # Read BibTeX file content from the uploaded object
             try:
                 with open(bib_file.name, 'r', encoding='utf-8') as f:
@@ -387,24 +386,22 @@ class PaperDownloader:
             failed_dois = []
             downloaded_links = []
-            tasks = [self.download_single_doi_async(doi, progress_callback) for doi in dois]
-            results = await asyncio.gather(*tasks)
-            for i, (filepath, success_message, fail_message) in enumerate(results):
-                    if filepath:
-                        # Unique filename for zip
-                        filename = f"{str(dois[i]).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                        filepath_unique = os.path.join(self.output_dir, filename)
-                        os.rename(filepath, filepath_unique)
-                        downloaded_files.append(filepath_unique)
-                        downloaded_links.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
-                    else:
-                        failed_dois.append(f'<a href="https://doi.org/{dois[i]}">{dois[i]}</a>')
             if downloaded_files:
                 zip_filename = 'papers.zip'
                 loop = asyncio.get_running_loop()
-                await loop.run_in_executor(self.executor, lambda:  self.create_zip(zip_filename,downloaded_files))
                 logger.info(f"ZIP file created: {zip_filename}")
             return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
@@ -418,8 +415,7 @@ def create_gradio_interface():
                 # Check file type
                 if not bib_file.name.lower().endswith('.bib'):
                     return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
                 zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file, progress.update)
                 return zip_path, downloaded_dois, failed_dois, None
@@ -432,7 +428,6 @@ def create_gradio_interface():
             else:
                  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
@@ -489,7 +484,6 @@ def create_gradio_interface():
         """,
         cache_examples=False,
     )
     # Add Javascript to update HTML
     interface.load = """
         function(downloaded_dois, failed_dois){
@@ -509,7 +503,6 @@ def create_gradio_interface():
     """
     return interface
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)

                 return None
     async def download_paper_scihub_async(self, session, doi):
         """Try each configured Sci-Hub mirror until one of them yields a PDF.

         For every base URL in ``self.download_sources`` the page for the
         cleaned DOI is fetched, candidate PDF links are extracted from the
         page text with a few URL regexes, and every distinct candidate is
         tried in the order it was found.

         Args:
             session: HTTP session forwarded to ``self.fetch_with_headers``
                 and ``self.fetch_pdf_content``.
             doi: DOI of the paper; a falsy value aborts immediately.

         Returns:
             The first truthy value returned by ``self.fetch_pdf_content``,
             or ``None`` when the DOI is missing or no mirror produced a PDF.
         """
         if not doi:
             logger.warning("DOI not provided")
             return None

         # Loop-invariant, so it is built once instead of once per mirror.
         pdf_patterns = (
             r'(https?://[^\s<>"]+?\.pdf)',
             r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
             r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
         )

         for base_url in self.download_sources:
             # Best effort per mirror: any failure is logged and the next
             # mirror is tried.
             try:
                 scihub_url = f"{base_url}{self.clean_doi(doi)}"
                 text, _ = await self.fetch_with_headers(session, scihub_url, timeout=15)
                 if not text:
                     continue

                 candidates = []
                 for pattern in pdf_patterns:
                     candidates.extend(re.findall(pattern, text))

                 # The same URL is frequently matched by more than one
                 # pattern; de-duplicate (keeping first-seen order) so a URL
                 # is never fetched twice.
                 for pdf_url in dict.fromkeys(candidates):
                     pdf_content = await self.fetch_pdf_content(session, pdf_url)
                     if pdf_content:
                         logger.debug(f"Found PDF from: {pdf_url}")
                         return pdf_content
             except Exception as e:
                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
         return None
+    async def download_paper_libgen_async(self, session, doi):
+            """Download from Libgen, handles the query and the redirection"""
+            if not doi:
+                return None
+            base_url = 'https://libgen.rs/scimag/'
             try:
+                search_url = f"{base_url}?q={self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
+                if not text or "No results" in text:
+                    logger.debug(f"No results for DOI: {doi} on libgen")
+                    return None
+                soup = BeautifulSoup(text, 'html.parser')
+                links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+                if links:
+                    link = links[0]
+                    pdf_url = link['href']
+                    pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                    if pdf_content:
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return pdf_content
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from libgen: {e}")
+            return None
+    async def download_paper_google_scholar_async(self, session, doi):
+            """Search google scholar to find an article with the given doi, try to get the pdf"""
+            if not doi:
+                return None
+            try:
+                query = f'doi:"{doi}"'
+                params = {'q': query}
+                url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+                text, headers = await self.fetch_with_headers(session, url, timeout=10)
                 if not text:
+                    return None
+                soup = BeautifulSoup(text, 'html.parser')
+                # Find any links with [PDF]
+                links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+                if links:
+                    pdf_url = links[0]['href']
                     pdf_content = await self.fetch_pdf_content(session,pdf_url)
                     if pdf_content:
                         logger.debug(f"Found PDF from: {pdf_url}")
                         return pdf_content
             except Exception as e:
+                logger.debug(f"Google Scholar error for {doi}: {e}")
             return None
     async def download_paper_crossref_async(self, session, doi):
             """Alternative search method using Crossref"""
                             if pdf_url:
                                 pdf_content = await self.fetch_pdf_content(session, pdf_url)
                                 if pdf_content:
+                                  logger.debug(f"Found PDF from: {pdf_url}")
+                                  return pdf_content
             except Exception as e:
                 logger.debug(f"Crossref error for {doi}: {e}")
             return None
         return None
     async def download_single_doi_async(self, doi, progress_callback):
         """Download one paper by DOI, save it as a PDF and report progress.

         Args:
             doi: DOI of the paper to download.
             progress_callback: Callable invoked with an HTML status message
                 after a success, a failed download, or an exception.

         Returns:
             A 3-tuple ``(result, success_message, fail_message)``:

             * ``(None, error, error)`` when ``doi`` is falsy;
             * ``(filepath, doi_link_html, "")`` when the PDF was saved under
               ``self.output_dir``;
             * ``("Could not download", message, detail)`` when nothing could
               be downloaded or an exception was raised. Callers compare
               ``result`` against this sentinel string.
         """
         if not doi:
             return None, "Error: DOI not provided", "Error: DOI not provided"

         try:
             pdf_content = await self.download_with_retry_async(doi)
             if not pdf_content:
                 logger.warning(f"Could not download: {doi}")
                 progress_callback(f"Could not download:  <a href='https://doi.org/{doi}'>{doi}</a>")
                 # "Could not download" sentinel marks the failure state.
                 return "Could not download", f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'

             filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
             filepath = os.path.join(self.output_dir, filename)

             def _write_pdf():
                 # Context manager guarantees the handle is flushed and closed
                 # before the caller renames or zips the file.
                 with open(filepath, 'wb') as pdf_file:
                     pdf_file.write(pdf_content)

             # Blocking disk I/O is pushed to the executor to keep the event
             # loop responsive.
             loop = asyncio.get_running_loop()
             await loop.run_in_executor(self.executor, _write_pdf)

             logger.info(f"Successfully downloaded: {filename}")
             progress_callback(f"Successfully downloaded: <a href='https://doi.org/{doi}'>{doi}</a>")
             return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
         except Exception as e:
             logger.error(f"Error processing {doi}: {e}")
             progress_callback(f"Error processing {doi}:  <a href='https://doi.org/{doi}'>{doi}</a> {e}")
             # Same sentinel as a failed download so callers treat both alike.
             return "Could not download", f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
     async def download_multiple_dois_async(self, dois_text, progress_callback):
             """Downloads multiple papers from a list of DOIs and uses a callback for UI"""
             if not dois_text:
             downloaded_files = []
             failed_dois = []
             downloaded_links = []
             for i, doi in enumerate(dois):
+                result, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
+                if result == "Could not download":
+                     failed_dois.append(fail_message) #reports the error message
+                elif result: # if there was a downloaded pdf, a valid filepath
+                     # Unique filename for zip
                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                     filepath_unique = os.path.join(self.output_dir, filename)
+                    os.rename(result, filepath_unique)
                     downloaded_files.append(filepath_unique)
+                    downloaded_links.append(success_message)
             if downloaded_files:
                 zip_filename = 'papers.zip'
                 loop = asyncio.get_running_loop()
                 await loop.run_in_executor(self.executor, lambda:  self.create_zip(zip_filename,downloaded_files))
                 logger.info(f"ZIP file created: {zip_filename}")
+            return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
     def create_zip(self, zip_filename, downloaded_files):
         """Write the given files into a new ZIP archive, flattened by name.

         Args:
             zip_filename: Path of the archive to create (opened in 'w' mode,
                 so an existing file at that path is replaced).
             downloaded_files: Iterable of file paths to add; each entry is
                 stored under its base name only, without directories.
         """
         with zipfile.ZipFile(zip_filename, 'w') as archive:
             for member_path in downloaded_files:
                 flat_name = os.path.basename(member_path)
                 archive.write(member_path, arcname=flat_name)
     async def process_bibtex_async(self, bib_file, progress_callback):
+            """Process BibTeX file and download papers with multiple strategies"""
             # Read BibTeX file content from the uploaded object
             try:
                 with open(bib_file.name, 'r', encoding='utf-8') as f:
             failed_dois = []
             downloaded_links = []
+            for i, doi in enumerate(dois):
+                result, success_message, fail_message = await self.download_single_doi_async(doi, progress_callback)
+                if result == "Could not download":
+                   failed_dois.append(fail_message)  #report failure to download in UI
+                elif result:  #if there is a filepath as result, means success
+                    # Unique filename for zip
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                    filepath_unique = os.path.join(self.output_dir, filename)
+                    os.rename(result, filepath_unique)
+                    downloaded_files.append(filepath_unique)
+                    downloaded_links.append(success_message) # report to the list of sucessfully donwloaded links
             if downloaded_files:
                 zip_filename = 'papers.zip'
                 loop = asyncio.get_running_loop()
+                await loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename, downloaded_files))
                 logger.info(f"ZIP file created: {zip_filename}")
             return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
                 # Check file type
                 if not bib_file.name.lower().endswith('.bib'):
                     return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
                 zip_path, downloaded_dois, failed_dois = await downloader.process_bibtex_async(bib_file, progress.update)
                 return zip_path, downloaded_dois, failed_dois, None
             else:
                  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
         """,
         cache_examples=False,
     )
     # Add Javascript to update HTML
     interface.load = """
         function(downloaded_dois, failed_dois){
     """
     return interface
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)