Spaces: Sleeping
Update app.py
app.py CHANGED
@@ -47,100 +47,115 @@ class PaperDownloader:
         if not isinstance(doi, str):
             return None
         return quote(doi.strip()) if doi else None
 
     async def fetch_with_headers(self, session, url, timeout=10):
+        """Utility method to fetch an URL with headers and timeout"""
+        try:
+            async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
+                response.raise_for_status()
+                return await response.text(), response.headers
+        except Exception as e:
+            logger.debug(f"Error fetching {url}: {e}")
+            return None, None
 
-    async def fetch_pdf_content(self, session, url):
-        """Fetch and validate if the content of a request is actually PDF."""
-        try:
-            async with session.get(url, headers=self.headers, timeout=10) as pdf_response:
-                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                    return await pdf_response.read()
-                else:
-                    logger.debug(f"Content type not PDF for {url}: {pdf_response.headers.get('Content-Type', '')}")
-                    return None
-        except Exception as e:
-            logger.debug(f"Error getting PDF {url}: {e}")
-            return None
+    async def fetch_pdf_content(self, session, url, max_redirects=5):
+        """Fetch content and validate if response is PDF, following up to max_redirects redirections."""
+        current_url = url
+        redirect_count = 0
+
+        while redirect_count <= max_redirects:
+            try:
+                async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
+                    if response.status in [301, 302, 307, 308]:
+                        current_url = response.headers['Location']
+                        redirect_count += 1
+                        logger.debug(f"Following redirect from {url} to {current_url}")
+                        continue
+
+                    response.raise_for_status()
+                    if 'application/pdf' in response.headers.get('Content-Type', ''):
+                        return await response.read()
+                    else:
+                        logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
+                        return None
+            except Exception as e:
+                logger.debug(f"Error getting PDF from {current_url}: {e}")
+                return None
+        logger.debug(f"Too many redirects {url}, not following this link further")
+        return None
 
-    async def download_paper_direct_doi_async(self, session, doi):
-        """Attempt to download the pdf from the landing page of the doi"""
-        if not doi:
-            return None
-
-        try:
-            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-            if not text:
-                return None
-                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-            ]
-                pdf_urls.extend(re.findall(pattern, text))
-        except Exception as e:
-            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-            return None
+    async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the pdf from the landing page of the doi"""
+        if not doi:
+            return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            for pdf_url in pdf_urls:
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None
 
-    async def download_paper_scihub_async(self, session, doi):
-        """Improved method to download paper from Sci-Hub using async requests"""
-        if not doi:
-            logger.warning("DOI not provided")
-            return None
-        try:
-            pdf_patterns = [
-                r'(https?://[^\s<>"]+?\.pdf)',
-                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                pdf_urls.extend(re.findall(pattern, text))
-        except Exception as e:
-            logger.debug(f"Error trying to
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download paper from Sci-Hub using async requests"""
+        if not doi:
+            logger.warning("DOI not provided")
+            return None
+
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
+
+                # Try downloading from found URLs
+                for pdf_url in pdf_urls:
+                    pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                    if pdf_content:
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return pdf_content
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+        return None
 
     async def download_paper_libgen_async(self, session, doi):
         """Download from Libgen, handles the query and the redirection"""
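Review note on the new `fetch_pdf_content`: with `allow_redirects=False` the loop re-issues each request by hand, but `response.headers['Location']` raises `KeyError` when the header is absent, and `Location` may be a relative URL. A minimal sketch of a more defensive hop, assuming the same aiohttp session; `urljoin`, the `.get()` lookup, and the 303 status are the only additions and are not part of the committed code:

```python
import aiohttp
from urllib.parse import urljoin

async def fetch_pdf_content(session: aiohttp.ClientSession, url: str, max_redirects: int = 5):
    """Sketch: manual redirect loop that tolerates relative or missing Location headers."""
    current_url = url
    for _ in range(max_redirects + 1):
        async with session.get(current_url, timeout=aiohttp.ClientTimeout(total=10),
                               allow_redirects=False) as response:
            if response.status in (301, 302, 303, 307, 308):
                location = response.headers.get('Location')
                if not location:  # redirect with no target: give up
                    return None
                current_url = urljoin(current_url, location)  # resolves relative redirects
                continue
            response.raise_for_status()
            if 'application/pdf' in response.headers.get('Content-Type', ''):
                return await response.read()
            return None
    return None  # redirect budget exhausted
```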
@@ -163,14 +178,14 @@ class PaperDownloader:
             if links:
                 link = links[0]
                 pdf_url = link['href']
-                pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
                 if pdf_content:
                     logger.debug(f"Found PDF from: {pdf_url}")
                     return pdf_content
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
 
     async def download_paper_google_scholar_async(self, session, doi):
         """Search google scholar to find an article with the given doi, try to get the pdf"""
         if not doi:
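The same three `pdf_patterns` appear in both `download_paper_direct_doi_async` and `download_paper_scihub_async` above. For readers unfamiliar with the scraping step, a self-contained demo of what the extraction loop pulls from a landing page (the HTML snippet is made up):

```python
import re

pdf_patterns = [
    r'(https?://[^\s<>"]+?\.pdf)',               # direct .pdf links
    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',  # ".../download/..." endpoints
    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',    # ".../pdf/..." viewer routes
]

text = '<a href="https://example.org/article/v1.pdf">PDF</a>'  # hypothetical page
pdf_urls = []
for pattern in pdf_patterns:
    pdf_urls.extend(re.findall(pattern, text))

print(pdf_urls)  # ['https://example.org/article/v1.pdf']
```

The lazy quantifier plus the `[^\s<>"]` class keeps each match inside a single attribute value; since one URL can match more than one pattern (for example a `download/file.pdf` link), deduplicating `pdf_urls` before fetching would be a cheap improvement.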
@@ -192,46 +207,44 @@ class PaperDownloader:
 
             if links:
                 pdf_url = links[0]['href']
-                pdf_content = await self.fetch_pdf_content(session,
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
                 if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
         except Exception as e:
             logger.debug(f"Google Scholar error for {doi}: {e}")
 
         return None
 
     async def download_paper_crossref_async(self, session, doi):
-            return None
-
-        try:
-            # Search for open access link
-            url = f"https://api.crossref.org/works/{doi}"
-            response = await session.get(url, headers=self.headers, timeout=10)
-
-            if response.status == 200:
-                data = await response.json()
-                work = data.get('message', {})
-
-            # Search for open access links
-            links = work.get('link', [])
-            for link in links:
-                if link.get('content-type') == 'application/pdf':
-                    pdf_url = link.get('URL')
-                    if pdf_url:
-                        pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                        if pdf_content:
-                            logger.debug(f"Found PDF from: {pdf_url}")
-                            return pdf_content
-
-        except Exception as e:
-            logger.debug(f"Crossref error for {doi}: {e}")
-
-        return None
-
+        """Alternative search method using Crossref"""
+        if not doi:
+            return None
+
+        try:
+            # Search for open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                            if pdf_content:
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return pdf_content
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
         return None
 
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
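On the rewritten `download_paper_crossref_async` above: Crossref's REST API exposes full-text links under `message.link`, which is exactly what the new code walks. One caveat worth flagging in review: `response = await session.get(...)` skips the `async with` block used elsewhere in the class, so the connection is only released implicitly once the body has been read. A standalone sketch of the same lookup with the context manager; the helper name is hypothetical, not the committed code:

```python
import asyncio
import aiohttp

async def crossref_pdf_links(doi: str):
    """Sketch: list the URLs Crossref advertises as application/pdf for a DOI."""
    url = f"https://api.crossref.org/works/{doi}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
            if response.status != 200:
                return []
            data = await response.json()
    work = data.get('message', {})
    return [link['URL'] for link in work.get('link', [])
            if link.get('content-type') == 'application/pdf' and link.get('URL')]

# e.g. asyncio.run(crossref_pdf_links("10.xxxx/xxxxx"))  # placeholder DOI
```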
@@ -261,129 +274,127 @@ class PaperDownloader:
                 delay *= 2  # Exponential backoff
 
         return None
 
     async def download_single_doi_async(self, doi):
-        try:
-            pdf_content = await self.download_with_retry_async(doi)
-
-            if pdf_content:
-                if doi is None:
-                    return None, "Error: DOI not provided", "Error: DOI not provided"
-                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                filepath = os.path.join(self.output_dir, filename)
-                with open(filepath, 'wb') as f:
-                    f.write(pdf_content)
-                logger.info(f"Successfully downloaded: {filename}")
-                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
-            else:
-                logger.warning(f"Could not download: {doi}")
-                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
-
-        except Exception as e:
-            logger.error(f"Error processing {doi}: {e}")
-            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
-
-    async def download_multiple_dois_async(self, dois_text):
-        """Downloads multiple papers from a list of DOIs"""
-        if not dois_text:
-            return None, "Error: No DOIs provided", "Error: No DOIs provided"
-
-        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
-        if not dois:
-            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
-
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
-        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
-            if filepath:
-                # Unique filename for zip
-                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                filepath_unique = os.path.join(self.output_dir, filename)
-                os.rename(filepath, filepath_unique)
-                downloaded_files.append(filepath_unique)
-                downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-            else:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
-        if downloaded_files:
-            zip_filename = 'papers.zip'
-            with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                for file_path in downloaded_files:
-                    zipf.write(file_path, arcname=os.path.basename(file_path))
-            logger.info(f"ZIP file created: {zip_filename}")
-
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
-    async def process_bibtex_async(self, bib_file):
-        """Process BibTeX file and download papers with multiple strategies"""
-        # Read BibTeX file content from the uploaded object
-        try:
-            with open(bib_file.name, 'r', encoding='utf-8') as f:
-                bib_content = f.read()
-        except Exception as e:
-            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
-
-        # Parse BibTeX data
-        try:
-            bib_database = bibtexparser.loads(bib_content)
-        except Exception as e:
-            logger.error(f"Error parsing BibTeX data: {e}")
-            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
-
-        # Extract DOIs
-        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-        logger.info(f"Found {len(dois)} DOIs to download")
-
-        # Result lists
-        downloaded_files = []
-        failed_dois = []
-        downloaded_links = []
-
-        # Download PDFs
-        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-            try:
-                # Try to download with multiple methods with retries
-                pdf_content = await self.download_with_retry_async(doi)
-
-                # Save PDF
-                if pdf_content:
-                    if doi is None:
-                        return None, "Error: DOI not provided", "Error: DOI not provided"
-                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}
-                    filepath = os.path.join(self.output_dir, filename)
-                    with open(filepath, 'wb') as f:
-                        f.write(pdf_content)
-                    downloaded_files.append(filepath)
-                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                    logger.info(f"Successfully downloaded: {filename}")
-                else:
-            except Exception as e:
-                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-                logger.error(f"Error processing {doi}: {e}")
+        """Downloads a single paper using a DOI"""
+        if not doi:
+            return None, "Error: DOI not provided", "Error: DOI not provided"
+
+        try:
+            pdf_content = await self.download_with_retry_async(doi)
+
+            if pdf_content:
+                if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath = os.path.join(self.output_dir, filename)
+                with open(filepath, 'wb') as f:
+                    f.write(pdf_content)
+                logger.info(f"Successfully downloaded: {filename}")
+                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
+            else:
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
+
+        except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
+
+    async def download_multiple_dois_async(self, dois_text):
+        """Downloads multiple papers from a list of DOIs"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            filepath, success_message, fail_message = await self.download_single_doi_async(doi)
+            if filepath:
+                # Unique filename for zip
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+            else:
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
+    async def process_bibtex_async(self, bib_file):
+        """Process BibTeX file and download papers with multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
+
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
+
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        # Download PDFs
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            try:
+                # Try to download with multiple methods with retries
+                pdf_content = await self.download_with_retry_async(doi)
+
+                # Save PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided"
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+            except Exception as e:
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+                logger.error(f"Error processing {doi}: {e}")
+
+        # Create ZIP of downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
+
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
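Most of `download_with_retry_async` sits outside this diff; only its docstring and the `delay *= 2  # Exponential backoff` line are visible above. For orientation, a sketch of the retry shape the docstring describes; the strategy order and loop structure here are assumptions, not the committed code:

```python
import asyncio
import aiohttp

async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
    """Sketch: cycle through every source, doubling the pause between passes."""
    delay = initial_delay
    async with aiohttp.ClientSession() as session:
        for _ in range(max_retries):
            # Try each strategy in turn; any hit ends the whole retry loop.
            for strategy in (self.download_paper_direct_doi_async,
                             self.download_paper_scihub_async,
                             self.download_paper_libgen_async,
                             self.download_paper_google_scholar_async,
                             self.download_paper_crossref_async):
                pdf_content = await strategy(session, doi)
                if pdf_content:
                    return pdf_content
            await asyncio.sleep(delay)
            delay *= 2  # Exponential backoff
    return None
```

With `initial_delay=2` the waits come out to 2 s, 4 s, 8 s; the final sleep before giving up could be skipped.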
@@ -405,6 +416,7 @@ def create_gradio_interface():
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
+
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
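The tail of the diff shows only `fn=download_papers` and the four-value error return, so the handler evidently takes the three input modes and produces four outputs. A minimal sketch of wiring that could sit around this function; every component and label below is a guess, not the Space's actual layout, and the handler body is a placeholder:

```python
import gradio as gr

def download_papers(bib_file, doi_input, dois_input):
    # Placeholder body: the Space dispatches to process_bibtex_async,
    # download_single_doi_async or download_multiple_dois_async here.
    msg = "Please provide a .bib file, a single DOI, or a list of DOIs"
    return None, msg, msg, None

interface = gr.Interface(
    fn=download_papers,
    inputs=[
        gr.File(label="BibTeX file (.bib)"),
        gr.Textbox(label="Single DOI"),
        gr.Textbox(label="DOIs, one per line", lines=5),
    ],
    outputs=[
        gr.File(label="papers.zip"),
        gr.HTML(label="Downloaded DOIs"),
        gr.HTML(label="Failed DOIs"),
        gr.File(label="Single PDF"),
    ],
)

if __name__ == "__main__":
    interface.launch()
```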