Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,7 +14,6 @@ import asyncio
|
|
| 14 |
import aiohttp
|
| 15 |
from playwright.async_api import async_playwright
|
| 16 |
|
| 17 |
-
|
| 18 |
# Configure logging
|
| 19 |
logging.basicConfig(level=logging.INFO,
|
| 20 |
format='%(asctime)s - %(levelname)s: %(message)s')
|
|
@@ -43,7 +42,51 @@ class PaperDownloader:
|
|
| 43 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
| 44 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 45 |
}
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def clean_doi(self, doi):
|
| 48 |
"""Clean and encode DOI for URL"""
|
| 49 |
if not isinstance(doi, str):
|
|
@@ -59,55 +102,43 @@ class PaperDownloader:
|
|
| 59 |
except Exception as e:
|
| 60 |
logger.debug(f"Error fetching {url}: {e}")
|
| 61 |
return None, None
|
| 62 |
-
|
|
|
|
| 63 |
async def download_paper_direct_doi_async(self, session, doi):
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
await page.goto(doi_url, timeout=30000)
|
| 78 |
-
html_content = await page.content()
|
| 79 |
-
except Exception as e:
|
| 80 |
-
logger.debug(f"Error trying to navigate {doi}: {e}")
|
| 81 |
-
await browser.close()
|
| 82 |
-
return None
|
| 83 |
-
|
| 84 |
-
soup = BeautifulSoup(html_content, 'html.parser')
|
| 85 |
-
await browser.close()
|
| 86 |
-
|
| 87 |
-
pdf_patterns = [
|
| 88 |
r'(https?://[^\s<>"]+?\.pdf)',
|
| 89 |
r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
|
| 90 |
r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
logger.debug(f"Error trying to get the PDF from {doi}: {e}")
|
| 110 |
-
return None
|
| 111 |
|
| 112 |
async def download_paper_scihub_async(self, session, doi):
|
| 113 |
"""Improved method to download paper from Sci-Hub using async requests"""
|
|
@@ -248,13 +279,12 @@ class PaperDownloader:
|
|
| 248 |
while retries < max_retries and not pdf_content:
|
| 249 |
try:
|
| 250 |
pdf_content = (
|
| 251 |
-
|
| 252 |
await self.download_paper_scihub_async(session, doi) or
|
| 253 |
await self.download_paper_libgen_async(session, doi) or
|
| 254 |
await self.download_paper_google_scholar_async(session, doi) or
|
| 255 |
await self.download_paper_crossref_async(session, doi)
|
| 256 |
-
|
| 257 |
-
)
|
| 258 |
if pdf_content:
|
| 259 |
return pdf_content
|
| 260 |
except Exception as e:
|
|
@@ -418,8 +448,7 @@ class PaperDownloader:
|
|
| 418 |
self.download_paper_libgen(doi) or
|
| 419 |
self.download_paper_google_scholar(doi) or
|
| 420 |
self.download_paper_crossref(doi)
|
| 421 |
-
|
| 422 |
-
)
|
| 423 |
|
| 424 |
if pdf_content:
|
| 425 |
return pdf_content
|
|
@@ -614,6 +643,8 @@ class PaperDownloader:
|
|
| 614 |
for file_path in downloaded_files:
|
| 615 |
zipf.write(file_path, arcname=os.path.basename(file_path))
|
| 616 |
logger.info(f"ZIP file created: {zip_filename}")
|
|
|
|
|
|
|
| 617 |
|
| 618 |
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
|
| 619 |
|
|
|
|
| 14 |
import aiohttp
|
| 15 |
from playwright.async_api import async_playwright
|
| 16 |
|
|
|
|
| 17 |
# Configure logging
|
| 18 |
logging.basicConfig(level=logging.INFO,
|
| 19 |
format='%(asctime)s - %(levelname)s: %(message)s')
|
|
|
|
| 42 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
| 43 |
'Accept-Language': 'en-US,en;q=0.9',
|
| 44 |
}
|
| 45 |
+
|
| 46 |
+
self.playwright_browser = None
|
| 47 |
+
self.playwright_lock = asyncio.Lock() # Added lock
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
async def initialize_playwright(self):
    """Launch the shared Chromium browser if one is not already running.

    The check and the launch both happen while holding
    ``self.playwright_lock``, so concurrent callers do not each start a
    browser. The Playwright driver handle returned by ``start()`` is kept on
    ``self._playwright`` because it has to be stopped separately from the
    browser.

    Errors are logged, not raised: on failure ``self.playwright_browser``
    is left unset and the caller must check it.
    """
    async with self.playwright_lock:
        if self.playwright_browser:
            return

        driver = None
        try:
            driver = await async_playwright().start()
            self.playwright_browser = await driver.chromium.launch()
            self._playwright = driver
        except Exception as e:
            logger.error(f"Error initializing Playwright browser: {e}")
            # The driver may have started before the launch failed; stop it
            # here, otherwise nothing holds a reference that could stop it.
            if driver is not None:
                try:
                    await driver.stop()
                except Exception as stop_error:
                    logger.debug(f"Error stopping Playwright driver: {stop_error}")


async def close_playwright(self):
    """Close the shared browser and stop the Playwright driver.

    Must be called at the end of the execution. Safe to call when nothing
    was started. Shutdown errors are logged, not raised.
    """
    async with self.playwright_lock:
        browser = self.playwright_browser
        # ``_playwright`` only exists once initialize_playwright() succeeded.
        driver = getattr(self, "_playwright", None)

        # Drop the references before shutting down so a failing close()
        # cannot leave a dead browser behind for later calls to reuse.
        self.playwright_browser = None
        self._playwright = None

        if browser:
            try:
                await browser.close()
            except Exception as e:
                logger.error(f"Error closing Playwright browser: {e}")

        if driver is not None:
            try:
                await driver.stop()
            except Exception as e:
                logger.error(f"Error stopping Playwright driver: {e}")
|
| 69 |
+
|
| 70 |
+
async def get_html_with_playwright(self, doi_url):
    """Return the rendered HTML of ``doi_url``, or ``None`` on any failure.

    Initializes the shared browser on first use via
    ``self.initialize_playwright()``. Each call opens its own page,
    navigates with a 30000 ms timeout, reads the page content and closes
    the page again.

    Args:
        doi_url: URL to load in the browser.

    Returns:
        The page HTML as returned by ``page.content()``, or ``None`` when
        the browser is unavailable or navigation/content retrieval fails.
    """
    if not self.playwright_browser:
        await self.initialize_playwright()

    # Initialization logs and swallows its own errors, so re-check here.
    if not self.playwright_browser:
        logger.error(f"Playwright browser is not initialized for url: {doi_url}")
        return None

    page = None
    try:
        page = await self.playwright_browser.new_page()
        await page.goto(doi_url, timeout=30000)
        return await page.content()
    except Exception as e:
        logger.debug(f"Error navigating or getting content for url: {doi_url}: {e}")
        return None
    finally:
        if page:
            # An exception raised inside ``finally`` would replace the value
            # being returned, so a failing close must not escape.
            try:
                await page.close()
            except Exception as e:
                logger.debug(f"Error closing page for url: {doi_url}: {e}")
|
| 89 |
+
|
| 90 |
def clean_doi(self, doi):
|
| 91 |
"""Clean and encode DOI for URL"""
|
| 92 |
if not isinstance(doi, str):
|
|
|
|
| 102 |
except Exception as e:
|
| 103 |
logger.debug(f"Error fetching {url}: {e}")
|
| 104 |
return None, None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
async def download_paper_direct_doi_async(self, session, doi):
    """Try to download the PDF linked from the DOI landing page.

    The landing page is rendered through ``self.get_html_with_playwright``
    so links produced by JavaScript are present in the HTML. Every URL in
    that HTML matching one of the PDF-looking patterns is requested in
    order, and the body of the first response whose ``Content-Type``
    contains ``application/pdf`` is returned.

    Args:
        session: HTTP client session used for the candidate downloads; it
            is used as ``session.get(url, headers=..., timeout=...)``.
        doi: DOI of the paper; a falsy value returns ``None`` immediately.

    Returns:
        The PDF bytes, or ``None`` when no candidate yields a PDF or any
        step fails (failures are logged at debug level, never raised).
    """
    if not doi:
        return None

    try:
        doi_url = f"https://doi.org/{self.clean_doi(doi)}"

        html_content = await self.get_html_with_playwright(doi_url)
        if not html_content:
            return None

        pdf_patterns = [
            r'(https?://[^\s<>"]+?\.pdf)',
            r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
            r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
        ]

        candidates = []
        for pattern in pdf_patterns:
            candidates.extend(re.findall(pattern, html_content))

        # The patterns overlap and pages repeat links; drop duplicates while
        # keeping first-seen order so no URL is requested twice.
        pdf_urls = list(dict.fromkeys(candidates))

        for pdf_url in pdf_urls:
            try:
                # The context manager releases the response (and its
                # connection) even when the candidate is not a PDF.
                async with session.get(pdf_url, headers=self.headers, timeout=10) as pdf_response:
                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                        logger.debug(f"Found PDF from: {pdf_url}")
                        return await pdf_response.read()
            except Exception as e:
                logger.debug(f"Error downloading PDF from {pdf_url}: {e}")

    except Exception as e:
        logger.debug(f"Error trying to get the PDF from {doi}: {e}")
    return None
|
|
|
|
|
|
|
| 142 |
|
| 143 |
async def download_paper_scihub_async(self, session, doi):
|
| 144 |
"""Improved method to download paper from Sci-Hub using async requests"""
|
|
|
|
| 279 |
while retries < max_retries and not pdf_content:
|
| 280 |
try:
|
| 281 |
pdf_content = (
|
| 282 |
+
await self.download_paper_direct_doi_async(session, doi) or
|
| 283 |
await self.download_paper_scihub_async(session, doi) or
|
| 284 |
await self.download_paper_libgen_async(session, doi) or
|
| 285 |
await self.download_paper_google_scholar_async(session, doi) or
|
| 286 |
await self.download_paper_crossref_async(session, doi)
|
| 287 |
+
)
|
|
|
|
| 288 |
if pdf_content:
|
| 289 |
return pdf_content
|
| 290 |
except Exception as e:
|
|
|
|
| 448 |
self.download_paper_libgen(doi) or
|
| 449 |
self.download_paper_google_scholar(doi) or
|
| 450 |
self.download_paper_crossref(doi)
|
| 451 |
+
)
|
|
|
|
| 452 |
|
| 453 |
if pdf_content:
|
| 454 |
return pdf_content
|
|
|
|
| 643 |
for file_path in downloaded_files:
|
| 644 |
zipf.write(file_path, arcname=os.path.basename(file_path))
|
| 645 |
logger.info(f"ZIP file created: {zip_filename}")
|
| 646 |
+
|
| 647 |
+
await self.close_playwright()
|
| 648 |
|
| 649 |
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
|
| 650 |
|