Update app.py
app.py CHANGED
@@ -14,7 +14,6 @@ import asyncio
 import aiohttp
 from playwright.async_api import async_playwright
 
-
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(levelname)s: %(message)s')
@@ -43,7 +42,51 @@ class PaperDownloader:
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
         }
-
+
+        self.playwright_browser = None
+        self.playwright_lock = asyncio.Lock()  # Added lock
+
+    async def initialize_playwright(self):
+        """Initialize the playwright browser instance to be used by the tool."""
+        async with self.playwright_lock:
+            if not self.playwright_browser:
+                try:
+                    playwright = await async_playwright().start()
+                    self.playwright_browser = await playwright.chromium.launch()
+                except Exception as e:
+                    logger.error(f"Error initializing Playwright browser: {e}")
+
+    async def close_playwright(self):
+        """Closes the playwright browser, must be called at the end of the execution."""
+        async with self.playwright_lock:
+            if self.playwright_browser:
+                try:
+                    await self.playwright_browser.close()
+                    self.playwright_browser = None
+                except Exception as e:
+                    logger.error(f"Error closing Playwright browser: {e}")
+
+    async def get_html_with_playwright(self, doi_url):
+        """Utility function to fetch content with playwright with try-catch."""
+        if not self.playwright_browser:
+            await self.initialize_playwright()
+
+        if not self.playwright_browser:
+            logger.error(f"Playwright browser is not initialized for url: {doi_url}")
+            return None
+        page = None
+        try:
+            page = await self.playwright_browser.new_page()
+            await page.goto(doi_url, timeout=30000)
+            return await page.content()
+        except Exception as e:
+            logger.debug(f"Error navigating or getting content for url: {doi_url}: {e}")
+            return None
+        finally:
+            if page:
+                await page.close()
+
     def clean_doi(self, doi):
         """Clean and encode DOI for URL"""
         if not isinstance(doi, str):
@@ -59,55 +102,43 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
-
+
+
     async def download_paper_direct_doi_async(self, session, doi):
-                    await page.goto(doi_url, timeout=30000)
-                    html_content = await page.content()
-                except Exception as e:
-                    logger.debug(f"Error trying to navigate {doi}: {e}")
-                    await browser.close()
-                    return None
-
-            soup = BeautifulSoup(html_content, 'html.parser')
-            await browser.close()
-
-            pdf_patterns = [
+        """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
+        if not doi:
+            return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            html_content = await self.get_html_with_playwright(doi_url)
+
+            if not html_content:
+                return None
+
+            pdf_patterns = [
                 r'(https?://[^\s<>"]+?\.pdf)',
                 r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
                 r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-            return None
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, html_content))
+
+            for pdf_url in pdf_urls:
+                try:
+                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return await pdf_response.read()
+                except Exception as e:
+                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None
 
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""
@@ -248,13 +279,12 @@ class PaperDownloader:
         while retries < max_retries and not pdf_content:
             try:
                 pdf_content = (
-
+                    await self.download_paper_direct_doi_async(session, doi) or
                     await self.download_paper_scihub_async(session, doi) or
                     await self.download_paper_libgen_async(session, doi) or
                     await self.download_paper_google_scholar_async(session, doi) or
                     await self.download_paper_crossref_async(session, doi)
-
-                )
+                )
                 if pdf_content:
                     return pdf_content
             except Exception as e:
@@ -418,8 +448,7 @@ class PaperDownloader:
                 self.download_paper_libgen(doi) or
                 self.download_paper_google_scholar(doi) or
                 self.download_paper_crossref(doi)
-
-            )
+            )
 
             if pdf_content:
                 return pdf_content
@@ -614,6 +643,8 @@ class PaperDownloader:
             for file_path in downloaded_files:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
         logger.info(f"ZIP file created: {zip_filename}")
+
+        await self.close_playwright()
 
         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
 
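For reference, here is a minimal sketch of how the new Playwright lifecycle introduced by this commit could be driven from the caller's side. It is an illustration only: it assumes PaperDownloader() can be constructed without arguments, uses only methods visible in this diff, and the function name, output file, and DOI are placeholders.

import asyncio
import aiohttp

async def fetch_one(doi):
    downloader = PaperDownloader()
    async with aiohttp.ClientSession() as session:
        try:
            # The browser is started lazily inside get_html_with_playwright(),
            # so no explicit initialize_playwright() call is required here.
            pdf_bytes = await downloader.download_paper_direct_doi_async(session, doi)
            if pdf_bytes:
                with open("paper.pdf", "wb") as fh:
                    fh.write(pdf_bytes)
        finally:
            # Mirror the change at the end of the batch routine: always release
            # the shared Chromium instance once the work is done.
            await downloader.close_playwright()

asyncio.run(fetch_one("10.1000/xyz123"))  # placeholder DOI

Because the browser handle and the asyncio.Lock are stored on the instance, repeated calls reuse one Chromium process instead of launching a new one per DOI, and close_playwright() is the single cleanup point.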