C2MV committed on
Commit
64cb1ff
·
verified ·
1 Parent(s): 7b9d802

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -52
app.py CHANGED
@@ -14,7 +14,6 @@ import asyncio
14
  import aiohttp
15
  from playwright.async_api import async_playwright
16
 
17
-
18
  # Configure logging
19
  logging.basicConfig(level=logging.INFO,
20
  format='%(asctime)s - %(levelname)s: %(message)s')
@@ -43,7 +42,51 @@ class PaperDownloader:
43
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
44
  'Accept-Language': 'en-US,en;q=0.9',
45
  }
46
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def clean_doi(self, doi):
48
  """Clean and encode DOI for URL"""
49
  if not isinstance(doi, str):
@@ -59,55 +102,43 @@ class PaperDownloader:
59
  except Exception as e:
60
  logger.debug(f"Error fetching {url}: {e}")
61
  return None, None
62
-
 
63
  async def download_paper_direct_doi_async(self, session, doi):
64
- """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
65
- if not doi:
66
- return None
67
-
68
- try:
69
- doi_url = f"https://doi.org/{self.clean_doi(doi)}"
70
-
71
- # Use Playwright to render JavaScript content
72
- async with async_playwright() as p:
73
- browser = await p.chromium.launch() # You can use different browsers
74
- page = await browser.new_page()
75
-
76
- try:
77
- await page.goto(doi_url, timeout=30000)
78
- html_content = await page.content()
79
- except Exception as e:
80
- logger.debug(f"Error trying to navigate {doi}: {e}")
81
- await browser.close()
82
- return None
83
-
84
- soup = BeautifulSoup(html_content, 'html.parser')
85
- await browser.close()
86
-
87
- pdf_patterns = [
88
  r'(https?://[^\s<>"]+?\.pdf)',
89
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
90
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
91
- ]
92
-
93
- pdf_urls = []
94
- for pattern in pdf_patterns:
95
- pdf_urls.extend(re.findall(pattern, html_content))
96
-
97
-
98
- for pdf_url in pdf_urls:
99
- try:
100
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
101
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
102
- logger.debug(f"Found PDF from: {pdf_url}")
103
- return await pdf_response.read()
104
- except Exception as e:
105
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
106
-
107
-
108
- except Exception as e:
109
- logger.debug(f"Error trying to get the PDF from {doi}: {e}")
110
- return None
111
 
112
  async def download_paper_scihub_async(self, session, doi):
113
  """Improved method to download paper from Sci-Hub using async requests"""
@@ -248,13 +279,12 @@ class PaperDownloader:
248
  while retries < max_retries and not pdf_content:
249
  try:
250
  pdf_content = (
251
- await self.download_paper_direct_doi_async(session, doi) or
252
  await self.download_paper_scihub_async(session, doi) or
253
  await self.download_paper_libgen_async(session, doi) or
254
  await self.download_paper_google_scholar_async(session, doi) or
255
  await self.download_paper_crossref_async(session, doi)
256
-
257
- )
258
  if pdf_content:
259
  return pdf_content
260
  except Exception as e:
@@ -418,8 +448,7 @@ class PaperDownloader:
418
  self.download_paper_libgen(doi) or
419
  self.download_paper_google_scholar(doi) or
420
  self.download_paper_crossref(doi)
421
-
422
- )
423
 
424
  if pdf_content:
425
  return pdf_content
@@ -614,6 +643,8 @@ class PaperDownloader:
614
  for file_path in downloaded_files:
615
  zipf.write(file_path, arcname=os.path.basename(file_path))
616
  logger.info(f"ZIP file created: {zip_filename}")
 
 
617
 
618
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
619
 
 
14
  import aiohttp
15
  from playwright.async_api import async_playwright
16
 
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO,
19
  format='%(asctime)s - %(levelname)s: %(message)s')
 
42
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
43
  'Accept-Language': 'en-US,en;q=0.9',
44
  }
45
+
46
+ self.playwright_browser = None
47
+ self.playwright_lock = asyncio.Lock() # Added lock
48
+
49
+
50
async def initialize_playwright(self):
    """Start Playwright and launch the shared Chromium browser if needed.

    Does nothing when ``self.playwright_browser`` is already set. The
    Playwright handle is kept on ``self._playwright`` so that
    ``close_playwright`` can stop the driver later. Errors are logged, not
    raised; on failure ``self.playwright_browser`` is left unset.
    """
    async with self.playwright_lock:
        if self.playwright_browser:
            return

        playwright = None
        try:
            playwright = await async_playwright().start()
            self.playwright_browser = await playwright.chromium.launch()
            self._playwright = playwright
        except Exception as e:
            logger.error(f"Error initializing Playwright browser: {e}")
            # The driver may have started even though the launch failed;
            # stop it so it is not left running.
            if playwright is not None:
                try:
                    await playwright.stop()
                except Exception as stop_error:
                    logger.debug(f"Error stopping Playwright after failed launch: {stop_error}")


async def close_playwright(self):
    """Close the shared browser and stop Playwright.

    Must be called at the end of the execution. Safe to call when nothing
    was initialized. Errors are logged, not raised.
    """
    async with self.playwright_lock:
        browser = self.playwright_browser
        playwright = getattr(self, "_playwright", None)

        # Drop the references first so a failed shutdown never leaves a dead
        # browser cached for the next request.
        self.playwright_browser = None
        self._playwright = None

        if browser:
            try:
                await browser.close()
            except Exception as e:
                logger.error(f"Error closing Playwright browser: {e}")

        if playwright is not None:
            try:
                await playwright.stop()
            except Exception as e:
                logger.error(f"Error stopping Playwright: {e}")
69
+
70
async def get_html_with_playwright(self, doi_url):
    """Render ``doi_url`` in the shared browser and return the page HTML.

    The browser is initialized on demand. Returns ``None`` when the browser
    is unavailable or when navigation or content retrieval fails. The page
    that was opened is closed before returning.
    """
    browser = self.playwright_browser
    if not browser:
        await self.initialize_playwright()
        browser = self.playwright_browser
        if not browser:
            logger.error(f"Playwright browser is not initialized for url: {doi_url}")
            return None

    tab = None
    rendered = None
    try:
        tab = await browser.new_page()
        await tab.goto(doi_url, timeout=30000)
        rendered = await tab.content()
    except Exception as e:
        logger.debug(f"Error navigating or getting content for url: {doi_url}: {e}")
    finally:
        if tab:
            await tab.close()
    return rendered
89
+
90
  def clean_doi(self, doi):
91
  """Clean and encode DOI for URL"""
92
  if not isinstance(doi, str):
 
102
  except Exception as e:
103
  logger.debug(f"Error fetching {url}: {e}")
104
  return None, None
105
+
106
+
107
  async def download_paper_direct_doi_async(self, session, doi):
108
+ """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
109
+ if not doi:
110
+ return None
111
+
112
+ try:
113
+ doi_url = f"https://doi.org/{self.clean_doi(doi)}"
114
+
115
+ html_content = await self.get_html_with_playwright(doi_url)
116
+
117
+ if not html_content:
118
+ return None
119
+
120
+ pdf_patterns = [
 
 
 
 
 
 
 
 
 
 
 
121
  r'(https?://[^\s<>"]+?\.pdf)',
122
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
123
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
124
+ ]
125
+
126
+ pdf_urls = []
127
+ for pattern in pdf_patterns:
128
+ pdf_urls.extend(re.findall(pattern, html_content))
129
+
130
+ for pdf_url in pdf_urls:
131
+ try:
132
+ pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
133
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
134
+ logger.debug(f"Found PDF from: {pdf_url}")
135
+ return await pdf_response.read()
136
+ except Exception as e:
137
+ logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
138
+
139
+ except Exception as e:
140
+ logger.debug(f"Error trying to get the PDF from {doi}: {e}")
141
+ return None
 
 
142
 
143
  async def download_paper_scihub_async(self, session, doi):
144
  """Improved method to download paper from Sci-Hub using async requests"""
 
279
  while retries < max_retries and not pdf_content:
280
  try:
281
  pdf_content = (
282
+ await self.download_paper_direct_doi_async(session, doi) or
283
  await self.download_paper_scihub_async(session, doi) or
284
  await self.download_paper_libgen_async(session, doi) or
285
  await self.download_paper_google_scholar_async(session, doi) or
286
  await self.download_paper_crossref_async(session, doi)
287
+ )
 
288
  if pdf_content:
289
  return pdf_content
290
  except Exception as e:
 
448
  self.download_paper_libgen(doi) or
449
  self.download_paper_google_scholar(doi) or
450
  self.download_paper_crossref(doi)
451
+ )
 
452
 
453
  if pdf_content:
454
  return pdf_content
 
643
  for file_path in downloaded_files:
644
  zipf.write(file_path, arcname=os.path.basename(file_path))
645
  logger.info(f"ZIP file created: {zip_filename}")
646
+
647
+ await self.close_playwright()
648
 
649
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
650