C2MV committed
Commit 9898cdf · verified · 1 Parent(s): ba3a95f

Update app.py
Files changed (1)
  1. app.py +123 -114
app.py CHANGED
@@ -58,23 +58,22 @@ class PaperDownloader:
            logger.debug(f"Error fetching {url}: {e}")
            return None, None

-
    async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
-        """Fetch content and validate if response is PDF, following up to max_redirects redirections with retries."""
-
-        current_url = url
-        redirect_count = 0
-        retry_count = 0
-
-        while redirect_count <= max_redirects:
-            try:
-                while retry_count <= max_retries:
+        """Fetch content and validate if response is PDF, following up to max_redirects redirections with retries."""
+
+        current_url = url
+        redirect_count = 0
+        retry_count = 0
+
+        while redirect_count <= max_redirects:
+            try:
+                while retry_count <= max_retries:
                    try:
                        async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:

-                            if response.status in [301,302, 307,308]:
+                            if response.status in [301, 302, 307, 308]:
                                current_url = response.headers['Location']
-                                redirect_count+=1
+                                redirect_count += 1
                                logger.debug(f"Following redirect from {url} to {current_url}")
                                break # Break out of the retry loop for a redirect

@@ -83,61 +82,63 @@ class PaperDownloader:
                            if 'application/pdf' in response.headers.get('Content-Type', ''):
                                return await response.read()
                            else:
-                                logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
-                                return None
+                                logger.debug(
+                                    f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
+                                return None
                    except Exception as e:
-                        logger.debug(f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
-                        retry_count+=1
-                        await asyncio.sleep(retry_delay)
+                        logger.debug(
+                            f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
+                        retry_count += 1
+                        await asyncio.sleep(retry_delay)

-                retry_count = 0 # Reset the retry count, in case there's a next redirect attempt
-
-            except Exception as e:
-                logger.debug(f"Error getting PDF from {current_url}: {e}")
-                return None
-
-        logger.debug(f"Too many redirects or retries {url}, not following this link further")
-        return None
+                retry_count = 0  # Reset the retry count, in case there's a next redirect attempt
+
+            except Exception as e:
+                logger.debug(f"Error getting PDF from {current_url}: {e}")
+                return None
+
+        logger.debug(f"Too many redirects or retries {url}, not following this link further")
+        return None

    async def download_paper_direct_doi_async(self, session, doi):
-        """Attempt to download the pdf from the landing page of the doi"""
-        if not doi:
-            return None
-
-        try:
-            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-
-            # First, let's try to download the URL directly in case it is already the pdf.
-            pdf_content = await self.fetch_pdf_content(session, doi_url)
-            if pdf_content:
-                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
-                return pdf_content
-
-            # If direct DOI link was not a pdf, fetch landing page and extract links
-            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-            if not text:
-                return None
-
-            pdf_patterns = [
-                r'(https?://[^\s<>"]+?\.pdf)',
-                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-            ]
-
-            pdf_urls = []
-            for pattern in pdf_patterns:
-                pdf_urls.extend(re.findall(pattern, text))
-
-            # Attempt each pdf url and break when you find a PDF content.
-            for pdf_url in pdf_urls:
-                pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-
-        except Exception as e:
-            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-        return None
+        """Attempt to download the pdf from the landing page of the doi"""
+        if not doi:
+            return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            # First, let's try to download the URL directly in case it is already the pdf.
+            pdf_content = await self.fetch_pdf_content(session, doi_url)
+            if pdf_content:
+                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
+                return pdf_content
+
+            # If direct DOI link was not a pdf, fetch landing page and extract links
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            # Attempt each pdf url and break when you find a PDF content.
+            for pdf_url in pdf_urls:
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None

    async def download_paper_scihub_async(self, session, doi):
        """Improved method to download paper from Sci-Hub using async requests"""
@@ -169,7 +170,7 @@ class PaperDownloader:
                if pdf_content:
                    logger.debug(f"Found PDF from: {pdf_url}")
                    return pdf_content
-
+
        except Exception as e:
            logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

@@ -179,20 +180,20 @@ class PaperDownloader:
        """Download from Libgen, handles the query and the redirection"""
        if not doi:
            return None
-
+
        base_url = 'https://libgen.rs/scimag/'
        try:
            search_url = f"{base_url}?q={self.clean_doi(doi)}"
            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
-
+
            if not text or "No results" in text:
                logger.debug(f"No results for DOI: {doi} on libgen")
                return None
-
+
            soup = BeautifulSoup(text, 'html.parser')
-
+
            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
-
+
            if links:
                link = links[0]
                pdf_url = link['href']
@@ -205,36 +206,36 @@ class PaperDownloader:
        return None

    async def download_paper_google_scholar_async(self, session, doi):
-        """Search google scholar to find an article with the given doi, try to get the pdf"""
-        if not doi:
-            return None
-
-        try:
-            query = f'doi:"{doi}"'
-            params = {'q': query}
-            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
-
-            text, headers = await self.fetch_with_headers(session, url, timeout=10)
-            if not text:
-                return None
-
-            soup = BeautifulSoup(text, 'html.parser')
-
-            # Find any links with [PDF]
-            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
-
-            if links:
-                pdf_url = links[0]['href']
-                pdf_content = await self.fetch_pdf_content(session,pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
-
-        except Exception as e:
-            logger.debug(f"Google Scholar error for {doi}: {e}")
-
-        return None
+        """Search google scholar to find an article with the given doi, try to get the pdf"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            text, headers = await self.fetch_with_headers(session, url, timeout=10)
+            if not text:
+                return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None

    async def download_paper_crossref_async(self, session, doi):
        """Alternative search method using Crossref"""
        if not doi:
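
In the Google Scholar strategy above, candidate links are the anchors whose visible text contains "[PDF]". A small self-contained illustration of that selection step; the HTML snippet is invented for the example:

```python
import re

from bs4 import BeautifulSoup

html = """
<div class="gs_r">
  <a href="https://example.org/sample.pdf">[PDF] example.org</a>
  <a href="https://example.org/landing">Ordinary landing page</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
# Match anchors whose text contains "[PDF]", case-insensitively, as in app.py.
links = soup.find_all("a", string=re.compile(r"\[PDF\]", re.IGNORECASE))
pdf_url = links[0]["href"] if links else None
print(pdf_url)  # https://example.org/sample.pdf
```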
@@ -262,7 +263,7 @@ class PaperDownloader:
        except Exception as e:
            logger.debug(f"Crossref error for {doi}: {e}")
        return None
-
+
    async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
        """Downloads a paper using multiple strategies with exponential backoff and async requests"""
        pdf_content = None
@@ -292,7 +293,7 @@ class PaperDownloader:
                delay *= 2 # Exponential backoff

        return None
-
+
    async def download_single_doi_async(self, doi):
        """Downloads a single paper using a DOI"""
        if not doi:
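
`download_with_retry_async` keeps the backoff outside the individual sources: each round tries every strategy once, and only when the whole round misses does the delay double (the `delay *= 2` line above). A compact sketch of that control flow, with the strategies passed in as plain async callables standing in for the Sci-Hub/Libgen/Scholar/Crossref methods:

```python
import asyncio


async def download_with_retry(doi, strategies, max_retries=3, initial_delay=2):
    """Try each strategy in order; if none returns content, back off exponentially and retry."""
    delay = initial_delay
    for _round in range(max_retries):
        for strategy in strategies:
            pdf = await strategy(doi)
            if pdf:
                return pdf
        await asyncio.sleep(delay)
        delay *= 2  # Exponential backoff between rounds.
    return None


async def _always_miss(doi):
    return None

# Example: asyncio.run(download_with_retry("10.1000/example", [_always_miss])) returns None after ~2+4+8 s.
```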
@@ -303,20 +304,21 @@ class PaperDownloader:

            if pdf_content:
                if doi is None:
-                    return None, "Error: DOI not provided", "Error: DOI not provided"
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                filepath = os.path.join(self.output_dir, filename)
-                with open(filepath, 'wb') as f:
-                    f.write(pdf_content)
+                #write the file asynchronously here so it doesn't block
+                loop = asyncio.get_running_loop()
+                await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
                logger.info(f"Successfully downloaded: {filename}")
                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
            else:
-                logger.warning(f"Could not download: {doi}")
-                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'

        except Exception as e:
-            logger.error(f"Error processing {doi}: {e}")
-            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"

    async def download_multiple_dois_async(self, dois_text):
        """Downloads multiple papers from a list of DOIs"""
@@ -344,13 +346,18 @@ class PaperDownloader:

        if downloaded_files:
            zip_filename = 'papers.zip'
-            with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                for file_path in downloaded_files:
-                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            # Zip asynchronously
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
+
+    def create_zip(self, zip_filename, downloaded_files):
+        with zipfile.ZipFile(zip_filename, 'w') as zipf:
+            for file_path in downloaded_files:
+                zipf.write(file_path, arcname=os.path.basename(file_path))
+
    async def process_bibtex_async(self, bib_file):
        """Process BibTeX file and download papers with multiple strategies"""
        # Read BibTeX file content from the uploaded object
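
The new `create_zip` helper keeps the ZIP writing synchronous and lets the async callers push it onto the executor. On Python 3.9+, `asyncio.to_thread` is an equivalent, slightly shorter way to offload it; a sketch under that assumption:

```python
import asyncio
import os
import zipfile


def create_zip(zip_filename, file_paths):
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in file_paths:
            zipf.write(file_path, arcname=os.path.basename(file_path))


async def zip_in_background(zip_filename, file_paths):
    # Runs the blocking zipfile work in a worker thread without blocking the event loop.
    await asyncio.to_thread(create_zip, zip_filename, file_paths)
```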
@@ -389,10 +396,11 @@ class PaperDownloader:
                        return None, "Error: DOI not provided", "Error: DOI not provided"
                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
                    filepath = os.path.join(self.output_dir, filename)
-
-                    with open(filepath, 'wb') as f:
-                        f.write(pdf_content)
-
+
+                    #Write the file asynchronously so it doesn't block the ui.
+                    loop = asyncio.get_running_loop()
+                    await loop.run_in_executor(None, lambda: open(filepath, 'wb').write(pdf_content))
+
                    downloaded_files.append(filepath)
                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
                    logger.info(f"Successfully downloaded: {filename}")
@@ -406,13 +414,13 @@ class PaperDownloader:
        # Create ZIP of downloaded papers
        if downloaded_files:
            zip_filename = 'papers.zip'
-            with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                for file_path in downloaded_files:
-                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            # Zip asynchronously so the main loop is not blocked.
+            loop = asyncio.get_running_loop()
+            await loop.run_in_executor(None, lambda: self.create_zip(zip_filename,downloaded_files) )
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
-
+

def create_gradio_interface():
    """Create Gradio interface for Paper Downloader"""
@@ -434,6 +442,7 @@ def create_gradio_interface():
            return zip_path, downloaded_dois, failed_dois, None
        else:
            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
+
        # Gradio Interface
        interface = gr.Interface(
            fn=download_papers,
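
`download_papers` returns four values (ZIP path, downloaded DOIs, failed DOIs, and a single-file slot), which `gr.Interface` maps onto four output components. The hunk does not show the component list, so the inputs and outputs below are assumptions chosen only to make the sketch self-contained:

```python
import gradio as gr


def download_papers(bib_file, doi_input, dois_input):
    # Stand-in for the real logic in app.py: (zip path, downloaded links, failed DOIs, single file).
    return None, "", "", None


demo = gr.Interface(
    fn=download_papers,
    inputs=[
        gr.File(label="BibTeX file"),      # assumed component
        gr.Textbox(label="Single DOI"),    # assumed component
        gr.Textbox(label="List of DOIs"),  # assumed component
    ],
    outputs=[
        gr.File(label="papers.zip"),
        gr.HTML(label="Downloaded DOIs"),
        gr.HTML(label="Failed DOIs"),
        gr.File(label="Single PDF"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```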
 