C2MV committed
Commit ba3a95f (verified) · 1 parent: 6f7150a

Update app.py

Files changed (1):
  1. app.py (+53 -38)
app.py CHANGED
@@ -59,37 +59,46 @@ class PaperDownloader:
         return None, None
 
-    async def fetch_pdf_content(self, session, url, max_redirects=5):
-        """Fetch content and validate if response is PDF, following up to max_redirects redirections."""
+    async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
+        """Fetch content and validate if response is PDF, following up to max_redirects redirections with retries."""
         current_url = url
         redirect_count = 0
+        retry_count = 0
 
         while redirect_count <= max_redirects:
             try:
-                async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
-
-                    if response.status in [301, 302, 307, 308]:
-                        current_url = response.headers['Location']
-                        redirect_count += 1
-                        logger.debug(f"Following redirect from {url} to {current_url}")
-                        continue
-
-                    response.raise_for_status()
-                    if 'application/pdf' in response.headers.get('Content-Type', ''):
-                        return await response.read()
-                    else:
-                        logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
-                        return None
-
+                while retry_count <= max_retries:
+                    try:
+                        async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
+
+                            if response.status in [301, 302, 307, 308]:
+                                current_url = response.headers['Location']
+                                redirect_count += 1
+                                logger.debug(f"Following redirect from {url} to {current_url}")
+                                break  # Break out of the retry loop for a redirect
+
+                            response.raise_for_status()
+
+                            if 'application/pdf' in response.headers.get('Content-Type', ''):
+                                return await response.read()
+                            else:
+                                logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
+                                return None
+                    except Exception as e:
+                        logger.debug(f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
+                        retry_count += 1
+                        await asyncio.sleep(retry_delay)
+
+                retry_count = 0  # Reset the retry count, in case there's a next redirect attempt
+
             except Exception as e:
-                logger.debug(f"Error getting PDF from {current_url}: {e}")
-                return None
-        logger.debug(f"Too many redirects {url}, not following this link further")
+                logger.debug(f"Error getting PDF from {current_url}: {e}")
+                return None
+
+        logger.debug(f"Too many redirects or retries {url}, not following this link further")
         return None
-
-
-
+
     async def download_paper_direct_doi_async(self, session, doi):
         """Attempt to download the pdf from the landing page of the doi"""
         if not doi:
@@ -97,6 +106,14 @@ class PaperDownloader:
 
         try:
             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            # First, let's try to download the URL directly in case it is already the pdf.
+            pdf_content = await self.fetch_pdf_content(session, doi_url)
+            if pdf_content:
+                logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
+                return pdf_content
+
+            # If direct DOI link was not a pdf, fetch landing page and extract links
             text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
             if not text:
                 return None
@@ -111,11 +128,12 @@ class PaperDownloader:
             for pattern in pdf_patterns:
                 pdf_urls.extend(re.findall(pattern, text))
 
+            # Attempt each pdf url and break when you find a PDF content.
             for pdf_url in pdf_urls:
-                pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                if pdf_content:
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return pdf_content
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
 
         except Exception as e:
             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
@@ -145,18 +163,18 @@ class PaperDownloader:
                 for pattern in pdf_patterns:
                     pdf_urls.extend(re.findall(pattern, text))
 
-                # Try downloading from found URLs
+                # Try downloading from found URLs, but iterate over ALL
                 for pdf_url in pdf_urls:
                     pdf_content = await self.fetch_pdf_content(session, pdf_url)
                     if pdf_content:
                         logger.debug(f"Found PDF from: {pdf_url}")
                         return pdf_content
-
+
             except Exception as e:
                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
 
         return None
-
+
     async def download_paper_libgen_async(self, session, doi):
         """Download from Libgen, handles the query and the redirection"""
         if not doi:
@@ -185,7 +203,7 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from libgen: {e}")
         return None
-
+
     async def download_paper_google_scholar_async(self, session, doi):
         """Search google scholar to find an article with the given doi, try to get the pdf"""
         if not doi:
@@ -216,7 +234,7 @@ class PaperDownloader:
             logger.debug(f"Google Scholar error for {doi}: {e}")
 
         return None
-
+
     async def download_paper_crossref_async(self, session, doi):
         """Alternative search method using Crossref"""
         if not doi:
@@ -244,7 +262,7 @@ class PaperDownloader:
         except Exception as e:
            logger.debug(f"Crossref error for {doi}: {e}")
         return None
-
+
     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
         pdf_content = None
@@ -299,7 +317,7 @@ class PaperDownloader:
         except Exception as e:
             logger.error(f"Error processing {doi}: {e}")
             return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
-
+
     async def download_multiple_dois_async(self, dois_text):
         """Downloads multiple papers from a list of DOIs"""
         if not dois_text:
@@ -395,6 +413,7 @@ class PaperDownloader:
 
         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 
+
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
@@ -415,8 +434,6 @@ def create_gradio_interface():
             return zip_path, downloaded_dois, failed_dois, None
         else:
            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
-
-
     # Gradio Interface
     interface = gr.Interface(
         fn=download_papers,
@@ -492,11 +509,9 @@ def create_gradio_interface():
     """
     return interface
 
-
 def main():
     interface = create_gradio_interface()
     interface.launch(share=True)
 
-
 if __name__ == "__main__":
     main()
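Note on the new retry logic: fetch_pdf_content now wraps each hop of its manual redirect-following in an inner retry loop, sleeping retry_delay seconds between attempts and resetting the retry budget after every redirect. A minimal standalone sketch of the same pattern, assuming aiohttp and the stdlib logging module (names are illustrative; unlike the committed version, this variant gives up once a single hop exhausts its retries instead of re-entering the outer loop):

import asyncio
import logging

import aiohttp

logger = logging.getLogger(__name__)

async def fetch_pdf(session, url, max_redirects=5, max_retries=2, retry_delay=1):
    """Follow redirects manually, giving each hop its own retry budget."""
    current_url = url
    for _ in range(max_redirects + 1):
        for attempt in range(max_retries + 1):
            try:
                async with session.get(current_url,
                                       timeout=aiohttp.ClientTimeout(total=10),
                                       allow_redirects=False) as response:
                    if response.status in (301, 302, 307, 308):
                        current_url = response.headers['Location']
                        break  # leave the retry loop and follow the redirect
                    response.raise_for_status()
                    if 'application/pdf' in response.headers.get('Content-Type', ''):
                        return await response.read()
                    return None  # resolved, but the response is not a PDF
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.debug(f"Attempt {attempt}/{max_retries} failed for {current_url}: {e}")
                await asyncio.sleep(retry_delay)
        else:
            return None  # retries exhausted without reaching a redirect or a PDF
    return None  # exceeded max_redirects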
 
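The second substantive change is in download_paper_direct_doi_async: the doi.org URL is now tried directly (it may already serve the PDF) before the landing page is fetched and scraped for PDF links. A sketch of that flow under the same assumptions, reusing fetch_pdf from above (the regex is a stand-in for the app's pdf_patterns list, and fetch_with_headers is not reproduced):

import re

import aiohttp

# Stand-in for the app's pdf_patterns; matches href attributes pointing at .pdf files.
PDF_LINK_PATTERN = r'href=["\']([^"\']+\.pdf[^"\']*)["\']'

async def pdf_from_doi(session, doi):
    """Try the DOI URL directly, then fall back to scraping the landing page."""
    doi_url = f"https://doi.org/{doi}"

    # 1) The DOI may resolve straight to a PDF.
    pdf = await fetch_pdf(session, doi_url)
    if pdf:
        return pdf

    # 2) Otherwise fetch the landing page and collect candidate PDF links.
    async with session.get(doi_url, timeout=aiohttp.ClientTimeout(total=15)) as response:
        text = await response.text()
    candidates = re.findall(PDF_LINK_PATTERN, text)

    # 3) Attempt every candidate, returning the first that yields PDF bytes.
    for url in candidates:
        pdf = await fetch_pdf(session, url)
        if pdf:
            return pdf
    return None

Usage, with a placeholder DOI:

async def demo():
    async with aiohttp.ClientSession() as session:
        pdf = await pdf_from_doi(session, "10.1000/xyz123")  # placeholder DOI
        print("PDF found" if pdf else "no PDF")

asyncio.run(demo())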
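Finally, the unchanged docstring of download_with_retry_async describes the overall scheme: cycle through multiple download strategies with exponential backoff between rounds. That method's body is not part of this diff, so the following is only an illustration of the pattern the docstring names, not the app's actual implementation (strategy names echo the methods visible above):

import asyncio

async def download_with_retry(session, doi, strategies, max_retries=3, initial_delay=2):
    """Try each strategy in turn; double the wait between unsuccessful rounds."""
    delay = initial_delay
    for _ in range(max_retries):
        for strategy in strategies:
            pdf = await strategy(session, doi)
            if pdf:
                return pdf
        await asyncio.sleep(delay)  # every strategy failed this round; back off
        delay *= 2  # exponential backoff
    return None

# e.g. strategies = [downloader.download_paper_direct_doi_async,
#                    downloader.download_paper_libgen_async,
#                    downloader.download_paper_google_scholar_async,
#                    downloader.download_paper_crossref_async]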