C2MV committed (verified)
Commit 6f7150a · 1 Parent(s): 4b923db

Update app.py

Files changed (1)
  1. app.py +229 -217
app.py CHANGED
@@ -47,100 +47,115 @@ class PaperDownloader:
  if not isinstance(doi, str):
  return None
  return quote(doi.strip()) if doi else None
-
+
  async def fetch_with_headers(self, session, url, timeout=10):
- """Utility method to fetch an URL with headers and timeout"""
- try:
- async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
- response.raise_for_status()
- return await response.text(), response.headers
- except Exception as e:
- logger.debug(f"Error fetching {url}: {e}")
- return None, None
+ """Utility method to fetch an URL with headers and timeout"""
+ try:
+ async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
+ response.raise_for_status()
+ return await response.text(), response.headers
+ except Exception as e:
+ logger.debug(f"Error fetching {url}: {e}")
+ return None, None

- async def fetch_pdf_content(self, session, url):
- """Fetch and validate if the content of a request is actually PDF."""
- try:
- async with session.get(url, headers=self.headers, timeout=10) as pdf_response:
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
- return await pdf_response.read()
- else:
- logger.debug(f"Content type not PDF for {url}: {pdf_response.headers.get('Content-Type', '')}")
- return None
- except Exception as e:
- logger.debug(f"Error getting PDF {url}: {e}")
- return None
-
-
- async def download_paper_direct_doi_async(self, session, doi):
- """Attempt to download the pdf from the landing page of the doi"""
- if not doi:
- return None
-
- try:
- doi_url = f"https://doi.org/{self.clean_doi(doi)}"
- text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
- if not text:
- return None

- pdf_patterns = [
- r'(https?://[^\s<>"]+?\.pdf)',
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
- ]
+ async def fetch_pdf_content(self, session, url, max_redirects=5):
+ """Fetch content and validate if response is PDF, following up to max_redirects redirections."""

- pdf_urls = []
- for pattern in pdf_patterns:
- pdf_urls.extend(re.findall(pattern, text))
+ current_url = url
+ redirect_count = 0

- for pdf_url in pdf_urls:
- pdf_content = await self.fetch_pdf_content(session,pdf_url)
- if pdf_content:
- logger.debug(f"Found PDF from: {pdf_url}")
- return pdf_content
+ while redirect_count <= max_redirects:
+ try:
+ async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
+
+ if response.status in [301,302, 307,308]:
+ current_url = response.headers['Location']
+ redirect_count+=1
+ logger.debug(f"Following redirect from {url} to {current_url}")
+ continue
+
+ response.raise_for_status()
+ if 'application/pdf' in response.headers.get('Content-Type', ''):
+ return await response.read()
+ else:
+ logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
+ return None

+ except Exception as e:
+ logger.debug(f"Error getting PDF from {current_url}: {e}")
+ return None
+ logger.debug(f"Too many redirects {url}, not following this link further")
+ return None

- except Exception as e:
- logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-
- return None

- async def download_paper_scihub_async(self, session, doi):
- """Improved method to download paper from Sci-Hub using async requests"""
- if not doi:
- logger.warning("DOI not provided")
- return None

- for base_url in self.download_sources:
+ async def download_paper_direct_doi_async(self, session, doi):
+ """Attempt to download the pdf from the landing page of the doi"""
+ if not doi:
+ return None
+
  try:
- scihub_url = f"{base_url}{self.clean_doi(doi)}"
- text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
- if not text:
- continue
+ doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+ text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+ if not text:
+ return None

- # Search for multiple PDF URL patterns
- pdf_patterns = [
+ pdf_patterns = [
  r'(https?://[^\s<>"]+?\.pdf)',
  r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
  r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
- ]
-
- pdf_urls = []
- for pattern in pdf_patterns:
+ ]
+
+ pdf_urls = []
+ for pattern in pdf_patterns:
  pdf_urls.extend(re.findall(pattern, text))

- # Try downloading from found URLs
- for pdf_url in pdf_urls:
- pdf_content = await self.fetch_pdf_content(session, pdf_url)
- if pdf_content:
- logger.debug(f"Found PDF from: {pdf_url}")
- return pdf_content
-
-
+ for pdf_url in pdf_urls:
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
+ if pdf_content:
+ logger.debug(f"Found PDF from: {pdf_url}")
+ return pdf_content
+
  except Exception as e:
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-
- return None
+ logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+ return None
+
+ async def download_paper_scihub_async(self, session, doi):
+ """Improved method to download paper from Sci-Hub using async requests"""
+ if not doi:
+ logger.warning("DOI not provided")
+ return None
+
+ for base_url in self.download_sources:
+ try:
+ scihub_url = f"{base_url}{self.clean_doi(doi)}"
+ text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+ if not text:
+ continue
+
+ # Search for multiple PDF URL patterns
+ pdf_patterns = [
+ r'(https?://[^\s<>"]+?\.pdf)',
+ r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+ r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+ ]
+
+ pdf_urls = []
+ for pattern in pdf_patterns:
+ pdf_urls.extend(re.findall(pattern, text))
+
+ # Try downloading from found URLs
+ for pdf_url in pdf_urls:
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
+ if pdf_content:
+ logger.debug(f"Found PDF from: {pdf_url}")
+ return pdf_content
+
+ except Exception as e:
+ logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+ return None

  async def download_paper_libgen_async(self, session, doi):
  """Download from Libgen, handles the query and the redirection"""
@@ -163,14 +178,14 @@ class PaperDownloader:
  if links:
  link = links[0]
  pdf_url = link['href']
- pdf_content = await self.fetch_pdf_content(session,pdf_url)
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
  if pdf_content:
  logger.debug(f"Found PDF from: {pdf_url}")
  return pdf_content
  except Exception as e:
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
  return None
-
+
  async def download_paper_google_scholar_async(self, session, doi):
  """Search google scholar to find an article with the given doi, try to get the pdf"""
  if not doi:
@@ -192,46 +207,44 @@ class PaperDownloader:

  if links:
  pdf_url = links[0]['href']
- pdf_content = await self.fetch_pdf_content(session, pdf_url)
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
  if pdf_content:
- logger.debug(f"Found PDF from: {pdf_url}")
- return pdf_content
+ logger.debug(f"Found PDF from: {pdf_url}")
+ return pdf_content
+
  except Exception as e:
  logger.debug(f"Google Scholar error for {doi}: {e}")

  return None

  async def download_paper_crossref_async(self, session, doi):
- """Alternative search method using Crossref"""
- if not doi:
+ """Alternative search method using Crossref"""
+ if not doi:
+ return None
+
+ try:
+ # Search for open access link
+ url = f"https://api.crossref.org/works/{doi}"
+ response = await session.get(url, headers=self.headers, timeout=10)
+
+ if response.status == 200:
+ data = await response.json()
+ work = data.get('message', {})
+
+ # Search for open access links
+ links = work.get('link', [])
+ for link in links:
+ if link.get('content-type') == 'application/pdf':
+ pdf_url = link.get('URL')
+ if pdf_url:
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
+ if pdf_content:
+ logger.debug(f"Found PDF from: {pdf_url}")
+ return pdf_content
+ except Exception as e:
+ logger.debug(f"Crossref error for {doi}: {e}")
  return None

- try:
- # Search for open access link
- url = f"https://api.crossref.org/works/{doi}"
- response = await session.get(url, headers=self.headers, timeout=10)
-
- if response.status == 200:
- data = await response.json()
- work = data.get('message', {})
-
- # Search for open access links
- links = work.get('link', [])
- for link in links:
- if link.get('content-type') == 'application/pdf':
- pdf_url = link.get('URL')
- if pdf_url:
- pdf_content = await self.fetch_pdf_content(session, pdf_url)
- if pdf_content:
- logger.debug(f"Found PDF from: {pdf_url}")
- return pdf_content
-
-
- except Exception as e:
- logger.debug(f"Crossref error for {doi}: {e}")
-
- return None
-
  async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
  """Downloads a paper using multiple strategies with exponential backoff and async requests"""
  pdf_content = None
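
The Crossref branch in the hunk above queries the REST API's works endpoint and scans message.link for entries whose content-type is application/pdf. A small self-contained sketch of that lookup, assuming aiohttp; the function name and the placeholder DOI are illustrative only:

import asyncio
from typing import List

import aiohttp

async def crossref_pdf_links(doi: str) -> List[str]:
    """Return URLs that Crossref's works endpoint advertises as application/pdf."""
    url = f"https://api.crossref.org/works/{doi}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
            if response.status != 200:
                return []
            data = await response.json()
    links = data.get("message", {}).get("link", [])
    return [entry["URL"] for entry in links
            if entry.get("content-type") == "application/pdf" and entry.get("URL")]

if __name__ == "__main__":
    # Placeholder DOI; substitute a real one when trying this out.
    print(asyncio.run(crossref_pdf_links("10.1234/example-doi")))
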
@@ -261,129 +274,127 @@ class PaperDownloader:
  delay *= 2  # Exponential backoff

  return None
-

  async def download_single_doi_async(self, doi):
- """Downloads a single paper using a DOI"""
- if not doi:
- return None, "Error: DOI not provided", "Error: DOI not provided"
-
- try:
- pdf_content = await self.download_with_retry_async(doi)
-
- if pdf_content:
- if doi is None:
- return None, "Error: DOI not provided", "Error: DOI not provided"
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
- filepath = os.path.join(self.output_dir, filename)
- with open(filepath, 'wb') as f:
- f.write(pdf_content)
- logger.info(f"Successfully downloaded: {filename}")
- return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
- else:
- logger.warning(f"Could not download: {doi}")
- return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
-
- except Exception as e:
- logger.error(f"Error processing {doi}: {e}")
- return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
-
- async def download_multiple_dois_async(self, dois_text):
- """Downloads multiple papers from a list of DOIs"""
- if not dois_text:
- return None, "Error: No DOIs provided", "Error: No DOIs provided"
-
- dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
- if not dois:
- return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
-
- downloaded_files = []
- failed_dois = []
- downloaded_links = []
- for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
- filepath, success_message, fail_message = await self.download_single_doi_async(doi)
- if filepath:
- # Unique filename for zip
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
- filepath_unique = os.path.join(self.output_dir, filename)
- os.rename(filepath, filepath_unique)
- downloaded_files.append(filepath_unique)
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
- else:
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
- if downloaded_files:
- zip_filename = 'papers.zip'
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
- for file_path in downloaded_files:
- zipf.write(file_path, arcname=os.path.basename(file_path))
- logger.info(f"ZIP file created: {zip_filename}")
-
- return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+ """Downloads a single paper using a DOI"""
+ if not doi:
+ return None, "Error: DOI not provided", "Error: DOI not provided"

- async def process_bibtex_async(self, bib_file):
- """Process BibTeX file and download papers with multiple strategies"""
- # Read BibTeX file content from the uploaded object
- try:
- with open(bib_file.name, 'r', encoding='utf-8') as f:
- bib_content = f.read()
- except Exception as e:
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
-
- # Parse BibTeX data
- try:
- bib_database = bibtexparser.loads(bib_content)
- except Exception as e:
- logger.error(f"Error parsing BibTeX data: {e}")
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
-
- # Extract DOIs
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
- logger.info(f"Found {len(dois)} DOIs to download")
-
- # Result lists
- downloaded_files = []
- failed_dois = []
- downloaded_links = []
-
- # Download PDFs
- for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
  try:
- # Try to download with multiple methods with retries
  pdf_content = await self.download_with_retry_async(doi)
-
- # Save PDF
+
  if pdf_content:
  if doi is None:
  return None, "Error: DOI not provided", "Error: DOI not provided"
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
  filepath = os.path.join(self.output_dir, filename)
-
  with open(filepath, 'wb') as f:
  f.write(pdf_content)
-
- downloaded_files.append(filepath)
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
  logger.info(f"Successfully downloaded: {filename}")
+ return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
  else:
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
+ logger.warning(f"Could not download: {doi}")
+ return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
+
  except Exception as e:
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
  logger.error(f"Error processing {doi}: {e}")
+ return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"

- # Create ZIP of downloaded papers
- if downloaded_files:
- zip_filename = 'papers.zip'
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
- for file_path in downloaded_files:
- zipf.write(file_path, arcname=os.path.basename(file_path))
- logger.info(f"ZIP file created: {zip_filename}")
-
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
-
-
+ async def download_multiple_dois_async(self, dois_text):
+ """Downloads multiple papers from a list of DOIs"""
+ if not dois_text:
+ return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+ dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+ if not dois:
+ return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+ downloaded_files = []
+ failed_dois = []
+ downloaded_links = []
+ for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+ filepath, success_message, fail_message = await self.download_single_doi_async(doi)
+ if filepath:
+ # Unique filename for zip
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+ filepath_unique = os.path.join(self.output_dir, filename)
+ os.rename(filepath, filepath_unique)
+ downloaded_files.append(filepath_unique)
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+ else:
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+ if downloaded_files:
+ zip_filename = 'papers.zip'
+ with zipfile.ZipFile(zip_filename, 'w') as zipf:
+ for file_path in downloaded_files:
+ zipf.write(file_path, arcname=os.path.basename(file_path))
+ logger.info(f"ZIP file created: {zip_filename}")
+
+ return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
+ async def process_bibtex_async(self, bib_file):
+ """Process BibTeX file and download papers with multiple strategies"""
+ # Read BibTeX file content from the uploaded object
+ try:
+ with open(bib_file.name, 'r', encoding='utf-8') as f:
+ bib_content = f.read()
+ except Exception as e:
+ logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+ return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"
+
+ # Parse BibTeX data
+ try:
+ bib_database = bibtexparser.loads(bib_content)
+ except Exception as e:
+ logger.error(f"Error parsing BibTeX data: {e}")
+ return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"
+
+ # Extract DOIs
+ dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+ logger.info(f"Found {len(dois)} DOIs to download")
+
+ # Result lists
+ downloaded_files = []
+ failed_dois = []
+ downloaded_links = []
+
+ # Download PDFs
+ for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+ try:
+ # Try to download with multiple methods with retries
+ pdf_content = await self.download_with_retry_async(doi)
+
+ # Save PDF
+ if pdf_content:
+ if doi is None:
+ return None, "Error: DOI not provided", "Error: DOI not provided"
+ filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+ filepath = os.path.join(self.output_dir, filename)
+
+ with open(filepath, 'wb') as f:
+ f.write(pdf_content)
+
+ downloaded_files.append(filepath)
+ downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+ logger.info(f"Successfully downloaded: {filename}")
+ else:
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+ except Exception as e:
+ failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+ logger.error(f"Error processing {doi}: {e}")
+
+ # Create ZIP of downloaded papers
+ if downloaded_files:
+ zip_filename = 'papers.zip'
+ with zipfile.ZipFile(zip_filename, 'w') as zipf:
+ for file_path in downloaded_files:
+ zipf.write(file_path, arcname=os.path.basename(file_path))
+ logger.info(f"ZIP file created: {zip_filename}")
+
+ return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
+
  def create_gradio_interface():
  """Create Gradio interface for Paper Downloader"""
  downloader = PaperDownloader()
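
download_with_retry_async, whose tail appears as context at the top of the hunk above, doubles its delay between attempts (delay *= 2). As a rough sketch of that retry shape, not the actual implementation in app.py, a generic async retry helper could look like this:

import asyncio
import logging
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")
logger = logging.getLogger(__name__)

async def retry_with_backoff(task: Callable[[], Awaitable[Optional[T]]],
                             max_retries: int = 3,
                             initial_delay: float = 2.0) -> Optional[T]:
    """Run task up to max_retries times, doubling the wait after each empty result."""
    delay = initial_delay
    for attempt in range(1, max_retries + 1):
        result = await task()
        if result:
            return result
        logger.debug("Attempt %d returned nothing; sleeping %.1fs before retrying", attempt, delay)
        await asyncio.sleep(delay)
        delay *= 2  # exponential backoff
    return None
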
@@ -405,6 +416,7 @@ def create_gradio_interface():
  else:
  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None

+
  # Gradio Interface
  interface = gr.Interface(
  fn=download_papers,
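
download_papers, the fn wired into gr.Interface above, has to drive the async download helpers from a synchronous Gradio callback. One common way to bridge that gap is asyncio.run; the sketch below is illustrative only, with assumed parameter names and branching, and is not the callback defined in app.py:

import asyncio

def download_papers(bib_file, doi_input, dois_input):
    """Illustrative synchronous wrapper around the async downloader (assumed signature)."""
    downloader = PaperDownloader()  # class defined earlier in app.py
    if doi_input:
        # asyncio.run starts an event loop, awaits the coroutine, and tears the loop down.
        filepath, ok_html, fail_html = asyncio.run(downloader.download_single_doi_async(doi_input))
        return filepath, ok_html, fail_html, None
    return None, "Please provide a .bib file, a single DOI, or a list of DOIs", \
           "Please provide a .bib file, a single DOI, or a list of DOIs", None
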