C2MV committed on
Commit
4b923db
·
verified ·
1 Parent(s): 0e74018

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -31
app.py CHANGED
@@ -57,6 +57,19 @@ class PaperDownloader:
57
  except Exception as e:
58
  logger.debug(f"Error fetching {url}: {e}")
59
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  async def download_paper_direct_doi_async(self, session, doi):
@@ -81,13 +94,10 @@ class PaperDownloader:
81
  pdf_urls.extend(re.findall(pattern, text))
82
 
83
  for pdf_url in pdf_urls:
84
- try:
85
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
- logger.debug(f"Found PDF from: {pdf_url}")
88
- return await pdf_response.read()
89
- except Exception as e:
90
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
 
92
 
93
  except Exception as e:
@@ -121,14 +131,11 @@ class PaperDownloader:
121
 
122
  # Try downloading from found URLs
123
  for pdf_url in pdf_urls:
124
- try:
125
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
- # Verify if it's a PDF
127
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
- logger.debug(f"Found PDF from: {pdf_url}")
129
- return await pdf_response.read()
130
- except Exception as e:
131
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
 
133
  except Exception as e:
134
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
@@ -156,10 +163,10 @@ class PaperDownloader:
156
  if links:
157
  link = links[0]
158
  pdf_url = link['href']
159
- pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
  logger.debug(f"Found PDF from: {pdf_url}")
162
- return await pdf_response.read()
163
  except Exception as e:
164
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
  return None
@@ -185,10 +192,10 @@ class PaperDownloader:
185
 
186
  if links:
187
  pdf_url = links[0]['href']
188
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
- logger.debug(f"Found PDF from: {pdf_url}")
191
- return await pdf_response.read()
192
  except Exception as e:
193
  logger.debug(f"Google Scholar error for {doi}: {e}")
194
 
@@ -214,13 +221,11 @@ class PaperDownloader:
214
  if link.get('content-type') == 'application/pdf':
215
  pdf_url = link.get('URL')
216
  if pdf_url:
217
- try:
218
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
219
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
220
- logger.debug(f"Found PDF from: {pdf_url}")
221
- return await pdf_response.read()
222
- except Exception as e:
223
- logger.debug(f"Error fetching from {pdf_url}")
224
 
225
  except Exception as e:
226
  logger.debug(f"Crossref error for {doi}: {e}")
@@ -315,7 +320,7 @@ class PaperDownloader:
315
  logger.info(f"ZIP file created: {zip_filename}")
316
 
317
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
318
-
319
  async def process_bibtex_async(self, bib_file):
320
  """Process BibTeX file and download papers with multiple strategies"""
321
  # Read BibTeX file content from the uploaded object
@@ -378,6 +383,7 @@ class PaperDownloader:
378
 
379
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
380
 
 
381
  def create_gradio_interface():
382
  """Create Gradio interface for Paper Downloader"""
383
  downloader = PaperDownloader()
@@ -399,7 +405,6 @@ def create_gradio_interface():
399
  else:
400
  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
401
 
402
-
403
  # Gradio Interface
404
  interface = gr.Interface(
405
  fn=download_papers,
 
57
  except Exception as e:
58
  logger.debug(f"Error fetching {url}: {e}")
59
  return None, None
60
+
61
+ async def fetch_pdf_content(self, session, url):
62
+ """Fetch and validate if the content of a request is actually PDF."""
63
+ try:
64
+ async with session.get(url, headers=self.headers, timeout=10) as pdf_response:
65
+ if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
66
+ return await pdf_response.read()
67
+ else:
68
+ logger.debug(f"Content type not PDF for {url}: {pdf_response.headers.get('Content-Type', '')}")
69
+ return None
70
+ except Exception as e:
71
+ logger.debug(f"Error getting PDF {url}: {e}")
72
+ return None
73
 
74
 
75
  async def download_paper_direct_doi_async(self, session, doi):
 
94
  pdf_urls.extend(re.findall(pattern, text))
95
 
96
  for pdf_url in pdf_urls:
97
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
98
+ if pdf_content:
99
+ logger.debug(f"Found PDF from: {pdf_url}")
100
+ return pdf_content
 
 
 
101
 
102
 
103
  except Exception as e:
 
131
 
132
  # Try downloading from found URLs
133
  for pdf_url in pdf_urls:
134
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
135
+ if pdf_content:
136
+ logger.debug(f"Found PDF from: {pdf_url}")
137
+ return pdf_content
138
+
 
 
 
139
 
140
  except Exception as e:
141
  logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
 
163
  if links:
164
  link = links[0]
165
  pdf_url = link['href']
166
+ pdf_content = await self.fetch_pdf_content(session,pdf_url)
167
+ if pdf_content:
168
  logger.debug(f"Found PDF from: {pdf_url}")
169
+ return pdf_content
170
  except Exception as e:
171
  logger.debug(f"Error trying to download {doi} from libgen: {e}")
172
  return None
 
192
 
193
  if links:
194
  pdf_url = links[0]['href']
195
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
196
+ if pdf_content:
197
+ logger.debug(f"Found PDF from: {pdf_url}")
198
+ return pdf_content
199
  except Exception as e:
200
  logger.debug(f"Google Scholar error for {doi}: {e}")
201
 
 
221
  if link.get('content-type') == 'application/pdf':
222
  pdf_url = link.get('URL')
223
  if pdf_url:
224
+ pdf_content = await self.fetch_pdf_content(session, pdf_url)
225
+ if pdf_content:
226
+ logger.debug(f"Found PDF from: {pdf_url}")
227
+ return pdf_content
228
+
 
 
229
 
230
  except Exception as e:
231
  logger.debug(f"Crossref error for {doi}: {e}")
 
320
  logger.info(f"ZIP file created: {zip_filename}")
321
 
322
  return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
323
+
324
  async def process_bibtex_async(self, bib_file):
325
  """Process BibTeX file and download papers with multiple strategies"""
326
  # Read BibTeX file content from the uploaded object
 
383
 
384
  return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
385
 
386
+
387
  def create_gradio_interface():
388
  """Create Gradio interface for Paper Downloader"""
389
  downloader = PaperDownloader()
 
405
  else:
406
  return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
407
 
 
408
  # Gradio Interface
409
  interface = gr.Interface(
410
  fn=download_papers,