Update app.py
app.py
CHANGED
@@ -57,6 +57,19 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
+
+    async def fetch_pdf_content(self, session, url):
+        """Fetch and validate if the content of a request is actually PDF."""
+        try:
+            async with session.get(url, headers=self.headers, timeout=10) as pdf_response:
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    return await pdf_response.read()
+                else:
+                    logger.debug(f"Content type not PDF for {url}: {pdf_response.headers.get('Content-Type', '')}")
+                    return None
+        except Exception as e:
+            logger.debug(f"Error getting PDF {url}: {e}")
+            return None
 
 
     async def download_paper_direct_doi_async(self, session, doi):
@@ -81,13 +94,10 @@ class PaperDownloader:
             pdf_urls.extend(re.findall(pattern, text))
 
             for pdf_url in pdf_urls:
-
-
-
-
-                        return await pdf_response.read()
-                except Exception as e:
-                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+                pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
 
 
         except Exception as e:
@@ -121,14 +131,11 @@ class PaperDownloader:
 
             # Try downloading from found URLs
             for pdf_url in pdf_urls:
-
-
-
-
-
-                        return await pdf_response.read()
-                except Exception as e:
-                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
+
 
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
@@ -156,10 +163,10 @@ class PaperDownloader:
             if links:
                 link = links[0]
                 pdf_url = link['href']
-
-                if
+                pdf_content = await self.fetch_pdf_content(session,pdf_url)
+                if pdf_content:
                     logger.debug(f"Found PDF from: {pdf_url}")
-                    return
+                    return pdf_content
         except Exception as e:
             logger.debug(f"Error trying to download {doi} from libgen: {e}")
             return None
@@ -185,10 +192,10 @@ class PaperDownloader:
 
             if links:
                 pdf_url = links[0]['href']
-
-                if
-
-
+                pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                if pdf_content:
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_content
         except Exception as e:
             logger.debug(f"Google Scholar error for {doi}: {e}")
 
@@ -214,13 +221,11 @@ class PaperDownloader:
                 if link.get('content-type') == 'application/pdf':
                     pdf_url = link.get('URL')
                     if pdf_url:
-
-
-
-
-
-                except Exception as e:
-                    logger.debug(f"Error fetching from {pdf_url}")
+                        pdf_content = await self.fetch_pdf_content(session, pdf_url)
+                        if pdf_content:
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return pdf_content
+
 
         except Exception as e:
             logger.debug(f"Crossref error for {doi}: {e}")
@@ -315,7 +320,7 @@ class PaperDownloader:
         logger.info(f"ZIP file created: {zip_filename}")
 
         return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
+
     async def process_bibtex_async(self, bib_file):
         """Process BibTeX file and download papers with multiple strategies"""
         # Read BibTeX file content from the uploaded object
@@ -378,6 +383,7 @@ class PaperDownloader:
 
         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
 
+
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
@@ -399,7 +405,6 @@ def create_gradio_interface():
         else:
             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
 
-
    # Gradio Interface
    interface = gr.Interface(
        fn=download_papers,