Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -59,37 +59,46 @@ class PaperDownloader:
|
|
59 |
return None, None
|
60 |
|
61 |
|
62 |
-
async def fetch_pdf_content(self, session, url, max_redirects=5):
|
63 |
-
"""Fetch content and validate if response is PDF, following up to max_redirects redirections."""
|
64 |
|
65 |
current_url = url
|
66 |
redirect_count = 0
|
|
|
67 |
|
68 |
while redirect_count <= max_redirects:
|
69 |
try:
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
except Exception as e:
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
89 |
return None
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
async def download_paper_direct_doi_async(self, session, doi):
|
94 |
"""Attempt to download the pdf from the landing page of the doi"""
|
95 |
if not doi:
|
@@ -97,6 +106,14 @@ class PaperDownloader:
|
|
97 |
|
98 |
try:
|
99 |
doi_url = f"https://doi.org/{self.clean_doi(doi)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
|
101 |
if not text:
|
102 |
return None
|
@@ -111,11 +128,12 @@ class PaperDownloader:
|
|
111 |
for pattern in pdf_patterns:
|
112 |
pdf_urls.extend(re.findall(pattern, text))
|
113 |
|
|
|
114 |
for pdf_url in pdf_urls:
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
|
120 |
except Exception as e:
|
121 |
logger.debug(f"Error trying to get the PDF from {doi}: {e}")
|
@@ -145,18 +163,18 @@ class PaperDownloader:
|
|
145 |
for pattern in pdf_patterns:
|
146 |
pdf_urls.extend(re.findall(pattern, text))
|
147 |
|
148 |
-
# Try downloading from found URLs
|
149 |
for pdf_url in pdf_urls:
|
150 |
pdf_content = await self.fetch_pdf_content(session,pdf_url)
|
151 |
if pdf_content:
|
152 |
logger.debug(f"Found PDF from: {pdf_url}")
|
153 |
return pdf_content
|
154 |
-
|
155 |
except Exception as e:
|
156 |
logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
|
157 |
|
158 |
return None
|
159 |
-
|
160 |
async def download_paper_libgen_async(self, session, doi):
|
161 |
"""Download from Libgen, handles the query and the redirection"""
|
162 |
if not doi:
|
@@ -185,7 +203,7 @@ class PaperDownloader:
|
|
185 |
except Exception as e:
|
186 |
logger.debug(f"Error trying to download {doi} from libgen: {e}")
|
187 |
return None
|
188 |
-
|
189 |
async def download_paper_google_scholar_async(self, session, doi):
|
190 |
"""Search google scholar to find an article with the given doi, try to get the pdf"""
|
191 |
if not doi:
|
@@ -216,7 +234,7 @@ class PaperDownloader:
|
|
216 |
logger.debug(f"Google Scholar error for {doi}: {e}")
|
217 |
|
218 |
return None
|
219 |
-
|
220 |
async def download_paper_crossref_async(self, session, doi):
|
221 |
"""Alternative search method using Crossref"""
|
222 |
if not doi:
|
@@ -244,7 +262,7 @@ class PaperDownloader:
|
|
244 |
except Exception as e:
|
245 |
logger.debug(f"Crossref error for {doi}: {e}")
|
246 |
return None
|
247 |
-
|
248 |
async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
|
249 |
"""Downloads a paper using multiple strategies with exponential backoff and async requests"""
|
250 |
pdf_content = None
|
@@ -299,7 +317,7 @@ class PaperDownloader:
|
|
299 |
except Exception as e:
|
300 |
logger.error(f"Error processing {doi}: {e}")
|
301 |
return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
|
302 |
-
|
303 |
async def download_multiple_dois_async(self, dois_text):
|
304 |
"""Downloads multiple papers from a list of DOIs"""
|
305 |
if not dois_text:
|
@@ -395,6 +413,7 @@ class PaperDownloader:
|
|
395 |
|
396 |
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
|
397 |
|
|
|
398 |
def create_gradio_interface():
|
399 |
"""Create Gradio interface for Paper Downloader"""
|
400 |
downloader = PaperDownloader()
|
@@ -415,8 +434,6 @@ def create_gradio_interface():
|
|
415 |
return zip_path, downloaded_dois, failed_dois, None
|
416 |
else:
|
417 |
return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
|
418 |
-
|
419 |
-
|
420 |
# Gradio Interface
|
421 |
interface = gr.Interface(
|
422 |
fn=download_papers,
|
@@ -492,11 +509,9 @@ def create_gradio_interface():
|
|
492 |
"""
|
493 |
return interface
|
494 |
|
495 |
-
|
496 |
def main():
|
497 |
interface = create_gradio_interface()
|
498 |
interface.launch(share=True)
|
499 |
|
500 |
-
|
501 |
if __name__ == "__main__":
|
502 |
main()
|
|
|
59 |
return None, None
|
60 |
|
61 |
|
62 |
+
async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
    """Fetch ``url`` and return the response body if it is served as a PDF.

    Redirects are followed manually (``allow_redirects=False``) so that the
    number of hops can be capped; each hop gets its own retry budget.

    Args:
        session: HTTP client session exposing ``get(...)`` as an async context
            manager (presumably an ``aiohttp.ClientSession`` -- confirm
            against the caller).
        url: Address to start from.
        max_redirects: Maximum number of redirects to follow.
        max_retries: Extra attempts per hop after the first one fails.
        retry_delay: Seconds to wait between attempts on the same hop.

    Returns:
        The raw response bytes when the final response declares
        ``application/pdf`` in its ``Content-Type``; otherwise ``None``
        (non-PDF content, retries exhausted, malformed or too many redirects).
    """
    # Imported locally so this method does not depend on the module import block.
    from urllib.parse import urljoin

    # 303 is included: the request is always a GET, so following it is safe.
    redirect_statuses = (301, 302, 303, 307, 308)

    current_url = url
    redirect_count = 0

    while redirect_count <= max_redirects:
        followed_redirect = False

        for attempt in range(max_retries + 1):
            try:
                async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:

                    if response.status in redirect_statuses:
                        location = response.headers.get('Location')
                        if not location:
                            # A redirect without a target cannot be followed and
                            # retrying will not change the answer.
                            logger.debug(f"Redirect without Location header from {current_url}")
                            return None
                        # Location may be relative; resolve it against the
                        # address that issued the redirect.
                        current_url = urljoin(current_url, location)
                        redirect_count += 1
                        logger.debug(f"Following redirect from {url} to {current_url}")
                        followed_redirect = True
                        break  # Leave the retry loop; the next hop starts with a fresh budget

                    response.raise_for_status()

                    content_type = response.headers.get('Content-Type', '')
                    if 'application/pdf' in content_type:
                        return await response.read()

                    logger.debug(f"Content type not PDF for {current_url}: {content_type}")
                    return None
            except Exception as e:
                # Deliberately broad: this is a best-effort download and any
                # transport or HTTP error should simply trigger another attempt.
                logger.debug(f"Error getting PDF, retrying ({attempt}/{max_retries}) from {current_url}: {e}")
                if attempt < max_retries:
                    await asyncio.sleep(retry_delay)

        if not followed_redirect:
            # Every attempt on this hop failed. Returning here is what prevents
            # the outer loop from spinning forever on an unreachable URL.
            logger.debug(f"Error getting PDF from {current_url}: retries exhausted")
            return None

    logger.debug(f"Too many redirects or retries {url}, not following this link further")
    return None
|
101 |
+
|
|
|
|
|
102 |
async def download_paper_direct_doi_async(self, session, doi):
|
103 |
"""Attempt to download the pdf from the landing page of the doi"""
|
104 |
if not doi:
|
|
|
106 |
|
107 |
try:
|
108 |
doi_url = f"https://doi.org/{self.clean_doi(doi)}"
|
109 |
+
|
110 |
+
# First, let's try to download the URL directly in case it is already the pdf.
|
111 |
+
pdf_content = await self.fetch_pdf_content(session, doi_url)
|
112 |
+
if pdf_content:
|
113 |
+
logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
|
114 |
+
return pdf_content
|
115 |
+
|
116 |
+
# If direct DOI link was not a pdf, fetch landing page and extract links
|
117 |
text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
|
118 |
if not text:
|
119 |
return None
|
|
|
128 |
for pattern in pdf_patterns:
|
129 |
pdf_urls.extend(re.findall(pattern, text))
|
130 |
|
131 |
+
# Attempt each pdf url and break when you find a PDF content.
|
132 |
for pdf_url in pdf_urls:
|
133 |
+
pdf_content = await self.fetch_pdf_content(session, pdf_url)
|
134 |
+
if pdf_content:
|
135 |
+
logger.debug(f"Found PDF from: {pdf_url}")
|
136 |
+
return pdf_content
|
137 |
|
138 |
except Exception as e:
|
139 |
logger.debug(f"Error trying to get the PDF from {doi}: {e}")
|
|
|
163 |
for pattern in pdf_patterns:
|
164 |
pdf_urls.extend(re.findall(pattern, text))
|
165 |
|
166 |
+
# Try downloading from found URLs, but iterate over ALL
|
167 |
for pdf_url in pdf_urls:
|
168 |
pdf_content = await self.fetch_pdf_content(session,pdf_url)
|
169 |
if pdf_content:
|
170 |
logger.debug(f"Found PDF from: {pdf_url}")
|
171 |
return pdf_content
|
172 |
+
|
173 |
except Exception as e:
|
174 |
logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
|
175 |
|
176 |
return None
|
177 |
+
|
178 |
async def download_paper_libgen_async(self, session, doi):
|
179 |
"""Download from Libgen, handles the query and the redirection"""
|
180 |
if not doi:
|
|
|
203 |
except Exception as e:
|
204 |
logger.debug(f"Error trying to download {doi} from libgen: {e}")
|
205 |
return None
|
206 |
+
|
207 |
async def download_paper_google_scholar_async(self, session, doi):
|
208 |
"""Search google scholar to find an article with the given doi, try to get the pdf"""
|
209 |
if not doi:
|
|
|
234 |
logger.debug(f"Google Scholar error for {doi}: {e}")
|
235 |
|
236 |
return None
|
237 |
+
|
238 |
async def download_paper_crossref_async(self, session, doi):
|
239 |
"""Alternative search method using Crossref"""
|
240 |
if not doi:
|
|
|
262 |
except Exception as e:
|
263 |
logger.debug(f"Crossref error for {doi}: {e}")
|
264 |
return None
|
265 |
+
|
266 |
async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
|
267 |
"""Downloads a paper using multiple strategies with exponential backoff and async requests"""
|
268 |
pdf_content = None
|
|
|
317 |
except Exception as e:
|
318 |
logger.error(f"Error processing {doi}: {e}")
|
319 |
return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
|
320 |
+
|
321 |
async def download_multiple_dois_async(self, dois_text):
|
322 |
"""Downloads multiple papers from a list of DOIs"""
|
323 |
if not dois_text:
|
|
|
413 |
|
414 |
return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois)
|
415 |
|
416 |
+
|
417 |
def create_gradio_interface():
|
418 |
"""Create Gradio interface for Paper Downloader"""
|
419 |
downloader = PaperDownloader()
|
|
|
434 |
return zip_path, downloaded_dois, failed_dois, None
|
435 |
else:
|
436 |
return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
|
|
|
|
|
437 |
# Gradio Interface
|
438 |
interface = gr.Interface(
|
439 |
fn=download_papers,
|
|
|
509 |
"""
|
510 |
return interface
|
511 |
|
|
|
512 |
def main():
    """Create the Gradio interface and launch it with ``share=True``."""
    create_gradio_interface().launch(share=True)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|