C2MV committed on
Commit ec889f0 · verified · 1 Parent(s): 0ee5088

Update app.py

Files changed (1):
  1. app.py +782 -686
app.py CHANGED
@@ -1,730 +1,826 @@
- import os
- import re
- import time
- import logging
- import zipfile
- import requests
- import bibtexparser
- from tqdm import tqdm
- from urllib.parse import quote, urlencode
  import gradio as gr
- from bs4 import BeautifulSoup
  import io
- import asyncio
- import aiohttp
-
- # Configure logging
- logging.basicConfig(level=logging.INFO,
-                     format='%(asctime)s - %(levelname)s: %(message)s')
- logger = logging.getLogger(__name__)
-
-
- class PaperDownloader:
-     def __init__(self, output_dir='papers'):
-         self.output_dir = output_dir
-         os.makedirs(output_dir, exist_ok=True)
-
-         # Updated download sources
-         self.download_sources = [
-             'https://sci-hub.ee/',
-             'https://sci-hub.st/',
-             'https://sci-hub.ru/',
-             'https://sci-hub.ren/',
-             'https://sci-hub.mksa.top/',
-             'https://sci-hub.se/',
-             'https://libgen.rs/scimag/'
          ]

-         # Request headers
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9',
          }

-     def clean_doi(self, doi):
-         """Clean and encode a DOI for use in a URL"""
-         if not isinstance(doi, str):
-             return None
-         return quote(doi.strip()) if doi else None
-
-     async def fetch_with_headers(self, session, url, timeout=10):
-         """Utility method to fetch a URL with headers and a timeout"""
-         try:
-             async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
-                 response.raise_for_status()
-                 return await response.text(), response.headers
-         except Exception as e:
-             logger.debug(f"Error fetching {url}: {e}")
-             return None, None
-
-     async def download_paper_direct_doi_async(self, session, doi):
-         """Attempt to download the PDF from the DOI landing page"""
-         if not doi:
-             return None
-
-         try:
-             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-             text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-             if not text:
                  return None

-             pdf_patterns = [
-                 r'(https?://[^\s<>"]+?\.pdf)',
-                 r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                 r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-             ]
-
-             pdf_urls = []
-             for pattern in pdf_patterns:
-                 pdf_urls.extend(re.findall(pattern, text))
-
-             for pdf_url in pdf_urls:
-                 try:
-                     pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-                     if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                         logger.debug(f"Found PDF from: {pdf_url}")
-                         return await pdf_response.read()
-                 except Exception as e:
-                     logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-
-
-         except Exception as e:
-             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-
-         return None
-
-     async def download_paper_scihub_async(self, session, doi):
-         """Improved method to download paper from Sci-Hub using async requests"""
-         if not doi:
-             logger.warning("DOI not provided")
              return None

-         for base_url in self.download_sources:
-             try:
-                 scihub_url = f"{base_url}{self.clean_doi(doi)}"
-                 text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
-                 if not text:
-                     continue
-
-                 # Search for multiple PDF URL patterns
-                 pdf_patterns = [
-                     r'(https?://[^\s<>"]+?\.pdf)',
-                     r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                     r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                 ]
-
-                 pdf_urls = []
-                 for pattern in pdf_patterns:
-                     pdf_urls.extend(re.findall(pattern, text))
-
-                 # Try downloading from found URLs
-                 for pdf_url in pdf_urls:
-                     try:
-                         pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-                         # Verify if it's a PDF
-                         if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                             logger.debug(f"Found PDF from: {pdf_url}")
-                             return await pdf_response.read()
-                     except Exception as e:
-                         logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-
-             except Exception as e:
-                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

-         return None

-     async def download_paper_libgen_async(self, session, doi):
-         """Download from Libgen, handles the query and the redirection"""
-         if not doi:
              return None

-         base_url = 'https://libgen.rs/scimag/'
-         try:
-             search_url = f"{base_url}?q={self.clean_doi(doi)}"
-             text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
-
-             if not text or "No results" in text:
-                 logger.debug(f"No results for DOI: {doi} on libgen")
-                 return None
-
-             soup = BeautifulSoup(text, 'html.parser')
-
-             links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
-
-             if links:
-                 link = links[0]
-                 pdf_url = link['href']
-                 pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
-                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                     logger.debug(f"Found PDF from: {pdf_url}")
-                     return await pdf_response.read()
-         except Exception as e:
-             logger.debug(f"Error trying to download {doi} from libgen: {e}")
-         return None
-
-     async def download_paper_google_scholar_async(self, session, doi):
-         """Search Google Scholar for an article with the given DOI and try to get the PDF"""
-         if not doi:
              return None

-         try:
-             query = f'doi:"{doi}"'
-             params = {'q': query}
-             url = f'https://scholar.google.com/scholar?{urlencode(params)}'
-
-             text, headers = await self.fetch_with_headers(session, url, timeout=10)
-             if not text:
-                 return None
-
-             soup = BeautifulSoup(text, 'html.parser')
-
-             # Find any links with [PDF]
-             links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
-
-             if links:
-                 pdf_url = links[0]['href']
-                 pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                     logger.debug(f"Found PDF from: {pdf_url}")
-                     return await pdf_response.read()
-         except Exception as e:
-             logger.debug(f"Google Scholar error for {doi}: {e}")

-         return None
-
-     async def download_paper_crossref_async(self, session, doi):
-         """Alternative search method using Crossref"""
-         if not doi:
-             return None
-
-         try:
-             # Search for open access link
-             url = f"https://api.crossref.org/works/{doi}"
-             response = await session.get(url, headers=self.headers, timeout=10)
-
-             if response.status == 200:
-                 data = await response.json()
-                 work = data.get('message', {})
-
-                 # Search for open access links
-                 links = work.get('link', [])
-                 for link in links:
-                     if link.get('content-type') == 'application/pdf':
-                         pdf_url = link.get('URL')
-                         if pdf_url:
-                             pdf_response = await session.get(pdf_url, headers=self.headers)
-                             if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                                 logger.debug(f"Found PDF from: {pdf_url}")
-                                 return await pdf_response.read()
-
-         except Exception as e:
-             logger.debug(f"Crossref error for {doi}: {e}")
-
-         return None

-     async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
-         """Downloads a paper using multiple strategies with exponential backoff and async requests"""
-         pdf_content = None
-         retries = 0
-         delay = initial_delay
-
-         async with aiohttp.ClientSession() as session:
-             while retries < max_retries and not pdf_content:
-                 try:
-                     pdf_content = (
-                         await self.download_paper_direct_doi_async(session, doi) or
-                         await self.download_paper_scihub_async(session, doi) or
-                         await self.download_paper_libgen_async(session, doi) or
-                         await self.download_paper_google_scholar_async(session, doi) or
-                         await self.download_paper_crossref_async(session, doi)
-                     )
-                     if pdf_content:
-                         return pdf_content
-                 except Exception as e:
-                     logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
-
-                 if not pdf_content:
-                     retries += 1
-                     logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
-                     await asyncio.sleep(delay)
-                     delay *= 2  # Exponential backoff
-
-         return None

-     def download_paper_scihub(self, doi):
-         """Improved method to download paper from Sci-Hub"""
-         if not doi:
-             logger.warning("DOI not provided")

              return None

-         for base_url in self.download_sources:
-             try:
-                 scihub_url = f"{base_url}{self.clean_doi(doi)}"
-
-                 # Request with more tolerance
-                 response = requests.get(scihub_url,
-                                         headers=self.headers,
-                                         allow_redirects=True,
-                                         timeout=15)
-
-                 # Search for multiple PDF URL patterns
-                 pdf_patterns = [
-                     r'(https?://[^\s<>"]+?\.pdf)',
-                     r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                     r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-                 ]
-
-                 pdf_urls = []
-                 for pattern in pdf_patterns:
-                     pdf_urls.extend(re.findall(pattern, response.text))
-
-                 # Try downloading from found URLs
-                 for pdf_url in pdf_urls:
-                     try:
-                         pdf_response = requests.get(pdf_url,
-                                                     headers=self.headers,
-                                                     timeout=10)
-
-                         # Verify if it's a PDF
-                         if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                             logger.debug(f"Found PDF from: {pdf_url}")
-                             return pdf_response.content
-                     except Exception as e:
-                         logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-
-             except Exception as e:
-                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
-
-         return None
-
-     def download_paper_libgen(self, doi):
-         """Download from Libgen, handles the query and the redirection"""
-         if not doi:
              return None

-         base_url = 'https://libgen.rs/scimag/'
-         try:
-             search_url = f"{base_url}?q={self.clean_doi(doi)}"
-             response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
-             response.raise_for_status()
-
-             if "No results" in response.text:
-                 logger.debug(f"No results for DOI: {doi} on libgen")
-                 return None
-
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Find the link using a specific selector
-             links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
-
-             if links:
-                 link = links[0]
-                 pdf_url = link['href']
-                 pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
-                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                     logger.debug(f"Found PDF from: {pdf_url}")
-                     return pdf_response.content
-
-         except Exception as e:
-             logger.debug(f"Error trying to download {doi} from libgen: {e}")
-         return None
-
-     def download_paper_google_scholar(self, doi):
-         """Search Google Scholar for an article with the given DOI and try to get the PDF"""
-         if not doi:
-             return None
-
-         try:
-             query = f'doi:"{doi}"'
-             params = {'q': query}
-             url = f'https://scholar.google.com/scholar?{urlencode(params)}'
-
-             response = requests.get(url, headers=self.headers, timeout=10)
-             response.raise_for_status()
-
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Find any links with [PDF]
-             links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
-
-             if links:
-                 pdf_url = links[0]['href']
-                 pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
-                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                     logger.debug(f"Found PDF from: {pdf_url}")
-                     return pdf_response.content
-         except Exception as e:
-             logger.debug(f"Google Scholar error for {doi}: {e}")

          return None

-     def download_paper_crossref(self, doi):
-         """Alternative search method using Crossref"""
-         if not doi:
-             return None
-
-         try:
-             # Search for open access link
-             url = f"https://api.crossref.org/works/{doi}"
-             response = requests.get(url, headers=self.headers, timeout=10)
-
-             if response.status_code == 200:
-                 data = response.json()
-                 work = data.get('message', {})
-
-                 # Search for open access links
-                 links = work.get('link', [])
-                 for link in links:
-                     if link.get('content-type') == 'application/pdf':
-                         pdf_url = link.get('URL')
-                         if pdf_url:
-                             pdf_response = requests.get(pdf_url, headers=self.headers)
-                             if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                                 logger.debug(f"Found PDF from: {pdf_url}")
-                                 return pdf_response.content
-
-         except Exception as e:
-             logger.debug(f"Crossref error for {doi}: {e}")
-
          return None

-     def download_with_retry(self, doi, max_retries=3, initial_delay=2):
-         """Downloads a paper using multiple strategies with exponential backoff"""
-         pdf_content = None
-         retries = 0
-         delay = initial_delay
-
-         while retries < max_retries and not pdf_content:
-             try:
-                 pdf_content = (
-                     self.download_paper_scihub(doi) or
-                     self.download_paper_libgen(doi) or
-                     self.download_paper_google_scholar(doi) or
-                     self.download_paper_crossref(doi)
-                 )
-
-                 if pdf_content:
-                     return pdf_content
-             except Exception as e:
-                 logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
-
-             if not pdf_content:
-                 retries += 1
-                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
-                 time.sleep(delay)
-                 delay *= 2  # Exponential backoff
-
          return None

-     def download_single_doi(self, doi):
-         """Downloads a single paper using a DOI"""
-         if not doi:
-             return None, "Error: DOI not provided", "Error: DOI not provided"
-
-         try:
-             pdf_content = self.download_with_retry(doi)
-
-             if pdf_content:
-                 if doi is None:
-                     return None, "Error: DOI not provided", "Error: DOI not provided"
-                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                 filepath = os.path.join(self.output_dir, filename)
-                 with open(filepath, 'wb') as f:
-                     f.write(pdf_content)
-                 logger.info(f"Successfully downloaded: {filename}")
-                 return filepath, f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
-             else:
-                 logger.warning(f"Could not download: {doi}")
-                 return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
-
-         except Exception as e:
-             logger.error(f"Error processing {doi}: {e}")
-             return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
-
-     def download_multiple_dois(self, dois_text):
-         """Downloads multiple papers from a list of DOIs"""
-         if not dois_text:
-             return None, "Error: No DOIs provided", "Error: No DOIs provided"
-
-         dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
-         if not dois:
-             return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
-
-         downloaded_files = []
-         failed_dois = []
-         downloaded_links = []
-         for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-             filepath, success_message, fail_message = self.download_single_doi(doi)
-             if filepath:
-                 # Unique filename for zip
-                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                 filepath_unique = os.path.join(self.output_dir, filename)
-                 os.rename(filepath, filepath_unique)
-                 downloaded_files.append(filepath_unique)
-                 downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-             else:
-                 failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-
-         if downloaded_files:
-             zip_filename = 'papers.zip'
-             with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                 for file_path in downloaded_files:
-                     zipf.write(file_path, arcname=os.path.basename(file_path))
-             logger.info(f"ZIP file created: {zip_filename}")
-
-         return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
-
-     def process_bibtex(self, bib_file):
-         """Process BibTeX file and download papers with multiple strategies"""
-         # Read BibTeX file content from the uploaded object
-         try:
-             with open(bib_file.name, 'r', encoding='utf-8') as f:
-                 bib_content = f.read()
-         except Exception as e:
-             logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-             return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
-
-         # Parse BibTeX data
-         try:
-             bib_database = bibtexparser.loads(bib_content)
-         except Exception as e:
-             logger.error(f"Error parsing BibTeX data: {e}")
-             return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
-
-         # Extract DOIs
-         dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-         logger.info(f"Found {len(dois)} DOIs to download")
-
-         # Result lists
-         downloaded_files = []
-         failed_dois = []
-         downloaded_links = []
-
-         # Download PDFs
-         for doi in tqdm(dois, desc="Downloading papers"):
-             try:
-                 # Try to download with multiple methods with retries
-                 pdf_content = self.download_with_retry(doi)
-
-                 # Save PDF
-                 if pdf_content:
-                     if doi is None:
-                         return None, "Error: DOI not provided", "Error: DOI not provided", None
-                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                     filepath = os.path.join(self.output_dir, filename)
-
-                     with open(filepath, 'wb') as f:
-                         f.write(pdf_content)
-
-                     downloaded_files.append(filepath)
-                     downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-                     logger.info(f"Successfully downloaded: {filename}")
-                 else:
-                     failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-
-             except Exception as e:
-                 failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-                 logger.error(f"Error processing {doi}: {e}")
-
-         # Create ZIP of downloaded papers
-         if downloaded_files:
-             zip_filename = 'papers.zip'
-             with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                 for file_path in downloaded_files:
-                     zipf.write(file_path, arcname=os.path.basename(file_path))
-             logger.info(f"ZIP file created: {zip_filename}")
-
-         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
-
-     async def process_bibtex_async(self, bib_file):
-         """Process BibTeX file and download papers with multiple strategies"""
-         # Read BibTeX file content from the uploaded object
-         try:
-             with open(bib_file.name, 'r', encoding='utf-8') as f:
-                 bib_content = f.read()
-         except Exception as e:
-             logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-             return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
-
-         # Parse BibTeX data
-         try:
-             bib_database = bibtexparser.loads(bib_content)
-         except Exception as e:
-             logger.error(f"Error parsing BibTeX data: {e}")
-             return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
-
-         # Extract DOIs
-         dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
-         logger.info(f"Found {len(dois)} DOIs to download")
-
-         # Result lists
-         downloaded_files = []
-         failed_dois = []
-         downloaded_links = []
-
-         # Download PDFs
-         for doi in tqdm(dois, desc="Downloading papers"):
-             try:
-                 # Try to download with multiple methods with retries
-                 pdf_content = await self.download_with_retry_async(doi)
-
-                 # Save PDF
-                 if pdf_content:
-                     if doi is None:
-                         return None, "Error: DOI not provided", "Error: DOI not provided", None
-                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                     filepath = os.path.join(self.output_dir, filename)
-
-                     with open(filepath, 'wb') as f:
-                         f.write(pdf_content)
-
-                     downloaded_files.append(filepath)
-                     downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-                     logger.info(f"Successfully downloaded: {filename}")
-                 else:
-                     failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-
-             except Exception as e:
-                 failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
-                 logger.error(f"Error processing {doi}: {e}")
-
-         # Create ZIP of downloaded papers
-         if downloaded_files:
-             zip_filename = 'papers.zip'
-             with zipfile.ZipFile(zip_filename, 'w') as zipf:
-                 for file_path in downloaded_files:
-                     zipf.write(file_path, arcname=os.path.basename(file_path))
-             logger.info(f"ZIP file created: {zip_filename}")
-
-         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
-
- def create_gradio_interface():
-     """Create Gradio interface for Paper Downloader"""
-     downloader = PaperDownloader()
-
-     async def download_papers(bib_file, doi_input, dois_input):
-         if bib_file:
-             # Check file type
-             if not bib_file.name.lower().endswith('.bib'):
-                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
-
-             zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
-             return zip_path, downloaded_dois, failed_dois, None
-         elif doi_input:
-             filepath, message, failed_doi = downloader.download_single_doi(doi_input)
-             return None, message, failed_doi, filepath
-         elif dois_input:
-             zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
-             return zip_path, downloaded_dois, failed_dois, None
-         else:
-             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
-
-     # Gradio Interface
-     interface = gr.Interface(
-         fn=download_papers,
-         inputs=[
-             gr.File(file_types=['.bib'], label="Upload BibTeX File"),
-             gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
-             gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
-         ],
          outputs=[
-             gr.File(label="Download Papers (ZIP) or Single PDF"),
-             gr.HTML(label="""
-                 <div style='padding-bottom: 5px; font-weight: bold;'>
-                     Found DOIs
-                 </div>
-                 <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
-                     <div id="downloaded-dois"></div>
-                 </div>
-             """),
-             gr.HTML(label="""
-                 <div style='padding-bottom: 5px; font-weight: bold;'>
-                     Missed DOIs
-                 </div>
-                 <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
-                     <div id="failed-dois"></div>
-                 </div>
-             """),
-             gr.File(label="Downloaded Single PDF")
-         ],
-         title="🔬 Academic Paper Batch Downloader",
-         description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
-         theme="Hev832/Applio",
-         examples=[
-             ["example.bib", None, None],  # BibTeX file
-             [None, "10.1038/nature12373", None],  # Single DOI
-             [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
-         ],
-         css="""
-         .gradio-container {
-             background-color: black;
-         }
-         .gr-interface {
-             max-width: 800px;
-             margin: 0 auto;
-         }
-         .gr-box {
-             background-color: black;
-             border-radius: 10px;
-             box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-         }
-         .output-text a {
-             color: #007bff;  /* Blue color for hyperlinks */
-         }
-         """,
-         cache_examples=False,
      )
-
-     # Add JavaScript to update HTML
-     interface.load = """
-     function(downloaded_dois, failed_dois) {
-         let downloaded_html = '';
-         downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
-             downloaded_html += doi + '<br>';
-         });
-         document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
-
-         let failed_html = '';
-         failed_dois.split('\\n').filter(Boolean).forEach(doi => {
-             failed_html += doi + '<br>';
-         });
-         document.querySelector("#failed-dois").innerHTML = failed_html;
-         return [downloaded_html, failed_html];
-     }
-     """

-     interface.head = """
-     <script>
-     function copyLink(button) {
-         const linkElement = button.previousElementSibling;
-         const link = linkElement.href;
-         navigator.clipboard.writeText(link)
-             .then(() => {
-                 button.innerText = '✓ Copied';
-                 button.style.color = 'green';
-                 setTimeout(() => {
-                     button.innerText = 'Copy';
-                     button.style.color = '';
-                 }, 2000);
-             })
-             .catch(err => {
-                 console.error('Failed to copy link: ', err);
-             });
-     }
-     </script>
-     """
-     return interface
-
-
- def main():
-     interface = create_gradio_interface()
-     interface.launch(share=True)
-
-
- if __name__ == "__main__":
-     main()
+ import numpy as np
+ import pandas as pd
+ import statsmodels.formula.api as smf
+ import statsmodels.api as sm
+ import plotly.graph_objects as go
+ from scipy.optimize import minimize
+ import plotly.express as px
+ from scipy.stats import t, f
  import gradio as gr
  import io
+ import zipfile
+ import tempfile
+ from datetime import datetime
+
+ class RSM_BoxBehnken:
+     def __init__(self, data, x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels):
+         """
+         Initializes the class with the Box-Behnken design data.
+         """
+         self.data = data.copy()
+         self.model = None
+         self.model_simplified = None
+         self.optimized_results = None
+         self.optimal_levels = None
+         self.all_figures = []  # List that stores the generated figures
+         self.x1_name = x1_name
+         self.x2_name = x2_name
+         self.x3_name = x3_name
+         self.y_name = y_name
+
+         # Original (natural) levels of the variables
+         self.x1_levels = x1_levels
+         self.x2_levels = x2_levels
+         self.x3_levels = x3_levels
+
+     def get_levels(self, variable_name):
+         """
+         Returns the levels for a specific variable.
+         """
+         if variable_name == self.x1_name:
+             return self.x1_levels
+         elif variable_name == self.x2_name:
+             return self.x2_levels
+         elif variable_name == self.x3_name:
+             return self.x3_levels
+         else:
+             raise ValueError(f"Variable desconocida: {variable_name}")
+
+     def fit_model(self):
+         """
+         Fits the full second-order model to the data.
+         """
+         formula = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + {self.x3_name} + ' \
+                   f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2) + ' \
+                   f'{self.x1_name}:{self.x2_name} + {self.x1_name}:{self.x3_name} + {self.x2_name}:{self.x3_name}'
+         self.model = smf.ols(formula, data=self.data).fit()
+         print("Modelo Completo:")
+         print(self.model.summary())
+         return self.model, self.pareto_chart(self.model, "Pareto - Modelo Completo")
+
+     def fit_simplified_model(self):
+         """
+         Fits a second-order model to the data, dropping the non-significant terms.
+         """
+         formula = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + ' \
+                   f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2)'
+         self.model_simplified = smf.ols(formula, data=self.data).fit()
+         print("\nModelo Simplificado:")
+         print(self.model_simplified.summary())
+         return self.model_simplified, self.pareto_chart(self.model_simplified, "Pareto - Modelo Simplificado")
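+     # A quick sketch of what is being fitted: the full model above is the standard
+     # second-order response-surface polynomial
+     #     y = b0 + Σ bi·xi + Σ bii·xi² + Σ bij·xi·xj   (10 coefficients),
+     # so the 15-run Box-Behnken design leaves 5 residual degrees of freedom. The
+     # simplified model keeps the three quadratic terms but drops the interactions
+     # and the linear term of x3.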
+
+     def optimize(self, method='Nelder-Mead'):
+         """
+         Finds the factor levels that maximize the response, using the simplified model.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
+             return
+
+         def objective_function(x):
+             return -self.model_simplified.predict(pd.DataFrame({
+                 self.x1_name: [x[0]],
+                 self.x2_name: [x[1]],
+                 self.x3_name: [x[2]]
+             })).values[0]
+
+         bounds = [(-1, 1), (-1, 1), (-1, 1)]
+         x0 = [0, 0, 0]
+
+         self.optimized_results = minimize(objective_function, x0, method=method, bounds=bounds)
+         self.optimal_levels = self.optimized_results.x
+
+         # Convert the optimal levels from coded to natural units
+         optimal_levels_natural = [
+             self.coded_to_natural(self.optimal_levels[0], self.x1_name),
+             self.coded_to_natural(self.optimal_levels[1], self.x2_name),
+             self.coded_to_natural(self.optimal_levels[2], self.x3_name)
+         ]
+         # Build the optimization table
+         optimization_table = pd.DataFrame({
+             'Variable': [self.x1_name, self.x2_name, self.x3_name],
+             'Nivel Óptimo (Natural)': optimal_levels_natural,
+             'Nivel Óptimo (Codificado)': self.optimal_levels
+         })
+
+         return optimization_table.round(3)  # Round to 3 decimals
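+     # Note: the optimizer maximizes the predicted response by minimizing its
+     # negative over the coded cube [-1, 1]^3, starting from the center point.
+     # Nelder-Mead only honors the `bounds` argument on SciPy >= 1.7; on older
+     # versions a bounded method such as 'L-BFGS-B' would be needed instead.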
+
+     def plot_rsm_individual(self, fixed_variable, fixed_level):
+         """
+         Generates an individual response-surface (RSM) plot for a specific configuration.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
+             return None
+
+         # Determine the varying variables and their natural levels
+         varying_variables = [var for var in [self.x1_name, self.x2_name, self.x3_name] if var != fixed_variable]
+
+         # Get the natural levels of the varying variables
+         x_natural_levels = self.get_levels(varying_variables[0])
+         y_natural_levels = self.get_levels(varying_variables[1])
+
+         # Build a mesh of points for the varying variables (in natural units)
+         x_range_natural = np.linspace(x_natural_levels[0], x_natural_levels[-1], 100)
+         y_range_natural = np.linspace(y_natural_levels[0], y_natural_levels[-1], 100)
+         x_grid_natural, y_grid_natural = np.meshgrid(x_range_natural, y_range_natural)
+
+         # Convert the natural mesh to coded units
+         x_grid_coded = self.natural_to_coded(x_grid_natural, varying_variables[0])
+         y_grid_coded = self.natural_to_coded(y_grid_natural, varying_variables[1])
+
+         # Build the prediction DataFrame with coded variables
+         prediction_data = pd.DataFrame({
+             varying_variables[0]: x_grid_coded.flatten(),
+             varying_variables[1]: y_grid_coded.flatten(),
+         })
+         prediction_data[fixed_variable] = self.natural_to_coded(fixed_level, fixed_variable)
+
+         # Compute the predicted values
+         z_pred = self.model_simplified.predict(prediction_data).values.reshape(x_grid_coded.shape)
+
+         # Filter by the level of the fixed variable (in coded units)
+         fixed_level_coded = self.natural_to_coded(fixed_level, fixed_variable)
+         subset_data = self.data[np.isclose(self.data[fixed_variable], fixed_level_coded)]
+
+         # Keep only valid levels of the varying variables
+         valid_levels = [-1, 0, 1]
+         experiments_data = subset_data[
+             subset_data[varying_variables[0]].isin(valid_levels) &
+             subset_data[varying_variables[1]].isin(valid_levels)
          ]

+         # Convert the experiment coordinates to natural units
+         experiments_x_natural = experiments_data[varying_variables[0]].apply(lambda x: self.coded_to_natural(x, varying_variables[0]))
+         experiments_y_natural = experiments_data[varying_variables[1]].apply(lambda x: self.coded_to_natural(x, varying_variables[1]))
+
+         # Build the surface plot with natural units on the axes and some transparency
+         fig = go.Figure(data=[go.Surface(z=z_pred, x=x_grid_natural, y=y_grid_natural, colorscale='Viridis', opacity=0.7, showscale=True)])
+
+         # --- Add a wireframe grid on top of the surface ---
+         # Lines along the x direction
+         for i in range(x_grid_natural.shape[0]):
+             fig.add_trace(go.Scatter3d(
+                 x=x_grid_natural[i, :],
+                 y=y_grid_natural[i, :],
+                 z=z_pred[i, :],
+                 mode='lines',
+                 line=dict(color='gray', width=2),
+                 showlegend=False,
+                 hoverinfo='skip'
+             ))
+         # Lines along the y direction
+         for j in range(x_grid_natural.shape[1]):
+             fig.add_trace(go.Scatter3d(
+                 x=x_grid_natural[:, j],
+                 y=y_grid_natural[:, j],
+                 z=z_pred[:, j],
+                 mode='lines',
+                 line=dict(color='gray', width=2),
+                 showlegend=False,
+                 hoverinfo='skip'
+             ))
+
+         # --- End of wireframe grid ---
+
+         # Add the experimental points on the response surface, with distinct colors and labels
+         colors = px.colors.qualitative.Safe
+         point_labels = [f"{row[self.y_name]:.3f}" for _, row in experiments_data.iterrows()]
+
+         fig.add_trace(go.Scatter3d(
+             x=experiments_x_natural,
+             y=experiments_y_natural,
+             z=experiments_data[self.y_name].round(3),
+             mode='markers+text',
+             marker=dict(size=4, color=colors[:len(experiments_x_natural)]),
+             text=point_labels,
+             textposition='top center',
+             name='Experimentos'
+         ))
+
+         # Add axis labels and a title using the natural variables
+         fig.update_layout(
+             scene=dict(
+                 xaxis_title=f"{varying_variables[0]} ({self.get_units(varying_variables[0])})",
+                 yaxis_title=f"{varying_variables[1]} ({self.get_units(varying_variables[1])})",
+                 zaxis_title=self.y_name,
+             ),
+             title=f"{self.y_name} vs {varying_variables[0]} y {varying_variables[1]}<br><sup>{fixed_variable} fijo en {fixed_level:.3f} ({self.get_units(fixed_variable)}) (Modelo Simplificado)</sup>",
+             height=800,
+             width=1000,
+             showlegend=True
+         )
+         return fig
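+     # The surface is evaluated on a 100x100 mesh laid out in natural units and
+     # converted to coded units before prediction; the gray Scatter3d traces only
+     # draw a wireframe over those same predictions.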
+
+     def get_units(self, variable_name):
+         """
+         Defines the units of the variables for labeling.
+         Customize this method as needed.
+         """
+         units = {
+             'Glucosa': 'g/L',
+             'Extracto_de_Levadura': 'g/L',
+             'Triptofano': 'g/L',
+             'AIA_ppm': 'ppm'
+         }
+         return units.get(variable_name, '')
+
+     def generate_all_plots(self):
+         """
+         Generates all RSM plots, varying the fixed variable over its levels, using the simplified model.
+         Stores the figures in self.all_figures.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
+             return
+
+         self.all_figures = []  # Reset the figure list
+
+         # Natural levels to plot
+         levels_to_plot_natural = {
+             self.x1_name: self.x1_levels,
+             self.x2_name: self.x2_levels,
+             self.x3_name: self.x3_levels
          }

+         # Generate and store the individual plots
+         for fixed_variable in [self.x1_name, self.x2_name, self.x3_name]:
+             for level in levels_to_plot_natural[fixed_variable]:
+                 fig = self.plot_rsm_individual(fixed_variable, level)
+                 if fig is not None:
+                     self.all_figures.append(fig)
+
+     def coded_to_natural(self, coded_value, variable_name):
+         """Converts a coded value to its natural value."""
+         levels = self.get_levels(variable_name)
+         return levels[0] + (coded_value + 1) * (levels[-1] - levels[0]) / 2
+
+     def natural_to_coded(self, natural_value, variable_name):
+         """Converts a natural value to its coded value."""
+         levels = self.get_levels(variable_name)
+         return -1 + 2 * (natural_value - levels[0]) / (levels[-1] - levels[0])
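+     # Both conversions are the linear map between the natural range [min, max] and
+     # the coded range [-1, 1]:  coded = -1 + 2·(natural - min)/(max - min).
+     # With unevenly spaced levels the middle level does not land exactly on 0; e.g.
+     # for Glucosa = (1, 3.5, 5.5), natural_to_coded(3.5) = -1 + 2·2.5/4.5 ≈ 0.11.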
+
+     def pareto_chart(self, model, title):
+         """
+         Generates a Pareto chart of the standardized effects of a model,
+         including the significance line.
+         """
+         # Compute the standardized effects
+         tvalues = model.tvalues[1:]  # Exclude the intercept
+         abs_tvalues = np.abs(tvalues)
+         sorted_idx = np.argsort(abs_tvalues)[::-1]
+         sorted_tvalues = abs_tvalues[sorted_idx]
+         sorted_names = tvalues.index[sorted_idx]
+
+         # Critical t value for the significance line
+         alpha = 0.05  # Significance level
+         dof = model.df_resid  # Residual degrees of freedom
+         t_critical = t.ppf(1 - alpha / 2, dof)
+
+         # Build the Pareto chart
+         fig = px.bar(
+             x=sorted_tvalues.round(3),
+             y=sorted_names,
+             orientation='h',
+             labels={'x': 'Efecto Estandarizado', 'y': 'Término'},
+             title=title
+         )
+         fig.update_yaxes(autorange="reversed")
+
+         # Add the significance line
+         fig.add_vline(x=t_critical, line_dash="dot",
+                       annotation_text=f"t crítico = {t_critical:.3f}",
+                       annotation_position="bottom right")
+
+         return fig
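+     # The "standardized effects" plotted here are the |t| statistics of the fitted
+     # coefficients (coefficient / standard error); any bar that crosses the dotted
+     # line t_crit = t(1 - α/2, df_resid) is significant at α = 0.05.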
+
+     def get_simplified_equation(self):
+         """
+         Returns the equation of the simplified model as a string.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
              return None

+         coefficients = self.model_simplified.params
+         equation = f"{self.y_name} = {coefficients['Intercept']:.3f}"
+
+         for term, coef in coefficients.items():
+             if term != 'Intercept':
+                 if term == f'{self.x1_name}':
+                     equation += f" + {coef:.3f}*{self.x1_name}"
+                 elif term == f'{self.x2_name}':
+                     equation += f" + {coef:.3f}*{self.x2_name}"
+                 elif term == f'{self.x3_name}':
+                     equation += f" + {coef:.3f}*{self.x3_name}"
+                 elif term == f'I({self.x1_name} ** 2)':
+                     equation += f" + {coef:.3f}*{self.x1_name}^2"
+                 elif term == f'I({self.x2_name} ** 2)':
+                     equation += f" + {coef:.3f}*{self.x2_name}^2"
+                 elif term == f'I({self.x3_name} ** 2)':
+                     equation += f" + {coef:.3f}*{self.x3_name}^2"
+
+         return equation
+
+     def generate_prediction_table(self):
+         """
+         Generates a table with the actual, predicted, and residual values.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
              return None

+         self.data['Predicho'] = self.model_simplified.predict(self.data)
+         self.data['Residual'] = self.data[self.y_name] - self.data['Predicho']

+         return self.data[[self.y_name, 'Predicho', 'Residual']].round(3)

+     def calculate_contribution_percentage(self):
+         """
+         Computes the percentage contribution of each factor to the variability of the response (AIA).
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
              return None

+         # ANOVA of the simplified model
+         anova_table = sm.stats.anova_lm(self.model_simplified, typ=2)
+
+         # Total sum of squares
+         ss_total = anova_table['sum_sq'].sum()
+
+         # Build the contribution table
+         contribution_table = pd.DataFrame({
+             'Factor': [],
+             'Suma de Cuadrados': [],
+             '% Contribución': []
+         })
+
+         # Percentage contribution of each factor
+         for index, row in anova_table.iterrows():
+             if index != 'Residual':
+                 factor_name = index
+                 if factor_name == f'I({self.x1_name} ** 2)':
+                     factor_name = f'{self.x1_name}^2'
+                 elif factor_name == f'I({self.x2_name} ** 2)':
+                     factor_name = f'{self.x2_name}^2'
+                 elif factor_name == f'I({self.x3_name} ** 2)':
+                     factor_name = f'{self.x3_name}^2'
+
+                 ss_factor = row['sum_sq']
+                 contribution_percentage = (ss_factor / ss_total) * 100
+
+                 contribution_table = pd.concat([contribution_table, pd.DataFrame({
+                     'Factor': [factor_name],
+                     'Suma de Cuadrados': [ss_factor],
+                     '% Contribución': [contribution_percentage]
+                 })], ignore_index=True)
+
+         return contribution_table.round(3)
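+     # Here %Contribución = SS_factor / SS_total × 100, where SS_total sums every
+     # row of the Type II ANOVA table (residual included), so the factor
+     # percentages add up to 100 minus the residual's share.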
+
+     def calculate_detailed_anova(self):
+         """
+         Computes the detailed ANOVA table, including the decomposition of the residual error.
+         """
+         if self.model_simplified is None:
+             print("Error: Ajusta el modelo simplificado primero.")
              return None

+         # --- Detailed ANOVA ---
+         # 1. Fit a model with only the first-order and quadratic terms
+         formula_reduced = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + {self.x3_name} + ' \
+                           f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2)'
+         model_reduced = smf.ols(formula_reduced, data=self.data).fit()

+         # 2. ANOVA of the reduced model (to get the regression sum of squares)
+         anova_reduced = sm.stats.anova_lm(model_reduced, typ=2)

+         # 3. Total sum of squares
+         ss_total = np.sum((self.data[self.y_name] - self.data[self.y_name].mean())**2)

+         # 4. Total degrees of freedom
+         df_total = len(self.data) - 1

+         # 5. Regression sum of squares
+         ss_regression = anova_reduced['sum_sq'][:-1].sum()  # Sum everything except 'Residual'

+         # 6. Regression degrees of freedom
+         df_regression = len(anova_reduced) - 1

+         # 7. Residual sum of squares
+         ss_residual = self.model_simplified.ssr
+         df_residual = self.model_simplified.df_resid

+         # 8. Pure-error sum of squares (computed from the replicated runs)
+         replicas = self.data[self.data.duplicated(subset=[self.x1_name, self.x2_name, self.x3_name], keep=False)]
+         if not replicas.empty:
+             ss_pure_error = replicas.groupby([self.x1_name, self.x2_name, self.x3_name])[self.y_name].apply(lambda g: ((g - g.mean()) ** 2).sum()).sum()
+             df_pure_error = len(replicas) - replicas.groupby([self.x1_name, self.x2_name, self.x3_name]).ngroups
+         else:
+             ss_pure_error = np.nan
+             df_pure_error = np.nan
+
+         # 9. Lack-of-fit sum of squares
+         ss_lack_of_fit = ss_residual - ss_pure_error if not np.isnan(ss_pure_error) else np.nan
+         df_lack_of_fit = df_residual - df_pure_error if not np.isnan(df_pure_error) else np.nan
+
+         # 10. Mean squares
+         ms_regression = ss_regression / df_regression
+         ms_residual = ss_residual / df_residual
+         ms_lack_of_fit = ss_lack_of_fit / df_lack_of_fit if not np.isnan(ss_lack_of_fit) else np.nan
+         ms_pure_error = ss_pure_error / df_pure_error if not np.isnan(ss_pure_error) else np.nan
+
+         # 11. F statistic and p-value for lack of fit
+         f_lack_of_fit = ms_lack_of_fit / ms_pure_error if not np.isnan(ms_lack_of_fit) else np.nan
+         p_lack_of_fit = 1 - f.cdf(f_lack_of_fit, df_lack_of_fit, df_pure_error) if not np.isnan(f_lack_of_fit) else np.nan
+
+         # 12. Build the detailed ANOVA table
+         detailed_anova_table = pd.DataFrame({
+             'Fuente de Variación': ['Regresión', 'Residual', 'Falta de Ajuste', 'Error Puro', 'Total'],
+             'Suma de Cuadrados': [ss_regression, ss_residual, ss_lack_of_fit, ss_pure_error, ss_total],
+             'Grados de Libertad': [df_regression, df_residual, df_lack_of_fit, df_pure_error, df_total],
+             'Cuadrado Medio': [ms_regression, ms_residual, ms_lack_of_fit, ms_pure_error, np.nan],
+             'F': [np.nan, np.nan, f_lack_of_fit, np.nan, np.nan],
+             'Valor p': [np.nan, np.nan, p_lack_of_fit, np.nan, np.nan]
+         })
+
+         # Sum of squares and degrees of freedom for curvature
+         ss_curvature = anova_reduced['sum_sq'][f'I({self.x1_name} ** 2)'] + anova_reduced['sum_sq'][f'I({self.x2_name} ** 2)'] + anova_reduced['sum_sq'][f'I({self.x3_name} ** 2)']
+         df_curvature = 3
+
+         # Append the curvature row to the ANOVA table
+         detailed_anova_table.loc[len(detailed_anova_table)] = ['Curvatura', ss_curvature, df_curvature, ss_curvature / df_curvature, np.nan, np.nan]
+
+         # Reorder the rows so curvature appears right after the regression
+         detailed_anova_table = detailed_anova_table.reindex([0, 5, 1, 2, 3, 4])
+
+         # Reset the index so it is consecutive
+         detailed_anova_table = detailed_anova_table.reset_index(drop=True)
+
+         return detailed_anova_table.round(3)
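+     # Lack-of-fit test, in brief: the residual SS splits into lack-of-fit plus
+     # pure error (from the replicated runs), and F = MS_lack_of_fit / MS_pure_error
+     # is referred to F(df_lack_of_fit, df_pure_error); a small p-value suggests the
+     # simplified model does not describe the data adequately.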
+
+     def get_all_tables(self):
+         """
+         Returns all generated tables so they can be exported to Excel.
+         """
+         prediction_table = self.generate_prediction_table()
+         contribution_table = self.calculate_contribution_percentage()
+         detailed_anova_table = self.calculate_detailed_anova()
+
+         return {
+             'Predicciones': prediction_table,
+             '% Contribución': contribution_table,
+             'ANOVA Detallada': detailed_anova_table
+         }

+     def save_figures_to_zip(self):
+         """
+         Saves all figures stored in self.all_figures to an in-memory ZIP file.
+         """
+         if not self.all_figures:
              return None

+         zip_buffer = io.BytesIO()
+         with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
+             for idx, fig in enumerate(self.all_figures, start=1):
+                 img_bytes = fig.to_image(format="png")
+                 zip_file.writestr(f'Grafico_{idx}.png', img_bytes)
+         zip_buffer.seek(0)
+
+         # Write it out to a temporary file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
+             temp_file.write(zip_buffer.read())
+             temp_path = temp_file.name
+
+         return temp_path
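+     # Assumes the `kaleido` package is installed: Plotly's fig.to_image() needs it
+     # (or the legacy orca) to render static PNGs.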
+
+     def save_fig_to_bytes(self, fig):
+         """
+         Converts a Plotly figure to PNG bytes.
+         """
+         return fig.to_image(format="png")
+
+     def save_all_figures_png(self):
+         """
+         Saves every figure to a temporary PNG file and returns the paths.
+         """
+         png_paths = []
+         for idx, fig in enumerate(self.all_figures, start=1):
+             img_bytes = fig.to_image(format="png")
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
+                 temp_file.write(img_bytes)
+                 temp_path = temp_file.name
+             png_paths.append(temp_path)
+         return png_paths
+
+     def save_tables_to_excel(self):
+         """
+         Saves all tables to a multi-sheet Excel file and returns the file path.
+         """
+         if self.model_simplified is None:
              return None

+         tables = self.get_all_tables()
+         excel_buffer = io.BytesIO()
+         with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
+             for sheet_name, table in tables.items():
+                 table.to_excel(writer, sheet_name=sheet_name, index=False)
+         excel_buffer.seek(0)
+         excel_bytes = excel_buffer.read()

+         # Write it out to a temporary file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
+             temp_file.write(excel_bytes)
+             temp_path = temp_file.name

+         return temp_path
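+     # Assumes the `xlsxwriter` package is installed, since it is requested
+     # explicitly as the pd.ExcelWriter engine.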

+ # --- Functions for the Gradio interface ---

+ def load_data(x1_name, x2_name, x3_name, y_name, x1_levels_str, x2_levels_str, x3_levels_str, data_str):
+     """
+     Loads the Box-Behnken design data from the text boxes and creates the RSM_BoxBehnken instance.
+     """
+     try:
+         # Convert the levels to lists of numbers
+         x1_levels = [float(x.strip()) for x in x1_levels_str.split(',')]
+         x2_levels = [float(x.strip()) for x in x2_levels_str.split(',')]
+         x3_levels = [float(x.strip()) for x in x3_levels_str.split(',')]
+
+         # Build a DataFrame from the data string
+         data_list = [row.split(',') for row in data_str.strip().split('\n')]
+         column_names = ['Exp.', x1_name, x2_name, x3_name, y_name]
+         data = pd.DataFrame(data_list, columns=column_names)
+         data = data.apply(pd.to_numeric, errors='coerce')  # Convert to numeric
+
+         # Validate that the DataFrame has the expected columns
+         if not all(col in data.columns for col in column_names):
+             raise ValueError("El formato de los datos no es correcto.")
+
+         # Create the RSM_BoxBehnken instance
+         global rsm
+         rsm = RSM_BoxBehnken(data, x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels)
+
+         return data.round(3), x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels, gr.update(visible=True)
+
+     except Exception as e:
+         # Report the error
+         error_message = f"Error al cargar los datos: {str(e)}"
+         print(error_message)
+         return None, "", "", "", "", [], [], [], gr.update(visible=False)
+
+ def fit_and_optimize_model():
+     if 'rsm' not in globals():
+         return [None] * 12
+
+     # Fit the models and optimize
+     model_completo, pareto_completo = rsm.fit_model()
+     model_simplificado, pareto_simplificado = rsm.fit_simplified_model()
+     optimization_table = rsm.optimize()
+     equation = rsm.get_simplified_equation()
+     prediction_table = rsm.generate_prediction_table()
+     contribution_table = rsm.calculate_contribution_percentage()
+     anova_table = rsm.calculate_detailed_anova()
+
+     # Generate all the figures and store them
+     rsm.generate_all_plots()
+
+     # Format the equation so it renders nicely in Markdown
+     equation_formatted = equation.replace(" + ", "<br>+ ").replace(" ** ", "^").replace("*", " × ")
+     equation_formatted = f"### Ecuación del Modelo Simplificado:<br>{equation_formatted}"
+
+     # Save the tables to a temporary Excel file
+     excel_path = rsm.save_tables_to_excel()
+
+     # Save all the figures to a temporary ZIP
+     zip_path = rsm.save_figures_to_zip()
+
+     return (
+         model_completo.summary().as_html(),
+         pareto_completo,
+         model_simplificado.summary().as_html(),
+         pareto_simplificado,
+         equation_formatted,
+         optimization_table,
+         prediction_table,
+         contribution_table,
+         anova_table,
+         zip_path,        # Path of the plots ZIP
+         excel_path,      # Path of the tables Excel file
+         rsm.all_figures  # Figure list, kept in session state so navigation works
+     )

+ def show_plot(current_index, all_figures):
+     if not all_figures:
+         return None, "No hay gráficos disponibles.", current_index
+     selected_fig = all_figures[current_index]
+     plot_info_text = f"Gráfico {current_index + 1} de {len(all_figures)}"
+     return selected_fig, plot_info_text, current_index

+ def navigate_plot(direction, current_index, all_figures):
+     """
+     Navigates between the plots.
+     """
+     if not all_figures:
+         return None, "No hay gráficos disponibles.", current_index
+
+     if direction == 'left':
+         new_index = (current_index - 1) % len(all_figures)
+     elif direction == 'right':
+         new_index = (current_index + 1) % len(all_figures)
+     else:
+         new_index = current_index
+
+     selected_fig = all_figures[new_index]
+     plot_info_text = f"Gráfico {new_index + 1} de {len(all_figures)}"
+
+     return selected_fig, plot_info_text, new_index

+ def download_current_plot(all_figures, current_index):
+     """
+     Downloads the current figure as a PNG.
+     """
+     if not all_figures:
          return None
+     fig = all_figures[current_index]
+     img_bytes = rsm.save_fig_to_bytes(fig)
+     filename = f"Grafico_RSM_{current_index + 1}.png"
+
+     # Create a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
+         temp_file.write(img_bytes)
+         temp_path = temp_file.name
+
+     return temp_path  # Return just the path

+ def download_all_plots_zip(all_figures):
+     """
+     Downloads all the figures in a ZIP file.
+     """
+     if not all_figures:
          return None
+     zip_path = rsm.save_figures_to_zip()
+     filename = f"Graficos_RSM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
+     return zip_path  # Return just the path

+ def download_all_tables_excel():
+     """
+     Downloads all the tables in a multi-sheet Excel file.
+     """
+     if 'rsm' not in globals():
          return None
+     excel_path = rsm.save_tables_to_excel()
+     filename = f"Tablas_RSM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
+     return excel_path  # Return just the path

+ # --- Build the Gradio interface ---
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Optimización de la producción de AIA usando RSM Box-Behnken")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## Configuración del Diseño")
+             x1_name_input = gr.Textbox(label="Nombre de la Variable X1 (ej. Glucosa)", value="Glucosa")
+             x2_name_input = gr.Textbox(label="Nombre de la Variable X2 (ej. Extracto de Levadura)", value="Extracto_de_Levadura")
+             x3_name_input = gr.Textbox(label="Nombre de la Variable X3 (ej. Triptófano)", value="Triptofano")
+             y_name_input = gr.Textbox(label="Nombre de la Variable Dependiente (ej. AIA (ppm))", value="AIA_ppm")
+             x1_levels_input = gr.Textbox(label="Niveles de X1 (separados por comas)", value="1, 3.5, 5.5")
+             x2_levels_input = gr.Textbox(label="Niveles de X2 (separados por comas)", value="0.03, 0.2, 0.3")
+             x3_levels_input = gr.Textbox(label="Niveles de X3 (separados por comas)", value="0.4, 0.65, 0.9")
+             data_input = gr.Textbox(label="Datos del Experimento (formato CSV)", lines=10, value="""1,-1,-1,0,166.594
+ 2,1,-1,0,177.557
+ 3,-1,1,0,127.261
+ 4,1,1,0,147.573
+ 5,-1,0,-1,188.883
+ 6,1,0,-1,224.527
+ 7,-1,0,1,190.238
+ 8,1,0,1,226.483
+ 9,0,-1,-1,195.550
+ 10,0,1,-1,149.493
+ 11,0,-1,1,187.683
+ 12,0,1,1,148.621
+ 13,0,0,0,278.951
+ 14,0,0,0,297.238
+ 15,0,0,0,280.896""")
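+             # The default dataset is a 3-factor Box-Behnken design in coded units:
+             # 12 edge-midpoint runs (±1, ±1, 0 and its permutations) plus 3
+             # center-point replicates (0, 0, 0); the replicates supply the pure
+             # error used in the detailed ANOVA.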
+             load_button = gr.Button("Cargar Datos")
+
+         with gr.Column():
+             gr.Markdown("## Datos Cargados")
+             data_output = gr.Dataframe(label="Tabla de Datos", interactive=False)
+
+     # The analysis section only becomes visible after the data is loaded
+     with gr.Row(visible=False) as analysis_row:
+         with gr.Column():
+             fit_button = gr.Button("Ajustar Modelo y Optimizar")
+             gr.Markdown("**Modelo Completo**")
+             model_completo_output = gr.HTML()
+             pareto_completo_output = gr.Plot()
+             gr.Markdown("**Modelo Simplificado**")
+             model_simplificado_output = gr.HTML()
+             pareto_simplificado_output = gr.Plot()
+             gr.Markdown("**Ecuación del Modelo Simplificado**")
+             equation_output = gr.HTML()
+             optimization_table_output = gr.Dataframe(label="Tabla de Optimización", interactive=False)
+             prediction_table_output = gr.Dataframe(label="Tabla de Predicciones", interactive=False)
+             contribution_table_output = gr.Dataframe(label="Tabla de % de Contribución", interactive=False)
+             anova_table_output = gr.Dataframe(label="Tabla ANOVA Detallada", interactive=False)
+             gr.Markdown("## Descargar Todas las Tablas")
+             download_excel_button = gr.DownloadButton("Descargar Tablas en Excel")
+
+         with gr.Column():
+             gr.Markdown("## Generar Gráficos de Superficie de Respuesta")
+             fixed_variable_input = gr.Dropdown(label="Variable Fija", choices=["Glucosa", "Extracto_de_Levadura", "Triptofano"], value="Glucosa")
+             fixed_level_input = gr.Slider(label="Nivel de Variable Fija", minimum=-1, maximum=1, step=0.01, value=0.0)
+             plot_button = gr.Button("Generar Gráficos")
+             with gr.Row():
+                 left_button = gr.Button("<")
+                 right_button = gr.Button(">")
+             rsm_plot_output = gr.Plot()
+             plot_info = gr.Textbox(label="Información del Gráfico", value="Gráfico 1 de 9", interactive=False)
+             with gr.Row():
+                 download_plot_button = gr.DownloadButton("Descargar Gráfico Actual (PNG)")
+                 download_all_plots_button = gr.DownloadButton("Descargar Todos los Gráficos (ZIP)")
+             current_index_state = gr.State(0)   # State for the current index
+             all_figures_state = gr.State([])    # State for all the figures
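+             # gr.State values are per-session, so the figure list and index refer
+             # to whatever the current user last generated.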
+
+     # Load the data
+     load_button.click(
+         load_data,
+         inputs=[x1_name_input, x2_name_input, x3_name_input, y_name_input, x1_levels_input, x2_levels_input, x3_levels_input, data_input],
+         outputs=[data_output, x1_name_input, x2_name_input, x3_name_input, y_name_input, x1_levels_input, x2_levels_input, x3_levels_input, analysis_row]
+     )
+
+     # Fit the model and optimize
+     fit_button.click(
+         fit_and_optimize_model,
+         inputs=[],
          outputs=[
+             model_completo_output,
+             pareto_completo_output,
+             model_simplificado_output,
+             pareto_simplificado_output,
+             equation_output,
+             optimization_table_output,
+             prediction_table_output,
+             contribution_table_output,
+             anova_table_output,
+             download_all_plots_button,
+             download_excel_button,
+             all_figures_state
+         ]
      )

+     # Generate and display the plots (the slider gives a coded level, so it is
+     # converted to natural units before plotting)
+     plot_button.click(
+         lambda fixed_var, fixed_lvl: (
+             rsm.plot_rsm_individual(fixed_var, rsm.coded_to_natural(fixed_lvl, fixed_var)),
+             f"Gráfico 1 de {len(rsm.all_figures)}",
+             0
+         ),
+         inputs=[fixed_variable_input, fixed_level_input],
+         outputs=[rsm_plot_output, plot_info, current_index_state]
+     )
+
+     # Plot navigation (the direction is baked into each handler; the original
+     # gr.Button.get_value(...) is not a Gradio API)
+     left_button.click(
+         lambda current_index, all_figures: navigate_plot('left', current_index, all_figures),
+         inputs=[current_index_state, all_figures_state],
+         outputs=[rsm_plot_output, plot_info, current_index_state]
+     )
+     right_button.click(
+         lambda current_index, all_figures: navigate_plot('right', current_index, all_figures),
+         inputs=[current_index_state, all_figures_state],
+         outputs=[rsm_plot_output, plot_info, current_index_state]
+     )
+
+     # Download the current plot
+     download_plot_button.click(
+         download_current_plot,
+         inputs=[all_figures_state, current_index_state],
+         outputs=download_plot_button
+     )
+
+     # Download all plots as a ZIP
+     download_all_plots_button.click(
+         download_all_plots_zip,
+         inputs=[all_figures_state],
+         outputs=download_all_plots_button
+     )
+
+     # Download all tables as an Excel file
+     download_excel_button.click(
+         download_all_tables_excel,
+         inputs=[],
+         outputs=download_excel_button
+     )
+
+     # Usage example
+     gr.Markdown("## Ejemplo de uso")
+     gr.Markdown("""
+     1. Introduce los nombres de las variables y sus niveles en las cajas de texto correspondientes.
+     2. Copia y pega los datos del experimento en la caja de texto 'Datos del Experimento'.
+     3. Haz clic en 'Cargar Datos' para cargar los datos en la tabla.
+     4. Haz clic en 'Ajustar Modelo y Optimizar' para ajustar el modelo y encontrar los niveles óptimos de los factores.
+     5. Selecciona una variable fija y su nivel en los controles deslizantes.
+     6. Haz clic en 'Generar Gráficos' para generar los gráficos de superficie de respuesta.
+     7. Navega entre los gráficos usando los botones '<' y '>'.
+     8. Descarga el gráfico actual en PNG o descarga todos los gráficos en un ZIP.
+     9. Descarga todas las tablas en un archivo Excel con el botón correspondiente.
+     """)
+
+ demo.launch()