Phoenix21 committed on
Commit
7aa6a7e
·
verified ·
1 Parent(s): 02c4a63

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +985 -0
app.py ADDED
@@ -0,0 +1,985 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Complete Medical Literature Health Dataset Generator with Gradio Interface
2
+ #
3
+ # This creates a web-based interface for generating synthetic health optimization datasets
4
+
5
+ # =====================================================================
6
+ # STEP 1: INSTALLATIONS AND IMPORTS
7
+ # =====================================================================
8
+
9
+ # Install required packages
10
+ import subprocess
11
+ import sys
12
+
13
def install_packages(packages=None):
    """Install any required third-party package that cannot be imported.

    Args:
        packages: Optional mapping of pip distribution name to the module
            name used to import it. Defaults to the packages this app needs.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
    """
    # Imported locally because this function runs before the file's main
    # import section.
    import importlib.util

    if packages is None:
        packages = {
            'openai': 'openai',
            'gradio': 'gradio',
            # The distribution is "python-dotenv" but the module is "dotenv";
            # importing the distribution name can never succeed.
            'python-dotenv': 'dotenv',
            'requests': 'requests',
            'pandas': 'pandas',
        }

    for package, module_name in packages.items():
        # find_spec() locates the module without executing it.
        if importlib.util.find_spec(module_name) is not None:
            continue
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
22
+
23
+ # Run installation
24
+ install_packages()
25
+
26
+ # Import libraries
27
+ import gradio as gr
28
+ import json
29
+ import random
30
+ import re
31
+ import time
32
+ import os
33
+ import io
34
+ import zipfile
35
+ from datetime import datetime
36
+ from typing import Dict, List, Any, Optional, Tuple
37
+ from openai import OpenAI
38
+ import pandas as pd
39
+
40
+ # =====================================================================
41
+ # STEP 2: CORE CLASSES (Same as before but with progress callbacks)
42
+ # =====================================================================
43
+
44
class MedicalLiteratureSimulator:
    """Simulates medical literature research for health dataset generation.

    All generated studies are synthetic: identifiers, titles and abstracts are
    assembled from the per-domain vocabularies in ``research_domains``.
    """

    def __init__(self):
        # Per-domain vocabulary used to assemble synthetic studies.
        self.research_domains = {
            "longevity": {
                "interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"],
                "biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"],
                "outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"]
            },
            "metabolic_health": {
                "interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"],
                "biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"],
                "outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"]
            },
            "cardiovascular": {
                "interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"],
                "biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"],
                "outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"]
            },
            "cognitive": {
                "interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"],
                "biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"],
                "outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"]
            },
            "hormonal": {
                "interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"],
                "biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"],
                "outcomes": ["hormone balance", "improved energy", "better sleep quality"]
            },
            "inflammation": {
                "interventions": ["curcumin", "omega-3", "quercetin", "boswellia"],
                "biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"],
                "outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"]
            }
        }

    def generate_study_data(self, domain: str) -> Dict[str, Any]:
        """Generate one synthetic study record for ``domain``.

        An unknown domain falls back to "longevity". The title, abstract and
        dosages all describe the first sampled intervention, so the record is
        internally consistent.

        Args:
            domain: Key of ``research_domains``.

        Returns:
            Dict with keys pmid, title, abstract, journal, year, domain,
            interventions, biomarkers, outcomes, participant_count,
            duration_weeks and dosages.
        """
        if domain not in self.research_domains:
            domain = "longevity"

        domain_data = self.research_domains[domain]

        # Sample once and reuse, so every field talks about the same study.
        interventions = random.sample(domain_data["interventions"], min(2, len(domain_data["interventions"])))
        biomarkers = random.sample(domain_data["biomarkers"], min(3, len(domain_data["biomarkers"])))
        outcomes = random.sample(domain_data["outcomes"], min(2, len(domain_data["outcomes"])))
        primary_intervention = interventions[0]
        participant_count = random.randint(50, 300)
        duration_weeks = random.choice([8, 12, 16, 24])

        study = {
            "pmid": f"PMID{random.randint(35000000, 40000000)}",
            "title": self._generate_study_title(
                domain, domain_data,
                intervention=primary_intervention, outcome=outcomes[0]
            ),
            "abstract": self._generate_study_abstract(
                domain, domain_data,
                intervention=primary_intervention,
                biomarker=biomarkers[0],
                outcome=outcomes[0],
                participant_count=participant_count,
                duration_weeks=duration_weeks
            ),
            "journal": random.choice([
                "Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine",
                "Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition"
            ]),
            "year": random.choice([2023, 2024]),
            "domain": domain,
            "interventions": interventions,
            "biomarkers": biomarkers,
            "outcomes": outcomes,
            "participant_count": participant_count,
            "duration_weeks": duration_weeks,
            "dosages": self._generate_dosages(primary_intervention)
        }

        return study

    def _generate_study_title(self, domain: str, domain_data: Dict,
                              intervention: Optional[str] = None,
                              outcome: Optional[str] = None) -> str:
        """Return a study title; unspecified parts are drawn at random."""
        if intervention is None:
            intervention = random.choice(domain_data["interventions"])
        if outcome is None:
            outcome = random.choice(domain_data["outcomes"])

        titles = [
            f"Effects of {intervention} on {outcome}: A randomized controlled trial",
            f"{intervention} supplementation improves {outcome} in healthy adults",
            f"Clinical evaluation of {intervention} for {outcome} optimization",
            f"Randomized trial of {intervention} in {outcome} enhancement"
        ]

        return random.choice(titles)

    def _generate_study_abstract(self, domain: str, domain_data: Dict,
                                 intervention: Optional[str] = None,
                                 biomarker: Optional[str] = None,
                                 outcome: Optional[str] = None,
                                 participant_count: Optional[int] = None,
                                 duration_weeks: Optional[int] = None) -> str:
        """Return a structured abstract; unspecified parts are drawn at random.

        When called with only ``domain`` and ``domain_data`` the abstract
        describes the domain's first intervention, as before.
        """
        if intervention is None:
            intervention = domain_data["interventions"][0]
        if biomarker is None:
            biomarker = random.choice(domain_data["biomarkers"])
        if outcome is None:
            outcome = random.choice(domain_data["outcomes"])
        if participant_count is None:
            participant_count = random.randint(120, 250)
        if duration_weeks is None:
            duration_weeks = random.randint(12, 24)

        # Upper-case only the first letter: str.capitalize() would also
        # lower-case the rest and mangle acronyms ("CRP" -> "Crp").
        biomarker_label = biomarker[:1].upper() + biomarker[1:]

        lines = [
            f"Background: {intervention} has shown promise in preliminary studies for health optimization.",
            "",
            f"Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes.",
            "",
            f"Methods: Randomized, double-blind, placebo-controlled trial with {participant_count} participants aged 40-65 years.",
            f"Subjects received {intervention} or placebo for {duration_weeks} weeks.",
            "",
            f"Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05).",
            f"{biomarker_label} showed {random.randint(15, 35)}% improvement from baseline.",
            "Secondary outcomes included improved quality of life and no serious adverse events.",
            "",
            f"Conclusions: {intervention} supplementation provides significant benefits for {outcome} with excellent safety profile.",
        ]

        return "\n".join(lines)

    def _generate_dosages(self, intervention: str) -> List[str]:
        """Return two dosage strings for ``intervention``.

        Known compounds are matched by case-insensitive substring; anything
        else gets a generic pair.
        """
        dosage_ranges = {
            "NAD+": ["250mg", "500mg", "1000mg"],
            "resveratrol": ["100mg", "250mg", "500mg"],
            "berberine": ["500mg", "1000mg", "1500mg"],
            "omega-3": ["1000mg", "2000mg", "3000mg"],
            "magnesium": ["200mg", "400mg", "600mg"],
            "curcumin": ["500mg", "1000mg", "1500mg"]
        }

        needle = intervention.lower()
        for key, doses in dosage_ranges.items():
            if key.lower() in needle:
                return random.sample(doses, min(2, len(doses)))

        return ["500mg", "1000mg"]
158
+
159
class HealthProfileGenerator:
    """Generates synthetic health profiles based on simulated medical studies."""

    def __init__(self):
        # "multiplier" scales the lab values in _generate_lab_values: risk
        # markers are multiplied by it, protective markers divided by it.
        self.severity_levels = {
            "optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"},
            "mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"},
            "moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"},
            "severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"}
        }

    def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]:
        """Generate a complete health profile for a study and severity level.

        Args:
            study: Study record; "domain", "pmid" and "title" are read.
            severity: One of the ``severity_levels`` keys. An unknown value
                uses the "moderate" multiplier for the lab values.

        Returns:
            Dict with "user_tests_result_data", "user_query" and
            "source_study".
        """
        domain = study.get("domain", "longevity")
        severity_data = self.severity_levels.get(severity, self.severity_levels["moderate"])
        multiplier = severity_data["multiplier"]

        age = random.randint(35, 65)
        gender = random.choice(["male", "female"])

        labs = self._generate_lab_values(domain, multiplier)

        health_profile = {
            "user_tests_result_data": {
                "Labs": labs,
                "gut_microbiome": self._generate_gut_microbiome(severity),
                "epigenetics": self._generate_epigenetics(severity),
                "wearables": self._generate_wearables(severity),
                "cgm": self._generate_cgm(severity)
            },
            "user_query": self._generate_user_query(study, age, gender, severity),
            "source_study": {
                "pmid": study.get("pmid"),
                "domain": domain,
                "severity": severity,
                "title": study.get("title")
            }
        }

        return health_profile

    def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]:
        """Generate lab panels scaled by the severity ``multiplier``.

        A "metabolic" panel is added only for the "metabolic_health" domain.
        """
        base_labs = {
            "blood_tests": {
                "systolic_bp": int(random.randint(120, 140) * multiplier),
                "diastolic_bp": int(random.randint(70, 90) * multiplier),
                "total_cholesterol": int(random.randint(180, 220) * multiplier),
                "ldl": int(random.randint(100, 140) * multiplier),
                # Protective marker: higher severity lowers it.
                "hdl": int(random.randint(40, 60) / multiplier),
                "triglycerides": int(random.randint(80, 150) * multiplier),
                "apoB": int(random.randint(70, 110) * multiplier),
                "lp_a": random.randint(10, 50)
            },
            "inflammatory": {
                "hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1),
                "esr": int(random.randint(5, 25) * multiplier),
                "il6": round(random.uniform(1.0, 5.0) * multiplier, 1),
                "tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1),
                "oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal",
                "homocysteine": round(random.uniform(8, 15) * multiplier, 1)
            },
            "nutritional": {
                "vitamin_d": int(random.randint(25, 50) / multiplier),
                "b12": random.randint(250, 400),
                "folate": round(random.uniform(6, 14), 1),
                "iron": random.randint(60, 120),
                "ferritin": random.randint(30, 100),
                "selenium": random.randint(80, 120),
                "zinc": random.randint(70, 110),
                "magnesium": round(random.uniform(1.5, 2.2), 1),
                "omega3_index": round(random.uniform(4, 8) / multiplier, 1)
            }
        }

        if domain == "metabolic_health":
            base_labs["metabolic"] = {
                "fasting_glucose": int(random.randint(85, 110) * multiplier),
                # HbA1c scaling is capped at 1.4x.
                "hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1),
                "insulin_fasting": round(random.uniform(5, 15) * multiplier, 1),
                "homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1)
            }

        return base_labs

    def _generate_gut_microbiome(self, severity: str) -> str:
        """Return a one-line gut microbiome summary for ``severity``."""
        score_ranges = {
            "optimal": (8.5, 9.5),
            "mild": (7.0, 8.5),
            "moderate": (5.5, 7.0),
            "severe": (3.5, 5.5)
        }
        descriptions = {
            "optimal": "excellent diversity with optimal bacterial balance",
            "mild": "good diversity with minor imbalances",
            "moderate": "moderate dysbiosis with reduced beneficial bacteria",
            "severe": "significant dysbiosis with pathogenic overgrowth"
        }

        # Draw a score only for the requested severity.
        bounds = score_ranges.get(severity)
        score = random.uniform(*bounds) if bounds else 6.5

        desc = descriptions.get(severity, "moderate dysbiosis")
        return f"Diversity score {score:.1f}/10, {desc}, beneficial bacteria {random.randint(60, 90)}%"

    def _generate_epigenetics(self, severity: str) -> str:
        """Return a one-line epigenetic ageing summary for ``severity``."""
        acceleration_ranges = {
            "optimal": (-2, 1),
            "mild": (1, 3),
            "moderate": (3, 6),
            "severe": (6, 12)
        }

        bounds = acceleration_ranges.get(severity)
        acceleration = random.randint(*bounds) if bounds else 4
        # More acceleration lowers the telomere percentile; floor at 10.
        telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5)

        return f"Biological age acceleration: {acceleration} years, telomere length: {telomere_percentile}th percentile, DunedinPACE: {round(random.uniform(0.9, 1.4), 2)}"

    def _generate_wearables(self, severity: str) -> Dict[str, int]:
        """Return wearable metrics; an unknown severity uses "moderate" ranges."""
        base_ranges = {
            "optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)},
            "mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)},
            "moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)},
            "severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)}
        }

        ranges = base_ranges.get(severity, base_ranges["moderate"])
        sleep_low, sleep_high = ranges["sleep"]

        return {
            "hrv_avg": random.randint(*ranges["hrv"]),
            "rhr": random.randint(*ranges["rhr"]),
            "sleep_score": random.randint(sleep_low, sleep_high),
            # Recovery and stress ranges are derived from the sleep range.
            "recovery_score": random.randint(sleep_low - 10, sleep_high - 5),
            "stress_score": random.randint(100 - sleep_high, 100 - sleep_low + 20),
            "vo2_max": random.randint(25, 50),
            "fitness_age": random.randint(30, 65)
        }

    def _generate_cgm(self, severity: str) -> str:
        """Return a one-line continuous glucose monitor summary."""
        # (avg glucose min, avg glucose max, time-in-range min, time-in-range max)
        glucose_ranges = {
            "optimal": (80, 95, 92, 98),
            "mild": (85, 105, 85, 95),
            "moderate": (95, 120, 70, 85),
            "severe": (110, 140, 55, 75)
        }

        avg_min, avg_max, tir_min, tir_max = glucose_ranges.get(severity, glucose_ranges["moderate"])
        return f"Average glucose {random.randint(avg_min, avg_max)} mg/dL, time in range {random.randint(tir_min, tir_max)}%"

    def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str:
        """Return the first-person query for the profile.

        The severity sentence is appended only when ``severity`` is known, so
        an unknown severity no longer yields a dangling ". ." suffix.
        """
        domain = study.get("domain", "longevity")

        base_queries = {
            "longevity": f"I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols",
            "metabolic_health": f"I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control",
            "cardiovascular": f"I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization",
            "cognitive": f"I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization",
            "hormonal": f"I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols",
            "inflammation": f"I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions"
        }

        base_query = base_queries.get(domain, base_queries["longevity"])

        severity_context = {
            "optimal": "I have excellent baseline health but want to push the boundaries of optimization",
            "mild": "I have minor health concerns and want targeted interventions",
            "moderate": "I have noticeable health issues and need comprehensive protocols",
            "severe": "I have significant health challenges and require intensive interventions"
        }

        context = severity_context.get(severity, "")
        if not context:
            return f"{base_query}."
        return f"{base_query}. {context}."
331
+
332
class AIProtocolGenerator:
    """Uses OpenAI to generate health optimization protocols."""

    # USD per 1,000 tokens; models not listed here are priced as "gpt-4".
    _PRICING_PER_1K = {
        "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03}
    }

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.client = OpenAI(api_key=api_key)
        self.model = model
        # Running cost estimate in USD, accumulated by _update_cost().
        self.total_cost = 0.0

    def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]:
        """Generate a health optimization protocol with the chat API.

        Args:
            health_profile: JSON-serializable profile embedded in the prompt.
            study_context: Study record; "domain", "interventions" and
                "title" are read.
            progress_callback: Optional callable receiving status strings.

        Returns:
            The generated protocol text, or None if the request failed.
        """
        system_prompt = self._create_system_prompt(study_context)
        user_prompt = self._create_user_prompt(health_profile, study_context)

        try:
            if progress_callback:
                progress_callback(f"🔄 Generating protocol using {self.model}...")

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=4000,
                temperature=0.7,
                top_p=0.9
            )

            usage = response.usage
            self._update_cost(usage)

            if progress_callback:
                token_count = usage.total_tokens if usage is not None else "unknown"
                progress_callback(f"✅ Protocol generated ({token_count} tokens)")

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort boundary: one failed request must not abort a whole
            # dataset run, but the failure is always reported somewhere.
            message = f"❌ Error generating protocol: {e}"
            if progress_callback:
                progress_callback(message)
            else:
                print(message, file=sys.stderr)
            return None

    def _create_system_prompt(self, study_context: Dict[str, Any]) -> str:
        """Build the system prompt from the study's domain and interventions."""
        domain = study_context.get("domain", "health")
        interventions = ", ".join(study_context.get("interventions", []))

        return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols.

RESEARCH CONTEXT:
- Domain: {domain} optimization
- Key Interventions: {interventions}
- Evidence Level: Peer-reviewed clinical research

PROTOCOL REQUIREMENTS:
1. Executive Summary with current health assessment
2. Multi-Phase Protocol:
- Phase 1: Foundation (0-3 months)
- Phase 2: Optimization (3-6 months)
- Phase 3: Advanced Enhancement (6-12 months)
3. Specific supplement protocols with dosages and timing
4. Lifestyle interventions (exercise, nutrition, sleep)
5. Monitoring and assessment plans
6. Expected outcomes with realistic timelines

STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health.

SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations.

Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations."""

    def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str:
        """Build the user prompt embedding the profile as indented JSON."""
        return f"""
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST:

Health Profile Analysis:
{json.dumps(health_profile, indent=2)}

Research Context:
- Study: {study_context.get('title', 'Health Optimization Study')}
- Domain: {study_context.get('domain', 'general health')}
- Key Findings: Based on clinical research showing significant improvements in health biomarkers

Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols.
"""

    def _update_cost(self, usage):
        """Add the estimated cost of one response to ``total_cost``.

        Args:
            usage: Object with ``prompt_tokens`` and ``completion_tokens``;
                None is ignored, so a response without usage data does not
                discard an otherwise successful generation.
        """
        if usage is None:
            return

        model_pricing = self._PRICING_PER_1K.get(self.model, self._PRICING_PER_1K["gpt-4"])
        input_cost = usage.prompt_tokens * model_pricing["input"] / 1000
        output_cost = usage.completion_tokens * model_pricing["output"] / 1000

        self.total_cost += input_cost + output_cost
428
+
429
class HealthDatasetGenerator:
    """Orchestrates study simulation, profile generation and protocol generation."""

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.literature_sim = MedicalLiteratureSimulator()
        self.profile_gen = HealthProfileGenerator()
        self.protocol_gen = AIProtocolGenerator(api_key, model)
        # Result of the most recent generate_dataset() run; read by export_dataset().
        self.generated_examples = []

    def generate_dataset(self,
                         domains: Optional[List[str]] = None,
                         examples_per_domain: int = 2,
                         rate_limit_delay: float = 2.0,
                         progress_callback=None) -> Tuple[List[Dict[str, Any]], str]:
        """Generate the dataset, reporting progress through a callback.

        Args:
            domains: Research domains to cover; defaults to four built-in ones.
            examples_per_domain: Number of examples attempted per domain.
            rate_limit_delay: Seconds to sleep between consecutive examples.
            progress_callback: Optional callable receiving status strings.

        Returns:
            Tuple of (examples, summary string). Examples whose protocol
            generation failed are omitted.
        """
        if domains is None:
            domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"]

        total_examples = len(domains) * examples_per_domain

        if progress_callback:
            progress_callback(f"🚀 Starting Health Dataset Generation")
            progress_callback(f"Domains: {domains}")
            progress_callback(f"Examples per domain: {examples_per_domain}")
            progress_callback(f"Total examples to generate: {total_examples}")

        examples = []
        current_example = 0

        for domain in domains:
            if progress_callback:
                progress_callback(f"\n📂 Processing domain: {domain}")

            for i in range(examples_per_domain):
                current_example += 1
                try:
                    if progress_callback:
                        progress_callback(f" Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})")

                    # Generate study data
                    study = self.literature_sim.generate_study_data(domain)
                    if progress_callback:
                        progress_callback(f" 📚 Generated study: {study['title'][:50]}...")

                    # Create health profile
                    severity = random.choice(["mild", "moderate", "severe"])
                    health_profile = self.profile_gen.generate_profile_from_study(study, severity)
                    if progress_callback:
                        progress_callback(f" 👀 Created {severity} health profile")

                    # Generate protocol
                    protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback)

                    if protocol:
                        training_example = {
                            "user_context": health_profile,
                            "response": protocol,
                            "citations": self._generate_citations(study),
                            "metadata": {
                                "domain": domain,
                                "severity": severity,
                                "study_pmid": study["pmid"],
                                "generated_at": datetime.now().isoformat()
                            }
                        }

                        examples.append(training_example)
                        if progress_callback:
                            progress_callback(f" ✅ Complete example generated")

                    # Rate limiting: pause before every following request,
                    # including the first one of the next domain. Only the
                    # very last example of the run skips the delay.
                    if current_example < total_examples:
                        if progress_callback:
                            progress_callback(f" ⏳ Rate limit delay: {rate_limit_delay}s")
                        time.sleep(rate_limit_delay)

                except Exception as e:
                    # One bad example must not abort the whole run.
                    if progress_callback:
                        progress_callback(f" ❌ Error generating example: {e}")

        if progress_callback:
            progress_callback(f"\n🎉 Dataset generation complete!")
            progress_callback(f"Generated: {len(examples)} examples")
            progress_callback(f"Total cost: ${self.protocol_gen.total_cost:.4f}")

        self.generated_examples = examples
        return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}"

    def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]:
        """Return synthetic citation identifiers grouped by evidence tier."""
        return {
            "tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"],
            "tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"],
            "tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"],
            "real_world_cases": ["Evidence-based health optimization protocols"]
        }

    def export_dataset(self, filename: Optional[str] = None) -> Tuple[str, List[str]]:
        """Write the generated examples to a zip archive.

        The archive holds the raw dataset, a chat fine-tuning JSONL file, up
        to three sample examples and a metadata summary.

        Args:
            filename: Output path without extension; a timestamped name in
                the current directory is used when omitted.

        Returns:
            Tuple of (zip file path, list of member file names).
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"health_dataset_{timestamp}"

        # Member names must not carry directory components from `filename`.
        stem = os.path.basename(filename)

        files_created = []

        # Raw dataset
        raw_data = json.dumps(self.generated_examples, indent=2, ensure_ascii=False)
        files_created.append((f"{stem}.json", raw_data))

        # Fine-tuning format: one chat transcript per line
        fine_tune_lines = []
        for example in self.generated_examples:
            fine_tune_example = {
                "messages": [
                    {
                        "role": "system",
                        "content": "You are an advanced AI health optimization system that creates evidence-based protocols."
                    },
                    {
                        "role": "user",
                        "content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}"
                    },
                    {
                        "role": "assistant",
                        "content": example["response"]
                    }
                ]
            }
            fine_tune_lines.append(json.dumps(fine_tune_example, ensure_ascii=False))

        fine_tune_data = '\n'.join(fine_tune_lines)
        files_created.append((f"{stem}_fine_tuning.jsonl", fine_tune_data))

        # Sample examples
        sample_size = min(3, len(self.generated_examples))
        sample_data = json.dumps(self.generated_examples[:sample_size], indent=2, ensure_ascii=False)
        files_created.append((f"{stem}_samples.json", sample_data))

        # Metadata
        metadata = {
            "generation_info": {
                "generated_at": datetime.now().isoformat(),
                "total_examples": len(self.generated_examples),
                "total_cost": self.protocol_gen.total_cost,
                "model_used": self.protocol_gen.model
            },
            "domains_covered": list(set(ex["metadata"]["domain"] for ex in self.generated_examples)),
            "severity_distribution": {
                severity: sum(1 for ex in self.generated_examples if ex["metadata"]["severity"] == severity)
                for severity in ["mild", "moderate", "severe"]
            }
        }

        metadata_data = json.dumps(metadata, indent=2, ensure_ascii=False)
        files_created.append((f"{stem}_metadata.json", metadata_data))

        # Write the archive straight to disk.
        zip_filename = f"{filename}.zip"
        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_name, file_content in files_created:
                zip_file.writestr(file_name, file_content)

        file_list = [name for name, _ in files_created]
        return zip_filename, file_list
600
+
601
+ # =====================================================================
602
+ # STEP 3: GRADIO INTERFACE
603
+ # =====================================================================
604
+
605
+ class HealthDatasetGradioInterface:
606
+ """Gradio web interface for the health dataset generator"""
607
+
608
def __init__(self):
    """Set up the interface with no generator and the list of known domains."""
    # Domain names come from the simulator's built-in vocabulary.
    simulator = MedicalLiteratureSimulator()
    domain_names = list(simulator.research_domains.keys())
    # The generator is created later, once an API key is supplied.
    self.generator = None
    self.available_domains = domain_names
611
+
612
def estimate_cost(self, domains, examples_per_domain, model):
    """Estimate the generation cost for the selected options.

    Args:
        domains: Selected domain names.
        examples_per_domain: Number of examples per domain.
        model: Model name; unknown models use the gpt-4 rate.

    Returns:
        A display string with the estimate, or a prompt to select a domain.
    """
    if not domains:
        return "Please select at least one domain"

    total_examples = len(domains) * examples_per_domain

    # Rough flat USD cost per generated example.
    cost_per_example = {
        "gpt-3.5-turbo": 0.05,
        "gpt-4": 0.25,
        "gpt-4-turbo": 0.15
    }

    estimated_cost = total_examples * cost_per_example.get(model, 0.25)

    return f"💰 Estimated cost: ${estimated_cost:.2f} for {total_examples} examples"
628
+
629
def validate_inputs(self, api_key, domains, examples_per_domain):
    """Validate the user inputs.

    Args:
        api_key: OpenAI API key text; blank or whitespace-only is rejected.
        domains: Selected domain names; must be non-empty.
        examples_per_domain: Must be between 1 and 10 inclusive.

    Returns:
        Tuple of (is_valid, message for display).
    """
    if not api_key or not api_key.strip():
        return False, "❌ Please provide your OpenAI API key"

    if not domains:
        return False, "❌ Please select at least one domain"

    if not 1 <= examples_per_domain <= 10:
        return False, "❌ Examples per domain must be between 1 and 10"

    return True, "✅ Inputs are valid"
641
+
642
def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit):
    """Run dataset generation for the Gradio interface.

    This is a generator: each yielded value is a 5-tuple of
    (progress text, summary, preview, zip file path, file list).
    Progress messages collected during generation are shown with the final
    yield, because generation runs to completion inside a single call.
    """
    # Validate inputs
    is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain)
    if not is_valid:
        yield message, "", "", None, None
        return

    # UI controls may deliver a float (e.g. 2.0); range() downstream needs an int.
    examples_per_domain = int(examples_per_domain)

    # Initialize generator
    try:
        self.generator = HealthDatasetGenerator(api_key.strip(), model)
    except Exception as e:
        yield f"❌ Error initializing generator: {e}", "", "", None, None
        return

    # Progress tracking
    progress_messages = []

    def progress_callback(message):
        """Record a message and return the last 20 messages as one text."""
        progress_messages.append(message)
        return "\n".join(progress_messages[-20:])

    try:
        # Generate dataset
        yield "🚀 Starting dataset generation...", "", "", None, None

        dataset, summary = self.generator.generate_dataset(
            domains=domains,
            examples_per_domain=examples_per_domain,
            rate_limit_delay=rate_limit,
            progress_callback=progress_callback
        )

        if not dataset:
            yield "❌ No examples generated", "", "", None, None
            return

        # Export dataset
        progress_callback("💾 Exporting dataset...")
        zip_filename, file_list = self.generator.export_dataset()

        # Create preview
        preview = self.create_dataset_preview(dataset)

        # Final progress
        final_progress = progress_callback(f"🎉 Generation complete! Files: {', '.join(file_list)}")

        yield final_progress, summary, preview, zip_filename, file_list

    except Exception as e:
        # UI boundary: surface the failure in the progress box.
        yield f"❌ Error during generation: {e}", "", "", None, None
695
+
696
def create_dataset_preview(self, dataset):
    """Create a Markdown preview of the generated dataset.

    Args:
        dataset: List of generated examples with "metadata", "user_context"
            and "response" keys.

    Returns:
        Markdown text with counts per domain and severity plus details of
        the first example, or a placeholder when the dataset is empty.
    """
    if not dataset:
        return "No data to preview"

    def count_values(values):
        # Single pass; keys keep first-seen order so the text is deterministic.
        counts = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        return counts

    preview = "📄 **Dataset Preview**\n\n"

    # Summary statistics
    preview += f"**Total Examples:** {len(dataset)}\n"

    # Domain distribution
    domain_counts = count_values(ex['metadata']['domain'] for ex in dataset)
    preview += f"**Domain Distribution:** {domain_counts}\n"

    # Severity distribution
    severity_counts = count_values(ex['metadata']['severity'] for ex in dataset)
    preview += f"**Severity Distribution:** {severity_counts}\n\n"

    # Sample example (dataset is non-empty here)
    example = dataset[0]
    preview += "**Sample Example:**\n"
    preview += f"- **Domain:** {example['metadata']['domain']}\n"
    preview += f"- **Severity:** {example['metadata']['severity']}\n"
    preview += f"- **User Query:** {example['user_context']['user_query'][:150]}...\n"
    preview += f"- **Response Length:** {len(example['response'])} characters\n"
    preview += f"- **PMID:** {example['metadata']['study_pmid']}\n"

    return preview
727
+
728
def analyze_dataset_file(self, zip_file):
    """Analyze an uploaded dataset archive and report quality metrics.

    The first ``*.json`` member of the archive that is not a
    ``*_samples.json`` or ``*_metadata.json`` file is treated as the dataset.

    Args:
        zip_file: The uploaded archive. Either a path (``str``, ``bytes`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A Markdown report (example count, average response length and three
        quality ratios) on success; otherwise a plain message describing why
        no analysis could be produced. Errors are reported in the returned
        string rather than raised, because the result is shown in the UI.
    """
    # Local import keeps this method independent of the module's import block.
    import os

    if zip_file is None:
        return "No file uploaded"

    # Accept bare paths as well as upload wrappers that carry the path in
    # ``.name``. Paths are checked first because pathlib objects also have a
    # ``.name`` attribute, which holds only the final path component.
    if isinstance(zip_file, (str, bytes, os.PathLike)):
        zip_path = zip_file
    else:
        zip_path = zip_file.name

    try:
        # Read the zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Look for the main dataset file
            json_files = [
                f for f in zip_ref.namelist()
                if f.endswith('.json')
                and not f.endswith(('_samples.json', '_metadata.json'))
            ]
            if not json_files:
                return "No dataset JSON file found in zip"

            with zip_ref.open(json_files[0]) as f:
                dataset = json.load(f)

        total = len(dataset)
        if total == 0:
            # Every metric below divides by the example count.
            return "Dataset file contains no examples"

        responses = [ex['response'] for ex in dataset]
        average_length = sum(len(r) for r in responses) / total

        # Quality checks
        long_responses = sum(1 for r in responses if len(r) > 2000)
        has_phases = sum(1 for r in responses if "Phase" in r)
        has_dosages = sum(1 for r in responses if re.search(r'\d+\s*mg', r))

        return "".join([
            "πŸ“Š **Dataset Analysis**\n\n",
            f"**Total Examples:** {total}\n",
            f"**Average Response Length:** {average_length:.0f} characters\n",
            "**Quality Metrics:**\n",
            f"- Responses >2000 chars: {long_responses}/{total} ({long_responses/total*100:.1f}%)\n",
            f"- Responses with phases: {has_phases}/{total} ({has_phases/total*100:.1f}%)\n",
            f"- Responses with dosages: {has_dosages}/{total} ({has_dosages/total*100:.1f}%)\n",
        ])

    except Exception as e:
        # UI boundary: surface any failure (bad zip, bad JSON, missing keys)
        # as text instead of crashing the handler.
        return f"Error analyzing file: {e}"
764
+
765
def create_interface(self):
    """Create the Gradio interface

    Builds a ``gr.Blocks`` app with three tabs (generate, analyze,
    information), wires the event handlers and returns the app without
    launching it.

    Reads ``self.available_domains`` for the domain checkbox choices and
    binds ``self.estimate_cost``, ``self.generate_dataset_interface`` and
    ``self.analyze_dataset_file`` as callbacks.

    Returns:
        The ``gr.Blocks`` object created here.

    NOTE(review): the nesting of the ``with`` blocks below was reconstructed
    from a flattened listing - confirm the layout hierarchy against the
    original file.
    """

    with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
        # πŸ₯ Medical Literature Health Dataset Generator

        This tool generates synthetic health optimization datasets based on medical literature patterns.
        Perfect for training AI models on evidence-based health protocols.

        ⚠️ **Important:** Generated content is for research/educational purposes only. Not medical advice.
        """)

        with gr.Tab("πŸ“Š Generate Dataset"):

            with gr.Row():
                # Left column: every input the generation handler consumes.
                with gr.Column(scale=1):
                    gr.Markdown("### βš™οΈ Configuration")

                    api_key = gr.Textbox(
                        label="OpenAI API Key",
                        placeholder="sk-...",
                        type="password",
                        info="Your OpenAI API key for generating protocols"
                    )

                    domains = gr.CheckboxGroup(
                        label="Research Domains",
                        choices=self.available_domains,
                        value=["longevity", "metabolic_health"],
                        info="Select medical research domains to include"
                    )

                    examples_per_domain = gr.Slider(
                        label="Examples per Domain",
                        minimum=1,
                        maximum=10,
                        value=2,
                        step=1,
                        info="Number of examples to generate for each domain"
                    )

                    model = gr.Dropdown(
                        label="OpenAI Model",
                        choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"],
                        value="gpt-4",
                        info="Model for generating protocols (GPT-4 recommended for quality)"
                    )

                    rate_limit = gr.Slider(
                        label="Rate Limit Delay (seconds)",
                        minimum=0.5,
                        maximum=5.0,
                        value=2.0,
                        step=0.5,
                        info="Delay between API calls to avoid rate limits"
                    )

                    # Read-only; rewritten by the change handlers registered below.
                    cost_estimate = gr.Textbox(
                        label="Cost Estimate",
                        value="Select domains and examples to see estimate",
                        interactive=False
                    )

                    generate_btn = gr.Button(
                        "πŸš€ Generate Dataset",
                        variant="primary",
                        size="lg"
                    )

                # Right column: read-only outputs of the generation handler.
                with gr.Column(scale=2):
                    gr.Markdown("### πŸ“ˆ Progress & Results")

                    progress_output = gr.Textbox(
                        label="Generation Progress",
                        lines=15,
                        max_lines=20,
                        value="Ready to generate dataset...",
                        interactive=False
                    )

                    summary_output = gr.Textbox(
                        label="Generation Summary",
                        lines=3,
                        interactive=False
                    )

                    preview_output = gr.Markdown(
                        label="Dataset Preview",
                        value="Dataset preview will appear here..."
                    )

            with gr.Row():
                download_file = gr.File(
                    label="πŸ“₯ Download Generated Dataset",
                    interactive=False
                )

                file_list = gr.Textbox(
                    label="Generated Files",
                    placeholder="Files included in download will be listed here",
                    interactive=False
                )

        with gr.Tab("πŸ“Š Analyze Dataset"):
            gr.Markdown("### πŸ“‹ Dataset Analysis")
            gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.")

            with gr.Row():
                with gr.Column():
                    upload_file = gr.File(
                        label="Upload Dataset Zip File",
                        file_types=[".zip"]
                    )

                    analyze_btn = gr.Button(
                        "πŸ” Analyze Dataset",
                        variant="secondary"
                    )

                with gr.Column():
                    analysis_output = gr.Markdown(
                        label="Analysis Results",
                        value="Upload a dataset file to see analysis..."
                    )

        with gr.Tab("ℹ️ Information"):
            gr.Markdown("""
            ### πŸ“š How It Works

            1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes
            2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels
            3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols
            4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format

            ### 🎯 Output Files

            - **`dataset.json`**: Complete raw dataset
            - **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format
            - **`dataset_samples.json`**: Sample examples for review
            - **`dataset_metadata.json`**: Generation statistics and info

            ### πŸ’° Cost Information

            - **GPT-3.5-turbo**: ~$0.05 per example
            - **GPT-4**: ~$0.25 per example
            - **GPT-4-turbo**: ~$0.15 per example

            ### ⚠️ Important Notes

            - Generated content is for **research/educational purposes only**
            - **Not medical advice** - always consult healthcare professionals
            - Include appropriate medical disclaimers when using generated content
            - Review sample outputs before using in production

            ### πŸ”§ Recommended Settings

            - **Start small**: Generate 2-4 examples first to test quality
            - **Use GPT-4**: Better quality than GPT-3.5-turbo
            - **Rate limiting**: Use 2+ second delays to avoid API limits
            - **Multiple domains**: Include diverse domains for comprehensive dataset
            """)

        # Event handlers

        # Update cost estimate when inputs change
        def update_cost_estimate(domains, examples_per_domain, model):
            # Thin wrapper that forwards the three values to self.estimate_cost.
            return self.estimate_cost(domains, examples_per_domain, model)

        # The same handler is registered on each of the three inputs, so a
        # change to any one of them refreshes the estimate.
        for input_component in [domains, examples_per_domain, model]:
            input_component.change(
                fn=update_cost_estimate,
                inputs=[domains, examples_per_domain, model],
                outputs=[cost_estimate]
            )

        # Generate dataset
        generate_btn.click(
            fn=self.generate_dataset_interface,
            inputs=[api_key, domains, examples_per_domain, model, rate_limit],
            outputs=[progress_output, summary_output, preview_output, download_file, file_list]
        )

        # Analyze dataset
        analyze_btn.click(
            fn=self.analyze_dataset_file,
            inputs=[upload_file],
            outputs=[analysis_output]
        )

    return interface
957
+
958
+ # =====================================================================
959
+ # STEP 4: LAUNCH THE INTERFACE
960
+ # =====================================================================
961
+
962
def main():
    """Build the Gradio app and serve it.

    Prints a two-line startup banner, constructs the interface through
    ``HealthDatasetGradioInterface.create_interface`` and launches it on
    port 7860, bound to all network interfaces, with a public share link.
    """
    for banner_line in (
        "πŸš€ Launching Medical Literature Health Dataset Generator",
        "This will start a web interface accessible through your browser",
    ):
        print(banner_line)

    # Build the UI
    app = HealthDatasetGradioInterface().create_interface()

    # Launch settings, gathered in one place
    launch_options = {
        "share": True,             # Creates public link for sharing
        "server_name": "0.0.0.0",  # Makes it accessible from other devices
        "server_port": 7860,       # Default Gradio port
        "show_error": True,        # Show detailed errors
        "quiet": False,            # Show startup info
    }
    app.launch(**launch_options)


if __name__ == "__main__":
    main()
983
+
984
+ # For Google Colab, uncomment the following:
985
+ # main()