Ffftdtd5dtft committed on
Commit
37a01aa
verified
1 Parent(s): ad03500

Create app.py

Files changed (1)
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
# Install dependencies (notebook-style syntax; run `pip install ...` from a shell if executing as a plain script)
!pip install torch==2.0.1 transformers==4.27.1 datasets==2.4.0 wget==3.2 huggingface-hub==0.14.1 beautifulsoup4==4.11.1 requests==2.28.1 matplotlib tqdm python-dotenv diffusers

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import time
import threading
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Pipeline, pipeline
from diffusers import DiffusionPipeline
from huggingface_hub import login, HfApi, Repository
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

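# The login and push steps below read HUGGINGFACE_TOKEN from the environment.
# A minimal .env file next to the script would look like this (placeholder
# value; an editor's sketch, not part of the original file):
#
#   HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxxxxxx
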
class UnifiedModel(nn.Module):
    def __init__(self, models):
        super().__init__()
        # Only true nn.Module models can live in an nn.ModuleList; transformers
        # Pipeline and DiffusionPipeline objects are kept in a plain list.
        self.models = nn.ModuleList([m for m in models if isinstance(m, nn.Module)])
        self.pipelines = [m for m in models if isinstance(m, (Pipeline, DiffusionPipeline))]
        self.classifier = nn.Linear(sum(m.config.hidden_size for m in self.models if hasattr(m, 'config')), 2)

    def forward(self, inputs, attention_masks=None):
        # `inputs` holds one input_ids tensor per tokenized model, in the same
        # order as self.models. Pipelines do not return hidden states that can
        # be concatenated, so only the nn.Module models contribute features.
        if attention_masks is None:
            attention_masks = [None] * len(inputs)
        hidden_states = []
        for model, input_ids, attention_mask in zip(self.models, inputs, attention_masks):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states.append(outputs.last_hidden_state[:, 0, :])
        concatenated_hidden_states = torch.cat(hidden_states, dim=-1)
        logits = self.classifier(concatenated_hidden_states)
        return logits


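# Shape sketch (editor's illustration, not in the original file): with two
# encoders whose hidden sizes are 768 and 1024, the classifier receives
# 768 + 1024 = 1792 features per example:
#
#   h1 = torch.randn(4, 768)    # [CLS] states from encoder 1, batch of 4
#   h2 = torch.randn(4, 1024)   # [CLS] states from encoder 2
#   torch.cat([h1, h2], dim=-1).shape  # torch.Size([4, 1792])
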
class SyntheticDataset(Dataset):
    def __init__(self, tokenizers, size=100):
        self.tokenizers = tokenizers
        self.size = size
        self.data = self._generate_data()

    def _generate_data(self):
        data = []
        for _ in range(self.size):
            text = "This is a sample sentence for testing purposes."
            label = torch.tensor(0)  # Sample label
            item = {"text": text, "label": label}
            for name, tokenizer in self.tokenizers.items():
                tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=128)
                item[f"input_ids_{name}"] = torch.tensor(tokenized["input_ids"])
                item[f"attention_mask_{name}"] = torch.tensor(tokenized["attention_mask"])
            data.append(item)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


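# One dataset item, sketched for a single tokenizer registered under the
# (hypothetical) name "gpt2":
#
#   {"text": "This is a sample sentence for testing purposes.",
#    "label": tensor(0),
#    "input_ids_gpt2": LongTensor of shape (128,),
#    "attention_mask_gpt2": LongTensor of shape (128,)}
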
def push_to_hub(local_dir, repo_name):
    try:
        repo_url = HfApi().create_repo(repo_name, exist_ok=True)
        repo = Repository(local_dir, clone_from=repo_url)

        if not os.path.exists(os.path.join(local_dir, ".git")):
            os.system(f"cd {local_dir} && git init && git remote add origin {repo_url} && git pull origin main")

        repo.git_add(auto_lfs_track=True)
        repo.git_commit("Add model and tokenizer files")

        json_files = ["config.json", "generation_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer.model", "tokenizer_config.json"]
        for json_file in json_files:
            json_file_path = os.path.join(local_dir, json_file)
            if os.path.exists(json_file_path):
                repo.git_add(json_file_path)

        repo.git_push()
        print(f"Pushed model and tokenizer to {repo_url}")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")


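# Usage sketch (hypothetical repo id; create_repo(..., exist_ok=True) makes
# the call safe to repeat):
#
#   push_to_hub("./outputs/unified_model", "your-username/my_model")
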
def main():
    while True:
        try:
            os.system("git config --global credential.helper store")
            login(token=os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

            # Define the models to use
            models_to_train = [
                "openai-community/gpt2-xl",
                "google/gemma-2-9b-it",
                "google/gemma-2-9b",
                "meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3.1-8B",
                "openbmb/MiniCPM-V-2_6",
                "bigcode/starcoder",
                "WizardLMTeam/WizardCoder-Python-34B-V1.0",
                "Qwen/Qwen2-72B-Instruct",
                "google/gemma-2-2b-it",
                "facebook/bart-large-cnn",
                "Falconsai/text_summarization",
                "microsoft/speecht5_tts",
                "Groq/Llama-3-Groq-70B-Tool-Use",
                "Groq/Llama-3-Groq-8B-Tool-Use"
            ]

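            # Note: several of these checkpoints (e.g. the google/gemma-2 and
            # meta-llama/Meta-Llama-3.1 families) are gated on the Hub, so the
            # login() call above must use a token whose account has accepted
            # the corresponding model licenses.
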
            # Initialize the pipelines
            pipelines_to_unify = [
                pipeline("text-to-audio", model="facebook/musicgen-melody"),
                pipeline("text-to-audio", model="facebook/musicgen-large"),
                pipeline("text-to-audio", model="facebook/musicgen-small"),
                DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt-1-1"),
                pipeline("automatic-speech-recognition", model="openai/whisper-small"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
                DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell"),
                pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B"),
                pipeline("text-generation", model="openbmb/MiniCPM-V-2_6"),
                pipeline("text-generation", model="bigcode/starcoder"),
                pipeline("text-to-speech", model="microsoft/speecht5_tts"),
                pipeline("text-generation", model="WizardLMTeam/WizardCoder-Python-34B-V1.0"),
                pipeline("text-generation", model="Qwen/Qwen2-72B-Instruct"),
                pipeline("text-generation", model="google/gemma-2-2b-it"),
                pipeline("summarization", model="facebook/bart-large-cnn"),
                pipeline("summarization", model="Falconsai/text_summarization"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),  # note: duplicate of an earlier entry
                pipeline("text-to-audio", model="facebook/musicgen-small"),  # note: duplicate of an earlier entry
                pipeline("text-generation", model="Groq/Llama-3-Groq-70B-Tool-Use"),
                pipeline("text-generation", model="Groq/Llama-3-Groq-8B-Tool-Use")
            ]

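            # Note (editor's sketch, not in the original): building every
            # pipeline eagerly loads all weights at once, far more than most
            # single machines can hold. One alternative is to keep zero-argument
            # constructors and build a pipeline only when it is actually used:
            #
            #   lazy = {"bart": lambda: pipeline("summarization", model="facebook/bart-large-cnn")}
            #   summarizer = lazy["bart"]()  # instantiated on first use
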
            tokenizers = {}
            models = []
            for model_name in models_to_train:
                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

                if tokenizer.pad_token is None:
                    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

                model = AutoModel.from_pretrained(model_name)
                tokenizers[model_name] = tokenizer
                models.append(model)

            # Add the pipelines as models
            models.extend(pipelines_to_unify)

            # Create a synthetic dataset for training and evaluation
            synthetic_dataset = SyntheticDataset(tokenizers, size=100)

            # Split the dataset into training and evaluation sets
            train_size = int(0.8 * len(synthetic_dataset))
            val_size = len(synthetic_dataset) - train_size
            train_dataset, val_dataset = torch.utils.data.random_split(synthetic_dataset, [train_size, val_size])

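            # With size=100 and an 80/20 split: train_size = int(0.8 * 100) = 80
            # and val_size = 100 - 80 = 20 examples.
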
            # Create DataLoaders for training and evaluation
            train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
            eval_loader = DataLoader(val_dataset, batch_size=16)

            # Unify the models and pipelines into a single model
            unified_model = UnifiedModel(models)
            unified_model.to(torch.device("cpu"))

            # Show the total number of parameters to train
            total_params = sum(p.numel() for p in unified_model.parameters())
            print(f"Total parameters to train: {total_params}")

            # Define the training arguments
            training_args = TrainingArguments(
                output_dir="outputs/unified_model",
                evaluation_strategy="epoch",
                learning_rate=9e-4,
                per_device_train_batch_size=2,
                per_device_eval_batch_size=16,
                num_train_epochs=1,  # Reduced epochs for quick training
                weight_decay=0.01,
                logging_steps=10,  # More frequent logging for quicker feedback
                optim="adamw_hf"
            )

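            # Note: no Trainer is constructed here; TrainingArguments serves
            # purely as a configuration container for the manual loop below,
            # which reads learning_rate, num_train_epochs, and logging_steps.
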
            # Define the optimizer
            optimizer = AdamW(unified_model.parameters(), lr=training_args.learning_rate)

            train_losses = []
            eval_losses = []

            def train(model, train_loader, eval_loader, args):
                model.train()
                epoch = 0
                total_steps = args.num_train_epochs * len(train_loader)
                progress_bar = tqdm(total=total_steps, desc="Training")

                while epoch < args.num_train_epochs:
                    start_time = time.time()
                    for step, batch in enumerate(train_loader):
                        input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
                        attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
                        labels = batch["label"].to("cpu")
                        optimizer.zero_grad()
                        outputs = model(input_ids, attention_mask)
                        loss = nn.CrossEntropyLoss()(outputs, labels)
                        loss.backward()
                        optimizer.step()
                        progress_bar.update(1)

                        elapsed_time = time.time() - start_time
                        estimated_total_time = total_steps * (elapsed_time / (step + 1))
                        estimated_remaining_time = estimated_total_time - elapsed_time

                        if step % args.logging_steps == 0:
                            train_losses.append(loss.item())
                            print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")

                    epoch += 1
                    model.eval()
                    eval_loss = 0
                    with torch.no_grad():
                        for batch in eval_loader:
                            input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
                            attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
                            labels = batch["label"].to("cpu")
                            outputs = model(input_ids, attention_mask)
                            loss = nn.CrossEntropyLoss()(outputs, labels)
                            eval_loss += loss.item()

                    eval_loss /= len(eval_loader)
                    eval_losses.append(eval_loss)
                    print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")

                progress_bar.close()

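            # Because every synthetic label is 0, CrossEntropyLoss over the two
            # logits should fall toward zero quickly; the loop is a smoke test
            # of the plumbing rather than meaningful training.
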
            train(unified_model, train_loader, eval_loader, training_args)

            # Visualize losses during training
            fig, ax = plt.subplots()

            def animate(i):
                # ax.clear() wipes labels and legend, so reset them each frame
                ax.clear()
                ax.set_xlabel("Epochs")
                ax.set_ylabel("Loss")
                ax.plot(train_losses[:i], label="Train Loss")
                ax.plot(eval_losses[:i], label="Eval Loss")
                ax.legend()

            ani = animation.FuncAnimation(fig, animate, frames=len(train_losses), blit=False)
            plt.show()

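            # In a notebook with a static backend, plt.show() renders only the
            # final frame; one option (editor's sketch) is to render the
            # animation as JavaScript instead:
            #
            #   from IPython.display import HTML
            #   HTML(ani.to_jshtml())
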
            # Push the unified model to the Hugging Face Hub
            local_dir = "./outputs/unified_model"
            push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")

            break
        except Exception as e:
            print(f"Error: {e}")
            time.sleep(2)


if __name__ == "__main__":
    main()