{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "BPkyxq4uSYee" }, "source": [ "### Instalación de Dependencias" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SCmXa_xmd3c4" }, "outputs": [], "source": [ "!pip install -qU transformers datasets accelerate huggingface_hub gradio pandas bitsandbytes peft gradio" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MuY2p53aDzeq", "outputId": "5265031a-6b23-4c2e-d14e-b19010a2b096" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cuda\n" ] } ], "source": [ "import torch\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "print(device)" ] }, { "cell_type": "markdown", "metadata": { "id": "oj-7CwC1pqyy" }, "source": [ "### Autenticación Hugging Face" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 17, "referenced_widgets": [ "71e0e8b50884432cae7372a60d28d748", "06bdcf1e4b064557ac5dad826639db30", "9f7af2fbe8df47e281298c38829fedab", "96445071e8584eee87423256d63a1ab2", "150b57ea5c0d42fdb88bd5e21e0f98a3", "af3c4f61dce24115b3d6fc42ae1c13fc", "9543cb5dea6e452fbb8c2f5b0e7868ab", "a6eff3772c5d4ddbbf52094ba7ce3505", "d5ce277ef97c40319e0b4848c744119a", "45a9701cabfe47e38fe9602caaba606a", "0b5993733594406483435639f4c251b3", "45bae474a32d480092c568b6c601d126", "2244d8e865e64bf8acb69bd1f346d8cc", "6492dd6ac527484aa5a4c32a4823bce8", "d566b0b841944a3c942bf139abc83a74", "0ad42a0ba1f34b06ac934e9306084294", "a519419cab844898b2bddbee0241923e", "3ff529fd2cab48f1a4c865d55fb0fb18", "37a1ff79aeab4c1681bb8e3c2ec09620", "e7b3308063814e32bceaa848a5e6e7fe" ] }, "id": "nKR_zzYNpqQG", "outputId": "b3f8b925-b773-4d4e-a2c7-c93d8576fa16" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "71e0e8b50884432cae7372a60d28d748", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
[INST] Explica: {tag} [/INST]{desc}\"\n", " for tag, desc in zip(examples['tag'], examples['description'])]\n", " }\n", "\n", "dataset = dataset.map(format_instruction, batched=True, remove_columns=dataset[\"train\"].column_names)" ] }, { "cell_type": "markdown", "metadata": { "id": "JSYdZXmem0Bv" }, "source": [ "### Tokenización" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81, "referenced_widgets": [ "08ae5a36a6034bfea641dfe693f42832", "a90a0bc4c87749e1abeae25d710792c8", "ea19b8ccdb0143619a1c09aa89252436", "db59e78371c04c4d9c984fd63db30d24", "3481ce6b918c41e7b7b706c640ed21f3", "b6a71099e44846f8b8f7698312139c0b", "63d7d550ae6b4d0da740154f83ef4d94", "1c36246304114ddf9f5099f1607fc4b7", "7f72634e669542eca99d04d5a5abe4f1", "aea1369ea7924bea88422a4f9da098c8", "86df44b70df045deb6dfc15204295e50", "db4c72a9daf0461fafc74cb7fd251343", "a3fa0c5a93754c03836ec52fbc5fba6f", "13492cdaf3c642069c12abb109deee0e", "b2c11e48dd7e47b195b78516b229c8c7", "1da3355e42f144e9ab2ff4bdabc80dfa", "5395ba26d89641a897bf464ebb4741da", "b6aa07e20e204fb9a0ef56cc6055c3a0", "04934075bac1482c9da487ee5a090e65", "492dd3401ef84a0397ea3d2ab82c46dd", "c342d2330f0a4eb48826b6fc1c127167", "ce7fc2e186814c3eb9cb034849506431" ] }, "id": "uwasoOTHgoJK", "outputId": "60b5a180-7471-45f9-b322-113ae7a84bf1" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "08ae5a36a6034bfea641dfe693f42832", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/52 [00:00:9: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " \"input_ids\": torch.stack([torch.tensor(f[\"input_ids\"]) for f in features]),\n", ":10: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " \"attention_mask\": torch.stack([torch.tensor(f[\"attention_mask\"]) for f in features]),\n", ":11: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " \"labels\": torch.stack([torch.tensor(f[\"input_ids\"]) for f in features])\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Verificación de batch:\n", "Input ids type: \n", "Labels shape: torch.Size([2, 512])\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [39/39 02:15, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
512.130200
103.432800
150.502100
200.297100
250.232200
300.199000
350.174700

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=39, training_loss=2.1929761950786295, metrics={'train_runtime': 140.2841, 'train_samples_per_second': 1.112, 'train_steps_per_second': 0.278, 'total_flos': 3409289020440576.0, 'train_loss': 2.1929761950786295, 'epoch': 3.0})" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import TrainingArguments, Trainer\n", "\n", "# 1. Configurar formato del dataset como tensores\n", "tokenized_dataset.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\"])\n", "\n", "# 2. Data collator mejorado\n", "def custom_collator(features):\n", " return {\n", " \"input_ids\": torch.stack([torch.tensor(f[\"input_ids\"]) for f in features]),\n", " \"attention_mask\": torch.stack([torch.tensor(f[\"attention_mask\"]) for f in features]),\n", " \"labels\": torch.stack([torch.tensor(f[\"input_ids\"]) for f in features])\n", " }\n", "\n", "# 3. Configurar argumentos con parámetros faltantes\n", "training_args = TrainingArguments(\n", " output_dir=\"./html5-lora\",\n", " per_device_train_batch_size=2,\n", " gradient_accumulation_steps=2, # Reducir para ahorrar memoria\n", " num_train_epochs=3,\n", " learning_rate=3e-4,\n", " fp16=torch.cuda.is_available(),\n", " logging_steps=5,\n", " report_to=\"none\",\n", " remove_unused_columns=False, # Necesario para LoRA\n", " label_names=[\"labels\"] # Añadir parámetro faltante\n", ")\n", "\n", "# 4. Crear Trainer con parámetros actualizados\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_dataset[\"train\"],\n", " eval_dataset=tokenized_dataset[\"test\"],\n", " data_collator=custom_collator\n", ")\n", "\n", "# 5. Verificación adicional\n", "sample_batch = next(iter(trainer.get_train_dataloader()))\n", "print(\"\\nVerificación de batch:\")\n", "print(f\"Input ids type: {type(sample_batch['input_ids'][0])}\")\n", "print(f\"Labels shape: {sample_batch['labels'].shape}\")\n", "\n", "# 6. Iniciar entrenamiento\n", "trainer.train()" ] }, { "cell_type": "markdown", "metadata": { "id": "hm89m0JCtYnY" }, "source": [ "### Generación de Respuestas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rukjNbmftfCv", "outputId": "e7c3781f-1a33-4a43-9c8c-4eb1e68589e2" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cuda:0\n", "The model 'PeftModelForCausalLM' is not supported for text-generation. 
Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'GraniteMoeSharedForCausalLM', 'HeliumForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MllamaForCausalLM', 'MoshiForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyForCausalLM', 'MvpForCausalLM', 'NemotronForCausalLM', 'OlmoForCausalLM', 'Olmo2ForCausalLM', 'OlmoeForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'PhimoeForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RecurrentGemmaForCausalLM', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'WhisperForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM', 'ZambaForCausalLM', 'Zamba2ForCausalLM'].\n" ] } ], "source": [ "from transformers import pipeline\n", "chatbot = pipeline(\n", " \"text-generation\",\n", " model = model,\n", " tokenizer = tokenizer,\n", " torch_dtype = torch.float16\n", ")\n", "\n", "def generate_response(query):\n", " prompt = f\"[INST] Pregunta HTML5: {query} [/INST]\"\n", " response = chatbot(\n", " prompt,\n", " max_new_tokens = 200,\n", " temperature = 0.3,\n", " do_sample = True,\n", " pad_token_id = tokenizer.eos_token_id\n", " )\n", " return response[0]['generated_text'].split(\"[/INST]\")[-1].strip()\n", "\n", "\n", "\n", "def generate_response_gradio(query):\n", " try:\n", " # Manejar casos no técnicos primero\n", " if query.lower().strip() in [\"hola\", \"hi\", \"ayuda\"]:\n", " return \"¡Hola! Soy un asistente de HTML5. 
Ejemplo: '¿Cómo usar ?'\"\n", "\n", " # Formato especial para Mistral\n", " prompt = f\"[INST] {query} [/INST]\"\n", "\n", " # Generación con parámetros optimizados\n", " outputs = chatbot(\n", " prompt,\n", " max_new_tokens=150,\n", " num_return_sequences=1,\n", " pad_token_id=tokenizer.eos_token_id\n", " )\n", "\n", " return outputs[0]['generated_text'].split(\"[/INST]\")[-1].strip()\n", "\n", " except Exception as e:\n", " return f\"Error: {str(e)}\" # Debuggear fallos" ] }, { "cell_type": "markdown", "metadata": { "id": "B-h5SZvDw07t" }, "source": [ " ### Pruebas de Validación Mejoradas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pAomdkQww684", "outputId": "2608454c-a4c1-4fde-bc6a-cbd2d6853e91" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Caso 3/3\n", "Pregunta: ¿Cómo usar

<details> y <summary>?\n", "Respuesta: <details> y <summary> son elementos HTML5 que permiten mostrar contenido oculto, como respuestas a preguntas, descripciones de listas o detalles adicionales. El <summary> es el elemento visible, mientras que el contenido oculto se muestra al expandirse.\n", "Tiempo: 6.26s\n", "Esperado: <details> crea un widget desplegable, <summary> define el título visible\n", "--------------------------------------------------------------------------------\n", "\n", " Resumen Final:\n", "Tiempo promedio por respuesta: 6.52s\n", "Precisión promedio: 44.4%\n" ] } ], "source": [ "import time\n", "from IPython.display import clear_output\n", "\n", "test_cases = [\n", " {\n", " \"pregunta\": \"¿Qué es la etiqueta