{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8NIm_0b_sjJc", "outputId": "615bda96-593f-4851-b6e6-6acb84376b93" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: sacrebleu in /usr/local/lib/python3.10/dist-packages (2.4.3)\n", "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (2.10.1)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (2024.9.11)\n", "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.9.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (1.26.4)\n", "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (0.4.6)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu) (5.3.0)\n" ] } ], "source": [ "!pip install sacrebleu\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, Dataset\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import sacrebleu\n", "import json\n", "import csv\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XAapN08evSpV", "outputId": "61281ff6-7979-48a5-9da9-a92d17f17210" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EN to IC\n", "\n", " give three tips for staying healthy \n", " gefðu þrjú ráð til að halda heilsu \n", "\n", "IC to EN\n", "\n", " gefðu þrjú ráð til að halda heilsu \n", " give three tips for staying healthy \n", "\n", "\n", "EN to IC\n", "\n", " 1 eat a balanced and nutritious diet make sure your meals are inclusive of a variety of fruits and vegetables lean protein whole grains and healthy fats this helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases 2 engage in regular physical activity exercise is crucial for maintaining strong bones muscles and cardiovascular health aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week 3 get enough sleep getting enough quality sleep is crucial for physical and mental wellbeing it helps to regulate mood improve cognitive function and supports healthy growth and immune function aim for 79 hours of sleep each night \n", " 1 borðaðu hollt og næringarríkt mataræði gakktu úr skugga um að máltíðirnar þínar innihaldi margs konar ávexti og grænmeti magurt prótein heilkorn og holla fitu þetta hjálpar til við að veita líkamanum nauðsynleg næringarefni til að virka sem best og getur hjálpað til við að koma í veg fyrir langvinna sjúkdóma 2 taktu þátt í reglulegri hreyfingu hreyfing er mikilvæg til að viðhalda sterkum beinum vöðvum og hjarta og æðaheilbrigði miðaðu við að minnsta kosti 150 mínútur af hóflegri þolþjálfun eða 75 mínútur af öflugri hreyfingu í hverri viku 3 fáðu nægan svefn að fá nægan gæðasvefn skiptir sköpum fyrir líkamlega og andlega vellíðan það hjálpar til við að stjórna skapi bæta vitræna virkni og styðja við heilbrigðan vöxt og ónæmisvirkni miðaðu við 79 tíma svefn á hverri nóttu \n", "\n", "IC to EN\n", "\n", " 1 borðaðu hollt og næringarríkt mataræði gakktu úr skugga um að máltíðirnar þínar innihaldi margs konar ávexti og grænmeti magurt prótein heilkorn og holla fitu þetta hjálpar til við að veita líkamanum nauðsynleg næringarefni til að virka sem best og getur hjálpað til við að koma í veg fyrir langvinna sjúkdóma 2 taktu þátt í reglulegri hreyfingu hreyfing er mikilvæg til að viðhalda sterkum beinum vöðvum og hjarta og æðaheilbrigði miðaðu við að minnsta kosti 150 mínútur af hóflegri þolþjálfun eða 75 mínútur af öflugri hreyfingu í hverri viku 3 fáðu nægan svefn að fá nægan gæðasvefn skiptir sköpum fyrir líkamlega og andlega vellíðan það hjálpar til við að stjórna skapi bæta vitræna virkni og styðja við heilbrigðan vöxt og ónæmisvirkni miðaðu við 79 tíma svefn á hverri nóttu \n", " 1 eat a balanced and nutritious diet make sure your meals are inclusive of a variety of fruits and vegetables lean protein whole grains and healthy fats this helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases 2 engage in regular physical activity exercise is crucial for maintaining strong bones muscles and cardiovascular health aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week 3 get enough sleep getting enough quality sleep is crucial for physical and mental wellbeing it helps to regulate mood improve cognitive function and supports healthy growth and immune function aim for 79 hours of sleep each night \n", "\n", "\n", "EN to IC\n", "\n", " what are the three primary colors \n", " hverjir eru þrír aðallitirnir \n", "\n", "IC to EN\n", "\n", " hverjir eru þrír aðallitirnir \n", " what are the three primary colors \n", "\n", "\n", "EN to IC\n", "\n", " the three primary colors are red blue and yellow these colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions in the additive color system used for light the primary colors are red green and blue rgb \n", " aðallitirnir þrír eru rauður blár og gulur þessir litir eru kallaðir frumefni vegna þess að ekki er hægt að búa þá til með því að blanda öðrum litum og alla aðra liti er hægt að búa til með því að sameina þá í ýmsum hlutföllum í auklitakerfinu notað fyrir ljós eru aðallitirnir rauður grænn og blár rgb \n", "\n", "IC to EN\n", "\n", " aðallitirnir þrír eru rauður blár og gulur þessir litir eru kallaðir frumefni vegna þess að ekki er hægt að búa þá til með því að blanda öðrum litum og alla aðra liti er hægt að búa til með því að sameina þá í ýmsum hlutföllum í auklitakerfinu notað fyrir ljós eru aðallitirnir rauður grænn og blár rgb \n", " the three primary colors are red blue and yellow these colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions in the additive color system used for light the primary colors are red green and blue rgb \n", "\n", "\n", "EN to IC\n", "\n", " describe the structure of an atom \n", " lýstu byggingu atóms \n", "\n", "IC to EN\n", "\n", " lýstu byggingu atóms \n", " describe the structure of an atom \n", "\n", "\n" ] } ], "source": [ "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "\n", "# Load the Icelandic_cleaned.json dataset\n", "with open('/content/Icelandic_cleaned.json', 'r', encoding='utf-8') as f:\n", " data = json.load(f)\n", "\n", "# Extract the first 1,000 English and Icelandic sentences for BLEU and CHRF calculation\n", "en_sentences = [entry['input'] for entry in data[:1000]]\n", "ic_sentences = [entry['output'] for entry in data[:1000]]\n", "\n", "# Tokenizer and Vectorization\n", "tokenizer = lambda x: x.split()\n", "\n", "def build_vocab(sentences):\n", " counter = Counter()\n", " for sentence in sentences:\n", " counter.update(tokenizer(sentence))\n", " return counter\n", "\n", "en_vocab = build_vocab(en_sentences)\n", "ic_vocab = build_vocab(ic_sentences)\n", "\n", "# Adding special tokens\n", "en_vocab = {'': 1, '': 2, '': 0, '': 3, **en_vocab}\n", "ic_vocab = {'': 1, '': 2, '': 0, '': 3, **ic_vocab}\n", "\n", "def sentence_to_tensor(sentence, vocab, max_len=MAX_LEN):\n", " tokens = tokenizer(sentence)\n", " indices = [vocab.get(token, vocab['']) for token in tokens]\n", " indices = [vocab['']] + indices + [vocab['']]\n", " if len(indices) < max_len:\n", " indices += [vocab['']] * (max_len - len(indices))\n", " else:\n", " indices = indices[:max_len]\n", " return torch.tensor(indices)\n", "\n", "\n", "VOCAB_SIZE = 10000\n", "EMBEDDING_DIM = 256\n", "HIDDEN_DIM = 512\n", "BATCH_SIZE = 32\n", "NUM_EPOCHS = 10\n", "MAX_LEN = 50\n", "\n", "# Prepare dataset and dataloaders\n", "en_tensor_sentences = [sentence_to_tensor(s, en_vocab) for s in en_sentences]\n", "ic_tensor_sentences = [sentence_to_tensor(s, ic_vocab) for s in ic_sentences]\n", "\n", "# Split data into training and validation sets (80% train, 20% validation)\n", "train_size = int(0.8 * len(en_tensor_sentences))\n", "train_en, val_en = en_tensor_sentences[:train_size], en_tensor_sentences[train_size:]\n", "train_ic, val_ic = ic_tensor_sentences[:train_size], ic_tensor_sentences[train_size:]\n", "\n", "# Sample Data Printing in Specified Format\n", "def print_sample_data(en_sentences, ic_sentences):\n", " for i in range(5):\n", " en_sample = en_sentences[i]\n", " ic_sample = ic_sentences[i]\n", "\n", " print(\"EN to IC\")\n", " print(\"\")\n", " print(f\" {en_sample} \")\n", " print(f\" {ic_sample} \")\n", " print(\"\")\n", "\n", " print(\"IC to EN\")\n", " print(\"\")\n", " print(f\" {ic_sample} \")\n", " print(f\" {en_sample} \")\n", " print(\"\")\n", " print()\n", "\n", "print_sample_data(en_sentences, ic_sentences)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "nzbNHJI1vSlg" }, "outputs": [], "source": [ "# Custom Dataset\n", "class TranslationDataset(Dataset):\n", " def __init__(self, en_sentences, ic_sentences):\n", " self.en_sentences = en_sentences\n", " self.ic_sentences = ic_sentences\n", "\n", " def __len__(self):\n", " return len(self.en_sentences)\n", "\n", " def __getitem__(self, idx):\n", " return self.en_sentences[idx], self.ic_sentences[idx]\n", "\n", "# Prepare dataloader\n", "train_dataset = TranslationDataset(train_en, train_ic)\n", "val_dataset = TranslationDataset(val_en, val_ic)\n", "\n", "# Define a collate_fn to pad sequences dynamically\n", "def collate_fn(batch):\n", " en_batch, ic_batch = zip(*batch)\n", " en_batch = torch.stack(en_batch, dim=0)\n", " ic_batch = torch.stack(ic_batch, dim=0)\n", " return en_batch, ic_batch\n", "\n", "train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n", "val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "3tVw3ao4vSiX" }, "outputs": [], "source": [ "\n", "# Seq2Seq Model\n", "class Seq2SeqModel(nn.Module):\n", " def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim, num_layers=1):\n", " super(Seq2SeqModel, self).__init__()\n", "\n", " self.encoder_embedding = nn.Embedding(input_dim, embedding_dim)\n", " self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)\n", "\n", " self.decoder_embedding = nn.Embedding(output_dim, embedding_dim)\n", " self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)\n", " self.fc_out = nn.Linear(hidden_dim, output_dim)\n", "\n", " def forward(self, src, trg):\n", " embedded_src = self.encoder_embedding(src)\n", " encoder_output, (hidden, cell) = self.encoder_lstm(embedded_src)\n", "\n", " embedded_trg = self.decoder_embedding(trg)\n", " decoder_output, _ = self.decoder_lstm(embedded_trg, (hidden, cell))\n", "\n", " output = self.fc_out(decoder_output)\n", " return output\n", "\n", "# Initialize the Seq2Seq model\n", "model = Seq2SeqModel(len(en_vocab), len(ic_vocab), EMBEDDING_DIM, HIDDEN_DIM, num_layers=1).to(DEVICE)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "caqExHCzvSgZ", "outputId": "e03cd84e-4e07-4cae-f804-406b98cfa0dd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10, Training Loss: 4.162079634666443, Validation Loss: 1.783459918839591\n", "Epoch 2/10, Training Loss: 1.0344110178947448, Validation Loss: 0.4918278966631208\n", "Epoch 3/10, Training Loss: 0.2632215261459351, Validation Loss: 0.12271245462553841\n", "Epoch 4/10, Training Loss: 0.07123038023710251, Validation Loss: 0.04314851335116795\n", "Epoch 5/10, Training Loss: 0.030836958587169647, Validation Loss: 0.02404651258672987\n", "Epoch 6/10, Training Loss: 0.01885295122861862, Validation Loss: 0.01633326443178313\n", "Epoch 7/10, Training Loss: 0.013474887125194073, Validation Loss: 0.012323189526796341\n", "Epoch 8/10, Training Loss: 0.010358258336782455, Validation Loss: 0.009776035456785135\n", "Epoch 9/10, Training Loss: 0.00829771364107728, Validation Loss: 0.008006346212433917\n", "Epoch 10/10, Training Loss: 0.006895635910332203, Validation Loss: 0.006732985709926912\n" ] } ], "source": [ "# Define the loss function and optimizer\n", "criterion = nn.CrossEntropyLoss(ignore_index=ic_vocab[''])\n", "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", "\n", "# Function to train the model\n", "def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs=10):\n", " model.to(DEVICE)\n", " train_losses, val_losses = [], []\n", "\n", " for epoch in range(num_epochs):\n", " model.train()\n", " epoch_train_loss = 0\n", " for en_batch, ic_batch in train_dataloader:\n", " en_batch, ic_batch = en_batch.to(DEVICE), ic_batch.to(DEVICE)\n", " ic_batch = ic_batch.long()\n", "\n", " optimizer.zero_grad()\n", " output = model(en_batch, ic_batch)\n", "\n", " output_dim = output.shape[-1]\n", " output = output.reshape(-1, output_dim)\n", " ic_batch = ic_batch.reshape(-1)\n", " loss = criterion(output, ic_batch)\n", "\n", " loss.backward()\n", " optimizer.step()\n", "\n", " epoch_train_loss += loss.item()\n", "\n", " train_losses.append(epoch_train_loss / len(train_dataloader))\n", "\n", " model.eval()\n", " epoch_val_loss = 0\n", " with torch.no_grad():\n", " for en_batch, ic_batch in val_dataloader:\n", " en_batch, ic_batch = en_batch.to(DEVICE), ic_batch.to(DEVICE)\n", " ic_batch = ic_batch.long()\n", "\n", " output = model(en_batch, ic_batch)\n", " output_dim = output.shape[-1]\n", " output = output.reshape(-1, output_dim)\n", " ic_batch = ic_batch.reshape(-1)\n", "\n", " loss = criterion(output, ic_batch)\n", " epoch_val_loss += loss.item()\n", "\n", " val_losses.append(epoch_val_loss / len(val_dataloader))\n", "\n", " print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_losses[-1]}, Validation Loss: {val_losses[-1]}')\n", "\n", " return train_losses, val_losses\n", "\n", "train_losses, val_losses = train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs=NUM_EPOCHS)\n", "\n", "# Save the trained model\n", "torch.save(model.state_dict(), 'seq2seq_translation_model.pth')\n", "\n", "\n", "def translate_new_sentence(model, sentence, src_vocab, tgt_vocab, max_len=MAX_LEN):\n", " # Prepare input tensor for the source sentence (English)\n", " model.eval() # Set model to evaluation mode\n", " en_indices = [src_vocab.get(word, src_vocab['']) for word in sentence.split()] # Handle unknown words\n", " en_tensor = torch.tensor(en_indices).unsqueeze(0).to(DEVICE)\n", "\n", " # Initialize target sentence for translation (Croatian) with start token\n", " tgt_tensor = torch.tensor([tgt_vocab['']]).unsqueeze(0).to(DEVICE)\n", "\n", " translated_sentence = []\n", "\n", " for _ in range(max_len):\n", " with torch.no_grad():\n", " output = model(en_tensor, tgt_tensor)\n", "\n", " # Get the last output token's prediction (the next word)\n", " output_token = output.argmax(dim=-1)[:, -1]\n", "\n", " # Get the predicted word from the target vocabulary\n", " output_token_item = output_token.item()\n", "\n", " # Check if the token is within the target vocabulary range\n", " if output_token_item not in tgt_vocab.values():\n", " translated_word = '' # Map to if not found\n", " else:\n", " translated_word = list(tgt_vocab.keys())[list(tgt_vocab.values()).index(output_token_item)]\n", "\n", " # Append predicted word to the translated sentence\n", " translated_sentence.append(translated_word)\n", "\n", " # If we encounter an end token, stop the translation\n", " if translated_word == '':\n", " break\n", "\n", " # Update tgt_tensor for next prediction\n", " tgt_tensor = torch.cat((tgt_tensor, output_token.unsqueeze(0)), dim=-1)\n", "\n", " return ' '.join(translated_sentence)\n", "\n", "# BLEU and CHRF Scores Calculation\n", "def calculate_bleu_chrf(en_sentences, ic_sentences, model, en_vocab, ic_vocab, max_len=MAX_LEN):\n", " bleu_scores = []\n", " chrf_scores = []\n", "\n", " for en_sentence, ic_sentence in zip(en_sentences, ic_sentences):\n", " translated_sentence = translate_new_sentence(model, en_sentence, en_vocab, ic_vocab, max_len)\n", "\n", " bleu_score = sacrebleu.corpus_bleu([translated_sentence], [[ic_sentence]]).score\n", " bleu_scores.append(bleu_score)\n", "\n", " chrf_score = sacrebleu.corpus_chrf([translated_sentence], [[ic_sentence]]).score\n", " chrf_scores.append(chrf_score)\n", "\n", " return bleu_scores, chrf_scores\n", "\n", "lstm_bleu_scores, lstm_chrf_scores = calculate_bleu_chrf(en_sentences, ic_sentences, model, en_vocab, ic_vocab)\n", "\n", "# Save BLEU and CHRF scores to CSV\n", "with open('Seq2Seq_BLEU_CHRF_scores.csv', mode='w', newline='', encoding='utf-8') as file:\n", " writer = csv.writer(file)\n", " writer.writerow([\"BLEU Score\", \"CHRF Score\"])\n", " for bleu, chrf in zip(lstm_bleu_scores, lstm_chrf_scores):\n", " writer.writerow([bleu, chrf])\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 487 }, "id": "pllSNt5vK-AX", "outputId": "ba288b64-8026-4194-94fb-83cf9a54f7c2" }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "\n", "# Plot the training and validation loss\n", "def plot_losses(train_losses, val_losses):\n", " plt.figure(figsize=(10, 5))\n", " plt.plot(train_losses, label='Training Loss')\n", " plt.plot(val_losses, label='Validation Loss')\n", " plt.xlabel('Epochs')\n", " plt.ylabel('Loss')\n", " plt.legend()\n", " plt.title('Training and Validation Loss for seq2seq')\n", " plt.show()\n", "\n", "plot_losses(train_losses, val_losses)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "w4FDUt1Vshjm", "outputId": "540d70b3-62f1-44f5-fab0-0c5cb9285578" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original: This is a test sentence.\n", "Translated: gefðu þrjú ráð halda heilsu borðaðu hollt næringarríkt mataræði gakktu máltíðirnar þínar innihaldi margs konar ávexti grænmeti magurt prótein heilkorn holla fitu þetta veita líkamanum nauðsynleg næringarefni virka best hjálpað koma veg langvinna sjúkdóma taktu þátt reglulegri hreyfing mikilvæg viðhalda sterkum beinum vöðvum hjarta æðaheilbrigði minnsta kosti 150 hóflegri þolþjálfun 75 öflugri viku fáðu fá gæðasvefn skiptir sköpum andlega vellíðan stjórna skapi bæta vitræna virkni heilbrigðan vöxt ónæmisvirkni 79 nóttu hverjir gulur þessir litir kallaðir blanda öðrum litum alla aðra liti sameina ýmsum hlutföllum auklitakerfinu notað ljós grænn rgb byggingu atóm grunnbygging alls efnis samsett þremur tegundum agna rafeindum lýsa má uppbyggingu miðjunni umkringdur rafeindaskýi kjarni gerður róteindir jákvætt nifteindir hlutlausar án hleðslu báðar þessar staðsettar miðju inniheldur mestan hluta umhverfis rafeindaský rafeindir neikvætt stöðugri kringum kjarnann skipt skeljar svigrúm hver skel geymt ákveðinn ystu skelinni kallast gildisskel efnafræðilega eiginleika hlutlausu atómi kjarnanum jafn þannig jákvæða neikvæða hleðslan jafnast atómið hefur enga heildarhleðslu einnig kallaður atómnúmer hvaða frumeind hvernig getum dregið meðal orkunotkun spara orku minnkum magn losnar virkjunum þessu orkusparandi tæki lýsingu einangra slökkva ljósum rafeindabúnaði þau notaðu bílar vörubílar helstu uppsprettur hjólreiðar fækka bílum veginum aftur dregur útblæstri ábyrgur úrgangi úrgangur brenndur niðurbrotinn hann skaðlegar lofttegundir fargaðu spilliefnum réttan endurvinna plast pappír gler mögulegt 4 orkugjafa endurnýjanlegir orkugjafar sólar vindorka valda lítilli engri styðjið orkufyrirtæki verkefni íhugið setja upp endurnýjanleg orkukerfi þínu 5 akið ef þarft skaltu ganga bílnum sé vel haldið forðast hraða hröðun hemlun lausagangi 6 meðvitaður vöruval sumar venjur gefa sér meiri en aðrar veldu umhverfisvænum umbúðum reyndu lágmarka þína úðabrúsa skaðleg þurftir erfiða ai aðstoðarmaður hef eigin persónulegu reynslu tilvist hins vegar forritaður ákvarðanir byggðar reikniritum rökfræði gögnum tímar lendi aðstæðum erfitt geta verið misvísandi upplýsingar margar mögulegar niðurstöður slíkum tilfellum hæfileika greina vega áhættu ávinning hvers valkosts komast niðurstöðu líklegast tilætluðum árangri\n" ] } ], "source": [ "# Translate a new sentence\n", "def translate_new_sentence(model, en_sentence, en_vocab, ic_vocab, max_len=MAX_LEN):\n", " model.eval()\n", " en_tensor = sentence_to_tensor(en_sentence, en_vocab, max_len).unsqueeze(0).to(DEVICE)\n", "\n", " # Start with the token as input to the decoder\n", " ic_input = torch.tensor([ic_vocab['']]).unsqueeze(0).to(DEVICE)\n", "\n", " translated_sentence = []\n", " with torch.no_grad():\n", " for _ in range(max_len):\n", " output = model(en_tensor, ic_input)\n", " output_token = output.argmax(dim=-1)[:, -1].item() # Get the most likely next token\n", "\n", " translated_sentence.append(output_token)\n", " if output_token == ic_vocab['']: # End token\n", " break\n", "\n", " # Update the input for the next time step (use the predicted token)\n", " ic_input = torch.tensor([output_token]).unsqueeze(0).to(DEVICE) # Convert output_token to tensor\n", "\n", " # Convert token IDs back to words\n", " translated_sentence = [k for k, v in ic_vocab.items() if v in translated_sentence]\n", " return \" \".join(translated_sentence)\n", "\n", "# Example sentence translation\n", "en_example_sentence = \"This is a test sentence.\"\n", "translated_ic_sentence = translate_new_sentence(model, en_example_sentence, en_vocab, ic_vocab)\n", "print(f\"Original: {en_example_sentence}\")\n", "print(f\"Translated: {translated_ic_sentence}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YbfHsa5PpW8G" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N93ucaxdpW_d" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NWPzmQ3zpXCi" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HCxdDaispXHV" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pHS_C_VTpXKu" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lNvPqrwPpXNc" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 1 }