File size: 6,636 Bytes

75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
 
 
e10d376
75e6bf2
 
 
 
 
e983d05
 
75e6bf2
e256a57
75e6bf2
e983d05
 
75e6bf2
 
 
 
e983d05
 
2a395ac
 
75e6bf2
e983d05
 
75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
e10d376
 
 
 
 
 
75e6bf2
 
 
e10d376
75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
e10d376
 
 
75e6bf2
 
 
 
e983d05
 
75e6bf2
e10d376
75e6bf2
e10d376
75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
e10d376
 
 
 
 
 
 
 
 
e983d05
e10d376
 
75e6bf2
 
 
e10d376
75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e983d05
 
75e6bf2
 
 
e983d05
 
 
 
 
75e6bf2
 
 
 
 
 
 
 
e983d05
 
75e6bf2
 
 
e256a57
75e6bf2
 
 
 
e983d05
 
75e6bf2
 
 
e10d376
 
 
 
75e6bf2
 
 
 
 
 
 
 
 
 
 
e983d05
 
75e6bf2
 
 
 
 
 
 
e983d05
75e6bf2
 
e256a57
75e6bf2
 
 
 
 
e10d376
75e6bf2
 
 
 
 
 
 
 
 
 
e983d05
75e6bf2
 
 
 
 
 
 
e983d05
75e6bf2

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "2bdeda95",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
    "from datasets import load_dataset, load_metric, Audio\n",
    "import numpy as np\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "8f840be9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the GPU when one is available and fall back to the CPU otherwise.\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# \".\" makes from_pretrained read the model and processor from the current directory.\n",
    "model = AutoModelForCTC.from_pretrained(\".\").to(device)\n",
    "processor = Wav2Vec2Processor.from_pretrained(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "46339a6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative to loading from the current directory: the same two objects loaded by model id.\n",
    "# model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\").to('cuda')\n",
    "# processor = Wav2Vec2Processor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "2c28d4f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-fbad308ab5a03eb2\n",
      "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
     ]
    }
   ],
   "source": [
    "# The test index is read with split='train': a CSV given via data_files is\n",
    "# presumably exposed under that default split name (confirm in the datasets docs).\n",
    "common_voice_test = load_dataset(\n",
    "    'csv',\n",
    "    data_files='km_kh_male/line_index_test.csv',\n",
    "    split='train',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "f14c1cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the two columns that are not used below, then expose the transcript as 'sentence'.\n",
    "common_voice_test = common_voice_test.remove_columns([\"Unnamed: 0\", \"drop\"])\n",
    "common_voice_test = common_voice_test.rename_column('text', 'sentence')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "b60360b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Treat the 'path' column as 16 kHz audio, then rename that column to 'audio'.\n",
    "common_voice_test = (common_voice_test\n",
    "                     .cast_column(\"path\", Audio(sampling_rate=16_000))\n",
    "                     .rename_column('path', 'audio'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "64758ba8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_3799144408.wav',\n",
       "  'array': array([-1.0600963e-06,  1.2359066e-06, -1.4001107e-06, ...,\n",
       "         -3.1423504e-05,  4.4914182e-06,  0.0000000e+00], dtype=float32),\n",
       "  'sampling_rate': 16000},\n",
       " 'sentence': 'ស៊ី ដាច់ ម៉ូតូ នៅ ពេល ដែល ប្រើ ឱ្យ ឌុប សម្ភារៈ គ្រឿង សង្ហារឹម យក ទៅ ឱ្យ ម៉ូយ នៅ ម្ដុំ វត្ដ សំរោងអណ្ដែត'}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "93cd7415",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_dataset(batch):\n",
    "    \"\"\"Convert one dataset row into model inputs and label ids.\n",
    "\n",
    "    Reads ``batch[\"audio\"]`` (its ``array`` and ``sampling_rate``) and\n",
    "    ``batch[\"sentence\"]``; writes ``input_values``, ``input_length`` and\n",
    "    ``labels`` into the same row and returns it.\n",
    "    \"\"\"\n",
    "    clip = batch[\"audio\"]\n",
    "    waveform = np.array(clip[\"array\"])\n",
    "\n",
    "    # The processor returns a batch, so take its only entry.\n",
    "    features = processor(waveform, sampling_rate=clip[\"sampling_rate\"])\n",
    "    batch[\"input_values\"] = features.input_values[0]\n",
    "    batch[\"input_length\"] = len(batch[\"input_values\"])\n",
    "\n",
    "    # Within as_target_processor() the same call yields input_ids for the sentence.\n",
    "    with processor.as_target_processor():\n",
    "        encoded = processor(batch[\"sentence\"])\n",
    "    batch[\"labels\"] = encoded.input_ids\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "04751885",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-abf3b661c395248b.arrow\n"
     ]
    }
   ],
   "source": [
    "# Apply prepare_dataset to every row and drop the columns the dataset had before the call,\n",
    "# leaving only the fields that prepare_dataset adds.\n",
    "common_voice_test = common_voice_test.map(\n",
    "    prepare_dataset,\n",
    "    remove_columns=common_voice_test.column_names,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "e55d9cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "4f637d1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the model on example i and take the most likely token id per frame.\n",
    "# The sampling rate (16 kHz, matching the Audio cast above) is passed explicitly so the\n",
    "# processor can check it instead of warning about a possible silent mismatch.\n",
    "# Gradients are not needed for inference, so the forward pass runs under no_grad.\n",
    "with torch.no_grad():\n",
    "    input_dict = processor(common_voice_test[i][\"input_values\"], sampling_rate=16_000,\n",
    "                           return_tensors=\"pt\", padding=True)\n",
    "    # Send the inputs to whichever device the model is on.\n",
    "    logits = model(input_dict.input_values.to(model.device)).logits\n",
    "pred_ids = torch.argmax(logits, dim=-1)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "85334ad6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction:\n",
      "ក្រុង ប៉ោយប៉ែត នឹង ក្វាះ ទឹក ស្អាត ប្រើ ចាប់ ពី សប្តាហ ក្រោយ ទៅ\n",
      "\n",
      "Reference:\n",
      "ក្រុង ប៉ោយប៉ែត នឹង ខ្វះ ទឹក ស្អាត ប្រើ ចាប់ ពី សប្តាហ៍ ក្រោយ ទៅ\n"
     ]
    }
   ],
   "source": [
    "# Decode the raw argmax ids as they are. CTC decoding merges repeated ids first and only\n",
    "# then drops the blank (pad) token; stripping the blanks beforehand would make repeated\n",
    "# characters that were separated by a blank collapse into one.\n",
    "print(\"Prediction:\")\n",
    "print(processor.decode(pred_ids))\n",
    "\n",
    "# Label ids are plain tokenizer output, so repeated characters must not be merged.\n",
    "# Row i is indexed first so that only this one row is read.\n",
    "print(\"\\nReference:\")\n",
    "print(processor.decode(common_voice_test[i][\"labels\"], group_tokens=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be1c8d79",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f7eaba0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}