Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- Tamil_number_conversion.ipynb +223 -0
- Text2List.ipynb +96 -0
- convert2list.ipynb +84 -0
- isNumber.ipynb +51 -0
- numberMapping.ipynb +162 -0
- processDoubles.ipynb +85 -0
- replaceWords.ipynb +182 -0
- text2int.ipynb +233 -0
Tamil_number_conversion.ipynb
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "dc09394e-2130-4bd4-af30-01346d8ee355",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"name": "stdout",
|
18 |
+
"output_type": "stream",
|
19 |
+
"text": [
|
20 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
21 |
+
"\n",
|
22 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"data": {
|
27 |
+
"text/html": [
|
28 |
+
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
29 |
+
],
|
30 |
+
"text/plain": [
|
31 |
+
"<IPython.core.display.HTML object>"
|
32 |
+
]
|
33 |
+
},
|
34 |
+
"metadata": {},
|
35 |
+
"output_type": "display_data"
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"text/plain": []
|
40 |
+
},
|
41 |
+
"execution_count": 1,
|
42 |
+
"metadata": {},
|
43 |
+
"output_type": "execute_result"
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"name": "stderr",
|
47 |
+
"output_type": "stream",
|
48 |
+
"text": [
|
49 |
+
"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.37.2, however version 4.44.1 is available, please upgrade. \n",
|
50 |
+
"--------\n",
|
51 |
+
" warnings.warn(\n",
|
52 |
+
"ERROR: Exception in ASGI application\n",
|
53 |
+
"Traceback (most recent call last):\n",
|
54 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\uvicorn\\protocols\\http\\h11_impl.py\", line 404, in run_asgi\n",
|
55 |
+
" result = await app( # type: ignore[func-returns-value]\n",
|
56 |
+
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
57 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\uvicorn\\middleware\\proxy_headers.py\", line 84, in __call__\n",
|
58 |
+
" return await self.app(scope, receive, send)\n",
|
59 |
+
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
60 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\fastapi\\applications.py\", line 1054, in __call__\n",
|
61 |
+
" await super().__call__(scope, receive, send)\n",
|
62 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\applications.py\", line 123, in __call__\n",
|
63 |
+
" await self.middleware_stack(scope, receive, send)\n",
|
64 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\middleware\\errors.py\", line 186, in __call__\n",
|
65 |
+
" raise exc\n",
|
66 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\middleware\\errors.py\", line 164, in __call__\n",
|
67 |
+
" await self.app(scope, receive, _send)\n",
|
68 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\gradio\\route_utils.py\", line 714, in __call__\n",
|
69 |
+
" await self.app(scope, receive, send)\n",
|
70 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\middleware\\exceptions.py\", line 62, in __call__\n",
|
71 |
+
" await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)\n",
|
72 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 64, in wrapped_app\n",
|
73 |
+
" raise exc\n",
|
74 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 53, in wrapped_app\n",
|
75 |
+
" await app(scope, receive, sender)\n",
|
76 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\routing.py\", line 762, in __call__\n",
|
77 |
+
" await self.middleware_stack(scope, receive, send)\n",
|
78 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\routing.py\", line 782, in app\n",
|
79 |
+
" await route.handle(scope, receive, send)\n",
|
80 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\routing.py\", line 297, in handle\n",
|
81 |
+
" await self.app(scope, receive, send)\n",
|
82 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\routing.py\", line 77, in app\n",
|
83 |
+
" await wrap_app_handling_exceptions(app, request)(scope, receive, send)\n",
|
84 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 64, in wrapped_app\n",
|
85 |
+
" raise exc\n",
|
86 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 53, in wrapped_app\n",
|
87 |
+
" await app(scope, receive, sender)\n",
|
88 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\routing.py\", line 75, in app\n",
|
89 |
+
" await response(scope, receive, send)\n",
|
90 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\responses.py\", line 346, in __call__\n",
|
91 |
+
" await send(\n",
|
92 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 50, in sender\n",
|
93 |
+
" await send(message)\n",
|
94 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\_exception_handler.py\", line 50, in sender\n",
|
95 |
+
" await send(message)\n",
|
96 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\starlette\\middleware\\errors.py\", line 161, in _send\n",
|
97 |
+
" await send(message)\n",
|
98 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\uvicorn\\protocols\\http\\h11_impl.py\", line 508, in send\n",
|
99 |
+
" output = self.conn.send(event=h11.EndOfMessage())\n",
|
100 |
+
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
101 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\h11\\_connection.py\", line 512, in send\n",
|
102 |
+
" data_list = self.send_with_data_passthrough(event)\n",
|
103 |
+
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
104 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\h11\\_connection.py\", line 545, in send_with_data_passthrough\n",
|
105 |
+
" writer(event, data_list.append)\n",
|
106 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\h11\\_writers.py\", line 67, in __call__\n",
|
107 |
+
" self.send_eom(event.headers, write)\n",
|
108 |
+
" File \"C:\\Users\\WCHL\\anaconda3\\envs\\RunInference2\\Lib\\site-packages\\h11\\_writers.py\", line 96, in send_eom\n",
|
109 |
+
" raise LocalProtocolError(\"Too little data for declared Content-Length\")\n",
|
110 |
+
"h11._util.LocalProtocolError: Too little data for declared Content-Length\n"
|
111 |
+
]
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"name": "stdout",
|
115 |
+
"output_type": "stream",
|
116 |
+
"text": [
|
117 |
+
"எண்பது\n",
|
118 |
+
"எண்பது\n",
|
119 |
+
"எண்பது\n",
|
120 |
+
"eighty\n",
|
121 |
+
"80\n"
|
122 |
+
]
|
123 |
+
}
|
124 |
+
],
|
125 |
+
"source": [
|
126 |
+
"import gradio as gr\n",
|
127 |
+
"import librosa\n",
|
128 |
+
"import numpy as np\n",
|
129 |
+
"import pywt\n",
|
130 |
+
"import nbimporter\n",
|
131 |
+
"from scipy.signal import butter, lfilter, wiener\n",
|
132 |
+
"from scipy.io.wavfile import write\n",
|
133 |
+
"from transformers import pipeline\n",
|
134 |
+
"from text2int import text_to_int\n",
|
135 |
+
"from isNumber import is_number\n",
|
136 |
+
"from Text2List import text_to_list\n",
|
137 |
+
"from convert2list import convert_to_list\n",
|
138 |
+
"from processDoubles import process_doubles\n",
|
139 |
+
"from replaceWords import replace_words\n",
|
140 |
+
"\n",
|
141 |
+
"asr_model = pipeline(\"automatic-speech-recognition\", model=\"cdactvm/w2v-bert-tamil_new\")\n",
|
142 |
+
"\n",
|
143 |
+
"# Function to apply a high-pass filter\n",
|
144 |
+
"def high_pass_filter(audio, sr, cutoff=300):\n",
|
145 |
+
" nyquist = 0.5 * sr\n",
|
146 |
+
" normal_cutoff = cutoff / nyquist\n",
|
147 |
+
" b, a = butter(1, normal_cutoff, btype='high', analog=False)\n",
|
148 |
+
" filtered_audio = lfilter(b, a, audio)\n",
|
149 |
+
" return filtered_audio\n",
|
150 |
+
"\n",
|
151 |
+
"# Function to apply wavelet denoising\n",
|
152 |
+
"def wavelet_denoise(audio, wavelet='db1', level=1):\n",
|
153 |
+
" coeffs = pywt.wavedec(audio, wavelet, mode='per')\n",
|
154 |
+
" sigma = np.median(np.abs(coeffs[-level])) / 0.5\n",
|
155 |
+
" uthresh = sigma * np.sqrt(2 * np.log(len(audio)))\n",
|
156 |
+
" coeffs[1:] = [pywt.threshold(i, value=uthresh, mode='soft') for i in coeffs[1:]]\n",
|
157 |
+
" return pywt.waverec(coeffs, wavelet, mode='per')\n",
|
158 |
+
"\n",
|
159 |
+
"# Function to apply a Wiener filter for noise reduction\n",
|
160 |
+
"def apply_wiener_filter(audio):\n",
|
161 |
+
" return wiener(audio)\n",
|
162 |
+
"\n",
|
163 |
+
"# Function to handle speech recognition\n",
|
164 |
+
"def recognize_speech(audio_file):\n",
|
165 |
+
" audio, sr = librosa.load(audio_file, sr=16000)\n",
|
166 |
+
" audio = high_pass_filter(audio, sr)\n",
|
167 |
+
" audio = apply_wiener_filter(audio)\n",
|
168 |
+
" denoised_audio = wavelet_denoise(audio)\n",
|
169 |
+
" result = asr_model(denoised_audio)\n",
|
170 |
+
" text_value = result['text']\n",
|
171 |
+
" cleaned_text = text_value.replace(\"<s>\", \"\")\n",
|
172 |
+
" print(cleaned_text)\n",
|
173 |
+
" converted_to_list = convert_to_list(cleaned_text, text_to_list())\n",
|
174 |
+
" print(converted_to_list)\n",
|
175 |
+
" processed_doubles = process_doubles(converted_to_list)\n",
|
176 |
+
" print(processed_doubles)\n",
|
177 |
+
" replaced_words = replace_words(processed_doubles)\n",
|
178 |
+
" print(replaced_words)\n",
|
179 |
+
" converted_text = text_to_int(replaced_words)\n",
|
180 |
+
" print(converted_text)\n",
|
181 |
+
" return converted_text\n",
|
182 |
+
"\n",
|
183 |
+
"# Gradio Interface\n",
|
184 |
+
"gr.Interface(\n",
|
185 |
+
" fn=recognize_speech,\n",
|
186 |
+
" inputs=gr.Audio(sources=[\"microphone\",\"upload\"], type=\"filepath\"),\n",
|
187 |
+
" outputs=\"text\",\n",
|
188 |
+
" title=\"Speech Recognition with Advanced Noise Reduction & Tamil ASR\",\n",
|
189 |
+
" description=\"Upload an audio file, and the system will use high-pass filtering, Wiener filtering, and wavelet-based denoising, then a Tamil ASR model will transcribe the clean audio.\"\n",
|
190 |
+
").launch()\n"
|
191 |
+
]
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"cell_type": "code",
|
195 |
+
"execution_count": null,
|
196 |
+
"id": "d4565cfb-a8e0-49a1-8878-6e5b1cd105e6",
|
197 |
+
"metadata": {},
|
198 |
+
"outputs": [],
|
199 |
+
"source": []
|
200 |
+
}
|
201 |
+
],
|
202 |
+
"metadata": {
|
203 |
+
"kernelspec": {
|
204 |
+
"display_name": "Python 3 (ipykernel)",
|
205 |
+
"language": "python",
|
206 |
+
"name": "python3"
|
207 |
+
},
|
208 |
+
"language_info": {
|
209 |
+
"codemirror_mode": {
|
210 |
+
"name": "ipython",
|
211 |
+
"version": 3
|
212 |
+
},
|
213 |
+
"file_extension": ".py",
|
214 |
+
"mimetype": "text/x-python",
|
215 |
+
"name": "python",
|
216 |
+
"nbconvert_exporter": "python",
|
217 |
+
"pygments_lexer": "ipython3",
|
218 |
+
"version": "3.11.7"
|
219 |
+
}
|
220 |
+
},
|
221 |
+
"nbformat": 4,
|
222 |
+
"nbformat_minor": 5
|
223 |
+
}
|
Text2List.ipynb
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"id": "94c5b577-d632-422a-a82f-f357f36d491b",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"def text_to_list():\n",
|
11 |
+
" text_list = [\n",
|
12 |
+
" # Tamil script for English numbers (11-19)\n",
|
13 |
+
" 'எலெவன்', 'ட்வெல்வ்', 'திர்டீன்', 'போர்டீன்', 'ஃபிஃப்டீன்', 'சிக்ஸ்டீன்', 'சிவன்டீன்', 'எட்டீன்', 'நைன்டீன்', \n",
|
14 |
+
" # Tamil numbers (11-19)\n",
|
15 |
+
" 'பதினொன்று', 'பனிரண்டு', 'பதிமூன்று', 'பதிநான்கு', 'பதினைந்து', 'பதினாறு', 'பதினேழு', 'பதினெட்டு', 'பத்தொன்பது',\n",
|
16 |
+
" # Tamil script for English multiples of ten (20, 30, ..., 90)\n",
|
17 |
+
" 'ட்வெண்டி', 'திர்டி', 'போர்டி', 'ஃபிப்டி', 'சிக்ஸ்டி', 'சிவெண்டி', 'எய்ட்டி', 'நைன்டி',\n",
|
18 |
+
" # Tamil multiples of ten (20, 30, ..., 90)\n",
|
19 |
+
" 'இருபது', 'முப்பது', 'நாற்பது', 'ஐம்பது', 'அறுபது', 'எழுபது', 'எண்பது', 'தொண்ணூறு',\n",
|
20 |
+
" # Tamil script for English combinations of 21-29\n",
|
21 |
+
" 'ட்வெண்டி ஒன்', 'ட்வெண்டி டூ', 'ட்வெண்டி த்ரீ', 'ட்வெண்டி ஃபோர்', 'ட்வெண்டி ஃபைவு', 'ட்வெண்டி சிக்ஸ்', 'ட்வெண்டி செவன்', 'ட்வெண்டி எட்டு', 'ட்வெண்டி நைன்', \n",
|
22 |
+
" # Tamil combinations of 21-29\n",
|
23 |
+
" 'இருபத்து ஒன்று', 'இருபத்து இரண்டு', 'இருபத்து மூன்று', 'இருபத்து நான்கு', 'இருபத்து ஐந்து', 'இருபத்து ஆறு', 'இருபத்து ஏழு', 'இருபத்து எட்டு', 'இருபத்து ஒன்பது',\n",
|
24 |
+
" # Tamil script for English combinations of 31-39\n",
|
25 |
+
" 'திர்டி ஒன்', 'திர்டி டூ', 'திர்டி த்ரீ', 'திர்டி ஃபோர்', 'திர்டி ஃபைவு', 'திர்டி சிக்ஸ்', 'திர்டி செவன்', 'திர்டி எட்டு', 'திர்டி நைன்', \n",
|
26 |
+
" # Tamil combinations of 31-39\n",
|
27 |
+
" 'முப்பத்து ஒன்று', 'முப்பத்து இரண்டு', 'முப்பத்து மூன்று', 'முப்பத்து நான்கு', 'முப்பத்து ஐந்து', 'முப்பத்து ஆறு', 'முப்பத்து ஏழு', 'முப்பத்து எட்டு', 'முப்பத்து ஒன்பது',\n",
|
28 |
+
" # Tamil script for English combinations of 41-49\n",
|
29 |
+
" 'போர்டி ஒன்', 'போர்டி டூ', 'போர்டி த்ரீ', 'போர்டி ஃபோர்', 'போர்டி ஃபைவு', 'போர்டி சிக்ஸ்', 'போர்டி செவன்', 'போர்டி எட்டு', 'போர்டி நைன்', \n",
|
30 |
+
" # Tamil combinations of 41-49\n",
|
31 |
+
" 'நாற்பத்து ஒன்று', 'நாற்பத்து இரண்டு', 'நாற்பத்து மூன்று', 'நாற்பத்து நான்கு', 'நாற்பத்து ஐந்து', 'நாற்பத்து ஆறு', 'நாற்பத்து ஏழு', 'நாற்பத்து எட்டு', 'நாற்பத்து ஒன்பது',\n",
|
32 |
+
" # Tamil script for English combinations of 51-59\n",
|
33 |
+
" 'ஃபிப்டி ஒன்', 'ஃபிப்டி டூ', 'ஃபிப்டி த்ரீ', 'ஃபிப்டி ஃபோர்', 'ஃபிப்டி ஃபைவு', 'ஃபிப்டி சிக்ஸ்', 'ஃபிப்டி செவன்', 'ஃபிப்டி எட்டு', 'ஃபிப்டி நைன்', \n",
|
34 |
+
" # Tamil combinations of 51-59\n",
|
35 |
+
" 'ஐம்பத்து ஒன்று', 'ஐம்பத்து இரண்டு', 'ஐம்பத்து மூன்று', 'ஐம்பத்து நான்கு', 'ஐம்பத்து ஐந்து', 'ஐம்பத்து ஆறு', 'ஐம்பத்து ஏழு', 'ஐம்பத்து எட்டு', 'ஐம்பத்து ஒன்பது',\n",
|
36 |
+
" # Tamil script for English combinations of 61-69\n",
|
37 |
+
" 'சிக்ஸ்டி ஒன்', 'சிக்ஸ்டி டூ', 'சிக்ஸ்டி த்ரீ', 'சிக்ஸ்டி ஃபோர்', 'சிக்ஸ்டி ஃபைவு', 'சிக்ஸ்டி சிக்ஸ்', 'சிக்ஸ்டி செவன்', 'சிக்ஸ்டி எட்டு', 'சிக்ஸ்டி நைன்', \n",
|
38 |
+
" # Tamil combinations of 61-69\n",
|
39 |
+
" 'அறுபத்து ஒன்று', 'அறுபத்து இரண்டு', 'அறுபத்து மூன்று', 'அறுபத்து நான்கு', 'அறுபத்து ஐந்து', 'அறுபத்து ஆறு', 'அறுபத்து ஏழு', 'அறுபத்து எட்டு', 'அறுபத்து ஒன்பது',\n",
|
40 |
+
" # Tamil script for English combinations of 71-79\n",
|
41 |
+
" 'சிவெண்டி ஒன்', 'சிவெண்டி டூ', 'சிவெண்டி த்ரீ', 'சிவெண்டி ஃபோர்', 'சிவெண்டி ஃபைவு', 'சிவெண்டி சிக்ஸ்', 'சிவெண்டி செவன்', 'சிவெண்டி எட்டு', 'சிவெண்டி நைன்', \n",
|
42 |
+
" # Tamil combinations of 71-79\n",
|
43 |
+
" 'எழுபத்து ஒன்று', 'எழுபத்து இரண்டு', 'எழுபத்து மூன்று', 'எழுபத்து நான்கு', 'எழுபத்து ஐந்து', 'எழுபத்து ஆறு', 'எழுபத்து ஏழு', 'எழுபத்து எட்டு', 'எழுபத்து ஒன்பது',\n",
|
44 |
+
" # Tamil script for English combinations of 81-89\n",
|
45 |
+
" 'எய்ட்டி ஒன்', 'எய்ட்டி டூ', 'எய்ட்டி த்ரீ', 'எய்ட்டி ஃபோர்', 'எய்ட்டி ஃபைவு', 'எய்ட்டி சிக்ஸ்', 'எய்ட்டி செவன்', 'எய்ட்டி எட்டு', 'எய்ட்டி நைன்', \n",
|
46 |
+
" # Tamil combinations of 81-89\n",
|
47 |
+
" 'எண்பத்து ஒன்று', 'எண்பத்து இரண்டு', 'எண்பத்து மூன்று', 'எண்பத்து நான்கு', 'எண்பத்து ஐந்து', 'எண்பத்து ஆறு', 'எண்பத்து ஏழு', 'எண்பத்து எட்டு', 'எண்பத்து ஒன்பது',\n",
|
48 |
+
" # Tamil script for English combinations of 91-99\n",
|
49 |
+
" 'நைன்டி ஒன்', 'நைன்டி டூ', 'நைன்டி த்ரீ', 'நைன்டி ஃபோர்', 'நைன்டி ஃபைவு', 'நைன்டி சிக்ஸ்', 'நைன்டி செவன்', 'நைன்டி எட்டு', 'நைன்டி நைன்', \n",
|
50 |
+
" # Tamil combinations of 91-99\n",
|
51 |
+
" 'தொண்ணூற்று ஒன்று', 'தொண்ணூற்று இரண்டு', 'தொண்ணூற்று மூன்று', 'தொண்ணூற்று நான்கு', 'தொண்ணூற்று ஐந்து', 'தொண்ணூற்று ஆறு', 'தொண்ணூற்று ஏழு', 'தொண்ணூற்று எட்டு', 'தொண்ணூற்று ஒன்பது',\n",
|
52 |
+
" # Tamil script for English numbers (0-10)\n",
|
53 |
+
" 'ஜீரோ', 'ஒன்', 'டூ', 'த்ரீ', 'போர்', 'ஃபைவ்', 'சிக்ஸ்', 'சிவன்', 'ஏட்', 'நைன்', 'டென்',\n",
|
54 |
+
" # Tamil numbers (0-10)\n",
|
55 |
+
" 'பூஜ்ஜியம்', 'ஒன்று', 'இரண்டு', 'மூன்று', 'நான்கு', 'ஐந்து', 'ஆறு', 'ஏழு', 'எட்டு', 'ஒன்பது', 'பத்து',\n",
|
56 |
+
" # Tamil script for English number 100\n",
|
57 |
+
" 'ஹண்ட்ரெட்',\n",
|
58 |
+
" # Tamil for 100\n",
|
59 |
+
" 'நூறு',\n",
|
60 |
+
" # Tamil for 1000\n",
|
61 |
+
" 'ஆயிரம்'\n",
|
62 |
+
" ]\n",
|
63 |
+
" return text_list\n"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"cell_type": "code",
|
68 |
+
"execution_count": null,
|
69 |
+
"id": "787fa243-0dba-4a35-b479-2e9198dc6af1",
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": []
|
73 |
+
}
|
74 |
+
],
|
75 |
+
"metadata": {
|
76 |
+
"kernelspec": {
|
77 |
+
"display_name": "Python 3 (ipykernel)",
|
78 |
+
"language": "python",
|
79 |
+
"name": "python3"
|
80 |
+
},
|
81 |
+
"language_info": {
|
82 |
+
"codemirror_mode": {
|
83 |
+
"name": "ipython",
|
84 |
+
"version": 3
|
85 |
+
},
|
86 |
+
"file_extension": ".py",
|
87 |
+
"mimetype": "text/x-python",
|
88 |
+
"name": "python",
|
89 |
+
"nbconvert_exporter": "python",
|
90 |
+
"pygments_lexer": "ipython3",
|
91 |
+
"version": "3.11.7"
|
92 |
+
}
|
93 |
+
},
|
94 |
+
"nbformat": 4,
|
95 |
+
"nbformat_minor": 5
|
96 |
+
}
|
convert2list.ipynb
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"id": "b52e9a66-a8e9-4f56-91fd-8564b5b636fc",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# nbimporter must be imported first so that sibling notebooks (e.g. Text2List.ipynb) can be imported as modules\n",
|
11 |
+
"import nbimporter\n",
|
12 |
+
"from Text2List import text_to_list\n",
|
13 |
+
"def convert_to_list(text, text_list):\n",
|
14 |
+
" matched_words = []\n",
|
15 |
+
" unmatched_text = '' # To accumulate unmatched characters\n",
|
16 |
+
"\n",
|
17 |
+
" # Sort text_list by length in descending order to prioritize longest matches first\n",
|
18 |
+
" text_list_sorted = sorted(text_list, key=len, reverse=True)\n",
|
19 |
+
"\n",
|
20 |
+
" while text:\n",
|
21 |
+
" matched = False\n",
|
22 |
+
" for word in text_list_sorted:\n",
|
23 |
+
" if text.startswith(word):\n",
|
24 |
+
" # Add any accumulated unmatched text before appending the matched word\n",
|
25 |
+
" if unmatched_text:\n",
|
26 |
+
" matched_words.append(unmatched_text)\n",
|
27 |
+
" unmatched_text = '' # Reset unmatched text accumulator\n",
|
28 |
+
"\n",
|
29 |
+
" matched_words.append(word)\n",
|
30 |
+
" text = text[len(word):] # Remove the matched part from text\n",
|
31 |
+
" matched = True\n",
|
32 |
+
" break\n",
|
33 |
+
"\n",
|
34 |
+
" if not matched:\n",
|
35 |
+
" # Accumulate unmatched characters\n",
|
36 |
+
" unmatched_text += text[0]\n",
|
37 |
+
" text = text[1:]\n",
|
38 |
+
"\n",
|
39 |
+
" # If there's any remaining unmatched text, add it to the result\n",
|
40 |
+
" if unmatched_text:\n",
|
41 |
+
" matched_words.append(unmatched_text)\n",
|
42 |
+
"\n",
|
43 |
+
" # Join matched words and unmatched text with a space\n",
|
44 |
+
" result = ' '.join(matched_words)\n",
|
45 |
+
" return result\n",
|
46 |
+
" \n",
|
47 |
+
"# text = \"जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच\"\n",
|
48 |
+
"\n",
|
49 |
+
"# if __name__==\"__main__\":\n",
|
50 |
+
"# converted=convert_to_list(text, text_to_list())\n",
|
51 |
+
"# print(converted)"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": null,
|
57 |
+
"id": "98835c96-2949-4e78-8d1e-c8623d5dcb00",
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [],
|
60 |
+
"source": []
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"metadata": {
|
64 |
+
"kernelspec": {
|
65 |
+
"display_name": "Python 3 (ipykernel)",
|
66 |
+
"language": "python",
|
67 |
+
"name": "python3"
|
68 |
+
},
|
69 |
+
"language_info": {
|
70 |
+
"codemirror_mode": {
|
71 |
+
"name": "ipython",
|
72 |
+
"version": 3
|
73 |
+
},
|
74 |
+
"file_extension": ".py",
|
75 |
+
"mimetype": "text/x-python",
|
76 |
+
"name": "python",
|
77 |
+
"nbconvert_exporter": "python",
|
78 |
+
"pygments_lexer": "ipython3",
|
79 |
+
"version": "3.11.7"
|
80 |
+
}
|
81 |
+
},
|
82 |
+
"nbformat": 4,
|
83 |
+
"nbformat_minor": 5
|
84 |
+
}
|
isNumber.ipynb
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "ac442c1a-b404-4936-afec-7e4eb43bb68b",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# Check whether a value can be converted to a float (commas in strings are ignored)\n",
|
11 |
+
"def is_number(x):\n",
|
12 |
+
" if type(x) == str:\n",
|
13 |
+
" x = x.replace(',', '')\n",
|
14 |
+
" try:\n",
|
15 |
+
" float(x)\n",
|
16 |
+
" except:\n",
|
17 |
+
" return False\n",
|
18 |
+
" return True"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "code",
|
23 |
+
"execution_count": null,
|
24 |
+
"id": "639ab4a5-6c8e-4ced-8736-a9a283e3bdb7",
|
25 |
+
"metadata": {},
|
26 |
+
"outputs": [],
|
27 |
+
"source": []
|
28 |
+
}
|
29 |
+
],
|
30 |
+
"metadata": {
|
31 |
+
"kernelspec": {
|
32 |
+
"display_name": "Python 3 (ipykernel)",
|
33 |
+
"language": "python",
|
34 |
+
"name": "python3"
|
35 |
+
},
|
36 |
+
"language_info": {
|
37 |
+
"codemirror_mode": {
|
38 |
+
"name": "ipython",
|
39 |
+
"version": 3
|
40 |
+
},
|
41 |
+
"file_extension": ".py",
|
42 |
+
"mimetype": "text/x-python",
|
43 |
+
"name": "python",
|
44 |
+
"nbconvert_exporter": "python",
|
45 |
+
"pygments_lexer": "ipython3",
|
46 |
+
"version": "3.11.7"
|
47 |
+
}
|
48 |
+
},
|
49 |
+
"nbformat": 4,
|
50 |
+
"nbformat_minor": 5
|
51 |
+
}
|
numberMapping.ipynb
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "4f170db4-bc35-4f00-9c10-25ae297beda5",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"replacement_map = {\n",
|
11 |
+
" 'zero': ['शून्य', 'जेरो', 'शुन्ना', 'जीरो'],\n",
|
12 |
+
" 'one': ['वन', 'एंक', 'इक', 'एक'],\n",
|
13 |
+
" 'two': ['टू', 'दौ', 'दो'],\n",
|
14 |
+
" 'three': ['थ्री', 'तीना', 'तीन', 'त्री'],\n",
|
15 |
+
" 'four': ['फोर', 'फॉर', 'च्यार', 'चार'],\n",
|
16 |
+
" 'five': ['फाइव', 'पाँच', 'पांच'],\n",
|
17 |
+
" 'six': ['सिक्स', 'चह', 'छौ', 'छै', 'छह'],\n",
|
18 |
+
" 'seven': ['सेवन', 'सात'],\n",
|
19 |
+
" 'eight': ['एट', 'अट', 'आठ'],\n",
|
20 |
+
" 'nine': ['नाइन', 'नौ'],\n",
|
21 |
+
" 'ten': ['टेन', 'दस'],\n",
|
22 |
+
" \n",
|
23 |
+
" # Numbers from 11 to 19\n",
|
24 |
+
" 'eleven': ['इलेवन', 'ग्यारह'],\n",
|
25 |
+
" 'twelve': ['ट्वेल्व', 'बारह'],\n",
|
26 |
+
" 'thirteen': ['थर्टीन', 'तेरह'],\n",
|
27 |
+
" 'fourteen': ['फोर्टीन', 'चौदह'],\n",
|
28 |
+
" 'fifteen': ['फिफ्टीन', 'पंद्रह'],\n",
|
29 |
+
" 'sixteen': ['सिक्स्टीन', 'सोलह'],\n",
|
30 |
+
" 'seventeen': ['सेवंटीन', 'सत्रह'],\n",
|
31 |
+
" 'eighteen': ['एटीन', 'अठारह'],\n",
|
32 |
+
" 'nineteen': ['नाइनटीन', 'उन्नीस'],\n",
|
33 |
+
"\n",
|
34 |
+
" # Multiples of ten\n",
|
35 |
+
" 'twenty': ['ट्वेंटी', 'बीस'],\n",
|
36 |
+
" 'thirty': ['थर्टी', 'तीस'],\n",
|
37 |
+
" 'forty': ['फोर्टी', 'चालीस'],\n",
|
38 |
+
" 'fifty': ['फिफ्टी', 'पचास'],\n",
|
39 |
+
" 'sixty': ['सिक्स्टी', 'साठ'],\n",
|
40 |
+
" 'seventy': ['सेवंटी', 'सत्तर'],\n",
|
41 |
+
" 'eighty': ['एटी', 'अस्सी'],\n",
|
42 |
+
" 'ninety': ['नाइंटी', 'नब्बे'],\n",
|
43 |
+
"\n",
|
44 |
+
" # Numbers from 21 to 29\n",
|
45 |
+
" 'twenty one': ['ट्वेंटी वन', 'इक्कीस'],\n",
|
46 |
+
" 'twenty two': ['ट्वेंटी टू', 'बाईस'],\n",
|
47 |
+
" 'twenty three': ['ट्वेंटी थ्री', 'तेईस'],\n",
|
48 |
+
" 'twenty four': ['ट्वेंटी फोर', 'चौबीस'],\n",
|
49 |
+
" 'twenty five': ['ट्वेंटी फाइव', 'पच्चीस'],\n",
|
50 |
+
" 'twenty six': ['ट्वेंटी सिक्स', 'छब्बीस'],\n",
|
51 |
+
" 'twenty seven': ['ट्वेंटी सेवन', 'सत्ताईस'],\n",
|
52 |
+
" 'twenty eight': ['ट्वेंटी एट', 'अट्ठाईस'],\n",
|
53 |
+
" 'twenty nine': ['ट्वेंटी नाइन', 'उनतीस'],\n",
|
54 |
+
"\n",
|
55 |
+
" # Numbers from 31 to 39\n",
|
56 |
+
" 'thirty one': ['थर्टी वन', 'इकतीस'],\n",
|
57 |
+
" 'thirty two': ['थर्टी टू', 'बत्तीस'],\n",
|
58 |
+
" 'thirty three': ['थर्टी थ्री', 'तेतीस'],\n",
|
59 |
+
" 'thirty four': ['थर्टी फोर', 'चौंतीस'],\n",
|
60 |
+
" 'thirty five': ['थर्टी फाइव', 'पैंतीस'],\n",
|
61 |
+
" 'thirty six': ['थर्टी सिक्स', 'छत्तीस'],\n",
|
62 |
+
" 'thirty seven': ['थर्टी सेवन', 'सैंतीस'],\n",
|
63 |
+
" 'thirty eight': ['थर्टी एट', 'अड़तीस'],\n",
|
64 |
+
" 'thirty nine': ['थर्टी नाइन', 'उनतालीस'],\n",
|
65 |
+
"\n",
|
66 |
+
" # Numbers from 41 to 49\n",
|
67 |
+
" 'forty one': ['फोर्टी वन', 'इकतालीस'],\n",
|
68 |
+
" 'forty two': ['फोर्टी टू', 'बयालीस'],\n",
|
69 |
+
" 'forty three': ['फोर्टी थ्री', 'तैंतालीस'],\n",
|
70 |
+
" 'forty four': ['फोर्टी फोर', 'चौंतालीस'],\n",
|
71 |
+
" 'forty five': ['फोर्टी फाइव', 'पैंतालीस'],\n",
|
72 |
+
" 'forty six': ['फोर्टी सिक्स', 'छयालिस'],\n",
|
73 |
+
" 'forty seven': ['फोर्टी सेवन', 'सैंतालीस'],\n",
|
74 |
+
" 'forty eight': ['फोर्टी एट', 'अड़तालीस'],\n",
|
75 |
+
" 'forty nine': ['फोर्टी नाइन', 'उनचास'],\n",
|
76 |
+
"\n",
|
77 |
+
" # Numbers from 51 to 59\n",
|
78 |
+
" 'fifty one': ['फिफ्टी वन', 'इक्यावन'],\n",
|
79 |
+
" 'fifty two': ['फिफ्टी टू', 'बावन'],\n",
|
80 |
+
" 'fifty three': ['फिफ्टी थ्री', 'तिरेपन'],\n",
|
81 |
+
" 'fifty four': ['फिफ्टी फोर', 'चौवन'],\n",
|
82 |
+
" 'fifty five': ['फिफ्टी फाइव', 'पचपन'],\n",
|
83 |
+
" 'fifty six': ['फिफ्टी सिक्स', 'छप्पन'],\n",
|
84 |
+
" 'fifty seven': ['फिफ्टी सेवन', 'सत्तावन'],\n",
|
85 |
+
" 'fifty eight': ['फिफ्टी एट', 'अट्ठावन'],\n",
|
86 |
+
" 'fifty nine': ['फिफ्टी नाइन', 'उनसठ'],\n",
|
87 |
+
"\n",
|
88 |
+
" # Numbers from 61 to 69\n",
|
89 |
+
" 'sixty one': ['सिक्स्टी वन', 'इकसठ'],\n",
|
90 |
+
" 'sixty two': ['सिक्स्टी टू', 'बासठ'],\n",
|
91 |
+
" 'sixty three': ['सिक्स्टी थ्री', 'तिरसठ'],\n",
|
92 |
+
" 'sixty four': ['सिक्स्टी फोर', 'चौंसठ'],\n",
|
93 |
+
" 'sixty five': ['सिक्स्टी फाइव', 'पैंसठ'],\n",
|
94 |
+
" 'sixty six': ['सिक्स्टी सिक्स', 'छियासठ'],\n",
|
95 |
+
" 'sixty seven': ['सिक्स्टी सेवन', 'सड़सठ'],\n",
|
96 |
+
" 'sixty eight': ['सिक्स्टी एट', 'अड़सठ'],\n",
|
97 |
+
" 'sixty nine': ['सिक्स्टी नाइन', 'उनहत्तर'],\n",
|
98 |
+
"\n",
|
99 |
+
" # Numbers from 71 to 79\n",
|
100 |
+
" 'seventy one': ['सेवंटी वन', 'इकहत्तर'],\n",
|
101 |
+
" 'seventy two': ['सेवंटी टू', 'बहत्तर'],\n",
|
102 |
+
" 'seventy three': ['सेवंटी थ्री', 'तिहत्तर'],\n",
|
103 |
+
" 'seventy four': ['सेवंटी फोर', 'चौहत्तर'],\n",
|
104 |
+
" 'seventy five': ['सेवंटी फाइव', 'पचहत्तर'],\n",
|
105 |
+
" 'seventy six': ['सेवंटी सिक्स', 'छिहत्तर'],\n",
|
106 |
+
" 'seventy seven': ['सेवंटी सेवन', 'सतहत्तर'],\n",
|
107 |
+
" 'seventy eight': ['सेवंटी एट', 'अठहत्तर'],\n",
|
108 |
+
" 'seventy nine': ['सेवंटी नाइन', 'उन्यासी'],\n",
|
109 |
+
"\n",
|
110 |
+
" # Numbers from 81 to 89\n",
|
111 |
+
" 'eighty one': ['एटी वन', 'इक्यासी'],\n",
|
112 |
+
" 'eighty two': ['एटी टू', 'बयासी'],\n",
|
113 |
+
" 'eighty three': ['एटी थ्री', 'तिरासी'],\n",
|
114 |
+
" 'eighty four': ['एटी फोर', 'चौरासी'],\n",
|
115 |
+
" 'eighty five': ['एटी फाइव', 'पचासी'],\n",
|
116 |
+
" 'eighty six': ['एटी सिक्स', 'छियासी'],\n",
|
117 |
+
" 'eighty seven': ['एटी सेवन', 'सतासी'],\n",
|
118 |
+
" 'eighty eight': ['एटी एट', 'अठासी'],\n",
|
119 |
+
" 'eighty nine': ['एटी नाइन', 'नवासी'],\n",
|
120 |
+
"\n",
|
121 |
+
" # Numbers from 91 to 99\n",
|
122 |
+
" 'ninety one': ['नाइंटी वन', 'इक्यानवे'],\n",
|
123 |
+
" 'ninety two': ['नाइंटी टू', 'बानवे'],\n",
|
124 |
+
" 'ninety three': ['नाइंटी थ्री', 'तिरानवे'],\n",
|
125 |
+
" 'ninety four': ['नाइंटी फोर', 'चौरानवे'],\n",
|
126 |
+
" 'ninety five': ['नाइंटी फाइव', 'पचानवे'],\n",
|
127 |
+
" 'ninety six': ['नाइंटी सिक्स', 'छियानवे'],\n",
|
128 |
+
" 'ninety seven': ['नाइंटी सेवन', 'सतानवे'],\n",
|
129 |
+
" 'ninety eight': ['नाइंटी एट', 'अठानवे'],\n",
|
130 |
+
" 'ninety nine': ['नाइंटी नाइन', 'निन्यानवे'],\n",
|
131 |
+
"\n",
|
132 |
+
" # Hundred\n",
|
133 |
+
" 'hundred': ['हंड्रेड', 'सौ'],\n",
|
134 |
+
"\n",
|
135 |
+
" # Special for double digits\n",
|
136 |
+
" 'डबल': ['दबल', 'डबल', 'दुबाल'],\n",
|
137 |
+
"}\n"
|
138 |
+
]
|
139 |
+
}
|
140 |
+
],
|
141 |
+
"metadata": {
|
142 |
+
"kernelspec": {
|
143 |
+
"display_name": "Python 3 (ipykernel)",
|
144 |
+
"language": "python",
|
145 |
+
"name": "python3"
|
146 |
+
},
|
147 |
+
"language_info": {
|
148 |
+
"codemirror_mode": {
|
149 |
+
"name": "ipython",
|
150 |
+
"version": 3
|
151 |
+
},
|
152 |
+
"file_extension": ".py",
|
153 |
+
"mimetype": "text/x-python",
|
154 |
+
"name": "python",
|
155 |
+
"nbconvert_exporter": "python",
|
156 |
+
"pygments_lexer": "ipython3",
|
157 |
+
"version": "3.11.7"
|
158 |
+
}
|
159 |
+
},
|
160 |
+
"nbformat": 4,
|
161 |
+
"nbformat_minor": 5
|
162 |
+
}
|
processDoubles.ipynb
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"id": "5912ba94-833f-4662-8b8c-f201b5dde892",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# # Function to process \"double\" followed by a number\n",
|
11 |
+
"# def process_doubles(sentence):\n",
|
12 |
+
"# tokens = sentence.split()\n",
|
13 |
+
"# result = []\n",
|
14 |
+
"# i = 0\n",
|
15 |
+
"# while i < len(tokens):\n",
|
16 |
+
"# if tokens[i] == \"डबल\":\n",
|
17 |
+
"# if i + 1 < len(tokens):\n",
|
18 |
+
"# result.append(tokens[i + 1])\n",
|
19 |
+
"# result.append(tokens[i + 1])\n",
|
20 |
+
"# i += 2\n",
|
21 |
+
"# else:\n",
|
22 |
+
"# result.append(tokens[i])\n",
|
23 |
+
"# i += 1\n",
|
24 |
+
"# else:\n",
|
25 |
+
"# result.append(tokens[i])\n",
|
26 |
+
"# i += 1\n",
|
27 |
+
"# return ' '.join(result)\n"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"execution_count": null,
|
33 |
+
"id": "7d7065ce-94c9-4cda-a398-71dac90a67ba",
|
34 |
+
"metadata": {},
|
35 |
+
"outputs": [],
|
36 |
+
"source": [
|
37 |
+
"import re\n",
|
38 |
+
"\n",
|
39 |
+
"def process_doubles(sentence):\n",
|
40 |
+
" # Use regex to split 'डबल' followed by numbers/words without space (e.g., \"डबलवन\" -> \"डबल वन\")\n",
|
41 |
+
" sentence = re.sub(r'(डबल)(\\S+)', r'\\1 \\2', sentence)\n",
|
42 |
+
" \n",
|
43 |
+
" tokens = sentence.split()\n",
|
44 |
+
" result = []\n",
|
45 |
+
" i = 0\n",
|
46 |
+
" \n",
|
47 |
+
" while i < len(tokens):\n",
|
48 |
+
" if tokens[i] == \"डबल\":\n",
|
49 |
+
" if i + 1 < len(tokens):\n",
|
50 |
+
" result.append(tokens[i + 1]) # Append the next word/number\n",
|
51 |
+
" result.append(tokens[i + 1]) # Append the next word/number again to duplicate\n",
|
52 |
+
" i += 2 # Skip over the next word since it's already processed\n",
|
53 |
+
" else:\n",
|
54 |
+
" result.append(tokens[i])\n",
|
55 |
+
" i += 1\n",
|
56 |
+
" else:\n",
|
57 |
+
" result.append(tokens[i])\n",
|
58 |
+
" i += 1\n",
|
59 |
+
"\n",
|
60 |
+
" return ' '.join(result)\n"
|
61 |
+
]
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"metadata": {
|
65 |
+
"kernelspec": {
|
66 |
+
"display_name": "Python 3 (ipykernel)",
|
67 |
+
"language": "python",
|
68 |
+
"name": "python3"
|
69 |
+
},
|
70 |
+
"language_info": {
|
71 |
+
"codemirror_mode": {
|
72 |
+
"name": "ipython",
|
73 |
+
"version": 3
|
74 |
+
},
|
75 |
+
"file_extension": ".py",
|
76 |
+
"mimetype": "text/x-python",
|
77 |
+
"name": "python",
|
78 |
+
"nbconvert_exporter": "python",
|
79 |
+
"pygments_lexer": "ipython3",
|
80 |
+
"version": "3.11.7"
|
81 |
+
}
|
82 |
+
},
|
83 |
+
"nbformat": 4,
|
84 |
+
"nbformat_minor": 5
|
85 |
+
}
|
replaceWords.ipynb
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 9,
|
6 |
+
"id": "19bbb494-3054-48ae-9b64-7f0756c0532d",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import re\n",
|
11 |
+
"def replace_words(sentence):\n",
|
12 |
+
" replacement_map = {\n",
|
13 |
+
" # Single digits\n",
|
14 |
+
" 'one': ['ஒன்று', 'ஒன்னு', 'ஒன்'],\n",
|
15 |
+
" 'two': ['இரண்டு', 'ரெண்டு', 'டூ'],\n",
|
16 |
+
" 'three': ['மூன்று', 'முன்னு', 'த்ரீ'],\n",
|
17 |
+
" 'four': ['நான்கு', 'நாலு', 'ஃபோர்'],\n",
|
18 |
+
" 'five': ['ஐந்து', 'அஞ்சு', 'ஃபைவ்'],\n",
|
19 |
+
" 'six': ['ஆறு', 'ஆறு', 'சிக்ஸ்'],\n",
|
20 |
+
" 'seven': ['ஏழு', 'எழு', 'செவன்'],\n",
|
21 |
+
" 'eight': ['எட்டு', 'எட்டு', 'எய்ட்'],\n",
|
22 |
+
" 'nine': ['ஒன்பது', 'ஒம்பது', 'நைன்'],\n",
|
23 |
+
" 'ten': ['பத்து', 'பத்து', 'டென'],\n",
|
24 |
+
"\n",
|
25 |
+
" # Numbers from 11 to 19\n",
|
26 |
+
" 'eleven': ['பதினொன்று', 'பதினொன்னு', 'எலெவன்'],\n",
|
27 |
+
" 'twelve': ['பன்னிரண்டு', 'பன்னிரண்டு', 'ட்வெல்வ்'],\n",
|
28 |
+
" 'thirteen': ['பதின்மூன்று', 'பதின்முன்னு', 'தர்டீன்'],\n",
|
29 |
+
" 'fourteen': ['பதினான்கு', 'பதின்நாலு', 'ஃபோர்டீன்'],\n",
|
30 |
+
" 'fifteen': ['பதினைந்து', 'பதினஞ்சு', 'ஃபிப்டீன்'],\n",
|
31 |
+
" 'sixteen': ['பதினாறு', 'பதினாறோ', 'சிக்ஸ்டீன்'],\n",
|
32 |
+
" 'seventeen': ['பதினேழு', 'பதினேழு', 'செவன்டீன்'],\n",
|
33 |
+
" 'eighteen': ['பதினெட்டு', 'பதினெட்டு', 'ஏட்டீன்'],\n",
|
34 |
+
" 'nineteen': ['பத்தொன்பது', 'பத்தொம்பது', 'நைன்டீன்'],\n",
|
35 |
+
"\n",
|
36 |
+
" # Multiples of ten\n",
|
37 |
+
" 'twenty': ['இருபது', 'இருபத்து', 'ட்வென்டி'],\n",
|
38 |
+
" 'thirty': ['முப்பது', 'முப்பத்து', 'தர்டி'],\n",
|
39 |
+
" 'forty': ['நாற்பது', 'நாற்பத்து', 'ஃபோர்டி'],\n",
|
40 |
+
" 'fifty': ['ஐம்பது', 'ஐம்பத்து', 'ஃபிப்டி'],\n",
|
41 |
+
" 'sixty': ['அறுபது', 'அறுபத்து', 'சிக்ஸ்டி'],\n",
|
42 |
+
" 'seventy': ['எழுபது', 'எழுபத்து', 'செவன்டி'],\n",
|
43 |
+
" 'eighty': ['எண்பது', 'எண்பத்து', 'ஏட்டி'],\n",
|
44 |
+
" 'ninety': ['தொண்ணூறு', 'தொன்னூறு', 'நைன்டி'],\n",
|
45 |
+
"\n",
|
46 |
+
" # Numbers from 21 to 29\n",
|
47 |
+
" 'twenty one': ['இருபத்து ஒன்று', 'இருபத்தொன்று', 'ட்வென்டி ஒன்'],\n",
|
48 |
+
" 'twenty two': ['இருபத்து இரண்டு', 'இருபத்திரண்டு', 'ட்வென்டி டூ'],\n",
|
49 |
+
" 'twenty three': ['இருபத்து மூன்று', 'இருபத்துமூன்று', 'ட்வென்டி த்ரீ'],\n",
|
50 |
+
" 'twenty four': ['இருபத்து நான்கு', 'இருபத்துநான்கு', 'ட்வென்டி ஃபோர்'],\n",
|
51 |
+
" 'twenty five': ['இருபத்து ஐந்து', 'இருபத்துஐந்து', 'ட்வென்டி ஃபைவ்'],\n",
|
52 |
+
" 'twenty six': ['இருபத்து ஆறு', 'இருபத்துஆறு', 'ட்வென்டி சிக்ஸ்'],\n",
|
53 |
+
" 'twenty seven': ['இருபத்து ஏழு', 'இருபத்துஏழு', 'ட்வென்டி செவன்'],\n",
|
54 |
+
" 'twenty eight': ['இருபத்து எட்டு', 'இருபத்துஎட்டு', 'ட்வென்டி ஏட்'],\n",
|
55 |
+
" 'twenty nine': ['இருபத்து ஒன்பது', 'இருபத்தொன்பது', 'ட்வென்டி நைன்'],\n",
|
56 |
+
"\n",
|
57 |
+
" # Numbers from 31 to 39\n",
|
58 |
+
" 'thirty one': ['முப்பத்து ஒன்று', 'முப்பத்தொன்று', 'தர்டி ஒன்'],\n",
|
59 |
+
" 'thirty two': ['முப்பத்து இரண்டு', 'முப்பத்திரண்டு', 'தர்டி டூ'],\n",
|
60 |
+
" 'thirty three': ['முப்பத்து மூன்று', 'முப்பத்துமூன்று', 'தர்டி த்ரீ'],\n",
|
61 |
+
" 'thirty four': ['முப்பத்து நான்க���', 'முப்பத்துநான்கு', 'தர்டி ஃபோர்'],\n",
|
62 |
+
" 'thirty five': ['முப்பத்து ஐந்து', 'முப்பத்துஐந்து', 'தர்டி ஃபைவ்'],\n",
|
63 |
+
" 'thirty six': ['முப்பத்து ஆறு', 'முப்பத்துஆறு', 'தர்டி சிக்ஸ்'],\n",
|
64 |
+
" 'thirty seven': ['முப்பத்து ஏழு', 'முப்பத்துஏழு', 'தர்டி செவன்'],\n",
|
65 |
+
" 'thirty eight': ['முப்பத்து எட்டு', 'முப்பத்துஎட்டு', 'தர்டி ஏட்'],\n",
|
66 |
+
" 'thirty nine': ['முப்பத்து ஒன்பது', 'முப்பத்தொன்பது', 'தர்டி நைன்'],\n",
|
67 |
+
"\n",
|
68 |
+
" # Numbers from 41 to 49\n",
|
69 |
+
" 'forty one': ['நாற்பத்து ஒன்று', 'நாற்பத்தொன்று', 'ஃபோர்டி ஒன்'],\n",
|
70 |
+
" 'forty two': ['நாற்பத்து இரண்டு', 'நாற்பத்திரண்டு', 'ஃபோர்டி டூ'],\n",
|
71 |
+
" 'forty three': ['நாற்பத்து மூன்று', 'நாற்பத்துமூன்று', 'ஃபோர்டி த்ரீ'],\n",
|
72 |
+
" 'forty four': ['நாற்பத்து நான்கு', 'நாற்பத்துநான்கு', 'ஃபோர்டி ஃபோர்'],\n",
|
73 |
+
" 'forty five': ['நாற்பத்து ஐந்து', 'நாற்பத்துஐந்து', 'ஃபோர்டி ஃபைவ்'],\n",
|
74 |
+
" 'forty six': ['நாற்பத்து ஆறு', 'நாற்பத்துஆறு', 'ஃபோர்டி சிக்ஸ்'],\n",
|
75 |
+
" 'forty seven': ['நாற்பத்து ஏழு', 'நாற்பத்துஏழு', 'ஃபோர்டி செவன்'],\n",
|
76 |
+
" 'forty eight': ['நாற்பத்து எட்டு', 'நாற்பத்துஎட்டு', 'ஃபோர்டி ஏட்'],\n",
|
77 |
+
" 'forty nine': ['நாற்பத்து ஒன்பது', 'நாற்பத்தொன்பது', 'ஃபோர்டி நைன்'],\n",
|
78 |
+
"\n",
|
79 |
+
" # Numbers from 51 to 59\n",
|
80 |
+
" 'fifty one': ['ஐம்பத்து ஒன்று', 'ஐம்பத்தொன்று', 'ஃபிப்டி ஒன்'],\n",
|
81 |
+
" 'fifty two': ['ஐம்பத்து இரண்டு', 'ஐம்பத்திரண்டு', 'ஃபிப்டி டூ'],\n",
|
82 |
+
" 'fifty three': ['ஐம்பத்து மூன்று', 'ஐம்பத்துமூன்று', 'ஃபிப்டி த்ரீ'],\n",
|
83 |
+
" 'fifty four': ['ஐம்பத்து நான்கு', 'ஐம்பத்துநான்கு', 'ஃபிப்டி ஃபோர்'],\n",
|
84 |
+
" 'fifty five': ['ஐம்பத்து ஐந்து', 'ஐம்பத்துஐந்து', 'ஃபிப்டி ஃபைவ்'],\n",
|
85 |
+
" 'fifty six': ['ஐம்பத்து ஆறு', 'ஐம்பத்துஆறு', 'ஃபிப்டி சிக்ஸ்'],\n",
|
86 |
+
" 'fifty seven': ['ஐம்பத்து ஏழு', 'ஐம்பத்துஏழு', 'ஃபிப்டி செவன்'],\n",
|
87 |
+
" 'fifty eight': ['ஐம்பத்து எட்டு', 'ஐம்பத்துஎட்டு', 'ஃபிப்டி ஏட்'],\n",
|
88 |
+
" 'fifty nine': ['ஐம்பத்து ஒன்பது', 'ஐம்பத்தொன்பது', 'ஃபிப்டி நைன்'],\n",
|
89 |
+
" \n",
|
90 |
+
" # Numbers from 61 to 69\n",
|
91 |
+
" 'sixty one': ['அறுபத்து ஒன்று', 'அறுபத்தொன்று', 'சிக்ஸ்டி ஒன்'],\n",
|
92 |
+
" 'sixty two': ['அறுபத்து இரண்டு', 'அறுபத்திரண்டு', 'சிக்ஸ்டி டூ'],\n",
|
93 |
+
" 'sixty three': ['அறுபத்து மூன்று', 'அறுபத்துமூன்று', 'சிக்ஸ்டி த்ரீ'],\n",
|
94 |
+
" 'sixty four': ['அறுபத்து நான்கு', 'அறுபத்துநான்கு', 'சிக்ஸ்டி ஃபோர்'],\n",
|
95 |
+
" 'sixty five': ['அறுபத்து ஐந்து', 'அறுபத்துஐந்து', 'சிக்ஸ்டி ஃபைவ்'],\n",
|
96 |
+
" 'sixty six': ['அறுபத்து ஆறு', 'அறுபத்துஆறு', 'சிக்ஸ்டி சிக்ஸ்'],\n",
|
97 |
+
" 'sixty seven': ['அறுபத்து ஏழு', 'அறுபத்துஏழு', 'சிக்ஸ்டி செவன்'],\n",
|
98 |
+
" 'sixty eight': ['அறுபத்து எட்டு', 'அறுபத்துஎட்டு', 'சிக்ஸ்டி ஏட்'],\n",
|
99 |
+
" 'sixty nine': ['அறுபத்து ஒன்பது', 'அறுபத்தொன்பது', 'சிக்ஸ்டி நைன்'],\n",
|
100 |
+
"\n",
|
101 |
+
" # Numbers from 71 to 79\n",
|
102 |
+
" 'seventy one': ['எழுபத்து ஒன்று', 'எழுபத்தொன்று', 'செவன்டி ஒன்'],\n",
|
103 |
+
" 'seventy two': ['எழுபத்து இரண்டு', 'எழுபத்திரண்டு', 'செவன்டி டூ'],\n",
|
104 |
+
" 'seventy three': ['எழுபத்து மூன்று', 'எழுபத்துமூன்று', 'செவன்டி த்ரீ'],\n",
|
105 |
+
" 'seventy four': ['எழுபத்து நான்கு', 'எழுபத்துநான்கு', 'செவன்டி ஃபோர்'],\n",
|
106 |
+
" 'seventy five': ['எழுபத்து ஐந்து', 'எழுபத்துஐந்து', 'செவன்டி ஃபைவ்'],\n",
|
107 |
+
" 'seventy six': ['எழுபத்து ஆறு', 'எழுபத்துஆறு', 'செவன்டி சிக்ஸ்'],\n",
|
108 |
+
" 'seventy seven': ['எழுபத்து ஏழு', 'எழுபத்துஏழு', 'செவன்டி செவன்'],\n",
|
109 |
+
" 'seventy eight': ['எழுபத்து எட்டு', 'எழுபத்துஎட்டு', 'செவன்டி ஏட்'],\n",
|
110 |
+
" 'seventy nine': ['எழுபத்து ஒன்பது', 'எழுபத்தொன்பது', 'செவன்டி நைன்'],\n",
|
111 |
+
"\n",
|
112 |
+
" # Numbers from 81 to 89\n",
|
113 |
+
" 'eighty one': ['எண்பத்து ஒன்று', 'எண்பத்தொன்று', 'ஏட்டி ஒன்'],\n",
|
114 |
+
" 'eighty two': ['எண்பத்து இரண்டு', 'எண்பத்திரண்டு', 'ஏட்டி டூ'],\n",
|
115 |
+
" 'eighty three': ['எண்பத்து மூன்று', 'எண்பத்துமூன்று', 'ஏட்டி த்ரீ'],\n",
|
116 |
+
" 'eighty four': ['எண்பத்து நான்கு', 'எண்பத்துநான்கு', 'ஏட்டி ஃபோர்'],\n",
|
117 |
+
" 'eighty five': ['எண்பத்து ஐந்து', 'எண்பத்துஐந்து', 'ஏட்டி ஃபைவ்'],\n",
|
118 |
+
" 'eighty six': ['எண்பத்து ஆறு', 'எண்பத்துஆறு', 'ஏட்டி சிக்ஸ்'],\n",
|
119 |
+
" 'eighty seven': ['எண்பத்து ஏழு', 'எண்பத்துஏழு', 'ஏட்டி செவன்'],\n",
|
120 |
+
" 'eighty eight': ['எண்பத்து எட்டு', 'எண்பத்துஎட்டு', 'ஏட்டி ஏட்'],\n",
|
121 |
+
" 'eighty nine': ['எண்பத்து ஒன்பது', 'எண்பத்தொன்பது', 'ஏட்டி நைன்'],\n",
|
122 |
+
"\n",
|
123 |
+
" # Numbers from 91 to 99\n",
|
124 |
+
" 'ninety one': ['தொண்ணூற்று ஒன்று', 'தொண்ணூற்றொன்று', 'நைன்டி ஒன்'],\n",
|
125 |
+
" 'ninety two': ['தொண்ணூற்று இரண்டு', 'தொண்ணூற்றிரண்டு', 'நைன்டி டூ'],\n",
|
126 |
+
" 'ninety three': ['தொண்ணூற்று மூன்று', 'தொண்ணூற்றுமூன்று', 'நைன்டி த்ரீ'],\n",
|
127 |
+
" 'ninety four': ['தொண்ணூற்று நான்கு', 'தொண்ணூற்றுநான்கு', 'நைன்டி ஃபோர்'],\n",
|
128 |
+
" 'ninety five': ['தொண்ணூற்று ஐந்து', 'தொண்ணூற்றுஐந்து', 'நைன்டி ஃபைவ்'],\n",
|
129 |
+
" 'ninety six': ['தொண்ணூற்று ஆறு', 'தொண்ணூற்றுஆறு', 'நைன்டி சிக்ஸ்'],\n",
|
130 |
+
" 'ninety seven': ['தொண்ணூற்று ஏழு', 'தொண்ணூற்றுஏழு', 'நைன்டி செவன்'],\n",
|
131 |
+
" 'ninety eight': ['தொண்ணூற்று எட்டு', 'தொண்ணூற்றுஎட்டு', 'நைன்டி ஏட்'],\n",
|
132 |
+
" 'ninety nine': ['தொண்ணூற்று ஒன்பது', 'தொண்ணூற்றொன்பது', 'நைன்டி நைன்'],\n",
|
133 |
+
"\n",
|
134 |
+
" # Hundred\n",
|
135 |
+
" 'hundred': ['நூறு', 'நூறை', 'ஹண்ட்ரெட்'],\n",
|
136 |
+
" # Thousand\n",
|
137 |
+
" 'thousand': ['ஆயிரம்'],\n",
|
138 |
+
" }\n",
|
139 |
+
" \n",
|
140 |
+
" words = sentence.split() # Split the sentence by spaces\n",
|
141 |
+
" \n",
|
142 |
+
" # Replace words using the mapping\n",
|
143 |
+
" for i, word in enumerate(words):\n",
|
144 |
+
" for replacement, patterns in replacement_map.items():\n",
|
145 |
+
" if word in patterns:\n",
|
146 |
+
" words[i] = replacement # Replace the word if it's fully matched\n",
|
147 |
+
" \n",
|
148 |
+
" # Join the processed words back into a sentence\n",
|
149 |
+
" return ' '.join(words)"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": null,
|
155 |
+
"id": "7bdb593a-cb68-4b04-af8d-b61ea396a5eb",
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [],
|
158 |
+
"source": []
|
159 |
+
}
|
160 |
+
],
|
161 |
+
"metadata": {
|
162 |
+
"kernelspec": {
|
163 |
+
"display_name": "Python 3 (ipykernel)",
|
164 |
+
"language": "python",
|
165 |
+
"name": "python3"
|
166 |
+
},
|
167 |
+
"language_info": {
|
168 |
+
"codemirror_mode": {
|
169 |
+
"name": "ipython",
|
170 |
+
"version": 3
|
171 |
+
},
|
172 |
+
"file_extension": ".py",
|
173 |
+
"mimetype": "text/x-python",
|
174 |
+
"name": "python",
|
175 |
+
"nbconvert_exporter": "python",
|
176 |
+
"pygments_lexer": "ipython3",
|
177 |
+
"version": "3.11.7"
|
178 |
+
}
|
179 |
+
},
|
180 |
+
"nbformat": 4,
|
181 |
+
"nbformat_minor": 5
|
182 |
+
}
|
text2int.ipynb
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "61185b34-45e0-4a78-a84b-2cedd08ad39a",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# # Function to convert Hindi text to numerical representation\n",
|
11 |
+
"# from isNumber import is_number\n",
|
12 |
+
"\n",
|
13 |
+
"# def text_to_int (textnum, numwords={}):\n",
|
14 |
+
"# units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n",
|
15 |
+
"# 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n",
|
16 |
+
"# 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n",
|
17 |
+
"# tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n",
|
18 |
+
"# scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']\n",
|
19 |
+
"# ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}\n",
|
20 |
+
"# ordinal_endings = [('ieth', 'y'), ('th', '')]\n",
|
21 |
+
"\n",
|
22 |
+
"# if not numwords:\n",
|
23 |
+
"# numwords['and'] = (1, 0)\n",
|
24 |
+
"# for idx, word in enumerate(units): numwords[word] = (1, idx)\n",
|
25 |
+
"# for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)\n",
|
26 |
+
"# for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)\n",
|
27 |
+
"\n",
|
28 |
+
"# textnum = textnum.replace('-', ' ')\n",
|
29 |
+
"\n",
|
30 |
+
"# current = result = 0\n",
|
31 |
+
"# curstring = ''\n",
|
32 |
+
"# onnumber = False\n",
|
33 |
+
"# lastunit = False\n",
|
34 |
+
"# lastscale = False\n",
|
35 |
+
"\n",
|
36 |
+
"# def is_numword(x):\n",
|
37 |
+
"# if is_number(x):\n",
|
38 |
+
"# return True\n",
|
39 |
+
"# if word in numwords:\n",
|
40 |
+
"# return True\n",
|
41 |
+
"# return False\n",
|
42 |
+
"\n",
|
43 |
+
"# def from_numword(x):\n",
|
44 |
+
"# if is_number(x):\n",
|
45 |
+
"# scale = 0\n",
|
46 |
+
"# increment = int(x.replace(',', ''))\n",
|
47 |
+
"# return scale, increment\n",
|
48 |
+
"# return numwords[x]\n",
|
49 |
+
"\n",
|
50 |
+
"# for word in textnum.split():\n",
|
51 |
+
"# if word in ordinal_words:\n",
|
52 |
+
"# scale, increment = (1, ordinal_words[word])\n",
|
53 |
+
"# current = current * scale + increment\n",
|
54 |
+
"# if scale > 100:\n",
|
55 |
+
"# result += current\n",
|
56 |
+
"# current = 0\n",
|
57 |
+
"# onnumber = True\n",
|
58 |
+
"# lastunit = False\n",
|
59 |
+
"# lastscale = False\n",
|
60 |
+
"# else:\n",
|
61 |
+
"# for ending, replacement in ordinal_endings:\n",
|
62 |
+
"# if word.endswith(ending):\n",
|
63 |
+
"# word = \"%s%s\" % (word[:-len(ending)], replacement)\n",
|
64 |
+
"\n",
|
65 |
+
"# if (not is_numword(word)) or (word == 'and' and not lastscale):\n",
|
66 |
+
"# if onnumber:\n",
|
67 |
+
"# # Flush the current number we are building\n",
|
68 |
+
"# curstring += repr(result + current) + \" \"\n",
|
69 |
+
"# curstring += word + \" \"\n",
|
70 |
+
"# result = current = 0\n",
|
71 |
+
"# onnumber = False\n",
|
72 |
+
"# lastunit = False\n",
|
73 |
+
"# lastscale = False\n",
|
74 |
+
"# else:\n",
|
75 |
+
"# scale, increment = from_numword(word)\n",
|
76 |
+
"# onnumber = True\n",
|
77 |
+
"\n",
|
78 |
+
"# if lastunit and (word not in scales): \n",
|
79 |
+
"# # Assume this is part of a string of individual numbers to \n",
|
80 |
+
"# # be flushed, such as a zipcode \"one two three four five\" \n",
|
81 |
+
"# curstring += repr(result + current) \n",
|
82 |
+
"# result = current = 0 \n",
|
83 |
+
"\n",
|
84 |
+
"# if scale > 1: \n",
|
85 |
+
"# current = max(1, current) \n",
|
86 |
+
"\n",
|
87 |
+
"# current = current * scale + increment \n",
|
88 |
+
"# if scale > 100: \n",
|
89 |
+
"# result += current \n",
|
90 |
+
"# current = 0 \n",
|
91 |
+
"\n",
|
92 |
+
"# lastscale = False \n",
|
93 |
+
"# lastunit = False \n",
|
94 |
+
"# if word in scales: \n",
|
95 |
+
"# lastscale = True \n",
|
96 |
+
"# elif word in units: \n",
|
97 |
+
"# lastunit = True\n",
|
98 |
+
"\n",
|
99 |
+
"# if onnumber:\n",
|
100 |
+
"# curstring += repr(result + current)\n",
|
101 |
+
"\n",
|
102 |
+
"# return curstring\n"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": 3,
|
108 |
+
"id": "a87b26d7-4a0e-4fdc-b03e-1537600faf65",
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [],
|
111 |
+
"source": [
|
112 |
+
"import nbimporter\n",
|
113 |
+
"from isNumber import is_number # Remove or replace this if unnecessary\n",
|
114 |
+
"\n",
|
115 |
+
"def text_to_int(textnum, numwords={}):\n",
|
116 |
+
" # Define units, tens, and scales including \"lac\"\n",
|
117 |
+
" units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n",
|
118 |
+
" 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n",
|
119 |
+
" 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n",
|
120 |
+
" tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n",
|
121 |
+
" scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # \"lac\" added\n",
|
122 |
+
" ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}\n",
|
123 |
+
" ordinal_endings = [('ieth', 'y'), ('th', '')]\n",
|
124 |
+
"\n",
|
125 |
+
" if not numwords:\n",
|
126 |
+
" numwords['and'] = (1, 0) # Handle \"one hundred and twenty\"\n",
|
127 |
+
" \n",
|
128 |
+
" # Add units, tens, and scales to numwords\n",
|
129 |
+
" for idx, word in enumerate(units):\n",
|
130 |
+
" numwords[word] = (1, idx)\n",
|
131 |
+
" for idx, word in enumerate(tens):\n",
|
132 |
+
" numwords[word] = (1, idx * 10)\n",
|
133 |
+
" \n",
|
134 |
+
" for idx, word in enumerate(scales):\n",
|
135 |
+
" numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle \"lac\" as 10^5\n",
|
136 |
+
"\n",
|
137 |
+
" # Remove hyphens and normalize input\n",
|
138 |
+
" textnum = textnum.replace('-', ' ')\n",
|
139 |
+
"\n",
|
140 |
+
" current = result = 0\n",
|
141 |
+
" curstring = ''\n",
|
142 |
+
" onnumber = False\n",
|
143 |
+
" lastunit = False\n",
|
144 |
+
" lastscale = False\n",
|
145 |
+
"\n",
|
146 |
+
" def is_numword(x):\n",
|
147 |
+
" return is_number(x) or x in numwords\n",
|
148 |
+
"\n",
|
149 |
+
" def from_numword(x):\n",
|
150 |
+
" if is_number(x):\n",
|
151 |
+
" return 0, int(x.replace(',', ''))\n",
|
152 |
+
" return numwords[x]\n",
|
153 |
+
"\n",
|
154 |
+
" for word in textnum.split():\n",
|
155 |
+
" if word in ordinal_words:\n",
|
156 |
+
" scale, increment = (1, ordinal_words[word])\n",
|
157 |
+
" current = current * scale + increment\n",
|
158 |
+
" if scale > 100:\n",
|
159 |
+
" result += current\n",
|
160 |
+
" current = 0\n",
|
161 |
+
" onnumber = True\n",
|
162 |
+
" lastunit = False\n",
|
163 |
+
" lastscale = False\n",
|
164 |
+
" else:\n",
|
165 |
+
" for ending, replacement in ordinal_endings:\n",
|
166 |
+
" if word.endswith(ending):\n",
|
167 |
+
" word = f\"{word[:-len(ending)]}{replacement}\"\n",
|
168 |
+
"\n",
|
169 |
+
" if not is_numword(word) or (word == 'and' and not lastscale):\n",
|
170 |
+
" if onnumber:\n",
|
171 |
+
" curstring += repr(result + current) + \" \"\n",
|
172 |
+
" curstring += word + \" \"\n",
|
173 |
+
" result = current = 0\n",
|
174 |
+
" onnumber = False\n",
|
175 |
+
" lastunit = False\n",
|
176 |
+
" lastscale = False\n",
|
177 |
+
" else:\n",
|
178 |
+
" scale, increment = from_numword(word)\n",
|
179 |
+
" onnumber = True\n",
|
180 |
+
"\n",
|
181 |
+
" if lastunit and word not in scales:\n",
|
182 |
+
" curstring += repr(result + current) + \" \"\n",
|
183 |
+
" result = current = 0\n",
|
184 |
+
"\n",
|
185 |
+
" if scale > 1:\n",
|
186 |
+
" current = max(1, current)\n",
|
187 |
+
"\n",
|
188 |
+
" current = current * scale + increment\n",
|
189 |
+
"\n",
|
190 |
+
" if scale >= 100:\n",
|
191 |
+
" result += current\n",
|
192 |
+
" current = 0\n",
|
193 |
+
"\n",
|
194 |
+
" lastscale = word in scales\n",
|
195 |
+
" lastunit = word in units\n",
|
196 |
+
"\n",
|
197 |
+
" if onnumber:\n",
|
198 |
+
" curstring += repr(result + current)\n",
|
199 |
+
"\n",
|
200 |
+
" return curstring.strip()"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"cell_type": "code",
|
205 |
+
"execution_count": null,
|
206 |
+
"id": "83997c73-e1b4-4863-b1df-d6de6153e80d",
|
207 |
+
"metadata": {},
|
208 |
+
"outputs": [],
|
209 |
+
"source": []
|
210 |
+
}
|
211 |
+
],
|
212 |
+
"metadata": {
|
213 |
+
"kernelspec": {
|
214 |
+
"display_name": "Python 3 (ipykernel)",
|
215 |
+
"language": "python",
|
216 |
+
"name": "python3"
|
217 |
+
},
|
218 |
+
"language_info": {
|
219 |
+
"codemirror_mode": {
|
220 |
+
"name": "ipython",
|
221 |
+
"version": 3
|
222 |
+
},
|
223 |
+
"file_extension": ".py",
|
224 |
+
"mimetype": "text/x-python",
|
225 |
+
"name": "python",
|
226 |
+
"nbconvert_exporter": "python",
|
227 |
+
"pygments_lexer": "ipython3",
|
228 |
+
"version": "3.11.7"
|
229 |
+
}
|
230 |
+
},
|
231 |
+
"nbformat": 4,
|
232 |
+
"nbformat_minor": 5
|
233 |
+
}
|