Spaces:

cdactvm
/

Tamil_ASR_Demo

Sleeping

App Files Files Community

cdactvm committed on Dec 11, 2024

Commit

cd1b576

verified ·

1 Parent(s): 63a812a

Upload 13 files

Browse files

Files changed (13) hide show

Tamil_number_conversion.py +78 -0
Text2List.py +67 -0
applyVad.py +212 -0
convert2list.py +55 -0
highPassFilter.py +41 -0
ipynb2py.py +41 -0
isNumber.py +22 -0
numberMapping.py +135 -0
processDoubles.py +54 -0
replaceWords.py +153 -0
text2int.py +200 -0
waveletDenoise.py +21 -0
wienerFilter.py +22 -0

Tamil_number_conversion.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+import gradio as gr
+import librosa
+import numpy as np
+import pywt
+import nbimporter
+from scipy.signal import butter, lfilter, wiener
+from scipy.io.wavfile import write
+from transformers import pipeline
+from text2int import text_to_int
+from isNumber import is_number
+from Text2List import text_to_list
+from convert2list import convert_to_list
+from processDoubles import process_doubles
+from replaceWords import replace_words
# Build the speech-recognition pipeline once at import time; recognize_speech()
# reuses this single instance for every request.
asr_model = pipeline(
    task="automatic-speech-recognition",
    model="cdactvm/w2v-bert-tamil_new",
)
+# Function to apply a high-pass filter
def high_pass_filter(audio, sr, cutoff=300, order=1):
    """Attenuate content below ``cutoff`` Hz with a Butterworth high-pass filter.

    Args:
        audio: 1-D sequence/array of samples.
        sr: Sample rate of ``audio`` in Hz.
        cutoff: Cutoff frequency in Hz. It is normalised against the Nyquist
            frequency (``sr / 2``) before the filter is designed.
        order: Butterworth filter order. Defaults to 1, the value that used
            to be hard-coded.

    Returns:
        The filtered signal as returned by ``scipy.signal.lfilter``.
    """
    nyquist = 0.5 * sr
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='high', analog=False)
    # lfilter is a single forward pass (causal), not a zero-phase filter.
    return lfilter(b, a, audio)
+# Function to apply wavelet denoising
def wavelet_denoise(audio, wavelet='db1', level=1):
    """Soft-threshold the wavelet detail coefficients of a signal.

    Args:
        audio: Samples to denoise (array-like).
        wavelet: Wavelet name passed to PyWavelets.
        level: Selects which coefficient band, counted from the end of the
            decomposition (``coeffs[-level]``), is used to estimate the noise
            scale. It is NOT passed to ``pywt.wavedec``, so the decomposition
            depth is whatever PyWavelets chooses by default.

    Returns:
        The reconstructed signal, trimmed to the length of ``audio`` along
        the last axis. An empty input is returned unchanged.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # Nothing to decompose; also avoids log(0) in the threshold below.
        return audio

    coeffs = pywt.wavedec(audio, wavelet, mode='per')
    # NOTE(review): the usual MAD noise estimate divides by 0.6745; 0.5 is kept
    # here because it may be deliberate tuning - confirm with the author.
    sigma = np.median(np.abs(coeffs[-level])) / 0.5
    uthresh = sigma * np.sqrt(2 * np.log(len(audio)))
    # Keep the approximation band (coeffs[0]) untouched; shrink every detail band.
    coeffs[1:] = [pywt.threshold(band, value=uthresh, mode='soft') for band in coeffs[1:]]
    denoised = pywt.waverec(coeffs, wavelet, mode='per')
    # Periodized reconstruction can come back one sample longer than an
    # odd-length input, so cut it back to the original length.
    return denoised[..., :audio.shape[-1]]
+# Function to apply a Wiener filter for noise reduction
def apply_wiener_filter(audio):
    """Run ``scipy.signal.wiener`` over ``audio`` with its default settings.

    A perfectly constant input (for example digital silence) has zero local
    variance everywhere, which makes the filter divide 0 by 0 and return NaN.
    Any non-finite output sample is therefore replaced by the corresponding
    input sample, so later stages never receive NaN/inf.

    Args:
        audio: Samples to filter (array-like).

    Returns:
        Array of filtered samples with the same shape as ``audio``.
    """
    audio = np.asarray(audio)
    # The division warnings are expected for flat input and handled below.
    with np.errstate(divide='ignore', invalid='ignore'):
        filtered = wiener(audio)
    return np.where(np.isfinite(filtered), filtered, audio)
+# Function to handle speech recognition
def recognize_speech(audio_file):
    """Transcribe an audio file and convert spoken number words to digits.

    Steps: load the file at 16 kHz, apply the high-pass, Wiener and wavelet
    denoising stages, run the ASR pipeline, strip ``<s>`` markers, then pass
    the text through ``convert_to_list`` -> ``process_doubles`` ->
    ``replace_words`` -> ``text_to_int``. Each intermediate string is printed
    for debugging.

    Args:
        audio_file: Path to the audio file. ``None`` or an empty string (what
            the UI sends when nothing was recorded or uploaded) yields ``""``.

    Returns:
        The value returned by ``text_to_int`` for the processed transcript,
        or ``""`` when no audio was supplied.
    """
    if not audio_file:
        # No input from the UI: nothing to load or transcribe.
        return ""

    audio, sr = librosa.load(audio_file, sr=16000)
    audio = high_pass_filter(audio, sr)
    audio = apply_wiener_filter(audio)
    denoised_audio = wavelet_denoise(audio)

    result = asr_model(denoised_audio)
    text_value = result['text']
    cleaned_text = text_value.replace("<s>", "")
    print(cleaned_text)

    converted_to_list = convert_to_list(cleaned_text, text_to_list())
    print(converted_to_list)
    processed_doubles = process_doubles(converted_to_list)
    print(processed_doubles)
    replaced_words = replace_words(processed_doubles)
    print(replaced_words)
    converted_text = text_to_int(replaced_words)
    print(converted_text)
    return converted_text
+# Gradio Interface
# Build and start the web UI. The labels name Tamil because asr_model is
# loaded from the "cdactvm/w2v-bert-tamil_new" checkpoint.
gr.Interface(
    fn=recognize_speech,
    inputs=gr.Audio(sources=["microphone","upload"], type="filepath"),
    outputs="text",
    title="Speech Recognition with Advanced Noise Reduction & Tamil ASR",
    description="Upload an audio file, and the system will use high-pass filtering, Wiener filtering, and wavelet-based denoising, then a Tamil ASR model will transcribe the clean audio."
).launch()
+# In[ ]:

Text2List.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[4]:
def text_to_list():
    """Return the number-word vocabulary used to segment a transcript.

    The list holds Tamil number words and Tamil-script spellings of English
    number words. Some entries contain a space (for example the 21-99
    combinations). A new list object is built on every call.

    Returns:
        list[str]: The vocabulary, in the order written below.
    """
    text_list = [
        # Tamil script for English numbers (11-19)
        'எலெவன்', 'ட்வெல்வ்', 'திர்டீன்', 'போர்டீன்', 'ஃபிஃப்டீன்', 'சிக்ஸ்டீன்', 'சிவன்டீன்', 'எட்டீன்', 'நைன்டீன்',
        # Tamil numbers (11-19)
        'பதினொன்று', 'பனிரண்டு', 'பதிமூன்று', 'பதிநான்கு', 'பதினைந்து', 'பதினாறு', 'பதினேழு', 'பதினெட்டு', 'பத்தொன்பது',
        # Tamil script for English multiples of ten (20, 30, ..., 90)
        'ட்வெண்டி', 'திர்டி', 'போர்டி', 'ஃபிப்டி', 'சிக்ஸ்டி', 'சிவெண்டி', 'எய்ட்டி', 'நைன்டி',
        # Tamil multiples of ten (20, 30, ..., 90)
        'இருபது', 'முப்பது', 'நாற்பது', 'ஐம்பது', 'அறுபது', 'எழுபது', 'எண்பது', 'தொண்ணூறு',
        # Tamil script for English combinations of 21-29
        'ட்வெண்டி ஒன்', 'ட்வெண்டி டூ', 'ட்வெண்டி த்ரீ', 'ட்வெண்டி ஃபோர்', 'ட்வெண்டி ஃபைவு', 'ட்வெண்டி சிக்ஸ்', 'ட்வெண்டி செவன்', 'ட்வெண்டி எட்டு', 'ட்வெண்டி நைன்',
        # Tamil combinations of 21-29
        'இருபத்து ஒன்று', 'இருபத்து இரண்டு', 'இருபத்து மூன்று', 'இருபத்து நான்கு', 'இருபத்து ஐந்து', 'இருபத்து ஆறு', 'இருபத்து ஏழு', 'இருபத்து எட்டு', 'இருபத்து ஒன்பது',
        # Tamil script for English combinations of 31-39
        'திர்டி ஒன்', 'திர்டி டூ', 'திர்டி த்ரீ', 'திர்டி ஃபோர்', 'திர்டி ஃபைவு', 'திர்டி சிக்ஸ்', 'திர்டி செவன்', 'திர்டி எட்டு', 'திர்டி நைன்',
        # Tamil combinations of 31-39
        'முப்பத்து ஒன்று', 'முப்பத்து இரண்டு', 'முப்பத்து மூன்று', 'முப்பத்து நான்கு', 'முப்பத்து ஐந்து', 'முப்பத்து ஆறு', 'முப்பத்து ஏழு', 'முப்பத்து எட்டு', 'முப்பத்து ஒன்பது',
        # Tamil script for English combinations of 41-49
        'போர்டி ஒன்', 'போர்டி டூ', 'போர்டி த்ரீ', 'போர்டி ஃபோர்', 'போர்டி ஃபைவு', 'போர்டி சிக்ஸ்', 'போர்டி செவன்', 'போர்டி எட்டு', 'போர்டி நைன்',
        # Tamil combinations of 41-49
        'நாற்பத்து ஒன்று', 'நாற்பத்து இரண்டு', 'நாற்பத்து மூன்று', 'நாற்பத்து நான்கு', 'நாற்பத்து ஐந்து', 'நாற்பத்து ஆறு', 'நாற்பத்து ஏழு', 'நாற்பத்து எட்டு', 'நாற்பத்து ஒன்பது',
        # Tamil script for English combinations of 51-59
        'ஃபிப்டி ஒன்', 'ஃபிப்டி டூ', 'ஃபிப்டி த்ரீ', 'ஃபிப்டி ஃபோர்', 'ஃபிப்டி ஃபைவு', 'ஃபிப்டி சிக்ஸ்', 'ஃபிப்டி செவன்', 'ஃபிப்டி எட்டு', 'ஃபிப்டி நைன்',
        # Tamil combinations of 51-59
        'ஐம்பத்து ஒன்று', 'ஐம்பத்து இரண்டு', 'ஐம்பத்து மூன்று', 'ஐம்பத்து நான்கு', 'ஐம்பத்து ஐந்து', 'ஐம்பத்து ஆறு', 'ஐம்பத்து ஏழு', 'ஐம்பத்து எட்டு', 'ஐம்பத்து ஒன்பது',
        # Tamil script for English combinations of 61-69
        # (the seventh entry was encoding-damaged; restored to match its neighbours)
        'சிக்ஸ்டி ஒன்', 'சிக்ஸ்டி டூ', 'சிக்ஸ்டி த்ரீ', 'சிக்ஸ்டி ஃபோர்', 'சிக்ஸ்டி ஃபைவு', 'சிக்ஸ்டி சிக்ஸ்', 'சிக்ஸ்டி செவன்', 'சிக்ஸ்டி எட்டு', 'சிக்ஸ்டி நைன்',
        # Tamil combinations of 61-69
        'அறுபத்து ஒன்று', 'அறுபத்து இரண்டு', 'அறுபத்து மூன்று', 'அறுபத்து நான்கு', 'அறுபத்து ஐந்து', 'அறுபத்து ஆறு', 'அறுபத்து ஏழு', 'அறுபத்து எட்டு', 'அறுபத்து ஒன்பது',
        # Tamil script for English combinations of 71-79
        'சிவெண்டி ஒன்', 'சிவெண்டி டூ', 'சிவெண்டி த்ரீ', 'சிவெண்டி ஃபோர்', 'சிவெண்டி ஃபைவு', 'சிவெண்டி சிக்ஸ்', 'சிவெண்டி செவன்', 'சிவெண்டி எட்டு', 'சிவெண்டி நைன்',
        # Tamil combinations of 71-79
        'எழுபத்து ஒன்று', 'எழுபத்து இரண்டு', 'எழுபத்து மூன்று', 'எழுபத்து நான்கு', 'எழுபத்து ஐந்து', 'எழுபத்து ஆறு', 'எழுபத்து ஏழு', 'எழுபத்து எட்டு', 'எழுபத்து ஒன்பது',
        # Tamil script for English combinations of 81-89
        'எய்ட்டி ஒன்', 'எய்ட்டி டூ', 'எய்ட்டி த்ரீ', 'எய்ட்டி ஃபோர்', 'எய்ட்டி ஃபைவு', 'எய்ட்டி சிக்ஸ்', 'எய்ட்டி செவன்', 'எய்ட்டி எட்டு', 'எய்ட்டி நைன்',
        # Tamil combinations of 81-89
        'எண்பத்து ஒன்று', 'எண்பத்து இரண்டு', 'எண்பத்து மூன்று', 'எண்பத்து நான்கு', 'எண்பத்து ஐந்து', 'எண்பத்து ஆறு', 'எண்பத்து ஏழு', 'எண்பத்து எட்டு', 'எண்பத்து ஒன்பது',
        # Tamil script for English combinations of 91-99
        'நைன்டி ஒன்', 'நைன்டி டூ', 'நைன்டி த்ரீ', 'நைன்டி ஃபோர்', 'நைன்டி ஃபைவு', 'நைன்டி சிக்ஸ்', 'நைன்டி செவன்', 'நைன்டி எட்டு', 'நைன்டி நைன்',
        # Tamil combinations of 91-99
        'தொண்ணூற்று ஒன்று', 'தொண்ணூற்று இரண்டு', 'தொண்ணூற்று மூன்று', 'தொண்ணூற்று நான்கு', 'தொண்ணூற்று ஐந்து', 'தொண்ணூற்று ஆறு', 'தொண்ணூற்று ஏழு', 'தொண்ணூற்று எட்டு', 'தொண்ணூற்று ஒன்பது',
        # Tamil script for English numbers (0-10)
        'ஜீரோ', 'ஒன்', 'டூ', 'த்ரீ', 'போர்', 'ஃபைவ்', 'சிக்ஸ்', 'சிவன்', 'ஏட்', 'நைன்', 'டென்',
        # Tamil numbers (0-10)
        'பூஜ்ஜியம்', 'ஒன்று', 'இரண்டு', 'மூன்று', 'நான்கு', 'ஐந்து', 'ஆறு', 'ஏழு', 'எட்டு', 'ஒன்பது', 'பத்து',
        # Tamil script for 100
        'ஹண்ட்ரெட்',
        # Tamil for 100
        'நூறு',
        # Tamil for 1000
        'ஆயிரம்'
    ]
    return text_list
+# In[ ]:

applyVad.py ADDED Viewed

	@@ -0,0 +1,212 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# import webrtcvad
+# import numpy as np
+# import librosa
+# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
+#     '''
+#      Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech.
+#      This is useful in noisy environments where you want to filter out non-speech parts of the audio.
+#      webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.
+#      It helps detect speech in small chunks of audio.
+#      '''
+#     vad = webrtcvad.Vad()
+#     audio_int16 = np.int16(audio * 32767)
+#     frame_size = int(sr * frame_duration / 1000)
+#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
+#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
+#     voiced_audio = np.float32(voiced_audio) / 32767
+#     return voiced_audio
+# In[1]:
+# import webrtcvad
+# import numpy as np
+# import librosa
+# def apply_vad(audio, sr):
+#     # Ensure that sample rate is supported by webrtcvad
+#     if sr not in [8000, 16000, 32000, 48000]:
+#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")
+#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
+#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only
+#     # Convert to PCM 16-bit and calculate frame length
+#     audio_pcm16 = (audio * 32767).astype(np.int16)
+#     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM
+#     # Create frames ensuring correct frame size
+#     frames = [
+#         audio_pcm16[i:i + frame_length].tobytes()
+#         for i in range(0, len(audio_pcm16) - frame_length, frame_length)
+#     ]
+#     # Apply VAD
+#     voiced_frames = []
+#     for frame in frames:
+#         try:
+#             if vad.is_speech(frame, sample_rate=sr):
+#                 voiced_frames.append(frame)
+#         except Exception as e:
+#             print(f"Error during VAD frame processing: {e}")
+#     if not voiced_frames:
+#         raise Exception("No voiced frames detected.")
+#     # Concatenate voiced frames
+#     voiced_audio = b''.join(voiced_frames)
+#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0
+# In[ ]:
+# import webrtcvad
+# import numpy as np
+# import librosa
+# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
+#     '''
+#     Voice Activity Detection (VAD): Detects speech in audio.
+#     '''
+#     vad = webrtcvad.Vad(aggressiveness)
+#     # Resample to 16000 Hz if not already (recommended for better compatibility)
+#     if sr != 16000:
+#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+#         sr = 16000
+#     # Convert to 16-bit PCM format expected by webrtcvad
+#     audio_int16 = np.int16(audio * 32767)
+#     # Ensure frame size matches WebRTC's expected lengths
+#     frame_size = int(sr * frame_duration / 1000)
+#     if frame_size % 2 != 0:
+#         frame_size -= 1  # Make sure it's even to avoid processing issues
+#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
+#     # Filter out non-speech frames
+#     voiced_frames = []
+#     for frame in frames:
+#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
+#             voiced_frames.append(frame)
+#     # Concatenate the voiced frames
+#     voiced_audio = np.concatenate(voiced_frames)
+#     voiced_audio = np.float32(voiced_audio) / 32767
+#     return voiced_audio
+# In[3]:
+# import webrtcvad
+# import numpy as np
+# import librosa
+# def frame_generator(frame_duration_ms, audio, sample_rate):
+#     """
+#     Generates audio frames from PCM audio data.
+#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
+#     """
+#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
+#     offset = 0
+#     while offset + n < len(audio):
+#         yield audio[offset:offset + n]
+#         offset += n
+# def apply_vad(audio, sample_rate):
+#     vad = webrtcvad.Vad()
+#     vad.set_mode(1)
+#     print("Applying VAD with mode:", 1)
+#     print("Audio length:", len(audio), "bytes")
+#     print("Sample rate:", sample_rate)
+#     # Ensure mono and correct sample rate
+#     if sample_rate != 16000:
+#         print("Sample rate issue detected.")
+#         raise ValueError("Sample rate must be 16000 Hz")
+#     frames = frame_generator(30, audio, sample_rate)
+#     frames = list(frames)
+#     print("Number of frames:", len(frames))
+#     try:
+#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]
+#         if not segments:
+#             raise Exception("No voiced frames detected.")
+#         return b''.join(segments)
+#     except Exception as e:
+#         print(f"Error during VAD frame processing: {e}")
+#         raise
+# In[5]:
+import torch
+import torchaudio
+from silero_vad import get_speech_timestamps, read_audio, save_audio
_SILERO_VAD_MODEL = None  # filled in lazily by _get_silero_model() and then reused


def _get_silero_model():
    """Load the Silero VAD model on first use and reuse it afterwards."""
    global _SILERO_VAD_MODEL
    if _SILERO_VAD_MODEL is None:
        # force_reload=False lets torch.hub use its local cache instead of
        # downloading the repository again on every call.
        loaded = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=False)
        # The hub entry point returns ``(model, utils)``; only the model is needed.
        _SILERO_VAD_MODEL = loaded[0] if isinstance(loaded, (tuple, list)) else loaded
    return _SILERO_VAD_MODEL


def _load_mono_audio(path, sampling_rate=16000):
    """Read ``path`` with torchaudio as a 1-D tensor at ``sampling_rate`` Hz."""
    wav, sr = torchaudio.load(path)
    if wav.dim() > 1:
        # Average the channel axis; for a single channel this just drops it.
        wav = wav.mean(dim=0)
    if sr != sampling_rate:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
    return wav


def apply_silero_vad(audio_file_path, output_path='processed_voiced_audio.wav'):
    """Keep only the speech segments of an audio file, using Silero VAD.

    Args:
        audio_file_path: Path of the audio file to process.
        output_path: Where the voiced audio is also written as a 16 kHz file.
            Defaults to the previously hard-coded name; pass ``None`` to skip
            writing.

    Returns:
        bytes: The concatenated voiced samples, serialised with
        ``tensor.numpy().tobytes()``.

    Raises:
        ValueError: If no speech segment is detected.
    """
    sampling_rate = 16000
    model = _get_silero_model()
    wav = _load_mono_audio(audio_file_path, sampling_rate=sampling_rate)

    # Each timestamp is a dict with 'start' and 'end' sample indices.
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
    if not speech_timestamps:
        raise ValueError("No voiced frames detected using Silero VAD.")

    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

    if output_path is not None:
        torchaudio.save(output_path, voiced_audio.unsqueeze(0), sampling_rate)

    # Convert to numpy bytes for further processing
    return voiced_audio.numpy().tobytes()
# Example usage. Guarded so that importing this module does not load the model
# or try to read the placeholder path below.
if __name__ == "__main__":
    try:
        processed_audio = apply_silero_vad("path_to_your_audio.wav")
        print("VAD completed successfully!")
    except Exception as e:
        print(f"Error during Silero VAD processing: {e}")
+# In[ ]:

convert2list.py ADDED Viewed

	@@ -0,0 +1,55 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[2]:
+# import nbimporter
+import nbimporter
+from Text2List import text_to_list
def convert_to_list(text, text_list):
    """Split ``text`` into known words and leftover runs, joined by spaces.

    The text is scanned left to right. At each position the longest entry of
    ``text_list`` that matches there is taken as one piece; characters that
    start no entry are collected into a run that is emitted as its own piece.
    Whitespace in ``text`` is not treated specially: unless it is part of a
    matched entry it ends up inside a leftover run.

    Args:
        text: The string to segment.
        text_list: Iterable of vocabulary strings. Empty strings are ignored
            (an empty entry matches everywhere without consuming input, which
            previously made the scan loop forever).

    Returns:
        str: All pieces joined with a single space; ``""`` for empty ``text``.
    """
    # Longest first, so a multi-word entry wins over its own prefix. sorted()
    # is stable, so equally long entries keep their original order.
    candidates = sorted((word for word in text_list if word), key=len, reverse=True)

    pieces = []
    pending = []  # unmatched characters since the last matched word
    pos = 0
    end = len(text)
    while pos < end:
        for word in candidates:
            if text.startswith(word, pos):
                if pending:
                    pieces.append(''.join(pending))
                    pending = []
                pieces.append(word)
                pos += len(word)
                break
        else:
            # No vocabulary entry starts here: keep the character for later.
            pending.append(text[pos])
            pos += 1

    if pending:
        pieces.append(''.join(pending))
    return ' '.join(pieces)
+# text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
+# if __name__=="__main__":
+#     converted=convert_to_list(text, text_to_list())
+#     print(converted)
+# In[ ]:

highPassFilter.py ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[2]:
+# import scipy.signal
+# def high_pass_filter(audio, sr, cutoff=200, order=3):
+#     """
+#     Applies a high-pass filter to an audio signal.
+#     Parameters:
+#     audio (numpy array): The input audio signal.
+#     sr (int): The sample rate of the audio signal.
+#     cutoff (float): The cutoff frequency in Hz. Default is 100 Hz.
+#     order (int): The order of the filter. Default is 5.
+#     Returns:
+#     numpy array: The filtered audio signal.
+#     """
+#     # Design the high-pass filter using a Butterworth filter design
+#     sos = scipy.signal.butter(order, cutoff, btype='highpass', fs=sr, output='sos')
+#     # Apply the filter using sosfilt (second-order sections filter)
+#     filtered_audio = scipy.signal.sosfilt(sos, audio)
+#     return filtered_audio
+# In[ ]:
def high_pass_filter(audio, sr, cutoff=300, order=1):
    """Attenuate content below ``cutoff`` Hz with a Butterworth high-pass filter.

    Args:
        audio: 1-D sequence/array of samples.
        sr: Sample rate of ``audio`` in Hz.
        cutoff: Cutoff frequency in Hz, normalised against the Nyquist
            frequency (``sr / 2``) before the filter is designed.
        order: Butterworth filter order. Defaults to 1, the value that used
            to be hard-coded.

    Returns:
        The filtered signal as returned by ``scipy.signal.lfilter``.
    """
    # Imported here because this module has no active scipy import of its own
    # (the only one is inside commented-out code).
    from scipy.signal import butter, lfilter

    # Design a Butterworth high-pass filter
    nyquist = 0.5 * sr
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='high', analog=False)
    return lfilter(b, a, audio)

ipynb2py.py ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
# Prerequisite, to be run in a shell (it is not a Python statement):
#     pip install nbconvert nbformat
+# In[4]:
+import nbformat
+from nbconvert import PythonExporter
def convert_ipynb_to_py(ipynb_file, py_file):
    """Export a Jupyter notebook as a Python source file.

    Args:
        ipynb_file: Path of the notebook to read (parsed as nbformat v4).
        py_file: Path of the ``.py`` file to write. Missing parent
            directories are created first.

    Prints a confirmation message when the file has been written.
    """
    import os

    # Load the notebook file
    with open(ipynb_file, 'r', encoding='utf-8') as f:
        notebook_content = nbformat.read(f, as_version=4)

    # from_notebook_node returns (source, resources); only the source is used.
    python_code, _ = PythonExporter().from_notebook_node(notebook_content)

    # Make sure the destination directory exists before opening the file.
    out_dir = os.path.dirname(py_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save the generated Python code to a .py file
    with open(py_file, 'w', encoding='utf-8') as f:
        f.write(python_code)
    print(f"Conversion complete! {ipynb_file} has been converted to {py_file}.")
# Example usage (runs only when this file is executed as a script, so that
# importing the module does not trigger a conversion):
if __name__ == "__main__":
    convert_ipynb_to_py('highPassFilter.ipynb', 'huggingface/highPassFilter.py')
+# In[ ]:

isNumber.py ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+# Function to check if the string is a number
def is_number(x):
    """Return True if ``x`` can be converted with ``float()``.

    Commas are removed from strings first, so ``"1,234"`` counts as a number.
    Anything ``float()`` accepts is a number here, including ``"nan"`` and
    ``"inf"``.

    Args:
        x: Value to test (string, number, or any other object).

    Returns:
        bool: True when the conversion succeeds, False otherwise.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None, list); ValueError: not a
        # numeric string; OverflowError: integer too large for a float.
        return False
    return True
+# In[ ]:

numberMapping.py ADDED Viewed

	@@ -0,0 +1,135 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
# Maps a canonical English number word (plus one Devanagari "double" marker at
# the end) to the list of Devanagari spellings that should be normalised to it.
replacement_map = {
    'zero': ['शून्य', 'जेरो', 'शुन्ना', 'जीरो'],
    'one': ['वन', 'एंक', 'इक', 'एक'],
    'two': ['टू', 'दौ', 'दो'],
    'three': ['थ्री', 'तीना', 'तीन', 'त्री'],
    'four': ['फोर', 'फॉर', 'च्यार', 'चार'],
    'five': ['फाइव', 'पाँच', 'पांच'],
    'six': ['सिक्स', 'चह', 'छौ', 'छै', 'छह'],
    'seven': ['सेवन', 'सात'],
    'eight': ['एट', 'अट', 'आठ'],
    'nine': ['नाइन', 'नौ'],
    'ten': ['टेन', 'दस'],
    # Numbers from 11 to 19
    'eleven': ['इलेवन', 'ग्यारह'],
    'twelve': ['ट्वेल्व', 'बारह'],
    'thirteen': ['थर्टीन', 'तेरह'],
    'fourteen': ['फोर्टीन', 'चौदह'],
    'fifteen': ['फिफ्टीन', 'पंद्रह'],
    'sixteen': ['सिक्स्टीन', 'सोलह'],
    'seventeen': ['सेवंटीन', 'सत्रह'],
    'eighteen': ['एटीन', 'अठारह'],
    'nineteen': ['नाइनटीन', 'उन्नीस'],
    # Multiples of ten
    'twenty': ['ट्वेंटी', 'बीस'],
    'thirty': ['थर्टी', 'तीस'],
    'forty': ['फोर्टी', 'चालीस'],
    'fifty': ['फिफ्टी', 'पचास'],
    'sixty': ['सिक्स्टी', 'साठ'],
    'seventy': ['सेवंटी', 'सत्तर'],
    'eighty': ['एटी', 'अस्सी'],
    'ninety': ['नाइंटी', 'नब्बे'],
    # Numbers from 21 to 29
    'twenty one': ['ट्वेंटी वन', 'इक्कीस'],
    'twenty two': ['ट्वेंटी टू', 'बाईस'],
    'twenty three': ['ट्वेंटी थ्री', 'तेईस'],
    'twenty four': ['ट्वेंटी फोर', 'चौबीस'],
    'twenty five': ['ट्वेंटी फाइव', 'पच्चीस'],
    'twenty six': ['ट्वेंटी सिक्स', 'छब्बीस'],
    'twenty seven': ['ट्वेंटी सेवन', 'सत्ताईस'],
    'twenty eight': ['ट्वेंटी एट', 'अट्ठाईस'],
    'twenty nine': ['ट्वेंटी नाइन', 'उनतीस'],
    # Numbers from 31 to 39
    'thirty one': ['थर्टी वन', 'इकतीस'],
    'thirty two': ['थर्टी टू', 'बत्तीस'],
    'thirty three': ['थर्टी थ्री', 'तेतीस'],
    'thirty four': ['थर्टी फोर', 'चौंतीस'],
    'thirty five': ['थर्टी फाइव', 'पैंतीस'],
    'thirty six': ['थर्टी सिक्स', 'छत्तीस'],
    'thirty seven': ['थर्टी सेवन', 'सैंतीस'],
    'thirty eight': ['थर्टी एट', 'अड़तीस'],
    'thirty nine': ['थर्टी नाइन', 'उनतालीस'],
    # Numbers from 41 to 49
    'forty one': ['फोर्टी वन', 'इकतालीस'],
    'forty two': ['फोर्टी टू', 'बयालीस'],
    'forty three': ['फोर्टी थ्री', 'तैंतालीस'],
    'forty four': ['फोर्टी फोर', 'चौंतालीस'],
    'forty five': ['फोर्टी फाइव', 'पैंतालीस'],
    'forty six': ['फोर्टी सिक्स', 'छयालिस'],
    'forty seven': ['फोर्टी सेवन', 'सैंतालीस'],
    'forty eight': ['फोर्टी एट', 'अड़तालीस'],
    'forty nine': ['फोर्टी नाइन', 'उनचास'],
    # Numbers from 51 to 59
    'fifty one': ['फिफ्टी वन', 'इक्यावन'],
    'fifty two': ['फिफ्टी टू', 'बावन'],
    'fifty three': ['फिफ्टी थ्री', 'तिरेपन'],
    'fifty four': ['फिफ्टी फोर', 'चौवन'],
    'fifty five': ['फिफ्टी फाइव', 'पचपन'],
    'fifty six': ['फिफ्टी सिक्स', 'छप्पन'],
    'fifty seven': ['फिफ्टी सेवन', 'सत्तावन'],
    'fifty eight': ['फिफ्टी एट', 'अट्ठावन'],
    'fifty nine': ['फिफ्टी नाइन', 'उनसठ'],
    # Numbers from 61 to 69
    'sixty one': ['सिक्स्टी वन', 'इकसठ'],
    'sixty two': ['सिक्स्टी टू', 'बासठ'],
    'sixty three': ['सिक्स्टी थ्री', 'तिरसठ'],
    'sixty four': ['सिक्स्टी फोर', 'चौंसठ'],
    # The first spelling below was encoding-damaged; restored to match its neighbours.
    'sixty five': ['सिक्स्टी फाइव', 'पैंसठ'],
    'sixty six': ['सिक्स्टी सिक्स', 'छियासठ'],
    'sixty seven': ['सिक्स्टी सेवन', 'सड़सठ'],
    'sixty eight': ['सिक्स्टी एट', 'अड़सठ'],
    'sixty nine': ['सिक्स्टी नाइन', 'उनहत्तर'],
    # Numbers from 71 to 79
    'seventy one': ['सेवंटी वन', 'इकहत्तर'],
    'seventy two': ['सेवंटी टू', 'बहत्तर'],
    'seventy three': ['सेवंटी थ्री', 'तिहत्तर'],
    'seventy four': ['सेवंटी फोर', 'चौहत्तर'],
    'seventy five': ['सेवंटी फाइव', 'पचहत्तर'],
    'seventy six': ['सेवंटी सिक्स', 'छिहत्तर'],
    'seventy seven': ['सेवंटी सेवन', 'सतहत्तर'],
    'seventy eight': ['सेवंटी एट', 'अठहत्तर'],
    'seventy nine': ['सेवंटी नाइन', 'उन्यासी'],
    # Numbers from 81 to 89
    'eighty one': ['एटी वन', 'इक्यासी'],
    'eighty two': ['एटी टू', 'बयासी'],
    'eighty three': ['एटी थ्री', 'तिरासी'],
    'eighty four': ['एटी फोर', 'चौरासी'],
    'eighty five': ['एटी फाइव', 'पचासी'],
    'eighty six': ['एटी सिक्स', 'छियासी'],
    'eighty seven': ['एटी सेवन', 'सतासी'],
    'eighty eight': ['एटी एट', 'अठासी'],
    'eighty nine': ['एटी नाइन', 'नवासी'],
    # Numbers from 91 to 99
    'ninety one': ['नाइंटी वन', 'इक्यानवे'],
    'ninety two': ['नाइंटी टू', 'बानवे'],
    'ninety three': ['नाइंटी थ्री', 'तिरानवे'],
    'ninety four': ['नाइंटी फोर', 'चौरानवे'],
    'ninety five': ['नाइंटी फाइव', 'पचानवे'],
    'ninety six': ['नाइंटी सिक्स', 'छियानवे'],
    'ninety seven': ['नाइंटी सेवन', 'सतानवे'],
    'ninety eight': ['नाइंटी एट', 'अठानवे'],
    'ninety nine': ['नाइंटी नाइन', 'निन्यानवे'],
    # Hundred
    'hundred': ['हंड्रेड', 'सौ'],
    # Special for double digits
    'डबल': ['दबल', 'डबल', 'दुबाल'],
}

processDoubles.py ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[2]:
+# # Function to process "double" followed by a number
+# def process_doubles(sentence):
+#     tokens = sentence.split()
+#     result = []
+#     i = 0
+#     while i < len(tokens):
+#         if tokens[i] == "डबल":
+#             if i + 1 < len(tokens):
+#                 result.append(tokens[i + 1])
+#                 result.append(tokens[i + 1])
+#                 i += 2
+#             else:
+#                 result.append(tokens[i])
+#                 i += 1
+#         else:
+#             result.append(tokens[i])
+#             i += 1
+#     return ' '.join(result)
+# In[ ]:
+import re
def process_doubles(sentence, double_word="डबल"):
    """Expand "<double_word> X" into "X X".

    A marker glued to the following text (e.g. "डबलवन") is first separated
    from it, then every stand-alone marker token is dropped and the token
    after it is emitted twice. A marker that is the last token is kept as is.

    Args:
        sentence: Whitespace-separated text.
        double_word: The marker word. Defaults to the Devanagari spelling that
            used to be hard-coded; pass another spelling for other scripts.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    if not double_word:
        # An empty marker can never match a token; just normalise the spacing.
        return ' '.join(sentence.split())

    # Split the marker from text attached to it without a space
    # (e.g. "डबलवन" -> "डबल वन").
    glued = re.compile(rf'({re.escape(double_word)})(\S+)')
    tokens = glued.sub(r'\1 \2', sentence).split()

    result = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == double_word and i + 1 < len(tokens):
            following = tokens[i + 1]
            result.extend((following, following))  # duplicate the next token
            i += 2  # the next token has been consumed as well
        else:
            result.append(token)
            i += 1
    return ' '.join(result)

replaceWords.py ADDED Viewed

	@@ -0,0 +1,153 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[9]:
+import re
# Canonical English number word -> Tamil-script spellings normalised to it.
# Spellings that contain a space can never match in replace_words(), because
# the sentence is split on whitespace first; their individual words are
# covered by the single-word entries.
# NOTE(review): several spellings here appear to differ from the vocabulary in
# text_to_list() (e.g. 'ட்வென்டி' here vs 'ட்வெண்டி' there, 'டென' vs 'டென்'),
# and there is no 'zero' entry - confirm whether those words should map too.
_REPLACEMENT_MAP = {
    # Single digits
    'one': ['ஒன்று', 'ஒன்னு', 'ஒன்'],
    'two': ['இரண்டு', 'ரெண்டு', 'டூ'],
    'three': ['மூன்று', 'முன்னு', 'த்ரீ'],
    'four': ['நான்கு', 'நாலு', 'ஃபோர்'],
    'five': ['ஐந்து', 'அஞ்சு', 'ஃபைவ்'],
    'six': ['ஆறு', 'ஆறு', 'சிக்ஸ்'],
    'seven': ['ஏழு', 'எழு', 'செவன்'],
    'eight': ['எட்டு', 'எட்டு', 'எய்ட்'],
    'nine': ['ஒன்பது', 'ஒம்பது', 'நைன்'],
    'ten': ['பத்து', 'பத்து', 'டென'],
    # Numbers from 11 to 19
    'eleven': ['பதினொன்று', 'பதினொன்னு', 'எலெவன்'],
    'twelve': ['பன்னிரண்டு', 'பன்னிரண்டு', 'ட்வெல்வ்'],
    'thirteen': ['பதின்மூன்று', 'பதின்முன்னு', 'தர்டீன்'],
    'fourteen': ['பதினான்கு', 'பதின்நாலு', 'ஃபோர்டீன்'],
    'fifteen': ['பதினைந்து', 'பதினஞ்சு', 'ஃபிப்டீன்'],
    'sixteen': ['பதினாறு', 'பதினாறோ', 'சிக்ஸ்டீன்'],
    'seventeen': ['பதினேழு', 'பதினேழு', 'செவன்டீன்'],
    'eighteen': ['பதினெட்டு', 'பதினெட்டு', 'ஏட்டீன்'],
    'nineteen': ['பத்தொன்பது', 'பத்தொம்பது', 'நைன்டீன்'],
    # Multiples of ten
    'twenty': ['இருபது', 'இருபத்து', 'ட்வென்டி'],
    'thirty': ['முப்பது', 'முப்பத்து', 'தர்டி'],
    'forty': ['நாற்பது', 'நாற்பத்து', 'ஃபோர்டி'],
    'fifty': ['ஐம்பது', 'ஐம்பத்து', 'ஃபிப்டி'],
    'sixty': ['அறுபது', 'அறுபத்து', 'சிக்ஸ்டி'],
    'seventy': ['எழுபது', 'எழுபத்து', 'செவன்டி'],
    'eighty': ['எண்பது', 'எண்பத்து', 'ஏட்டி'],
    'ninety': ['தொண்ணூறு', 'தொன்னூறு', 'நைன்டி'],
    # Numbers from 21 to 29
    'twenty one': ['இருபத்து ஒன்று', 'இருபத்தொன்று', 'ட்வென்டி ஒன்'],
    'twenty two': ['இருபத்து இரண்டு', 'இருபத்திரண்டு', 'ட்வென்டி டூ'],
    'twenty three': ['இருபத்து மூன்று', 'இருபத்துமூன்று', 'ட்வென்டி த்ரீ'],
    'twenty four': ['இருபத்து நான்கு', 'இருபத்துநான்கு', 'ட்வென்டி ஃபோர்'],
    'twenty five': ['இருபத்து ஐந்து', 'இருபத்துஐந்து', 'ட்வென்டி ஃபைவ்'],
    'twenty six': ['இருபத்து ஆறு', 'இருபத்துஆறு', 'ட்வென்டி சிக்ஸ்'],
    'twenty seven': ['இருபத்து ஏழு', 'இருபத்துஏழு', 'ட்வென்டி செவன்'],
    'twenty eight': ['இருபத்து எட்டு', 'இருபத்துஎட்டு', 'ட்வென்டி ஏட்'],
    'twenty nine': ['இருபத்து ஒன்பது', 'இருபத்தொன்பது', 'ட்வென்டி நைன்'],
    # Numbers from 31 to 39
    'thirty one': ['முப்பத்து ஒன்று', 'முப்பத்தொன்று', 'தர்டி ஒன்'],
    'thirty two': ['முப்பத்து இரண்டு', 'முப்பத்திரண்டு', 'தர்டி டூ'],
    'thirty three': ['முப்பத்து மூன்று', 'முப்பத்துமூன்று', 'தர்டி த்ரீ'],
    'thirty four': ['முப்பத்து நான்கு', 'முப்பத்துநான்கு', 'தர்டி ஃபோர்'],
    'thirty five': ['முப்பத்து ஐந்து', 'முப்பத்துஐந்து', 'தர்டி ஃபைவ்'],
    'thirty six': ['முப்பத்து ஆறு', 'முப்பத்துஆறு', 'தர்டி சிக்ஸ்'],
    'thirty seven': ['முப்பத்து ஏழு', 'முப்பத்துஏழு', 'தர்டி செவன்'],
    'thirty eight': ['முப்பத்து எட்டு', 'முப்பத்துஎட்டு', 'தர்டி ஏட்'],
    'thirty nine': ['முப்பத்து ஒன்பது', 'முப்பத்தொன்பது', 'தர்டி நைன்'],
    # Numbers from 41 to 49
    'forty one': ['நாற்பத்து ஒன்று', 'நாற்பத்தொன்று', 'ஃபோர்டி ஒன்'],
    'forty two': ['நாற்பத்து இரண்டு', 'நாற்பத்திரண்டு', 'ஃபோர்டி டூ'],
    'forty three': ['நாற்பத்து மூன்று', 'நாற்பத்துமூன்று', 'ஃபோர்டி த்ரீ'],
    'forty four': ['நாற்பத்து நான்கு', 'நாற்பத்துநான்கு', 'ஃபோர்டி ஃபோர்'],
    'forty five': ['நாற்பத்து ஐந்து', 'நாற்பத்துஐந்து', 'ஃபோர்டி ஃபைவ்'],
    'forty six': ['நாற்பத்து ஆறு', 'நாற்பத்துஆறு', 'ஃபோர்டி சிக்ஸ்'],
    'forty seven': ['நாற்பத்து ஏழு', 'நாற்பத்துஏழு', 'ஃபோர்டி செவன்'],
    'forty eight': ['நாற்பத்து எட்டு', 'நாற்பத்துஎட்டு', 'ஃபோர்டி ஏட்'],
    'forty nine': ['நாற்பத்து ஒன்பது', 'நாற்பத்தொன்பது', 'ஃபோர்டி நைன்'],
    # Numbers from 51 to 59
    'fifty one': ['ஐம்பத்து ஒன்று', 'ஐம்பத்தொன்று', 'ஃபிப்டி ஒன்'],
    'fifty two': ['ஐம்பத்து இரண்டு', 'ஐம்பத்திரண்டு', 'ஃபிப்டி டூ'],
    'fifty three': ['ஐம்பத்து மூன்று', 'ஐம்பத்துமூன்று', 'ஃபிப்டி த்ரீ'],
    'fifty four': ['ஐம்பத்து நான்கு', 'ஐம்பத்துநான்கு', 'ஃபிப்டி ஃபோர்'],
    'fifty five': ['ஐம்பத்து ஐந்து', 'ஐம்பத்துஐந்து', 'ஃபிப்டி ஃபைவ்'],
    'fifty six': ['ஐம்பத்து ஆறு', 'ஐம்பத்துஆறு', 'ஃபிப்டி சிக்ஸ்'],
    'fifty seven': ['ஐம்பத்து ஏழு', 'ஐம்பத்துஏழு', 'ஃபிப்டி செவன்'],
    'fifty eight': ['ஐம்பத்து எட்டு', 'ஐம்பத்துஎட்டு', 'ஃபிப்டி ஏட்'],
    'fifty nine': ['ஐம்பத்து ஒன்பது', 'ஐம்பத்தொன்பது', 'ஃபிப்டி நைன்'],
    # Numbers from 61 to 69
    'sixty one': ['அறுபத்து ஒன்று', 'அறுபத்தொன்று', 'சிக்ஸ்டி ஒன்'],
    'sixty two': ['அறுபத்து இரண்டு', 'அறுபத்திரண்டு', 'சிக்ஸ்டி டூ'],
    'sixty three': ['அறுபத்து மூன்று', 'அறுபத்துமூன்று', 'சிக்ஸ்டி த்ரீ'],
    'sixty four': ['அறுபத்து நான்கு', 'அறுபத்துநான்கு', 'சிக்ஸ்டி ஃபோர்'],
    'sixty five': ['அறுபத்து ஐந்து', 'அறுபத்துஐந்து', 'சிக்ஸ்டி ஃபைவ்'],
    'sixty six': ['அறுபத்து ஆறு', 'அறுபத்துஆறு', 'சிக்ஸ்டி சிக்ஸ்'],
    'sixty seven': ['அறுபத்து ஏழு', 'அறுபத்துஏழு', 'சிக்ஸ்டி செவன்'],
    'sixty eight': ['அறுபத்து எட்டு', 'அறுபத்துஎட்டு', 'சிக்ஸ்டி ஏட்'],
    'sixty nine': ['அறுபத்து ஒன்பது', 'அறுபத்தொன்பது', 'சிக்ஸ்டி நைன்'],
    # Numbers from 71 to 79
    'seventy one': ['எழுபத்து ஒன்று', 'எழுபத்தொன்று', 'செவன்டி ஒன்'],
    'seventy two': ['எழுபத்து இரண்டு', 'எழுபத்திரண்டு', 'செவன்டி டூ'],
    'seventy three': ['எழுபத்து மூன்று', 'எழுபத்துமூன்று', 'செவன்டி த்ரீ'],
    'seventy four': ['எழுபத்து நான்கு', 'எழுபத்துநான்கு', 'செவன்டி ஃபோர்'],
    # The first spelling below was encoding-damaged; restored to match its neighbours.
    'seventy five': ['எழுபத்து ஐந்து', 'எழுபத்துஐந்து', 'செவன்டி ஃபைவ்'],
    'seventy six': ['எழுபத்து ஆறு', 'எழுபத்துஆறு', 'செவன்டி சிக்ஸ்'],
    'seventy seven': ['எழுபத்து ஏழு', 'எழுபத்துஏழு', 'செவன்டி செவன்'],
    'seventy eight': ['எழுபத்து எட்டு', 'எழுபத்துஎட்டு', 'செவன்டி ஏட்'],
    'seventy nine': ['எழுபத்து ஒன்பது', 'எழுபத்தொன்பது', 'செவன்டி நைன்'],
    # Numbers from 81 to 89
    'eighty one': ['எண்பத்து ஒன்று', 'எண்பத்தொன்று', 'ஏட்டி ஒன்'],
    'eighty two': ['எண்பத்து இரண்டு', 'எண்பத்திரண்டு', 'ஏட்டி டூ'],
    'eighty three': ['எண்பத்து மூன்று', 'எண்பத்துமூன்று', 'ஏட்டி த்ரீ'],
    'eighty four': ['எண்பத்து நான்கு', 'எண்பத்துநான்கு', 'ஏட்டி ஃபோர்'],
    'eighty five': ['எண்பத்து ஐந்து', 'எண்பத்துஐந்து', 'ஏட்டி ஃபைவ்'],
    'eighty six': ['எண்பத்து ஆறு', 'எண்பத்துஆறு', 'ஏட்டி சிக்ஸ்'],
    'eighty seven': ['எண்பத்து ஏழு', 'எண்பத்துஏழு', 'ஏட்டி செவன்'],
    'eighty eight': ['எண்பத்து எட்டு', 'எண்பத்துஎட்டு', 'ஏட்டி ஏட்'],
    'eighty nine': ['எண்பத்து ஒன்பது', 'எண்பத்தொன்பது', 'ஏட்டி நைன்'],
    # Numbers from 91 to 99
    'ninety one': ['தொண்ணூற்று ஒன்று', 'தொண்ணூற்றொன்று', 'நைன்டி ஒன்'],
    'ninety two': ['தொண்ணூற்று இரண்டு', 'தொண்ணூற்றிரண்டு', 'நைன்டி டூ'],
    'ninety three': ['தொண்ணூற்று மூன்று', 'தொண்ணூற்றுமூன்று', 'நைன்டி த்ரீ'],
    'ninety four': ['தொண்ணூற்று நான்கு', 'தொண்ணூற்றுநான்கு', 'நைன்டி ஃபோர்'],
    'ninety five': ['தொண்ணூற்று ஐந்து', 'தொண்ணூற்றுஐந்து', 'நைன்டி ஃபைவ்'],
    'ninety six': ['தொண்ணூற்று ஆறு', 'தொண்ணூற்றுஆறு', 'நைன்டி சிக்ஸ்'],
    'ninety seven': ['தொண்ணூற்று ஏழு', 'தொண்ணூற்றுஏழு', 'நைன்டி செவன்'],
    'ninety eight': ['தொண்ணூற்று எட்டு', 'தொண்ணூற்றுஎட்டு', 'நைன்டி ஏட்'],
    'ninety nine': ['தொண்ணூற்று ஒன்பது', 'தொண்ணூற்றொன்பது', 'நைன்டி நைன்'],
    # Hundred
    'hundred': ['நூறு', 'நூறை', 'ஹண்ட்ரெட்'],
    # Thousand
    'thousand': ['ஆயிரம்'],
}

# Reverse index built once at import: spelling -> canonical word. If a spelling
# were listed under several keys, the later key wins, which is the same result
# the former per-word scan over the whole map produced.
_SPELLING_TO_WORD = {
    spelling: canonical
    for canonical, spellings in _REPLACEMENT_MAP.items()
    for spelling in spellings
}


def replace_words(sentence):
    """Replace known Tamil-script number words with English number words.

    The sentence is split on whitespace; each token that exactly equals a
    spelling in the table is replaced by its canonical English word, every
    other token is kept unchanged.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    return ' '.join(_SPELLING_TO_WORD.get(word, word) for word in sentence.split())
+# In[ ]:

text2int.py ADDED Viewed

	@@ -0,0 +1,200 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# # Earlier version of text_to_int, kept commented out for reference: converts English number words in text to digits
+# from isNumber import is_number
+# def text_to_int (textnum, numwords={}):
+#     units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
+#             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
+#             'sixteen', 'seventeen', 'eighteen', 'nineteen']
+#     tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
+#     scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
+#     ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
+#     ordinal_endings = [('ieth', 'y'), ('th', '')]
+#     if not numwords:
+#         numwords['and'] = (1, 0)
+#         for idx, word in enumerate(units): numwords[word] = (1, idx)
+#         for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
+#         for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
+#     textnum = textnum.replace('-', ' ')
+#     current = result = 0
+#     curstring = ''
+#     onnumber = False
+#     lastunit = False
+#     lastscale = False
+#     def is_numword(x):
+#         if is_number(x):
+#             return True
+#         if word in numwords:
+#             return True
+#         return False
+#     def from_numword(x):
+#         if is_number(x):
+#             scale = 0
+#             increment = int(x.replace(',', ''))
+#             return scale, increment
+#         return numwords[x]
+#     for word in textnum.split():
+#         if word in ordinal_words:
+#             scale, increment = (1, ordinal_words[word])
+#             current = current * scale + increment
+#             if scale > 100:
+#                 result += current
+#                 current = 0
+#             onnumber = True
+#             lastunit = False
+#             lastscale = False
+#         else:
+#             for ending, replacement in ordinal_endings:
+#                 if word.endswith(ending):
+#                     word = "%s%s" % (word[:-len(ending)], replacement)
+#             if (not is_numword(word)) or (word == 'and' and not lastscale):
+#                 if onnumber:
+#                     # Flush the current number we are building
+#                     curstring += repr(result + current) + " "
+#                 curstring += word + " "
+#                 result = current = 0
+#                 onnumber = False
+#                 lastunit = False
+#                 lastscale = False
+#             else:
+#                 scale, increment = from_numword(word)
+#                 onnumber = True
+#                 if lastunit and (word not in scales):
+#                     # Assume this is part of a string of individual numbers to
+#                     # be flushed, such as a zipcode "one two three four five"
+#                     curstring += repr(result + current)
+#                     result = current = 0
+#                 if scale > 1:
+#                     current = max(1, current)
+#                 current = current * scale + increment
+#                 if scale > 100:
+#                     result += current
+#                     current = 0
+#                 lastscale = False
+#                 lastunit = False
+#                 if word in scales:
+#                     lastscale = True
+#                 elif word in units:
+#                     lastunit = True
+#     if onnumber:
+#         curstring += repr(result + current)
+#     return curstring
+# In[3]:
+import nbimporter
+from isNumber import is_number  # Remove or replace this if unnecessary
+def text_to_int(textnum, numwords={}):
+    # Define units, tens, and scales including "lac"
+    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
+            'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
+            'sixteen', 'seventeen', 'eighteen', 'nineteen']
+    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
+    scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion']  # "lac" added
+    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
+    ordinal_endings = [('ieth', 'y'), ('th', '')]
+    if not numwords:
+        numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
+        # Add units, tens, and scales to numwords
+        for idx, word in enumerate(units):
+            numwords[word] = (1, idx)
+        for idx, word in enumerate(tens):
+            numwords[word] = (1, idx * 10)
+        for idx, word in enumerate(scales):
+            numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0)  # Handle "lac" as 10^5
+    # Remove hyphens and normalize input
+    textnum = textnum.replace('-', ' ')
+    current = result = 0
+    curstring = ''
+    onnumber = False
+    lastunit = False
+    lastscale = False
+    def is_numword(x):
+        return is_number(x) or x in numwords
+    def from_numword(x):
+        if is_number(x):
+            return 0, int(x.replace(',', ''))
+        return numwords[x]
+    for word in textnum.split():
+        if word in ordinal_words:
+            scale, increment = (1, ordinal_words[word])
+            current = current * scale + increment
+            if scale > 100:
+                result += current
+                current = 0
+            onnumber = True
+            lastunit = False
+            lastscale = False
+        else:
+            for ending, replacement in ordinal_endings:
+                if word.endswith(ending):
+                    word = f"{word[:-len(ending)]}{replacement}"
+            if not is_numword(word) or (word == 'and' and not lastscale):
+                if onnumber:
+                    curstring += repr(result + current) + " "
+                curstring += word + " "
+                result = current = 0
+                onnumber = False
+                lastunit = False
+                lastscale = False
+            else:
+                scale, increment = from_numword(word)
+                onnumber = True
+                if lastunit and word not in scales:
+                    curstring += repr(result + current) + " "
+                    result = current = 0
+                if scale > 1:
+                    current = max(1, current)
+                current = current * scale + increment
+                if scale >= 100:
+                    result += current
+                    current = 0
+                lastscale = word in scales
+                lastunit = word in units
+    if onnumber:
+        curstring += repr(result + current)
+    return curstring.strip()
+# In[ ]:

waveletDenoise.py ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+# Function to apply wavelet denoising
+def wavelet_denoise(audio, wavelet='db1', level=1):
+    coeffs = pywt.wavedec(audio, wavelet, mode='per')
+    # Thresholding detail coefficients
+    sigma = np.median(np.abs(coeffs[-level])) / 0.6745
+    uthresh = sigma * np.sqrt(2 * np.log(len(audio)))
+    coeffs[1:] = [pywt.threshold(i, value=uthresh, mode='soft') for i in coeffs[1:]]
+    return pywt.waverec(coeffs, wavelet, mode='per')
+# In[ ]:

wienerFilter.py ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[2]:
+import scipy.signal
+def wiener_filter(audio):
+    '''
+    The Wiener filter is designed to minimize the impact of noise by applying an adaptive filtering process.
+    It tries to estimate the original, clean signal by taking into account both the noisy signal and the statistical properties of the noise.
+    The Wiener filter is particularly useful when dealing with stationary noise (constant background noise, like white noise).
+    '''
+    return scipy.signal.wiener(audio)
+# In[ ]: