import os
import subprocess

import soundfile as sf
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
def read_file_and_process(wav_file):
    # Resample the uploaded audio to 16 kHz mono, the rate the wav2vec2 models
    # were trained on, then turn it into padded tensors for the model.
    filename, _ = os.path.splitext(wav_file)
    filename_16k = filename + "_16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs
def resampler(input_file_path, output_file_path):
    # Convert to 16 kHz, mono, 16-bit PCM with ffmpeg. Passing the arguments as
    # a list (instead of shell=True) keeps paths with spaces or shell
    # metacharacters safe; -y overwrites a stale output file from a prior run.
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
        "-i", input_file_path,
        "-ar", "16000", "-ac", "1", "-bits_per_raw_sample", "16", "-vn",
        output_file_path,
    ]
    subprocess.run(command, check=True)
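
# A minimal in-process alternative to the ffmpeg call above, assuming librosa is
# installed (a sketch, not used by the app): librosa resamples to 16 kHz mono on
# load and soundfile writes the result back out.
def resampler_librosa(input_file_path, output_file_path):
    import librosa  # local import so the app itself does not require librosa
    speech, _ = librosa.load(input_file_path, sr=16000, mono=True)
    sf.write(output_file_path, speech, 16000)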
# LM-assisted decoding, currently disabled: it needs the processor_with_LM that
# is commented out further down (and a model repo that ships a language model).
# def parse_transcription_with_lm(logits):
#     result = processor_with_LM.batch_decode(logits.cpu().numpy())
#     text = result.text
#     transcription = text[0].replace('<s>', '')
#     return transcription
def parse_transcription(logits):
    # Greedy CTC decoding: take the most likely token at each frame and let the
    # processor collapse repeats and strip blank/special tokens.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
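
# If several clips were ever batched together, the whole batch could be decoded
# in one call (a sketch; this app only ever processes a single file):
# transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)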
def parse(wav_file, applyLM):
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():  # inference only, so skip gradient tracking
        logits = model(**input_values).logits
    if applyLM:
        # LM decoding is disabled until a model with an attached language model
        # is wired up; see parse_transcription_with_lm above.
        # return parse_transcription_with_lm(logits)
        return "done"
    else:
        return parse_transcription(logits)
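
# Quick sanity check outside Gradio (hypothetical file name; assumes a WAV file
# exists at that path):
# print(parse("sample.wav", applyLM=False))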
# model_id = "infinitejoy/wav2vec2-large-xls-r-300m-odia"
# working 50%
# model_id = "Harveenchadha/odia_large_wav2vec2"
# It worked when first run but after that getting error
# model_id = "anuragshas/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "Ranjit/Whisper_Small_Odia_CV_11.0_5k_steps"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"
# This is hindi
model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
# Earlier microphone-based variant of the interface (kept for reference; the
# live code below uses file upload instead):
# processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
# input_ = gr.Audio(source="microphone", type="filepath")
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
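
# A hedged sketch for GPU inference (not in the original app): move the model
# here once, then also move the input tensors inside parse() before the forward
# pass; as written, everything runs on CPU.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)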
input_ = gr.Audio(source="upload", type="filepath")  # parse() expects a file path, so use an Audio component in "filepath" mode
txtbox = gr.Textbox(
    label="Output from the model will appear here:",
    lines=5,
)
chkbox = gr.Checkbox(label="Apply LM", value=False)
# streaming/interactive/show_tips/enable_queue are not gr.Interface arguments
# in recent Gradio releases, so they are dropped here; queuing, if needed, can
# be enabled with interface.queue() before launch().
interface = gr.Interface(parse, inputs=[input_, chkbox], outputs=txtbox,
                         analytics_enabled=False)
interface.launch(inline=False)
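
# Compatibility note: the component arguments above assume Gradio 3.x; on
# Gradio 4.x the audio input becomes gr.Audio(sources=["upload"], type="filepath").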