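# Gradio demo: transcribe an uploaded audio file with a wav2vec2 CTC model.
# The file is resampled to 16 kHz mono with ffmpeg, run through
# Wav2Vec2ForCTC, and greedily decoded; an optional LM-decoding path is
# sketched in the commented-out code below.
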
import subprocess

import soundfile as sf
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM


def read_file_and_process(wav_file):
    # Resample the uploaded file to 16 kHz mono, then build the model inputs.
    filename_16k = wav_file.rsplit('.', 1)[0] + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)

    return inputs


def resampler(input_file_path, output_file_path):
    # Convert to 16 kHz, mono, 16-bit WAV with ffmpeg. Passing the arguments
    # as a list (no shell=True) avoids quoting problems with paths that
    # contain spaces; -y overwrites any previous conversion of the same file.
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
        "-i", input_file_path,
        "-ar", "16000", "-ac", "1", "-bits_per_raw_sample", "16", "-vn",
        output_file_path,
    ]
    subprocess.call(command)


# def parse_transcription_with_lm(logits):
#     result = processor_with_LM.batch_decode(logits.cpu().numpy())
#     text = result.text
#     transcription = text[0].replace('<s>','')
#     return transcription

def parse_transcription(logits):
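    # Greedy CTC decoding: pick the most likely token at each frame and let
    # the processor's decode() collapse repeats and blanks.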
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription


def parse(wav_file, applyLM):
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():
        logits = model(**input_values).logits

    if applyLM:
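        # LM-based decoding is not wired up yet; this branch is a placeholder.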
        # return parse_transcription_with_lm(logits)
        return "done"
    else:
        return parse_transcription(logits)

    
# model_id = "infinitejoy/wav2vec2-large-xls-r-300m-odia"
# working 50%  
# model_id = "Harveenchadha/odia_large_wav2vec2"

# It worked when first run but after that getting error
# model_id = "anuragshas/wav2vec2-large-xlsr-53-odia"

# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"

# model_id = "Ranjit/Whisper_Small_Odia_CV_11.0_5k_steps"

# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"

# model_id = "theainerd/wav2vec2-large-xlsr-53-odia"

# This is hindi
model_id = "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"



# processor = Wav2Vec2Processor.from_pretrained(model_id)
# # processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
# model = Wav2Vec2ForCTC.from_pretrained(model_id)

    
# input_ = gr.Audio(source="microphone", type="filepath") 
# txtbox = gr.Textbox(
#             label="Output from model will appear here:",
#             lines=5
#         )
# chkbox = gr.Checkbox(label="Apply LM", value=False)


# gr.Interface(parse, inputs = [input_, chkbox],  outputs=txtbox,
#              streaming=True, interactive=True,
#              analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False);





processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Accept an uploaded audio file as a file path, which is what parse() expects.
input_ = gr.Audio(source="upload", type="filepath")
txtbox = gr.Textbox(
    label="Output from the model will appear here:",
    lines=5
)
chkbox = gr.Checkbox(label="Apply LM", value=False)

gr.Interface(parse, inputs=[input_, chkbox], outputs=txtbox,
             analytics_enabled=False, show_tips=False, enable_queue=True).launch(inline=False)
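
# Quick sanity check without the UI (assumes a hypothetical local file
# "sample.wav" and ffmpeg available on the PATH for the resampling step):
#
#     print(parse("sample.wav", applyLM=False))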