Ritwika-Das-Gupta committed on
Commit b1bc9d5 · verified · 1 Parent(s): 46663e2

Upload app (1).py

Files changed (1)
  1. app (1).py +391 -0
app (1).py ADDED
@@ -0,0 +1,391 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


#!pip install torchaudio


# In[2]:


from IPython.display import Audio
import IPython.display as ipd
from scipy.io import wavfile
import numpy as np
import warnings
import re
warnings.filterwarnings("ignore")
import soundfile as sf
import librosa
import torch
import os
import noisereduce as nr
import gradio as gr
import pyloudnorm as pyln
import pandas as pd
# import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from transformers import AutoModelForCTC, AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq, AutoModelForAudioClassification


# In[3]:


# Set device and dtype
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Language identification (LID) pipeline based on Meta's MMS LID model
lid_model_id = "facebook/mms-lid-126"
lid_pipeline = pipeline("audio-classification", model=lid_model_id, device=device)

# Map MMS ISO 639-3 labels to the language names used by the ASR models below
language_mapping = {
    "hin": "hindi",
    "ben": "bengali",
    "eng": "english",
    "guj": "gujarati"
}
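
# Illustrative sketch (not from the original file): querying the LID pipeline
# with an in-memory waveform. `dummy_wave` is a hypothetical one-second,
# 16 kHz mono array of silence used purely for illustration.
# dummy_wave = np.zeros(16000, dtype=np.float32)
# preds = lid_pipeline(dummy_wave)            # list of {"label", "score"} dicts
# print(preds[0]["label"], preds[0]["score"])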


# In[4]:


def detect_language_for_audio_file(audio_file_path, lid_pipeline, target_sampling_rate=16000):
    """
    Detects the language of a given audio file and returns a DataFrame.

    Parameters:
    - audio_file_path (str): The path to the audio file.
    - lid_pipeline: The language identification pipeline.
    - target_sampling_rate (int): The target sampling rate for the audio file. Default is 16000.

    Returns:
    - df (pd.DataFrame): A DataFrame containing the detected language and filename.
    """
    detected_languages = []
    audio_filenames = []

    filename = os.path.basename(audio_file_path)
    waveform, original_sampling_rate = librosa.load(audio_file_path, sr=None)

    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform)

    if original_sampling_rate != target_sampling_rate:
        waveform = librosa.resample(waveform, orig_sr=original_sampling_rate, target_sr=target_sampling_rate)

    # Perform language identification
    lid_result = lid_pipeline(waveform, sampling_rate=target_sampling_rate)
    detected_language = lid_result[0]['label'].split('_')[0]
    print(f"Detected language for {filename}: {detected_language}")

    detected_languages.append(detected_language)
    audio_filenames.append(filename)

    df = pd.DataFrame({
        "Detected_Language": detected_languages,
        "Audio_Filename": audio_filenames
    })

    # Map ISO codes to language names and drop any language that is not supported
    df['Detected_Language'] = df['Detected_Language'].map(language_mapping)
    df.dropna(inplace=True, axis=0)

    # Add model names based on the detected language
    model_names = []
    for index, row in df.iterrows():
        detected_language = row['Detected_Language']
        model_name = "ai4bharat/indicwav2vec_v1_" + detected_language
        model_names.append(model_name)
    df['Model_Name'] = model_names

    return df

# Example usage:
# audio_file_path = 'processed_audio.wav'
# df = detect_language_for_audio_file(audio_file_path, lid_pipeline)
# print(df)


# In[11]:


loaded_models = {}
current_loaded_model = None

def load_model_and_tokenizer(standardized_language):
    global current_loaded_model

    # If the requested model is already loaded, return it
    if standardized_language in loaded_models:
        return loaded_models[standardized_language]

    # Check if the current loaded model is the same as the new one
    if current_loaded_model == standardized_language:
        return loaded_models[standardized_language]

    # Clear the specific model currently loaded on the GPU, if any
    elif current_loaded_model is not None:
        del loaded_models[current_loaded_model]
        torch.cuda.empty_cache()
        current_loaded_model = None

    # Determine the model name based on the standardized language
    if standardized_language == 'hindi':
        model_name = "ai4bharat/indicwav2vec-hindi"
    elif standardized_language == 'odia':
        model_name = "ai4bharat/indicwav2vec-odia"
    elif standardized_language == 'english':
        model_name = "facebook/wav2vec2-large-960h-lv60-self"
    else:
        model_name = "ai4bharat/indicwav2vec_v1_" + standardized_language

    # Load the model and tokenizer
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)

    # Update the loaded models and current loaded model
    loaded_models[standardized_language] = (model, tokenizer)
    current_loaded_model = standardized_language

    return model, tokenizer
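
# Illustrative example (not from the original file): repeated calls for the
# same language are served from the `loaded_models` cache instead of
# re-downloading the weights.
# model_hi, tok_hi = load_model_and_tokenizer("hindi")
# model_hi_again, tok_hi_again = load_model_and_tokenizer("hindi")  # cache hit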


# In[5]:


def perform_transcription(df):

    transcriptions = []

    for index, row in df.iterrows():
        audio_file_path = row['Audio_Filename']
        detected_language = row['Detected_Language']

        standardized_language = language_mapping.get(detected_language, detected_language)
        model, tokenizer = load_model_and_tokenizer(standardized_language)

        input_audio, _ = librosa.load(audio_file_path, sr=16000)
        input_values = tokenizer(input_audio, return_tensors="pt").input_values

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        text = tokenizer.batch_decode(predicted_ids)[0]

        transcriptions.append(text)

    df['Transcription'] = transcriptions

    return df
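
# Example usage (illustrative sketch, reusing 'processed_audio.wav' from the
# example above):
# df = detect_language_for_audio_file('processed_audio.wav', lid_pipeline)
# df = perform_transcription(df)
# print(df[['Audio_Filename', 'Detected_Language', 'Transcription']])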


# In[7]:


# Load the tokenizer and model from the Hugging Face model hub.
tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(
    "soketlabs/pragna-1b",
    token=os.environ.get('HF_TOKEN'),
    revision='3c5b8b1309f7d89710331ba2f164570608af0de7'
)
model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))
model = model.to(device)


# Function to generate a response from a transcription
def generate_response(transcription):
    try:
        messages = [
            {"role": "system", "content": "you are a friendly bot to help the user"},
            {"role": "user", "content": transcription},
        ]
        tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
        input_ids = tokenized_chat[0].to(device)
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=300,
                do_sample=True,
                top_k=5,
                num_beams=1,
                use_cache=False,
                temperature=0.2,
                repetition_penalty=1.1,
            )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        return find_last_sentence(generated_text)
    except Exception as e:
        print("Error during response generation:", e)
        return "Response generation error: " + str(e)

# Function to find the last complete sentence in the generated text
def find_last_sentence(text):
    sentence_endings = re.finditer(r'[।?!]', text)
    end_positions = [ending.end() for ending in sentence_endings]
    if end_positions:
        return text[:end_positions[-1]]
    return text
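
# Example (illustrative): truncate generated text at the last danda (।),
# question mark, or exclamation mark; text with no such ending is returned as-is.
# find_last_sentence("नमस्ते। आप कैसे हैं? आज")  # -> "नमस्ते। आप कैसे हैं?"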


# In[9]:


def generate_text_and_display_audio(row, model, tokenizer):
    audio_file = row['Audio_Filename']
    transcription = row['Transcription']

    # Generate text
    generated_text = generate_response(transcription)

    # generate_response already truncates at the last sentence boundary, so
    # this second call is a harmless no-op kept from the original notebook
    generated_text = find_last_sentence(generated_text)

    # Display audio
    # display(ipd.Audio(audio_file))
    return transcription, generated_text
    # Display prompt and generated text
    # print("Transcribed Text:", transcription)
    # print("Generated Text:", generated_text)
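
# Example usage (sketch): pair a single transcription row with a generated reply.
# row = df_transcription.iloc[0]
# transcription, reply = generate_text_and_display_audio(row, model, tokenizer)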


# In[16]:

def spectral_subtraction(audio_data, sample_rate):
    # Compute short-time Fourier transform (STFT)
    stft = librosa.stft(audio_data)

    # Compute power spectrogram
    power_spec = np.abs(stft)**2

    # Estimate the noise power spectrum (median over time frames)
    noise_power = np.median(power_spec, axis=1)

    # Apply spectral subtraction
    alpha = 2.0  # Over-subtraction factor, typically between 1.0 and 2.0
    denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)

    # Inverse STFT to obtain the denoised audio (reuses the noisy phase)
    denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))

    return denoised_audio

def apply_compression(audio_data, sample_rate):
    # Loudness normalisation (ITU-R BS.1770), used here as a simple "compression" step
    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio_data)

    # Normalise audio to a target loudness of -24 LUFS
    loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)

    return loud_norm
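
# Illustrative sketch (hypothetical file name): denoise and loudness-normalise
# a clip with the two helpers above, outside of the full pipeline.
# y, sr = librosa.load("sample.wav", sr=None, mono=True)
# y = spectral_subtraction(y, sr)
# y = apply_compression(y, sr)
# sf.write("sample_clean.wav", y, sr)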

def process_audio(audio_file_path):
    try:
        # Read audio data
        audio_data, sample_rate = librosa.load(audio_file_path)
        print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")

        # Apply noise reduction using noisereduce
        reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
        print("Noise reduction applied")

        # Apply spectral subtraction for additional noise reduction
        denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
        print("Spectral subtraction applied")

        # Apply loudness normalisation to make the foreground louder
        compressed_audio = apply_compression(denoised_audio, sample_rate)
        print("Dynamic range compression applied")

        # Remove silent spaces
        final_audio = librosa.effects.trim(compressed_audio)[0]
        print("Silences trimmed")

        # Save the final processed audio to a file with a fixed name
        processed_file_path = 'processed_audio.wav'
        sf.write(processed_file_path, final_audio, sample_rate)
        print(f"Processed audio saved to: {processed_file_path}")

        # Check if the file exists to confirm it was saved
        if not os.path.isfile(processed_file_path):
            raise FileNotFoundError(f"Processed file not found: {processed_file_path}")

        # Load the processed audio for transcription
        processed_audio_data, _ = librosa.load(processed_file_path)
        print(f"Processed audio reloaded for transcription: {processed_file_path}")

        # Detect the language, transcribe, and generate a response
        df = detect_language_for_audio_file(processed_file_path, lid_pipeline)
        print(df)
        df_transcription = perform_transcription(df)
        print(df_transcription)
        for index, row in df_transcription.iterrows():
            print(index, row)
            transcription, response = generate_text_and_display_audio(row, model, tokenizer)

        # Transcribe audio
        # transcription = transcribe_audio(processed_audio_data)
        # print("Transcription completed")

        # # Generate response
        # response = generate_response(transcription)
        # print("Response generated")

        return processed_file_path, transcription, response
    except Exception as e:
        print("Error during audio processing:", e)
        # Return three values so the error is shown in every Gradio output field
        return None, "Error during audio processing", str(e)
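
# Example (sketch, hypothetical recording path): run the full pipeline without Gradio.
# processed_path, transcription, response = process_audio("recording.wav")
# print(transcription, response)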


# Create the Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(label="Record Audio", type="filepath"),
    outputs=[gr.Audio(label="Processed Audio"), gr.Textbox(label="Transcription"), gr.Textbox(label="Response")]
)

iface.launch(share=True)
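# Note: share=True asks Gradio for a temporary public link; drop the flag or
# pass share=False to keep the demo local-only.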


# In[ ]:


# In[ ]: