Ritwika-Das-Gupta committed
Commit d7f3759 · verified · 1 Parent(s): 77a0e89

Upload multilingual_audio_chat.py

Files changed (1)
  1. multilingual_audio_chat.py +342 -0
multilingual_audio_chat.py ADDED (+342 lines)
#!/usr/bin/env python
# coding: utf-8

# In[3]:


#!pip install torchaudio


# In[2]:


from IPython.display import Audio
import IPython.display as ipd
from scipy.io import wavfile
import numpy as np
import warnings
import re
warnings.filterwarnings("ignore")
import soundfile as sf
import librosa
import torch
import os
# import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd


# In[3]:


# Set device and dtype
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Language-identification (LID) pipeline: MMS-LID classifies the spoken language.
lid_model_id = "facebook/mms-lid-126"
lid_pipeline = pipeline("audio-classification", model=lid_model_id, device=device)

# Map MMS-LID language codes to the language names used for ASR model selection.
language_mapping = {
    "hin": "hindi",
    "ben": "bengali",
    "eng": "english",
    "guj": "gujarati"
}
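
# Quick sanity check (illustrative sketch, not part of the app flow): the
# audio-classification pipeline returns a list of {"label": ..., "score": ...}
# dicts sorted by score. For a hypothetical 16 kHz mono clip:
#
#     probe, _ = librosa.load("sample.wav", sr=16000)  # hypothetical file
#     print(lid_pipeline(probe)[:3])
#     # e.g. [{'label': 'hin', 'score': 0.97}, ...] (label set is model-specific)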

def detect_language_for_audio_file(audio_file_path, lid_pipeline, target_sampling_rate=16000):
    """
    Detects the language of a given audio file and returns a DataFrame.

    Parameters:
    - audio_file_path (str): The path to the audio file.
    - lid_pipeline: The language identification pipeline.
    - target_sampling_rate (int): The target sampling rate for the audio file. Default is 16000.

    Returns:
    - df (pd.DataFrame): A DataFrame containing the detected language, filename, and ASR model name.
    """
    detected_languages = []
    audio_filenames = []

    filename = os.path.basename(audio_file_path)
    waveform, original_sampling_rate = librosa.load(audio_file_path, sr=None)

    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform)

    if original_sampling_rate != target_sampling_rate:
        waveform = librosa.resample(waveform, orig_sr=original_sampling_rate, target_sr=target_sampling_rate)

    # Perform language identification
    lid_result = lid_pipeline(waveform, sampling_rate=target_sampling_rate)
    detected_language = lid_result[0]['label'].split('_')[0]
    print(f"Detected language for {filename}: {detected_language}")

    detected_languages.append(detected_language)
    audio_filenames.append(filename)

    df = pd.DataFrame({
        "Detected_Language": detected_languages,
        "Audio_Filename": audio_filenames
    })

    # Drop rows whose language is not in the supported mapping
    df['Detected_Language'] = df['Detected_Language'].map(language_mapping)
    df.dropna(axis=0, inplace=True)

    # Add an ASR model name for each detected language
    model_names = []
    for index, row in df.iterrows():
        detected_language = row['Detected_Language']
        model_name = "ai4bharat/indicwav2vec_v1_" + detected_language
        model_names.append(model_name)

    df['Model_Name'] = model_names

    return df

# Example usage:
# audio_file_path = 'processed_audio.wav'
# df = detect_language_for_audio_file(audio_file_path, lid_pipeline)
# print(df)
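
# Illustrative output (sketch; assumes a Hindi clip saved as processed_audio.wav):
#   Detected_Language  Audio_Filename       Model_Name
#   hindi              processed_audio.wav  ai4bharat/indicwav2vec_v1_hindi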


# In[4]:


# Cache ASR models/tokenizers per language so repeated requests reuse them.
loaded_models = {}

def load_model_and_tokenizer(standardized_language):
    if standardized_language not in loaded_models:
        if standardized_language == 'hindi':
            model_name = "ai4bharat/indicwav2vec-hindi"
        elif standardized_language == 'odia':
            model_name = "ai4bharat/indicwav2vec-odia"
        elif standardized_language == 'english':
            model_name = "facebook/wav2vec2-large-960h-lv60-self"
        else:
            model_name = "ai4bharat/indicwav2vec_v1_" + standardized_language
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
        loaded_models[standardized_language] = (model, tokenizer)
    else:
        model, tokenizer = loaded_models[standardized_language]
    return model, tokenizer
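
# Illustrative check (sketch; the first call downloads weights):
#     model, tok = load_model_and_tokenizer("hindi")
#     model2, _ = load_model_and_tokenizer("hindi")
#     assert model is model2  # second call hits the in-memory cache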


# In[5]:


def perform_transcription(df):
    """Transcribe each audio file in df with its language-matched wav2vec2 model."""
    transcriptions = []

    for index, row in df.iterrows():
        # Note: Audio_Filename holds a basename, so the file must be
        # resolvable from the current working directory.
        audio_file_path = row['Audio_Filename']
        detected_language = row['Detected_Language']

        # Already standardized by detect_language_for_audio_file; .get() is a safe fallback.
        standardized_language = language_mapping.get(detected_language, detected_language)
        model, tokenizer = load_model_and_tokenizer(standardized_language)

        input_audio, _ = librosa.load(audio_file_path, sr=16000)
        input_values = tokenizer(input_audio, return_tensors="pt").input_values

        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC decoding: pick the most likely token at each frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = tokenizer.batch_decode(predicted_ids)[0]

        transcriptions.append(text)

    df['Transcription'] = transcriptions

    return df
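
# Illustrative end-to-end ASR usage (sketch; assumes processed_audio.wav is in cwd):
#     df = detect_language_for_audio_file("processed_audio.wav", lid_pipeline)
#     df = perform_transcription(df)
#     print(df[["Audio_Filename", "Transcription"]])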


# In[7]:


# Load the chat tokenizer and model from the Hugging Face Hub.
# Both repos are accessed with the token in the HF_TOKEN environment variable.
tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(
    "soketlabs/pragna-1b",
    token=os.environ.get('HF_TOKEN'),
    revision='3c5b8b1309f7d89710331ba2f164570608af0de7'
)
model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))
model = model.to(device)
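
# Optional guard (a sketch, not in the original upload): warn early if the
# token is missing, since the downloads above may require authentication.
if os.environ.get('HF_TOKEN') is None:
    warnings.warn("HF_TOKEN is not set; model downloads may fail if the repos require authentication.")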

# Generate a chat response for a transcription with the instruction-tuned adapter.
def generate_response(transcription):
    try:
        messages = [
            {"role": "system", "content": "You are a friendly bot to help the user."},
            {"role": "user", "content": transcription},
        ]
        tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
        input_ids = tokenized_chat[0].to(device)  # was hard-coded to 'cuda'; follow the global device
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=100,
                num_return_sequences=1,
                temperature=0.1,
                top_k=50,
                top_p=0.5,
                repetition_penalty=1.2,
                do_sample=True
            )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        return find_last_sentence(generated_text)
    except Exception as e:
        print("Error during response generation:", e)
        return "Response generation error: " + str(e)
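
# Illustrative call (sketch; generation is sampled, so output varies):
#     print(generate_response("नमस्ते, आप कैसे हैं?"))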

# Trim generated text to its last complete sentence (danda '।', '?', or '!';
# '.' added so English output is trimmed too).
def find_last_sentence(text):
    sentence_endings = re.finditer(r'[।.?!]', text)
    end_positions = [ending.end() for ending in sentence_endings]
    if end_positions:
        return text[:end_positions[-1]]
    return text
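
# Illustrative behaviour (sketch):
#     find_last_sentence("नमस्ते। आप कैसे हैं? अधूरा वाक्य")
#     -> "नमस्ते। आप कैसे हैं?"   (trailing incomplete fragment dropped)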


# In[15]:


def generate_text_and_display_audio(row, model, tokenizer):
    audio_file = row['Audio_Filename']
    transcription = row['Transcription']

    # Generate text (generate_response already trims to the last full sentence)
    generated_text = generate_response(transcription)

    # Display audio
    # display(ipd.Audio(audio_file))
    return transcription, generated_text
    # Display prompt and generated text
    # print("Transcribed Text:", transcription)
    # print("Generated Text:", generated_text)


# In[16]:


import noisereduce as nr
import gradio as gr
import pyloudnorm as pyln
# soundfile, librosa, and numpy are already imported above

def spectral_subtraction(audio_data, sample_rate):
    # Compute short-time Fourier transform (STFT)
    stft = librosa.stft(audio_data)

    # Compute power spectrogram
    power_spec = np.abs(stft)**2

    # Estimate the noise power spectrum as the per-bin median over time
    noise_power = np.median(power_spec, axis=1)

    # Apply spectral subtraction
    alpha = 2.0  # Over-subtraction factor, typically between 1.0 and 2.0
    denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)

    # Inverse STFT with the original phase to obtain denoised audio
    denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))

    return denoised_audio
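
# Quick self-test sketch (synthetic data; not part of the app):
#     sr = 16000
#     t = np.linspace(0, 1, sr, endpoint=False)
#     noisy = np.sin(2 * np.pi * 440 * t) + 0.1 * np.random.randn(sr)
#     clean = spectral_subtraction(noisy.astype(np.float32), sr)
#     print(noisy.std(), clean.std())  # denoised signal should carry less noise energy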

def apply_compression(audio_data, sample_rate):
    # Loudness normalization (BS.1770), approximating dynamic range compression:
    # measure integrated loudness, then rescale the signal to a fixed target.
    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio_data)

    # Normalize audio to a target loudness of -24 LUFS
    loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)

    return loud_norm
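
# Illustrative call (sketch; pyloudnorm needs at least ~0.4 s of audio):
#     loud = apply_compression(np.random.randn(16000) * 0.05, 16000)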

def process_audio(audio_file_path):
    try:
        # Read audio data
        audio_data, sample_rate = librosa.load(audio_file_path)
        print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")

        # Apply noise reduction using noisereduce
        reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
        print("Noise reduction applied")

        # Apply spectral subtraction for additional noise reduction
        denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
        print("Spectral subtraction applied")

        # Apply loudness normalization to make the foreground louder
        compressed_audio = apply_compression(denoised_audio, sample_rate)
        print("Loudness normalization applied")

        # Remove silent spaces
        final_audio = librosa.effects.trim(compressed_audio)[0]
        print("Silences trimmed")

        # Save the final processed audio to a file with a fixed name
        processed_file_path = 'processed_audio.wav'
        sf.write(processed_file_path, final_audio, sample_rate)
        print(f"Processed audio saved to: {processed_file_path}")

        # Check if the file exists to confirm it was saved
        if not os.path.isfile(processed_file_path):
            raise FileNotFoundError(f"Processed file not found: {processed_file_path}")

        # Detect language and transcribe the processed file from disk
        df = detect_language_for_audio_file(processed_file_path, lid_pipeline)
        print(df)
        df_transcription = perform_transcription(df)
        print(df_transcription)

        # Default to empty strings in case the detected language is unsupported
        # and the DataFrame comes back empty.
        transcription, response = "", ""
        for index, row in df_transcription.iterrows():
            print(index, row)
            transcription, response = generate_text_and_display_audio(row, model, tokenizer)

        return processed_file_path, transcription, response
    except Exception as e:
        print("Error during audio processing:", e)
        # Return one value per Gradio output component
        return None, "Error during audio processing", str(e)
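
# Illustrative non-UI call (sketch; assumes sample.wav exists in cwd):
#     path, text, reply = process_audio("sample.wav")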


# Create Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(label="Record Audio", type="filepath"),
    outputs=[gr.Audio(label="Processed Audio"), gr.Textbox(label="Transcription"), gr.Textbox(label="Response")]
)

iface.launch(share=True)