amenIKh committed
Commit ef7037b · 1 Parent(s): 40688d5

Added ASR model files

Files changed (3)
  1. README.md +14 -0
  2. app.py +107 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -10,3 +10,17 @@ pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # ASR Transcription Service
+
+ This is an Automatic Speech Recognition (ASR) system deployed on Hugging Face Spaces using Gradio.
+
+ ## Features
+ - Supports **Tunisian Arabic (tn)**, **French (fr)**, and **English (en)**
+ - Voice Activity Detection (VAD) that strips non-speech segments before transcription
+ - Built on OpenAI Whisper (French and English) and a fine-tuned Tunisian ASR model
+
+ ## How to Use
+ 1. Upload an audio file or record using the microphone.
+ 2. Select the transcription language.
+ 3. Get the transcribed text!
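Beyond the web UI, a Space built with `gr.Interface` can usually also be called programmatically. Below is a minimal sketch using `gradio_client`; note that the Space id `amenIKh/asr-transcription` is a placeholder (the actual Space name is not part of this commit), and `api_name="/predict"` assumes the default endpoint that `gr.Interface` exposes.

```python
# Minimal sketch: calling the Space via gradio_client.
# "amenIKh/asr-transcription" is a hypothetical Space id; substitute the real one.
from gradio_client import Client, handle_file

client = Client("amenIKh/asr-transcription")
result = client.predict(
    handle_file("recording.wav"),  # local audio file to transcribe
    "tn",                          # language: "tn", "fr", or "en"
    api_name="/predict",           # default endpoint for gr.Interface
)
print(result)
```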
app.py ADDED
@@ -0,0 +1,107 @@
+ import gradio as gr
+ import torch
+ import librosa
+ import numpy as np
+ import webrtcvad
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
+
+ # Model names
+ TN_MODEL_NAME = "amenIKh/Tunisian_Checkpoint12"
+ WHISPER_MODEL_NAME = "openai/whisper-small"
+
+ # Initialize the Tunisian ASR pipeline
+ pipe_tn = pipeline(
+     task="automatic-speech-recognition",
+     model=TN_MODEL_NAME,
+     device=0 if torch.cuda.is_available() else -1,
+ )
+
+ # Load Whisper model and processor for French and English
+ whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
+ whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ whisper_model.to(device)
+
+ # Apply voice activity detection, keeping only the frames that contain speech
+ def apply_vad(audio, sr, frame_duration_ms=30):
+     vad = webrtcvad.Vad()
+     vad.set_mode(3)  # Aggressiveness mode 0-3; higher is more aggressive
+
+     # webrtcvad expects 16-bit PCM, so convert the float waveform from librosa
+     pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
+
+     frame_size = int(sr * frame_duration_ms / 1000)  # 480 samples at 16 kHz / 30 ms
+     offset = 0
+     voiced_frames = []
+
+     while offset + frame_size <= len(pcm):
+         frame = pcm[offset:offset + frame_size]
+         is_speech = vad.is_speech(frame.tobytes(), sr)
+
+         if is_speech:
+             # Keep the original float frame so downstream models get float audio
+             voiced_frames.append(audio[offset:offset + frame_size])
+
+         offset += frame_size
+
+     if len(voiced_frames) == 0:
+         return audio  # Return original audio if no voiced frames are detected
+
+     return np.concatenate(voiced_frames)
+
+ # Transcribe audio with the model that matches the selected language
+ def transcribe_audio(audio, language):
+     try:
+         # Load the audio, resampling to the 16 kHz both models expect
+         sr = 16000
+         audio, _ = librosa.load(audio, sr=sr)
+
+         # Apply VAD to strip non-speech segments
+         voiced_audio = apply_vad(audio, sr)
+
+         # Select the correct model based on language
+         if language == "tn":
+             # Pass the sampling rate explicitly so the pipeline does not guess
+             result = pipe_tn({"raw": voiced_audio, "sampling_rate": sr})
+             transcription = result.get("text", "")
+         elif language in ["fr", "en"]:
+             forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe")
+             input_features = whisper_processor(voiced_audio, sampling_rate=sr, return_tensors="pt").input_features.to(device)
+             generated_ids = whisper_model.generate(
+                 input_features,
+                 forced_decoder_ids=forced_decoder_ids
+             )
+             transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         else:
+             return "Unsupported language specified"
+
+         return transcription
+     except Exception as e:
+         return f"An unexpected error occurred: {str(e)}"
+
+ # Define Gradio interface
+ def gradio_interface(audio, language):
+     try:
+         # With type="filepath", Gradio passes a file path for both
+         # uploaded files and microphone recordings
+         if audio is None:
+             return "No audio provided"
+
+         return transcribe_audio(audio, language)
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+ # Create the Gradio app
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio"),
+         gr.Dropdown(choices=["tn", "fr", "en"], label="Select Language")
+     ],
+     outputs="text",
+     title="ASR Transcription Service",
+     description="Upload an audio file and select the language to transcribe the audio."
+ )
+
+ # Launch the Gradio app
+ iface.launch()
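As a quick local sanity check of the VAD step, the sketch below runs `apply_vad` on a synthetic one-second tone padded with silence. It assumes `app.py` is importable from the current directory (importing it will also load both models, which is slow), and webrtcvad may classify a pure tone differently from real speech, so the kept fraction is only indicative.

```python
# Hedged local smoke test for apply_vad (assumes app.py is importable).
import numpy as np
from app import apply_vad

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)  # stand-in for speech
silence = np.zeros(sr, dtype=np.float32)
audio = np.concatenate([silence, tone, silence])

voiced = apply_vad(audio, sr)
print(f"kept {len(voiced) / len(audio):.0%} of the samples")
```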
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ gradio
+ transformers
+ librosa
+ webrtcvad