Spaces:

hackergeek98
/

tinyyy

Sleeping

App Files Community

hackergeek98 committed on Mar 24

Commit

bdf330c

verified ·

1 Parent(s): c558d1d

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -33

app.py CHANGED Viewed

@@ -1,44 +1,68 @@
-import gradio as gr
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torch
-import librosa
-# Load the fine-tuned Whisper model and processor
-model_name = "hackergeek98/tinyyyy_whisper"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-# Force the model to transcribe in Persian
-model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="fa", task="transcribe")
-# Move model to GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-# Define the ASR function
-def transcribe_audio(audio_file):
-    # Load audio file using librosa (supports multiple formats)
-    audio_data, sampling_rate = librosa.load(audio_file, sr=16000)  # Resample to 16kHz
-    # Preprocess the audio
-    inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)
-    # Generate transcription
-    with torch.no_grad():
-        predicted_ids = model.generate(inputs)
-    # Decode the transcription
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=transcribe_audio,  # Function to call
-    inputs=gr.Audio(type="filepath"),  # Input: Upload audio file (any format)
-    outputs=gr.Textbox(label="Transcription"),  # Output: Display transcription
-    title="Whisper ASR: Persian Transcription",
-    description="Upload an audio file (e.g., .wav, .mp3, .ogg), and the fine-tuned Whisper model will transcribe it in Persian.",
 )
-# Launch the app
-interface.launch()

+# Import required packages
 import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from pydub import AudioSegment
+import os
+import gradio as gr
+# Load the model and processor
+model_id = "hackergeek98/tinyyyy_whisper"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+# Create pipeline
+whisper_pipe = pipeline(
+    "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=0 if torch.cuda.is_available() else -1
+)
+# Convert audio to WAV format
+def convert_to_wav(audio_path):
+    audio = AudioSegment.from_file(audio_path)
+    wav_path = "converted_audio.wav"
+    audio.export(wav_path, format="wav")
+    return wav_path
+# Split long audio into chunks
+def split_audio(audio_path, chunk_length_ms=30000):  # Default: 30 sec per chunk
+    audio = AudioSegment.from_wav(audio_path)
+    chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+    chunk_paths = []
+    for i, chunk in enumerate(chunks):
+        chunk_path = f"chunk_{i}.wav"
+        chunk.export(chunk_path, format="wav")
+        chunk_paths.append(chunk_path)
+    return chunk_paths
+# Transcribe a long audio file
+def transcribe_long_audio(audio_path):
+    wav_path = convert_to_wav(audio_path)
+    chunk_paths = split_audio(wav_path)
+    transcription = ""
+    for chunk in chunk_paths:
+        result = whisper_pipe(chunk)
+        transcription += result["text"] + "\n"
+        os.remove(chunk)  # Remove processed chunk
+    os.remove(wav_path)  # Cleanup original file
     return transcription
+# Gradio interface
+def transcribe_interface(audio_file):
+    return transcribe_long_audio(audio_file)
+iface = gr.Interface(
+    fn=transcribe_interface,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs="text",
+    title="Whisper ASR - Transcription",
+    description="Upload an audio file, and the model will transcribe it."
 )
+if __name__ == "__main__":
+    iface.launch()