initial commit
- app.py +95 -0
- asr.py +44 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,95 @@
+import os
+import sys
+import datetime
+import streamlit as st
+from asr import load_model, inference
+from audio_recorder_streamlit import audio_recorder
+
+
+@st.cache_resource
+def load_asr_model():
+    # Cache the processor/model across Streamlit reruns so the
+    # checkpoint is only downloaded and loaded once.
+    return load_model()
+
+
+processor, asr_model = load_asr_model()
+
+
+def save_audio_file(audio_bytes, file_extension):
+    """
+    Save audio bytes to a file with the specified extension.
+
+    :param audio_bytes: Audio data in bytes
+    :param file_extension: The extension of the output audio file
+    :return: The name of the saved audio file
+    """
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    file_name = f"audio_{timestamp}.{file_extension}"
+
+    with open(file_name, "wb") as f:
+        f.write(audio_bytes)
+
+    return file_name
+
+
+def transcribe_audio(file_path):
+    """
+    Transcribe the audio file at the specified path.
+
+    :param file_path: The path of the audio file to transcribe
+    :return: The transcribed text
+    """
+    # inference() loads and resamples the audio itself, so the path is enough.
+    return inference(processor, asr_model, file_path)
+
+
+def main():
+    """
+    Record or upload audio, then transcribe it with the cached ASR model.
+    """
+    st.title("Anishinaabemowin Transcription")
+    tab1, tab2 = st.tabs(["Record Audio", "Upload Audio"])
+
+    fname = None
+
+    # Record Audio tab
+    with tab1:
+        audio_bytes = audio_recorder()
+        if audio_bytes:
+            st.audio(audio_bytes, format="audio/wav")
+            fname = save_audio_file(audio_bytes, "wav")
+
+    # Upload Audio tab
+    with tab2:
+        audio_file = st.file_uploader("Upload Audio", type=["wav"])
+        if audio_file:
+            file_extension = audio_file.type.split("/")[1]
+            fname = save_audio_file(audio_file.read(), file_extension)
+
+    # Transcribe button action
+    if st.button("Transcribe"):
+        if fname is None:
+            st.warning("Record or upload an audio file first.")
+            return
+
+        # Transcribe the audio file
+        transcript_text = transcribe_audio(fname)
+
+        # Display the transcript
+        st.header("Transcript")
+        st.write(transcript_text)
+
+        # Save the transcript to a text file
+        with open("transcript.txt", "w") as f:
+            f.write(transcript_text)
+
+        # Provide a download button for the transcript
+        st.download_button("Download Transcript", transcript_text)
+
+
+if __name__ == "__main__":
+    # Make sibling modules (e.g. asr.py) importable regardless of cwd.
+    working_dir = os.path.dirname(os.path.abspath(__file__))
+    sys.path.append(working_dir)
+
+    main()
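Side note (not part of the commit): Streamlit re-runs the whole script on every widget interaction, so save_audio_file() fires again, writing a fresh timestamped file, whenever the recorder or uploader still holds audio. A rerun-friendly variant would remember the saved path in st.session_state; a minimal sketch, with "last_audio" as an arbitrary key:

    # Hypothetical variant: save once, then reuse the remembered path.
    if audio_bytes and "last_audio" not in st.session_state:
        st.session_state["last_audio"] = save_audio_file(audio_bytes, "wav")
    fname = st.session_state.get("last_audio")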
asr.py
ADDED
@@ -0,0 +1,44 @@
+from transformers import Wav2Vec2ForCTC, AutoProcessor
+import torchaudio
+import torch
+import os
+import librosa
+
+hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
+
+
+def read_audio_data(file):
+    # Alternative torchaudio-based loader (currently unused by inference).
+    speech_array, sampling_rate = torchaudio.load(file, normalize=True)
+    return speech_array, sampling_rate
+
+
+def load_model():
+    model_id = "Lguyogiro/wav2vec2-large-mms-1b-nhi-adapterft-ilv_fold1"
+    target_lang = "nhi"
+    processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang, token=hf_token)
+    # ignore_mismatched_sizes is required because the adapter's vocabulary
+    # differs from the base MMS checkpoint's.
+    model = Wav2Vec2ForCTC.from_pretrained(
+        model_id,
+        target_lang=target_lang,
+        ignore_mismatched_sizes=True,
+        use_safetensors=True,
+        token=hf_token,
+    )
+    return processor, model
+
+
+def inference(processor, model, audio_path):
+    # Load and resample to the 16 kHz rate the wav2vec2 model expects.
+    audio, sampling_rate = librosa.load(audio_path, sr=16000)
+    inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+
+    with torch.no_grad():
+        logits = model(inputs.input_values).logits
+
+    # Greedy CTC decoding: argmax per frame, then collapse repeats and blanks.
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+    return transcription
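Side note: the asr.py helpers can also be exercised outside Streamlit. A minimal sketch, assuming HUGGING_FACE_HUB_TOKEN is set (if the model repository needs it) and a hypothetical local recording sample.wav; any sample rate works, since inference() resamples to 16 kHz via librosa:

    from asr import load_model, inference

    # Load the MMS adapter checkpoint once, then transcribe a file by path.
    processor, model = load_model()
    print(inference(processor, model, "sample.wav"))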
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+soundfile
+streamlit
+transformers
+torch
+torchaudio
+streamlit_webrtc
+audio_recorder_streamlit
+librosa
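With these dependencies installed, the app starts with the usual "streamlit run app.py"; set HUGGING_FACE_HUB_TOKEN in the environment first if the model repository requires authentication.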