File size: 2,327 Bytes
06fc5c8 7730969 5fe70fc 06fc5c8 7730969 06fc5c8 7730969 06fc5c8 7730969 06fc5c8 7730969 06fc5c8 7730969 06fc5c8 04668c4 06fc5c8 8581ee6 06fc5c8 7730969 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import time
import json
import random
import gradio as gr
import torch
import torchaudio
import numpy as np
from scipy.io import wavfile
import scipy.signal as sps
from denoiser.demucs import Demucs
from pydub import AudioSegment
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Access token for the Hugging Face Hub.
# HF_TOKEN is the variable huggingface_hub documents for access tokens, so it
# is preferred. HF_HOME is documented as the cache directory, not a token, but
# it is kept as a fallback because this file originally read the token from
# it and existing deployments may rely on that.
auth_token = os.getenv("HF_TOKEN") or os.getenv("HF_HOME")
# Load the private model and its tokenizer from the Hub.
model_id = "DeepLearning101/Speech-Quality-Inspection_Meta-Denoiser"
model = AutoModelForSequenceClassification.from_pretrained(model_id, token=auth_token)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
def transcribe(file_upload, microphone):
    """Denoise an audio clip with Demucs, then run the classifier on the result.

    Args:
        file_upload: Path of an audio file, or None when that input is empty.
        microphone: Path of an audio file, or None when that input is empty.
            Takes priority over ``file_upload`` when both are provided.

    Returns:
        A tuple ``("enhanced.wav", predictions)``: the path of the denoised
        WAV file written to the working directory, and the softmax over the
        classifier logits.

    Raises:
        ValueError: If neither ``file_upload`` nor ``microphone`` is provided.
    """
    file = microphone if microphone is not None else file_upload
    if file is None:
        # Fail early with a clear message instead of an opaque loader error.
        raise ValueError("No audio provided: upload a file or record a clip.")

    # NOTE(review): `modelpath` is not defined in the visible part of this
    # file; it must be a module-level path to the Demucs checkpoint - confirm.
    demucs_model = Demucs(hidden=64)
    state_dict = torch.load(modelpath, map_location='cpu')
    demucs_model.load_state_dict(state_dict)
    demucs_model.eval()

    x, sr = torchaudio.load(file)
    # Inference only: skip autograd bookkeeping so the output tensor carries
    # no gradient graph when it is saved.
    with torch.no_grad():
        out = demucs_model(x[None])[0]
        # Divide by the peak only when it exceeds 1; quieter output is kept as is.
        out = out / max(out.abs().max().item(), 1)
    torchaudio.save('enhanced.wav', out, sr)

    # Only the denoised audio needs its bitrate lowered before speech recognition.
    enhanced = AudioSegment.from_wav('enhanced.wav')
    enhanced.export('enhanced.wav', format="wav", bitrate="256k")

    # Assumes the model is a text classifier.
    # NOTE(review): `enhanced` is a pydub AudioSegment, not text; a text
    # tokenizer will presumably reject it - confirm what input the classifier
    # is meant to receive.
    inputs = tokenizer(enhanced, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return "enhanced.wav", predictions
# Input widgets, passed positionally to transcribe() in this order.
# NOTE(review): the first widget is labelled as the microphone recording but
# is received by transcribe() as `file_upload` (and vice versa) - confirm the
# intended order.
_audio_inputs = [
    gr.Audio(type="filepath", label="語音質檢麥克風實時錄音"),
    gr.Audio(type="filepath", label="語音質檢原始音檔"),
]

# Output widgets: the denoised audio file and the classifier predictions.
_result_outputs = [
    gr.Audio(type="filepath", label="Output"),
    gr.Textbox(label="Model Predictions")
]

# Sample clips offered in the UI; each row supplies a single value.
_example_clips = [
    ["exampleAudio/15s_2020-03-27_sep1.wav"],
    ["exampleAudio/13s_2020-03-27_sep2.wav"],
    ["exampleAudio/30s_2020-04-23_sep1.wav"],
    ["exampleAudio/15s_2020-04-23_sep2.wav"],
]

_page_title = "<p style='text-align: center'><a href='https://www.twman.org/AI' target='_blank'>語音質檢噪音去除 (語音增強):Meta Denoiser</a>"
_page_description = "為了提升語音識別的效果,可以在識別前先進行噪音去除"

# Assemble the web UI around transcribe() and start serving it.
demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_inputs,
    outputs=_result_outputs,
    title=_page_title,
    description=_page_description,
    examples=_example_clips,
    allow_flagging="never",
)
demo.launch(debug=True)
|