"""Gradio front end that denoises an uploaded or recorded clip with Demucs."""

import functools
import json
import os
import time

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio
from pydub import AudioSegment

from denoiser.demucs import Demucs

modelpath = './denoiser/master64.th'

# Sample rate (Hz) every input is converted to before it reaches the model.
_TARGET_SAMPLE_RATE = 16000

# Files written to the working directory on every request.
# NOTE(review): the names are fixed, so two requests running at the same time
# would overwrite each other's files -- confirm the app serves one request at
# a time, or switch to per-request temporary files.
_NORMALIZED_WAV = "enhanced.wav"
_DENOISED_WAV = "enhanced_final.wav"
_DENOISED_MP3 = "enhanced_final.mp3"


@functools.lru_cache(maxsize=1)
def _load_model():
    """Build the Demucs network and load its weights, once per process.

    The result is cached so the checkpoint is not re-read from disk on every
    request. A failed load is not cached and is retried on the next call.
    """
    model = Demucs(hidden=64)
    state_dict = torch.load(modelpath, map_location='cpu')
    model.load_state_dict(state_dict)
    return model.eval()


def _preprocess_audio(path):
    """Rewrite the audio at *path* as a mono 16 kHz WAV and return its path.

    Files whose name ends in ``.mp3`` are decoded with pydub; everything else
    is read with soundfile, down-mixed by averaging the channels, and
    resampled with librosa when its rate is not already 16 kHz.
    """
    if path.lower().endswith(".mp3"):
        audio = AudioSegment.from_file(path)
        # Mono + 16 kHz.
        audio = audio.set_frame_rate(_TARGET_SAMPLE_RATE).set_channels(1)
        audio.export(_NORMALIZED_WAV, format="wav")
        return _NORMALIZED_WAV

    data, sr = sf.read(path)
    # More than one dimension means several channels: average them into one.
    if data.ndim > 1:
        data = data.mean(axis=1)
    if sr != _TARGET_SAMPLE_RATE:
        data = librosa.resample(
            data, orig_sr=sr, target_sr=_TARGET_SAMPLE_RATE
        )
        sr = _TARGET_SAMPLE_RATE
    sf.write(_NORMALIZED_WAV, data, sr)
    return _NORMALIZED_WAV


def transcribe(file_upload, microphone=None):
    """Denoise one audio clip and return the path of the resulting MP3.

    Args:
        file_upload: Path of an uploaded audio file, or None.
        microphone: Path of a microphone recording, or None. When both are
            given the recording wins. It is optional because the interface
            in this file registers a single audio input, so the function is
            called with one argument.

    Returns:
        The string ``"enhanced_final.mp3"``, a file written to the working
        directory.

    Raises:
        ValueError: If neither argument provides a file path.
    """
    file = microphone if microphone is not None else file_upload
    if file is None:
        raise ValueError("No audio provided: upload a file or record a clip.")

    wav_path = _preprocess_audio(file)
    demucs = _load_model()

    x, sr = torchaudio.load(wav_path)
    x = x[0:1]  # Keep only the first channel so the input is surely mono.
    with torch.no_grad():
        out = demucs(x[None])[0]
    # Scale down only when the peak exceeds 1; quieter output is left as is.
    out = out / max(out.abs().max().item(), 1)

    # WAV first, then re-encode: the MP3 is what gets returned because it is
    # smaller.
    torchaudio.save(_DENOISED_WAV, out, sr)
    enhanced = AudioSegment.from_wav(_DENOISED_WAV)
    enhanced.export(_DENOISED_MP3, format="mp3", bitrate="256k")
    return _DENOISED_MP3


# NOTE(review): the interface definition below is cut off in the reviewed text
# (it stops inside the `title` string). It is carried over verbatim and left
# inert, exactly as it was on the original collapsed line; restore it as live
# code together with the remainder of the call.
# demo = gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.Audio(type="filepath", label="語音質檢原始音檔"),
#     ],
#     outputs=gr.Audio(type="filepath", label="Output"),
#     title="