import os
import time
import json

import gradio as gr
import torch
import torchaudio
import numpy as np
from denoiser.demucs import Demucs
from pydub import AudioSegment

modelpath = './denoiser/master64.th'


def transcribe(file_upload, microphone=None):
    # microphone defaults to None because the Interface below passes a
    # single audio filepath; prefer the microphone recording when present
    file = microphone if microphone is not None else file_upload

    # Load the denoiser model
    model = Demucs(hidden=64)
    state_dict = torch.load(modelpath, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()

    # Load the audio (channels-first: shape is [channels, samples])
    x, sr = torchaudio.load(file, channels_first=True)

    # Reject overly long inputs
    MAX_AUDIO_SECONDS = 600  # 10-minute limit
    if x.shape[1] / sr > MAX_AUDIO_SECONDS:
        raise ValueError(
            f"Audio must not exceed {MAX_AUDIO_SECONDS} seconds; "
            f"current length: {x.shape[1] / sr:.1f} seconds"
        )

    # Downmix to mono
    if x.shape[0] > 1:
        x = torch.mean(x, dim=0, keepdim=True)

    # Denoise (no gradients needed at inference time)
    with torch.no_grad():
        out = model(x[None])[0]

    # Normalize the peak to at most 1.0
    out = out / max(out.abs().max().item(), 1)
    torchaudio.save('enhanced.wav', out, sr)

    # Re-export via pydub for the speech-recognition stage
    # (note: the bitrate argument has no effect on uncompressed WAV)
    enhanced = AudioSegment.from_wav('enhanced.wav')
    enhanced.export('enhanced.wav', format="wav", bitrate="256k")
    return "enhanced.wav"


# import os
# import time
# import json
# import gradio as gr
# import torch
# import torchaudio
# import numpy as np
# from denoiser.demucs import Demucs
# from pydub import AudioSegment
# import soundfile as sf
# import librosa

# modelpath = './denoiser/master64.th'

# def transcribe(file_upload, microphone):
#     file = microphone if microphone is not None else file_upload

#     # Audio preprocessing -> normalize the input format
#     def preprocess_audio(path):
#         data, sr = sf.read(path)
#         # Stereo -> mono
#         if len(data.shape) > 1:
#             data = data.mean(axis=1)
#         # Resample to 16 kHz if needed
#         if sr != 16000:
#             data = librosa.resample(data, orig_sr=sr, target_sr=16000)
#             sr = 16000
#         # Save as WAV for the model
#         sf.write("enhanced.wav", data, sr)
#         return "enhanced.wav"

#     # If the input is MP3, convert it to WAV before processing
#     if file.lower().endswith(".mp3"):
#         audio = AudioSegment.from_file(file)
#         audio = audio.set_frame_rate(16000).set_channels(1)  # mono + 16 kHz
#         audio.export("enhanced.wav", format="wav")
#         file = "enhanced.wav"
#     else:
#         file = preprocess_audio(file)

#     model = Demucs(hidden=64)
#     state_dict = torch.load(modelpath, map_location='cpu')
#     model.load_state_dict(state_dict)
#     demucs = model.eval()

#     x, sr = torchaudio.load(file)
#     x = x[0:1]  # keep only the first channel (force mono)

#     with torch.no_grad():
#         out = demucs(x[None])[0]

#     out = out / max(out.abs().max().item(), 1)
#     torchaudio.save('enhanced_final.wav', out, sr)

#     # Export a WAV for front-end playback
#     enhanced = AudioSegment.from_wav('enhanced_final.wav')
#     enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")
#     return "enhanced_final.mp3"  # returning MP3 saves space

# # 👇 Add this line to work around a Gradio schema-inference error
# transcribe.__annotations__ = {
#     "file_upload": str,
#     "microphone": str,
#     "return": str
# }

demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Original audio for voice quality inspection",
                 sources=["upload", "microphone"])  # explicitly allow both sources
    ],
    outputs=[
        gr.Audio(type="filepath", label="Output")  # keep the list form
    ],
    live=True,
    allow_flagging="never",
    title="