File size: 2,449 Bytes
b47054b bc437c2 1a82ef7 6459a44 1a82ef7 3d4cdb6 b47054b 72d6ac4 bc437c2 6459a44 376dc4c 6459a44 376dc4c 6459a44 1a82ef7 72d6ac4 6f65c39 a4b8183 1a82ef7 a2cdd68 86f9703 bc437c2 a2cdd68 bc437c2 a2cdd68 bc437c2 4c1eb71 bc437c2 72d6ac4 4c1eb71 a151b3f f7fb0b4 62d7baf f7fb0b4 a151b3f 4c1eb71 f7fb0b4 a151b3f f7fb0b4 a151b3f bc437c2 1a82ef7 a2cdd68 72d6ac4 1a82ef7 63b6598 1a82ef7 72d6ac4 1a82ef7 72d6ac4 1a82ef7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import gradio as gr
# from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline,WhisperProcessor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
# Load the Whisper checkpoint and its processor once, at import time, so that
# every call to transcribe() reuses the same objects.
model_name = "openai/whisper-large-v3-turbo"
# The processor is used by transcribe() both to turn raw audio into model
# inputs and to decode generated token ids back into text.
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# Alternative that was tried and left disabled (transformers pipeline API):
# models = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, low_cpu_mem_usage=True
# )
# model = pipeline("automatic-speech-recognition", model=models, chunk_length_s=30, device=0)
# Dataset loading (originally noted as bigcode/the-stack) is currently disabled:
# ds = load_dataset("CoIR-Retrieval/CodeSearchNet-php-queries-corpus")
def transcribe(audio_path):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio_path: Filesystem path to the audio file, as delivered by the
            Gradio ``Audio(type="filepath")`` input. May be ``None`` or an
            empty string when the form is submitted without any audio.

    Returns:
        The transcription of the first (and only) item in the batch, or an
        empty string when no audio was provided.
    """
    # Without this guard a submission with no recording would crash inside
    # librosa.load instead of producing an (empty) result.
    if not audio_path:
        return ""

    sample_rate = 16000

    # Decode the file and resample it to the rate handed to the processor.
    waveform, _ = librosa.load(audio_path, sr=sample_rate)

    # NOTE(review): the Whisper feature extractor presumably pads/truncates
    # the input to a fixed 30 s window by default -- confirm that longer
    # recordings being cut off is acceptable for this app.
    input_features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(input_features)

    # batch_decode returns one string per batch item; there is exactly one.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# Gradio UI: a single audio input (handed to the callback as a file path)
# mapped to a plain-text output.
audio_input = gr.Audio(type="filepath")
iface = gr.Interface(
    transcribe,
    audio_input,
    "text",
    title="Whisper Transcription for Developers",
    description="使用 Whisper 和 bigcode 数据集转录开发者相关术语。",
)

# Start the Gradio app.
iface.launch()
|