import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the Whisper model and processor
model_name = "openai/whisper-large-v3-turbo"

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
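
# Optional speed-up (a sketch, assuming a standard CUDA setup): run inference
# on a GPU when one is available. transcribe() below moves the input features
# to model.device, so this stays a no-op on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)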

# Alternative (a sketch, assuming `from transformers import pipeline`): the ASR
# pipeline chunks audio longer than Whisper's 30-second window automatically:
# asr = pipeline("automatic-speech-recognition", model=model_name, chunk_length_s=30, device=0)
# text = asr(audio_path, return_timestamps=True)["text"]

# Load the dataset (bigcode/the-stack was the stated target; this draft points
# at a different corpus). Currently unused:
# from datasets import load_dataset
# ds = load_dataset("CoIR-Retrieval/CodeSearchNet-php-queries-corpus")

def transcribe(audio_path):
    # Load the audio file and resample it to the 16 kHz rate Whisper expects
    audio, sr = librosa.load(audio_path, sr=16000)

    # Convert the raw waveform into log-mel input features
    # (shape [1, 128, 3000] for large-v3's fixed 30-second window)
    input_features = processor(
        audio, sampling_rate=16000, return_tensors="pt"
    ).input_features.to(model.device)

    # Model inference
    with torch.no_grad():
        generated_ids = model.generate(input_features)

    # Decode the generated token ids into text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription
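

# Whisper auto-detects the spoken language; it can also be pinned explicitly.
# A minimal sketch (an assumption, not part of the original app) that would
# replace the plain generate() call inside transcribe():
# forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
# generated_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)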

    
# Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Transcription for Developers",
    description="Transcribe developer-related terminology using Whisper and the bigcode dataset.",
)

# Launch the Gradio app
iface.launch()
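
# When running outside a hosted Space, a temporary public URL can be requested
# instead (a standard Gradio option, not used in the original):
# iface.launch(share=True)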