import os
import time
import warnings
from pathlib import Path

import gradio as gr
import librosa
import spaces
import torch
from loguru import logger
from transformers import pipeline

warnings.filterwarnings("ignore")

is_hf = os.getenv("SYSTEM") == "spaces"  # True when running on Hugging Face Spaces

generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,
    "num_beams": 1,
    "no_repeat_ngram_size": 5,
    "max_new_tokens": 64,
    "return_timestamps": True,  # Necesario para obtener los tiempos
}

model_dict = {
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    "anime-whisper": "litagin/anime-whisper",
}

logger.info("Initializing pipelines...")
pipe_dict = {
    k: pipeline(
        "automatic-speech-recognition",
        model=v,
        device="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    for k, v in model_dict.items()
}
logger.success("Pipelines initialized!")
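
# Each pipe accepts a 16 kHz mono waveform (numpy array), as loaded below with
# librosa, or an audio file path, and returns a dict with the transcription.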


def save_as_srt(transcription, timestamps, output_path):
    """Write an .srt subtitle file from the transcribed segments and their timestamps."""
    with open(output_path, "w", encoding="utf-8") as f:
        for idx, (text, (start, end)) in enumerate(zip(transcription, timestamps)):
            if end is None:  # the last chunk can come back without an end timestamp
                end = start
            start_time = time.strftime("%H:%M:%S", time.gmtime(start)) + f",{int(start % 1 * 1000):03d}"
            end_time = time.strftime("%H:%M:%S", time.gmtime(end)) + f",{int(end % 1 * 1000):03d}"

            f.write(f"{idx + 1}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text}\n\n")


# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of each call.
@spaces.GPU
def transcribe_common(audio: str, model: str) -> str:
    if not audio:
        return "No audio file"
    filename = Path(audio).name
    logger.info(f"Model: {model}")
    logger.info(f"Audio: {filename}")

    try:
        y, sr = librosa.load(audio, mono=True, sr=16000)
    except Exception as e:
        # Fall back to pydub for formats that librosa cannot read directly.
        logger.warning(f"librosa failed to load {filename} ({e}); retrying via pydub")
        from pydub import AudioSegment

        AudioSegment.from_file(audio).export("temp.wav", format="wav")
        y, sr = librosa.load("temp.wav", mono=True, sr=16000)
        Path("temp.wav").unlink()

    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    if duration > 15:
        return "Audio too long; this demo only accepts audio up to 15 seconds."

    start_time = time.time()
    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)
    end_time = time.time()

    transcription = result["text"]
    timestamps = result["chunks"]  # list of per-segment texts with their timestamps
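    # For reference, with return_timestamps=True the pipeline output looks like:
    #   {"text": "...full transcription...",
    #    "chunks": [{"timestamp": (0.0, 2.5), "text": "..."}, ...]}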
    
    logger.success(f"Finished in {end_time - start_time:.2f}s\n{transcription}")

    # Save the result as an .srt subtitle file
    output_path = f"{Path(filename).stem}.srt"
    save_as_srt(
        [chunk["text"] for chunk in timestamps],
        [chunk["timestamp"] for chunk in timestamps],  # each entry is a (start, end) tuple
        output_path,
    )

    logger.info(f"Transcription saved to {output_path}")
    return transcription


def transcribe_others(audio) -> tuple[str, str]:
    result_v3 = transcribe_common(audio, "whisper-large-v3-turbo")
    result_kotoba_v2 = transcribe_common(audio, "kotoba-whisper-v2.0")
    return result_v3, result_kotoba_v2


def transcribe_anime_whisper(audio) -> str:
    return transcribe_common(audio, "anime-whisper")


initial_md = """
# Anime-Whisper Demo
[**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発話もうまく台本調に書き起こされます。
- デモでは**音声は15秒まで**しか受け付けません
- 日本語のみ対応 (Japanese only)
- 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています
pipeに渡しているkwargsは以下:
```python
generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,
    "num_beams": 1,
    "no_repeat_ngram_size": 5,
    "max_new_tokens": 64,  # 結果が長いときは途中で打ち切られる
    "return_timestamps": True,  # Para incluir tiempos
}
```
"""

with gr.Blocks() as app:
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Anime-Whisper")
            button_galgame = gr.Button("Transcribe with Anime-Whisper")
            output_galgame = gr.Textbox(label="Result")
    gr.Markdown("### Comparison")
    button_others = gr.Button("Transcribe with other models")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            output_v3 = gr.Textbox(label="Result")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox(label="Result")

    button_galgame.click(
        transcribe_anime_whisper,
        inputs=[audio],
        outputs=[output_galgame],
    )
    button_others.click(
        transcribe_others,
        inputs=[audio],
        outputs=[output_v3, output_kotoba_v2],
    )

app.launch(inbrowser=True)