Yazael committed on
Commit
9a0ea31
·
verified ·
1 Parent(s): 5af4803

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -20,6 +20,7 @@ generate_kwargs = {
20
  "num_beams": 1,
21
  "no_repeat_ngram_size": 5,
22
  "max_new_tokens": 64,
 
23
  }
24
 
25
  model_dict = {
@@ -41,6 +42,18 @@ pipe_dict = {
41
  logger.success("Pipelines initialized!")
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  @spaces.GPU
45
  def transcribe_common(audio: str, model: str) -> str:
46
  if not audio:
@@ -48,7 +61,7 @@ def transcribe_common(audio: str, model: str) -> str:
48
  filename = Path(audio).name
49
  logger.info(f"Model: {model}")
50
  logger.info(f"Audio: {filename}")
51
-
52
  try:
53
  y, sr = librosa.load(audio, mono=True, sr=16000)
54
  except Exception as e:
@@ -60,22 +73,22 @@ def transcribe_common(audio: str, model: str) -> str:
60
 
61
  duration = librosa.get_duration(y=y, sr=sr)
62
  logger.info(f"Duration: {duration:.2f}s")
63
- kwargs = generate_kwargs.copy()
64
- if duration > 30:
65
- kwargs["return_timestamps"] = True
66
 
67
  start_time = time.time()
68
- result = pipe_dict[model](y, generate_kwargs=kwargs)["text"]
69
  end_time = time.time()
70
- logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
71
-
72
- # Guardar resultado en un archivo .str
73
- output_path = f"{Path(filename).stem}_{model}.str"
74
- with open(output_path, "w", encoding="utf-8") as f:
75
- f.write(result)
76
 
 
 
 
 
 
 
77
  logger.info(f"Transcription saved to {output_path}")
78
- return output_path # Devuelve el path del archivo transcrito
79
 
80
 
81
  def transcribe_others(audio) -> tuple[str, str]:
@@ -90,7 +103,7 @@ def transcribe_anime_whisper(audio) -> str:
90
 
91
  initial_md = """
92
  # Anime-Whisper Demo
93
- [**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発言もうまく台本調に書き起こされます。
94
  - デモでは**音声は15秒まで**しか受け付けません
95
  - 日本語のみ対応 (Japanese only)
96
  - 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています
@@ -102,6 +115,7 @@ generate_kwargs = {
102
  "num_beams": 1,
103
  "no_repeat_ngram_size": 5,
104
  "max_new_tokens": 64, # 結果が長いときは途中で打ち切られる
 
105
  }
106
  ```
107
  """
@@ -113,16 +127,16 @@ with gr.Blocks() as app:
113
  with gr.Column():
114
  gr.Markdown("### Anime-Whisper")
115
  button_galgame = gr.Button("Transcribe with Anime-Whisper")
116
- output_galgame = gr.File(label="Download Anime-Whisper Transcription")
117
  gr.Markdown("### Comparison")
118
  button_others = gr.Button("Transcribe with other models")
119
  with gr.Row():
120
  with gr.Column():
121
  gr.Markdown("### Whisper-Large-V3-Turbo")
122
- output_v3 = gr.File(label="Download Whisper-Large-V3-Turbo Transcription")
123
  with gr.Column():
124
  gr.Markdown("### Kotoba-Whisper-V2.0")
125
- output_kotoba_v2 = gr.File(label="Download Kotoba-Whisper-V2.0 Transcription")
126
 
127
  button_galgame.click(
128
  transcribe_anime_whisper,
 
20
  "num_beams": 1,
21
  "no_repeat_ngram_size": 5,
22
  "max_new_tokens": 64,
23
+ "return_timestamps": True, # Necesario para obtener los tiempos
24
  }
25
 
26
  model_dict = {
 
42
  logger.success("Pipelines initialized!")
43
 
44
 
45
def _format_srt_timestamp(seconds):
    """Return *seconds* formatted as an SRT timestamp (``HH:MM:SS,mmm``)."""
    # Work in integer milliseconds so float noise (e.g. 1.001 % 1 * 1000
    # == 0.9999...) cannot truncate the millisecond field, and so the hour
    # field keeps counting past 24 instead of wrapping like time.gmtime().
    total_ms = max(0, round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def save_as_srt(transcription, timestamps, output_path):
    """Write transcription segments to a SubRip (.srt) file.

    Args:
        transcription: Iterable of segment texts.
        timestamps: Iterable of ``(start, end)`` pairs in seconds, parallel
            to ``transcription``. Either value may be ``None``; a missing
            start is treated as 0 and a missing end falls back to the start,
            so an open-ended final segment does not crash the export.
        output_path: Path of the .srt file to create (overwritten if present).

    Extra items in the longer of the two iterables are ignored (``zip``).
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for idx, (text, (start, end)) in enumerate(
            zip(transcription, timestamps), start=1
        ):
            start = 0.0 if start is None else start
            end = start if end is None else end
            f.write(
                f"{idx}\n"
                f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
                f"{text}\n\n"
            )
55
+
56
+
57
  @spaces.GPU
58
  def transcribe_common(audio: str, model: str) -> str:
59
  if not audio:
 
61
  filename = Path(audio).name
62
  logger.info(f"Model: {model}")
63
  logger.info(f"Audio: {filename}")
64
+
65
  try:
66
  y, sr = librosa.load(audio, mono=True, sr=16000)
67
  except Exception as e:
 
73
 
74
  duration = librosa.get_duration(y=y, sr=sr)
75
  logger.info(f"Duration: {duration:.2f}s")
 
 
 
76
 
77
  start_time = time.time()
78
+ result = pipe_dict[model](y, generate_kwargs=generate_kwargs)
79
  end_time = time.time()
80
+
81
+ transcription = result["text"]
82
+ timestamps = result["chunks"] # Esto contiene las marcas de tiempo
 
 
 
83
 
84
+ logger.success(f"Finished in {end_time - start_time:.2f}s\n{transcription}")
85
+
86
+ # Guardar resultado en un archivo .srt
87
+ output_path = f"{Path(filename).stem}.srt"
88
+ save_as_srt([chunk["text"] for chunk in timestamps], [(chunk["timestamp_start"], chunk["timestamp_end"]) for chunk in timestamps], output_path)
89
+
90
  logger.info(f"Transcription saved to {output_path}")
91
+ return transcription
92
 
93
 
94
  def transcribe_others(audio) -> tuple[str, str]:
 
103
 
104
  initial_md = """
105
  # Anime-Whisper Demo
106
+ [**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発話もうまく台本調に書き起こされます。
107
  - デモでは**音声は15秒まで**しか受け付けません
108
  - 日本語のみ対応 (Japanese only)
109
  - 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています
 
115
  "num_beams": 1,
116
  "no_repeat_ngram_size": 5,
117
  "max_new_tokens": 64, # 結果が長いときは途中で打ち切られる
118
+ "return_timestamps": True, # Para incluir tiempos
119
  }
120
  ```
121
  """
 
127
  with gr.Column():
128
  gr.Markdown("### Anime-Whisper")
129
  button_galgame = gr.Button("Transcribe with Anime-Whisper")
130
+ output_galgame = gr.Textbox(label="Result")
131
  gr.Markdown("### Comparison")
132
  button_others = gr.Button("Transcribe with other models")
133
  with gr.Row():
134
  with gr.Column():
135
  gr.Markdown("### Whisper-Large-V3-Turbo")
136
+ output_v3 = gr.Textbox(label="Result")
137
  with gr.Column():
138
  gr.Markdown("### Kotoba-Whisper-V2.0")
139
+ output_kotoba_v2 = gr.Textbox(label="Result")
140
 
141
  button_galgame.click(
142
  transcribe_anime_whisper,