Spaces: Runtime error

Commit e698260
Parent(s): a442a66
Version 0.1

Changed files:
- adjust.py +126 -0
- app.py +55 -20
- diarization.py +7 -4
- file_name.py +15 -0
- lang_ui.py +102 -0
- list.py +0 -0
- set_up.py +65 -0
- transcribe.py +36 -37
- ui.py +130 -0
- utils.py +37 -0
- video_tool.py +11 -0
adjust.py
ADDED
@@ -0,0 +1,126 @@
from ui import *
from pydub import AudioSegment
from utils import load_groups_json
from video_tool import add_subtitle_to_video
import gradio as gr
import os
import utils
import file_name

min_duration_ms = 10000
current_pos = 0

speaker_to_name = {"Speaker": "Name"}
speaker_to_sample = {"Speaker": "File"}

sample_groups, _ = load_groups_json()


def get_current():
    name = list(speaker_to_name.values())[current_pos]
    sample = list(speaker_to_sample.values())[current_pos]
    return [name, sample]


def prepare_output(input_file):
    if input_file is None or not os.path.exists(input_file):
        return

    speakers = get_speakers(input_file)
    adjust_speaker_update = adjust_speaker.update(
        speakers[0], interactive=True)
    adjust_audio_update = adjust_audio.update(speakers[1], interactive=True)
    prev_button_update = prev_button.update(interactive=True)
    next_button_update = next_button.update(interactive=True)
    adjust_button_update = adjust_button.update(interactive=True)
    # Return adjust speaker, adjust audio, previous, next, adjust button
    return [adjust_speaker_update, adjust_audio_update, prev_button_update, next_button_update, adjust_button_update]


def get_speakers(input_file):
    if (input_file == None):
        return [None, None]

    global speaker_to_name
    global speaker_to_sample

    speaker_to_name = {}
    speaker_to_sample = {}

    for speaker in sample_groups:
        for suffix in range(1, 100):
            file_path = f"{speaker}-{suffix}.wav"
            if os.path.exists(file_path):
                audio_segment = AudioSegment.from_file(file_path)
                if len(audio_segment) > min_duration_ms:
                    print(f"Found file: {file_path}")
                    print(
                        f"File duration: {len(audio_segment) / 1000} seconds")
                    break

        temp_file_name = f"{speaker}-sample.wav"
        audio_segment[:10 * 1000].export(temp_file_name, format="wav")
        speaker_to_sample[speaker] = temp_file_name
        speaker_to_name[speaker] = speaker
    return get_current()


def change_name(to_name):
    current_speaker = sample_groups[current_pos]
    speaker_to_name[current_speaker] = to_name
    print(str(get_current()))


def get_speakers_next(to_name):
    change_name(to_name)
    global current_pos
    if (current_pos < len(sample_groups) - 1):
        current_pos += 1
    return get_current()


def get_speakers_previous(to_name):
    change_name(to_name)
    global current_pos
    if (current_pos > 0):
        current_pos -= 1
    return get_current()


def start_adjust(to_name, progress=gr.Progress()):
    change_name(to_name)

    # Replacing texts
    progress(0.4, desc=ui_lang["progress_adjust_speaker"])
    transcribe_txt_list, subtitle_txt_list = utils.read_transcribe_subtitle_file(
        False)
    modified_transcribe = replace_text(transcribe_txt_list)
    modified_subtitle = replace_text(subtitle_txt_list)
    utils.write_transcribe_subtitle_file(
        modified_transcribe, modified_subtitle, True)

    # Adding subtitle to video
    progress(0.8, desc=ui_lang["progress_add_subtitle"])
    add_subtitle_to_video(
        file_name.input_file, file_name.subtitle_adjusted_file, file_name.video_subtitle_file, file_name.start_time_for_adjustment, file_name.end_time_for_adjustment)

    # Return video file link, transcribe string, transcribe.txt, subtitle.txt
    transcribe_txt_list, _ = utils.read_transcribe_subtitle_file(True)
    print(line for line in transcribe_txt_list)
    transcribe_txt = "\n".join(transcribe_txt_list)
    return [
        file_name.video_subtitle_file,
        transcribe_txt,
        [file_name.transcribe_adjusted_file, file_name.subtitle_adjusted_file]
    ]


def replace_text(lines):
    modified_lines = []
    for line in lines:
        for key, value in speaker_to_name.items():
            line = line.replace(key, value)
            print(f"Replacing {key} with {value}")
        modified_lines.append(line)
    print(modified_lines)
    return modified_lines
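To illustrate what the speaker-adjustment step does to the transcript, here is a minimal sketch of the replace_text logic run against a toy transcript; the speaker labels and lines are made up for the example and are not part of the commit.

speaker_to_name = {"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"}

def replace_text(lines):
    # Substitute every known speaker label with its chosen display name
    modified_lines = []
    for line in lines:
        for key, value in speaker_to_name.items():
            line = line.replace(key, value)
        modified_lines.append(line)
    return modified_lines

print(replace_text(["[SPEAKER_00] hello\n", "[SPEAKER_01] hi\n"]))
# ['[Alice] hello\n', '[Bob] hi\n']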
app.py
CHANGED
@@ -1,27 +1,62 @@
-from
-from
-from
-import ffmpeg
+from ui import *
+from adjust import *
+from set_up import prepare_input
 import gradio as gr
 
+with gr.Blocks() as demo:
+    ui_lang_radio.render()
+    ui_lang_radio.change(change_lang, inputs=ui_lang_radio,
+                         outputs=comp_to_update)
 
+    top_markdown.render()
+    with gr.Column():
+        with gr.Row():
+            with gr.Column():
+                input_video.render()
+                input_video.change(get_duration, input_video, [
+                    start_time, end_time])
+                with gr.Row():
+                    start_time.render()
+                    end_time.render()
+            with gr.Column():
+                lang_radio.render()
+                model_dropdown.render()
+                summary_checkbox.render()
+        start_button.render()
+        start_button.click(prepare_input,
+                           [input_video, start_time, end_time, lang_radio,
+                            model_dropdown, summary_checkbox],
+                           [output_video, output_transcribe, output_file])
 
+    bottom_markdown.render()
+    with gr.Row(equal_height=False):
+        with gr.Column():
+            output_video.render()
+            output_file.render()
+            output_file.change(prepare_output, inputs=output_file, outputs=[
+                adjust_speaker, adjust_audio, prev_button, next_button, adjust_button])
+        with gr.Column():
+            output_transcribe.render()
+            output_summary.render()
 
+    middle_markdown.render()
+    with gr.Row(equal_height=False):
+        adjust_audio.render()
+        adjust_speaker.render()
+    with gr.Row():
+        prev_button.render()
+        next_button.render()
+    prev_button.click(get_speakers_previous, inputs=[adjust_speaker], outputs=[
+        adjust_speaker, adjust_audio])
+    next_button.click(get_speakers_next, inputs=[adjust_speaker], outputs=[
+        adjust_speaker, adjust_audio])
+
+    adjust_button.render()
+    adjust_button.click(start_adjust, inputs=[adjust_speaker], outputs=[
+        output_video, output_transcribe, output_file])
+
+    with gr.Accordion("Copyright"):
+        gr.Markdown("Created with OpenAI Whisper and Huggingface")
 
 if __name__ == "__main__":
+    demo.queue().launch()
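app.py relies on the Gradio pattern of building components ahead of time (in ui.py) and placing them inside the Blocks layout with .render(). A self-contained sketch of that pattern, using a hypothetical greet function that is not part of this commit:

import gradio as gr

# Components created up front, outside any layout
name_box = gr.Textbox(label="Name")
greet_button = gr.Button("Greet")
result_box = gr.Textbox(label="Greeting")

def greet(name):
    return f"Hello, {name}!"

with gr.Blocks() as demo:
    # .render() places each pre-built component at this point in the layout
    name_box.render()
    greet_button.render()
    result_box.render()
    greet_button.click(greet, inputs=name_box, outputs=result_box)

if __name__ == "__main__":
    demo.queue().launch()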
diarization.py
CHANGED
@@ -1,11 +1,14 @@
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
 import gradio as gr
-import os
 import torch
 import json
+import gc
 
+gc.collect()
+torch.cuda.empty_cache()
+
+hugging_face_token = "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"
 pipeline = Pipeline.from_pretrained(
     'pyannote/speaker-diarization', use_auth_token=hugging_face_token)
 device = torch.device("cuda")

@@ -48,7 +51,7 @@ def audio_segmentation(input_file, speaker_groups_dict):
 
 
 def save_groups_json(sample_groups_list: list, speaker_groups_dict: dict):
-    with open("sample_groups.json", "w") as json_file_sample:
+    with open("sample_groups.json", "w", encoding="utf-8") as json_file_sample:
         json.dump(sample_groups_list, json_file_sample)
-    with open("speaker_groups.json", "w") as json_file_speaker:
+    with open("speaker_groups.json", "w", encoding="utf-8") as json_file_speaker:
         json.dump(speaker_groups_dict, json_file_speaker)
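The hunk drops the import of os and hardcodes the Hugging Face token in the source. The removed line suggests the token may previously have been read at runtime; a sketch of that safer setup, assuming an HF_TOKEN environment variable that is not part of this commit:

import os
from pyannote.audio import Pipeline

# Read the token from the environment instead of committing it to the repo
hugging_face_token = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained(
    'pyannote/speaker-diarization', use_auth_token=hugging_face_token)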
file_name.py
ADDED
@@ -0,0 +1,15 @@

def get_title():
    return input_file[:-4]


input_file = ""
audio_file = "input.wav"
transcribe_file = "transcribe.txt"
subtitle_file = "subtitle.srt"
transcribe_adjusted_file = "transcribe_adjusted.txt"
subtitle_adjusted_file = "subtitle_adjusted.srt"
video_subtitle_file = f"output_{get_title()}.mp4"

start_time_for_adjustment = "00:00:00"
end_time_for_adjustment = "00:10:00"
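get_title() simply drops the last four characters of the input path, i.e. a ".mp4"-style extension. A quick sketch of what video_subtitle_file ends up as, with a made-up input path:

input_file = "interview.mp4"

def get_title():
    # Drops the last four characters, i.e. a ".mp4"-style extension
    return input_file[:-4]

video_subtitle_file = f"output_{get_title()}.mp4"
print(video_subtitle_file)   # output_interview.mp4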
lang_ui.py
ADDED
@@ -0,0 +1,102 @@
english_ui = {
    "top_markdown": """
    ## Transcribe
    Insert your video and set the options for transcribing
    """,
    "middle_markdown": """
    ## Adjustment
    Listen to the clips below and type in the name according to the speaker's sound. After that, please click the 'Adjust Speaker' button to adjust the result above accordingly.
    """,
    "bottom_markdown": """
    ## Result
    The following is the result of the transcribe
    """,

    "input_video_warning": "Please submit a video",
    "start_time_warning": "Please provide a correct start time",
    "end_time_warning": "Please provide a correct end time",
    "lang_radio_warning": "Please choose a language for the video",
    "model_dropdown_warning": "Please choose a model size for the video",

    "input_video_label": "Insert video",
    "start_time_label": "Start time",
    "end_time_label": "End time",
    "lang_radio_choices": ["English", "Indonesian", "Automatic"],
    "lang_radio_label": "Language",
    "lang_radio_info": "What language do they speak in the video?",
    "model_dropdown_choices": ["tiny", "base", "small", "medium", "large"],
    "model_dropdown_label": "Model size",
    "model_dropdown_info": "The higher the model, the more accurate the transcript will be but the process will take longer",
    "summary_checkbox_label": "Use summary",
    "summary_checkbox_info": "Do you need a summary of the transcribe? Note: Result might be inaccurate",
    "start_button_value": "Start Transcribing",

    "adjust_speaker_value": "Speaker name",
    "prev_button_value": "Previous Speaker",
    "next_button_value": "Next Speaker",
    "adjust_button_value": "Adjust Speaker",

    "output_video_label": "Video with subtitle",
    "output_transcribe_label": "Transcribe result",

    "progress_starting_process": "Starting process",
    "progress_preparing_video": "Preparing video",
    "progress_acquiring_diarization": "Acquiring diarization",
    "progress_transcribing_audio": "Transcribing audio",
    "progress_adjust_speaker": "Adjusting speakers",
    "progress_add_subtitle": "Adding subtitle on video"
}

indonesia_ui = {
    "top_markdown": """
    ## Transkrip
    Masukkan video dan sesuaikan opsi untuk transkrip
    """,
    "middle_markdown": """
    ## Penyesuaian
    Dengarkan cuplikan suara pembicara dan ubah nama sesuai suara pembicara. Setelah itu, silahkan tekan tombol 'Sesuaikan Pembicara' untuk menyesuaikan nama pembicara pada hasil di atas
    """,
    "bottom_markdown": """
    ## Hasil
    Berikut hasil akhir dari transkrip
    """,

    "input_video_warning": "Mohon masukkan video",
    "start_time_warning": "Mohon berikan waktu mulai yang sesuai",
    "end_time_warning": "Mohon berikan waktu selesai yang sesuai",
    "lang_radio_warning": "Mohon pilih bahasa yang digunakan dalam video",
    "model_dropdown_warning": "Mohon pilih ukuran model yang digunakan untuk video",

    "input_video_label": "Masukkan video",
    "start_time_label": "Waktu mulai",
    "end_time_label": "Waktu selesai",
    "lang_radio_choices": ["Bahasa Inggris", "Bahasa Indonesia", "Otomatis"],
    "lang_radio_label": "Bahasa",
    "lang_radio_info": "Bahasa apa yang digunakan dalam video?",
    "model_dropdown_choices": ["mungil", "dasar", "kecil", "sedang", "besar"],
    "model_dropdown_label": "Ukuran model",
    "model_dropdown_info": "Semakin tinggi modelnya, semakin akurat transkripnya namun prosesnya akan membutuhkan waktu yang lebih lama",
    "summary_checkbox_label": "Gunakan kesimpulan",
    "summary_checkbox_info": "Apakah anda memerlukan kesimpulan dari transkrip? Hasil mungkin tidak sepenuhnya akurat",
    "start_button_value": "Mulai Transkrip",

    "adjust_speaker_value": "Nama pembicara",
    "prev_button_value": "Pembicara Sebelumnya",
    "next_button_value": "Pembicara Selanjutnya",
    "adjust_button_value": "Sesuaikan Pembicara",

    "output_video_label": "Video dengan subtitle",
    "output_transcribe_label": "Hasil transkrip",

    "progress_starting_process": "Memulai proses",
    "progress_preparing_video": "Mempersiapkan video",
    "progress_acquiring_diarization": "Mengenali pembicara",
    "progress_transcribing_audio": "Mendapatkan transkrip suara",
    "progress_adjust_speaker": "Menyesuaikan pembicara",
    "progress_add_subtitle": "Menambahkan subtitle pada video"
}


def get_ui_lang(index):
    selectable_ui_lang = [english_ui, indonesia_ui]
    return selectable_ui_lang[index]
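Both dictionaries expose the same keys, so the rest of the app can switch languages by index alone. A small usage sketch (indices follow the order in get_ui_lang; output values are taken from the dictionaries above):

from lang_ui import get_ui_lang

ui_lang = get_ui_lang(0)                 # 0 = English
print(ui_lang["start_button_value"])     # Start Transcribing
ui_lang = get_ui_lang(1)                 # 1 = Bahasa Indonesia
print(ui_lang["start_button_value"])     # Mulai Transkrip

Because change_lang in ui.py looks the same keys up again after switching, any key added to english_ui also needs a counterpart in indonesia_ui.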
list.py
DELETED
File without changes
set_up.py
ADDED
@@ -0,0 +1,65 @@
from ui import *
from diarization import start_diarization
from transcribe import start_transcribe
from video_tool import convert_video_to_audio, add_subtitle_to_video
import gradio as gr
import re
import os
import file_name
import utils


def prepare_input(input_file, start_time, end_time, lang, model_size, use_summary, progress=gr.Progress()):
    gr.Info(ui_lang["progress_starting_process"])

    if input_file is None or not os.path.exists(input_file):
        gr.Warning(ui_lang["input_video_warning"])
        return [None, None, [None, None]]
    if validate_time_format(start_time) is False:
        gr.Warning(ui_lang["start_time_warning"])
        return [None, None, [None, None]]
    if validate_time_format(end_time) is False:
        gr.Warning(ui_lang["end_time_warning"])
        return [None, None, [None, None]]
    if lang is None:
        gr.Warning(ui_lang["lang_radio_warning"])
        return [None, None, [None, None]]
    if model_size is None:
        gr.Warning(ui_lang["model_dropdown_warning"])
        return [None, None, [None, None]]

    file_name.input_file = input_file
    file_name.start_time_for_adjustment = start_time
    file_name.end_time_for_adjustment = end_time

    # Convert video to audio
    progress(0.2, desc=ui_lang["progress_preparing_video"])
    convert_video_to_audio(
        input_file, file_name.audio_file, start_time, end_time)

    # Start diarization
    progress(0.4, desc=ui_lang["progress_acquiring_diarization"])
    start_diarization(file_name.audio_file)

    # Start transcribing
    progress(0.6, desc=ui_lang["progress_transcribing_audio"])
    start_transcribe(lang, model_size, progress)

    # Add subtitle to video
    progress(0.8, desc=ui_lang["progress_add_subtitle"])
    add_subtitle_to_video(input_file, file_name.subtitle_file,
                          file_name.video_subtitle_file, start_time, end_time)

    # Return video file link, transcribe string, transcribe.txt, subtitle.txt
    transcribe_txt_list, _ = utils.read_transcribe_subtitle_file(False)
    transcribe_txt = "\n".join(transcribe_txt_list)
    return [
        file_name.video_subtitle_file,
        transcribe_txt,
        [file_name.transcribe_file, file_name.subtitle_file]
    ]


def validate_time_format(input_string):
    pattern = re.compile(r'^\d{2}:\d{2}:\d{2}$')
    return pattern.match(input_string) is not None
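prepare_input validates the HH:MM:SS fields with validate_time_format before running the convert, diarize, transcribe and subtitle steps. A quick check of the regex with made-up inputs (not part of the commit):

import re

def validate_time_format(input_string):
    # Accepts exactly two digits each for hours, minutes and seconds, e.g. 00:05:30
    pattern = re.compile(r'^\d{2}:\d{2}:\d{2}$')
    return pattern.match(input_string) is not None

print(validate_time_format("00:05:30"))   # True
print(validate_time_format("0:05:30"))    # False (single-digit hour)
print(validate_time_format("00-05-30"))   # False (wrong separator)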
transcribe.py
CHANGED
@@ -1,59 +1,58 @@
 from faster_whisper import WhisperModel
+from utils import load_groups_json
 import torch
 import gc
-import
+import file_name
+import utils
 
 gc.collect()
 torch.cuda.empty_cache()
 
+model_lang_list = ['en', 'id', None]
+model_size = ["tiny", "base", "small", "medium", "large"]
 
 
-def start_transcribe(progress):
+def start_transcribe(lang_choice: int, model_size_choice: int, progress):
+
+    print(
+        f"Starting transcribing with model size {model_size[model_size_choice]} for language {model_lang_list[lang_choice]}")
+
+    model = WhisperModel(
+        model_size[model_size_choice], device="cuda", compute_type="int8_float16")
     _, speaker_groups = load_groups_json()
 
+    subtitle_txt_list = []
+    transcribe_txt_list = []
     for speaker, _ in zip(speaker_groups, progress.tqdm(speaker_groups, desc="Transcribing")):
         # Transcribe and save temp file
         audiof = f"{speaker}.wav"
         print(f"Loading {audiof}")
         segments, _ = model.transcribe(
-            audio=audiof, language=
+            audio=audiof, language=model_lang_list[lang_choice], word_timestamps=True)
         segments_list = list(segments)
 
+        speaker_txt_list = []
+        shift = speaker_groups[speaker][0] + 1
+        print(f"Current starting point: {shift}s or {time_str(shift)}")
         for segment in segments_list:
-            start =
-            end =
+            start = time_str(segment.start + shift)
+            end = time_str(segment.end + shift)
+
             name = str(speaker)[:10]
+            segment_txt = segment.text
+            speaker_txt_list.append(segment_txt)
-                f"{len(subtitle_txt) + 1}\n{start} --> {end}\n[{name}] {text}\n\n")
-            # Appending text for each segment to print
-            text_list_to_print.append(text)
-
-        # Print full text for each speaker turn
-        text = "\n".join(text_list_to_print)
-        print(text)
-        # Append to complete transcribe file
-        with open("transcribe.txt", "a") as file:
-            file.write(f"[{name}] {text}\n")
-
-        # Appending subtitle txt for each segment
-    with open("subtitle.srt", "w") as file:
-        file.writelines(subtitle_txt)
-    return ["transcribe.txt", "subtitle.srt"]
-
-
-def timeStr(t):
-    return '{0:02d}:{1:02d}:{2:06.2f}'.format(round(t // 3600),
-                                              round(t % 3600 // 60),
-                                              t % 60)
 
+            subtitle = f"{len(subtitle_txt_list) + 1}\n{start} --> {end}\n[{name}] {segment_txt}\n\n"
+            subtitle_txt_list.append(subtitle)
 
+        speaker_txt = " ".join(speaker_txt_list)
+        transcribe_txt_list.append(f"[{name}] {speaker_txt}\n")
+
+    utils.write_transcribe_subtitle_file(
+        transcribe_txt_list, subtitle_txt_list, False)
 
 
+def time_str(t):
+    return '{0:02d}:{1:02d}:{2:06.3f}'.format(round(t // 3600),
+                                              round(t % 3600 // 60),
+                                              t % 60)
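Each subtitle entry is assembled as a numbered SRT-style block with "start --> end" timestamps shifted by the speaker group's offset. A toy example of the block the f-string produces, with made-up timings and a made-up speaker label (note that strict SRT expects a comma, not a dot, before the milliseconds):

def time_str(t):
    # hours:minutes:seconds with millisecond precision, e.g. 00:01:05.250
    return '{0:02d}:{1:02d}:{2:06.3f}'.format(round(t // 3600),
                                              round(t % 3600 // 60),
                                              t % 60)

subtitle_txt_list = []
start, end = time_str(65.25), time_str(67.80)
name, segment_txt = "SPEAKER_00", "Hello everyone."
subtitle = f"{len(subtitle_txt_list) + 1}\n{start} --> {end}\n[{name}] {segment_txt}\n\n"
print(subtitle)
# 1
# 00:01:05.250 --> 00:01:07.800
# [SPEAKER_00] Hello everyone.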
ui.py
ADDED
@@ -0,0 +1,130 @@
from lang_ui import get_ui_lang
import gradio as gr
import ffmpeg

# Display available languages and set default UI language
ui_lang_index = 1
available_ui_lang = ["English", "Bahasa Indonesia"]
ui_lang = get_ui_lang(ui_lang_index)

lang_radio_choice = 1
model_dropdown_choice = 2

# Transcribe components
ui_lang_radio = gr.Radio(
    available_ui_lang, type="index", value=available_ui_lang[ui_lang_index], interactive=True, show_label=False)
top_markdown = gr.Markdown(
    ui_lang["top_markdown"])
# TODO Add video by link
input_video = gr.Video(
    label=ui_lang["input_video_label"], interactive=True)
start_time = gr.Textbox(
    max_lines=1, placeholder="00:00:00", label=ui_lang["start_time_label"], interactive=False)
end_time = gr.Textbox(
    max_lines=1, placeholder="01:00:00", label=ui_lang["end_time_label"], interactive=False)
# TODO Use custom language
lang_radio = gr.Radio(
    ui_lang["lang_radio_choices"], label=ui_lang["lang_radio_label"], info=ui_lang["lang_radio_info"], type='index', interactive=True)
model_dropdown = gr.Dropdown(
    ui_lang["model_dropdown_choices"], label=ui_lang["model_dropdown_label"], info=ui_lang["model_dropdown_info"], type='index', interactive=True)
summary_checkbox = gr.Checkbox(
    label=ui_lang["summary_checkbox_label"], info=ui_lang["summary_checkbox_info"], interactive=False, visible=False)
start_button = gr.Button(
    ui_lang["start_button_value"], variant="primary", interactive=True)

# Adjust components
middle_markdown = gr.Markdown(
    ui_lang["middle_markdown"])
adjust_audio = gr.Audio(interactive=False)
adjust_speaker = gr.Textbox(
    label=ui_lang["adjust_speaker_value"], interactive=False)
prev_button = gr.Button(ui_lang["prev_button_value"], interactive=False)
next_button = gr.Button(ui_lang["next_button_value"], interactive=False)
adjust_button = gr.Button(
    ui_lang["adjust_button_value"], variant="primary", interactive=False)

# Result components
bottom_markdown = gr.Markdown(
    ui_lang["bottom_markdown"]
)
output_video = gr.Video(label=ui_lang["output_video_label"], interactive=False)
output_file = gr.File(file_count="multiple", interactive=False)
output_transcribe = gr.Textbox(
    label=ui_lang["output_transcribe_label"], interactive=False, show_copy_button=True)
output_summary = gr.Textbox(
    interactive=False, show_copy_button=True, visible=False)


def time_str(t):
    return '{0:02d}:{1:02d}:{2:02d}'.format(round(t // 3600),
                                            round(t % 3600 // 60),
                                            round(t % 60))


def get_duration(input_file):
    print("Checking file")
    if input_file is None:
        gr.Warning(ui_lang["input_video_warning"])
        return [
            start_time.update(None, interactive=False),
            end_time.update(None, interactive=False)
        ]
    print("Getting duration")
    info_json = ffmpeg.probe(input_file)
    print("Probing finished")
    duration_seconds = float(info_json['format']['duration'])
    duration_formatted = time_str(duration_seconds)
    return [
        start_time.update("00:00:00", interactive=True),
        end_time.update(duration_formatted, interactive=True)
    ]


# Change language function


def change_lang(input):
    global ui_lang
    ui_lang = get_ui_lang(input)
    print(f"Change language to {available_ui_lang[input]}")
    return [
        # Top
        top_markdown.update(
            ui_lang["top_markdown"]),
        input_video.update(
            label=ui_lang["input_video_label"]),
        start_time.update(
            label=ui_lang["start_time_label"]),
        end_time.update(
            label=ui_lang["end_time_label"]),
        lang_radio.update(
            choices=ui_lang["lang_radio_choices"], value=None, label=ui_lang["lang_radio_label"], info=ui_lang["lang_radio_info"],),
        model_dropdown.update(
            choices=ui_lang["model_dropdown_choices"], value=None, label=ui_lang["model_dropdown_label"], info=ui_lang["model_dropdown_info"]),
        summary_checkbox.update(
            label=ui_lang["summary_checkbox_label"], info=ui_lang["summary_checkbox_info"]),
        start_button.update(
            ui_lang["start_button_value"]),

        # Middle
        middle_markdown.update(
            ui_lang["middle_markdown"]),
        adjust_speaker.update(label=ui_lang["adjust_speaker_value"]),
        prev_button.update(
            ui_lang["prev_button_value"]),
        next_button.update(
            ui_lang["next_button_value"]),
        adjust_button.update(
            ui_lang["adjust_button_value"]),

        # Bottom
        bottom_markdown.update(
            ui_lang["bottom_markdown"]),
        output_video.update(label=ui_lang["output_video_label"]),
        output_transcribe.update(label=ui_lang["output_transcribe_label"]),
    ]


# comp_to_update and change_lang return must always be in equal number
comp_to_update = [
    top_markdown, input_video, start_time, end_time, lang_radio, model_dropdown, summary_checkbox, start_button, middle_markdown, adjust_speaker, prev_button, next_button, adjust_button, bottom_markdown, output_video, output_transcribe]
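comp_to_update has to stay in one-to-one correspondence, in order, with the list of updates returned by change_lang. A minimal sanity check one could run after importing ui (a sketch, not part of the commit):

from ui import change_lang, comp_to_update

updates = change_lang(0)   # switch the UI strings to English
assert len(updates) == len(comp_to_update), "every listed component needs exactly one update"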
utils.py
ADDED
@@ -0,0 +1,37 @@
import json
import file_name


def load_groups_json():
    with open("sample_groups.json", "r", encoding="utf-8") as json_file_sample:
        sample_groups_list: list = json.load(json_file_sample)
    with open("speaker_groups.json", "r", encoding="utf-8") as json_file_speaker:
        speaker_groups_dict: dict = json.load(json_file_speaker)
    return sample_groups_list, speaker_groups_dict


def write_transcribe_subtitle_file(transcribe_txt_list: list, subtitle_txt_list: list, adjustment: bool):
    transcribe = file_name.transcribe_file
    subtitle = file_name.subtitle_file
    if adjustment:
        transcribe = file_name.transcribe_adjusted_file
        subtitle = file_name.subtitle_adjusted_file

    with open(transcribe, "w", encoding="utf-8") as file:
        file.writelines(transcribe_txt_list)
    with open(subtitle, "w", encoding="utf-8") as file:
        file.writelines(subtitle_txt_list)


def read_transcribe_subtitle_file(adjustment: bool):
    transcribe = file_name.transcribe_file
    subtitle = file_name.subtitle_file
    if adjustment:
        transcribe = file_name.transcribe_adjusted_file
        subtitle = file_name.subtitle_adjusted_file

    with open(transcribe, "r", encoding="utf-8") as file:
        transcribe_txt_list = file.readlines()
    with open(subtitle, "r", encoding="utf-8") as file:
        subtitle_txt_list = file.readlines()
    return transcribe_txt_list, subtitle_txt_list
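The adjustment flag selects between the plain files and the *_adjusted pair defined in file_name.py. A round-trip sketch with made-up content (this writes transcribe.txt and subtitle.srt in the working directory; the sample lines are illustrative only):

import utils

utils.write_transcribe_subtitle_file(
    ["[SPEAKER_00] hello\n"],
    ["1\n00:00:00.000 --> 00:00:01.000\n[SPEAKER_00] hello\n\n"],
    False)                       # False = plain files, True = *_adjusted files
lines, subs = utils.read_transcribe_subtitle_file(False)
print(lines[0].strip())          # [SPEAKER_00] hello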
video_tool.py
ADDED
@@ -0,0 +1,11 @@
import ffmpeg


def convert_video_to_audio(input_file, output_file, start_time, end_time):
    ffmpeg.input(input_file, ss=start_time, to=end_time).audio.output(
        output_file, format="wav").run(overwrite_output=True)


def add_subtitle_to_video(input_file, subtitle_file, output_file, start_time, end_time):
    ffmpeg.input(input_file, ss=start_time, to=end_time).output(
        output_file, vf='subtitles=' + subtitle_file, preset='ultrafast', acodec='copy').run(overwrite_output=True)
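Both helpers wrap ffmpeg-python, trimming the input with the ss/to options before exporting. A usage sketch with illustrative file names that are not part of the commit:

from video_tool import convert_video_to_audio, add_subtitle_to_video

# Extract the first ten minutes as a WAV file, then burn the subtitles
# onto the same range of the video.
convert_video_to_audio("interview.mp4", "input.wav", "00:00:00", "00:10:00")
add_subtitle_to_video("interview.mp4", "subtitle.srt",
                      "output_interview.mp4", "00:00:00", "00:10:00")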