chompionsawelo committed
Commit e698260 · Parent: a442a66

Version 0.1

Files changed (11)
  1. adjust.py +126 -0
  2. app.py +55 -20
  3. diarization.py +7 -4
  4. file_name.py +15 -0
  5. lang_ui.py +102 -0
  6. list.py +0 -0
  7. set_up.py +65 -0
  8. transcribe.py +36 -37
  9. ui.py +130 -0
  10. utils.py +37 -0
  11. video_tool.py +11 -0
adjust.py ADDED
@@ -0,0 +1,126 @@
+ from ui import *
+ from pydub import AudioSegment
+ from utils import load_groups_json
+ from video_tool import add_subtitle_to_video
+ import gradio as gr
+ import os
+ import utils
+ import file_name
+
+ min_duration_ms = 10000
+ current_pos = 0
+
+ speaker_to_name = {"Speaker": "Name"}
+ speaker_to_sample = {"Speaker": "File"}
+
+ sample_groups, _ = load_groups_json()
+
+
+ def get_current():
+     name = list(speaker_to_name.values())[current_pos]
+     sample = list(speaker_to_sample.values())[current_pos]
+     return [name, sample]
+
+
+ def prepare_output(input_file):
+     if input_file is None or not os.path.exists(input_file):
+         return
+
+     speakers = get_speakers(input_file)
+     adjust_speaker_update = adjust_speaker.update(
+         speakers[0], interactive=True)
+     adjust_audio_update = adjust_audio.update(speakers[1], interactive=True)
+     prev_button_update = prev_button.update(interactive=True)
+     next_button_update = next_button.update(interactive=True)
+     adjust_button_update = adjust_button.update(interactive=True)
+     # Return adjust speaker, adjust audio, previous, next, adjust button
+     return [adjust_speaker_update, adjust_audio_update, prev_button_update, next_button_update, adjust_button_update]
+
+
+ def get_speakers(input_file):
+     if input_file is None:
+         return [None, None]
+
+     global speaker_to_name
+     global speaker_to_sample
+
+     speaker_to_name = {}
+     speaker_to_sample = {}
+
+     for speaker in sample_groups:
+         # Find the first clip for this speaker longer than min_duration_ms
+         for suffix in range(1, 100):
+             file_path = f"{speaker}-{suffix}.wav"
+             if os.path.exists(file_path):
+                 audio_segment = AudioSegment.from_file(file_path)
+                 if len(audio_segment) > min_duration_ms:
+                     print(f"Found file: {file_path}")
+                     print(
+                         f"File duration: {len(audio_segment) / 1000} seconds")
+                     break
+
+         # Export the first 10 seconds of the chosen clip as the sample
+         temp_file_name = f"{speaker}-sample.wav"
+         audio_segment[:10 * 1000].export(temp_file_name, format="wav")
+         speaker_to_sample[speaker] = temp_file_name
+         speaker_to_name[speaker] = speaker
+     return get_current()
+
+
+ def change_name(to_name):
+     current_speaker = sample_groups[current_pos]
+     speaker_to_name[current_speaker] = to_name
+     print(str(get_current()))
+
+
+ def get_speakers_next(to_name):
+     change_name(to_name)
+     global current_pos
+     if current_pos < len(sample_groups) - 1:
+         current_pos += 1
+     return get_current()
+
+
+ def get_speakers_previous(to_name):
+     change_name(to_name)
+     global current_pos
+     if current_pos > 0:
+         current_pos -= 1
+     return get_current()
+
+
+ def start_adjust(to_name, progress=gr.Progress()):
+     change_name(to_name)
+
+     # Replace speaker labels in the transcript and subtitle files
+     progress(0.4, desc=ui_lang["progress_adjust_speaker"])
+     transcribe_txt_list, subtitle_txt_list = utils.read_transcribe_subtitle_file(
+         False)
+     modified_transcribe = replace_text(transcribe_txt_list)
+     modified_subtitle = replace_text(subtitle_txt_list)
+     utils.write_transcribe_subtitle_file(
+         modified_transcribe, modified_subtitle, True)
+
+     # Add the adjusted subtitle to the video
+     progress(0.8, desc=ui_lang["progress_add_subtitle"])
+     add_subtitle_to_video(
+         file_name.input_file, file_name.subtitle_adjusted_file, file_name.video_subtitle_file, file_name.start_time_for_adjustment, file_name.end_time_for_adjustment)
+
+     # Return video file link, transcribe string, transcribe.txt, subtitle.txt
+     transcribe_txt_list, _ = utils.read_transcribe_subtitle_file(True)
+     for line in transcribe_txt_list:
+         print(line)
+     transcribe_txt = "\n".join(transcribe_txt_list)
+     return [
+         file_name.video_subtitle_file,
+         transcribe_txt,
+         [file_name.transcribe_adjusted_file, file_name.subtitle_adjusted_file]
+     ]
+
+
+ def replace_text(lines):
+     modified_lines = []
+     for line in lines:
+         for key, value in speaker_to_name.items():
+             line = line.replace(key, value)
+             print(f"Replacing {key} with {value}")
+         modified_lines.append(line)
+     print(modified_lines)
+     return modified_lines
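For reference, a minimal sketch of what replace_text does to a transcript line once the user has renamed the speakers. The label format (SPEAKER_00 etc.) and the names are illustrative assumptions, not taken from the commit:

    # Hypothetical speaker renaming, mirroring the loop in replace_text
    speaker_to_name = {"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"}
    lines = ["[SPEAKER_00] Hello there\n", "[SPEAKER_01] Hi!\n"]
    adjusted = []
    for line in lines:
        for key, value in speaker_to_name.items():
            line = line.replace(key, value)
        adjusted.append(line)
    print(adjusted)  # ['[Alice] Hello there\n', '[Bob] Hi!\n']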
app.py CHANGED
@@ -1,27 +1,62 @@
- from huggingface_hub import login
- from diarization import start_diarization
- from transcribe import start_transcribe
- import ffmpeg
+ from ui import *
+ from adjust import *
+ from set_up import prepare_input
  import gradio as gr

- def prepare_input(input_file, progress=gr.Progress()):
-     output_file = "input.wav"
-     progress(0.2, desc="Preparing video")
-     ffmpeg.input(input_file).audio.output(
-         output_file, format="wav").run()
-     progress(0.4, desc="Acquiring diarization")
-     start_diarization(output_file)
-     progress(0.6, desc="Transcribing audio")
-     return start_transcribe(progress)
-
- video_interface = gr.Interface(
-     fn=prepare_input,
-     inputs=gr.Video(type="file"),
-     outputs="files",
-     title="Test 2"
- )
+ with gr.Blocks() as demo:
+     ui_lang_radio.render()
+     ui_lang_radio.change(change_lang, inputs=ui_lang_radio,
+                          outputs=comp_to_update)
+
+     top_markdown.render()
+     with gr.Column():
+         with gr.Row():
+             with gr.Column():
+                 input_video.render()
+                 input_video.change(get_duration, input_video, [
+                     start_time, end_time])
+                 with gr.Row():
+                     start_time.render()
+                     end_time.render()
+             with gr.Column():
+                 lang_radio.render()
+                 model_dropdown.render()
+                 summary_checkbox.render()
+         start_button.render()
+         start_button.click(prepare_input,
+                            [input_video, start_time, end_time, lang_radio,
+                             model_dropdown, summary_checkbox],
+                            [output_video, output_transcribe, output_file])
+
+     bottom_markdown.render()
+     with gr.Row(equal_height=False):
+         with gr.Column():
+             output_video.render()
+             output_file.render()
+             output_file.change(prepare_output, inputs=output_file, outputs=[
+                 adjust_speaker, adjust_audio, prev_button, next_button, adjust_button])
+         with gr.Column():
+             output_transcribe.render()
+             output_summary.render()
+
+     middle_markdown.render()
+     with gr.Row(equal_height=False):
+         adjust_audio.render()
+         adjust_speaker.render()
+     with gr.Row():
+         prev_button.render()
+         next_button.render()
+         prev_button.click(get_speakers_previous, inputs=[adjust_speaker], outputs=[
+             adjust_speaker, adjust_audio])
+         next_button.click(get_speakers_next, inputs=[adjust_speaker], outputs=[
+             adjust_speaker, adjust_audio])
+
+     adjust_button.render()
+     adjust_button.click(start_adjust, inputs=[adjust_speaker], outputs=[
+         output_video, output_transcribe, output_file])
+
+     with gr.Accordion("Copyright"):
+         gr.Markdown("Created with OpenAI Whisper and Huggingface")

  if __name__ == "__main__":
-     video_interface.launch()
+     demo.queue().launch()
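The layout above relies on the Gradio pattern of instantiating components up front (in ui.py) and placing them later with .render() inside a gr.Blocks context. A minimal sketch of that pattern, assuming the Gradio 3.x API used throughout this commit:

    import gradio as gr

    greeting = gr.Textbox(label="Greeting")  # created outside the layout

    with gr.Blocks() as demo:
        greeting.render()  # placed into the layout here

    if __name__ == "__main__":
        demo.launch()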
diarization.py CHANGED
@@ -1,11 +1,14 @@
  from pyannote.audio import Pipeline
  from pydub import AudioSegment
  import gradio as gr
- import os
  import torch
  import json
+ import gc

- hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ hugging_face_token = "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"
  pipeline = Pipeline.from_pretrained(
      'pyannote/speaker-diarization', use_auth_token=hugging_face_token)
  device = torch.device("cuda")
@@ -48,7 +51,7 @@ def audio_segmentation(input_file, speaker_groups_dict):


  def save_groups_json(sample_groups_list: list, speaker_groups_dict: dict):
-     with open("sample_groups.json", "w") as json_file_sample:
+     with open("sample_groups.json", "w", encoding="utf-8") as json_file_sample:
          json.dump(sample_groups_list, json_file_sample)
-     with open("speaker_groups.json", "w") as json_file_speaker:
+     with open("speaker_groups.json", "w", encoding="utf-8") as json_file_speaker:
          json.dump(speaker_groups_dict, json_file_speaker)
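Only the hunks around the imports and the JSON helpers are shown here. As a hedged sketch, a pyannote pipeline like the one loaded above is typically applied along these lines (the file name and loop are assumptions, not part of this diff):

    # Hypothetical application of the loaded pipeline
    diarization = pipeline("input.wav")
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{speaker}: {turn.start:.1f}s -> {turn.end:.1f}s")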
file_name.py ADDED
@@ -0,0 +1,15 @@
+
+ def get_title():
+     # Drop the four-character extension (e.g. ".mp4") from the input file name
+     return input_file[:-4]
+
+
+ input_file = ""
+ audio_file = "input.wav"
+ transcribe_file = "transcribe.txt"
+ subtitle_file = "subtitle.srt"
+ transcribe_adjusted_file = "transcribe_adjusted.txt"
+ subtitle_adjusted_file = "subtitle_adjusted.srt"
+ video_subtitle_file = f"output_{get_title()}.mp4"
+
+ start_time_for_adjustment = "00:00:00"
+ end_time_for_adjustment = "00:10:00"
lang_ui.py ADDED
@@ -0,0 +1,102 @@
+ english_ui = {
+     "top_markdown": """
+     ## Transcribe
+     Insert your video and set the options for transcribing
+     """,
+     "middle_markdown": """
+     ## Adjustment
+     Listen to the clips below and type in each speaker's name based on their voice. After that, click the 'Adjust Speaker' button to update the result above accordingly.
+     """,
+     "bottom_markdown": """
+     ## Result
+     The following is the result of the transcription
+     """,
+
+     "input_video_warning": "Please submit a video",
+     "start_time_warning": "Please provide a correct start time",
+     "end_time_warning": "Please provide a correct end time",
+     "lang_radio_warning": "Please choose a language for the video",
+     "model_dropdown_warning": "Please choose a model size for the video",
+
+     "input_video_label": "Insert video",
+     "start_time_label": "Start time",
+     "end_time_label": "End time",
+     "lang_radio_choices": ["English", "Indonesian", "Automatic"],
+     "lang_radio_label": "Language",
+     "lang_radio_info": "What language do they speak in the video?",
+     "model_dropdown_choices": ["tiny", "base", "small", "medium", "large"],
+     "model_dropdown_label": "Model size",
+     "model_dropdown_info": "The larger the model, the more accurate the transcript will be, but the process will take longer",
+     "summary_checkbox_label": "Use summary",
+     "summary_checkbox_info": "Do you need a summary of the transcript? Note: the result might be inaccurate",
+     "start_button_value": "Start Transcribing",
+
+     "adjust_speaker_value": "Speaker name",
+     "prev_button_value": "Previous Speaker",
+     "next_button_value": "Next Speaker",
+     "adjust_button_value": "Adjust Speaker",
+
+     "output_video_label": "Video with subtitle",
+     "output_transcribe_label": "Transcription result",
+
+     "progress_starting_process": "Starting process",
+     "progress_preparing_video": "Preparing video",
+     "progress_acquiring_diarization": "Acquiring diarization",
+     "progress_transcribing_audio": "Transcribing audio",
+     "progress_adjust_speaker": "Adjusting speakers",
+     "progress_add_subtitle": "Adding subtitle to video"
+ }
+
+ indonesia_ui = {
+     "top_markdown": """
+     ## Transkrip
+     Masukkan video dan sesuaikan opsi untuk transkrip
+     """,
+     "middle_markdown": """
+     ## Penyesuaian
+     Dengarkan cuplikan suara pembicara dan ubah nama sesuai suara pembicara. Setelah itu, silahkan tekan tombol 'Sesuaikan Pembicara' untuk menyesuaikan nama pembicara pada hasil di atas
+     """,
+     "bottom_markdown": """
+     ## Hasil
+     Berikut hasil akhir dari transkrip
+     """,
+
+     "input_video_warning": "Mohon masukkan video",
+     "start_time_warning": "Mohon berikan waktu mulai yang sesuai",
+     "end_time_warning": "Mohon berikan waktu selesai yang sesuai",
+     "lang_radio_warning": "Mohon pilih bahasa yang digunakan dalam video",
+     "model_dropdown_warning": "Mohon pilih ukuran model yang digunakan untuk video",
+
+     "input_video_label": "Masukkan video",
+     "start_time_label": "Waktu mulai",
+     "end_time_label": "Waktu selesai",
+     "lang_radio_choices": ["Bahasa Inggris", "Bahasa Indonesia", "Otomatis"],
+     "lang_radio_label": "Bahasa",
+     "lang_radio_info": "Bahasa apa yang digunakan dalam video?",
+     "model_dropdown_choices": ["mungil", "dasar", "kecil", "sedang", "besar"],
+     "model_dropdown_label": "Ukuran model",
+     "model_dropdown_info": "Semakin tinggi modelnya, semakin akurat transkripnya namun prosesnya akan membutuhkan waktu yang lebih lama",
+     "summary_checkbox_label": "Gunakan kesimpulan",
+     "summary_checkbox_info": "Apakah anda memerlukan kesimpulan dari transkrip? Hasil mungkin tidak sepenuhnya akurat",
+     "start_button_value": "Mulai Transkrip",
+
+     "adjust_speaker_value": "Nama pembicara",
+     "prev_button_value": "Pembicara Sebelumnya",
+     "next_button_value": "Pembicara Selanjutnya",
+     "adjust_button_value": "Sesuaikan Pembicara",
+
+     "output_video_label": "Video dengan subtitle",
+     "output_transcribe_label": "Hasil transkrip",
+
+     "progress_starting_process": "Memulai proses",
+     "progress_preparing_video": "Mempersiapkan video",
+     "progress_acquiring_diarization": "Mengenali pembicara",
+     "progress_transcribing_audio": "Mendapatkan transkrip suara",
+     "progress_adjust_speaker": "Menyesuaikan pembicara",
+     "progress_add_subtitle": "Menambahkan subtitle pada video"
+ }
+
+
+ def get_ui_lang(index):
+     selectable_ui_lang = [english_ui, indonesia_ui]
+     return selectable_ui_lang[index]
list.py DELETED
File without changes
set_up.py ADDED
@@ -0,0 +1,65 @@
+ from ui import *
+ from diarization import start_diarization
+ from transcribe import start_transcribe
+ from video_tool import convert_video_to_audio, add_subtitle_to_video
+ import gradio as gr
+ import re
+ import os
+ import file_name
+ import utils
+
+
+ def prepare_input(input_file, start_time, end_time, lang, model_size, use_summary, progress=gr.Progress()):
+     gr.Info(ui_lang["progress_starting_process"])
+
+     if input_file is None or not os.path.exists(input_file):
+         gr.Warning(ui_lang["input_video_warning"])
+         return [None, None, [None, None]]
+     if not validate_time_format(start_time):
+         gr.Warning(ui_lang["start_time_warning"])
+         return [None, None, [None, None]]
+     if not validate_time_format(end_time):
+         gr.Warning(ui_lang["end_time_warning"])
+         return [None, None, [None, None]]
+     if lang is None:
+         gr.Warning(ui_lang["lang_radio_warning"])
+         return [None, None, [None, None]]
+     if model_size is None:
+         gr.Warning(ui_lang["model_dropdown_warning"])
+         return [None, None, [None, None]]
+
+     file_name.input_file = input_file
+     file_name.start_time_for_adjustment = start_time
+     file_name.end_time_for_adjustment = end_time
+
+     # Convert video to audio
+     progress(0.2, desc=ui_lang["progress_preparing_video"])
+     convert_video_to_audio(
+         input_file, file_name.audio_file, start_time, end_time)
+
+     # Start diarization
+     progress(0.4, desc=ui_lang["progress_acquiring_diarization"])
+     start_diarization(file_name.audio_file)
+
+     # Start transcribing
+     progress(0.6, desc=ui_lang["progress_transcribing_audio"])
+     start_transcribe(lang, model_size, progress)
+
+     # Add subtitle to video
+     progress(0.8, desc=ui_lang["progress_add_subtitle"])
+     add_subtitle_to_video(input_file, file_name.subtitle_file,
+                           file_name.video_subtitle_file, start_time, end_time)
+
+     # Return video file link, transcribe string, transcribe.txt, subtitle.txt
+     transcribe_txt_list, _ = utils.read_transcribe_subtitle_file(False)
+     transcribe_txt = "\n".join(transcribe_txt_list)
+     return [
+         file_name.video_subtitle_file,
+         transcribe_txt,
+         [file_name.transcribe_file, file_name.subtitle_file]
+     ]
+
+
+ def validate_time_format(input_string):
+     pattern = re.compile(r'^\d{2}:\d{2}:\d{2}$')
+     return pattern.match(input_string) is not None
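validate_time_format accepts only zero-padded HH:MM:SS strings, which is what the start/end textboxes in the UI are expected to contain. A quick check with illustrative values:

    import re

    pattern = re.compile(r'^\d{2}:\d{2}:\d{2}$')
    print(pattern.match("00:01:30") is not None)  # True
    print(pattern.match("1:30") is not None)      # False: not zero-padded HH:MM:SS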
transcribe.py CHANGED
@@ -1,59 +1,58 @@
  from faster_whisper import WhisperModel
+ from utils import load_groups_json
  import torch
  import gc
- import json
+ import file_name
+ import utils

  gc.collect()
  torch.cuda.empty_cache()

- model = WhisperModel("medium", device="cuda", compute_type="int8_float16")
+ model_lang_list = ['en', 'id', None]
+ model_size = ["tiny", "base", "small", "medium", "large"]


- def start_transcribe(progress):
+ def start_transcribe(lang_choice: int, model_size_choice: int, progress):
+
+     print(
+         f"Starting transcribing with model size {model_size[model_size_choice]} for language {model_lang_list[lang_choice]}")
+
+     model = WhisperModel(
+         model_size[model_size_choice], device="cuda", compute_type="int8_float16")
      _, speaker_groups = load_groups_json()

-     subtitle_txt = []
+     subtitle_txt_list = []
+     transcribe_txt_list = []
      for speaker, _ in zip(speaker_groups, progress.tqdm(speaker_groups, desc="Transcribing")):
          # Transcribe and save temp file
          audiof = f"{speaker}.wav"
          print(f"Loading {audiof}")
          segments, _ = model.transcribe(
-             audio=audiof, language='id', word_timestamps=True)
+             audio=audiof, language=model_lang_list[lang_choice], word_timestamps=True)
          segments_list = list(segments)

-         text_list_to_print = []
+         speaker_txt_list = []
+         shift = speaker_groups[speaker][0] + 1
+         print(f"Current starting point: {shift}s or {time_str(shift)}")
          for segment in segments_list:
-             start = timeStr(segment.start)
-             end = timeStr(segment.end)
+             start = time_str(segment.start + shift)
+             end = time_str(segment.end + shift)
+
              name = str(speaker)[:10]
-             text = segment.text
-             subtitle_txt.append(
-                 f"{len(subtitle_txt) + 1}\n{start} --> {end}\n[{name}] {text}\n\n")
-             # Appending text for each segment to print
-             text_list_to_print.append(text)
-
-         # Print full text for each speaker turn
-         text = "\n".join(text_list_to_print)
-         print(text)
-         # Append to complete transcribe file
-         with open("transcribe.txt", "a") as file:
-             file.write(f"[{name}] {text}\n")
-
-     # Appending subtitle txt for each segment
-     with open("subtitle.srt", "w") as file:
-         file.writelines(subtitle_txt)
-     return ["transcribe.txt", "subtitle.srt"]
-
-
- def timeStr(t):
-     return '{0:02d}:{1:02d}:{2:06.2f}'.format(round(t // 3600),
-                                               round(t % 3600 // 60),
-                                               t % 60)
-
-
- def load_groups_json():
-     with open("sample_groups.json", "r") as json_file_sample:
-         sample_groups_list: list = json.load(json_file_sample)
-     with open("speaker_groups.json", "r") as json_file_speaker:
-         speaker_groups_dict: dict = json.load(json_file_speaker)
-     return sample_groups_list, speaker_groups_dict
+             segment_txt = segment.text
+             speaker_txt_list.append(segment_txt)
+
+             subtitle = f"{len(subtitle_txt_list) + 1}\n{start} --> {end}\n[{name}] {segment_txt}\n\n"
+             subtitle_txt_list.append(subtitle)
+
+         speaker_txt = " ".join(speaker_txt_list)
+         transcribe_txt_list.append(f"[{name}] {speaker_txt}\n")
+
+     utils.write_transcribe_subtitle_file(
+         transcribe_txt_list, subtitle_txt_list, False)
+
+
+ def time_str(t):
+     return '{0:02d}:{1:02d}:{2:06.3f}'.format(round(t // 3600),
+                                               round(t % 3600 // 60),
+                                               t % 60)
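time_str turns a position in seconds into the HH:MM:SS.mmm stamps written into the subtitle entries (strict SRT uses a comma before the milliseconds; ffmpeg is generally tolerant of the period). A quick check:

    def time_str(t):
        return '{0:02d}:{1:02d}:{2:06.3f}'.format(round(t // 3600),
                                                  round(t % 3600 // 60),
                                                  t % 60)

    print(time_str(3723.5))  # 01:02:03.500
    print(time_str(59.25))   # 00:00:59.250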
ui.py ADDED
@@ -0,0 +1,130 @@
+ from lang_ui import get_ui_lang
+ import gradio as gr
+ import ffmpeg
+
+ # Display available languages and set the default UI language
+ ui_lang_index = 1
+ available_ui_lang = ["English", "Bahasa Indonesia"]
+ ui_lang = get_ui_lang(ui_lang_index)
+
+ lang_radio_choice = 1
+ model_dropdown_choice = 2
+
+ # Transcribe components
+ ui_lang_radio = gr.Radio(
+     available_ui_lang, type="index", value=available_ui_lang[ui_lang_index], interactive=True, show_label=False)
+ top_markdown = gr.Markdown(
+     ui_lang["top_markdown"])
+ # TODO Add video by link
+ input_video = gr.Video(
+     label=ui_lang["input_video_label"], interactive=True)
+ start_time = gr.Textbox(
+     max_lines=1, placeholder="00:00:00", label=ui_lang["start_time_label"], interactive=False)
+ end_time = gr.Textbox(
+     max_lines=1, placeholder="01:00:00", label=ui_lang["end_time_label"], interactive=False)
+ # TODO Use custom language
+ lang_radio = gr.Radio(
+     ui_lang["lang_radio_choices"], label=ui_lang["lang_radio_label"], info=ui_lang["lang_radio_info"], type='index', interactive=True)
+ model_dropdown = gr.Dropdown(
+     ui_lang["model_dropdown_choices"], label=ui_lang["model_dropdown_label"], info=ui_lang["model_dropdown_info"], type='index', interactive=True)
+ summary_checkbox = gr.Checkbox(
+     label=ui_lang["summary_checkbox_label"], info=ui_lang["summary_checkbox_info"], interactive=False, visible=False)
+ start_button = gr.Button(
+     ui_lang["start_button_value"], variant="primary", interactive=True)
+
+ # Adjust components
+ middle_markdown = gr.Markdown(
+     ui_lang["middle_markdown"])
+ adjust_audio = gr.Audio(interactive=False)
+ adjust_speaker = gr.Textbox(
+     label=ui_lang["adjust_speaker_value"], interactive=False)
+ prev_button = gr.Button(ui_lang["prev_button_value"], interactive=False)
+ next_button = gr.Button(ui_lang["next_button_value"], interactive=False)
+ adjust_button = gr.Button(
+     ui_lang["adjust_button_value"], variant="primary", interactive=False)
+
+ # Result components
+ bottom_markdown = gr.Markdown(
+     ui_lang["bottom_markdown"])
+ output_video = gr.Video(label=ui_lang["output_video_label"], interactive=False)
+ output_file = gr.File(file_count="multiple", interactive=False)
+ output_transcribe = gr.Textbox(
+     label=ui_lang["output_transcribe_label"], interactive=False, show_copy_button=True)
+ output_summary = gr.Textbox(
+     interactive=False, show_copy_button=True, visible=False)
+
+
+ def time_str(t):
+     return '{0:02d}:{1:02d}:{2:02d}'.format(round(t // 3600),
+                                             round(t % 3600 // 60),
+                                             round(t % 60))
+
+
+ def get_duration(input_file):
+     print("Checking file")
+     if input_file is None:
+         gr.Warning(ui_lang["input_video_warning"])
+         return [
+             start_time.update(None, interactive=False),
+             end_time.update(None, interactive=False)
+         ]
+     print("Getting duration")
+     info_json = ffmpeg.probe(input_file)
+     print("Probing finished")
+     duration_seconds = float(info_json['format']['duration'])
+     duration_formatted = time_str(duration_seconds)
+     return [
+         start_time.update("00:00:00", interactive=True),
+         end_time.update(duration_formatted, interactive=True)
+     ]
+
+
+ def change_lang(input):
+     global ui_lang
+     ui_lang = get_ui_lang(input)
+     print(f"Change language to {available_ui_lang[input]}")
+     return [
+         # Top
+         top_markdown.update(
+             ui_lang["top_markdown"]),
+         input_video.update(
+             label=ui_lang["input_video_label"]),
+         start_time.update(
+             label=ui_lang["start_time_label"]),
+         end_time.update(
+             label=ui_lang["end_time_label"]),
+         lang_radio.update(
+             choices=ui_lang["lang_radio_choices"], value=None, label=ui_lang["lang_radio_label"], info=ui_lang["lang_radio_info"]),
+         model_dropdown.update(
+             choices=ui_lang["model_dropdown_choices"], value=None, label=ui_lang["model_dropdown_label"], info=ui_lang["model_dropdown_info"]),
+         summary_checkbox.update(
+             label=ui_lang["summary_checkbox_label"], info=ui_lang["summary_checkbox_info"]),
+         start_button.update(
+             ui_lang["start_button_value"]),
+
+         # Middle
+         middle_markdown.update(
+             ui_lang["middle_markdown"]),
+         adjust_speaker.update(label=ui_lang["adjust_speaker_value"]),
+         prev_button.update(
+             ui_lang["prev_button_value"]),
+         next_button.update(
+             ui_lang["next_button_value"]),
+         adjust_button.update(
+             ui_lang["adjust_button_value"]),
+
+         # Bottom
+         bottom_markdown.update(
+             ui_lang["bottom_markdown"]),
+         output_video.update(label=ui_lang["output_video_label"]),
+         output_transcribe.update(label=ui_lang["output_transcribe_label"]),
+     ]
+
+
+ # comp_to_update and the change_lang return list must match in length and order
+ comp_to_update = [
+     top_markdown, input_video, start_time, end_time, lang_radio, model_dropdown, summary_checkbox, start_button, middle_markdown, adjust_speaker, prev_button, next_button, adjust_button, bottom_markdown, output_video, output_transcribe]
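change_lang works because a Gradio event handler may return one update object per component listed in its outputs; the order of the return list is what ties each update to its component. A minimal sketch of that contract, with hypothetical components in the Gradio 3.x style used here:

    import gradio as gr

    box_a = gr.Textbox(label="A")
    box_b = gr.Textbox(label="B")

    def relabel(lang):
        # One update per entry in outputs=[box_a, box_b], in the same order
        return [box_a.update(label=f"A ({lang})"),
                box_b.update(label=f"B ({lang})")]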
utils.py ADDED
@@ -0,0 +1,37 @@
+ import json
+ import file_name
+
+
+ def load_groups_json():
+     with open("sample_groups.json", "r", encoding="utf-8") as json_file_sample:
+         sample_groups_list: list = json.load(json_file_sample)
+     with open("speaker_groups.json", "r", encoding="utf-8") as json_file_speaker:
+         speaker_groups_dict: dict = json.load(json_file_speaker)
+     return sample_groups_list, speaker_groups_dict
+
+
+ def write_transcribe_subtitle_file(transcribe_txt_list: list, subtitle_txt_list: list, adjustment: bool):
+     transcribe = file_name.transcribe_file
+     subtitle = file_name.subtitle_file
+     if adjustment:
+         transcribe = file_name.transcribe_adjusted_file
+         subtitle = file_name.subtitle_adjusted_file
+
+     with open(transcribe, "w", encoding="utf-8") as file:
+         file.writelines(transcribe_txt_list)
+     with open(subtitle, "w", encoding="utf-8") as file:
+         file.writelines(subtitle_txt_list)
+
+
+ def read_transcribe_subtitle_file(adjustment: bool):
+     transcribe = file_name.transcribe_file
+     subtitle = file_name.subtitle_file
+     if adjustment:
+         transcribe = file_name.transcribe_adjusted_file
+         subtitle = file_name.subtitle_adjusted_file
+
+     with open(transcribe, "r", encoding="utf-8") as file:
+         transcribe_txt_list = file.readlines()
+     with open(subtitle, "r", encoding="utf-8") as file:
+         subtitle_txt_list = file.readlines()
+     return transcribe_txt_list, subtitle_txt_list
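For reference, the two JSON files that load_groups_json reads hold a list of speaker labels and a mapping from each segment label to its time span. The exact shapes below are illustrative assumptions, chosen only to be consistent with how transcribe.py indexes speaker_groups[speaker][0] as a start time in seconds:

    # Hypothetical contents, not taken from the commit
    sample_groups_list = ["SPEAKER_00", "SPEAKER_01"]      # sample_groups.json
    speaker_groups_dict = {"SPEAKER_00-1": [0.5, 12.3],    # speaker_groups.json:
                           "SPEAKER_01-2": [12.9, 30.1]}   # label -> [start, end] seconds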
video_tool.py ADDED
@@ -0,0 +1,11 @@
+ import ffmpeg
+
+
+ def convert_video_to_audio(input_file, output_file, start_time, end_time):
+     ffmpeg.input(input_file, ss=start_time, to=end_time).audio.output(
+         output_file, format="wav").run(overwrite_output=True)
+
+
+ def add_subtitle_to_video(input_file, subtitle_file, output_file, start_time, end_time):
+     ffmpeg.input(input_file, ss=start_time, to=end_time).output(
+         output_file, vf='subtitles=' + subtitle_file, preset='ultrafast', acodec='copy').run(overwrite_output=True)
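Both helpers wrap ffmpeg-python pipelines; they correspond roughly to `ffmpeg -ss START -to END -i input.mp4 -f wav input.wav` and `ffmpeg -ss START -to END -i input.mp4 -vf subtitles=subtitle.srt -preset ultrafast -c:a copy output.mp4`. A usage sketch with illustrative file names:

    from video_tool import convert_video_to_audio, add_subtitle_to_video

    # File names here are hypothetical
    convert_video_to_audio("input.mp4", "input.wav", "00:00:00", "00:10:00")
    add_subtitle_to_video("input.mp4", "subtitle.srt", "output_input.mp4",
                          "00:00:00", "00:10:00")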