import whisperx
import json
import os
import torch
import mimetypes
import shutil
# Define language options
language_options = {
    "Identify": None,
    "English": "en", "Spanish": "es", "Chinese": "zh", "Hindi": "hi", "Arabic": "ar",
    "Portuguese": "pt", "Bengali": "bn", "Russian": "ru", "Japanese": "ja", "Punjabi": "pa",
    "German": "de", "Javanese": "jv", "Wu Chinese": "zh", "Malay": "ms", "Telugu": "te",
    "Vietnamese": "vi", "Korean": "ko", "French": "fr", "Marathi": "mr", "Turkish": "tr"
}
# Available models for transcription
model_options = {
    "Large-v3": "large-v3",
    "Medium": "medium",
    "Small": "small",
    "Base": "base"
}
# Manages WhisperX model loading and caching; on construction, picks a default
# model and device based on system capabilities (CUDA availability).
class ModelManager:
    def __init__(self):
        self.current_model = None
        self.current_model_name = None
        self.current_device = None
        if torch.cuda.is_available():
            default_device = "cuda"
            default_model = "Large-v3"
        else:
            default_device = "cpu"
            default_model = "Small"
        self.load_model(default_model, default_device)

    def load_model(self, model_choice, device):
        # Reload only when the requested model or device differs from the cached one.
        if self.current_model is None or model_choice != self.current_model_name or device != self.current_device:
            print(f"Attempting to load model: {model_choice} on device: {device}")
            # float16 is not supported on CPU, so fall back to float32 there.
            compute_type = "float32" if device == "cpu" else "float16"
            self.current_model = whisperx.load_model(model_options[model_choice], device, compute_type=compute_type)
            self.current_model_name = model_choice
            self.current_device = device
        else:
            print(f"Using already loaded model: {self.current_model_name} on device: {self.current_device}")
        return self.current_model
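# Illustrative note (a sketch, not part of the original file): ModelManager caches
# the loaded pipeline, so repeated requests for the same model on the same device
# are served without reloading, e.g.:
#   manager = ModelManager()                    # loads a default model on init
#   model = manager.load_model("Small", "cpu")  # reloads only if model/device differ
#   same = manager.load_model("Small", "cpu")   # cache hit: returns the same object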
# Validates that the given file path corresponds to a multimedia file (audio or
# video) by checking MIME types, falling back to a whitelist of common extensions.
def validate_multimedia_file(file_path):
    file_path = os.path.normpath(file_path)
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and (mime_type.startswith('audio') or mime_type.startswith('video')):
        return file_path
    if file_path.lower().endswith(('.mp3', '.mp4', '.wav', '.avi', '.mov', '.flv')):
        return file_path
    raise ValueError("The uploaded file is not a multimedia file. Please upload an appropriate audio or video file.")
def transcribe(file_obj, device, language, model_choice, model_manager):
    """
    Transcribes a multimedia file using the selected model: copies the upload to a
    temporary directory, validates it, identifies the language if requested, aligns
    the transcription, and writes the result in JSON, TXT, VTT, and SRT formats.
    """
    _, ext = os.path.splitext(file_obj.name)
    temp_dir = os.path.join(os.getcwd(), 'Temp')
    os.makedirs(temp_dir, exist_ok=True)
    new_file_path = os.path.join(temp_dir, f'resource{ext}')
    shutil.copy(file_obj.name, new_file_path)
    model = model_manager.load_model(model_choice, device)
    validated_file_path = validate_multimedia_file(new_file_path)
    audio = whisperx.load_audio(validated_file_path)
    if language == "Identify":
        # Let WhisperX detect the language during transcription.
        result = model.transcribe(audio, batch_size=16)
        language_code = result["language"]
    else:
        language_code = language_options[language]
        result = model.transcribe(audio, language=language_code, batch_size=16)
    model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device)
    try:
        # Align each segment individually to obtain precise timestamps.
        aligned_segments = []
        for segment in result["segments"]:
            aligned_segment = whisperx.align([segment], model_a, metadata, audio, device, return_char_alignments=False)
            aligned_segments.extend(aligned_segment["segments"])
    except Exception as e:
        print(f"Error during alignment: {e}")
        return None
    segments_output = {"segments": aligned_segments}
    json_output = json.dumps(segments_output, ensure_ascii=False, indent=4)
    json_file_path = download_json_interface(json_output, temp_dir)
    txt_path = save_as_text(aligned_segments, temp_dir)
    vtt_path = save_as_vtt(aligned_segments, temp_dir)
    srt_path = save_as_srt(aligned_segments, temp_dir)
    return json_file_path, txt_path, vtt_path, srt_path
# Saves the transcription text of the audio segments to a plain-text file in the
# specified temporary directory and returns the file path.
def save_as_text(segments, temp_dir):
    txt_file_path = os.path.join(temp_dir, 'transcription_output.txt')
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        for segment in segments:
            txt_file.write(f"{segment['text'].strip()}\n")
    return txt_file_path
def save_as_vtt(segments, temp_dir):
    """
    Saves the transcription as a .vtt file (Web Video Text Tracks format),
    including timestamps for each segment, in the specified temporary directory
    and returns the file path.
    """
    vtt_file_path = os.path.join(temp_dir, 'transcription_output.vtt')
    with open(vtt_file_path, 'w', encoding='utf-8') as vtt_file:
        vtt_file.write("WEBVTT\n\n")  # required header for WebVTT files
        for i, segment in enumerate(segments):
            start = segment['start']
            end = segment['end']
            vtt_file.write(f"{i}\n")
            vtt_file.write(f"{format_time(start)} --> {format_time(end)}\n")
            vtt_file.write(f"{segment['text'].strip()}\n\n")
    return vtt_file_path
def download_json_interface(json_data, temp_dir):
    """
    Parses JSON-formatted transcription data, strips surrounding whitespace from
    each segment's text, re-saves it as a neatly formatted JSON file in the
    specified temporary directory, and returns the file path.
    """
    json_file_path = os.path.join(temp_dir, 'transcription_output.json')
    parsed = json.loads(json_data)
    for segment in parsed['segments']:
        segment['text'] = segment['text'].strip()
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(parsed, json_file, ensure_ascii=False, indent=4)
    return json_file_path
def save_as_srt(segments, temp_dir):
    """
    Saves the transcription as an .srt file (SubRip Subtitle format), with numbered
    entries carrying the start and end times and the corresponding text for each
    segment, in the specified temporary directory, and returns the file path.
    """
    srt_file_path = os.path.join(temp_dir, 'transcription_output.srt')
    with open(srt_file_path, 'w', encoding='utf-8') as srt_file:
        for i, segment in enumerate(segments):
            start = segment['start']
            end = segment['end']
            srt_file.write(f"{i + 1}\n")  # SRT entries are numbered from 1
            srt_file.write(f"{format_time_srt(start)} --> {format_time_srt(end)}\n")
            srt_file.write(f"{segment['text'].strip()}\n\n")
    return srt_file_path
# Converts a time value in seconds to a "hours:minutes:seconds.milliseconds" string
# (period separator), as required for timestamps in VTT files.
def format_time(time_in_seconds):
    hours = int(time_in_seconds // 3600)
    minutes = int((time_in_seconds % 3600) // 60)
    seconds = time_in_seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:06.3f}"
# Converts a time value in seconds to a "hours:minutes:seconds,milliseconds" string
# (comma separator), as required for timestamps in SRT files.
def format_time_srt(time_in_seconds):
    hours = int(time_in_seconds // 3600)
    minutes = int((time_in_seconds % 3600) // 60)
    seconds = int(time_in_seconds % 60)
    milliseconds = int((time_in_seconds - int(time_in_seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"