import json import time from json.decoder import JSONDecodeError from typing import Union from .hallucinations import KNOWN_HALLUCINATIONS ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] class Transcript: """ Class for storing transcript data, including speaker information and text segments, and exporting it to various file formats such as JSON, HTML, and LaTeX. """ def __init__(self, transcript: dict) -> None: """ Initializes the Transcript object with the given transcript data. Args: transcript (dict): A dictionary containing the formatted transcript string. Keys should correspond to segment IDs, and values should contain speaker and segment information. """ self.transcript = transcript self._remove_hallucinations() self.speakers = self._extract_speakers() self.segments = self._extract_segments() self.annotation = {} def annotate(self, *args, **kwargs) -> dict: """ Annotates the transcript to associate specific names with speakers. Args: args (list): List of speaker names. These will be mapped sequentially to the speakers. kwargs (dict): Dictionary with speaker names as keys and list of segments as values. Returns: dict: Dictionary with speaker names as keys and list of segments as values. Raises: ValueError: If the number of speaker names does not match the number of speakers, or if an unknown speaker is found. """ annotations = {} if args and len(args) != len(self.speakers): raise ValueError( "Number of speaker names does not match number of speakers") if args: for arg, speaker in zip(args, sorted(self.speakers)): annotations[speaker] = arg invalid_speakers = set(kwargs.keys()) - set(self.speakers) if invalid_speakers: raise ValueError( f"These keys are not speakers: {', '.join(invalid_speakers)}") annotations.update({key: kwargs[key] for key in self.speakers if key in kwargs}) self.annotation = annotations return self def _remove_hallucinations(self) -> None: """ Removes all occurances of known hallucinations from all segments of the transcript. Segments that are identical to empty strings afterwards are removed from the transcript. """ segments_to_drop = [] for id in self.transcript: for snippet in KNOWN_HALLUCINATIONS: self.transcript[id]['text'] = self.transcript[id]['text'].replace( snippet, '') if self.transcript[id]['text'] == '': segments_to_drop.append(id) for id in segments_to_drop: del self.transcript[id] def _extract_speakers(self) -> list: """ Extracts the unique speaker names from the transcript. Returns: list: List of unique speaker names in the transcript. """ return list(set([self.transcript[id]["speakers"] for id in self.transcript])) def _extract_segments(self) -> list: """ Extracts all the text segments from the transcript. Returns: list: List of segments, where each segment is represented by the starting and ending times. """ return [self.transcript[id]["segments"] for id in self.transcript] def __str__(self) -> str: """ Converts the transcript to a string representation. Returns: str: String representation of the transcript, including speaker names and time stamps for each segment. """ fstring = "" for _id in self.transcript: seq = self.transcript[_id] if self.annotation: speaker = self.annotation[seq["speakers"]] else: speaker = seq["speakers"] segm = seq["segments"] sseg = time.strftime("%H:%M:%S", time.gmtime(segm[0])) eseg = time.strftime("%H:%M:%S", time.gmtime(segm[1])) fstring += f"{speaker} ({sseg} ; {eseg}):\t{seq['text']}\n" return fstring def __repr__(self) -> str: """Return a string representation of the Transcript object. Returns: str: A string that provides an informative description of the object. """ return f"Transcript(speakers = {self.speakers},"\ f"segments = {self.segments}, annotation = {self.annotation})" def get_dict(self) -> dict: """ Get transcript as dict :return: transcript as dict :rtype: dict """ return self.transcript def get_json(self, *args, use_annotation: bool = True, **kwargs) -> str: """ Get transcript as json string :return: transcript as json string :rtype: str """ if "indent" not in kwargs: kwargs["indent"] = 3 if use_annotation and self.annotation: for _id in self.transcript: seq = self.transcript[_id] seq["speakers"] = self.annotation[seq["speakers"]] return json.dumps(self.transcript, *args, **kwargs) def get_html(self) -> str: """ Get transcript as html string :return: transcript as html string :rtype: str """ html = "
" + self.__str__().replace("\n", "
") + "