File size: 2,868 Bytes
5671703
 
 
 
 
 
 
cc42078
5671703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef1844a
5671703
 
59ab165
 
5671703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pickle
import numpy as np
from tqdm import tqdm
import edge_tts
import ast
import asyncio


class EdgeTTSGenerator:
    """
    Generate podcast-style audio from a two-speaker transcript using edge-tts.

    The transcript file is a pickle containing the *string representation* of a
    list of (speaker, text) tuples. Each segment is synthesized with a
    per-speaker neural voice, and the raw MP3 byte streams are concatenated
    into a single output file (valid for MP3, whose frames can be joined
    directly).
    """

    def __init__(self, transcript_file_path, output_audio_path,
                 speaker1_voice="en-US-AriaNeural",
                 speaker2_voice="en-US-GuyNeural"):
        """
        Initialize the TTS generator.

        Args:
            transcript_file_path (str): Path to the file containing the
                rewritten transcript (pickled string repr of a list of tuples).
            output_audio_path (str): Path to save the generated audio file.
            speaker1_voice (str): edge-tts voice used for "Speaker 1".
            speaker2_voice (str): edge-tts voice used for all other speakers.
        """
        self.transcript_file_path = transcript_file_path
        self.output_audio_path = output_audio_path

        # Voices are now parameters (defaults preserve original behavior).
        self.speaker1_voice = speaker1_voice
        self.speaker2_voice = speaker2_voice

    def load_transcript(self):
        """
        Load the rewritten transcript from the specified file.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code if the file
        is untrusted — only load transcript files produced by this pipeline.

        Returns:
            list: The transcript as a list of (speaker, text) tuples.
        """
        with open(self.transcript_file_path, 'rb') as f:
            # The pickle holds a string repr of the list; literal_eval
            # parses it back into Python objects without executing code.
            return ast.literal_eval(pickle.load(f))

    async def generate_audio_segment(self, text, voice_name):
        """
        Generate audio for a given text using edge-tts.

        Args:
            text (str): Text to be synthesized.
            voice_name (str): The edge-tts voice name to use.

        Returns:
            bytes: Raw MP3 audio bytes for the segment.
        """
        communicator = edge_tts.Communicate(text, voice_name)
        # The stream yields audio chunks and WordBoundary metadata; only
        # audio chunks carry a "data" payload. Collect then join once.
        pieces = []
        async for chunk in communicator.stream():
            if "data" in chunk:
                pieces.append(chunk["data"])
        return b"".join(pieces)

    def save_audio(self, audio_data):
        """
        Save the combined audio data to the output file.

        Args:
            audio_data (list): List of bytes objects, one per segment.
        """
        with open(self.output_audio_path, "wb") as f:
            f.write(b"".join(audio_data))

    async def generate_audio(self):
        """
        Convert the transcript into audio and save it to a file.

        Returns:
            str: Path to the saved audio file.
        """
        transcript = self.load_transcript()
        audio_data = []

        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            # Any speaker label other than "Speaker 1" gets the second voice.
            voice = self.speaker1_voice if speaker == "Speaker 1" else self.speaker2_voice
            segment_audio = await self.generate_audio_segment(text, voice)
            audio_data.append(segment_audio)

        self.save_audio(audio_data)
        return self.output_audio_path