File size: 2,868 Bytes
5671703
 
 
 
 
 
 
cc42078
5671703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef1844a
5671703
 
59ab165
 
5671703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pickle
import numpy as np
from tqdm import tqdm
import edge_tts
import ast
import asyncio


class EdgeTTSGenerator:
    """
    Generate podcast-style audio from a two-speaker transcript using edge-tts.

    The transcript file is a pickle containing the *string representation* of a
    list of (speaker, text) tuples. Each segment is synthesized with a
    per-speaker neural voice, and the raw MP3 byte streams are concatenated
    into a single output file (valid for MP3, whose frames can be joined
    directly).
    """

    def __init__(self, transcript_file_path, output_audio_path,
                 speaker1_voice="en-US-AriaNeural",
                 speaker2_voice="en-US-GuyNeural"):
        """
        Initialize the TTS generator.

        Args:
            transcript_file_path (str): Path to the file containing the
                rewritten transcript (pickled string repr of a list of tuples).
            output_audio_path (str): Path to save the generated audio file.
            speaker1_voice (str): edge-tts voice used for "Speaker 1".
            speaker2_voice (str): edge-tts voice used for all other speakers.
        """
        self.transcript_file_path = transcript_file_path
        self.output_audio_path = output_audio_path

        # Voices are now parameters (defaults preserve original behavior).
        self.speaker1_voice = speaker1_voice
        self.speaker2_voice = speaker2_voice

    def load_transcript(self):
        """
        Load the rewritten transcript from the specified file.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code if the file
        is untrusted — only load transcript files produced by this pipeline.

        Returns:
            list: The transcript as a list of (speaker, text) tuples.
        """
        with open(self.transcript_file_path, 'rb') as f:
            # The pickle holds a string repr of the list; literal_eval
            # parses it back into Python objects without executing code.
            return ast.literal_eval(pickle.load(f))

    async def generate_audio_segment(self, text, voice_name):
        """
        Generate audio for a given text using edge-tts.

        Args:
            text (str): Text to be synthesized.
            voice_name (str): The edge-tts voice name to use.

        Returns:
            bytes: Raw MP3 audio bytes for the segment.
        """
        communicator = edge_tts.Communicate(text, voice_name)
        # The stream yields audio chunks and WordBoundary metadata; only
        # audio chunks carry a "data" payload. Collect then join once.
        pieces = []
        async for chunk in communicator.stream():
            if "data" in chunk:
                pieces.append(chunk["data"])
        return b"".join(pieces)

    def save_audio(self, audio_data):
        """
        Save the combined audio data to the output file.

        Args:
            audio_data (list): List of bytes objects, one per segment.
        """
        with open(self.output_audio_path, "wb") as f:
            f.write(b"".join(audio_data))

    async def generate_audio(self):
        """
        Convert the transcript into audio and save it to a file.

        Returns:
            str: Path to the saved audio file.
        """
        transcript = self.load_transcript()
        audio_data = []

        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            # Any speaker label other than "Speaker 1" gets the second voice.
            voice = self.speaker1_voice if speaker == "Speaker 1" else self.speaker2_voice
            segment_audio = await self.generate_audio_segment(text, voice)
            audio_data.append(segment_audio)

        self.save_audio(audio_data)
        return self.output_audio_path