# generate_transcript.py
import pickle
import warnings

import torch
import transformers
from accelerate import Accelerator
from tqdm import tqdm
import spaces

warnings.filterwarnings('ignore')


class TranscriptProcessor:
    """
    A class to generate and rewrite podcast-style transcripts using a
    specified language model.
    """

    def __init__(self, text_file_path, model_name="meta-llama/Llama-3.1-8B-Instruct"):
        """
        Initialize with the path to the cleaned text file and the model name.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            model_name (str): Name of the language model to use.
        """
        self.text_file_path = text_file_path
        self.transcript_output_path = './resources/data.pkl'
        self.tts_output_path = './resources/podcast_ready_data.pkl'
        self.model_name = model_name
        self.accelerator = Accelerator()
        self.model = transformers.pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto"
        )

        self.transcript_prompt = """
        You are a world-class podcast writer, working as a ghost writer for top podcast hosts.
        You will write the dialogue with engaging interruptions, anecdotes, and curiosity-led questions.
        Speaker 1: Leads the conversation.
        Speaker 2: Asks follow-up questions and reacts with expressions.
        ALWAYS START WITH SPEAKER 1: STRICTLY THE DIALOGUES.
        """

        self.rewrite_prompt = """
        You are an international Oscar-winning screenwriter creating a refined script for TTS.
        Speaker 1: Teaches with anecdotes; Speaker 2: Reacts with expressions like "umm," "hmm," [sigh].
        Return the response as a list of tuples only, with no extra formatting.
        """

    def load_text(self):
        """
        Read the cleaned text file and return its content.

        Returns:
            str: Content of the cleaned text file, or None if it could not be read.
        """
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"Successfully read file using {encoding} encoding.")
                return content
            except (UnicodeDecodeError, FileNotFoundError):
                continue
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def generate_transcript(self):
        """
        Generate a podcast-style transcript and save it as a pickled file.

        Returns:
            str: Path to the file where the transcript is saved, or None on failure.
        """
        input_text = self.load_text()
        if input_text is None:
            return None

        messages = [
            {"role": "system", "content": self.transcript_prompt},
            {"role": "user", "content": input_text}
        ]
        output = self.model(
            messages,
            max_new_tokens=8126,
            temperature=1
        )
        # The chat pipeline returns the full conversation; keep only the
        # assistant's reply (the generated transcript string).
        transcript = output[0]["generated_text"][-1]["content"]

        # Save the transcript as a pickle file
        with open(self.transcript_output_path, 'wb') as f:
            pickle.dump(transcript, f)

        return self.transcript_output_path

    def rewrite_transcript(self):
        """
        Refine the transcript for TTS, adding expressive elements and saving
        it as a list of tuples.

        Returns:
            str: Path to the file where the TTS-ready transcript is saved.
        """
        # Load the initially generated transcript
        with open(self.transcript_output_path, 'rb') as file:
            input_transcript = pickle.load(file)

        messages = [
            {"role": "system", "content": self.rewrite_prompt},
            {"role": "user", "content": input_transcript}
        ]
        output = self.model(
            messages,
            max_new_tokens=8126,
            temperature=1
        )
        # Again, keep only the assistant's reply from the returned conversation.
        rewritten_transcript = output[0]["generated_text"][-1]["content"]

        # Save the rewritten transcript as a pickle file
        with open(self.tts_output_path, 'wb') as f:
            pickle.dump(rewritten_transcript, f)

        return self.tts_output_path
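

# Minimal usage sketch (illustrative, not part of the original module):
# the input path below is a hypothetical location for the cleaned PDF text;
# point it at whatever file your extraction step produces.
if __name__ == "__main__":
    processor = TranscriptProcessor(text_file_path="./resources/clean_extracted_text.txt")
    transcript_path = processor.generate_transcript()
    if transcript_path is not None:
        tts_ready_path = processor.rewrite_transcript()
        print(f"TTS-ready transcript saved to {tts_ready_path}")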