# generate_transcript.py
import os
import pickle
import warnings

import torch
import transformers
from accelerate import Accelerator
from tqdm import tqdm

warnings.filterwarnings('ignore')


class TranscriptGenerator:
    """
    Generate a conversational, two-speaker podcast transcript from a
    cleaned text file using an instruction-tuned language model, and
    persist the result as a pickle file.
    """

    def __init__(self, text_file_path,
                 model_name="meta-llama/Llama-3.1-70B-Instruct",
                 output_path='./resources/data.pkl'):
        """
        Initialize the generator and load the text-generation pipeline.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            model_name (str): Hugging Face model identifier to load.
            output_path (str): Where the pickled transcript is written.
                Defaults to the original hard-coded location.
        """
        self.text_file_path = text_file_path
        self.output_path = output_path
        self.model_name = model_name
        # NOTE(review): Accelerator is instantiated but never used directly;
        # device placement is handled by device_map="auto" below. Kept for
        # backward compatibility with callers that may touch self.accelerator.
        self.accelerator = Accelerator()
        self.model = transformers.pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto"
        )
        self.system_prompt = """
You are a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris. We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains. Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload. Keep it extremely engaging, with realistic anecdotes, tangents, and interruptions. Speaker 1: Leads and teaches. Speaker 2: Asks follow-up questions, gets excited or confused. ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1: STRICTLY THE DIALOGUES.
"""

    def load_text(self):
        """
        Read the cleaned text file, trying several common encodings.

        Returns:
            str | None: File content on success; None if the file is
            missing or no candidate encoding can decode it.
        """
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        # A missing file is not an encoding problem: report it once with an
        # accurate message instead of retrying the open per encoding.
        if not os.path.isfile(self.text_file_path):
            print(f"Error: File '{self.text_file_path}' not found.")
            return None
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"Successfully read file using {encoding} encoding.")
                return content
            except UnicodeDecodeError:
                continue
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def generate_transcript(self):
        """
        Generate a podcast-style transcript and save it as a pickle file.

        Returns:
            str | None: Path to the saved transcript, or None if the
            input text could not be read.
        """
        input_text = self.load_text()
        if input_text is None:
            return None

        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": input_text}
        ]

        output = self.model(
            messages,
            max_new_tokens=8126,
            temperature=1
        )

        generated = output[0]["generated_text"]
        # For chat-style input, recent transformers pipelines return the full
        # message list (system + user + assistant); the artifact we want is
        # only the assistant's reply. Older versions return a plain string —
        # keep it as-is in that case.
        if isinstance(generated, list) and generated and isinstance(generated[-1], dict):
            transcript = generated[-1].get("content", generated)
        else:
            transcript = generated

        # Ensure the output directory exists before writing (fails on a
        # fresh checkout otherwise).
        out_dir = os.path.dirname(self.output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(self.output_path, 'wb') as f:
            pickle.dump(transcript, f)

        return self.output_path