yasserrmd committed on
Commit
029a66e
·
verified ·
1 Parent(s): bf8498e

Create generate_transcript.py

Browse files
Files changed (1) hide show
  1. generate_transcript.py +96 -0
generate_transcript.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generate_transcript.py
2
+
3
+ import torch
4
+ from accelerate import Accelerator
5
+ import transformers
6
+ import pickle
7
+ from tqdm import tqdm
8
+ import warnings
9
+
10
+ warnings.filterwarnings('ignore')
11
+
12
+
13
class TranscriptGenerator:
    """
    Generate a conversational podcast transcript from cleaned text.

    The constructor builds a ``transformers`` text-generation pipeline for
    the requested model; ``generate_transcript`` feeds the text file to it
    together with a fixed system prompt and pickles the result to
    ``self.output_path``.
    """

    def __init__(self, text_file_path, model_name="meta-llama/Llama-3.1-70B-Instruct"):
        """
        Initialize with the path to the cleaned text file and the model name.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            model_name (str): Name of the language model to use.
        """
        self.text_file_path = text_file_path
        self.output_path = './resources/data.pkl'
        self.model_name = model_name
        # Not used by the methods in this class; kept as a public attribute
        # in case external code reads it.
        self.accelerator = Accelerator()
        self.model = transformers.pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto"
        )
        self.system_prompt = """
        You are a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris.
        We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.

        Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload.
        Keep it extremely engaging, with realistic anecdotes, tangents, and interruptions.

        Speaker 1: Leads and teaches. Speaker 2: Asks follow-up questions, gets excited or confused.

        ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
        STRICTLY THE DIALOGUES.
        """

    def load_text(self):
        """
        Read the cleaned text file and return its content.

        Encodings are tried from strictest to most permissive. ``latin-1``
        maps every possible byte to a character and therefore never fails,
        so it must come last; otherwise the encodings after it would never
        be tried.

        Returns:
            str | None: Content of the text file, or None if the file does
            not exist or could not be decoded.
        """
        encodings = ['utf-8', 'cp1252', 'latin-1']
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
            except FileNotFoundError:
                # A missing file will not appear by switching encodings;
                # report the real cause instead of a decoding failure.
                print(f"Error: File '{self.text_file_path}' not found.")
                return None
            except UnicodeDecodeError:
                continue
            print(f"Successfully read file using {encoding} encoding.")
            return content
        # Defensive: only reachable if the permissive fallback is removed.
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def generate_transcript(self):
        """
        Generate a podcast-style transcript and save it as a pickled file.

        Returns:
            str | None: Path to the file where the transcript is saved, or
            None if the input text could not be loaded.
        """
        import os

        input_text = self.load_text()
        if input_text is None:
            return None

        # Make sure the destination directory exists *before* the expensive
        # generation step, so a bad output location fails fast instead of
        # discarding a finished transcript.
        output_dir = os.path.dirname(self.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": input_text}
        ]

        output = self.model(
            messages,
            max_new_tokens=8126,
            temperature=1
        )

        # NOTE(review): with chat-style (list of messages) input the pipeline
        # may return the whole conversation as a list of message dicts rather
        # than a plain string -- confirm what the consumer of the pickle
        # expects before changing this.
        transcript = output[0]["generated_text"]

        # Save the transcript as a pickle file
        with open(self.output_path, 'wb') as f:
            pickle.dump(transcript, f)

        return self.output_path