jacklangerman committed on
Commit
dca3e52
·
0 Parent(s):
Files changed (1) hide show
  1. tok2text.py +159 -0
tok2text.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Module setup for the TikTok -> recipe pipeline: dependencies first, then the
# two pieces of global state the functions below rely on (`pyk` configured for
# a browser, and one shared OpenAI `client`).
import os
import time
from pathlib import Path

import moviepy.editor as mp
import pyktok as pyk
import requests
from openai import OpenAI

pyk.specify_browser("chrome")  # Specify the browser to use for accessing TikTok

# One client is enough; the key is read from the OPENAI_API_KEY environment
# variable (previously the client was re-created three times in a row).
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
19
+
20
def download_video(url, video_id, outpath=Path('.')):
    """Download a TikTok video into *outpath*, reusing an earlier download if present.

    Args:
        url: Video URL handed to ``pyk.save_tiktok``.
        video_id: Identifier used to recognise the downloaded ``.mp4`` by file name.
        outpath: Directory that should end up containing the video.

    Returns:
        Path of the video file inside *outpath*.

    Raises:
        FileNotFoundError: If, after the download call, no ``.mp4`` whose name
            contains *video_id* can be found next to *outpath*.
    """
    outpath = Path(outpath)

    def matching_videos(folder):
        # Sorted so the choice is deterministic when several files match.
        return sorted(p for p in folder.glob('*.mp4') if video_id in p.name)

    # A previous run moves the video into outpath under its downloaded name,
    # so look for that first; checking only 'tiktok_video.mp4' would miss it.
    already_there = matching_videos(outpath)
    if already_there:
        print('Video already exists. Skipping download.')
        return already_there[0]

    video_path = outpath / 'tiktok_video.mp4'
    if video_path.exists():
        # Kept for compatibility with directories produced by earlier runs.
        print('Video already exists. Skipping download.')
        return video_path

    # NOTE(review): pyktok's third positional parameter appears to be the
    # metadata file name rather than the video destination -- confirm against
    # the pyktok docs. The call is left exactly as it was.
    pyk.save_tiktok(url, True, str(video_path))

    # The downloaded file is expected beside outpath, named after the video id.
    downloaded = matching_videos(outpath.parent)
    if not downloaded:
        raise FileNotFoundError(
            f'No downloaded .mp4 containing {video_id!r} found in {outpath.parent}'
        )
    new_vid = downloaded[0]
    video_path = outpath / new_vid.name
    new_vid.rename(video_path)
    print('Downloaded video wait...', end='')
    time.sleep(2)
    print('Done')
    return video_path
33
+
34
def extract_audio(path):
    """Write the audio track of the video at *path* to ``tiktok_audio.mp3`` beside it.

    The extraction is skipped when that MP3 already exists.

    Args:
        path: Location of the video file.

    Returns:
        Path of the MP3 file (in the same directory as *path*).

    Raises:
        ValueError: If the video has no audio track.
    """
    audio_path = Path(path).parent / "tiktok_audio.mp3"
    if audio_path.exists():
        print('Audio already exists. Skipping extraction.')
        return audio_path

    print('Extracting audio...', flush=True)
    video = mp.VideoFileClip(str(path))
    try:
        if video.audio is None:
            raise ValueError(f'No audio track found in {path}')
        video.audio.write_audiofile(str(audio_path))
    finally:
        # Always release the clip, even when extraction fails.
        video.close()
    print('Audio extracted', flush=True)
    return audio_path
44
+
45
def transcribe_audio(audio_file_path):
    """Return the transcript of an audio file, caching it as ``transcript.txt``.

    The cache file lives in the same directory as *audio_file_path*. When it
    exists its contents are returned and no request is made; otherwise the
    audio is sent to the ``whisper-1`` model and the text result is stored.

    Args:
        audio_file_path: Location of the audio file.

    Returns:
        The transcript text.
    """
    transcript_path = Path(audio_file_path).parent / 'transcript.txt'
    if transcript_path.exists():
        print('Transcript already exists. Skipping transcription.')
        return transcript_path.read_text(encoding='utf-8')

    print('Sending for transcription...', flush=True)
    with open(audio_file_path, 'rb') as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )
    # Explicit UTF-8 so non-ASCII speech does not depend on the platform default.
    transcript_path.write_text(transcription, encoding='utf-8')
    print('Transcription done', flush=True)
    return transcription
60
+
61
+ # def format_recipe(transcript, output_path):
62
+ # formatted_recipe_path = output_path / 'formatted_recipe.txt'
63
+ # if not formatted_recipe_path.exists():
64
+ # # prompt = f"Please format the following recipe transcript into a nicely formatted recipe:\n\n{transcript}"
65
+ # # response = client.completions.create(engine="text-davinci-003",
66
+ # # prompt=prompt,
67
+ # # max_tokens=500,
68
+ # # n=1,
69
+ # # stop=None,
70
+ # # temperature=0.7)
71
+ # prompt = f"Transcript: \n\n{transcript}\n\n\nRecipe:\n\nTitle: "
72
+ # response = client.completions.create(
73
+ # model="davinci-002",
74
+ # prompt=prompt,
75
+ # max_tokens=500,
76
+ # n=1,
77
+ # stop=None,
78
+ # temperature=0.7,
79
+ # stream=False,
80
+ # )
81
+ # print(response)
82
+
83
+
84
+
85
+ # formatted_recipe = response.choices[0].text.strip()
86
+ # with formatted_recipe_path.open('w') as f:
87
+ # f.write(formatted_recipe)
88
+ # else:
89
+ # print('Formatted recipe already exists. Skipping formatting.')
90
+ # with formatted_recipe_path.open() as f:
91
+ # formatted_recipe = f.read()
92
+ # return formatted_recipe
93
+
94
+
95
+
96
+
97
# System message for the chat model. Fixes two typos from the earlier wording
# ("now additional text or cometary") that reversed the intended instruction.
_RECIPE_SYSTEM_PROMPT = (
    "You are a helpful assistant that turns transcripts of TikTok recipe videos "
    "into nicely formatted recipes. Please output the recipe only and no "
    "additional text or commentary. Each recipe should have exactly three "
    "sections: Title, Ingredients, and Instructions. Make sure to write every "
    "step and ingredient and if you're not sure about something make sure to "
    "write a note in parentheses explaining why you are unsure, and how you "
    "guessed, prepend '≈' to the amount, make the best estimation you can given "
    "the context."
)


def _strip_title_prefix(text):
    """Remove one leading 'Title:' label (and surrounding whitespace) from *text*."""
    text = text.strip()
    # str.lstrip('Title:') would strip any of those *characters*, eating the
    # start of titles such as "Tiramisu"; remove the literal prefix instead.
    if text.startswith('Title:'):
        text = text[len('Title:'):]
    return text.strip()


def format_recipe(transcript, output_path, tiktok_url):
    """Turn a transcript into a formatted recipe, cached as ``formatted_recipe.txt``.

    When the cache file already exists in *output_path* it is returned as-is.
    Otherwise the transcript is sent to the ``gpt-3.5-turbo`` chat model, the
    reply is cleaned up, *tiktok_url* is appended, and the result is saved.

    Args:
        transcript: Transcript text of the recipe video.
        output_path: Directory holding (or receiving) ``formatted_recipe.txt``.
        tiktok_url: Source URL appended to the bottom of a newly built recipe.

    Returns:
        Tuple ``(formatted_recipe, formatted_recipe_path)``.
    """
    formatted_recipe_path = Path(output_path) / 'formatted_recipe.txt'
    if formatted_recipe_path.exists():
        print('Formatted recipe already exists. Skipping formatting.')
        cached = formatted_recipe_path.read_text(encoding='utf-8')
        return cached, formatted_recipe_path

    prompt = f"Please format this recipe transcript into a nicely formatted recipe:\n\n{transcript}"
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": _RECIPE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # The message content can be None; treat that as an empty reply.
    reply = response.choices[0].message.content or ''
    formatted_recipe = _strip_title_prefix(reply)
    formatted_recipe = f'{formatted_recipe}\n\n\n{tiktok_url}'

    # UTF-8 is required: the prompt explicitly asks the model to emit '≈'.
    formatted_recipe_path.write_text(formatted_recipe, encoding='utf-8')
    return formatted_recipe, formatted_recipe_path
123
+
124
+
125
+
126
+
127
+
128
def expand_url(short_url):
    """Resolve a single HTTP redirect for *short_url*.

    One request is made without following redirects. If the response is a
    redirect carrying a ``Location`` header, that target is returned (made
    absolute relative to *short_url*); otherwise *short_url* is returned as-is.

    Args:
        short_url: URL to resolve, e.g. a short share link.

    Returns:
        The redirect target, or *short_url* when there is none.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    # A timeout prevents the script from hanging forever on a stalled connection.
    response = requests.get(short_url, allow_redirects=False, timeout=10)
    location = response.headers.get('Location')
    if response.status_code in (301, 302, 303, 307, 308) and location:
        # Location may be relative; resolve it against the requested URL.
        return urljoin(short_url, location)
    return short_url
134
+
135
def _parse_tiktok_url(url):
    """Return ``(clean_url, video_id)`` for a URL shaped like ``host/<user>/<kind>/<id>``."""
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    candidate = url.strip()
    if '://' not in candidate:
        # urlsplit only recognises the host when a scheme is present.
        candidate = f'https://{candidate}'
    parts = urlsplit(candidate)
    segments = [segment for segment in parts.path.split('/') if segment]
    if not parts.netloc or len(segments) != 3:
        raise ValueError(
            f'Expected a URL like https://www.tiktok.com/@user/video/<id>, got {url!r}'
        )
    video_id = segments[-1]
    # Query string and fragment are dropped; scheme and host are kept intact
    # because this URL is passed on to the downloader and saved with the recipe.
    clean_url = f"{parts.scheme}://{parts.netloc}/{'/'.join(segments)}"
    return clean_url, video_id


def extract_recipe(tiktok_url):
    """Run the whole pipeline for one TikTok URL and return the formatted recipe.

    Steps: expand the (possibly short) URL, derive the video id, then download
    the video, extract its audio, transcribe it and format the transcript. All
    intermediate files are kept in a directory named after the video id.

    Args:
        tiktok_url: Short share link or full video URL.

    Returns:
        Tuple ``(formatted_recipe, formatted_recipe_path)``.

    Raises:
        ValueError: If the expanded URL does not have the expected
            ``host/<user>/<kind>/<id>`` shape.
    """
    tiktok_url, video_id = _parse_tiktok_url(expand_url(tiktok_url))

    output_path = Path(video_id)
    output_path.mkdir(parents=True, exist_ok=True)

    video_path = download_video(tiktok_url, video_id, output_path)
    audio_path = extract_audio(video_path)
    transcript = transcribe_audio(audio_path)
    formatted_recipe, formatted_recipe_path = format_recipe(transcript, output_path, tiktok_url)

    return formatted_recipe, formatted_recipe_path
149
+
150
def main(tiktok_url="https://www.tiktok.com/t/ZTLjYBSpt/"):
    """Extract the recipe for *tiktok_url* and print it.

    Args:
        tiktok_url: Link to process. Defaults to the example short link; a full
            link such as
            "https://www.tiktok.com/@emmaaaaaaam_/video/7348493781961886981"
            is another example input.
    """
    # The second return value (path of the saved recipe) is not needed here.
    formatted_recipe, _ = extract_recipe(tiktok_url)

    print("Formatted Recipe:")
    print(formatted_recipe)


if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default URL;
    # with no arguments the behaviour is unchanged.
    main(*sys.argv[1:2])