Jiangxz01 committed on
Commit
e063c54
·
verified ·
1 Parent(s): fce0934

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +268 -123
app.py CHANGED
@@ -37,84 +37,269 @@ class PodcastGenerator:
37
  gr.Error: 如果 API 金鑰或速率限制出現問題。
38
 
39
  此方法使用 SambaNova API 根據使用者的輸入生成Podcast劇本。
40
- 它處理語言選擇,使用適當配置設定 AI 模型,並處理生成的響應。
41
  """
42
- # Significantly shorten the system prompt
43
- system_prompt = f"""Generate a podcast script with 2 speakers. {language} language. Be concise, engaging, and in JSON format."""
44
-
45
- example = """{"podcast":[{"speaker":1,"line":"Hello"},{"speaker":2,"line":"Hi there"}]}"""
46
-
47
- async def generate_chunk(chunk: str) -> str:
48
- try:
49
- # Calculate the available tokens for generation
50
- prompt_tokens = len(chunk.split())
51
- system_tokens = len(system_prompt.split())
52
- max_tokens = 3000 # Reduced from 4096 to leave more room for the prompt
53
-
54
- logger.info(f"Sending request to SambaNova API with prompt chunk: {chunk[:100]}...")
55
- response = client.chat.completions.create(
56
- model='Meta-Llama-3.1-405B-Instruct',
57
- messages=[
58
- {"role": "system", "content": system_prompt},
59
- {"role": "user", "content": f"Generate a podcast script based on this: {chunk}\nUse this format: {example}"}
60
- ],
61
- temperature=1,
62
- max_tokens=max_tokens
63
- )
64
- logger.info(f"Received response from API: {response}")
65
-
66
- if hasattr(response, 'error'):
67
- logger.error(f"API returned an error: {response.error}")
68
- return {"error": f"API error: {response.error.get('message', 'Unknown error')}"}
69
-
70
- if response.choices and len(response.choices) > 0:
71
- generated_text = response.choices[0].message.content
72
- logger.info(f"Generated text: {generated_text[:100]}...")
73
- return generated_text
74
- else:
75
- logger.warning("No content generated from the API")
76
- return {"error": "No content generated from the API"}
77
-
78
- except Exception as e:
79
- logger.error(f"Error generating script chunk: {str(e)}")
80
- return {"error": f"Failed to generate podcast script chunk: {str(e)}"}
81
-
82
- # Split the prompt into smaller chunks
83
- chunk_size = 500 # Reduced from 1000
84
- chunks = [prompt[i:i+chunk_size] for i in range(0, len(prompt), chunk_size)]
85
-
86
- # Generate script for each chunk
87
- generated_chunks = []
88
- for chunk in chunks:
89
- result = await generate_chunk(chunk)
90
- if isinstance(result, dict) and "error" in result:
91
- return result
92
- generated_chunks.append(result)
93
-
94
- # Combine generated chunks
95
- generated_text = " ".join(generated_chunks)
96
-
97
- # Try to parse JSON, if fails then extract dialogue from raw text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  try:
99
- parsed_json = json.loads(generated_text)
100
- if "podcast" in parsed_json:
101
- return parsed_json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  else:
103
- raise json.JSONDecodeError("Missing 'podcast' key", generated_text, 0)
 
 
 
 
 
 
 
104
  except json.JSONDecodeError:
105
- logger.warning("Generated text is not valid JSON or missing 'podcast' key. Attempting to extract dialogue.")
106
- lines = generated_text.split('\n')
107
- podcast = []
108
- current_speaker = 1
109
- for line in lines:
110
- line = line.strip()
111
- if line:
112
- podcast.append({
113
- "speaker": current_speaker,
114
- "line": line
115
- })
116
- current_speaker = 3 - current_speaker # Switch between 1 and 2
117
- return {"podcast": podcast}
118
 
119
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
120
  """
@@ -127,7 +312,7 @@ class PodcastGenerator:
127
  speaker2 (str): 第二位說話者的語音設定。
128
 
129
  返回:
130
- str: 生成的臨時音訊檔案的檔名,或者 None 如果生成失敗。
131
 
132
  此方法使用 Edge TTS 將文字轉換爲語音,並將結果儲存爲臨時音訊檔案。
133
  根據指定的說話者編號選擇相應的語音設定。
@@ -143,16 +328,11 @@ class PodcastGenerator:
143
  # 儲存語音檔案
144
  await speech.save(temp_filename)
145
  return temp_filename
146
- except edge_tts.exceptions.NoAudioReceived:
147
- logger.error(f"No audio received for text: '{text[:50]}...' with voice: {voice}")
148
- return None
149
  except Exception as e:
150
- logger.error(f"Error generating audio for text: '{text[:50]}...' with voice: {voice}. Error: {str(e)}")
151
- return None
152
- finally:
153
- # 如果檔案存在但生成失敗,刪除臨時檔案
154
  if os.path.exists(temp_filename):
155
  os.remove(temp_filename)
 
156
 
157
  async def combine_audio_files(self, audio_files: List[str]) -> str:
158
  """
@@ -201,39 +381,16 @@ class PodcastGenerator:
201
  # 生成Podcast劇本
202
  gr.Info("Generating podcast script...")
203
  start_time = time.time()
204
- script_result = await self.generate_script(input_text, language, api_key)
205
  end_time = time.time()
206
-
207
- if "error" in script_result:
208
- gr.Error(f"Failed to generate podcast script: {script_result['error']}")
209
- return None
210
-
211
- if "raw_text" in script_result:
212
- gr.Warning("Generated text is not in the expected JSON format. Attempting to process raw text.")
213
- # Here you might want to implement a fallback method to process raw text
214
- # For now, we'll just return None
215
- return None
216
-
217
- if "podcast" not in script_result:
218
- gr.Error("Generated script does not contain a 'podcast' key.")
219
- return None
220
-
221
  gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
222
 
223
  # 生成Podcast音訊檔案
224
  gr.Info("Generating podcast audio files...")
225
  start_time = time.time()
226
- audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in script_result['podcast']])
227
  end_time = time.time()
228
-
229
- # Filter out None values (failed TTS generations)
230
- audio_files = [file for file in audio_files if file is not None]
231
-
232
- if not audio_files:
233
- gr.Error("Failed to generate any audio files. Please check your language and voice settings.")
234
- return None
235
-
236
- gr.Info(f"Successfully generated {len(audio_files)} out of {len(script_result['podcast'])} audio files in {(end_time - start_time):.2f} seconds!")
237
 
238
  # 合併音訊檔案
239
  combined_audio = await self.combine_audio_files(audio_files)
@@ -289,6 +446,9 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
289
 
290
  # 定義語音名稱對映
291
  voice_names = {
 
 
 
292
  "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
293
  "Ava - English (United States)": "en-US-AvaMultilingualNeural",
294
  "Brian - English (United States)": "en-US-BrianMultilingualNeural",
@@ -303,22 +463,10 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
303
  speaker1 = voice_names[speaker1]
304
  speaker2 = voice_names[speaker2]
305
 
306
- # Check if the selected voices are compatible with the chosen language
307
- if language != "Auto Detect":
308
- if not (speaker1.startswith(language[:2].lower()) and speaker2.startswith(language[:2].lower())):
309
- gr.Error(f"Selected voices may not be compatible with the chosen language: {language}")
310
- return None
311
-
312
  # 如果提供了輸入檔案,則從檔案中提取文字
313
  if input_file:
314
  input_text = await TextExtractor.extract_text(input_file.name)
315
 
316
- # Limit input text length
317
- max_input_length = 3000 # Adjust this value as needed
318
- if len(input_text) > max_input_length:
319
- input_text = input_text[:max_input_length]
320
- gr.Warning(f"Input text was truncated to {max_input_length} characters due to length limitations.")
321
-
322
  # 如果沒有提供API金鑰,則使用環境變數中的金鑰
323
  if not api_key:
324
  api_key = os.getenv("Your_API_KEY")
@@ -326,9 +474,6 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
326
  # 建立PodcastGenerator實例並生成Podcast
327
  podcast_generator = PodcastGenerator()
328
  podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
329
-
330
- if podcast is None:
331
- return None
332
 
333
  # 計算總耗時並顯示資訊
334
  end_time = time.time()
 
37
  gr.Error: 如果 API 金鑰或速率限制出現問題。
38
 
39
  此方法使用 SambaNova API 根據使用者的輸入生成Podcast劇本。
40
+ 它處理語言選擇,使用適當的配置設定 AI 模型,並處理生成的響應。
41
  """
42
+ # 定義一個示例JSON結構,用於指導AI生成類似格式的Podcast劇本
43
+ example = """
44
+ {
45
+ "topic": "AGI",
46
+ "podcast": [
47
+ {
48
+ "speaker": 2,
49
+ "line": "So, AGI, huh? Seems like everyone's talking about it these days."
50
+ },
51
+ {
52
+ "speaker": 1,
53
+ "line": "Yeah, it's definitely having a moment, isn't it?"
54
+ },
55
+ {
56
+ "speaker": 2,
57
+ "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
58
+ },
59
+ {
60
+ "speaker": 1,
61
+ "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
62
+ },
63
+ {
64
+ "speaker": 2,
65
+ "line": "No kidding, but let's be real. Sometimes it feels like every other headline is either hyping AGI up as this technological utopia or painting it as our inevitable robot overlords."
66
+ },
67
+ {
68
+ "speaker": 1,
69
+ "line": "It's easy to get lost in the noise, for sure."
70
+ },
71
+ {
72
+ "speaker": 2,
73
+ "line": "Exactly. So how about we try to cut through some of that, shall we?"
74
+ },
75
+ {
76
+ "speaker": 1,
77
+ "line": "Sounds like a plan."
78
+ },
79
+ {
80
+ "speaker": 2,
81
+ "line": "Okay, so first things first, AGI, what is it really? And I don't just mean some dictionary definition, we're talking about something way bigger than just a super smart computer, right?"
82
+ },
83
+ {
84
+ "speaker": 1,
85
+ "line": "Right, it's not just about more processing power or better algorithms, it's about a fundamental shift in how we think about intelligence itself."
86
+ },
87
+ {
88
+ "speaker": 2,
89
+ "line": "So like, instead of programming a machine for a specific task, we're talking about creating something that can learn and adapt like we do."
90
+ },
91
+ {
92
+ "speaker": 1,
93
+ "line": "Exactly, think of it this way: Right now, we've got AI that can beat a grandmaster at chess but ask that same AI to, say, write a poem or compose a symphony. No chance."
94
+ },
95
+ {
96
+ "speaker": 2,
97
+ "line": "Okay, I see. So, AGI is about bridging that gap, creating something that can move between those different realms of knowledge seamlessly."
98
+ },
99
+ {
100
+ "speaker": 1,
101
+ "line": "Precisely. It's about replicating that uniquely human ability to learn something new and apply that knowledge in completely different contexts and that's a tall order, let me tell you."
102
+ },
103
+ {
104
+ "speaker": 2,
105
+ "line": "I bet. I mean, think about how much we still don't even understand about our own brains."
106
+ },
107
+ {
108
+ "speaker": 1,
109
+ "line": "That's exactly it. We're essentially trying to reverse-engineer something we don't fully comprehend."
110
+ },
111
+ {
112
+ "speaker": 2,
113
+ "line": "And how are researchers even approaching that? What are some of the big ideas out there?"
114
+ },
115
+ {
116
+ "speaker": 1,
117
+ "line": "Well, there are a few different schools of thought. One is this idea of neuromorphic computing where they're literally trying to build computer chips that mimic the structure and function of the human brain."
118
+ },
119
+ {
120
+ "speaker": 2,
121
+ "line": "Wow, so like actually replicating the physical architecture of the brain. That's wild."
122
+ },
123
+ {
124
+ "speaker": 1,
125
+ "line": "It's pretty mind-blowing stuff and then you've got folks working on something called whole brain emulation."
126
+ },
127
+ {
128
+ "speaker": 2,
129
+ "line": "Okay, and what's that all about?"
130
+ },
131
+ {
132
+ "speaker": 1,
133
+ "line": "The basic idea there is to create a complete digital copy of a human brain down to the last neuron and synapse and run it on a sufficiently powerful computer simulation."
134
+ },
135
+ {
136
+ "speaker": 2,
137
+ "line": "Hold on, a digital copy of an entire brain, that sounds like something straight out of science fiction."
138
+ },
139
+ {
140
+ "speaker": 1,
141
+ "line": "It does, doesn't it? But it gives you an idea of the kind of ambition we're talking about here and the truth is we're still a long way off from truly achieving AGI, no matter which approach you look at."
142
+ },
143
+ {
144
+ "speaker": 2,
145
+ "line": "That makes sense but it's still exciting to think about the possibilities, even if they're a ways off."
146
+ },
147
+ {
148
+ "speaker": 1,
149
+ "line": "Absolutely and those possibilities are what really get people fired up about AGI, right? Yeah."
150
+ },
151
+ {
152
+ "speaker": 2,
153
+ "line": "For sure. In fact, I remember you mentioning something in that podcast about AGI's potential to revolutionize scientific research. Something about supercharging breakthroughs."
154
+ },
155
+ {
156
+ "speaker": 1,
157
+ "line": "Oh, absolutely. Imagine an AI that doesn't just crunch numbers but actually understands scientific data the way a human researcher does. We're talking about potential breakthroughs in everything from medicine and healthcare to material science and climate change."
158
+ },
159
+ {
160
+ "speaker": 2,
161
+ "line": "It's like giving scientists this incredibly powerful new tool to tackle some of the biggest challenges we face."
162
+ },
163
+ {
164
+ "speaker": 1,
165
+ "line": "Exactly, it could be a total game changer."
166
+ },
167
+ {
168
+ "speaker": 2,
169
+ "line": "Okay, but let's be real, every coin has two sides. What about the potential downsides of AGI? Because it can't all be sunshine and roses, right?"
170
+ },
171
+ {
172
+ "speaker": 1,
173
+ "line": "Right, there are definitely valid concerns. Probably the biggest one is the impact on the job market. As AGI gets more sophisticated, there's a real chance it could automate a lot of jobs that are currently done by humans."
174
+ },
175
+ {
176
+ "speaker": 2,
177
+ "line": "So we're not just talking about robots taking over factories but potentially things like, what, legal work, analysis, even creative fields?"
178
+ },
179
+ {
180
+ "speaker": 1,
181
+ "line": "Potentially, yes. And that raises a whole host of questions about what happens to those workers, how we retrain them, how we ensure that the benefits of AGI are shared equitably."
182
+ },
183
+ {
184
+ "speaker": 2,
185
+ "line": "Right, because it's not just about the technology itself, but how we choose to integrate it into society."
186
+ },
187
+ {
188
+ "speaker": 1,
189
+ "line": "Absolutely. We need to be having these conversations now about ethics, about regulation, about how to make sure AGI is developed and deployed responsibly."
190
+ },
191
+ {
192
+ "speaker": 2,
193
+ "line": "So it's less about preventing some kind of sci-fi robot apocalypse and more about making sure we're steering this technology in the right direction from the get-go."
194
+ },
195
+ {
196
+ "speaker": 1,
197
+ "line": "Exactly, AGI has the potential to be incredibly beneficial, but it's not going to magically solve all our problems. It's on us to make sure we're using it for good."
198
+ },
199
+ {
200
+ "speaker": 2,
201
+ "line": "It's like you said earlier, it's about shaping the future of intelligence."
202
+ },
203
+ {
204
+ "speaker": 1,
205
+ "line": "I like that. It really is."
206
+ },
207
+ {
208
+ "speaker": 2,
209
+ "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
210
+ },
211
+ {
212
+ "speaker": 1,
213
+ "line": "100%"
214
+ },
215
+ {
216
+ "speaker": 2,
217
+ "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
218
+ },
219
+ {
220
+ "speaker": 1,
221
+ "line": "That's a question worth pondering."
222
+ },
223
+ {
224
+ "speaker": 2,
225
+ "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
226
+ },
227
+ {
228
+ "speaker": 1,
229
+ "line": "Peace."
230
+ }
231
+ ]
232
+ }
233
+ """
234
+
235
+ # 根據使用者選擇的語言設定指令
236
+ if language == "Auto Detect":
237
+ language_instruction = "- The podcast MUST be in the same language as the user input."
238
+ else:
239
+ language_instruction = f"- The podcast MUST be in {language} language"
240
+
241
+ # 設定系統提示,指導AI如何生成Podcast指令碼
242
+ system_prompt = f"""
243
+ You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
244
+ {language_instruction}
245
+ - The podcast should have 2 speakers.
246
+ - The podcast should be long.
247
+ - Do not use names for the speakers.
248
+ - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
249
+ - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
250
+ - The script must be in JSON format.
251
+ Follow this example structure carefully:
252
+ {example}
253
+ """
254
+
255
+ # 設定使用者提示,包含使用者輸入的內容
256
+ user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
257
+
258
+ # 配置 SambaNova API client
259
+ if not api_key:
260
+ api_key = os.getenv("YOUR_API_TOKEN")
261
+ client = openai.OpenAI(
262
+ api_key=api_key,
263
+ base_url="https://api.sambanova.ai/v1",
264
+ )
265
+
266
+ # 嘗試生成內容
267
  try:
268
+ response = client.chat.completions.create(
269
+ model='Meta-Llama-3.1-405B-Instruct',
270
+ messages=[
271
+ {"role": "system", "content": system_prompt},
272
+ {"role": "user", "content": user_prompt}
273
+ ],
274
+ temperature=1
275
+ )
276
+ logger.info(f"API Response: {response}")
277
+
278
+ if response.choices and len(response.choices) > 0:
279
+ generated_text = response.choices[0].message.content
280
+ else:
281
+ logger.warning("No content generated from the API")
282
+ raise ValueError("No content generated from the API")
283
+
284
+ except Exception as e:
285
+ logger.error(f"Error generating script: {str(e)}")
286
+ # 處理可能的錯誤
287
+ if "API key not valid" in str(e):
288
+ raise gr.Error("Invalid API key. Please provide a valid SambaNova API key.")
289
+ elif "rate limit" in str(e).lower():
290
+ raise gr.Error("Rate limit exceeded for the API key. Please try again later or provide your own SambaNova API key.")
291
  else:
292
+ raise gr.Error(f"Failed to generate podcast script: {str(e)}")
293
+
294
+ # 列印生成的Podcast指令碼
295
+ print(f"Generated podcast script:\n{generated_text}")
296
+
297
+ # 嘗試解析JSON,如果失敗則返回原始文本
298
+ try:
299
+ return json.loads(generated_text)
300
  except json.JSONDecodeError:
301
+ print("Warning: Generated text is not valid JSON. Returning raw text.")
302
+ return {"raw_text": generated_text}
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
305
  """
 
312
  speaker2 (str): 第二位說話者的語音設定。
313
 
314
  返回:
315
+ str: 生成的臨時音訊檔案的檔名。
316
 
317
  此方法使用 Edge TTS 將文字轉換爲語音,並將結果儲存爲臨時音訊檔案。
318
  根據指定的說話者編號選擇相應的語音設定。
 
328
  # 儲存語音檔案
329
  await speech.save(temp_filename)
330
  return temp_filename
 
 
 
331
  except Exception as e:
332
+ # 如果出錯,刪除臨時檔案並丟擲異常
 
 
 
333
  if os.path.exists(temp_filename):
334
  os.remove(temp_filename)
335
+ raise e
336
 
337
  async def combine_audio_files(self, audio_files: List[str]) -> str:
338
  """
 
381
  # 生成Podcast劇本
382
  gr.Info("Generating podcast script...")
383
  start_time = time.time()
384
+ podcast_json = await self.generate_script(input_text, language, api_key)
385
  end_time = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
387
 
388
  # 生成Podcast音訊檔案
389
  gr.Info("Generating podcast audio files...")
390
  start_time = time.time()
391
+ audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in podcast_json['podcast']])
392
  end_time = time.time()
393
+ gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
 
 
 
 
 
 
 
 
394
 
395
  # 合併音訊檔案
396
  combined_audio = await self.combine_audio_files(audio_files)
 
446
 
447
  # 定義語音名稱對映
448
  voice_names = {
449
+ "臺女1 - Chinese Taiwanese (Taiwan)": "zh-TW-HsiaoChenNeural",
450
+ "臺女2 - Chinese Taiwanese (Taiwan)": "zh-TW-HsiaoYuNeural",
451
+ "臺男 - Chinese Taiwanese (Taiwan)": "zh-TW-YunJheNeural",
452
  "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
453
  "Ava - English (United States)": "en-US-AvaMultilingualNeural",
454
  "Brian - English (United States)": "en-US-BrianMultilingualNeural",
 
463
  speaker1 = voice_names[speaker1]
464
  speaker2 = voice_names[speaker2]
465
 
 
 
 
 
 
 
466
  # 如果提供了輸入檔案,則從檔案中提取文字
467
  if input_file:
468
  input_text = await TextExtractor.extract_text(input_file.name)
469
 
 
 
 
 
 
 
470
  # 如果沒有提供API金鑰,則使用環境變數中的金鑰
471
  if not api_key:
472
  api_key = os.getenv("Your_API_KEY")
 
474
  # 建立PodcastGenerator實例並生成Podcast
475
  podcast_generator = PodcastGenerator()
476
  podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
 
 
 
477
 
478
  # 計算總耗時並顯示資訊
479
  end_time = time.time()