Spaces:

Jiangxz01
/

Generated_Podcast_Audio

Running

App Files Files

Jiangxz01 committed on Sep 26, 2024

Commit

e063c54

verified ·

1 Parent(s): fce0934

Upload app.py

Browse files

Files changed (1) hide show

app.py +268 -123

app.py CHANGED Viewed

@@ -37,84 +37,269 @@ class PodcastGenerator:
             gr.Error: 如果 API 金鑰或速率限制出現問題。
         此方法使用 SambaNova API 根據使用者的輸入生成Podcast劇本。
-        它處理語言選擇，使用適當配置設定 AI 模型，並處理生成的響應。
         """
-        # Significantly shorten the system prompt
-        system_prompt = f"""Generate a podcast script with 2 speakers. {language} language. Be concise, engaging, and in JSON format."""
-        example = """{"podcast":[{"speaker":1,"line":"Hello"},{"speaker":2,"line":"Hi there"}]}"""
-        async def generate_chunk(chunk: str) -> str:
-            try:
-                # Calculate the available tokens for generation
-                prompt_tokens = len(chunk.split())
-                system_tokens = len(system_prompt.split())
-                max_tokens = 3000  # Reduced from 4096 to leave more room for the prompt
-                logger.info(f"Sending request to SambaNova API with prompt chunk: {chunk[:100]}...")
-                response = client.chat.completions.create(
-                    model='Meta-Llama-3.1-405B-Instruct',
-                    messages=[
-                        {"role": "system", "content": system_prompt},
-                        {"role": "user", "content": f"Generate a podcast script based on this: {chunk}\nUse this format: {example}"}
-                    ],
-                    temperature=1,
-                    max_tokens=max_tokens
-                )
-                logger.info(f"Received response from API: {response}")
-                if hasattr(response, 'error'):
-                    logger.error(f"API returned an error: {response.error}")
-                    return {"error": f"API error: {response.error.get('message', 'Unknown error')}"}
-                if response.choices and len(response.choices) > 0:
-                    generated_text = response.choices[0].message.content
-                    logger.info(f"Generated text: {generated_text[:100]}...")
-                    return generated_text
-                else:
-                    logger.warning("No content generated from the API")
-                    return {"error": "No content generated from the API"}
-            except Exception as e:
-                logger.error(f"Error generating script chunk: {str(e)}")
-                return {"error": f"Failed to generate podcast script chunk: {str(e)}"}
-        # Split the prompt into smaller chunks
-        chunk_size = 500  # Reduced from 1000
-        chunks = [prompt[i:i+chunk_size] for i in range(0, len(prompt), chunk_size)]
-        # Generate script for each chunk
-        generated_chunks = []
-        for chunk in chunks:
-            result = await generate_chunk(chunk)
-            if isinstance(result, dict) and "error" in result:
-                return result
-            generated_chunks.append(result)
-        # Combine generated chunks
-        generated_text = " ".join(generated_chunks)
-        # Try to parse JSON, if fails then extract dialogue from raw text
         try:
-            parsed_json = json.loads(generated_text)
-            if "podcast" in parsed_json:
-                return parsed_json
             else:
-                raise json.JSONDecodeError("Missing 'podcast' key", generated_text, 0)
         except json.JSONDecodeError:
-            logger.warning("Generated text is not valid JSON or missing 'podcast' key. Attempting to extract dialogue.")
-            lines = generated_text.split('\n')
-            podcast = []
-            current_speaker = 1
-            for line in lines:
-                line = line.strip()
-                if line:
-                    podcast.append({
-                        "speaker": current_speaker,
-                        "line": line
-                    })
-                    current_speaker = 3 - current_speaker  # Switch between 1 and 2
-            return {"podcast": podcast}
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         """
@@ -127,7 +312,7 @@ class PodcastGenerator:
             speaker2 (str): 第二位說話者的語音設定。
         返回：
-            str: 生成的臨時音訊檔案的檔名，或者 None 如果生成失敗。
         此方法使用 Edge TTS 將文字轉換爲語音，並將結果儲存爲臨時音訊檔案。
         根據指定的說話者編號選擇相應的語音設定。
@@ -143,16 +328,11 @@ class PodcastGenerator:
             # 儲存語音檔案
             await speech.save(temp_filename)
             return temp_filename
-        except edge_tts.exceptions.NoAudioReceived:
-            logger.error(f"No audio received for text: '{text[:50]}...' with voice: {voice}")
-            return None
         except Exception as e:
-            logger.error(f"Error generating audio for text: '{text[:50]}...' with voice: {voice}. Error: {str(e)}")
-            return None
-        finally:
-            # 如果檔案存在但生成失敗，刪除臨時檔案
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
     async def combine_audio_files(self, audio_files: List[str]) -> str:
         """
@@ -201,39 +381,16 @@ class PodcastGenerator:
         # 生成Podcast劇本
         gr.Info("Generating podcast script...")
         start_time = time.time()
-        script_result = await self.generate_script(input_text, language, api_key)
         end_time = time.time()
-        if "error" in script_result:
-            gr.Error(f"Failed to generate podcast script: {script_result['error']}")
-            return None
-        if "raw_text" in script_result:
-            gr.Warning("Generated text is not in the expected JSON format. Attempting to process raw text.")
-            # Here you might want to implement a fallback method to process raw text
-            # For now, we'll just return None
-            return None
-        if "podcast" not in script_result:
-            gr.Error("Generated script does not contain a 'podcast' key.")
-            return None
         gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
         # 生成Podcast音訊檔案
         gr.Info("Generating podcast audio files...")
         start_time = time.time()
-        audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in script_result['podcast']])
         end_time = time.time()
-        # Filter out None values (failed TTS generations)
-        audio_files = [file for file in audio_files if file is not None]
-        if not audio_files:
-            gr.Error("Failed to generate any audio files. Please check your language and voice settings.")
-            return None
-        gr.Info(f"Successfully generated {len(audio_files)} out of {len(script_result['podcast'])} audio files in {(end_time - start_time):.2f} seconds!")
         # 合併音訊檔案
         combined_audio = await self.combine_audio_files(audio_files)
@@ -289,6 +446,9 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
     # 定義語音名稱對映
     voice_names = {
         "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
         "Ava - English (United States)": "en-US-AvaMultilingualNeural",
         "Brian - English (United States)": "en-US-BrianMultilingualNeural",
@@ -303,22 +463,10 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
     speaker1 = voice_names[speaker1]
     speaker2 = voice_names[speaker2]
-    # Check if the selected voices are compatible with the chosen language
-    if language != "Auto Detect":
-        if not (speaker1.startswith(language[:2].lower()) and speaker2.startswith(language[:2].lower())):
-            gr.Error(f"Selected voices may not be compatible with the chosen language: {language}")
-            return None
     # 如果提供了輸入檔案，則從檔案中提取文字
     if input_file:
         input_text = await TextExtractor.extract_text(input_file.name)
-    # Limit input text length
-    max_input_length = 3000  # Adjust this value as needed
-    if len(input_text) > max_input_length:
-        input_text = input_text[:max_input_length]
-        gr.Warning(f"Input text was truncated to {max_input_length} characters due to length limitations.")
     # 如果沒有提供API金鑰，則使用環境變數中的金鑰
     if not api_key:
         api_key = os.getenv("Your_API_KEY")
@@ -326,9 +474,6 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
     # 建立PodcastGenerator實例並生成Podcast
     podcast_generator = PodcastGenerator()
     podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
-    if podcast is None:
-        return None
     # 計算總耗時並顯示資訊
     end_time = time.time()

             gr.Error: 如果 API 金鑰或速率限制出現問題。
         此方法使用 SambaNova API 根據使用者的輸入生成Podcast劇本。
+        它處理語言選擇，使用適當的配置設定 AI 模型，並處理生成的響應。
         """
+        # 定義一個示例JSON結構，用於指導AI生成類似格式的Podcast劇本
+        example = """
+        {
+            "topic": "AGI",
+            "podcast": [
+                {
+                    "speaker": 2,
+                    "line": "So, AGI, huh? Seems like everyone's talking about it these days."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Yeah, it's definitely having a moment, isn't it?"
+                },
+                {
+                    "speaker": 2,
+                    "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
+                },
+                {
+                    "speaker": 2,
+                    "line": "No kidding, but let's be real. Sometimes it feels like every other headline is either hyping AGI up as this technological utopia or painting it as our inevitable robot overlords."
+                },
+                {
+                    "speaker": 1,
+                    "line": "It's easy to get lost in the noise, for sure."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Exactly. So how about we try to cut through some of that, shall we?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Sounds like a plan."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Okay, so first things first, AGI, what is it really? And I don't just mean some dictionary definition, we're talking about something way bigger than just a super smart computer, right?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Right, it's not just about more processing power or better algorithms, it's about a fundamental shift in how we think about intelligence itself."
+                },
+                {
+                    "speaker": 2,
+                    "line": "So like, instead of programming a machine for a specific task, we're talking about creating something that can learn and adapt like we do."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Exactly, think of it this way: Right now, we've got AI that can beat a grandmaster at chess but ask that same AI to, say, write a poem or compose a symphony. No chance."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Okay, I see. So, AGI is about bridging that gap, creating something that can move between those different realms of knowledge seamlessly."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Precisely. It's about replicating that uniquely human ability to learn something new and apply that knowledge in completely different contexts and that's a tall order, let me tell you."
+                },
+                {
+                    "speaker": 2,
+                    "line": "I bet. I mean, think about how much we still don't even understand about our own brains."
+                },
+                {
+                    "speaker": 1,
+                    "line": "That's exactly it. We're essentially trying to reverse-engineer something we don't fully comprehend."
+                },
+                {
+                    "speaker": 2,
+                    "line": "And how are researchers even approaching that? What are some of the big ideas out there?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Well, there are a few different schools of thought. One is this idea of neuromorphic computing where they're literally trying to build computer chips that mimic the structure and function of the human brain."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Wow, so like actually replicating the physical architecture of the brain. That's wild."
+                },
+                {
+                    "speaker": 1,
+                    "line": "It's pretty mind-blowing stuff and then you've got folks working on something called whole brain emulation."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Okay, and what's that all about?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "The basic idea there is to create a complete digital copy of a human brain down to the last neuron and synapse and run it on a sufficiently powerful computer simulation."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Hold on, a digital copy of an entire brain, that sounds like something straight out of science fiction."
+                },
+                {
+                    "speaker": 1,
+                    "line": "It does, doesn't it? But it gives you an idea of the kind of ambition we're talking about here and the truth is we're still a long way off from truly achieving AGI, no matter which approach you look at."
+                },
+                {
+                    "speaker": 2,
+                    "line": "That makes sense but it's still exciting to think about the possibilities, even if they're a ways off."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Absolutely and those possibilities are what really get people fired up about AGI, right? Yeah."
+                },
+                {
+                    "speaker": 2,
+                    "line": "For sure. In fact, I remember you mentioning something in that podcast about AGI's potential to revolutionize scientific research. Something about supercharging breakthroughs."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Oh, absolutely. Imagine an AI that doesn't just crunch numbers but actually understands scientific data the way a human researcher does. We're talking about potential breakthroughs in everything from medicine and healthcare to material science and climate change."
+                },
+                {
+                    "speaker": 2,
+                    "line": "It's like giving scientists this incredibly powerful new tool to tackle some of the biggest challenges we face."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Exactly, it could be a total game changer."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Okay, but let's be real, every coin has two sides. What about the potential downsides of AGI? Because it can't all be sunshine and roses, right?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Right, there are definitely valid concerns. Probably the biggest one is the impact on the job market. As AGI gets more sophisticated, there's a real chance it could automate a lot of jobs that are currently done by humans."
+                },
+                {
+                    "speaker": 2,
+                    "line": "So we're not just talking about robots taking over factories but potentially things like, what, legal work, analysis, even creative fields?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Potentially, yes. And that raises a whole host of questions about what happens to those workers, how we retrain them, how we ensure that the benefits of AGI are shared equitably."
+                },
+                {
+                    "speaker": 2,
+                    "line": "Right, because it's not just about the technology itself, but how we choose to integrate it into society."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Absolutely. We need to be having these conversations now about ethics, about regulation, about how to make sure AGI is developed and deployed responsibly."
+                },
+                {
+                    "speaker": 2,
+                    "line": "So it's less about preventing some kind of sci-fi robot apocalypse and more about making sure we're steering this technology in the right direction from the get-go."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Exactly, AGI has the potential to be incredibly beneficial, but it's not going to magically solve all our problems. It's on us to make sure we're using it for good."
+                },
+                {
+                    "speaker": 2,
+                    "line": "It's like you said earlier, it's about shaping the future of intelligence."
+                },
+                {
+                    "speaker": 1,
+                    "line": "I like that. It really is."
+                },
+                {
+                    "speaker": 2,
+                    "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
+                },
+                {
+                    "speaker": 1,
+                    "line": "100%"
+                },
+                {
+                    "speaker": 2,
+                    "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "That's a question worth pondering."
+                },
+                {
+                    "speaker": 2,
+                    "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Peace."
+                }
+            ]
+        }
+        """
+        # 根據使用者選擇的語言設定指令
+        if language == "Auto Detect":
+            language_instruction = "- The podcast MUST be in the same language as the user input."
+        else:
+            language_instruction = f"- The podcast MUST be in {language} language"
+        # 設定系統提示，指導AI如何生成Podcast指令碼
+        system_prompt = f"""
+        You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
+        {language_instruction}
+        - The podcast should have 2 speakers.
+        - The podcast should be long.
+        - Do not use names for the speakers.
+        - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
+        - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
+        - The script must be in JSON format.
+        Follow this example structure carefully:
+        {example}
+        """
+        # 設定使用者提示，包含使用者輸入的內容
+        user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
+        # 配置 SambaNova API client
+        if not api_key:
+            api_key = os.getenv("YOUR_API_TOKEN")
+        client = openai.OpenAI(
+            api_key=api_key,
+            base_url="https://api.sambanova.ai/v1",
+        )
+        # 嘗試生成內容
         try:
+            response = client.chat.completions.create(
+                model='Meta-Llama-3.1-405B-Instruct',
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}
+                ],
+                temperature=1
+            )
+            logger.info(f"API Response: {response}")
+            if response.choices and len(response.choices) > 0:
+                generated_text = response.choices[0].message.content
+            else:
+                logger.warning("No content generated from the API")
+                raise ValueError("No content generated from the API")
+        except Exception as e:
+            logger.error(f"Error generating script: {str(e)}")
+            # 處理可能的錯誤
+            if "API key not valid" in str(e):
+                raise gr.Error("Invalid API key. Please provide a valid SambaNova API key.")
+            elif "rate limit" in str(e).lower():
+                raise gr.Error("Rate limit exceeded for the API key. Please try again later or provide your own SambaNova API key.")
             else:
+                raise gr.Error(f"Failed to generate podcast script: {str(e)}")
+        # 列印生成的Podcast指令碼
+        print(f"Generated podcast script:\n{generated_text}")
+        # 嘗試解析JSON，如果失敗則返回原始文本
+        try:
+            return json.loads(generated_text)
         except json.JSONDecodeError:
+            print("Warning: Generated text is not valid JSON. Returning raw text.")
+            return {"raw_text": generated_text}
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         """
             speaker2 (str): 第二位說話者的語音設定。
         返回：
+            str: 生成的臨時音訊檔案的檔名。
         此方法使用 Edge TTS 將文字轉換爲語音，並將結果儲存爲臨時音訊檔案。
         根據指定的說話者編號選擇相應的語音設定。
             # 儲存語音檔案
             await speech.save(temp_filename)
             return temp_filename
         except Exception as e:
+            # 如果出錯，刪除臨時檔案並丟擲異常
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
+            raise e
     async def combine_audio_files(self, audio_files: List[str]) -> str:
         """
         # 生成Podcast劇本
         gr.Info("Generating podcast script...")
         start_time = time.time()
+        podcast_json = await self.generate_script(input_text, language, api_key)
         end_time = time.time()
         gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
         # 生成Podcast音訊檔案
         gr.Info("Generating podcast audio files...")
         start_time = time.time()
+        audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in podcast_json['podcast']])
         end_time = time.time()
+        gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
         # 合併音訊檔案
         combined_audio = await self.combine_audio_files(audio_files)
     # 定義語音名稱對映
     voice_names = {
+        "臺女1 - Chinese Taiwanese (Taiwan)": "zh-TW-HsiaoChenNeural",
+        "臺女2 - Chinese Taiwanese (Taiwan)": "zh-TW-HsiaoYuNeural",
+        "臺男 - Chinese Taiwanese (Taiwan)": "zh-TW-YunJheNeural",
         "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
         "Ava - English (United States)": "en-US-AvaMultilingualNeural",
         "Brian - English (United States)": "en-US-BrianMultilingualNeural",
     speaker1 = voice_names[speaker1]
     speaker2 = voice_names[speaker2]
     # 如果提供了輸入檔案，則從檔案中提取文字
     if input_file:
         input_text = await TextExtractor.extract_text(input_file.name)
     # 如果沒有提供API金鑰，則使用環境變數中的金鑰
     if not api_key:
         api_key = os.getenv("Your_API_KEY")
     # 建立PodcastGenerator實例並生成Podcast
     podcast_generator = PodcastGenerator()
     podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
     # 計算總耗時並顯示資訊
     end_time = time.time()