Spaces:

Jiangxz01
/

Generated_Podcast_Audio

Running

App Files Files

Jiangxz01 committed on Sep 26, 2024

Commit

bd4b0d1

verified ·

1 Parent(s): 805d359

Upload app.py

Browse files

Files changed (1) hide show

app.py +376 -87

app.py CHANGED Viewed

@@ -1,87 +1,376 @@
-# -*- coding: utf-8 -*-
-# 財政部財政資訊中心 江信宗
-import gradio as gr
-import openai
-import os
-MODEL = "Meta-Llama-3.1-405B-Instruct"
-def create_client(api_key=None):
-    if api_key:
-        openai.api_key = api_key
-    else:
-        openai.api_key = os.getenv("YOUR_API_TOKEN")
-    return openai.OpenAI(api_key=openai.api_key, base_url="https://api.sambanova.ai/v1")
-def generate_response(input_text):
-    system_prompt = """你的任務是將提供的輸入文字轉換為一個引人入勝、訊息豐富且專業的Podcast對話。輸入文字可能會比較混亂或結構不完整，因為它可能來自不同來源，如PDF檔案或文字檔等。不要擔心格式問題或任何不相關的訊息；你的目標是提取可以在Podcast中討論的關鍵點、識別重要定義，並突出有趣的事實。
-以下是你將要處理的輸入文字：
-<input_text>
-{input_text}
-</input_text>
-首先，仔細閱讀輸入文字，找出主要話題、關鍵點，以及任何有趣的事實或軼事。思考如何將這些訊息以一種有趣且吸引人的方式呈現出來，適合高質量的音訊Podcast。
-<scratchpad>
-頭腦風暴一些創造性的方法來討論你在輸入文字中識別出的主要話題、關鍵點及任何有趣的事實或軼事。可以考慮使用類比、講故事技巧或假設情境來讓內容對聽眾更加貼近和有趣。
-請記住，你的Podcast應當易於普通聽眾理解，所以避免使用過多的專業術語或假設聽眾對該話題已有瞭解。如有必要，請思考如何用簡單的術語簡要解釋任何複雜的概念。
-利用你的想像力填補輸入文字中的任何空白，或者想出一些值得探討與發人深省的問題，以供Podcast討論。目標是創造一個訊息豐富且娛樂性強的對話，因此可以在你的方法上大膽自由發揮創意。
-將你的頭腦風暴想法和Podcast對話的粗略大綱寫在這裡。確保記錄下你希望在結尾重申的主要見解和要點。
-</scratchpad>
-現在你已經進行了頭腦風暴並建立了一個粗略的大綱，是時候撰寫實際的Podcast對話了。目標是主持人(speaker1)與嘉賓(speaker2)之間自然、對話式的交流。融入你在頭腦風暴中得出的最佳想法，並確保將任何複雜話題以易於理解的方式解釋清楚。
-- The podcast should have 2 speakers.
-- Use english names for the speakers.
-- The podcast should be long.
-- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
-- The script must be in JSON format.
-Follow this example structure:
-```
-{
-  "speaker1": "歡迎收聽財資歐北共Podcast，我是主持人XXX，今天我們邀請到XX專家XXX，百忙之中來上我們節目",
-  "speaker2": "大家好，我是XXX，很榮幸能來跟大家一起聊聊",
-  "speaker1": "今天我們將探討一個非常有趣的話題……",
-  "speaker2": "沒錯，這個話題確實讓人著迷，讓我們先從……開始說起吧……",
-  …………
-  "speaker1": "謝謝嘉賓的分享，歡迎訂閱來許願Podcast節目喔，我們下次再見"
-}
-```
-<podcast_dialogue>
-根據你在頭腦風暴階段提出的關鍵點和創造性想法，撰寫你的引人入勝、訊息豐富的Podcast對話。採用對話式的語氣，並包括任何必要的上下文或解釋，使內容對一般聽眾而言容易理解。使用虛構的主持人和嘉賓名字，以營造更吸引人和身臨其境的聆聽體驗。不要包括像[主持人]或[嘉賓]這樣的括號預留位置。設計你的輸出內容以供直接朗讀——它將直接轉換為音訊。
-確保對話儘可能詳細、完整，同時保持在主題之內並維持吸引人的流暢性。目標是使用你的全部輸出容量，建立儘可能長的Podcast節目，同時以有趣的方式傳遞輸入文字中的關鍵訊息。
-在對話結束時，讓主持人和嘉賓自然總結他們討論中的主要見解和要點。這應當是對話的隨機部分，以自然隨意而非明顯的總結——目的是在結束前最後一次以自然流暢的方式強化核心思想。最終以感謝詞結束。
-</podcast_dialogue>
-"""
-    client = create_client()
-    response = client.chat.completions.create(
-        model=MODEL,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": input_text}
-        ],
-        temperature=1
-    )
-    return response.choices[0].message.content
-with gr.Blocks(theme=gr.themes.Monochrome()) as iface:
-    gr.Markdown("# 🎙️ Generated Podcast Audio. Deployed by 江信宗")
-    input_text = gr.Textbox(label="請輸入您的文字")
-    output_text = gr.Textbox(label="生成的結果")
-    generate_button = gr.Button("生成")
-    generate_button.click(fn=generate_response, inputs=input_text, outputs=output_text)
-if __name__ == "__main__":
-    if "SPACE_ID" in os.environ:
-        iface.launch()
-    else:
-        iface.launch(share=True, show_api=False)

+import gradio as gr
+from pydub import AudioSegment
+import google.generativeai as genai
+from google.generativeai.types import HarmCategory, HarmBlockThreshold
+import json
+import uuid
+import io
+import edge_tts
+import asyncio
+import aiofiles
+import pypdf
+import os
+import time
+from typing import List, Dict, Tuple
+import openai
+class PodcastGenerator:
+    """Generate a two-speaker podcast: script via the SambaNova API, speech via Edge TTS."""
+    def __init__(self):
+        pass
+    @staticmethod
+    def _parse_script(generated_text: str) -> Dict:
+        """
+        Extract and validate the JSON podcast script from the raw model reply.
+        Args:
+            generated_text (str): Raw text returned by the model.
+        Returns:
+            Dict: Parsed script containing a non-empty "podcast" list.
+        Raises:
+            gr.Error: If no JSON object is found, it is malformed, or it has no "podcast" list.
+        """
+        # Models often wrap JSON in markdown fences or add prose around it,
+        # so parse only the outermost {...} span instead of the whole reply.
+        start = generated_text.find("{")
+        end = generated_text.rfind("}")
+        if start == -1 or end <= start:
+            raise gr.Error("The model did not return a JSON podcast script. Please try again.")
+        try:
+            script = json.loads(generated_text[start:end + 1])
+        except json.JSONDecodeError as e:
+            raise gr.Error(f"The model returned a malformed podcast script: {e}") from e
+        if not isinstance(script, dict) or not isinstance(script.get("podcast"), list) or not script["podcast"]:
+            raise gr.Error("The generated podcast script contains no dialogue. Please try again.")
+        return script
+    async def generate_script(self, prompt: str, language: str, api_key: str) -> Dict:
+        """
+        Asynchronously generate a podcast script for the given prompt and language.
+        Args:
+            prompt (str): User input text the script is based on.
+            language (str): Desired script language, or "Auto Detect" to follow the input.
+            api_key (str): SambaNova API key; when empty, the YOUR_API_TOKEN
+                environment variable is used instead.
+        Returns:
+            Dict: The generated podcast script parsed from the model's JSON reply.
+        Raises:
+            gr.Error: If no API key is available, the key is rejected, the rate
+                limit is hit, the request fails, or the reply is not a usable script.
+        """
+        # Example JSON structure that guides the model toward the expected output format
+        example = """
+        {
+            "topic": "AGI",
+            "podcast": [
+                {
+                    "speaker": 2,
+                    "line": "So, AGI, huh? Seems like everyone's talking about it these days."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Yeah, it's definitely having a moment, isn't it?"
+                },
+                {
+                    "speaker": 2,
+                    "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
+                },
+                ......
+                {
+                    "speaker": 2,
+                    "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
+                },
+                {
+                    "speaker": 1,
+                    "line": "That's a question worth pondering."
+                },
+                {
+                    "speaker": 2,
+                    "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
+                },
+                {
+                    "speaker": 1,
+                    "line": "Peace."
+                }
+            ]
+        }
+        """
+        # Pick the language instruction from the user's selection
+        if language == "Auto Detect":
+            language_instruction = "- The podcast MUST be in the same language as the user input."
+        else:
+            language_instruction = f"- The podcast MUST be in {language} language"
+        # System prompt telling the model how to write the podcast script
+        system_prompt = f"""
+        You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
+        {language_instruction}
+        - The podcast should have 2 speakers.
+        - The podcast should be long.
+        - Do not use names for the speakers.
+        - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
+        - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
+        - The script must be in JSON format.
+        Follow this example structure carefully:
+        {example}
+        """
+        # User prompt carrying the user's content
+        user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
+        # Resolve the key locally: writing a user-supplied key to the module-global
+        # openai.api_key would share it across concurrent requests.
+        resolved_key = api_key or os.getenv("YOUR_API_TOKEN")
+        if not resolved_key:
+            raise gr.Error("No SambaNova API key available. Please provide your own SambaNova API key.")
+        # Async client so the request does not block the event loop
+        client = openai.AsyncOpenAI(
+            api_key=resolved_key,
+            base_url="https://api.sambanova.ai/v1",
+        )
+        try:
+            response = await client.chat.completions.create(
+                model='Meta-Llama-3.1-405B-Instruct',
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}
+                ],
+                temperature=1,
+                max_tokens=4096
+            )
+            generated_text = response.choices[0].message.content or ""
+        except Exception as e:
+            # Prefer the client's typed errors; keep the message checks as a fallback
+            message = str(e)
+            if isinstance(e, openai.AuthenticationError) or "API key not valid" in message:
+                raise gr.Error("Invalid API key. Please provide a valid SambaNova API key.") from e
+            if isinstance(e, openai.RateLimitError) or "rate limit" in message.lower():
+                raise gr.Error("Rate limit exceeded for the API key. Please try again later or provide your own SambaNova API key.") from e
+            raise gr.Error(f"Failed to generate podcast script: {e}") from e
+        # Log the raw script for debugging
+        print(f"Generated podcast script:\n{generated_text}")
+        return self._parse_script(generated_text)
+    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
+        """
+        Asynchronously synthesize one line of dialogue to a temporary audio file.
+        Args:
+            text (str): Text to convert to speech.
+            speaker (int): Speaker number; 1 selects speaker1, anything else speaker2.
+            speaker1 (str): Edge TTS voice name for the first speaker.
+            speaker2 (str): Edge TTS voice name for the second speaker.
+        Returns:
+            str: File name of the generated temporary audio file.
+        Raises:
+            Exception: Whatever Edge TTS raises; the partial temp file is removed first.
+        """
+        # Accept the speaker number as an int or as the string "1"
+        voice = speaker1 if speaker in (1, "1") else speaker2
+        speech = edge_tts.Communicate(text, voice)
+        # Edge TTS writes MP3 data, so name the file accordingly; a ".wav" name makes
+        # pydub attempt (and fail) a WAV parse before falling back to ffmpeg.
+        temp_filename = f"temp_{uuid.uuid4()}.mp3"
+        try:
+            await speech.save(temp_filename)
+            return temp_filename
+        except Exception:
+            # Do not leave a partial file behind on failure
+            if os.path.exists(temp_filename):
+                os.remove(temp_filename)
+            raise
+    async def combine_audio_files(self, audio_files: List[str]) -> str:
+        """
+        Concatenate audio files into one WAV file and delete the inputs.
+        Args:
+            audio_files (List[str]): Paths of the audio files, in playback order.
+        Returns:
+            str: File name of the combined WAV file.
+        """
+        combined_audio = AudioSegment.empty()
+        try:
+            for audio_file in audio_files:
+                combined_audio += AudioSegment.from_file(audio_file)
+        finally:
+            # Remove every temp file even if decoding one of them failed
+            for audio_file in audio_files:
+                if os.path.exists(audio_file):
+                    os.remove(audio_file)
+        output_filename = f"output_{uuid.uuid4()}.wav"
+        combined_audio.export(output_filename, format="wav")
+        return output_filename
+    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str) -> str:
+        """
+        Asynchronously generate the podcast audio file.
+        Args:
+            input_text (str): Input text the podcast script is generated from.
+            language (str): Language of the podcast.
+            speaker1 (str): Edge TTS voice name for the first speaker.
+            speaker2 (str): Edge TTS voice name for the second speaker.
+            api_key (str): SambaNova API key passed through to generate_script.
+        Returns:
+            str: File name of the generated podcast audio file.
+        Raises:
+            gr.Error: If script generation fails, the script has no speakable
+                lines, or speech synthesis fails for any line.
+        Steps:
+        1. generate_script produces the podcast script.
+        2. tts_generate synthesizes every line concurrently.
+        3. combine_audio_files merges the clips into one file.
+        Progress and the duration of each step are reported through gr.Info.
+        """
+        # Generate the podcast script
+        gr.Info("Generating podcast script...")
+        start_time = time.time()
+        podcast_json = await self.generate_script(input_text, language, api_key)
+        end_time = time.time()
+        gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
+        # Keep only entries that carry text to speak
+        dialogue = [
+            item for item in podcast_json['podcast']
+            if isinstance(item, dict) and str(item.get('line') or '').strip()
+        ]
+        if not dialogue:
+            raise gr.Error("The generated podcast script contains no dialogue. Please try again.")
+        # Synthesize all lines concurrently
+        gr.Info("Generating podcast audio files...")
+        start_time = time.time()
+        results = await asyncio.gather(
+            *[self.tts_generate(str(item['line']), item.get('speaker'), speaker1, speaker2) for item in dialogue],
+            return_exceptions=True,
+        )
+        failures = [result for result in results if isinstance(result, BaseException)]
+        if failures:
+            # Clean up the clips that did succeed so they do not accumulate on disk
+            for result in results:
+                if isinstance(result, str) and os.path.exists(result):
+                    os.remove(result)
+            raise gr.Error(f"Failed to generate podcast audio: {failures[0]}")
+        audio_files = list(results)
+        end_time = time.time()
+        gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
+        # Merge the clips into the final podcast
+        combined_audio = await self.combine_audio_files(audio_files)
+        return combined_audio
+class TextExtractor:
+    """Extract plain text from uploaded PDF or TXT files."""
+    @staticmethod
+    async def extract_from_pdf(file_path: str) -> str:
+        """Return the text of all pages of the PDF at file_path, separated by blank lines."""
+        async with aiofiles.open(file_path, 'rb') as file:
+            content = await file.read()
+        pdf_reader = pypdf.PdfReader(io.BytesIO(content))
+        # Extract each page once (extraction is the expensive step), then drop empty pages
+        page_texts = (page.extract_text() for page in pdf_reader.pages)
+        return "\n\n".join(text for text in page_texts if text)
+    @staticmethod
+    async def extract_from_txt(file_path: str) -> str:
+        """Return the content of the text file at file_path, decoded as UTF-8."""
+        # Explicit encoding: the platform default may not be UTF-8, which would
+        # garble or reject non-ASCII text. Undecodable bytes are replaced, not fatal.
+        async with aiofiles.open(file_path, 'r', encoding='utf-8', errors='replace') as file:
+            return await file.read()
+    @classmethod
+    async def extract_text(cls, file_path: str) -> str:
+        """
+        Extract text from file_path, choosing the extractor by file extension.
+        Raises:
+            gr.Error: If the extension is neither .pdf nor .txt.
+        """
+        _, file_extension = os.path.splitext(file_path)
+        extension = file_extension.lower()
+        if extension == '.pdf':
+            return await cls.extract_from_pdf(file_path)
+        if extension == '.txt':
+            return await cls.extract_from_txt(file_path)
+        raise gr.Error(f"Unsupported file type: {file_extension}")
+async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
+    """
+    Handle one request from the UI and generate the podcast.
+    Args:
+        input_text (str): Text typed by the user.
+        input_file: Uploaded PDF or TXT file (a path string or an object with a
+            ``name`` attribute holding the path); when given, its text replaces input_text.
+        language (str): Selected podcast language.
+        speaker1 (str): Display name of the first speaker's voice.
+        speaker2 (str): Display name of the second speaker's voice.
+        api_key (str): API key for the script-generation service; defaults to an
+            empty string, in which case an environment variable is used.
+    Returns:
+        str: Path of the generated podcast audio file.
+    Raises:
+        gr.Error: If a voice selection is unknown, no usable input text is
+            provided, or the file type is unsupported.
+    """
+    gr.Info("Starting podcast generation...")
+    start_time = time.time()
+    # Map the display names shown in the UI to Edge TTS voice identifiers
+    voice_names = {
+        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
+        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
+        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
+        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
+        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
+        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
+        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
+        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
+    }
+    # A cleared dropdown yields a value that is not in the map; report it clearly
+    try:
+        speaker1 = voice_names[speaker1]
+        speaker2 = voice_names[speaker2]
+    except KeyError as e:
+        raise gr.Error(f"Unknown or missing voice selection: {e}") from e
+    # An uploaded file takes precedence over the text box
+    if input_file:
+        # Depending on the Gradio version the upload is a path string or a
+        # file-like wrapper exposing the path as ``.name``.
+        file_path = getattr(input_file, "name", input_file)
+        input_text = await TextExtractor.extract_text(file_path)
+    if not input_text or not input_text.strip():
+        raise gr.Error("Please enter some text or upload a PDF or TXT file.")
+    # Fall back to the environment; YOUR_API_TOKEN is the name used elsewhere in
+    # this app, the legacy Your_API_KEY name is still honoured.
+    if not api_key:
+        api_key = os.getenv("YOUR_API_TOKEN") or os.getenv("Your_API_KEY")
+    podcast_generator = PodcastGenerator()
+    podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
+    # Report the total elapsed time
+    end_time = time.time()
+    gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
+    return podcast
+# Languages offered in the UI; "Auto Detect" makes the script follow the input language
+_LANGUAGE_CHOICES = [
+    "Auto Detect",
+    "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
+    "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
+    "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
+    "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
+    "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
+    "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
+    "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
+    "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
+    "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
+    "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
+    "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
+    "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
+    "Uzbek", "Vietnamese", "Welsh", "Zulu"
+]
+# Voice display names; defined once so both speaker dropdowns stay in sync
+_VOICE_CHOICES = [
+    "Andrew - English (United States)",
+    "Ava - English (United States)",
+    "Brian - English (United States)",
+    "Emma - English (United States)",
+    "Florian - German (Germany)",
+    "Seraphina - German (Germany)",
+    "Remy - French (France)",
+    "Vivienne - French (France)"
+]
+# Define the Gradio interface; input order must match process_input's parameters
+iface = gr.Interface(
+    fn=process_input,
+    inputs=[
+        gr.Textbox(label="Input Text"),
+        gr.File(label="Or Upload a PDF or TXT file"),
+        gr.Dropdown(label="Language", choices=_LANGUAGE_CHOICES, value="Auto Detect"),
+        gr.Dropdown(label="Speaker 1 Voice", choices=_VOICE_CHOICES, value="Andrew - English (United States)"),
+        gr.Dropdown(label="Speaker 2 Voice", choices=_VOICE_CHOICES, value="Ava - English (United States)"),
+        # The script is generated through the SambaNova endpoint, so ask for that key;
+        # mask it because it is a secret.
+        gr.Textbox(label="Your SambaNova API Key (Optional) - In case you are getting rate limited", type="password"),
+    ],
+    outputs=[
+        gr.Audio(label="Generated Podcast Audio")
+    ],
+    title="🎙️ PodcastGen 🎙️",
+    description="Generate a 2-speaker podcast from text input or documents!",
+    allow_flagging="never"
+)
+if __name__ == "__main__":
+    iface.launch()