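# NOTE: "Spaces: Running / Last commit not found" was Hugging Face page chrome
# captured together with this file; it is not part of the program.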
import asyncio
import contextlib
import io
import json
import os
import time
import uuid
from typing import List, Dict, Tuple

import aiofiles
import edge_tts
import google.generativeai as genai
import gradio as gr
import openai
import pypdf
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from pydub import AudioSegment
class PodcastGenerator:
    """Turn source text into a two-speaker podcast audio file.

    The script is written by a chat model reached through SambaNova's
    OpenAI-compatible endpoint, every dialogue line is voiced with Edge TTS,
    and the resulting clips are concatenated with pydub.
    """

    def __init__(self):
        pass

    @staticmethod
    def _parse_script_json(generated_text: str) -> Dict:
        """Extract and parse the podcast JSON object from raw model output.

        Chat models frequently wrap JSON in a Markdown code fence or add a
        sentence before/after it, so only the span from the first ``{`` to the
        last ``}`` is parsed.

        Args:
            generated_text: Raw text returned by the model.

        Returns:
            The parsed script, guaranteed to be a dict whose ``"podcast"``
            entry is a list.

        Raises:
            ValueError: If no JSON object is present, the JSON is malformed
                (``json.JSONDecodeError`` is a ``ValueError``), or the
                ``"podcast"`` list is missing.
        """
        text = generated_text.strip()
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in the model output")
        script = json.loads(text[start:end + 1])
        if not isinstance(script, dict) or not isinstance(script.get("podcast"), list):
            raise ValueError('the JSON does not contain a "podcast" list')
        return script

    @staticmethod
    def _select_voice(speaker, speaker1: str, speaker2: str) -> str:
        """Return the voice for ``speaker``: ``speaker1`` for speaker 1, else ``speaker2``.

        The speaker id comes from model-generated JSON, so the string ``"1"``
        is accepted in addition to the number ``1``.
        """
        is_first = speaker == 1 or str(speaker).strip() == "1"
        return speaker1 if is_first else speaker2

    async def generate_script(self, prompt: str, language: str, api_key: str) -> Dict:
        """Generate a podcast script for ``prompt`` with the SambaNova chat API.

        Args:
            prompt: User-supplied text the podcast should be based on.
            language: Target language, or ``"Auto Detect"`` to follow the
                language of the input.
            api_key: SambaNova API key. When empty, the ``YOUR_API_TOKEN``
                environment variable is used instead.

        Returns:
            The script as a dict with a ``"podcast"`` list of
            ``{"speaker": ..., "line": ...}`` entries.

        Raises:
            gr.Error: If no API key is available, the key is rejected, the
                rate limit is hit, the request fails for another reason, or
                the model reply cannot be parsed as a podcast script.
        """
        # Example JSON structure shown to the model so it imitates the format.
        example = """
{
"topic": "AGI",
"podcast": [
{
"speaker": 2,
"line": "So, AGI, huh? Seems like everyone's talking about it these days."
},
{
"speaker": 1,
"line": "Yeah, it's definitely having a moment, isn't it?"
},
{
"speaker": 2,
"line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
},
{
"speaker": 1,
"line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
},
......
{
"speaker": 2,
"line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
},
{
"speaker": 1,
"line": "That's a question worth pondering."
},
{
"speaker": 2,
"line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
},
{
"speaker": 1,
"line": "Peace."
}
]
}
"""
        # Language instruction depends on the user's dropdown selection.
        if language == "Auto Detect":
            language_instruction = "- The podcast MUST be in the same language as the user input."
        else:
            language_instruction = f"- The podcast MUST be in {language} language"
        # System prompt that tells the model how to write the script.
        system_prompt = f"""
You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
{language_instruction}
- The podcast should have 2 speakers.
- The podcast should be long.
- Do not use names for the speakers.
- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
- The script must be in JSON format.
Follow this example structure carefully:
{example}
"""
        # User prompt carrying the actual source material.
        user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
        # Keep the key in a local variable: writing it to ``openai.api_key``
        # would leak one user's key into module state shared by all requests.
        resolved_key = api_key or os.getenv("YOUR_API_TOKEN")
        if not resolved_key:
            raise gr.Error("No API key available. Please provide a SambaNova API key.")
        invalid_key_message = "Invalid API key. Please provide a valid SambaNova API key."
        rate_limit_message = "Rate limit exceeded for the API key. Please try again later or provide your own SambaNova API key."
        try:
            # The async client keeps the event loop free while the model is
            # generating; the context manager closes the HTTP connection pool.
            async with openai.AsyncOpenAI(
                api_key=resolved_key,
                base_url="https://api.sambanova.ai/v1",
            ) as client:
                response = await client.chat.completions.create(
                    model='Meta-Llama-3.1-405B-Instruct',
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=1,
                    max_tokens=4096
                )
            generated_text = response.choices[0].message.content
        except openai.AuthenticationError as e:
            raise gr.Error(invalid_key_message) from e
        except openai.RateLimitError as e:
            raise gr.Error(rate_limit_message) from e
        except Exception as e:
            # Fall back to message sniffing for errors that do not surface as
            # the typed exceptions above.
            message = str(e)
            if "API key not valid" in message:
                raise gr.Error(invalid_key_message) from e
            if "rate limit" in message.lower():
                raise gr.Error(rate_limit_message) from e
            raise gr.Error(f"Failed to generate podcast script: {e}") from e
        # Echo the raw script to the server log for debugging.
        print(f"Generated podcast script:\n{generated_text}")
        try:
            # ``content`` may be None when the model returns no text.
            return self._parse_script_json(generated_text or "")
        except ValueError as e:
            raise gr.Error(f"The model did not return a valid podcast script: {e}") from e

    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
        """Synthesize one dialogue line with Edge TTS into a temporary file.

        Args:
            text: The line to speak.
            speaker: Speaker id from the script (1 selects ``speaker1``,
                anything else selects ``speaker2``).
            speaker1: Edge TTS voice name for the first speaker.
            speaker2: Edge TTS voice name for the second speaker.

        Returns:
            Path of the temporary audio file that was written.

        Raises:
            Exception: Whatever Edge TTS raises; a partially written file is
                removed before the error propagates.
        """
        voice = self._select_voice(speaker, speaker1, speaker2)
        speech = edge_tts.Communicate(text, voice)
        # Edge TTS writes MP3 data, so name the clip accordingly; a ".wav"
        # name makes pydub attempt (and fail) a WAV parse before using ffmpeg.
        temp_filename = f"temp_{uuid.uuid4()}.mp3"
        try:
            await speech.save(temp_filename)
        except BaseException:
            # Also covers task cancellation; always re-raised unchanged.
            with contextlib.suppress(OSError):
                os.remove(temp_filename)
            raise
        return temp_filename

    async def combine_audio_files(self, audio_files: List[str]) -> str:
        """Concatenate audio clips in order and export them as one WAV file.

        The input clips are deleted afterwards, even when decoding one of
        them fails, so no temporary files are left behind.

        Args:
            audio_files: Paths of the clips to join, in playback order.

        Returns:
            Path of the combined WAV file.
        """
        combined_audio = AudioSegment.empty()
        try:
            for audio_file in audio_files:
                combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Best-effort cleanup of every temporary clip.
            for audio_file in audio_files:
                with contextlib.suppress(OSError):
                    os.remove(audio_file)
        output_filename = f"output_{uuid.uuid4()}.wav"
        combined_audio.export(output_filename, format="wav")
        return output_filename

    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str) -> str:
        """Generate the complete podcast audio for ``input_text``.

        Steps:
            1. Write the script with ``generate_script``.
            2. Voice every non-empty line concurrently with ``tts_generate``.
            3. Join the clips with ``combine_audio_files``.

        Progress and timings are reported through ``gr.Info``.

        Args:
            input_text: Source text for the podcast.
            language: Target language or ``"Auto Detect"``.
            speaker1: Edge TTS voice name for speaker 1.
            speaker2: Edge TTS voice name for speaker 2.
            api_key: SambaNova API key forwarded to ``generate_script``.

        Returns:
            Path of the generated podcast audio file.

        Raises:
            gr.Error: If the script has no usable lines or speech synthesis
                fails for any line.
        """
        # Generate the podcast script.
        gr.Info("Generating podcast script...")
        start_time = time.time()
        podcast_json = await self.generate_script(input_text, language, api_key)
        end_time = time.time()
        gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
        # Skip malformed entries and blank lines: Edge TTS has nothing to
        # synthesize for empty text.
        segments = [
            (str(item["line"]), item.get("speaker"))
            for item in podcast_json.get("podcast", [])
            if isinstance(item, dict) and str(item.get("line") or "").strip()
        ]
        if not segments:
            raise gr.Error("The generated podcast script contains no dialogue lines.")
        # Generate the per-line audio files concurrently.
        gr.Info("Generating podcast audio files...")
        start_time = time.time()
        results = await asyncio.gather(
            *[self.tts_generate(line, speaker, speaker1, speaker2) for line, speaker in segments],
            return_exceptions=True,
        )
        failures = [result for result in results if isinstance(result, BaseException)]
        if failures:
            # Remove the clips that did succeed so they are not leaked.
            for result in results:
                if isinstance(result, str):
                    with contextlib.suppress(OSError):
                        os.remove(result)
            raise gr.Error(f"Failed to generate podcast audio: {failures[0]}") from failures[0]
        end_time = time.time()
        gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
        # Merge the clips into the final podcast.
        combined_audio = await self.combine_audio_files(list(results))
        return combined_audio
class TextExtractor:
    """Extract plain text from PDF or TXT files.

    All methods are callable on the class itself, e.g.
    ``await TextExtractor.extract_text(path)``.
    """

    @staticmethod
    async def extract_from_pdf(file_path: str) -> str:
        """Return the text of every page of a PDF, separated by blank lines.

        Pages for which pypdf yields no text are skipped.
        """
        async with aiofiles.open(file_path, 'rb') as file:
            content = await file.read()
        pdf_reader = pypdf.PdfReader(io.BytesIO(content))
        # Extract each page once; extraction is the expensive step.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        return "\n\n".join(text for text in page_texts if text)

    @staticmethod
    async def extract_from_txt(file_path: str) -> str:
        """Return the contents of a text file decoded as UTF-8.

        Undecodable bytes are replaced rather than aborting the upload.
        """
        async with aiofiles.open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return await file.read()

    @classmethod
    async def extract_text(cls, file_path: str) -> str:
        """Extract text from ``file_path`` based on its extension.

        Args:
            file_path: Path to a ``.pdf`` or ``.txt`` file (case-insensitive).

        Returns:
            The extracted text.

        Raises:
            gr.Error: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        _, file_extension = os.path.splitext(file_path)
        suffix = file_extension.lower()
        if suffix == '.pdf':
            return await cls.extract_from_pdf(file_path)
        if suffix == '.txt':
            return await cls.extract_from_txt(file_path)
        raise gr.Error(f"Unsupported file type: {file_extension}")
async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
    """Handle one Gradio request: gather the input text and produce the podcast.

    Args:
        input_text: Text typed by the user.
        input_file: Optional uploaded PDF or TXT file. May be a path string or
            an object exposing the path as ``.name``, depending on how Gradio
            delivers uploads. When given, it takes precedence over
            ``input_text``.
        language: Selected language or ``"Auto Detect"``.
        speaker1: Display label of the voice chosen for speaker 1.
        speaker2: Display label of the voice chosen for speaker 2.
        api_key: Optional API key; when empty, the ``Your_API_KEY``
            environment variable is used.

    Returns:
        Path of the generated podcast audio file.

    Raises:
        gr.Error: If a voice label is unknown or no input text is available.
    """
    # Start the podcast generation.
    gr.Info("Starting podcast generation...")
    start_time = time.time()
    # Map the dropdown labels to Edge TTS voice names.
    voice_names = {
        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
    }
    # Resolve the actual voice names; report unknown labels to the user
    # instead of surfacing a bare KeyError.
    try:
        speaker1 = voice_names[speaker1]
        speaker2 = voice_names[speaker2]
    except KeyError as e:
        raise gr.Error(f"Unknown voice selection: {e.args[0]}") from e
    # If a file was uploaded, its text replaces the typed input.
    if input_file:
        file_path = input_file if isinstance(input_file, str) else input_file.name
        input_text = await TextExtractor.extract_text(file_path)
    # Fail fast rather than asking the model to write about nothing.
    if not input_text or not input_text.strip():
        raise gr.Error("Please enter some text or upload a PDF or TXT file.")
    # Fall back to the key from the environment when none was entered.
    if not api_key:
        api_key = os.getenv("Your_API_KEY")
    # Create the generator and build the podcast.
    podcast_generator = PodcastGenerator()
    podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
    # Report the total elapsed time.
    end_time = time.time()
    gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
    return podcast
# Gradio interface definition.
# Voice labels offered for both speakers; process_input maps them to
# Edge TTS voice names.
_VOICE_CHOICES = [
    "Andrew - English (United States)",
    "Ava - English (United States)",
    "Brian - English (United States)",
    "Emma - English (United States)",
    "Florian - German (Germany)",
    "Seraphina - German (Germany)",
    "Remy - French (France)",
    "Vivienne - French (France)",
]

iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.File(label="Or Upload a PDF or TXT file"),
        gr.Dropdown(label="Language", choices=[
            "Auto Detect",
            "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
            "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
            "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
            "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
            "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
            "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
            "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
            "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
            "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
            "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
            "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
            "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
            "Uzbek", "Vietnamese", "Welsh", "Zulu"
        ],
            value="Auto Detect"),
        gr.Dropdown(label="Speaker 1 Voice", choices=list(_VOICE_CHOICES),
                    value="Andrew - English (United States)"),
        gr.Dropdown(label="Speaker 2 Voice", choices=list(_VOICE_CHOICES),
                    value="Ava - English (United States)"),
        # The key is sent to the SambaNova endpoint, so label it as such and
        # mask it on screen.
        gr.Textbox(label="Your SambaNova API Key (Optional) - In case you are getting rate limited",
                   type="password"),
    ],
    outputs=[
        gr.Audio(label="Generated Podcast Audio")
    ],
    title="🎙️ PodcastGen 🎙️",
    description="Generate a 2-speaker podcast from text input or documents!",
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()