import gradio as gr
import json
import os
import random
import re

from transformers import AutoModelForCausalLM, AutoTokenizer

# Model initialization
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Directory holding the word-list JSON files
DATA_DIR = "./data"


def get_sources():
    """Scan DATA_DIR and build the dropdown choices from the *.json filenames."""
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources


def clean_sentence(output):
    """Strip prompt echoes and Markdown numbering noise from the generated text."""
    # Drop any verbatim echo of the instruction prompt.
    output = re.sub(
        r"Use the word.*?Output only the sentence\.",
        "",
        output,
        flags=re.IGNORECASE | re.DOTALL,
    ).strip()
    # Drop stray Markdown-style list numbering such as "**1.**".
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    if not output.endswith("."):
        output += "."
    return output


def get_words_with_sentences(source, n):
    """Sample n words from the chosen source and generate one example sentence each."""
    status = []
    try:
        n = int(n)  # gr.Number passes a float; random.sample needs an int
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, "r", encoding="utf-8") as f:
            words = json.load(f)
        selected_words = random.sample(words, n)

        results = []
        for i, word_data in enumerate(selected_words):
            word = word_data["word"]
            status.append(f"Generating example sentence {i + 1}/{n} for [{word}]...")

            prompt = (
                f"Use the word '{word}' in a simple English sentence "
                "suitable for beginners. Output only the sentence."
            )
            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(
                **inputs,
                max_new_tokens=30,
                pad_token_id=tokenizer.eos_token_id,  # Pythia has no pad token
            )
            # Decode only the newly generated tokens so the prompt is not echoed.
            new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
            sentence = tokenizer.decode(new_tokens, skip_special_tokens=True)
            clean_output = clean_sentence(sentence)

            results.append({
                "word": word,
                "phonetic": word_data["phonetic"],
                "sentence": clean_output,
            })

        status.append("✅ Done!")
        return results, status
    except Exception as e:
        status.append(f"❌ Error: {e}")
        return [], status


# Gradio interface
demo = gr.Interface(
    fn=get_words_with_sentences,
    inputs=[
        gr.Dropdown(
            choices=get_sources(),
            value="common3000",
            label="Word list",
            interactive=True,
        ),
        gr.Number(value=10, label="Number of words to sample", precision=0),
    ],
    outputs=[
        gr.JSON(label="Results"),
        gr.JSON(label="Progress log"),
    ],
)

demo.launch()
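
# --- Expected data layout (assumption, inferred from the fields read above) ---
# Each source file under ./data appears to be a JSON array of objects with at
# least "word" and "phonetic" keys. A minimal sketch for bootstrapping a test
# word list; the file name "common3000.json" matches the dropdown's default
# value. Extend the list to at least n entries before sampling n words:
#
#   import json, os
#   os.makedirs("./data", exist_ok=True)
#   sample = [
#       {"word": "apple", "phonetic": "/ˈæp.əl/"},
#       {"word": "river", "phonetic": "/ˈrɪv.ər/"},
#       {"word": "quick", "phonetic": "/kwɪk/"},
#   ]
#   with open("./data/common3000.json", "w", encoding="utf-8") as f:
#       json.dump(sample, f, ensure_ascii=False, indent=2)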