import sqlite3
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize the GPT model
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Data directory and database path
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
# Create the sentences table if it does not already exist
def init_db():
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()
# Scan the data directory to build the source menu automatically
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources
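# Each source file under ./data is expected to be a JSON array of word
# entries; a hypothetical ./data/toefl.json (filename illustrative) showing
# the "word" and "phonetic" keys consumed further below:
# [
#     {"word": "abandon", "phonetic": "/əˈbændən/"},
#     {"word": "benefit", "phonetic": "/ˈbenɪfɪt/"}
# ]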
# Look up the sentence cache; returns a (word, phonetic, sentence) tuple or None
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result
# Save a sentence to SQLite, updating the row if the word already exists
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()
# Strip noise (prompt echo, list numbering, stray markup) from GPT output
def clean_sentence(output):
    output = output.split(":")[-1].strip()
    output = re.sub(r"^\d+\.\s*", "", output).strip()
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    if not output.endswith("."):
        output += "."
    return output
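# Illustrative example of the cleanup above (input text is hypothetical):
#   clean_sentence("A simple English sentence with the word 'apple': 1. The apple is red")
#   -> "The apple is red."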
# Core: sample words, then serve cached sentences or generate new ones with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""
    try:
        # Load the word list
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)
        # Randomly sample n words
        selected_words = random.sample(words, n)
        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']
            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} already has an example sentence; loaded from the cache")
            else:
                # Otherwise, generate a sentence with GPT
                status.append(f"📝 Generating example sentence {i + 1}/{n} for [{word}]...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Clean up the generated sentence
                sentence = clean_sentence(sentence)
                # Store it in the cache
                save_sentence(word, phonetic, sentence)
            # Format the entry as HTML
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 Word:</strong> {word}</p>
                <p><strong>🔤 Phonetic:</strong> {phonetic}</p>
                <p><strong>✍️ Sentence:</strong> {sentence}</p>
            </div>
            """
        status.append("✅ Done!")
        return display_result, "\n".join(status)
    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"<p style='color:red;'>Error: {str(e)}</p>", "\n".join(status)
# Create the table automatically at startup
init_db()
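
# The functions above are presumably wired into a web UI on the Space (the
# HTML output and status log suggest Gradio). A minimal sketch, assuming
# Gradio is installed; component labels and layout are illustrative, not
# the Space's actual interface:
import gradio as gr

with gr.Blocks() as demo:
    source = gr.Dropdown(choices=get_sources(), label="Word list")
    count = gr.Slider(1, 10, value=5, step=1, label="Number of words")
    draw_btn = gr.Button("Draw words")
    output_html = gr.HTML()
    status_log = gr.Textbox(label="Status", lines=5)
    # get_words_with_sentences returns (HTML string, status log); cast the
    # slider value to int since Gradio sliders may deliver floats
    draw_btn.click(
        lambda s, n: get_words_with_sentences(s, int(n)),
        inputs=[source, count],
        outputs=[output_html, status_log],
    )

demo.launch()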