# VocabLine / vocab.py
import sqlite3
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize the GPT model used for example-sentence generation
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
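# Note: model.generate() below is called with its defaults, which means greedy
# decoding. For more varied sentences one could pass sampling flags, e.g.
# (a sketch, not this app's actual setting):
#   model.generate(**inputs, max_new_tokens=30, do_sample=True, top_p=0.9)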
# Data directory and SQLite path
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
# Create the sentence-cache table if it does not exist yet
def init_db():
    os.makedirs(DATA_DIR, exist_ok=True)  # ensure ./data exists before connecting
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()
# Scan the data directory to build the wordlist menu
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [os.path.splitext(f)[0] for f in files if f.endswith(".json")]
    return sources
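# Each JSON wordlist is assumed to be a list of objects with "word" and
# "phonetic" keys (inferred from the fields accessed in
# get_words_with_sentences below), e.g. a hypothetical ./data/example.json:
# [
#   {"word": "apple", "phonetic": "/ˈæp.əl/"},
#   {"word": "river", "phonetic": "/ˈrɪv.ɚ/"}
# ]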
# Look up a cached sentence for a word
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result
# Save a sentence to SQLite (upsert: updates the row if the word already exists)
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()
# Strip noise from GPT output: the echoed prompt, list numbering,
# instruction fragments, and markdown bold markers
def clean_sentence(output):
    output = output.split(":")[-1].strip()  # keep only the text after the last colon (drops the echoed prompt)
    output = re.sub(r"^\d+\.\s*", "", output).strip()  # drop leading "1. " style numbering
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()  # drop echoed instructions
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()  # drop "**2**" style markers
    if not output.endswith("."):
        output += "."
    return output
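# A quick illustration of the cleaning steps on a hypothetical model output:
#   clean_sentence("A simple English sentence with the word 'cat': 1. The cat sleeps")
#   -> "The cat sleeps."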
# Core flow: sample words, then serve from the cache or generate with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""
    try:
        # Load the wordlist
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)
        # Randomly sample n words (capped at the wordlist size so sample() cannot raise)
        n = min(n, len(words))
        selected_words = random.sample(words, n)
        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']
            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word}: cached sentence found, loaded from the database")
            else:
                # No cached sentence: generate one with GPT
                status.append(f"📝 Generating sentence {i + 1}/{n} for [{word}]...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Clean up the generated sentence
                sentence = clean_sentence(sentence)
                # Store it in the cache
                save_sentence(word, phonetic, sentence)
            # Format the output
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 Word:</strong> {word}</p>
                <p><strong>🔤 Phonetic:</strong> {phonetic}</p>
                <p><strong>✍️ Sentence:</strong> {sentence}</p>
            </div>
            """
        status.append("✅ Done!")
        return display_result, "\n".join(status)
    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"<p style='color:red;'>Error: {str(e)}</p>", "\n".join(status)
# Create the table automatically at startup
init_db()
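# Minimal manual smoke test (hypothetical usage; "example" stands in for any
# wordlist JSON that actually exists under ./data):
if __name__ == "__main__":
    html, log = get_words_with_sentences("example", 2)
    print(log)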