import sqlite3
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the GPT model and tokenizer
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Data folder and SQLite cache location
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
os.makedirs(DATA_DIR, exist_ok=True)  # make sure the data folder exists before first use
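# Expected layout of DATA_DIR (illustrative example; the actual word-list files are
# not shown here): each "<source>.json" is a JSON array of objects with at least
# "word" and "phonetic" keys, e.g. [{"word": "apple", "phonetic": "/ˈæpl/"}].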

# Create the sentence cache table if it does not exist
def init_db():
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
    CREATE TABLE IF NOT EXISTS sentences (
        word TEXT PRIMARY KEY,
        phonetic TEXT,
        sentence TEXT,
        created_at DATETIME DEFAULT CURRENT_TIMESTAMP
    )
    ''')
    conn.commit()
    conn.close()

# Scan the data folder to build the source menu (one entry per *.json word list)
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources

# Look up a cached sentence for a word
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result

# Save (upsert) a sentence into SQLite
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
    INSERT INTO sentences (word, phonetic, sentence)
    VALUES (?, ?, ?)
    ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()

# Strip noise from the GPT-generated sentence (prompt echo, numbering, markdown)
def clean_sentence(output):
    output = output.split(":")[-1].strip()
    output = re.sub(r"^\d+\.\s*", "", output).strip()
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    if not output.endswith("."):
        output += "."
    return output
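
# Example, given a hypothetical raw model output:
#   clean_sentence("A simple English sentence with the word 'apple': 1. The apple is red")
#   -> "The apple is red."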

# Core: draw random words, then use a cached sentence or generate one with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""

    try:
        # Load the word list for the selected source
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Randomly sample n words
        selected_words = random.sample(words, n)
        results = []

        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} 已有例句,從句庫讀取")
            else:
                # If none is cached, generate one with GPT
                status.append(f"📝 正在生成第 {i + 1}/{n} 個單字 [{word}] 例句...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Clean up the generated sentence
                sentence = clean_sentence(sentence)

                # Store it in the sentence cache
                save_sentence(word, phonetic, sentence)

            # Format the output as HTML
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 單字:</strong> {word}</p>
                <p><strong>🔤 音標:</strong> {phonetic}</p>
                <p><strong>✍️ 例句:</strong> {sentence}</p>
            </div>
            """

        status.append("✅ 完成!")
        return display_result, "\n".join(status)

    except Exception as e:
        status.append(f"❌ 發生錯誤: {str(e)}")
        return f"<p style='color:red;'>發生錯誤:{str(e)}</p>", "\n".join(status)

# Automatically create the table at startup
init_db()
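
# Minimal usage sketch, assuming DATA_DIR contains at least one "<source>.json"
# word list in the format described above; it prints the status log for a run
# of three randomly sampled words.
if __name__ == "__main__":
    sources = get_sources()
    if sources:
        html, log = get_words_with_sentences(sources[0], 3)
        print(log)
    else:
        print(f"No word lists (*.json) found in {DATA_DIR}")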