Spaces:
Running
Running
import sqlite3 | |
import os | |
import random | |
from ai_sentence import generate_sentence | |
from vocab import get_words_from_source, get_word_info | |
from tqdm import tqdm | |
# Directory that holds the application's local data files.
DATA_DIR = "./data"
# Full path of the SQLite sentence database inside DATA_DIR.
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
def init_db():
    """Create the SQLite database file and the ``sentences`` table if missing.

    The table keeps one sentence per ``(word, source, model)`` combination
    (that triple is the primary key) and records the insertion time in
    ``created_at``.  The table is created with ``IF NOT EXISTS``, so calling
    this function more than once is harmless.
    """
    # sqlite3.connect() does not create missing parent directories; without
    # this a fresh checkout fails with "unable to open database file".
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sentences (
                word TEXT,
                phonetic TEXT,
                sentence TEXT,
                source TEXT,
                model TEXT,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (word, source, model)
            )
        ''')
        conn.commit()
    finally:
        # Release the database handle even when table creation fails.
        conn.close()
def get_sentences_by_word(word):
    """Return every sentence row stored for ``word``.

    Args:
        word: The vocabulary word to look up; compared with ``word=?``.

    Returns:
        A list of ``(word, phonetic, sentence, source, model)`` tuples.  The
        list is empty when no sentence is stored for the word.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.execute(
            'SELECT word, phonetic, sentence, source, model '
            'FROM sentences WHERE word=?',
            (word,),
        )
        return cursor.fetchall()
    finally:
        # Close the connection even if the query raises (e.g. missing table).
        conn.close()
def save_sentence(word, phonetic, sentence, source, model):
    """Insert or update the sentence stored for ``(word, source, model)``.

    When a row with the same ``(word, source, model)`` key already exists,
    its ``sentence`` and ``phonetic`` columns are overwritten with the new
    values; otherwise a new row is inserted.

    Args:
        word: The vocabulary word.
        phonetic: Phonetic transcription stored alongside the word.
        sentence: The example sentence text.
        source: Label of where the sentence came from (part of the key).
        model: Name of the generating model (part of the key).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT INTO sentences (word, phonetic, sentence, source, model)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(word, source, model) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
        ''', (word, phonetic, sentence, source, model))
        conn.commit()
    finally:
        # Close the connection even if the upsert fails.
        conn.close()
def _pick_sentence_record(sentence_records):
    """Return the first record whose source is "tatoeba", else the first record."""
    for record in sentence_records:
        if record[3] == "tatoeba":  # index 3 is the source column
            return record
    return sentence_records[0]


def _render_word_card(word, phonetic, sentence, source_used, model_used):
    """Build the HTML card for one word, escaping every interpolated value."""
    import html  # local import keeps this block self-contained

    def esc(value):
        return html.escape(str(value))

    model_suffix = f"({esc(model_used)})" if model_used else ""
    return f"""
<div style="margin-bottom: 10px; padding: 8px; border-left: 4px solid #4CAF50; background-color: #f9f9f9;">
<strong>單字:</strong> {esc(word)} <br>
<strong>音標:</strong> {esc(phonetic or '無')} <br>
<strong>句子:</strong> {esc(sentence)} <br>
<strong>來源:</strong> {esc(source_used)} {model_suffix}
</div>
"""


def get_words_with_sentences(source, n):
    """Randomly pick ``n`` words from ``source`` and attach a sentence to each.

    For every picked word the sentence store is consulted first; a stored
    Tatoeba sentence is preferred, otherwise the first stored record is used.
    When nothing is stored, a sentence is generated with the
    "EleutherAI/pythia-410m" model and saved back to the store.

    Args:
        source: Identifier of the word list, passed to
            ``get_words_from_source`` and ``get_word_info``.
        n: Number of words to sample; ``random.sample`` raises when it is
            larger than the number of available words, which is reported
            through the error return below.

    Returns:
        A ``(html, status)`` tuple.  On success ``html`` contains one card per
        word and ``status`` is a success message; on any exception ``html`` is
        a red error paragraph and ``status`` carries the error text.
    """
    import html  # local import: needed to escape the error message

    try:
        words = get_words_from_source(source)
        selected_words = random.sample(words, n)
        cards = []
        for word_data in tqdm(selected_words, desc="處理單字"):
            word = word_data['word']
            phonetic = word_data['phonetic']

            sentence_records = get_sentences_by_word(word)
            if sentence_records:
                # Take sentence, source and model from the SAME record so the
                # displayed source always matches the displayed sentence.
                chosen = _pick_sentence_record(sentence_records)
                sentence = chosen[2]
                source_used = chosen[3]
                model_used = chosen[4]
            else:
                # Nothing stored yet: generate a sentence with the model.
                model_used = "EleutherAI/pythia-410m"
                sentence = generate_sentence(word, model_used)
                source_used = "ai"
                # Look the phonetic up so the saved row is not missing it.
                if not phonetic:
                    word_info = get_word_info(source, word)
                    phonetic = word_info['phonetic'] if word_info else ''
                # Persist the generated sentence for later requests.
                save_sentence(word, phonetic, sentence, source_used, model_used)

            cards.append(
                _render_word_card(word, phonetic, sentence, source_used, model_used)
            )
        return "".join(cards), f"✅ 成功抽取 {n} 個單字 & 句子"
    except Exception as e:
        # UI boundary: report the failure instead of propagating it.  The
        # message is escaped because it is embedded in HTML.
        return f"<p style='color:red;'>❌ 發生錯誤:{html.escape(str(e))}</p>", f"❌ 錯誤:{str(e)}"