Spaces:
Running
Running
File size: 3,598 Bytes
1e165e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import html
import os
import random
import sqlite3
from contextlib import closing

from tqdm import tqdm

from ai_sentence import generate_sentence
from vocab import get_words_from_source, get_word_info
# Directory that holds the application's data files.
DATA_DIR = "./data"
# Location of the SQLite sentence database inside DATA_DIR.
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
# Initialise the database (create the table).
def init_db():
    """Create the ``sentences`` table in the SQLite database if it is missing.

    The table stores one row per ``(word, source, model)`` combination, which
    is also its primary key. The parent directory of ``DB_PATH`` is created
    first, because ``sqlite3.connect`` does not create missing directories and
    would otherwise fail with "unable to open database file".
    """
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    # closing() releases the connection even when execute()/commit() raises;
    # a bare sqlite3 connection used as a context manager would not close it.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sentences (
                word TEXT,
                phonetic TEXT,
                sentence TEXT,
                source TEXT,
                model TEXT,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (word, source, model)
            )
        ''')
        conn.commit()
# Look up every example sentence stored for a given word.
def get_sentences_by_word(word):
    """Return all rows of the ``sentences`` table whose ``word`` equals *word*.

    Args:
        word: The word to look up (bound as a SQL parameter).

    Returns:
        A list of ``(word, phonetic, sentence, source, model)`` tuples; the
        list is empty when nothing is stored for the word.
    """
    # closing() guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        cursor = conn.execute(
            'SELECT word, phonetic, sentence, source, model FROM sentences WHERE word=?',
            (word,),
        )
        return cursor.fetchall()
# Store a sentence in SQLite.
def save_sentence(word, phonetic, sentence, source, model):
    """Insert a sentence row, or update it when the key already exists.

    Rows are keyed by ``(word, source, model)``. On a key conflict the stored
    ``sentence`` and ``phonetic`` are replaced with the new values; the other
    columns are left untouched.

    Args:
        word: The vocabulary word.
        phonetic: Phonetic transcription stored alongside the word.
        sentence: Example sentence text.
        source: Origin label of the sentence (part of the key).
        model: Model label of the sentence (part of the key).
    """
    # closing() guarantees the connection is released even if the statement
    # or the commit raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute('''
            INSERT INTO sentences (word, phonetic, sentence, source, model)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(word, source, model) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
        ''', (word, phonetic, sentence, source, model))
        conn.commit()
# Randomly draw words, then fetch each example sentence from the library or generate one.
def get_words_with_sentences(source, n):
    """Pick *n* random words from *source* and pair each with an example sentence.

    For every drawn word the sentence library is consulted first. A stored
    record whose source is ``"tatoeba"`` (and whose sentence is non-empty) is
    preferred; otherwise the first stored record is used. When nothing is
    stored, a sentence is generated with the ``EleutherAI/pythia-410m`` model,
    a missing phonetic is looked up via ``get_word_info``, and the new sentence
    is saved to the library.

    Args:
        source: Word-source identifier passed to ``get_words_from_source`` and
            ``get_word_info``.
        n: Number of words to draw with ``random.sample``.

    Returns:
        A ``(html, status)`` tuple of strings. Any exception raised while
        processing is caught and reported in both strings instead of being
        propagated.
    """
    model_name = "EleutherAI/pythia-410m"
    try:
        words = get_words_from_source(source)
        selected_words = random.sample(words, n)
        cards = []
        for word_data in tqdm(selected_words, desc="處理單字"):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Consult the sentence library first.
            sentence_records = get_sentences_by_word(word)
            if sentence_records:
                # Prefer a Tatoeba record; sentence, source and model must all
                # come from the SAME record so the label matches the text.
                chosen = next(
                    (rec for rec in sentence_records
                     if rec[3] == "tatoeba" and rec[2]),
                    sentence_records[0],
                )
                sentence, source_used, model_used = chosen[2], chosen[3], chosen[4]
            else:
                # Nothing stored: generate a sentence with the language model.
                sentence = generate_sentence(word, model_name)
                source_used = "ai"
                model_used = model_name

                # Look the phonetic up so the generated entry is not stored
                # without one.
                if not phonetic:
                    word_info = get_word_info(source, word)
                    phonetic = word_info['phonetic'] if word_info else ''

                # Write the freshly generated sentence back to the library.
                save_sentence(word, phonetic, sentence, source_used, model_used)

            # Escape every interpolated value: words and (model-generated)
            # sentences are plain text and must not be interpreted as markup.
            safe_word = html.escape(str(word))
            safe_phonetic = html.escape(str(phonetic or '無'))
            safe_sentence = html.escape(str(sentence))
            safe_source = html.escape(str(source_used))
            model_suffix = f"({html.escape(str(model_used))})" if model_used else ""

            cards.append(
                '\n'
                '<div style="margin-bottom: 10px; padding: 8px; border-left: 4px solid #4CAF50; background-color: #f9f9f9;">\n'
                f'<strong>單字:</strong> {safe_word} <br>\n'
                f'<strong>音標:</strong> {safe_phonetic} <br>\n'
                f'<strong>句子:</strong> {safe_sentence} <br>\n'
                f'<strong>來源:</strong> {safe_source} {model_suffix}\n'
                '</div>\n'
            )

        return "".join(cards), f"✅ 成功抽取 {n} 個單字 & 句子"
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        # The HTML variant is escaped; the plain status string is not HTML.
        return (
            f"<p style='color:red;'>❌ 發生錯誤:{html.escape(str(e))}</p>",
            f"❌ 錯誤:{str(e)}",
        )
|