dayuian commited on
Commit
1e165e2
·
verified ·
1 Parent(s): 3ed2d28

Create sentences.py

Browse files
Files changed (1) hide show
  1. sentences.py +106 -0
sentences.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import os
3
+ import random
4
+ from ai_sentence import generate_sentence
5
+ from vocab import get_words_from_source, get_word_info
6
+ from tqdm import tqdm
7
+
8
# Directory (relative to the current working directory) that holds the
# SQLite database file.
DATA_DIR = "./data"
# Path of the SQLite database file; every function in this module that
# opens a connection passes this path to sqlite3.connect().
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
10
+
11
+
12
# Initialize the database (create the table if it is missing).
def init_db():
    """Create the ``sentences`` table in the database at ``DB_PATH``.

    The table stores one row per ``(word, source, model)`` combination,
    which is also its primary key. Calling this function repeatedly is
    safe because the statement uses ``CREATE TABLE IF NOT EXISTS``.

    The directory containing ``DB_PATH`` is created first:
    ``sqlite3.connect`` creates a missing database *file* but fails with
    "unable to open database file" when the parent directory is missing.
    """
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sentences (
                word TEXT,
                phonetic TEXT,
                sentence TEXT,
                source TEXT,
                model TEXT,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                PRIMARY KEY (word, source, model)
            )
        ''')
        conn.commit()
    finally:
        # Always release the connection, even if the DDL statement fails.
        conn.close()
29
+
30
+
31
# Look up every stored example sentence for a single word.
def get_sentences_by_word(word):
    """Return all rows stored for *word*.

    Args:
        word: The word to look up; compared for equality against the
            ``word`` column using a bound SQL parameter.

    Returns:
        A list of ``(word, phonetic, sentence, source, model)`` tuples.
        The list is empty when no row matches.

    Raises:
        sqlite3.Error: If the query fails (for example when the
            ``sentences`` table has not been created yet).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.execute(
            'SELECT word, phonetic, sentence, source, model FROM sentences WHERE word=?',
            (word,),
        )
        return cursor.fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()
39
+
40
+
41
# Store a sentence in SQLite.
def save_sentence(word, phonetic, sentence, source, model):
    """Insert or update the example sentence for ``(word, source, model)``.

    When a row with the same ``(word, source, model)`` key already exists,
    its ``sentence`` and ``phonetic`` columns are overwritten with the new
    values; otherwise a new row is inserted.

    Args:
        word: The vocabulary word.
        phonetic: Phonetic transcription stored alongside the word.
        sentence: The example sentence text.
        source: Label describing where the sentence came from.
        model: Name of the model that produced the sentence.

    Raises:
        sqlite3.Error: If the statement fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT INTO sentences (word, phonetic, sentence, source, model)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(word, source, model) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
        ''', (word, phonetic, sentence, source, model))
        conn.commit()
    finally:
        # Close the connection even when the upsert raises.
        conn.close()
52
+
53
+
54
# Choose which stored row to show for a word, preferring Tatoeba.
def _pick_sentence_record(sentence_records):
    """Return the preferred row from a non-empty list of sentence rows.

    Each row is a ``(word, phonetic, sentence, source, model)`` tuple. The
    first row whose source is ``"tatoeba"`` and whose sentence is non-empty
    is returned; if there is none, the first row is returned.
    """
    for rec in sentence_records:
        if rec[3] == "tatoeba" and rec[2]:
            return rec
    return sentence_records[0]


# Render one word/sentence pair as an HTML card.
def _render_word_card(word, phonetic, sentence, source_used, model_used):
    """Return the HTML snippet for one word, escaping all dynamic text."""
    import html

    model_suffix = f"({html.escape(str(model_used))})" if model_used else ""
    return f"""
    <div style="margin-bottom: 10px; padding: 8px; border-left: 4px solid #4CAF50; background-color: #f9f9f9;">
        <strong>單字:</strong> {html.escape(str(word))} <br>
        <strong>音標:</strong> {html.escape(str(phonetic or '無'))} <br>
        <strong>句子:</strong> {html.escape(str(sentence))} <br>
        <strong>來源:</strong> {html.escape(str(source_used))} {model_suffix}
    </div>
    """


# Randomly draw words, then use a stored sentence or generate one.
def get_words_with_sentences(source, n):
    """Draw random words from *source* and pair each with an example sentence.

    For every drawn word the sentence store is consulted first. If rows
    exist, a Tatoeba row is preferred and the displayed source/model come
    from that same row. If no row exists, a sentence is generated with the
    ``EleutherAI/pythia-410m`` model, a missing phonetic is looked up via
    ``get_word_info``, and the new sentence is saved to the store.

    Args:
        source: Identifier of the word list, passed to
            ``get_words_from_source`` and ``get_word_info``.
        n: Number of words requested. It is converted to ``int`` and
            capped at the number of available words so that an oversized
            request yields every word instead of an error.

    Returns:
        A ``(html, status)`` tuple of strings. On success ``html`` holds one
        card per word and ``status`` reports how many words were drawn. On
        any failure both strings describe the error instead; exceptions are
        not propagated to the caller.
    """
    import html

    ai_model = "EleutherAI/pythia-410m"

    try:
        words = get_words_from_source(source)
        # random.sample raises if asked for more items than exist.
        sample_size = min(int(n), len(words))
        selected_words = random.sample(words, sample_size)

        cards = []
        for word_data in tqdm(selected_words, desc="處理單字"):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Check the sentence store first.
            sentence_records = get_sentences_by_word(word)

            if sentence_records:
                # Take sentence, source and model from the SAME row so the
                # displayed attribution always matches the sentence.
                record = _pick_sentence_record(sentence_records)
                sentence = record[2]
                source_used = record[3]
                model_used = record[4]
            else:
                # Nothing stored: generate a sentence with the language model.
                sentence = generate_sentence(word, ai_model)
                source_used = "ai"
                model_used = ai_model

                # Look up the phonetic so the saved row is not missing it.
                if not phonetic:
                    word_info = get_word_info(source, word)
                    phonetic = word_info['phonetic'] if word_info else ''

                # Save the generated sentence back to the store.
                save_sentence(word, phonetic, sentence, source_used, model_used)

            cards.append(
                _render_word_card(word, phonetic, sentence, source_used, model_used)
            )

        return "".join(cards), f"✅ 成功抽取 {len(selected_words)} 個單字 & 句子"

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        message = str(e)
        return (
            f"<p style='color:red;'>❌ 發生錯誤:{html.escape(message)}</p>",
            f"❌ 錯誤:{message}",
        )