dayuian committed on
Commit
f2efbc0
·
verified ·
1 Parent(s): eaa87eb

Update vocab.py

Browse files
Files changed (1) hide show
  1. vocab.py +5 -81
vocab.py CHANGED
@@ -1,20 +1,11 @@
1
  import sqlite3
2
- import json
3
- import random
4
  import os
5
- import re
6
- from transformers import AutoModelForCausalLM, AutoTokenizer
7
 
8
- # 初始化 GPT 模型
9
- model_name = "EleutherAI/pythia-410m"
10
- tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- model = AutoModelForCausalLM.from_pretrained(model_name)
12
-
13
- # 資料夾
14
  DATA_DIR = "./data"
15
  DB_PATH = os.path.join(DATA_DIR, "sentences.db")
16
 
17
- # 建立資料表
 
18
  def init_db():
19
  conn = sqlite3.connect(DB_PATH)
20
  c = conn.cursor()
@@ -29,13 +20,8 @@ def init_db():
29
  conn.commit()
30
  conn.close()
31
 
32
- # 自動掃描資料夾生成選單
33
- def get_sources():
34
- files = os.listdir(DATA_DIR)
35
- sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
36
- return sources
37
 
38
- # 查詢句庫
39
  def get_sentence(word):
40
  conn = sqlite3.connect(DB_PATH)
41
  c = conn.cursor()
@@ -44,6 +30,7 @@ def get_sentence(word):
44
  conn.close()
45
  return result
46
 
 
47
  # 保存句子到 SQLite
48
  def save_sentence(word, phonetic, sentence):
49
  conn = sqlite3.connect(DB_PATH)
@@ -56,69 +43,6 @@ def save_sentence(word, phonetic, sentence):
56
  conn.commit()
57
  conn.close()
58
 
59
- # 清理 GPT 生成句子的雜訊
60
- def clean_sentence(output):
61
- output = output.split(":")[-1].strip()
62
- output = re.sub(r"^\d+\.\s*", "", output).strip()
63
- output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
64
- output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
65
- if not output.endswith("."):
66
- output += "."
67
- return output
68
-
69
- # 核心:抽單字 + 查句庫 or GPT 生成句子
70
- def get_words_with_sentences(source, n):
71
- status = []
72
- display_result = ""
73
-
74
- try:
75
- # 讀取單字庫
76
- data_path = os.path.join(DATA_DIR, f"{source}.json")
77
- with open(data_path, 'r', encoding='utf-8') as f:
78
- words = json.load(f)
79
-
80
- # 隨機抽取 n 個單字
81
- selected_words = random.sample(words, n)
82
- results = []
83
-
84
- for i, word_data in enumerate(selected_words):
85
- word = word_data['word']
86
- phonetic = word_data['phonetic']
87
-
88
- # 查詢句庫,看是否已有例句
89
- cached_result = get_sentence(word)
90
- if cached_result:
91
- sentence = cached_result[2]
92
- status.append(f"✅ {word} 已有例句,從句庫讀取")
93
- else:
94
- # 沒有的話,GPT 生成句子
95
- status.append(f"📝 正在生成第 {i + 1}/{n} 個單字 [{word}] 例句...")
96
- prompt = f"A simple English sentence with the word '{word}':"
97
- inputs = tokenizer(prompt, return_tensors="pt")
98
- outputs = model.generate(**inputs, max_new_tokens=30)
99
- sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
100
-
101
- # 清理生成句子
102
- sentence = clean_sentence(sentence)
103
-
104
- # 存入句庫
105
- save_sentence(word, phonetic, sentence)
106
-
107
- # 美化輸出
108
- display_result += f"""
109
- <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
110
- <p><strong>📖 單字:</strong> {word}</p>
111
- <p><strong>🔤 音標:</strong> {phonetic}</p>
112
- <p><strong>✍️ 例句:</strong> {sentence}</p>
113
- </div>
114
- """
115
-
116
- status.append("✅ 完成!")
117
- return display_result, "\n".join(status)
118
-
119
- except Exception as e:
120
- status.append(f"❌ 發生錯誤: {str(e)}")
121
- return f"<p style='color:red;'>發生錯誤:{str(e)}</p>", "\n".join(status)
122
 
123
- # 啟動時自動建表
124
  init_db()
 
1
  import sqlite3
 
 
2
  import os
 
 
3
 
 
 
 
 
 
 
4
  DATA_DIR = "./data"
5
  DB_PATH = os.path.join(DATA_DIR, "sentences.db")
6
 
7
+
8
+ # 建立資料表(若不存在)
9
  def init_db():
10
  conn = sqlite3.connect(DB_PATH)
11
  c = conn.cursor()
 
20
  conn.commit()
21
  conn.close()
22
 
 
 
 
 
 
23
 
24
+ # 查句庫,傳回 (word, phonetic, sentence) 或 None
25
  def get_sentence(word):
26
  conn = sqlite3.connect(DB_PATH)
27
  c = conn.cursor()
 
30
  conn.close()
31
  return result
32
 
33
+
34
  # 保存句子到 SQLite
35
  def save_sentence(word, phonetic, sentence):
36
  conn = sqlite3.connect(DB_PATH)
 
43
  conn.commit()
44
  conn.close()
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # 初始化資料表
48
  init_db()