dayuian committed on
Commit
dd2233b
·
verified ·
1 Parent(s): 1112df1

Update vocab.py

Browse files
Files changed (1) hide show
  1. vocab.py +76 -31
vocab.py CHANGED
@@ -1,79 +1,124 @@
 
1
  import json
2
  import random
3
  import os
4
  import re
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
# Load the tokenizer and model a single time, at import, so that they are not
# reloaded on every request.
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Folder that is scanned for the "<source>.json" word-list files.
DATA_DIR = "./data"
13
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def get_sources():
    """Scan ``DATA_DIR`` and return the names of all word lists.

    A word list is any entry whose file name ends in ``.json``; the returned
    name is the file name with that trailing suffix removed.

    Returns:
        list[str]: word-list names, in the order ``os.listdir`` yields them.
    """
    suffix = ".json"
    # Strip only the trailing suffix. Splitting on ".json" would cut the name
    # at the first occurrence (e.g. "a.json.bak.json" -> "a").
    return [
        name[:-len(suffix)]
        for name in os.listdir(DATA_DIR)
        if name.endswith(suffix)
    ]
19
 
20
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def clean_sentence(output):
    """Clean up a generated sentence by removing prompt noise.

    Removes an echoed "Write ... beginners." instruction and bold list
    markers such as ``**1.**``, then makes sure the text ends with
    sentence-final punctuation.

    Args:
        output: raw text decoded from the model.

    Returns:
        str: the cleaned sentence. A period is appended only when the text
        does not already end with ".", "!" or "?".
    """
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    # Do not turn "Is it a cat?" into "Is it a cat?." -- any terminal
    # punctuation mark already closes the sentence.
    if not output.endswith((".", "!", "?")):
        output += "."
    return output
28
 
29
-
def get_words_with_sentences(source, n):
    """Draw random words and generate an example sentence for each one.

    Reads ``<DATA_DIR>/<source>.json``, samples ``n`` entries from it, asks the
    language model for a sentence per word and renders everything as HTML.

    Args:
        source: word-list name (file name without the ``.json`` suffix).
        n: number of words to sample; ``random.sample`` raises if it exceeds
            the number of available words, which is reported as an error.

    Returns:
        tuple[str, str]: ``(html, status_log)``. On any failure the HTML is a
        red error paragraph and the status log ends with the error message.
    """
    import html  # local import: only needed to escape text placed into markup

    status = []
    cards = []
    try:
        # Load the chosen word list.
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Random draw without replacement.
        selected_words = random.sample(words, n)

        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            status.append(f"正在生成第 {i + 1}/{n} 個單字 [{word}] 例句...")

            # Sentence-generation prompt.
            prompt = f"Use the word '{word}' in a simple English sentence suitable for beginners. Output only the sentence."

            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs, max_new_tokens=30)
            # A causal LM returns prompt + continuation; decode only the newly
            # generated tokens so the prompt text does not end up in the
            # example sentence.
            prompt_len = inputs["input_ids"].shape[1]
            sentence = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

            clean_output = clean_sentence(sentence)

            # Escape every interpolated value: the sentence is model output and
            # may contain characters that would break the markup.
            cards.append(f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 單字:</strong> {html.escape(str(word))}</p>
                <p><strong>🔤 音標:</strong> {html.escape(str(word_data['phonetic']))}</p>
                <p><strong>✍️ 例句:</strong> {html.escape(str(clean_output))}</p>
            </div>
            """)

        status.append("✅ 完成!")

        # The result is returned as HTML.
        return "".join(cards), "\n".join(status)

    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        status.append(f"❌ 發生錯誤: {str(e)}")
        return f"<p style='color:red;'>發生錯誤:{html.escape(str(e))}</p>", "\n".join(status)
 
 
 
 
1
+ import sqlite3
2
  import json
3
  import random
4
  import os
5
  import re
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
 
# Load the tokenizer and the causal language model at module import; the
# functions in this module reuse these two objects.
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Folder holding the "<source>.json" word lists, and the SQLite file inside it
# that stores the example sentences.
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
16
+
def init_db():
    """Create the ``sentences`` table in the SQLite file if it is missing.

    The table is keyed by ``word`` and stores ``phonetic``, ``sentence`` and a
    ``created_at`` column that defaults to the current timestamp.
    """
    # sqlite3 creates the database file but not its parent directory.
    os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sentences (
                word TEXT PRIMARY KEY,
                phonetic TEXT,
                sentence TEXT,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
31
+
def get_sources():
    """Scan ``DATA_DIR`` and return the word-list names used to build the menu.

    A word list is any entry whose file name ends in ``.json``; the returned
    name is the file name with that trailing suffix removed.

    Returns:
        list[str]: word-list names, in the order ``os.listdir`` yields them.
    """
    suffix = ".json"
    # Strip only the trailing suffix. Splitting on ".json" would cut the name
    # at the first occurrence (e.g. "a.json.bak.json" -> "a").
    return [
        name[:-len(suffix)]
        for name in os.listdir(DATA_DIR)
        if name.endswith(suffix)
    ]
37
 
def get_sentence(word):
    """Look up the stored example sentence for ``word``.

    Args:
        word: the vocabulary word used as the table's primary key.

    Returns:
        The ``(word, phonetic, sentence)`` row, or ``None`` when the word has
        no stored sentence.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.execute(
            'SELECT word, phonetic, sentence FROM sentences WHERE word=?',
            (word,),
        )
        return cursor.fetchone()
    finally:
        # Close the connection even when the query fails.
        conn.close()
46
+
def save_sentence(word, phonetic, sentence):
    """Insert or update the stored example sentence for ``word``.

    Args:
        word: the vocabulary word (primary key).
        phonetic: phonetic transcription stored alongside the word.
        sentence: example sentence to store.

    An existing row for the same word has its ``sentence`` and ``phonetic``
    overwritten.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            INSERT INTO sentences (word, phonetic, sentence)
            VALUES (?, ?, ?)
            ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
        ''', (word, phonetic, sentence))
        conn.commit()
    finally:
        # Close the connection even when the upsert fails.
        conn.close()
58
+
def clean_sentence(output):
    """Strip prompt residue and list noise from a generated sentence.

    Steps, in order: drop everything up to and including the first colon,
    remove a leading "1. "-style number, remove an echoed
    "Write ... beginners." instruction, remove bold list markers such as
    ``**1.**``, and make sure the text ends with sentence-final punctuation.

    Args:
        output: raw text decoded from the model.

    Returns:
        str: the cleaned sentence. A period is appended only when the text
        does not already end with ".", "!" or "?".
    """
    # Cut at the FIRST colon only, so a colon inside the sentence itself
    # ("She said: hello") does not truncate it.
    _, colon, remainder = output.partition(":")
    if colon:
        output = remainder
    output = output.strip()
    output = re.sub(r"^\d+\.\s*", "", output).strip()
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    # Do not turn "Is it a cat?" into "Is it a cat?.".
    if not output.endswith((".", "!", "?")):
        output += "."
    return output
68
 
def get_words_with_sentences(source, n):
    """Draw random words and attach an example sentence to each one.

    Reads ``<DATA_DIR>/<source>.json`` and samples ``n`` entries. For every
    word it first looks in the sentence store; only when nothing is stored
    does it ask the language model, clean the result and save it.

    Args:
        source: word-list name (file name without the ``.json`` suffix).
        n: number of words to sample; ``random.sample`` raises if it exceeds
            the number of available words, which is reported as an error.

    Returns:
        tuple[str, str]: ``(html, status_log)``. On any failure the HTML is a
        red error paragraph and the status log ends with the error message.
    """
    import html  # local import: only needed to escape text placed into markup

    status = []
    cards = []

    try:
        # Load the chosen word list.
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Random draw of n words without replacement.
        selected_words = random.sample(words, n)

        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Reuse a stored sentence when one exists.
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} 已有例句,從句庫讀取")
            else:
                # Nothing stored yet: generate, clean and store a sentence.
                status.append(f"📝 正在生成第 {i + 1}/{n} 個單字 [{word}] 例句...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

                sentence = clean_sentence(sentence)

                save_sentence(word, phonetic, sentence)

            # Escape every interpolated value: the sentence is model output and
            # may contain characters that would break the markup.
            cards.append(f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 單字:</strong> {html.escape(str(word))}</p>
                <p><strong>🔤 音標:</strong> {html.escape(str(phonetic))}</p>
                <p><strong>✍️ 例句:</strong> {html.escape(str(sentence))}</p>
            </div>
            """)

        status.append("✅ 完成!")
        return "".join(cards), "\n".join(status)

    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        status.append(f"❌ 發生錯誤: {str(e)}")
        return f"<p style='color:red;'>發生錯誤:{html.escape(str(e))}</p>", "\n".join(status)
122
+
# Create the sentences table automatically when this module is imported.
init_db()