dayuian commited on
Commit
1112df1
·
verified ·
1 Parent(s): 2a9478d

Create vocab.py

Browse files
Files changed (1) hide show
  1. vocab.py +79 -0
vocab.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import os
4
+ import re
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+
7
# Initialize the model once at module import time, so each request does
# not reload it.
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Directory that holds the vocabulary-bank JSON files.
DATA_DIR = "./data"
13
+
14
def get_sources(data_dir=None):
    """Scan a folder and return all vocabulary-bank names.

    Args:
        data_dir: Directory to scan. Defaults to the module-level
            ``DATA_DIR`` when omitted (backward compatible with the
            original zero-argument call).

    Returns:
        list[str]: Base names (``.json`` suffix removed) of every JSON
        file found in the directory.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    files = os.listdir(data_dir)
    # os.path.splitext strips only the FINAL extension; the previous
    # f.split(".json")[0] truncated at the FIRST ".json" occurrence,
    # mangling any filename that contains ".json" internally
    # (e.g. "a.json.b.json" -> "a" instead of "a.json.b").
    return [os.path.splitext(f)[0] for f in files if f.endswith(".json")]
19
+
20
+
21
def clean_sentence(output):
    """Strip prompt echoes and list markers from a generated sentence.

    Removes any echoed "Write ... beginners." instruction, drops
    markdown-style numbering such as "**1.**", and guarantees the
    returned text ends with a period.
    """
    text = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE)
    text = re.sub(r"\*\*?\d+\.*\*\*", "", text.strip()).strip()
    return text if text.endswith(".") else text + "."
28
+
29
+
30
def get_words_with_sentences(source, n):
    """Sample words from a vocabulary bank and generate an example sentence for each.

    Args:
        source: Name of the vocabulary bank; a ``{source}.json`` file is
            expected under ``DATA_DIR``. Entries presumably are dicts with
            at least ``word`` and ``phonetic`` keys — TODO confirm schema.
        n: Number of words to sample without replacement
            (``random.sample`` raises when n exceeds the bank size; that
            surfaces through the error branch below).

    Returns:
        tuple[str, str]: An HTML fragment with the formatted results and a
        newline-joined status log. On any failure, a red HTML error
        message plus whatever status log was gathered before the failure.
    """
    status = []
    display_result = ""
    try:
        # Load the vocabulary bank from its JSON file.
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Randomly pick n entries.
        selected_words = random.sample(words, n)
        results = []

        for i, word_data in enumerate(selected_words):
            status.append(f"正在生成第 {i + 1}/{n} 個單字 [{word_data['word']}] 例句...")
            word = word_data['word']

            # Prompt asking the model for one beginner-level sentence.
            prompt = f"Use the word '{word}' in a simple English sentence suitable for beginners. Output only the sentence."

            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs, max_new_tokens=30)
            sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Remove echoed prompt text / markdown numbering from the output.
            clean_output = clean_sentence(sentence)

            # NOTE(review): `results` is accumulated but never returned or
            # used afterwards — looks like dead code; confirm before removing.
            results.append({
                "word": word,
                "phonetic": word_data["phonetic"],
                "sentence": clean_output
            })

            # Append a formatted HTML card for this word.
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 單字:</strong> {word}</p>
                <p><strong>🔤 音標:</strong> {word_data['phonetic']}</p>
                <p><strong>✍️ 例句:</strong> {clean_output}</p>
            </div>
            """

        status.append("✅ 完成!")

        # Return the rendered HTML plus the status log.
        return display_result, "\n".join(status)

    except Exception as e:
        # Broad catch by design: any failure (missing file, bad JSON,
        # n too large, model error) is reported to the UI instead of raising.
        status.append(f"❌ 發生錯誤: {str(e)}")
        return f"<p style='color:red;'>發生錯誤:{str(e)}</p>", "\n".join(status)