Create vocab.py
vocab.py ADDED
@@ -0,0 +1,79 @@
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the model once at import time so it is not reloaded on every request
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

DATA_DIR = "./data"

def get_sources():
    """Scan the data directory and return the names of all word banks."""
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources

def clean_sentence(output):
    """Clean a model-generated sentence, stripping prompt echo and markdown noise."""
    # Strip the echoed prompt: the base model repeats its input before the new text.
    output = re.sub(r"Use the word.*?Output only the sentence\.", "", output, flags=re.IGNORECASE).strip()
    # Drop markdown-style numbering such as **1.**
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()
    if not output.endswith("."):
        output += "."
    return output

def get_words_with_sentences(source, n):
    """Sample n words from a word bank, generate an example sentence for each,
    and return the formatted result together with a status log."""
    status = []
    display_result = ""
    try:
        # Load the word-bank data
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Randomly sample n words
        selected_words = random.sample(words, n)
        results = []

        for i, word_data in enumerate(selected_words):
            status.append(f"Generating sentence for word {i + 1}/{n} [{word_data['word']}]...")
            word = word_data['word']

            # Sentence-generation prompt
            prompt = f"Use the word '{word}' in a simple English sentence suitable for beginners. Output only the sentence."

            inputs = tokenizer(prompt, return_tensors="pt")
            outputs = model.generate(**inputs, max_new_tokens=30)
            sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

            clean_output = clean_sentence(sentence)

            results.append({
                "word": word,
                "phonetic": word_data["phonetic"],
                "sentence": clean_output
            })

            # Format the entry as HTML
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 Word:</strong> {word}</p>
                <p><strong>🔤 Phonetic:</strong> {word_data['phonetic']}</p>
                <p><strong>✍️ Sentence:</strong> {clean_output}</p>
            </div>
            """

        status.append("✅ Done!")

        # Return the formatted result as HTML
        return display_result, "\n".join(status)

    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"<p style='color:red;'>Error: {str(e)}</p>", "\n".join(status)
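
get_words_with_sentences reads word_data['word'] and word_data['phonetic'], so each bank under ./data is evidently a JSON array of objects carrying at least those two keys. The actual data files are not part of this commit; a minimal, hypothetical bank for local testing could be written like this:

# Hypothetical sample word bank for local testing. Only the keys the code
# reads ("word" and "phonetic") are known from this diff; the entries below
# are invented.
import json
import os

os.makedirs("./data", exist_ok=True)
sample = [
    {"word": "apple", "phonetic": "/ˈæp.əl/"},
    {"word": "river", "phonetic": "/ˈrɪv.ɚ/"},
    {"word": "quiet", "phonetic": "/ˈkwaɪ.ət/"},
]
with open("./data/basic.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)

With that file in place, get_sources() returns ["basic"], which is the form of name get_words_with_sentences expects as its source argument.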
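
This commit adds only the backend module; the Space's UI entry point is not shown. As a sketch of how the two public functions could be wired into a Gradio interface, assuming Gradio is the front end (which this diff does not confirm) and with all widget names and labels invented for illustration:

# Hypothetical app.py wiring; every name here is illustrative only.
import gradio as gr
from vocab import get_sources, get_words_with_sentences

with gr.Blocks() as demo:
    source = gr.Dropdown(choices=get_sources(), label="Word bank")
    count = gr.Slider(1, 10, value=3, step=1, label="Number of words")
    draw = gr.Button("Draw words")
    result = gr.HTML()
    log = gr.Textbox(label="Status", lines=4)
    # random.sample needs an int, so coerce the slider value before calling.
    draw.click(
        lambda s, n: get_words_with_sentences(s, int(n)),
        inputs=[source, count],
        outputs=[result, log],
    )

demo.launch()

The two-value return of get_words_with_sentences maps directly onto the (HTML, status) output pair, which is why the function returns a formatted string rather than the unused results list.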