# VocabLine / vocab.py
import sqlite3
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize the GPT model used for example-sentence generation
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
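# Note: model.generate() below is called with its defaults, which means greedy
# decoding. For more varied sentences one could pass sampling flags, e.g.
# (a sketch, not this app's actual setting):
#   model.generate(**inputs, max_new_tokens=30, do_sample=True, top_p=0.9)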
# Data directory and SQLite path
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
# Create the sentence-cache table if it does not exist yet
def init_db():
    os.makedirs(DATA_DIR, exist_ok=True)  # ensure ./data exists before connecting
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()
# Scan the data directory to build the wordlist menu
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [os.path.splitext(f)[0] for f in files if f.endswith(".json")]
    return sources
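# Each JSON wordlist is assumed to be a list of objects with "word" and
# "phonetic" keys (inferred from the fields accessed in
# get_words_with_sentences below), e.g. a hypothetical ./data/example.json:
# [
#   {"word": "apple", "phonetic": "/ˈæp.əl/"},
#   {"word": "river", "phonetic": "/ˈrɪv.ɚ/"}
# ]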
# Look up a cached sentence for a word
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result
# Save a sentence to SQLite (upsert: updates the row if the word already exists)
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()
# Strip noise from GPT output: the echoed prompt, list numbering,
# instruction fragments, and markdown bold markers
def clean_sentence(output):
    output = output.split(":")[-1].strip()  # keep only the text after the last colon (drops the echoed prompt)
    output = re.sub(r"^\d+\.\s*", "", output).strip()  # drop leading "1. " style numbering
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()  # drop echoed instructions
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()  # drop "**2**" style markers
    if not output.endswith("."):
        output += "."
    return output
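# A quick illustration of the cleaning steps on a hypothetical model output:
#   clean_sentence("A simple English sentence with the word 'cat': 1. The cat sleeps")
#   -> "The cat sleeps."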
# Core flow: sample words, then serve from the cache or generate with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""
    try:
        # Load the wordlist
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)
        # Randomly sample n words (capped at the wordlist size so sample() cannot raise)
        n = min(n, len(words))
        selected_words = random.sample(words, n)
        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']
            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word}: cached sentence found, loaded from the database")
            else:
                # No cached sentence: generate one with GPT
                status.append(f"📝 Generating sentence {i + 1}/{n} for [{word}]...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Clean up the generated sentence
                sentence = clean_sentence(sentence)
                # Store it in the cache
                save_sentence(word, phonetic, sentence)
            # Format the output
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 Word:</strong> {word}</p>
                <p><strong>🔤 Phonetic:</strong> {phonetic}</p>
                <p><strong>✍️ Sentence:</strong> {sentence}</p>
            </div>
            """
        status.append("✅ Done!")
        return display_result, "\n".join(status)
    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"<p style='color:red;'>Error: {str(e)}</p>", "\n".join(status)
# Create the table automatically at startup
init_db()
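# Minimal manual smoke test (hypothetical usage; "example" stands in for any
# wordlist JSON that actually exists under ./data):
if __name__ == "__main__":
    html, log = get_words_with_sentences("example", 2)
    print(log)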