Sakalti committed on
Commit
3dfb9ec
·
verified ·
1 Parent(s): a3ec89c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from datasets import load_dataset
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
5
+ from huggingface_hub import login
6
+
7
# === Load the Hugging Face token ===
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# An unset or empty token aborts the run before anything is downloaded.
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("Hugging Faceのトークンが見つかりません。`.env`ファイルまたは環境変数を確認してください。")

login(token=HF_TOKEN)

# === Settings ===
# The checkpoint that is loaded and the repository that is pushed to are
# the same Hub repo id.
BASE_MODEL = "Sakalti/Template-4"
HF_REPO = "Sakalti/Template-4"

# === Load the data ===
dataset = load_dataset(
    "Verah/JParaCrawl-Filtered-English-Japanese-Parallel-Corpus",
    split="train",
)

# === Prepare tokenizer & model ===
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
26
+
27
# === Data preprocessing ===
def preprocess(examples):
    """Tokenize a batch of English sentences as translation prompts.

    Args:
        examples: Either the column-oriented batch that ``Dataset.map``
            passes when ``batched=True`` (a mapping from column name to a
            list of values) or a sequence of per-row dicts. Both forms must
            provide an ``"en"`` field.

    Returns:
        The output of the module-level ``tokenizer`` (truncated to at most
        256 tokens per text) with an added ``"labels"`` entry holding a copy
        of ``"input_ids"``.
    """
    # A batched map hands over {column: [values, ...]}; iterating that
    # mapping yields column names, not rows, so read the column directly.
    # A plain list/tuple of row dicts is still accepted.
    # NOTE(review): assumes the dataset exposes an "en" column -- confirm
    # against the dataset's actual schema.
    if isinstance(examples, (list, tuple)):
        english = [row["en"] for row in examples]
    else:
        english = examples["en"]

    # NOTE(review): the prompt stops at "日本語:" and no Japanese text is
    # appended, so the labels cover the prompt only. Presumably the target
    # sentence should follow -- confirm the intended training text.
    texts = [f"英語: {sentence}\n日本語:" for sentence in english]
    model_inputs = tokenizer(texts, max_length=256, truncation=True)
    # Copy rather than alias so later edits to one list cannot silently
    # change the other.
    model_inputs["labels"] = [list(ids) for ids in model_inputs["input_ids"]]
    return model_inputs
33
+
34
# === Tokenize the corpus ===
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
)

# === Batch collation ===
# The rows are tokenized without padding, so they differ in length. The
# collator Trainer falls back to when it is given neither a tokenizer nor a
# collator cannot stack ragged rows, so pad each batch explicitly: inputs
# with the pad token, labels with -100 so padding is ignored by the loss.
from transformers import DataCollatorForSeq2Seq

# Padding needs a pad token; fall back to EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
)

# === Training configuration ===
# No evaluation strategy is passed: "no" is already the default, and the
# old `evaluation_strategy` keyword is deprecated in newer transformers
# releases.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=500,  # save a checkpoint every 500 steps (as requested)
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_token=HF_TOKEN,
    logging_steps=100,
)

# === Train with Trainer & upload ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()
trainer.push_to_hub()
tokenizer.push_to_hub(HF_REPO)

print("アップロード完了!")