# File size: 6,817 Bytes
# 471470c
# -*- coding: utf-8 -*-
"""korscideberta.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1vJNUG_F5El5LY8xmmwRVXo66bYBfXtdz
"""
#!git clone https://huggingface.co/kisti/korscideberta; cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
#!pwd
#%cd ..
#!pip install konlpy
# %cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
'''
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab/
! bash install_mecab-ko_on_colab_light_220429.sh
# %cd ..
!pip install datasets transformers[sentencepiece]
'''
# Commented out IPython magic to ensure Python compatibility.
'''
!pip install -U accelerate; pip install -U transformers; pip install pydantic==1.8
'''
# Notebook-only shell escape, disabled so the file parses as plain Python
# (same treatment as the other `#!...` lines earlier in this file).
#!pwd
# %cd /content/korscideberta
'''
#[ํ์]๋ฆฌ๋
์ค ํฐ๋ฏธ๋์์ ๋ณธ ์ฝ๋ ๋ฐ ํ ํฌ๋์ด์ ๋ค์ด๋ก๋
#git clone https://huggingface.co/kisti/korscideberta
#cd korscideberta
#[ํ์]๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ค์น(Mecab ๋ฑ ์์ธํ ์ค์น ๋ฐฉ๋ฒ์ KorSciDeBERTaํ๊ฒฝ์ค์น+ํ์ธํ๋.pdf ์ฐธ์กฐ)
!apt install git-lfs
'''
from datasets import load_dataset
import datasets
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub.
notebook_login()
# Token format example: hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# (never commit a real access token to source; enter it at the login prompt).
# Hugging Face model name used for both the tokenizer and the model.
model_repository = "kisti/korscideberta"
#model_repository = "./"
from transformers import AutoTokenizer
from tokenization_korscideberta_v2 import DebertaV2Tokenizer
# Load the KorSciDeBERTa tokenizer and print the tokens of a sample sentence
# as a quick sanity check. The sample text is restored to the Korean it was
# mis-decoded from ("sharing a Korean model").
tokenizer = DebertaV2Tokenizer.from_pretrained(model_repository)
out = tokenizer.tokenize("<cls> 한국어 모델을 <s> 한국어 모델을 공유합니다. <s>")
print(str(out))
# Load the dataset.
# Disabled earlier variant: separate train/test/dev JSON files loaded with
#   dataset = load_dataset('json', data_files=data_files)
dataset = load_dataset('csv', data_files='data/Abstract_Annotation_Data_tagsentence.csv', split='train')
dataset = dataset.shuffle(seed=42)
# Seed the split as well, so the held-out 10% is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print("dataset:", str(dataset))
# Tokenize the dataset, then save it
from datasets import ClassLabel
# Collect the distinct tag values, sorted so that label ids are stable.
# Tags are gathered from every split: preprocess_function is mapped over both
# train and test, and ClassLabel.str2int raises on a name it does not know,
# so a tag that only landed in the test split must still be registered.
labels = sorted({tag for split in dataset.values() for tag in split['tag']})
num_labels = len(labels)
print('Labels: '+str(labels)[:200])
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)
def preprocess_function(example):
    """Tokenize one example and attach its integer class label.

    The sentence is wrapped as '<cls>' + sentence + '<s>' and passed to the
    module-level ``tokenizer`` with ``max_length=512`` and truncation enabled.
    The string in ``example['tag']`` is converted to an int through the
    module-level ``ClassLabels``.

    Returns the tokenizer output with an added ``'labels'`` entry.
    """
    wrapped_text = '<cls>' + example["sentence"] + '<s>'
    encoded = tokenizer(wrapped_text, max_length=512, truncation=True)
    encoded['labels'] = ClassLabels.str2int(example['tag'])
    return encoded
# Tokenize every example of every split, one example per call.
#tokenized_datasets = dataset.map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)
tokenized_datasets = dataset.map(preprocess_function, batched=False)
tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=labels))
# Check the tokenized dataset by printing one training example.
random_id = 1
print("Input IDS:", tokenized_datasets["train"][random_id]["input_ids"])
print("Labels:", tokenized_datasets["train"][random_id]["labels"])
# Save the tokenized splits under data/tok.
tokenized_datasets.save_to_disk('data/tok')
# Load the KorSciDeBERTa model
from transformers import AutoModelForSequenceClassification
num_labels = len(labels)


def model_init():
    """Build the sequence-classification model from ``model_repository``.

    ``num_labels`` and both dropout probabilities (0.1 each) are forwarded to
    ``AutoModelForSequenceClassification.from_pretrained``.
    """
    # Disabled alternatives kept from the original:
    #   from_pretrained(model_checkpoint, num_labels=7)
    #   from_pretrained(model_repository, num_labels=num_labels, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.25)
    config_overrides = {
        "num_labels": num_labels,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
    }
    return AutoModelForSequenceClassification.from_pretrained(model_repository, **config_overrides)


model = model_init()
# Check the DataCollator
# The collator is built from the tokenizer and later handed to the Trainer.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
from collections import Counter
# Print how many test examples carry each label id.
print("Test:", Counter(tokenized_datasets["test"]["labels"]))
# Accuracy metric
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases (the
# `evaluate` package replaces it) - confirm against the installed version.
from datasets import load_metric
accuracy = load_metric("accuracy")
import numpy as np
def compute_metrics(pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits with the class scores on
            the last axis, or a tuple whose first item is those logits) and
            ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose arg-max class equals the reference label, as a Python float.
    """
    pred_logits = pred.predictions
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(pred_logits, tuple):
        pred_logits = pred_logits[0]
    pred_classes = np.argmax(pred_logits, axis=-1)
    references = np.asarray(pred.label_ids)
    # Computed directly with NumPy so evaluation does not depend on the
    # deprecated, network-loaded `load_metric("accuracy")` object.
    return {"accuracy": float(np.mean(pred_classes == references))}
# training_args configuration
# If the following error occurs, change output_dir and try again:
# MlflowException: Changing param values is not allowed. Param with key=
import gc
gc.collect()
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="deberta_sent4455",
    num_train_epochs=4,
    #learning_rate=5e-5,
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,  # Use mixed precision
    # Apex AMP level is the letter "O" followed by a digit ("O0".."O3");
    # the original value used the digits "01", which is not a valid level.
    fp16_opt_level="O1",
    warmup_steps=500,
    logging_steps=200,
    save_steps=2000,
    eval_steps=500,
    push_to_hub=True,
    evaluation_strategy="steps",
)
# Configure the Trainer, then start training
import gc
gc.collect()
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # tokenizer=tokenizer is left out because passing it caused:
    # TypeError: save_vocabulary() got an unexpected keyword argument 'filename_prefix'
)
# Train, record the training metrics, then upload the result to the Hub.
train_metrics = trainer.train().metrics
trainer.save_metrics("train", train_metrics)
trainer.push_to_hub()
#### Fine-tuning and model upload complete
# --- Environment setup commands (notebook shell escapes) ---
# The `!` lines below are IPython/Colab shell escapes, not Python statements.
# They are kept verbatim as comments so this file parses as a Python module;
# run them in a notebook cell or a shell when setting up the environment.
# Commented out IPython magic to ensure Python compatibility.
# %cd mecab
#!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh); cd mecab-0.996-ko-0.9.2;
#!chmod 775 ./configure; ./configure; make; chmod 775 tests/*.sh; make check; make install
# Commented out IPython magic to ensure Python compatibility.
#!pwd
# %cd mecab
#!cd mecab-ko-dic-2.1.1-20180720; chmod 775 ./autogen.sh; ./autogen.sh; ./configure; make
#!mecab -d /usr/local/lib/mecab/dic/mecab-ko-dic
# Commented out IPython magic to ensure Python compatibility.
#!pwd
#!ls
# %cd korscideberta
#! unzip korscideberta.zip -d korscideberta; cd korscideberta
# Commented out IPython magic to ensure Python compatibility.
#!pwd
# %cd korscideberta
#! pip3 install -r requirements.txt; pip install --upgrade nltk;
#!pip uninstall -y torch torchtext torch-tensorrt; pip install --upgrade pip; pip install torch==1.10.1+cu111 torchvision==0.11.2+cu111 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu111/torch_stable.html --default-timeout=100; pip install setuptools_scm six mlflow; pip install "numpy<1.24.0"; pip install .