|
|
|
from datasets import load_dataset |
|
import os |
|
import sys |
|
|
|
|
|
|
|
# Experiment tag; the leading "/" makes it a sub-path under the data and model output directories.
exp_name="/tf-ltl_eng_test_mid_ascii_gptAuged"
|
|
|
|
|
|
|
dataset = load_dataset(
    "json",
    data_files={
        "train": "LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl",
        "test": "LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl",
    },
)
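
# Each JSONL record is expected to provide a natural-language sentence ("natural") and its
# LTL formula ("raw_ltl"), e.g. (illustrative example only, not taken from the dataset):
# {"natural": "eventually reach the goal", "raw_ltl": "F goal"}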
|
print(dataset) |
|
|
|
# Pin the visible GPUs; this must run before any CUDA context is created (i.e. before the model is loaded).
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']='4,5'
|
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
model_id="google/flan-t5-xxl" |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(model_id) |
|
|
|
|
|
|
|
from datasets import concatenate_datasets |
|
import numpy as np |
|
|
|
|
|
# Tokenize the natural-language inputs of both splits to determine the longest input sequence.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["natural"], truncation=True),
    batched=True,
    remove_columns=["raw_ltl", "natural"],
)
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]

# The 100th percentile is simply the maximum input length.
max_source_length = int(np.percentile(input_lengths, 100))
print(f"Max source length: {max_source_length}")
|
|
|
|
|
|
|
# Tokenize the LTL targets of both splits to determine the longest output sequence.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["raw_ltl"], truncation=True),
    batched=True,
    remove_columns=["raw_ltl", "natural"],
)
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]

max_target_length = int(np.percentile(target_lengths, 100))
print(f"Max target length: {max_target_length}")
|
|
|
|
|
|
|
def preprocess_function(sample, padding="max_length"):
    # Prepend the task instruction to every natural-language sentence.
    inputs = ["Generate LTL: " + item for item in sample["natural"]]

    # Tokenize inputs and targets up to the maximum lengths computed above.
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=sample["raw_ltl"], max_length=max_target_length, padding=padding, truncation=True)

    # When padding to max_length, replace pad token ids in the labels by -100 so the loss ignores them.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
|
|
|
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["natural", "raw_ltl"]) |
|
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}") |
|
|
|
|
|
tokenized_dataset["train"].save_to_disk("data/train"+exp_name) |
|
tokenized_dataset["test"].save_to_disk("data/eval"+exp_name) |
|
|
|
|
|
|
|
from transformers import AutoModelForSeq2SeqLM |
|
from peft import PeftModel, PeftConfig |
|
|
|
model_id = "philschmid/flan-t5-xxl-sharded-fp16" |
|
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(model) |
|
|
|
|
|
|
|
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType |
|
|
|
|
|
# LoRA configuration: adapt the query and value projections of the attention blocks.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the 8-bit model for training (freezes base weights, casts norms to fp32, enables gradient checkpointing).
model = prepare_model_for_int8_training(model)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
|
|
|
|
|
|
|
|
|
|
from transformers import DataCollatorForSeq2Seq |
|
|
|
|
|
# Pad labels with -100 so padded positions are ignored by the loss.
label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
|
|
|
|
|
|
|
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments |
|
|
|
output_dir="lora-flan-t5-xxl" |
|
|
|
|
|
training_args = Seq2SeqTrainingArguments( |
|
output_dir=output_dir, |
|
auto_find_batch_size=True, |
|
learning_rate=1e-3, |
|
num_train_epochs=5, |
|
logging_dir=f"{output_dir}/logs", |
|
logging_strategy="steps", |
|
logging_steps=500, |
|
save_strategy="no", |
|
report_to="tensorboard", |
|
) |
|
|
|
|
|
trainer = Seq2SeqTrainer( |
|
model=model, |
|
args=training_args, |
|
data_collator=data_collator, |
|
train_dataset=tokenized_dataset["train"], |
|
) |
|
model.config.use_cache = False  # incompatible with gradient checkpointing; re-enable for inference
|
|
|
|
|
|
|
|
|
trainer.train() |
|
|
|
|
|
|
|
|
|
peft_model_id="finetuned_model/"+exp_name |
|
trainer.model.save_pretrained(peft_model_id) |
|
tokenizer.save_pretrained(peft_model_id) |
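
# Optional sketch (not part of the original run): how the saved adapter could be reloaded for
# standalone inference with the PeftConfig / PeftModel classes imported above. It assumes the
# same 8-bit base checkpoint and is left commented out so it does not interfere with the
# in-memory model used for evaluation below.
# config = PeftConfig.from_pretrained(peft_model_id)
# base_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
# reloaded_tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# reloaded_model = PeftModel.from_pretrained(base_model, peft_model_id)
# reloaded_model.eval()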
|
|
|
|
|
|
|
|
|
|
|
|
|
import evaluate |
|
import numpy as np |
|
from datasets import load_from_disk |
|
from tqdm import tqdm |
|
|
|
|
|
metric = evaluate.load("rouge") |
|
|
|
def evaluate_peft_model(sample, max_target_length=128):
    # Generate an LTL formula for a single tokenized test sample.
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # Replace -100 (ignored positions) in the labels with the pad token id so they can be decoded.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    # Recover the natural-language input sentence for logging.
    input_sentence = tokenizer.decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True)

    return prediction, labels, input_sentence
|
|
|
|
|
test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch") |
|
|
|
|
|
|
|
predictions, references, input_sentence = [], [], []
idx = 0
for sample in tqdm(test_dataset):
    p, l, nl = evaluate_peft_model(sample)
    input_sentence.append(nl)
    predictions.append(p)
    references.append(l)
    idx += 1
    # Log each example: running index, input sentence, prediction, and reference LTL.
    print(idx, '\n', input_sentence[-1], '\npre::\n', predictions[-1], '\nref::\n', references[-1], '\n', '-'*20, '\n')
|
|
|
|
|
# Compute ROUGE between the predicted and the reference LTL formulas.
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)

print(f"Rouge1: {rouge['rouge1'] * 100:.2f}%")
print(f"Rouge2: {rouge['rouge2'] * 100:.2f}%")
print(f"RougeL: {rouge['rougeL'] * 100:.2f}%")
print(f"RougeLsum: {rouge['rougeLsum'] * 100:.2f}%")
|
# Save inputs, predictions and references side by side for manual inspection.
eval_output = np.array([input_sentence, predictions, references]).T
import pandas as pd
eval_output = pd.DataFrame(eval_output)
eval_output.to_csv(peft_model_id + '/output')
|
|
|
|
|
|
|
|