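# Fine-tune SciBERT (allenai/scibert_scivocab_uncased) on arXiv paper metadata
# (authors, title, abstract) to predict each paper's category.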
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# from model import SciBertPaperClassifier  # local wrapper; not used in this script
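
# Map the raw category string to an integer id. Relies on the module-level
# label2id dict built in the main block below.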
def encode_labels(example):
    example["labels"] = label2id[example["category"]]
    return example
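
# Concatenate authors, title, and abstract into a single input string and
# tokenize to a fixed length; relies on the module-level tokenizer.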
def preprocess_function(examples):
    texts = [
        f"AUTHORS: {' '.join(a) if isinstance(a, list) else a} TITLE: {t} ABSTRACT: {ab}"
        for a, t, ab in zip(
            examples["authors"], examples["title"], examples["abstract"]
        )
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)
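
# Compute accuracy and weighted F1 from the predictions the Trainer passes
# in at each evaluation step.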
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }
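
# Entry point: load the data, build the label map, tokenize, fine-tune,
# and report final validation metrics.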
if __name__ == "__main__":
    print("LOADING DATASET...")
    data_files = {"train": "arxiv_train.json", "test": "arxiv_test.json"}
    dataset = load_dataset("json", data_files=data_files)
    # Cap the training set at 100k shuffled examples.
    dataset["train"] = dataset["train"].shuffle(seed=42).select(range(100000))
    print(f"DATA IS READY. TRAIN: {len(dataset['train'])}")
print("LABELING...") | |
unique_labels = sorted(set(example["category"] for example in dataset["train"])) | |
label2id = {label: idx for idx, label in enumerate(unique_labels)} | |
id2label = {idx: label for label, idx in label2id.items()} | |
dataset["train"] = dataset["train"].map(encode_labels) | |
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42) | |
train_dataset = split_dataset["train"] | |
valid_dataset = split_dataset["test"] | |
print(f"TRAIN SET: {len(train_dataset)}, VALIDATION SET: {len(valid_dataset)}") | |
print("TOKENIZATION...") | |
model_name = "allenai/scibert_scivocab_uncased" | |
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) | |
encoded_train = train_dataset.map(preprocess_function, batched=True, batch_size=32) | |
encoded_valid = valid_dataset.map(preprocess_function, batched=True, batch_size=32) | |
encoded_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) | |
encoded_valid.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) | |
print("TOKENIZATION COMPLETED") | |
print("DOWNLOADING MODEL...") | |
model = AutoModelForSequenceClassification.from_pretrained( | |
model_name, | |
num_labels=len(unique_labels), | |
id2label=id2label, | |
label2id=label2id, | |
) | |
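    # Training configuration. Note: the `eval_strategy` argument requires a
    # recent transformers release (older versions call it `evaluation_strategy`),
    # and fp16=True assumes a CUDA-capable GPU is available.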
    training_args = TrainingArguments(
        output_dir="./dataset_output",
        report_to="none",
        eval_strategy="steps",
        eval_steps=100,
        logging_steps=200,
        disable_tqdm=True,
        learning_rate=3e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=2,
        save_steps=200,
        fp16=True,
        remove_unused_columns=False,
    )
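    # With padding="max_length" every example is already a fixed-size tensor,
    # so the Trainer's default data collator is sufficient.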
print("LEARNING...") | |
trainer = Trainer( | |
model=model, | |
args=training_args, | |
train_dataset=encoded_train, | |
eval_dataset=encoded_valid, | |
compute_metrics=compute_metrics, | |
) | |
trainer.train() | |
print("LEARNING COMPLETED") | |
model.save_pretrained("trained_model") | |
tokenizer.save_pretrained("trained_model") | |
print("EVALUATION...") | |
final_metrics = trainer.evaluate() | |
print("METRICS:") | |
for key, value in final_metrics.items(): | |
print(f"{key}: {value:.4f}") |