from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import DatasetDict, Dataset
import json
def preprocess_data1(json_path, tokenizer):
    with open(json_path, "r") as f:
        data = json.load(f)["data"]
    tokenized_data = {"input_ids": [], "attention_mask": [], "labels": []}
    slot_label_map = {"O": 0}
    label_id = 1
    for intent_data in data:
        for utterance in intent_data["utterances"]:
            text = utterance["text"]
            encoding = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=128,
                return_offsets_mapping=True
            )
            tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
            # Create slot labels for the tokens
            slot_labels = ["O"] * len(tokens)
            for slot, value in utterance["slots"].items():
                if value != "not specified":  # Skip unspecified slots
                    slot_tokens = tokenizer.tokenize(value)
                    for i in range(len(tokens) - len(slot_tokens) + 1):
                        if tokens[i:i + len(slot_tokens)] == slot_tokens:
                            slot_labels[i] = f"B-{slot}"
                            for j in range(1, len(slot_tokens)):
                                slot_labels[i + j] = f"I-{slot}"
            # Map slot labels to IDs
            for label in slot_labels:
                if label not in slot_label_map:
                    slot_label_map[label] = label_id
                    label_id += 1
            label_ids = [slot_label_map[label] for label in slot_labels]
            tokenized_data["input_ids"].append(encoding["input_ids"])
            tokenized_data["attention_mask"].append(encoding["attention_mask"])
            tokenized_data["labels"].append(label_ids)
    print("Slot Label Map:", slot_label_map)
    dataset = Dataset.from_dict(tokenized_data)
    return DatasetDict({"train": dataset, "validation": dataset}), slot_label_map
# Updated training preprocessing to handle multi-token slot values (e.g. a multi-word amount)
def preprocess_data(json_path, tokenizer):
    with open(json_path, "r") as f:
        data = json.load(f)["data"]
    tokenized_data = {"input_ids": [], "attention_mask": [], "labels": []}
    slot_label_map = {"O": 0}
    label_id = 1  # next free label ID
    for intent_data in data:
        for utterance in intent_data["utterances"]:
            text = utterance["text"]
            encoding = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=128,
                return_offsets_mapping=True
            )
            tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
            # Note: special and padding tokens also receive "O" here; a common
            # alternative is label -100 so the loss ignores those positions.
            slot_labels = ["O"] * len(tokens)
            for slot, value in utterance["slots"].items():
                if value != "not specified":  # Skip unspecified slots
                    slot_tokens = tokenizer.tokenize(value)
                    for i in range(len(tokens) - len(slot_tokens) + 1):
                        if tokens[i:i + len(slot_tokens)] == slot_tokens:
                            slot_labels[i] = f"B-{slot}"
                            for j in range(1, len(slot_tokens)):
                                slot_labels[i + j] = f"I-{slot}"
            # Map slot labels to IDs
            for label in slot_labels:
                if label not in slot_label_map:
                    slot_label_map[label] = label_id
                    label_id += 1
            label_ids = [slot_label_map[label] for label in slot_labels]
            tokenized_data["input_ids"].append(encoding["input_ids"])
            tokenized_data["attention_mask"].append(encoding["attention_mask"])
            tokenized_data["labels"].append(label_ids)
    dataset = Dataset.from_dict(tokenized_data)
    return DatasetDict({"train": dataset, "validation": dataset}), slot_label_map
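# Expected shape of nlu_dataset.json, inferred from the loops above
# (the example text and slot values are illustrative only):
# {
#   "data": [
#     {
#       "utterances": [
#         {"text": "send fifty dollars to Alice",
#          "slots": {"amount": "fifty dollars", "recipient": "Alice"}}
#       ]
#     }
#   ]
# }
# A slot whose value is the literal string "not specified" is skipped.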
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
json_path = "nlu_dataset.json"
dataset, slot_label_map = preprocess_data(json_path, tokenizer)
model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(slot_label_map)
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)
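# Note: recent transformers releases rename evaluation_strategy to
# eval_strategy; adjust if your installed version rejects the argument above.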
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator
)
trainer.train()
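
# --- Inference sketch (an addition, not part of the original script) ---
# Shows one way to turn the fine-tuned model's logits back into slot labels;
# the sample utterance below is illustrative only.
import torch

id_to_label = {v: k for k, v in slot_label_map.items()}

def predict_slots(text):
    # Tokenize a single utterance and move the tensors to the model's device
    encoding = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits  # shape: (1, seq_len, num_labels)
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())
    return [(tok, id_to_label[i]) for tok, i in zip(tokens, pred_ids)]

print(predict_slots("send fifty dollars to Alice"))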