Commit c7f72c0
Parent(s): 7d764a6

model improved

- maker.py +11 -2
- pytorch_model.bin +1 -1
maker.py CHANGED

@@ -64,6 +64,7 @@ class UDEmbedsDataset(object):
     import torch
     if len(x)<127:
       x=[True]*len(x)
+      w=(len(x)+1)*(len(x)+2)/2
     else:
       w=sum([len(x)-i+1 if b else 0 for i,b in enumerate(x)])+1
     for i in range(len(x)):
@@ -80,6 +81,14 @@ class UDEmbedsDataset(object):
       for j in range(i+1,len(x)):
         ids.append(j)
         upos.append(p[j]+"|"+d[j] if int(c[j][6])==i+1 else p[i]+"|"+d[i] if int(c[i][6])==j+1 else p[j]+"|_")
+      if i>0 and w>8192:
+        while w>8192:
+          if upos[-1].endswith("|_"):
+            upos.pop(-1)
+            ids.pop(-1)
+            w-=1
+          else:
+            break
     ids.append(-1)
     upos.append("SYM|_")
     with torch.no_grad():
@@ -90,7 +99,7 @@ class UDEmbedsDataset(object):
         m.append(self.embeddings[j,:].sum(axis=0))
       m.append(self.embeddings[self.tokenizer.sep_token_id,:])
       emb=torch.stack(m)
-      return{"inputs_embeds":emb[ids
+      return{"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
 from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
 from tokenizers.pre_tokenizers import Sequence,Split
 from tokenizers import Regex
@@ -102,7 +111,7 @@ lid=trainDS(devDS,testDS)
 cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
 mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
 trainDS.embeddings=mdl.get_input_embeddings().weight
-arg=TrainingArguments(num_train_epochs=
+arg=TrainingArguments(num_train_epochs=10,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
 trn.train()
 trn.save_model(tgt)
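For context on the first two hunks: w counts the candidate entries for one example, one per kept token and token pair plus a trailing separator. Previously w was computed only in the else branch, so the short-sentence branch (len(x)<127) now sets it to the closed form (len(x)+1)*(len(x)+2)/2, which the general sum telescopes to when every flag is True; the added while-loop then pops unattached "|_" entries from the tail until w falls to 8192, capping the size of one example. A minimal, self-contained check of the closed form (the helper names are illustrative, not from maker.py):

# Sketch: the closed form added in the first hunk equals the general
# weight sum (the else-branch formula in the same hunk) when all flags are True.
def w_general(x):
  return sum([len(x)-i+1 if b else 0 for i,b in enumerate(x)])+1

def w_closed(n):
  return (n+1)*(n+2)/2

for n in (1,5,126):
  assert w_general([True]*n)==w_closed(n)  # e.g. n=5: both give 21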
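The completed return value in the third hunk pairs a (sequence_length, hidden_size) inputs_embeds matrix with one label id per row, and the restored TrainingArguments line trains with per_device_train_batch_size=1, so DefaultDataCollator can batch these variable-length items without any padding logic. A minimal sketch of that hand-off; the hidden size 768 and the toy label ids are assumptions for illustration:

# Sketch: batching one item shaped like the dataset item this diff completes.
import torch
from transformers import DefaultDataCollator

item={"inputs_embeds":torch.randn(5,768),"labels":[0,1,2,1,0]}
batch=DefaultDataCollator()([item])  # batch of one, as in the TrainingArguments
print(batch["inputs_embeds"].shape)  # torch.Size([1, 5, 768])
print(batch["labels"].shape)         # torch.Size([1, 5])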
pytorch_model.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fc561bfa7e300ac4264be2a3deb05693acd70d31cbc1b15cbb2749781469d311
 size 516182194