KoichiYasuoka committed
Commit c7f72c0 · 1 Parent(s): 7d764a6

model improved

Files changed (2):
  1. maker.py +11 -2
  2. pytorch_model.bin +1 -1
maker.py CHANGED
@@ -64,6 +64,7 @@ class UDEmbedsDataset(object):
     import torch
     if len(x)<127:
       x=[True]*len(x)
+      w=(len(x)+1)*(len(x)+2)/2
     else:
       w=sum([len(x)-i+1 if b else 0 for i,b in enumerate(x)])+1
     for i in range(len(x)):
@@ -80,6 +81,14 @@ class UDEmbedsDataset(object):
       for j in range(i+1,len(x)):
         ids.append(j)
         upos.append(p[j]+"|"+d[j] if int(c[j][6])==i+1 else p[i]+"|"+d[i] if int(c[i][6])==j+1 else p[j]+"|_")
+        if i>0 and w>8192:
+          while w>8192:
+            if upos[-1].endswith("|_"):
+              upos.pop(-1)
+              ids.pop(-1)
+              w-=1
+            else:
+              break
     ids.append(-1)
     upos.append("SYM|_")
     with torch.no_grad():
@@ -90,7 +99,7 @@ class UDEmbedsDataset(object):
         m.append(self.embeddings[j,:].sum(axis=0))
       m.append(self.embeddings[self.tokenizer.sep_token_id,:])
       emb=torch.stack(m)
-    return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
+    return{"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
 from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
 from tokenizers.pre_tokenizers import Sequence,Split
 from tokenizers import Regex
@@ -102,7 +111,7 @@ lid=trainDS(devDS,testDS)
 cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
 mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
 trainDS.embeddings=mdl.get_input_embeddings().weight
-arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+arg=TrainingArguments(num_train_epochs=10,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
 trn.train()
 trn.save_model(tgt)
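What changed, in short: the old __getitem__ clipped the flattened word/word-pair sequence with a hard [:8192] slice in its return statement, which could cut off dependency-labeled rows. The new version tracks the row count w up front — for a short sentence it uses the closed form (len(x)+1)*(len(x)+2)/2, which is exactly what the general sum on the next line yields when every flag in x is True — and, whenever w exceeds the 8192 budget, pops unattached pairs (labels ending in "|_") off the tail, so labeled pairs survive and the return statement can keep ids and upos whole. The commit also raises num_train_epochs from 3 to 10. Below is a minimal, self-contained sketch of the trimming step; trim_to_budget and the toy labels are illustrative names invented for this example, not code from maker.py — only the "|_" test, the budget, and the closed form mirror the diff:

# Hedged sketch of the commit's budget logic, not the actual maker.py code.

# The closed form matches the general sum when every flag is True:
n=5
assert sum([n-i+1 for i in range(n)])+1==(n+1)*(n+2)//2

def trim_to_budget(ids,upos,w,budget=8192):
  # Drop trailing unattached ("|_") pairs until the row count w fits the budget.
  while w>budget:
    if upos[-1].endswith("|_"):  # no dependency between this pair: droppable
      upos.pop(-1)
      ids.pop(-1)
      w-=1
    else:                        # reached a labeled dependency: stop trimming
      break
  return ids,upos,w

# Toy usage: the three labeled rows survive, the two trailing filler pairs are trimmed.
ids,upos,w=trim_to_budget([0,1,2,3,4],["NOUN|nsubj","VERB|root","NOUN|obj","NOUN|_","ADP|_"],w=5,budget=3)
print(upos)  # ['NOUN|nsubj', 'VERB|root', 'NOUN|obj']

(In maker.py the same loop runs incrementally inside the pair-building loop, guarded by if i>0 and w>8192; note the committed closed form uses /, so w is a float there, which is harmless since it is only compared and decremented.)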
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ccd79192cd866429a281e3d0029002b495624fcc969f90929aad0b25e144f08c
+oid sha256:fc561bfa7e300ac4264be2a3deb05693acd70d31cbc1b15cbb2749781469d311
 size 516182194