maxpe committed
Commit 9acb474 · 1 Parent(s): 4df9597

initial upload

Files changed (4)
  1. README.md +98 -0
  2. config.json +28 -0
  3. pytorch_model.bin +3 -0
  4. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,98 @@
+ # BERTIN-roBERTa-base-Spanish_SemEval18_Emodetection
+
+ This is a BERTIN-roBERTa-base-Spanish model fine-tuned on ~3500 Spanish tweets annotated for 11 emotion categories in [SemEval-2018 Task 1: Affect in Tweets: SubTask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751).
+
+ Run the classifier on the test set of the competition:
+
+ ```python
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModel
+ from torch.utils.data import DataLoader
+ import torch
+ import pandas as pd
+
+ # choose GPU when available
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ tokenizer = AutoTokenizer.from_pretrained("bertin-project/bertin-roberta-base-spanish", model_max_length=512)
+
+ # custom model: the base encoder with a dropout layer and a classification layer on top
+ class RobertaClass(torch.nn.Module):
+
+     def __init__(self):
+         super(RobertaClass, self).__init__()
+         self.l1 = AutoModel.from_pretrained("bertin-project/bertin-roberta-base-spanish", return_dict=False)
+         self.l2 = torch.nn.Dropout(0.3)
+         self.l3 = torch.nn.Linear(768, 11)
+
+     def forward(self, input_ids, attention_mask):
+         _, output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)  # (sequence_output, pooled_output)
+         output_2 = self.l2(output_1)
+         output = self.l3(output_2)
+
+         return output
+
+ model_name = "bertin-roberta-base-spanish_semeval18_emodetection/pytorch_model.bin"
+
+ model = RobertaClass()
+
+ model.load_state_dict(torch.load(model_name, map_location=torch.device(device)))
+
+ model.eval()
+
+ # run on more than one GPU
+ model = torch.nn.DataParallel(model)
+
+ model.to(device)
+
+ twnames = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
+
+ # load from the Hugging Face dataset hub
+ testset_raw = load_dataset('sem_eval_2018_task_1', 'subtask5.spanish', split='test')
+
+ # drop the gold-label columns and the ID column
+ testset = testset_raw.remove_columns(twnames + ["ID"])
+
+ # tokenize
+ testset_tokenized = testset.map(lambda e: tokenizer(e['Tweet'], truncation=True, padding='max_length'), batched=True)
+
+ testset_tokenized = testset_tokenized.remove_columns("Tweet")
+
+ testset_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+
+
+ outfile = "predicted_2018-E-c-Es-test-gold.txt"
+
+ MAX_LEN = 512  # informational; truncation is already handled by the tokenizer
+ VALID_BATCH_SIZE = 8
+ # set batch size according to available RAM
+ # VALID_BATCH_SIZE = 1000
+
+ # set num_workers for parallel processing
+ inference_params = {'batch_size': VALID_BATCH_SIZE,
+                     'shuffle': False,
+                     # 'num_workers': 1
+                     }
+
+ inference_loader = DataLoader(testset_tokenized, **inference_params)
+
+
+ open(outfile, "w").close()  # truncate the output file; predictions are appended batch by batch
+ with torch.no_grad():
+     # swap in the commented line for a progress bar (requires: from tqdm import tqdm)
+     # for _, data in tqdm(enumerate(inference_loader, 0), total=len(inference_loader)):
+     for _, data in enumerate(inference_loader, 0):
+         outputs = model(input_ids=data['input_ids'].to(device), attention_mask=data['attention_mask'].to(device))
+         fin_outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
+         pd.DataFrame(fin_outputs).to_csv(outfile, index=False, header=False, sep="\t", mode='a')
+
+
+ # # dataset from file (one text per line)
+ # from datasets import Dataset
+
+ # with open(linesoftextfile, "rb") as textfile:
+ #     textdict = {"text": [x.decode().rstrip("\n") for x in textfile.readlines()]}
+
+ # inference_dataset = Dataset.from_dict(textdict)
+ # del(textdict)
+ ```
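The loop above writes raw sigmoid probabilities, one tab-separated row per tweet. If binary emotion labels are needed instead (e.g. to mimic the `2018-E-c` gold file layout), a common follow-up step is thresholding. A minimal sketch, run after the script above with `twnames`, `testset_raw`, and `outfile` still in scope; the 0.5 cutoff is an assumption, not a value taken from this repository:

```python
import pandas as pd

# read the probabilities written by the inference loop
probs = pd.read_csv(outfile, sep="\t", header=None, names=twnames)

# assumed post-processing: binarize each emotion at 0.5
labels = (probs >= 0.5).astype(int)

# re-attach IDs and tweets from the raw test set for a gold-style file
labels.insert(0, "ID", testset_raw["ID"])
labels.insert(1, "Tweet", testset_raw["Tweet"])
labels.to_csv("binary_" + outfile, sep="\t", index=False)
```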
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "maxpe/bertin-roberta-base-spanish_semeval18_emodetection",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "multi_label_classification",
+   "transformers_version": "4.11.3",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
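A few fields in this config line up with the custom head in the README and are worth sanity-checking when adapting the code. A minimal sketch, assuming the repo id from `_name_or_path` above resolves on the Hub:

```python
from transformers import AutoConfig

# assumption: the "_name_or_path" above is a valid Hub repo id
config = AutoConfig.from_pretrained("maxpe/bertin-roberta-base-spanish_semeval18_emodetection")

assert config.hidden_size == 768              # matches torch.nn.Linear(768, 11) in the README
assert config.max_position_embeddings == 514  # 512 usable tokens plus the RoBERTa position offset
print(config.problem_type)                    # "multi_label_classification"
```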
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5339065d51a0c569807fb113762a5ce142fb70e8b1af21e18a3c1378f7a93f29
+ size 498699629
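The entry above is a Git LFS pointer, not the weights themselves: `oid` is the SHA-256 of the real payload and `size` its byte count. A downloaded copy can be verified against both; a minimal sketch, assuming `pytorch_model.bin` has already been fetched locally:

```python
import hashlib
import os

path = "pytorch_model.bin"  # assumption: file already downloaded next to this script

# hash the file in 1 MiB chunks to keep memory use flat
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

# compare against the oid and size recorded in the pointer
assert h.hexdigest() == "5339065d51a0c569807fb113762a5ce142fb70e8b1af21e18a3c1378f7a93f29"
assert os.path.getsize(path) == 498699629
```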
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d002368eb9b7f97b0960ba30391b517e7bd4e3b205e571132665c6a39ff1ad1c
+ size 2927