initial upload

Browse files

Files changed (4) hide show

README.md +98 -0
config.json +28 -0
pytorch_model.bin +3 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,98 @@

+# BERTIN-roBERTa-base-Spanish_SemEval18_Emodetection
+This is a BERTIN-roBERTa-base-Spanish model fine-tuned on ~3500 Spanish tweets annotated with 11 emotion categories from [SemEval-2018 Task 1: Affect in Tweets, Subtask 5: Emotion Classification](https://competitions.codalab.org/competitions/17751).
+Run the classifier on the test set of the competition:
+```python
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModel
+from torch.utils.data import DataLoader
+import torch
+import pandas as pd
+# choose GPU when available
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+tokenizer = AutoTokenizer.from_pretrained("bertin-project/bertin-roberta-base-spanish",model_max_length=512)
+# build custom model with classification layer on top and a dropout layer before
+class RobertaClass(torch.nn.Module):
+    def __init__(self):
+        super(RobertaClass, self).__init__()
+        self.l1 = AutoModel.from_pretrained("bertin-project/bertin-roberta-base-spanish",return_dict=False)
+        self.l2 = torch.nn.Dropout(0.3)
+        self.l3 = torch.nn.Linear(768, 11)
+    def forward(self, input_ids, attention_mask):
+        _, output_1= self.l1(input_ids=input_ids, attention_mask=attention_mask)
+        output_2 = self.l2(output_1)
+        output = self.l3(output_2)
+        return output
+model_name="bertin-roberta-base-spanish_semeval18_emodetection/pytorch_model.bin"
+model=RobertaClass()
+model.load_state_dict(torch.load(model_name,map_location=torch.device(device)))
+model.eval()
+# run on more than 1 GPU
+model = torch.nn.DataParallel(model)
+model.to(device)
+twnames=['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']
+# load from hugging face dataset hub
+testset_raw = load_dataset('sem_eval_2018_task_1','subtask5.spanish',split='test')
+# remove old columns
+testset=testset_raw.remove_columns(twnames+["ID"])
+# tokenize
+testset_tokenized = testset.map(lambda e: tokenizer(e['Tweet'], truncation=True, padding='max_length'), batched=True)
+testset_tokenized=testset_tokenized.remove_columns("Tweet")
+testset_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+outfile="predicted_2018-E-c-Es-test-gold.txt"
+MAX_LEN = 512
+VALID_BATCH_SIZE = 8
+# set batch size according to available RAM
+# VALID_BATCH_SIZE = 1000
+# set num_workers for parallel processing
+inference_params = {'batch_size': VALID_BATCH_SIZE,
+                'shuffle': False,
+                # 'num_workers': 1
+                }
+inference_loader = DataLoader(testset_tokenized, **inference_params)
+open(outfile,"w").close()
+with torch.no_grad():
+    # change lines for progress manager
+    # for _, data in tqdm(enumerate(inference_loader, 0),total=len(inference_loader)):
+    for _, data in enumerate(inference_loader, 0):
+        outputs = model(input_ids=data['input_ids'],attention_mask=data['attention_mask'])
+        fin_outputs=torch.sigmoid(outputs).cpu().detach().numpy().tolist()
+        pd.DataFrame(fin_outputs).to_csv(outfile,index=False,header=False,sep="\t",mode='a')
+# # dataset from file (one text per line)
+# from datasets import Dataset
+# with open(linesoftextfile,"rb") as textfile:
+#     textdict={"text":[x.decode().rstrip("\n") for x in textfile.readlines()]}
+# inference_dataset=Dataset.from_dict(textdict)
+# del(textdict)
+```

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "maxpe/bertin-roberta-base-spanish_semeval18_emodetection",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "multi_label_classification",
+  "transformers_version": "4.11.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5339065d51a0c569807fb113762a5ce142fb70e8b1af21e18a3c1378f7a93f29
+size 498699629

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d002368eb9b7f97b0960ba30391b517e7bd4e3b205e571132665c6a39ff1ad1c
+size 2927