luisespinosa committed on
Commit 3b24c3e · 1 Parent(s): eba980e

Update README.md

Files changed (1)
  1. README.md +106 -79
README.md CHANGED
@@ -1,87 +1,114 @@
  # Twitter-roBERTa-base
-
- This is a roBERTa-base model trained on ~58M tweets and finetuned for the emoji prediction task at Semeval 2018.
- For full description: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf).
- To evaluate this and other models on Twitter-specific data, please refer to the [Tweeteval official repository](https://github.com/cardiffnlp/tweeteval).
-
- ## Example of classification
-
  ```python
- from transformers import AutoModelForSequenceClassification
- from transformers import TFAutoModelForSequenceClassification
- from transformers import AutoTokenizer
  import numpy as np
- from scipy.special import softmax
- import csv
- import urllib.request
-
- # Tasks:
- # emoji, emotion, hate, irony, offensive, sentiment
- # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
-
- task='emoji'
- MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
-
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
-
- # download label mapping
- labels=[]
- mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
- with urllib.request.urlopen(mapping_link) as f:
-     html = f.read().decode('utf-8').split("\n")
-     spamreader = csv.reader(html[:-1], delimiter='\t')
- labels = [row[1] for row in spamreader]
-
- # PT
- model = AutoModelForSequenceClassification.from_pretrained(MODEL)
- model.save_pretrained(MODEL)
-
- text = "Good night 😊"
- encoded_input = tokenizer(text, return_tensors='pt')
- output = model(**encoded_input)
- scores = output[0][0].detach().numpy()
- scores = softmax(scores)
-
- # # TF
- # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
- # model.save_pretrained(MODEL)
-
- # text = "Good night 😊"
- # encoded_input = tokenizer(text, return_tensors='tf')
- # output = model(encoded_input)
- # scores = output[0][0].numpy()
- # scores = softmax(scores)
-
- ranking = np.argsort(scores)
- ranking = ranking[::-1]
- for i in range(scores.shape[0]):
-     l = labels[ranking[i]]
-     s = scores[ranking[i]]
-     print(f"{i+1}) {l} {np.round(float(s), 4)}")
-
  ```
-
  Output:
-
  ```
- 1) 😘 0.2637
- 2) ❤️ 0.1952
- 3) 💕 0.1171
- 4) ✨ 0.0927
- 5) 😊 0.0756
- 6) 💜 0.046
- 7) 💙 0.0444
- 8) 😍 0.0272
- 9) 😉 0.0228
- 10) 😎 0.0198
- 11) 😜 0.0166
- 12) 😂 0.0132
- 13) 😁 0.0131
- 14) ☀ 0.0112
- 15) 🎄 0.009
- 16) 💯 0.009
- 17) 🔥 0.008
- 18) 📷 0.0057
- 19) 🇺🇸 0.005
- 20) 📸 0.0048
  ```
  # Twitter-roBERTa-base
+ This is a roBERTa-base model trained on ~58M tweets, described and evaluated in the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). To evaluate this and other LMs on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
+ ## Preprocess Text
+ Replace usernames and links with the placeholders "@user" and "http".
  ```python
+ def preprocess(text):
+     new_text = []
+     for t in text.split(" "):
+         t = '@user' if t.startswith('@') and len(t) > 1 else t
+         t = 'http' if t.startswith('http') else t
+         new_text.append(t)
+     return " ".join(new_text)
+ ```
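+ For example, a quick check of `preprocess` (the sample tweet below is made up for illustration):
+ ```python
+ print(preprocess("@username So excited!! https://example.com"))
+ # @user So excited!! http
+ ```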
+ ## Example Masked Language Model
+ ```python
+ from transformers import pipeline, AutoTokenizer
  import numpy as np
+ MODEL = "cardiffnlp/twitter-roberta-base"
+ fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ def print_candidates():
+     for i in range(5):
+         token = tokenizer.decode(candidates[i]['token'])
+         score = np.round(candidates[i]['score'], 4)
+         print(f"{i+1}) {token} {score}")
+ texts = [
+     "I am so <mask> 😊",
+     "I am so <mask> 😢"
+ ]
+ for text in texts:
+     t = preprocess(text)
+     print(f"{'-'*30}\n{t}")
+     candidates = fill_mask(t)
+     print_candidates()
+ ```
+ Output:
+ ```
+ ------------------------------
+ I am so <mask> 😊
+ 1) happy 0.402
+ 2) excited 0.1441
+ 3) proud 0.143
+ 4) grateful 0.0669
+ 5) blessed 0.0334
+ ------------------------------
+ I am so <mask> 😢
+ 1) sad 0.2641
+ 2) sorry 0.1605
+ 3) tired 0.138
+ 4) sick 0.0278
+ 5) hungry 0.0232
+ ```
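+ The pipeline returns five candidates by default; recent transformers releases accept a `top_k` argument to change this (older releases named it `topk`):
+ ```python
+ candidates = fill_mask(preprocess("I am so <mask> 😊"), top_k=10)
+ ```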
+ ## Example Tweet Embeddings
+ ```python
+ from transformers import AutoTokenizer, AutoModel, TFAutoModel
+ import numpy as np
+ from scipy.spatial.distance import cosine
+ from collections import defaultdict
+ MODEL = "cardiffnlp/twitter-roberta-base"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ model = AutoModel.from_pretrained(MODEL)
+ def get_embedding(text):
+     text = preprocess(text)
+     encoded_input = tokenizer(text, return_tensors='pt')
+     features = model(**encoded_input)
+     features = features[0].detach().cpu().numpy()
+     features_mean = np.mean(features[0], axis=0)
+     return features_mean
+ query = "The book was awesome"
+ tweets = ["I just ordered fried chicken 🐣",
+           "The movie was great",
+           "What time is the next game?",
+           "Just finished reading 'Embeddings in NLP'"]
+ d = defaultdict(int)
+ for tweet in tweets:
+     sim = 1 - cosine(get_embedding(query), get_embedding(tweet))
+     d[tweet] = sim
+ print('Most similar to: ', query)
+ print('----------------------------------------')
+ for idx, x in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
+     print(idx+1, x[0])
  ```
  Output:
  ```
+ Most similar to: The book was awesome
+ ----------------------------------------
+ 1 The movie was great
+ 2 Just finished reading 'Embeddings in NLP'
+ 3 I just ordered fried chicken 🐣
+ 4 What time is the next game?
  ```
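+ To also print the similarity values rather than just the ranking, the final loop above can be extended (illustrative):
+ ```python
+ for idx, (tweet, sim) in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
+     print(f"{idx+1} {tweet} (cosine similarity: {sim:.4f})")
+ ```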
+ ## Example Feature Extraction
+ ```python
+ from transformers import AutoTokenizer, AutoModel, TFAutoModel
+ import numpy as np
+ MODEL = "cardiffnlp/twitter-roberta-base"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ text = "Good night 😊"
+ text = preprocess(text)
+ # Pytorch
+ model = AutoModel.from_pretrained(MODEL)
+ encoded_input = tokenizer(text, return_tensors='pt')
+ features = model(**encoded_input)
+ features = features[0].detach().cpu().numpy()
+ features_mean = np.mean(features[0], axis=0)
+ #features_max = np.max(features[0], axis=0)
+ # # Tensorflow
+ # model = TFAutoModel.from_pretrained(MODEL)
+ # encoded_input = tokenizer(text, return_tensors='tf')
+ # features = model(encoded_input)
+ # features = features[0].numpy()
+ # features_mean = np.mean(features[0], axis=0)
+ # #features_max = np.max(features[0], axis=0)
+ ```
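+ As a quick sanity check on the extracted features (roberta-base has a hidden size of 768; the sequence length depends on the tokenized input), continuing from the PyTorch branch above:
+ ```python
+ print(features.shape)       # (1, sequence_length, 768): one vector per token
+ print(features_mean.shape)  # (768,): a single mean-pooled sentence vector
+ ```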