File size: 3,838 Bytes
b72431a
4485599
 
 
 
3015f8c
 
b72431a
382c5c3
b72431a
d120f1f
382c5c3
3416a77
382c5c3
3416a77
382c5c3
3416a77
382c5c3
b72431a
d120f1f
72a8b62
0325f6e
382c5c3
 
d7ba310
382c5c3
 
72a8b62
 
 
 
 
382c5c3
d7ba310
382c5c3
 
d7ba310
c909d5a
382c5c3
4485599
d120f1f
8c2765d
 
0325f6e
8c2765d
 
0325f6e
8c2765d
 
0325f6e
d120f1f
 
 
0fd88df
 
 
382c5c3
 
 
1b73302
382c5c3
 
0fd88df
d120f1f
0fd88df
d120f1f
0fd88df
4485599
 
d120f1f
4485599
 
 
d120f1f
 
 
 
 
b2414b4
 
 
d120f1f
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
from transformers import TFBertModel, TFXLMRobertaModel
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from huggingface_hub import from_pretrained_keras


# Title string handed to gr.Interface.
app_title = "Portuguese Counter Hate Speech Detection"

# Description string handed to gr.Interface (contains Markdown links).
# Spelled out as adjacent literals so that every line break and the trailing
# spaces that are part of the text stay visible and survive editors that
# strip trailing whitespace.
app_description = (
    "\n"
    "This prototype from the kNOwHATE project aims to classify a Portuguese target sentence as either hate speech, counter hate speech or neutral, considering another sentence as context. \n"
    "\n"
    "We collected 24,739 YouTube comments and 29,846 tweets, annotated by experts, and trained our prototype on this data. \n"
    "\n"
    "We invite you to try it out. You can just enter a pair of sentences below, one as target and another as context, and submit it to see if the target is either hate speech, counter hate speech or neutral, relative to the context.\n"
    "\n"
    "For more, visit our [website](https://knowhate.eu) and [Hugging Face page](https://huggingface.co/knowhate).\n"
)

# Hub repo id of the classifier that is pre-selected in the UI.
def_model = "knowhate/counterhate-youtube-hateberttuga"

# Classifier repo ids offered as dropdown choices (default first).
model_list = [def_model, "knowhate/counterhate-youtube-bertimbau"]

# Commented-out model ids, kept as found (note the last one equals def_model):
# "knowhate/counterhate-twitter-xlmrobertabase",
# "knowhate/counterhate-twitter-bertbasemultilingualcased",
# "knowhate/counterhate-twitter-hateberttuga",
# "knowhate/counterhate-youtube-hateberttuga",

# Base checkpoints that more than one classifier maps to.
_BERTIMBAU = "neuralmind/bert-base-portuguese-cased"
_HATEBERTTUGA = "knowhate/hateberttuga"

# Classifier repo id -> checkpoint id that predict() loads the tokenizer from.
kw_to_hf = {
    "knowhate/counterhate-twitter-bertimbau": _BERTIMBAU,
    "knowhate/counterhate-youtube-bertimbau": _BERTIMBAU,
    "knowhate/counterhate-twitter-xlmrobertabase": "xlm-roberta-base",
    "knowhate/counterhate-twitter-bertbasemultilingualcased": "bert-base-multilingual-cased",
    "knowhate/counterhate-youtube-hateberttuga": _HATEBERTTUGA,
    "knowhate/counterhate-twitter-hateberttuga": _HATEBERTTUGA,
}

# (context, target) sentence pairs used as clickable examples.
# Original note: "1 0 2" -- presumably the expected class index of each pair,
# in the label order predict() uses; TODO confirm.
_example_pairs = [
    (
        "Tudo apoiantes do lula livre que o bloco de esterco anda a importar para cá.",
        "Sim, têm um presidente ditador. E se houver muita gente a pensar como o senhor, aqui acontecerá a mesma coisa.",
    ),
    (
        "\"Não acredites em tudo o que lês na Internet\" - Abraham Lincoln",
        "A Internet foi desenvolvida entre os anos 1973-1989.",
    ),
    (
        "Então o Marcelo foi ao Qatar para 'falar de direitos humanos', mas não foi a Odemira?",
        "esse retardado mental, foi a praia do katar, la tem a agua mais kentinha.",
    ),
]

# Every example row is [context, target, model id] and uses the default model.
app_examples = [[context, target, def_model] for context, target in _example_pairs]

# Per-process caches: each classifier / tokenizer is fetched and deserialized
# once and then reused, instead of being reloaded on every request.
_MODEL_CACHE = {}
_TOKENIZER_CACHE = {}


def _get_model(repo_id):
    """Return the Keras model for *repo_id*, loading it on first use only."""
    try:
        return _MODEL_CACHE[repo_id]
    except KeyError:
        loaded = from_pretrained_keras(repo_id)
        _MODEL_CACHE[repo_id] = loaded
        return loaded


def _get_tokenizer(checkpoint):
    """Return the fast tokenizer for *checkpoint*, loading it on first use only."""
    try:
        return _TOKENIZER_CACHE[checkpoint]
    except KeyError:
        pass

    if '/' in checkpoint:
        # Namespaced checkpoints get an explicit 512-token limit; presumably
        # their tokenizer config does not declare one, which
        # padding='max_length' needs -- TODO confirm.
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, model_max_length=512)
    else:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

    _TOKENIZER_CACHE[checkpoint] = tokenizer
    return tokenizer


def predict(text: str, target: str, chosen_model: str) -> dict:
    """Score a sentence pair with the selected classifier.

    Args:
        text: First sentence of the pair (wired to the "Context" textbox).
        target: Second sentence of the pair (wired to the "Target" textbox).
        chosen_model: Repo id of the classifier; must be a key of ``kw_to_hf``.

    Returns:
        A dict mapping 'Neutral', 'Counter Speech' and 'Hate Speech' to the
        model's scores at output positions 0, 1 and 2, as plain Python floats.

    Raises:
        KeyError: If ``chosen_model`` has no entry in ``kw_to_hf``.
    """
    model1 = _get_model(chosen_model)

    # The tokenizer comes from the checkpoint mapped to the classifier, not
    # from the classifier repo itself.
    checkpoint = kw_to_hf[chosen_model]
    tokenizer = _get_tokenizer(checkpoint)

    # Encode both sentences as one pair, padded/truncated to the tokenizer's
    # maximum length; token type ids are not passed to the model.
    tokpair = tokenizer(text, target, truncation=True, padding='max_length', return_tensors='tf', return_token_type_ids=False)

    outp = model1.signatures["serving_default"](**tokpair)

    # 'outp' is the name of the signature's output; take the first (only) row.
    proto_tensor = tf.make_tensor_proto(outp['outp'])
    allscores = tf.make_ndarray(proto_tensor)[0]

    # Convert numpy scalars to built-in floats so the result serializes
    # without depending on numpy support downstream.
    scores_dict = {
        'Neutral': float(allscores[0]),
        'Counter Speech': float(allscores[1]),
        'Hate Speech': float(allscores[2]),
    }

    return scores_dict

# The first example row pre-fills the two text boxes.
_default_context = app_examples[0][0]
_default_target = app_examples[0][1]

_context_box = gr.Textbox(label="Context", value=_default_context)
_target_box = gr.Textbox(label="Target", value=_default_target)
_model_picker = gr.Dropdown(label="Model", choices=model_list, value=model_list[0])

# Order matters: components are passed to predict() positionally as
# (text, target, chosen_model).
inputs = [_context_box, _target_box, _model_picker]

# predict() returns a label -> score dict, rendered by a single Label widget.
outputs = [gr.Label(label="Result")]

# Build the interface, then start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=app_title,
    description=app_description,
    examples=app_examples,
    theme=gr.themes.Base(primary_hue="red"),
)
demo.launch()