File size: 6,409 Bytes
b72431a 4485599 3015f8c 600cdb3 3015f8c b72431a f033d32 b72431a d120f1f 382c5c3 3416a77 382c5c3 3416a77 382c5c3 3416a77 382c5c3 b72431a d120f1f 3b282ec d40969f 0325f6e 382c5c3 2aa94ec 795d585 29b0210 278f39e 795d585 278f39e 382c5c3 795d585 382c5c3 600cdb3 382c5c3 795d585 382c5c3 4485599 d120f1f 3b282ec 0325f6e 8c2765d 0325f6e 8c2765d 600cdb3 a5b1554 600cdb3 a5b1554 600cdb3 d120f1f 600cdb3 d120f1f 0fd88df 6488dec 795d585 6488dec 795d585 6488dec 0fd88df 382c5c3 1b73302 382c5c3 0fd88df d120f1f 0fd88df d120f1f 0fd88df 4485599 d120f1f 4485599 d120f1f b2414b4 d120f1f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
from transformers import TFBertModel, TFXLMRobertaModel
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from huggingface_hub import from_pretrained_keras
import re
# Title shown at the top of the Gradio app.
app_title = "Portuguese Counter Hate-Speech Detection"
# Markdown blurb passed as the `description` of gr.Interface below.
app_description = """
This prototype from the kNOwHATE project aims to classify a Portuguese target sentence as either hate speech, counter hate speech or neutral, considering another sentence as context.
We collected 24,739 YouTube comments and 29,846 tweets, annotated by experts, and trained our prototype on this data.
We invite you to try it out. You can just enter a pair of sentences below, one as target and another as context, and submit it to see if the target is either hate speech, counter hate speech or neutral, relative to the context.
For more, visit our [website](https://knowhate.eu) and [Hugging Face page](https://huggingface.co/knowhate).
"""
# Default selections: def_model is pre-selected in the model dropdown;
# def_model2 is the model attached to the Twitter examples further down.
def_model = 'knowhate/counterhate-youtube-bertimbau'
def_model2 = 'knowhate/counterhate-twitter-bertbasemultilingualcased-cleantxt'
# All selectable fine-tuned classifier repos on the Hugging Face Hub.
# Repos ending in '-cleantxt' get their inputs stripped of mentions/URLs/emoji
# by predict() before tokenization.
model_list = [
    def_model,
    "knowhate/counterhate-youtube-xlmrobertabase",
    "knowhate/counterhate-youtube-bertbasemultilingualcased",
    "knowhate/counterhate-twitter-bertimbau",
    "knowhate/counterhate-twitter-bertimbau-cleantxt",
    "knowhate/counterhate-twitter-xlmrobertabase",
    "knowhate/counterhate-twitter-xlmrobertabase-cleantxt",
    "knowhate/counterhate-twitter-bertbasemultilingualcased",
    def_model2
]
# Maps each fine-tuned classifier repo to the base checkpoint whose tokenizer
# it was trained with; predict() loads an AutoTokenizer from this value.
kw_to_hf = {"knowhate/counterhate-twitter-bertimbau": "neuralmind/bert-base-portuguese-cased",
            "knowhate/counterhate-twitter-bertimbau-cleantxt": "neuralmind/bert-base-portuguese-cased",
            "knowhate/counterhate-twitter-xlmrobertabase": "xlm-roberta-base",
            "knowhate/counterhate-twitter-xlmrobertabase-cleantxt": "xlm-roberta-base",
            "knowhate/counterhate-twitter-bertbasemultilingualcased": "bert-base-multilingual-cased",
            "knowhate/counterhate-twitter-bertbasemultilingualcased-cleantxt": "bert-base-multilingual-cased",
            "knowhate/counterhate-youtube-bertimbau": "neuralmind/bert-base-portuguese-cased",
            "knowhate/counterhate-youtube-xlmrobertabase": "xlm-roberta-base",
            "knowhate/counterhate-youtube-bertbasemultilingualcased": "bert-base-multilingual-cased"
            }
# Disabled checkpoints, kept for reference:
# "knowhate/counterhate-youtube-hateberttuga": "knowhate/hateberttuga",
# "knowhate/counterhate-twitter-hateberttuga": "knowhate/hateberttuga"
# NOTE(review): meaning of "1 0 2" below is unclear — possibly the expected
# class labels of the first examples; confirm with the authors.
# 1 0 2
# Clickable demo examples: each entry is [first sentence, second sentence, model].
# The first element pre-fills the "Context" textbox and the second the "Target"
# textbox (see `inputs` below); the third selects the model in the dropdown.
app_examples = [
    ["Totalmente de acordo mano ...quando somos nós já nao e racismo...Fdse isto e uma vergonha ..",
     "Mimimi... Vocês são preconceituosos e não tem vergonha na cara!",
     def_model],
    ["\"Não acredites em tudo o que lês na Internet\" - Abraham Lincoln",
     "A Internet foi desenvolvida entre os anos 1973-1989.",
     def_model],
    ["Então o Marcelo foi ao Qatar para 'falar de direitos humanos', mas não foi a Odemira?",
     "esse retardado mental, foi a praia do katar, la tem a agua mais kentinha.",
     def_model],
    ["Essa gente tem é de deixar de ser apaparicada pelo Estado e começar a cumprir os seus deveres como cidadãos.",
     "Nepia o que faz com que as pessoas generalizem é o ódio intrínseco que têm contra uma etnia, ng é responsável pela sua xenofobia",
     def_model2],
    ["Nem vou comentar o hate e misoginia que tenho visto aqui no tt em relação à Anitta",
     "E xenofobia também. Tugas no seu melhor",
     def_model2],
    ["A Festa tá no Climax, chama o zuca pra Dançar.",
     "Já reparaste no contador da luz? Vai trabalhar malandro",
     def_model2]
]
# Compiled once at import time so repeated predict() calls don't rebuild them.
_MENTION_RE = re.compile(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z0-9-_]+[A-Za-z0-9-_]+)")
_URL_RE = re.compile(
    r"(?i)\b(?:[a-z][\w.+-]+:(?:/{1,3}|[?+]?[a-z0-9%]))(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\x60!()\[\]{};:'\".,<>?«»“”‘’])"
)
_EMOJI_RE = re.compile("["
                       "\U0001F600-\U0001F64F"  # emoticons
                       "\U0001F300-\U0001F5FF"  # symbols & pictographs
                       "\U0001F680-\U0001F6FF"  # transport & map symbols
                       "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "\U00002500-\U00002BEF"  # box drawing through misc symbols
                       "\U00002702-\U000027B0"  # dingbats
                       "\U000024C2-\U0001F251"
                       "\U0001f926-\U0001f937"
                       "\U00010000-\U0010ffff"  # all supplementary planes
                       "\u2640-\u2642"          # gender signs
                       "\u2600-\u2B55"
                       "\u200d"                 # zero-width joiner (emoji sequences)
                       "\u23cf"
                       "\u23e9"
                       "\u231a"
                       "\ufe0f"                 # variation selector-16
                       "\u3030"
                       "]+", re.UNICODE)


def remove_emojis(data):
    """Clean a sentence for the '-cleantxt' models.

    Despite the name this strips three things, mirroring the training-data
    cleanup: @mentions, URLs, and emoji / pictographic characters.

    Args:
        data: the raw input sentence.

    Returns:
        The cleaned sentence. Mention/URL removal also strips surrounding
        whitespace at the ends; emoji removal does not.
    """
    if '@' in data:
        data = _MENTION_RE.sub("", data).strip()
    # Bug fix: the guard previously tested for 'https' only, so plain
    # 'http://' URLs were never removed even though the pattern matches them.
    if 'http' in data:
        data = _URL_RE.sub("", data).strip()
    return _EMOJI_RE.sub('', data)
def predict(text, target, chosen_model):
    """Score a (text, target) sentence pair with the selected hub model.

    Args:
        text: the context sentence (first UI textbox).
        target: the target sentence to classify relative to the context.
        chosen_model: Hugging Face repo id picked from the dropdown.

    Returns:
        A dict mapping 'Neutral', 'Counter Speech' and 'Hate Speech' to
        their scores, in the format gr.Label expects.
    """
    print(chosen_model)
    # Models fine-tuned on cleaned text expect mention/URL/emoji-free input.
    if '-cleantxt' in chosen_model:
        text, target = remove_emojis(text), remove_emojis(target)
    print(text)
    print(target)
    keras_model = from_pretrained_keras(chosen_model)
    # Tokenize with the base checkpoint the classifier was fine-tuned from.
    checkpoint = kw_to_hf[chosen_model]
    tokenizer_kwargs = {'use_fast': True}
    if '/' in checkpoint:
        # NOTE(review): only org-namespaced checkpoints get an explicit
        # 512-token cap here — confirm this asymmetry is intentional.
        tokenizer_kwargs['model_max_length'] = 512
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, **tokenizer_kwargs)
    pair = tokenizer(text, target, truncation=True, padding='max_length',
                     return_tensors='tf', return_token_type_ids=False)
    raw = keras_model.signatures["serving_default"](**pair)
    scores = tf.make_ndarray(tf.make_tensor_proto(raw['outp']))[0]
    return {'Neutral': scores[0], 'Counter Speech': scores[1], 'Hate Speech': scores[2]}
# Input widgets: context and target sentences (pre-filled from the first
# example) plus a dropdown to pick the classifier model.
inputs = [
    gr.Textbox(label="Context", value= app_examples[0][0]),
    gr.Textbox(label="Target", value= app_examples[0][1]),
    gr.Dropdown(label="Model", choices=model_list, value=model_list[0])
]
# Single output: a label widget showing the per-class scores from predict().
outputs = [
    gr.Label(label="Result"),
]
# Build and launch the app; app_examples populates the clickable example table.
gr.Interface(fn=predict, inputs=inputs, outputs=outputs, title=app_title,
             description=app_description, examples=app_examples, theme=gr.themes.Base(primary_hue="red")).launch()