|
from collections import Counter

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
# Title passed to ``gr.Interface``.
app_title = "Portuguese Hate Speech Detection"

# Introductory text passed to ``gr.Interface`` as the description.  The two
# sentences were previously out of order, leaving the first one outside the
# string literal; they are restored to reading order here.
app_description = """
This app detects hate speech on Portuguese text using multiple models. You can either introduce your own sentences by filling in "Text" or
click on one of the examples provided below.
"""
|
|
|
# Example sentences; each inner list supplies the text of one example.
# The first one is also used as the initial value of the "Text" input.
app_examples = [
    ["as pessoas tem que perceber que ser 'panasca' não é deixar de ser homem, é deixar de ser humano kkk"],
    ["ontem encontrei-me com um amigo meu e tivemos uma conversa agradável"],
]

# Explanatory text for the textual result box.
output_textbox_component_description = (
    "\n"
    "This box will display the hate speech detection results based on the average score of multiple models."
    "\n"
)

# Explanatory text for the per-model breakdown, keyed by section name.
output_json_component_description = {
    "breakdown": (
        "\n"
        "This box presents a detailed breakdown of the evaluation for each model."
        "\n"
    ),
}

# Short label per class index, used as keys of the score dictionary.
short_score_descriptions = {
    0: "Non Hate Speech",
    1: "Hate Speech",
}

# Full-sentence verdict per class index.
score_descriptions = {
    0: "This text is not Hate Speech.",
    1: "This text is Hate Speech.",
}
|
|
|
# Maps each model identifier (as passed to ``from_pretrained``) to the name
# shown in the "Model" dropdown.  Insertion order defines the dropdown order.
user_friendly_name = {
    "knowhate/HateBERTimbau": "HateBERTimbau (Original)",
    "knowhate/HateBERTimbau-youtube": "HateBERTimbau (YouTube)",
    "knowhate/HateBERTimbau-twitter": "HateBERTimbau (Twitter)",
    "knowhate/HateBERTimbau-yt-tt": "HateBERTimbau (YouTube + Twitter)",
}

# Model identifiers, in the same order as the mapping above.
model_list = list(user_friendly_name)

# Display name -> model identifier, for resolving the dropdown selection.
reverse_user_friendly_name = dict(
    zip(user_friendly_name.values(), user_friendly_name.keys())
)

# Display names in dropdown order.
user_friendly_name_list = [*user_friendly_name.values()]
|
|
|
# One entry per identifier in ``model_list``: the identifier itself plus the
# tokenizer and sequence-classification model loaded for it.  Everything is
# loaded here, at import time, so ``predict`` only has to look an entry up.
model_array = [
    {
        "name": model_name,
        "tokenizer": AutoTokenizer.from_pretrained(model_name),
        "model": AutoModelForSequenceClassification.from_pretrained(model_name),
    }
    for model_name in model_list
]
|
|
|
def most_frequent(array):
    """Return the element that occurs most often in ``array``.

    Args:
        array: Iterable of hashable items.

    Returns:
        The item with the highest count.  When several items share the
        highest count, the one encountered first is returned (this is the
        ordering ``Counter.most_common`` provides).

    Raises:
        IndexError: If ``array`` is empty.
    """
    occurrence_count = Counter(array)
    return occurrence_count.most_common(1)[0][0]
|
|
|
|
|
def _describe_label(idx):
    """Return the verdict text displayed for class index ``idx``."""
    description = score_descriptions[idx]
    # ``score_descriptions_pt`` is referenced here but is not defined in the
    # visible part of this file.  Resolve it defensively so that a missing
    # table yields an English-only verdict instead of a NameError per request.
    description_pt = globals().get("score_descriptions_pt", {}).get(idx)
    if description_pt is None:
        return description
    return description + "\n \n" + description_pt


def predict(s1, chosen_model):
    """Classify ``s1`` with the model selected in the UI.

    Args:
        s1: Text to classify.
        chosen_model: Display name of the model (a value of
            ``user_friendly_name``).  A falsy value selects the first entry
            of ``user_friendly_name_list``.

    Returns:
        A ``(scores, markdown_description)`` tuple: ``scores`` maps each
        short class label to its softmax probability, and
        ``markdown_description`` is the verdict text for the most probable
        class.

    Raises:
        KeyError: If ``chosen_model`` is not a known display name.
        LookupError: If the selected model has no entry in ``model_array``.
    """
    if not chosen_model:
        chosen_model = user_friendly_name_list[0]
    full_chosen_model_name = reverse_user_friendly_name[chosen_model]

    row = next(
        (entry for entry in model_array if entry["name"] == full_chosen_model_name),
        None,
    )
    if row is None:
        # Previously this case fell through to an unbound ``logits`` variable.
        raise LookupError(f"Model {full_chosen_model_name!r} is not loaded.")

    tokenizer = row["tokenizer"]
    model = row["model"]

    # truncation=True asks the tokenizer to cut inputs that exceed the
    # model's maximum length, so very long text is not passed through whole.
    model_input = tokenizer(
        [s1], padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        output = model(**model_input)

    # output[0] is treated as the logits (as in the original code); the
    # second [0] selects the single sentence of the batch.
    probabilities = torch.softmax(output[0][0], dim=-1).tolist()

    max_pos = probabilities.index(max(probabilities))
    scores = {
        short_score_descriptions[k]: v for k, v in enumerate(probabilities)
    }
    return scores, _describe_label(max_pos)
|
|
|
|
|
# Input components, in the positional order of ``predict(s1, chosen_model)``.
inputs = [
    gr.Textbox(value=app_examples[0][0], label="Text"),
    gr.Dropdown(
        choices=user_friendly_name_list,
        value=user_friendly_name_list[0],
        label="Model",
    ),
]

# Output components, matching the two values returned by ``predict``.
outputs = [
    gr.Label(label="Result"),
    gr.Markdown(),
]
|
|
|
|
|
# Build the interface and start serving it.
# ``article_string`` is not defined in the visible part of this file, so a
# direct reference raises NameError at start-up.  Look it up defensively:
# when it is absent, ``None`` is passed, which is Gradio's default for
# ``article`` (no article section is rendered).
gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=app_title,
    description=app_description,
    examples=app_examples,
    article=globals().get("article_string"),
).launch()