pfialho committed on
Commit 382c5c3 · verified · 1 Parent(s): 1beacf2

Update app.py

Files changed (1):
  1. app.py +23 -11
app.py CHANGED
@@ -6,17 +6,29 @@ from transformers import AutoTokenizer
 from huggingface_hub import from_pretrained_keras
 
 
-app_title = "Portuguese Counter Hate Speech Detection (NFAA)"
+app_title = "Portuguese Counter Hate Speech Detection"
 
 app_description = """
-This app is the culmination of the kNOwHATE consortium project, which aimed to tackle Online Hate Speech in the Portuguese comunity. It serves as an user-friendly interface to classify text and identify instances of Hate Speech.
-This app leverages state-of-the-art Natural Language Processing models developed in the scope of this project to classify harmful text.
-Select a model from the dropdown menu and input your text to see the classification results. Explore the examples of Hate Speech and Non-Hate Speech offered, and join us in fostering a safer and more respectful online community.
-For more information about the kNOwHATE project and its initiatives, visit our website [here](https://knowhate.eu) and to explore and use these models visit our Hugging Face page [here](https://huggingface.co/knowhate).
+This prototype from the kNOwHATE project aims to classify a Portuguese target sentence as either hate speech, counter hate speech or neutral, considering another sentence as context.
+We collected 24,739 YouTube comments and 29,846 tweets, annotated by experts, and trained our prototype on this data.
+We invite you to try it out. You can just enter a pair of sentences below, one as target and another as context, and submit it to see if the target is either hate speech, counter hate speech or neutral, relative to the context.
+For more, visit our [website](https://knowhate.eu) and [Hugging Face page](https://huggingface.co/knowhate).
 """
 
 def_model = 'knowhate/counterhate-twitter-bertimbau'
 
+model_list = [
+    def_model,
+    "knowhate/counterhate-twitter-xlmrobertabase",
+    "knowhate/counterhate-twitter-bertbasemultilingualcased",
+    "knowhate/counterhate-twitter-hateberttuga"
+]
+
+kw_to_hf = {"knowhate/counterhate-twitter-bertimbau": "neuralmind/bert-base-portuguese-cased",
+            "knowhate/counterhate-twitter-xlmrobertabase": "xlm-roberta-base",
+            "knowhate/counterhate-twitter-bertbasemultilingualcased": "bert-base-multilingual-cased",
+            "knowhate/counterhate-twitter-hateberttuga": "./hate_bert_tuga/"}
+
 # 1 0 2
 app_examples = [
     ["Essa gente tem é de deixar de ser apaparicada pelo Estado e começar a cumprir os seus deveres como cidadãos",
@@ -30,16 +42,16 @@ app_examples = [
      def_model]
 ]
 
-model_list = [
-    def_model
-]
-
 def predict(text, target, chosen_model):
     # model1 = tf.keras.models.load_model(chosen_model, custom_objects={"TFBertModel": TFBertModel})
     model1 = from_pretrained_keras(chosen_model)
 
-    checkpoint = "neuralmind/bert-base-portuguese-cased"
-    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, model_max_length=512)
+    checkpoint = kw_to_hf[chosen_model]  # e.g. "neuralmind/bert-base-portuguese-cased"
+    if '/' in checkpoint:
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, model_max_length=512)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
+
     tokpair = tokenizer(text, target, truncation=True, padding='max_length', return_tensors='tf', return_token_type_ids=False)
 
     outp = model1.signatures["serving_default"](**tokpair)
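For reference, the updated `predict` path can be exercised outside the app. The sketch below is a minimal, self-contained walkthrough of the same steps, assuming the `kw_to_hf` mapping above; the name of the SavedModel's output tensor is not visible in this diff, so the sketch simply takes the first (assumed only) value of the signature's output dict:

```python
import tensorflow as tf
from transformers import AutoTokenizer
from huggingface_hub import from_pretrained_keras

# Per-model tokenizer lookup introduced by this commit (one entry shown).
kw_to_hf = {"knowhate/counterhate-twitter-bertimbau": "neuralmind/bert-base-portuguese-cased"}

chosen_model = "knowhate/counterhate-twitter-bertimbau"
model = from_pretrained_keras(chosen_model)

checkpoint = kw_to_hf[chosen_model]
if '/' in checkpoint:  # hub repo id or local path: the commit pins the max length here
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, model_max_length=512)
else:                  # bare model name such as "xlm-roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

# Tokenize the sentence pair exactly as predict() does.
tokpair = tokenizer("uma frase alvo", "uma frase de contexto",
                    truncation=True, padding='max_length',
                    return_tensors='tf', return_token_type_ids=False)

outp = model.signatures["serving_default"](**tokpair)
scores = list(outp.values())[0].numpy()[0]  # assumption: a single output head
print(scores)
```

Two details worth noting: `'/' in checkpoint` is a heuristic that matches both hub repo ids (`neuralmind/bert-base-portuguese-cased`) and the local path `./hate_bert_tuga/`, while bare names like `xlm-roberta-base` fall through to the tokenizer's default settings; and `kw_to_hf[chosen_model]` raises `KeyError` for any model id missing from the mapping, so new entries in `model_list` must be added to both structures.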
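The diff does not show how these pieces are wired into the interface, but given `app_title`, `app_description`, `app_examples`, `model_list`, and a `predict(text, target, chosen_model)` function, a typical Gradio Space would connect them roughly as follows. This is purely illustrative: the component labels and the output type are guesses, not part of this commit.

```python
import gradio as gr

# Hypothetical wiring (not shown in the diff): three inputs matching
# predict(text, target, chosen_model) and the [text, target, model] example rows.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Target sentence"),
        gr.Textbox(label="Context sentence"),
        gr.Dropdown(choices=model_list, value=def_model, label="Model"),
    ],
    outputs=gr.Label(),  # assumes predict() returns a label/confidence mapping
    title=app_title,
    description=app_description,
    examples=app_examples,
)

demo.launch()
```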