saattrupdan's picture
feat: Add topic classification and offensive speech detection
efd38a2
raw
history blame
4.62 kB
"""Gradio app that showcases Scandinavian zero-shot text classification models."""
import gradio as gr
from transformers import pipeline
from luga import language as detect_language
# Build the zero-shot classification pipeline once, at import time, so that every
# request handled by the app reuses the same loaded model
classifier = pipeline(
    task="zero-shot-classification",
    model="alexandrainst/scandi-nli-large",
)
# Language code used to pick prompts when the detected language is neither
# Swedish ("sv") nor Norwegian ("no"); i.e. Danish prompts are the default.
_FALLBACK_LANGUAGE = "da"

# Word for "confidence level" shown in the output, keyed by detected language
# code. Languages missing from this mapping use the Danish wording below.
_CONFIDENCE_LABELS = {"sv": "konfidensnivå", "no": "konfidensnivå"}
_DEFAULT_CONFIDENCE_LABEL = "konfidensniveau"

# Prompt table: task name -> language code -> (hypothesis template, labels).
# The task names must match the choices offered in the UI dropdown.
_TASK_PROMPTS = {
    "Sentiment classification": {
        "sv": ("Detta exempel är {}.", ("positivt", "negativt", "neutralt")),
        "no": ("Dette eksemplet er {}.", ("positivt", "negativt", "nøytralt")),
        "da": ("Dette eksempel er {}.", ("positivt", "negativt", "neutralt")),
    },
    "News topic classification": {
        "sv": (
            "Detta exempel handlar om {}.",
            (
                "krig",
                "regering",
                "politik",
                "utbildning",
                "hälsa",
                "miljö",
                "ekonomi",
                "affärer",
                "mode",
                "underhållning",
                "sport",
            ),
        ),
        "no": (
            "Dette eksemplet handler om {}.",
            (
                "krig",
                "myndighetene",
                "politikk",
                "utdanning",
                "helse",
                "miljø",
                "økonomi",
                "virksomhet",
                "mote",
                "underholdning",
                "sport",
            ),
        ),
        "da": (
            "Denne nyhedsartikel handler primært om {}.",
            (
                "krig",
                "regering",
                "politik",
                "uddannelse",
                "sundhed",
                "miljø",
                "økonomi",
                "forretning",
                "mode",
                "underholdning",
                "sport",
            ),
        ),
    },
    "Offensive text detection": {
        # The Swedish template previously read "Detta exempel er {}.", mixing in
        # the Danish/Norwegian verb "er"; Swedish uses "är".
        "sv": ("Detta exempel är {}.", ("stötande", "inte stötande")),
        "no": ("Dette eksemplet er {}.", ("støtende", "ikke støtende")),
        "da": ("Dette eksempel er {}.", ("anstødig tale", "ikke anstødig tale")),
    },
}


def classification(task: str, doc: str) -> str:
    """Classify text into categories.

    The language of the text is detected first, and the hypothesis template and
    candidate labels passed to the zero-shot classifier are chosen to match it.
    Swedish ("sv") and Norwegian ("no") have dedicated prompts; any other
    detected language falls back to the Danish prompts.

    Args:
        task (str):
            Task to perform. One of "Sentiment classification", "News topic
            classification" or "Offensive text detection".
        doc (str):
            Text to classify.

    Returns:
        str:
            The predicted label, capitalised, followed on a new line by the
            confidence level as a whole-number percentage in parentheses.

    Raises:
        ValueError:
            If the task is not supported.
    """
    # Validate the task up front so that an unsupported task fails before any
    # language detection or model inference is attempted
    try:
        prompts_by_language = _TASK_PROMPTS[task]
    except (KeyError, TypeError):
        raise ValueError(f"Task {task} not supported.") from None

    # Detect the language of the text. Newlines are replaced by spaces first;
    # presumably the detector expects single-line input (TODO confirm).
    language = detect_language(doc.replace("\n", " ")).name

    # Define the confidence string based on the language
    confidence_str = _CONFIDENCE_LABELS.get(language, _DEFAULT_CONFIDENCE_LABEL)

    # Pick the prompt for the detected language, defaulting to Danish
    hypothesis_template, candidate_labels = prompts_by_language.get(
        language, prompts_by_language[_FALLBACK_LANGUAGE]
    )

    # Run the classifier on the text. A fresh list is passed on every call so
    # the shared prompt table can never be mutated by the pipeline.
    result = classifier(
        doc,
        candidate_labels=list(candidate_labels),
        hypothesis_template=hypothesis_template,
    )

    # Write the raw pipeline output to stdout, as the original code did
    print(result)

    # Return the first label and score of the pipeline output
    return (
        f"{result['labels'][0].capitalize()}\n"
        f"({confidence_str}: {result['scores'][0]:.0%})"
    )
# Task selector for the UI; each choice must match a task name that
# `classification` knows how to handle
dropdown = gr.inputs.Dropdown(
    choices=[
        "Sentiment classification",
        "News topic classification",
        "Offensive text detection",
    ],
    default="Sentiment classification",
    label="Task",
)
# Wire the UI together: the selected task and the entered text are passed, in that
# order, to `classification`, and its returned string is shown as a label
interface = gr.Interface(
    title="Scandinavian zero-shot text classification",
    description=(
        "Classify text in Danish, Swedish or Norwegian into categories, "
        "without any training data!"
    ),
    fn=classification,
    inputs=[dropdown, gr.inputs.Textbox(label="Text")],
    outputs=gr.outputs.Label(type="text"),
)
# Launch the Gradio app. This call is not behind a `__main__` guard, so it runs
# whenever this module is executed or imported.
interface.launch()