"""Gradio app that showcases Scandinavian zero-shot text classification models."""

import gradio as gr
from transformers import pipeline
from luga import language as detect_language


# Load the zero-shot classification pipeline
classifier = pipeline(
    "zero-shot-classification", model="alexandrainst/scandi-nli-large"
)
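
# Zero-shot classification with this pipeline works via natural language inference:
# each candidate label is inserted into a hypothesis template (e.g. "Dette eksempel
# er {}.") and the model scores how strongly the document entails the resulting
# hypothesis; labels are then ranked by that entailment probability. A minimal sketch
# of a direct call, with illustrative labels and text:
#
#     classifier(
#         "Jeg elsker fodbold",
#         candidate_labels=["sport", "politik"],
#         hypothesis_template="Denne tekst handler om {}.",
#     )
#
# which returns a dict with "sequence", "labels" and "scores", where "labels" is
# sorted by descending score.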


# Set the description for the interface
DESCRIPTION = """Classify text in Danish, Swedish or Norwegian into categories, without
finetuning on any training data!

Note that the models will most likely *not* work as well as a finetuned model on your
specific data, but they can be used as a starting point for your own classification
task ✨


Also, be patient, as this demo is running on a CPU!"""


def classification(task: str, doc: str) -> str:
    """Classify text into categories.

    Args:
        task (str):
            The task to perform. One of "Sentiment classification", "News topic
            classification", "Spam detection" or "Product feedback detection".
        doc (str):
            The text to classify.

    Returns:
        str:
            The predicted label, followed on a new line by the model's confidence.

    Raises:
        ValueError:
            If `task` is not one of the supported tasks.
    """
    # Detect the language of the text. Newlines are replaced with spaces because the
    # underlying fastText model processes one line at a time, and `name` holds an
    # ISO 639-1 code such as "da", "sv" or "no".
    language = detect_language(doc.replace("\n", " ")).name

    # Pick the wording for "confidence level" in the detected language
    # (Swedish/Norwegian: "konfidensnivå", Danish: "konfidensniveau")
    if language in ("sv", "no"):
        confidence_str = "konfidensnivå"
    else:
        confidence_str = "konfidensniveau"

    # If the task is sentiment, classify the text into positive, negative or neutral
    if task == "Sentiment classification":
        if language == "sv":
            hypothesis_template = "Detta exempel är {}."
            candidate_labels = ["positivt", "negativt", "neutralt"]
        elif language == "no":
            hypothesis_template = "Dette eksemplet er {}."
            candidate_labels = ["positivt", "negativt", "nøytralt"]
        else:
            hypothesis_template = "Dette eksempel er {}."
            candidate_labels = ["positivt", "negativt", "neutralt"]

    # Else if the task is topic, classify the text into a topic
    elif task == "News topic classification":
        if language == "sv":
            hypothesis_template = "Detta exempel handlar om {}."
            candidate_labels = [
                "krig",
                "politik",
                "utbildning",
                "hälsa",
                "ekonomi",
                "mode",
                "sport",
            ]
        elif language == "no":
            hypothesis_template = "Dette eksemplet handler om {}."
            candidate_labels = [
                "krig",
                "politikk",
                "utdanning",
                "helse",
                "økonomi",
                "mote",
                "sport",
            ]
        else:
            hypothesis_template = "Denne nyhedsartikel handler primært om {}."
            candidate_labels = [
                "krig",
                "politik",
                "uddannelse",
                "sundhed",
                "økonomi",
                "mode",
                "sport",
            ]

    # Else if the task is spam detection, classify the text into spam or not spam
    elif task == "Spam detection":
        if language == "sv":
            hypothesis_template = "Det här e-postmeddelandet ser {}."
            candidate_labels = {
                "ut som ett skräppostmeddelande": "Spam",
                "inte ut som ett skräppostmeddelande": "Inte spam",
            }
        elif language == "no":
            hypothesis_template = "Denne e-posten ser {}."
            candidate_labels = {
                "ut som en spam-e-post": "Spam",
                "ikke ut som en spam-e-post": "Ikke spam",
            }
        else:
            hypothesis_template = "Denne e-mail ligner {}."
            candidate_labels = {
                "en spam e-mail": "Spam",
                "ikke en spam e-mail": "Ikke spam",
            }

    # Else if the task is product feedback detection, classify the text into product
    # feedback or not product feedback
    elif task == "Product feedback detection":
        if language == "sv":
            hypothesis_template = "Den här kommentaren är {}."
            candidate_labels = {
                "en recension av en produkt": "Produktfeedback",
                "inte en recension av en produkt": "Inte produktfeedback",
            }
        elif language == "no":
            hypothesis_template = "Denne kommentaren er {}."
            candidate_labels = {
                "en anmeldelse av et produkt": "Produkttilbakemelding",
                "ikke en anmeldelse av et produkt": "Ikke produkttilbakemelding",
            }
        else:
            hypothesis_template = "Denne kommentar er {}."
            candidate_labels = {
                "en anmeldelse af et produkt": "Produktfeedback",
                "ikke en anmeldelse af et produkt": "Ikke produktfeedback",
            }

    # Else the task is not supported, so raise an error
    else:
        raise ValueError(f"Task {task} not supported.")

    # If `candidate_labels` is a list then convert it to a dictionary, where the keys
    # are the entries in the list and the values are the keys capitalized
    if isinstance(candidate_labels, list):
        candidate_labels = {label: label.capitalize() for label in candidate_labels}
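    # At this point `candidate_labels` maps each hypothesis phrase (the text inserted
    # into the template) to the label shown to the user,
    # e.g. {"positivt": "Positivt", "negativt": "Negativt", ...}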

    # Run the classifier on the text
    result = classifier(
        doc,
        candidate_labels=list(candidate_labels.keys()),
        hypothesis_template=hypothesis_template,
    )

    # Log the raw pipeline output for debugging
    print(result)

    # Return the predicted label
    return (
        f"{candidate_labels[result['labels'][0]]}\n"
        f"({confidence_str}: {result['scores'][0]:.0%})"
    )
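
# Example (commented out): the function can also be called directly, bypassing the
# UI. With the demo's default Danish sentence the return value is the top label plus
# a confidence line, e.g. something like "Positivt\n(konfidensniveau: ...%)", with the
# exact label and score depending on the model:
#
#     classification("Sentiment classification", "Jeg er helt vild med fodbolden 😊")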


# Create a dropdown menu for the task (the `gr.inputs`/`gr.outputs` namespaces used
# below assume a pre-4.0 version of Gradio)
dropdown = gr.inputs.Dropdown(
    label="Task",
    choices=[
        "Sentiment classification",
        "News topic classification",
        "Spam detection",
        "Product feedback detection",
    ],
    default="Sentiment classification",
)

# Create a text box for the input text
input_textbox = gr.inputs.Textbox(
    label="Text", default="Jeg er helt vild med fodbolden 😊"
)

# Create the interface, where the function depends on the task chosen
interface = gr.Interface(
    fn=classification,
    inputs=[dropdown, input_textbox],
    outputs=gr.outputs.Label(type="text"),
    title="Scandinavian zero-shot text classification",
    description=DESCRIPTION,
)

# Run the app
interface.launch()