File size: 3,392 Bytes
158b5a1
 
fb05782
158b5a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb05782
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158b5a1
 
 
 
 
 
 
 
 
 
 
c1f3033
158b5a1
 
f61fe0b
158b5a1
 
 
 
 
 
fb05782
fb347fa
fb05782
 
158b5a1
 
 
 
 
fb05782
 
 
 
 
 
56b1f1f
fb05782
20f3f13
fb05782
158b5a1
 
fb05782
f7c3109
 
cc6aa04
 
f7c3109
 
cc6aa04
f7c3109
 
 
 
 
fb05782
 
158b5a1
fb05782
 
 
158b5a1
 
 
fb05782
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import torch
import spacy
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import gradio as gr

# Root directory shared by every model/dataset cache used by this app.
PATH = '/data/' # at least 150GB storage needs to be attached

# Point all cache-related environment variables at the same directory.
# NOTE(review): these are assigned after `transformers` has already been
# imported at the top of the file -- confirm the library still honours them
# when set this late.
os.environ.update(
    dict.fromkeys(
        ('TRANSFORMERS_CACHE', 'HF_HOME', 'HF_DATASETS_CACHE', 'TORCH_HOME'),
        PATH,
    )
)

# Access token read from the "hf_read" environment variable; a missing
# variable raises KeyError while the module is being imported.
HF_TOKEN = os.environ["hf_read"]

# Class index -> human readable sentiment name.
SENTIMENT_LABEL_NAMES = {
    0: "Negative",
    1: "No sentiment or Neutral sentiment",
    2: "Positive",
}
# Choices offered in the language dropdown of the UI.
LANGUAGES = [
    "Czech",
    "English",
    "French",
    "German",
    "Hungarian",
    "Polish",
    "Slovakian",
]

def load_spacy_model(model_name="xx_sent_ud_sm"):
    """Load the spaCy pipeline *model_name*, downloading it if loading fails.

    An ``OSError`` from ``spacy.load`` is treated as "package not available":
    the model is downloaded through ``spacy.cli.download`` and loaded again.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # First attempt failed: fetch the package, then retry exactly once.
        spacy.cli.download(model_name)
        return spacy.load(model_name)

def split_sentences(text, model):
    """Split *text* into sentences and return them as a list of strings.

    The pipeline passed in as *model* is modified in place: every component
    currently listed in ``model.pipe_names`` is disabled and only the
    ``"senter"`` component is enabled before the text is processed.
    """
    # Only the sentence splitter is needed, so switch everything else off.
    active_pipes = model.pipe_names
    model.disable_pipes(active_pipes)
    model.enable_pipe("senter")

    sentences = []
    for span in model(text).sents:
        sentences.append(span.text)
    return sentences

def build_huggingface_path(language: str):
    """Return the Hugging Face model id used for *language*.

    "Czech" and "Slovakian" map to the visegradmedia model; every other
    value falls back to the poltextlab model.
    """
    uses_visegrad_model = language in ("Czech", "Slovakian")
    return (
        "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
        if uses_visegrad_model
        else "poltextlab/xlm-roberta-large-pooled-MORES"
    )

# Loaded (model, tokenizer) pairs keyed by (model_id, tokenizer_id). Without
# this, the full checkpoint would be re-read for every single sentence that is
# classified. The cache lives for the lifetime of the process; the ids used in
# this file form a small fixed set, so it stays bounded.
_LOADED_MODELS = {}


def _get_model_and_tokenizer(model_id, tokenizer_id):
    """Return the (model, tokenizer) pair for the given ids, loading it once."""
    key = (model_id, tokenizer_id)
    if key not in _LOADED_MODELS:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            low_cpu_mem_usage=True,
            device_map="auto",
            offload_folder="offload",
            token=HF_TOKEN,
        )
        # Inference only: switch to evaluation mode once, at load time.
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
        _LOADED_MODELS[key] = (model, tokenizer)
    return _LOADED_MODELS[key]


def predict(text, model_id, tokenizer_id):
    """Classify *text* and return the winning label with its confidence.

    Args:
        text: The text to classify; it is truncated to 64 tokens.
        model_id: Hub id of the sequence-classification model.
        tokenizer_id: Hub id of the tokenizer.

    Returns:
        A ``(label, confidence)`` tuple where ``label`` comes from the
        model's ``id2label`` mapping and ``confidence`` is the highest class
        probability formatted as a percentage string such as ``"97.31%"``.
    """
    model, tokenizer = _get_model_and_tokenizer(model_id, tokenizer_id)

    inputs = tokenizer(text,
                       max_length=64,
                       truncation=True,
                       padding="do_not_pad",
                       return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    # Convert the numpy index to a plain int before using it as a dict key.
    label_pred = model.config.id2label[int(probs.argmax())]
    probability_pred = f"{round(100*probs.max(), 2)}%"
    return label_pred, probability_pred


def predict_wrapper(text, language):
    """Classify every sentence of *text* with the model chosen for *language*.

    Returns a tuple ``(rows, info)``: ``rows`` is a list of
    ``[sentence, label, confidence]`` lists, one per detected sentence, and
    ``info`` is an HTML snippet linking to the model that was used.
    """
    model_id = build_huggingface_path(language)
    tokenizer_id = "xlm-roberta-large"

    # Sentence segmentation happens first; each sentence is scored separately.
    sentences = split_sentences(text, load_spacy_model())

    results = [
        [sentence, *predict(sentence, model_id, tokenizer_id)]
        for sentence in sentences
    ]

    output_info = f'Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.'
    return results, output_info

# Gradio UI definition. Components are created inside nested `with` contexts,
# so the order of the statements below determines the on-screen layout.
with gr.Blocks() as demo:
    # Row 1: free-text input next to the language selector.
    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text", placeholder="Enter your text here...")
        language_choice = gr.Dropdown(choices=LANGUAGES, label="Language", value="English")

    # Row 2: the button that triggers a prediction.
    with gr.Row():
        predict_button = gr.Button("Submit")

    # Row 3: per-sentence results table and a note naming the model used.
    with gr.Row():
        result_table = gr.Dataframe(
            headers=["Sentence", "Prediction", "Confidence"],
            column_widths=["50%", "35%", "15%"]
        )
        model_info = gr.Markdown()

    # Clicking the button passes the text and language to predict_wrapper;
    # its two return values fill the table and the model note, in that order.
    predict_button.click(
        fn=predict_wrapper,
        inputs=[input_text, language_choice],
        outputs=[result_table, model_info]
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()