import os
from functools import lru_cache

# Cache locations must be configured before torch/transformers are imported,
# since the libraries resolve their default cache paths at import time.
PATH = '/data/'  # at least 150 GB of storage needs to be attached
os.environ['TRANSFORMERS_CACHE'] = PATH
os.environ['HF_HOME'] = PATH
os.environ['HF_DATASETS_CACHE'] = PATH
os.environ['TORCH_HOME'] = PATH

import torch
import spacy
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr

# Hub read token, provided as a Space secret named "hf_read".
HF_TOKEN = os.environ["hf_read"]

# Display names for the three sentiment classes; the demo itself reads labels
# from each model's id2label config, so this mapping is kept for reference.
SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
LANGUAGES = ["Czech", "English", "French", "German", "Hungarian", "Polish", "Slovakian"]

def load_spacy_model(model_name="xx_sent_ud_sm"):
    # Load the multilingual sentence segmenter, downloading it on first use
    # if it is not installed in the environment.
    try:
        model = spacy.load(model_name)
    except OSError:
        spacy.cli.download(model_name)
        model = spacy.load(model_name)
    return model

def split_sentences(text, model):
    # keep only the sentence segmenter; the remaining pipeline components
    # are not needed for splitting and would slow processing down
    model.select_pipes(enable=["senter"])

    doc = model(text)
    sentences = [sent.text for sent in doc.sents]

    return sentences

def build_huggingface_path(language: str):
    # Czech and Slovak texts use the dedicated Visegrad model; the other
    # supported languages share the pooled multilingual model.
    if language in ("Czech", "Slovakian"):
        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
    return "poltextlab/xlm-roberta-large-pooled-MORES"

@lru_cache(maxsize=2)
def load_model_and_tokenizer(model_id, tokenizer_id):
    # Loading xlm-roberta-large takes several seconds, so cache the model and
    # tokenizer instead of re-reading them from disk for every sentence.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id, low_cpu_mem_usage=True, device_map="auto",
        offload_folder="offload", token=HF_TOKEN)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    model.eval()
    return model, tokenizer

def predict(text, model_id, tokenizer_id):
    model, tokenizer = load_model_and_tokenizer(model_id, tokenizer_id)

    inputs = tokenizer(text,
                       max_length=64,  # sentences longer than 64 tokens are truncated
                       truncation=True,
                       padding="do_not_pad",
                       return_tensors="pt")

    # device_map="auto" lets accelerate place the weights; its hooks also
    # move the inputs to the matching device.
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    label_pred = model.config.id2label[int(probs.argmax())]
    probability_pred = f"{round(100 * probs.max(), 2)}%"
    return label_pred, probability_pred
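
# predict() returns a (label, confidence) pair of strings, for example
# ("Positive", "97.42%"); the values shown here are illustrative only.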


def predict_wrapper(text, language):
    model_id = build_huggingface_path(language)
    tokenizer_id = "xlm-roberta-large"

    spacy_model = load_spacy_model()
    sentences = split_sentences(text, spacy_model)

    results = []
    for sentence in sentences:
        label, probability = predict(sentence, model_id, tokenizer_id)
        results.append([sentence, label, probability])

    output_info = f'Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.'
    return results, output_info

with gr.Blocks() as demo:
    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text", placeholder="Enter your text here...")
        language_choice = gr.Dropdown(choices=LANGUAGES, label="Language", value="English")
        predict_button = gr.Button("Submit")

    # Results render below the input row.
    result_table = gr.Dataframe(
        headers=["Sentence", "Prediction", "Confidence"],
        column_widths=["50%", "35%", "15%"]
    )
    model_info = gr.Markdown()

    predict_button.click(
        fn=predict_wrapper,
        inputs=[input_text, language_choice],
        outputs=[result_table, model_info]
    )

if __name__ == "__main__":
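    # demo.queue() could be chained before launch() to queue concurrent
    # requests; whether that is needed depends on expected traffic.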
    demo.launch()