Spaces:
Running
Running
File size: 3,484 Bytes
158b5a1 fb05782 158b5a1 fb05782 158b5a1 c1f3033 158b5a1 f61fe0b 158b5a1 fb05782 fb347fa fb05782 158b5a1 fb05782 56b1f1f fb05782 20f3f13 fb05782 158b5a1 fb05782 f47d5b8 f7c3109 cc6aa04 f7c3109 1830273 4ee3727 f7c3109 f516e56 f7c3109 fb05782 158b5a1 fb05782 158b5a1 fb05782 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
import torch
import spacy
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import gradio as gr
PATH = '/data/' # at least 150GB storage needs to be attached
os.environ['TRANSFORMERS_CACHE'] = PATH
os.environ['HF_HOME'] = PATH
os.environ['HF_DATASETS_CACHE'] = PATH
os.environ['TORCH_HOME'] = PATH
HF_TOKEN = os.environ["hf_read"]
SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
LANGUAGES = ["Czech", "English", "French", "German", "Hungarian", "Polish", "Slovakian"]
def load_spacy_model(model_name="xx_sent_ud_sm"):
try:
model = spacy.load(model_name)
except OSError:
spacy.cli.download(model_name)
model = spacy.load(model_name)
return model
def split_sentences(text, model):
# disable pipeline components not necessary for splitting
model.disable_pipes(model.pipe_names) # first disable all the pipes
model.enable_pipe("senter") # then enable the sentence splitter only
doc = model(text)
sentences = [sent.text for sent in doc.sents]
return sentences
def build_huggingface_path(language: str):
if language == "Czech" or language == "Slovakian":
return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
return "poltextlab/xlm-roberta-large-pooled-MORES"
def predict(text, model_id, tokenizer_id):
model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
inputs = tokenizer(text,
max_length=64,
truncation=True,
padding="do_not_pad",
return_tensors="pt")
model.eval()
with torch.no_grad():
logits = model(**inputs).logits
probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
label_pred = model.config.id2label[probs.argmax()]
probability_pred = f"{round(100*probs.max(), 2)}%"
return label_pred, probability_pred
def predict_wrapper(text, language):
model_id = build_huggingface_path(language)
tokenizer_id = "xlm-roberta-large"
spacy_model = load_spacy_model()
sentences = split_sentences(text, spacy_model)
results = []
for sentence in sentences:
label, probability = predict(sentence, model_id, tokenizer_id)
results.append([sentence, label, probability])
output_info = f'Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.'
return results, output_info
with gr.Blocks() as demo:
with gr.Row():
with gr.Row():
input_text = gr.Textbox(lines=6, label="Input Text", placeholder="Enter your text here...")
with gr.Row():
language_choice = gr.Dropdown(choices=LANGUAGES, label="Language", value="English")
predict_button = gr.Button("Submit")
with gr.Row():
result_table = gr.Dataframe(
headers=["Sentence", "Prediction", "Confidence"],
column_widths=["65%", "25%", "10%"],
wrap=True # important
)
with gr.Row():
model_info = gr.Markdown()
predict_button.click(
fn=predict_wrapper,
inputs=[input_text, language_choice],
outputs=[result_table, model_info]
)
if __name__ == "__main__":
demo.launch()
|