import os
import torch
import spacy
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr
PATH = '/data/'  # at least 150 GB of persistent storage needs to be attached
# route all model/dataset caches to the persistent volume
os.environ['TRANSFORMERS_CACHE'] = PATH
os.environ['HF_HOME'] = PATH
os.environ['HF_DATASETS_CACHE'] = PATH
os.environ['TORCH_HOME'] = PATH
HF_TOKEN = os.environ["hf_read"]  # read-access Hugging Face token (e.g. a Space secret)
SENTIMENT_LABEL_NAMES = {0: "Negative", 1: "No sentiment or Neutral sentiment", 2: "Positive"}
LANGUAGES = ["Czech", "English", "French", "German", "Hungarian", "Polish", "Slovakian"]
def load_spacy_model(model_name="xx_sent_ud_sm"):
    # load spaCy's multilingual sentence-segmentation model,
    # downloading it on first use if it is not installed yet
    try:
        model = spacy.load(model_name)
    except OSError:
        spacy.cli.download(model_name)
        model = spacy.load(model_name)
    return model
def split_sentences(text, model):
    # disable pipeline components not necessary for splitting
    model.disable_pipes(model.pipe_names)  # first disable all the pipes
    model.enable_pipe("senter")  # then enable the sentence splitter only
    doc = model(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences
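
# Example usage (hypothetical input, assuming xx_sent_ud_sm is available):
#   nlp = load_spacy_model()
#   split_sentences("First sentence. Second one.", nlp)
#   -> ["First sentence.", "Second one."]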
def build_huggingface_path(language: str):
    # Czech and Slovakian texts are routed to the Visegrad emotion model;
    # every other supported language uses the multilingual MORES model
    if language in ("Czech", "Slovakian"):
        return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
    return "poltextlab/xlm-roberta-large-pooled-MORES"
def predict(text, model_id, tokenizer_id):
    # note: the model is re-loaded from the cache on every call,
    # i.e. once per sentence in predict_wrapper
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        device_map="auto",
        offload_folder="offload",
        token=HF_TOKEN,
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    inputs = tokenizer(text,
                       max_length=64,
                       truncation=True,
                       padding="do_not_pad",
                       return_tensors="pt")
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    label_pred = model.config.id2label[int(probs.argmax())]
    probability_pred = f"{round(100 * probs.max(), 2)}%"
    return label_pred, probability_pred
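
# The per-call reload above is slow when classifying many sentences. A minimal
# caching sketch (an assumption, not part of the original app) would memoize
# the loader so each model is read from disk only once per process:
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=2)
#   def load_model(model_id):  # hypothetical helper
#       return AutoModelForSequenceClassification.from_pretrained(
#           model_id, low_cpu_mem_usage=True, device_map="auto",
#           offload_folder="offload", token=HF_TOKEN)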
def predict_wrapper(text, language):
    model_id = build_huggingface_path(language)
    tokenizer_id = "xlm-roberta-large"

    # split the input into sentences and classify each one separately
    spacy_model = load_spacy_model()
    sentences = split_sentences(text, spacy_model)

    results = []
    for sentence in sentences:
        label, probability = predict(sentence, model_id, tokenizer_id)
        results.append([sentence, label, probability])

    output_info = f'Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.'
    return results, output_info
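
# predict_wrapper returns rows shaped for the gr.Dataframe below, e.g.
# (hypothetical output; actual labels come from each model's id2label):
#   [["The new law is excellent.", "Positive", "97.12%"], ...]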
with gr.Blocks() as demo:
    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text", placeholder="Enter your text here...")
        language_choice = gr.Dropdown(choices=LANGUAGES, label="Language", value="English")

    predict_button = gr.Button("Submit")

    result_table = gr.Dataframe(
        headers=["Sentence", "Prediction", "Confidence"],
        column_widths=["50%", "35%", "15%"]
    )
    model_info = gr.Markdown()

    predict_button.click(
        fn=predict_wrapper,
        inputs=[input_text, language_choice],
        outputs=[result_table, model_info]
    )
if __name__ == "__main__":
    demo.launch()
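
# Running `python app.py` starts the Gradio server (http://127.0.0.1:7860 by
# default); a Gradio-SDK Hugging Face Space uses the same entry point.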