import streamlit as st
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from annotated_text import annotated_text
# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)
# CSS for styling
st.markdown("""
""", unsafe_allow_html=True)
@st.cache_resource
def init_spark():
    # Start a Spark NLP session once and reuse it across Streamlit reruns
    return sparknlp.start()
@st.cache_resource
def create_pipeline():
    # Wrap raw input text into Spark NLP document annotations
    document_assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    # Split each document into tokens for the transformer
    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    # Pretrained XLM-RoBERTa model fine-tuned on IMDb for sentiment classification
    sequence_classifier = XlmRoBertaForSequenceClassification.pretrained('xlm_roberta_base_sequence_classifier_imdb', 'en') \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    pipeline = Pipeline(stages=[document_assembler, tokenizer, sequence_classifier])
    return pipeline
def fit_data(pipeline, data):
    # Fit on an empty DataFrame: the pretrained stages need no training data
    spark = init_spark()
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    # LightPipeline runs inference on plain Python strings, avoiding Spark job overhead
    model = LightPipeline(pipeline_model)
    result = model.fullAnnotate(data)
    return result
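
# Illustrative output shape (a sketch; assumes the IMDb classifier emits
# 'pos'/'neg' labels — verify against the model card on sparknlp.org):
#   fit_data(create_pipeline(), "A wonderful film!")[0]['class'][0].result
#   -> 'pos'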
def annotate(data):
    # Interleave plain text and (chunk, label) tuples for annotated_text rendering
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        parts = document.split(chunk, 1)
        if len(parts) < 2:
            # Chunk not found in the remaining text; skip it rather than crash
            continue
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
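
# Illustrative usage (hypothetical chunk/label values, not from the model above):
#   annotate({"Document": "I loved Inception.",
#             "NER Chunk": ["Inception"],
#             "NER Label": ["MOVIE"]})
# renders the sentence with "Inception" highlighted and tagged MOVIE.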
tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlm_roberta_base_sequence_classifier_imdb"],
        "description": "The 'xlm_roberta_base_sequence_classifier_imdb' model is specialized for sentiment analysis of movie reviews. It classifies IMDb reviews as positive or negative, leveraging the multilingual capabilities of XLM-RoBERTa to analyze text content and sentiment across different languages."
    }
}
# Sidebar content
task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
model = st.sidebar.selectbox("Choose the pretrained model", tasks_models_descriptions[task]["models"], help="For more info about the models visit: https://sparknlp.org/models")
# Reference notebook link in sidebar
link = """
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
# Page content
title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
# Render the title and description (CSS class names assumed; the style block above is elided)
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
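
# --- Minimal sketch of the remaining app flow (the original file is truncated
# here; widget labels and the sample reviews below are illustrative). ---
spark = init_spark()
pipeline = create_pipeline()

examples = [
    "This movie was absolutely wonderful, with brilliant performances throughout.",
    "A dull, predictable plot and wooden acting made this a chore to sit through.",
]
selected_text = st.selectbox("Select an example review", examples)
custom_input = st.text_input("Or enter your own text:")
text_to_analyze = custom_input if custom_input else selected_text

st.subheader("Text to analyze")
st.write(text_to_analyze)

# Run the pipeline and show the predicted sentiment label
output = fit_data(pipeline, text_to_analyze)
st.subheader("Prediction")
st.write(output[0]['class'][0].result)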