import streamlit as st
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from annotated_text import annotated_text
# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)
@st.cache_resource
def init_spark():
    """Start a Spark NLP session (cached across Streamlit reruns)."""
    return sparknlp.start()
@st.cache_resource
def create_pipeline():
    """Build the Spark NLP pipeline: document assembly, tokenization, sequence classification."""
    document_assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    sequence_classifier = XlmRoBertaForSequenceClassification.pretrained("xlmroberta_classifier_base_mrpc", "en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    pipeline = Pipeline(stages=[document_assembler, tokenizer, sequence_classifier])
    return pipeline
def fit_data(pipeline, data):
    """Fit the pipeline on an empty frame, then annotate `data` with a LightPipeline."""
    spark = init_spark()  # cached Spark session
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    result = model.fullAnnotate(data)
    return result
def annotate(data):
    """Render the document with each recognized chunk highlighted via annotated_text."""
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        # Split off the text before the chunk, emit it plain, then emit the
        # (chunk, label) pair as a highlighted annotation.
        parts = document.split(chunk, 1)
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)
tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlmroberta_classifier_base_mrpc"],
        "description": "The 'xlmroberta_classifier_base_mrpc' model is an XLM-RoBERTa base model fine-tuned on MRPC (Microsoft Research Paraphrase Corpus). Given a pair of sentences, it classifies whether they are semantically equivalent, making it well suited to sequence classification tasks such as paraphrase and equivalence detection."
    }
}
# Sidebar content
task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
model = st.sidebar.selectbox("Choose the pretrained model", tasks_models_descriptions[task]["models"], help="For more info about the models visit: https://sparknlp.org/models")
# Page content
title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
st.title(title)
st.markdown(sub_title)
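
# Main app flow: a minimal sketch of the input/inference/output wiring.
# The example texts and the result handling below are illustrative
# assumptions, not part of the original script.
examples = [
    "He said the food was good. She said the food was delicious.",
    "The company reported record profits this quarter. It rained heavily all weekend."
]

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it yourself!")
text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Text to analyze')
st.write(text_to_analyze)

spark = init_spark()          # ensure the Spark session exists before the pretrained model loads
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)

# fullAnnotate returns one dict per input, keyed by output column name;
# the predicted label is the `result` of the first 'class' annotation.
st.subheader('Prediction')
st.write(output[0]['class'][0].result)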