abdullahmubeen10 committed on
Commit
cee413d
·
verified ·
1 Parent(s): 562e65e

Upload 5 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+
4
+ from sparknlp.base import *
5
+ from sparknlp.annotator import *
6
+ from pyspark.ml import Pipeline
7
+ from annotated_text import annotated_text
8
+
9
# --- Streamlit page configuration -------------------------------------------
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Page-wide CSS used by the markup rendered further down (title banner and
# the grey "section" cards).
_PAGE_CSS = """
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .section {
        background-color: #f9f9f9;
        padding: 10px;
        border-radius: 10px;
        margin-top: 10px;
    }
    .section p, .section ul {
        color: #666666;
    }
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
35
+
36
@st.cache_resource
def init_spark():
    """Start (or attach to) the Spark NLP session.

    Cached by Streamlit so the JVM/Spark session is created only once
    per server process, not on every rerun of the script.
    """
    return sparknlp.start()
39
+
40
@st.cache_resource
def create_pipeline():
    """Build the document -> token -> XLM-RoBERTa classifier pipeline.

    Cached so the pretrained model is downloaded/loaded only once per
    server process. Returns an unfitted pyspark ``Pipeline``.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol('text')
        .setOutputCol('document')
    )

    word_splitter = (
        Tokenizer()
        .setInputCols(['document'])
        .setOutputCol('token')
    )

    classifier = (
        XlmRoBertaForSequenceClassification
        .pretrained("xlmroberta_classifier_base_mrpc", "en")
        .setInputCols(["document", "token"])
        .setOutputCol("class")
    )

    return Pipeline(stages=[assembler, word_splitter, classifier])
56
+
57
def fit_data(pipeline, data):
    """Fit `pipeline` on an empty frame and annotate `data` with it.

    NOTE(review): reads the module-level ``spark`` session created near the
    bottom of this script; it exists by the time this is called, but the
    implicit global is worth knowing about.
    Returns the list produced by ``LightPipeline.fullAnnotate``.
    """
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    fitted = pipeline.fit(placeholder_df)
    light_model = LightPipeline(fitted)
    return light_model.fullAnnotate(data)
63
+
64
+ def annotate(data):
65
+ document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
66
+ annotated_words = []
67
+ for chunk, label in zip(chunks, labels):
68
+ parts = document.split(chunk, 1)
69
+ if parts[0]:
70
+ annotated_words.append(parts[0])
71
+ annotated_words.append((chunk, label))
72
+ document = parts[1]
73
+ if document:
74
+ annotated_words.append(document)
75
+ annotated_text(*annotated_words)
76
+
77
# Task -> model list + blurb shown in the sidebar and page header.
tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlmroberta_classifier_base_mrpc"],
        "description": "The 'xlmroberta_classifier_base_mrpc' model is proficient in sequence classification tasks, such as sentiment analysis and document categorization. It effectively determines the sentiment of reviews, classifies text, and sorts documents based on their content and context."
    }
}

# Sidebar content
task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
model = st.sidebar.selectbox("Choose the pretrained model", tasks_models_descriptions[task]["models"], help="For more info about the models visit: https://sparknlp.org/models")

# Reference notebook link in sidebar
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/357691d18373d6e8f13b5b1015137a398fd0a45f/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L103">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Page content
# BUG FIX: the title previously said "DeBERTa for {task}" although every
# model on this page (and the companion overview page) is XLM-RoBERTa.
title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
container = st.container(border=True)
container.write(sub_title)

# Example inputs offered per task.
examples_mapping = {
    "Sequence Classification": [
        "This movie was absolutely fantastic! The storyline was gripping, the characters were well-developed, and the cinematography was stunning. I was on the edge of my seat the entire time.",
        "A heartwarming and beautiful film. The performances were top-notch, and the direction was flawless. This is easily one of the best movies I've seen this year.",
        "What a delightful surprise! The humor was spot on, and the plot was refreshingly original. The cast did an amazing job bringing the characters to life. Highly recommended!",
        "This was one of the worst movies I’ve ever seen. The plot was predictable, the acting was wooden, and the pacing was painfully slow. I couldn’t wait for it to end.",
        "A complete waste of time. The movie lacked any real substance or direction, and the dialogue was cringe-worthy. I wouldn’t recommend this to anyone.",
        "I had high hopes for this film, but it turned out to be a huge disappointment. The story was disjointed, and the special effects were laughably bad. Don’t bother watching this one.",
        "The movie was okay, but nothing special. It had a few good moments, but overall, it felt pretty average. Not something I would watch again, but it wasn’t terrible either.",
        "An average film with a decent plot. The acting was passable, but it didn't leave much of an impression on me. It's a movie you might watch once and forget about.",
        "This movie was neither good nor bad, just kind of there. It had some interesting ideas, but they weren’t executed very well. It’s a film you could take or leave."
    ]
}

examples = examples_mapping[task]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")

try:
    # Prefer the user's own sentence; fall back to the selected example.
    text_to_analyze = custom_input if custom_input else selected_text
    st.subheader('Full example text')
    HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
    st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
except Exception:
    # FIX: was a bare `except:` (also caught SystemExit/KeyboardInterrupt).
    # Best-effort fallback: if rendering the preview fails, still analyze
    # the selected example.
    text_to_analyze = selected_text

# Initialize Spark and create pipeline, then annotate the chosen text.
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)

# Display the predicted class label.
st.subheader("Prediction:")
st.markdown(f"Classified as : **{output[0]['class'][0].result}**")
Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: Ubuntu 18.04
FROM ubuntu:18.04

# Runtime user and Java location used by Spark.
ENV NB_USER=jovyan
ENV NB_UID=1000
ENV HOME=/home/${NB_USER}
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/

# System packages: build/runtime libs, Python 3 toolchain, and OpenJDK 8
# (required by Spark).
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f

# Python 3.8 from the deadsnakes PPA (18.04 ships 3.6 by default).
# NOTE(review): `python3.8 -m pip` on 18.04 typically also needs
# python3.8-distutils from the same PPA — confirm the image builds.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean

# Make JAVA_HOME visible to login shells.
RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
    && echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile

# Create the unprivileged "jovyan" user (UID 1000) and switch to it.
RUN useradd -m -u ${NB_UID} ${NB_USER}
USER ${NB_USER}

# User-local bin on PATH so pip-installed console scripts resolve.
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Point PySpark (driver and workers) at Python 3.8.
ENV PYSPARK_PYTHON=/usr/bin/python3.8
ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8

WORKDIR ${HOME}

# Python dependencies.
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Application code, owned by the runtime user.
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Streamlit port.
EXPOSE 7860

# Launch the Streamlit app.
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

# --- Page configuration ------------------------------------------------------
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)


def md(html):
    """Render an HTML fragment with unsafe_allow_html enabled (page shorthand)."""
    st.markdown(html, unsafe_allow_html=True)


# Page-wide CSS: title banner, sub-titles, grey section cards, link color,
# and the (currently unused) benchmark table styling.
md("""
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .sub-title {
        font-size: 24px;
        color: #4A90E2;
        margin-top: 20px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 10px;
        margin-top: 20px;
    }
    .section h2 {
        font-size: 22px;
        color: #4A90E2;
    }
    .section p, .section ul {
        color: #666666;
    }
    .link {
        color: #4A90E2;
        text-decoration: none;
    }
    .benchmark-table {
        width: 100%;
        border-collapse: collapse;
        margin-top: 20px;
    }
    .benchmark-table th, .benchmark-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .benchmark-table th {
        background-color: #4A90E2;
        color: white;
    }
    .benchmark-table td {
        background-color: #f2f2f2;
    }
</style>
""")

# --- Title and introduction --------------------------------------------------
md('<div class="main-title">Introduction to XLM-RoBERTa Annotators in Spark NLP</div>')

md("""
<div class="section">
<p>XLM-RoBERTa (Cross-lingual Robustly Optimized BERT Approach) is an advanced multilingual model that extends the capabilities of RoBERTa to over 100 languages. Pre-trained on a massive, diverse corpus, XLM-RoBERTa is designed to handle various NLP tasks in a multilingual context, making it ideal for applications that require cross-lingual understanding. Below, we provide an overview of the XLM-RoBERTa annotators for these tasks:</p>
</div>
""")

# --- Sequence classification overview ----------------------------------------
md("""<div class="sub-title">Sequence Classification with XLM-RoBERTa</div>""")
md("""
<div class="section">
<p>Sequence classification is a common task in Natural Language Processing (NLP) where the goal is to assign a label to a sequence of text, such as sentiment analysis, spam detection, or paraphrase identification.</p>
<p><strong>XLM-RoBERTa</strong> excels at sequence classification across multiple languages, making it a powerful tool for global applications. Below is an example of how to implement sequence classification using XLM-RoBERTa in Spark NLP.</p>
<p>Using XLM-RoBERTa for Sequence Classification enables:</p>
<ul>
<li><strong>Multilingual Text Classification:</strong> Classify sequences of text in multiple languages with a single model.</li>
<li><strong>Broad Application:</strong> Apply to tasks such as sentiment analysis, spam detection, and paraphrase identification across languages.</li>
<li><strong>Transfer Learning:</strong> Utilize pretrained XLM-RoBERTa models to leverage knowledge from extensive cross-lingual datasets.</li>
</ul>
<p>Advantages of using XLM-RoBERTa for Sequence Classification in Spark NLP include:</p>
<ul>
<li><strong>Scalability:</strong> Spark NLP is built on Apache Spark, ensuring it scales efficiently for large datasets.</li>
<li><strong>Pretrained Excellence:</strong> Leverage state-of-the-art pretrained models to achieve high accuracy in text classification tasks.</li>
<li><strong>Multilingual Flexibility:</strong> XLM-RoBERTa’s multilingual capabilities make it suitable for global applications, reducing the need for language-specific models.</li>
<li><strong>Seamless Integration:</strong> Easily incorporate XLM-RoBERTa into your existing Spark pipelines for streamlined NLP workflows.</li>
</ul>
</div>
""")

# --- Usage example -----------------------------------------------------------
md("""<div class="sub-title">How to Use XLM-RoBERTa for Sequence Classification in Spark NLP</div>""")
md("""
<div class="section">
<p>To leverage XLM-RoBERTa for sequence classification, Spark NLP provides an intuitive pipeline setup. The following example shows how to use XLM-RoBERTa for sequence classification tasks such as sentiment analysis, paraphrase detection, or categorizing text sequences into predefined classes. XLM-RoBERTa’s multilingual training enables it to perform sequence classification across various languages, making it a powerful tool for global NLP tasks.</p>
</div>
""")

# Code example (display only — not executed on this page).
st.code('''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \\
    .setInputCol("text") \\
    .setOutputCol("document")

tokenizer = Tokenizer() \\
    .setInputCols("document") \\
    .setOutputCol("token")

seq_classifier = XlmRoBertaForSequenceClassification.pretrained("xlmroberta_classifier_base_mrpc","en") \\
    .setInputCols(["document", "token"]) \\
    .setOutputCol("class")

pipeline = Pipeline(stages=[documentAssembler, tokenizer, seq_classifier])

data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text")

result = pipeline.fit(data).transform(data)
result.select("class.result").show(truncate=False)
''', language='python')

# Sample output of the example above.
st.text("""
+-------+
|result |
+-------+
|[True] |
+-------+
""")

# --- Model info --------------------------------------------------------------
md('<div class="sub-title">Choosing the Right Model</div>')
md("""
<div class="section">
<p>The XLM-RoBERTa model used here is pretrained and fine-tuned for sequence classification tasks such as paraphrase detection. It is available in Spark NLP, providing high accuracy and multilingual support.</p>
<p>For more information about the model, visit the <a class="link" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa Model Hub</a>.</p>
</div>
""")

# --- References --------------------------------------------------------------
md('<div class="sub-title">References</div>')
md("""
<div class="section">
<ul>
<li><a class="link" href="https://arxiv.org/abs/1911.02116" target="_blank">XLM-R: Cross-lingual Pre-training</a></li>
<li><a class="link" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa Model Overview</a></li>
</ul>
</div>
""")

# --- Footer / community links ------------------------------------------------
md("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
<li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
<li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
<li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
</ul>
</div>
""")

md('<div class="sub-title">Quick Links</div>')

md("""
<div class="section">
<ul>
<li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started</a></li>
<li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
<li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
</ul>
</div>
""")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ st-annotated-text
3
+ streamlit-tags
4
+ pandas
5
+ numpy
6
+ spark-nlp
7
+ pyspark