abdullahmubeen10 committed
Commit 510d114 · verified · 1 Parent(s): 8b7b9fb

Upload 5 files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [theme]
+ base="light"
+ primaryColor="#29B4E8"
Demo.py ADDED
@@ -0,0 +1,161 @@
+ import streamlit as st
+ import sparknlp
+ import os
+ import pandas as pd
+
+ from sparknlp.base import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+ from sparknlp.pretrained import PretrainedPipeline
+ from annotated_text import annotated_text
+ from streamlit_tags import st_tags
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     initial_sidebar_state="auto"
+ )
+
+ # CSS for styling
+ st.markdown("""
+ <style>
+     .main-title {
+         font-size: 36px;
+         color: #4A90E2;
+         font-weight: bold;
+         text-align: center;
+     }
+     .section {
+         background-color: #f9f9f9;
+         padding: 10px;
+         border-radius: 10px;
+         margin-top: 10px;
+     }
+     .section p, .section ul {
+         color: #666666;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def init_spark():
+     return sparknlp.start()
+
+ @st.cache_resource
+ def create_pipeline(zeroShotLabels=['']):
+     document_assembler = DocumentAssembler() \
+         .setInputCol('text') \
+         .setOutputCol('document')
+
+     tokenizer = Tokenizer() \
+         .setInputCols(['document']) \
+         .setOutputCol('token')
+
+     zeroShotClassifier = XlmRoBertaForZeroShotClassification \
+         .pretrained('xlm_roberta_large_zero_shot_classifier_xnli_anli', 'xx') \
+         .setInputCols(['token', 'document']) \
+         .setOutputCol('class') \
+         .setCaseSensitive(False) \
+         .setMaxSentenceLength(512) \
+         .setCandidateLabels(zeroShotLabels)
+
+     pipeline = Pipeline(stages=[document_assembler, tokenizer, zeroShotClassifier])
+     return pipeline
+
+ def fit_data(pipeline, data):
+     empty_df = spark.createDataFrame([['']]).toDF('text')
+     pipeline_model = pipeline.fit(empty_df)
+     model = LightPipeline(pipeline_model)
+     result = model.fullAnnotate(data)
+     return result
+
+ # Helper for rendering NER-style chunk annotations (not used by this zero-shot demo)
+ def annotate(data):
+     document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
+     annotated_words = []
+     for chunk, label in zip(chunks, labels):
+         parts = document.split(chunk, 1)
+         if parts[0]:
+             annotated_words.append(parts[0])
+         annotated_words.append((chunk, label))
+         document = parts[1]
+     if document:
+         annotated_words.append(document)
+     annotated_text(*annotated_words)
+
+ tasks_models_descriptions = {
+     "Zero-Shot Classification": {
+         "models": ["xlm_roberta_large_zero_shot_classifier_xnli_anli"],
+         "description": "The 'xlm_roberta_large_zero_shot_classifier_xnli_anli' model provides flexible text classification without needing training data for specific categories. It is ideal for dynamic scenarios where text needs to be categorized into topics like urgent issues, technology, or sports without prior labeling."
+     }
+ }
+
+ # Sidebar content
+ task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
+ model = st.sidebar.selectbox("Choose the pretrained model", tasks_models_descriptions[task]["models"], help="For more info about the models visit: https://sparknlp.org/models")
+
+ # Reference notebook link in sidebar
+ link = """
+ <a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/357691d18373d6e8f13b5b1015137a398fd0a45f/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L103">
+     <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+ </a>
+ """
+ st.sidebar.markdown('Reference notebook:')
+ st.sidebar.markdown(link, unsafe_allow_html=True)
+
+ # Page content
+ title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
+ st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
+ container = st.container(border=True)
+ container.write(sub_title)
+
+ # Load examples
+ examples_mapping = {
+     "Zero-Shot Classification": [
+         "In today’s world, staying updated with urgent information is crucial as events can unfold rapidly and require immediate attention.",  # Urgent
+         "Mobile technology has become indispensable, allowing us to access news, updates, and connect with others no matter where we are.",  # Mobile
+         "For those who love to travel, the convenience of mobile apps has transformed how we plan and experience trips, providing real-time updates on flights, accommodations, and local attractions.",  # Travel
+         "The entertainment industry continually offers new movies that captivate audiences with their storytelling and visuals, providing a wide range of genres to suit every taste.",  # Movie
+         "Music is an integral part of modern life, with streaming platforms making it easy to discover new artists and enjoy favorite tunes anytime, anywhere.",  # Music
+         "Sports enthusiasts follow games and matches closely, with live updates and detailed statistics available at their fingertips, enhancing the excitement of every game.",  # Sport
+         "Weather forecasts play a vital role in daily planning, offering accurate and timely information to help us prepare for various weather conditions and adjust our plans accordingly.",  # Weather
+         "Technology continues to evolve rapidly, driving innovation across all sectors and improving our everyday lives through smarter devices, advanced software, and enhanced connectivity."  # Technology
+     ]
+ }
+
+ examples = examples_mapping[task]
+ selected_text = st.selectbox("Select an example", examples)
+ custom_input = st.text_input("Try it with your own sentence!")
+
+ if task == 'Zero-Shot Classification':
+     zeroShotLabels = ["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]
+     # st_tags returns the current tag list; assign it back so user-edited labels reach the pipeline
+     zeroShotLabels = st_tags(
+         label='Select labels',
+         text='Press enter to add more',
+         value=zeroShotLabels,
+         suggestions=[
+             "Positive", "Negative", "Neutral",
+             "Urgent", "Mobile", "Travel", "Movie", "Music", "Sport", "Weather", "Technology",
+             "Happiness", "Sadness", "Anger", "Fear", "Surprise", "Disgust",
+             "Informational", "Navigational", "Transactional", "Commercial Investigation",
+             "Politics", "Business", "Sports", "Entertainment", "Health", "Science",
+             "Product Quality", "Delivery Experience", "Customer Service", "Pricing", "Return Policy",
+             "Education", "Finance", "Lifestyle", "Fashion", "Food", "Art", "History",
+             "Culture", "Environment", "Real Estate", "Automotive", "Fitness", "Career"],
+         maxtags=-1)
+
+ try:
+     text_to_analyze = custom_input if custom_input else selected_text
+     st.subheader('Full example text')
+     HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
+     st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
+ except Exception:
+     text_to_analyze = selected_text
+
+ # Initialize Spark and create pipeline
+ spark = init_spark()
+ pipeline = create_pipeline(zeroShotLabels)
+ output = fit_data(pipeline, text_to_analyze)
+
+ # Display the prediction
+ st.subheader("Prediction:")
+ st.markdown(f"Document Classified as: **{output[0]['class'][0].result}**")
Dockerfile ADDED
@@ -0,0 +1,72 @@
+ # Download base image ubuntu 18.04
+ FROM ubuntu:18.04
+
+ # Set environment variables
+ ENV NB_USER jovyan
+ ENV NB_UID 1000
+ ENV HOME /home/${NB_USER}
+ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+
+ # Install required packages
+ RUN apt-get update && apt-get install -y \
+     tar \
+     wget \
+     bash \
+     rsync \
+     gcc \
+     libfreetype6-dev \
+     libhdf5-serial-dev \
+     libpng-dev \
+     libzmq3-dev \
+     python3 \
+     python3-dev \
+     python3-pip \
+     unzip \
+     pkg-config \
+     software-properties-common \
+     graphviz \
+     openjdk-8-jdk \
+     ant \
+     ca-certificates-java \
+     && apt-get clean \
+     && update-ca-certificates -f
+
+ # Install Python 3.8 and pip
+ RUN add-apt-repository ppa:deadsnakes/ppa \
+     && apt-get update \
+     && apt-get install -y python3.8 python3-pip \
+     && apt-get clean
+
+ # Set up JAVA_HOME
+ RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
+     && echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile
+
+ # Create a new user named "jovyan" with user ID 1000
+ RUN useradd -m -u ${NB_UID} ${NB_USER}
+
+ # Switch to the "jovyan" user
+ USER ${NB_USER}
+
+ # Set home and path variables for the user
+ ENV HOME=/home/${NB_USER} \
+     PATH=/home/${NB_USER}/.local/bin:$PATH
+
+ # Set up PySpark to use Python 3.8 for both driver and workers
+ ENV PYSPARK_PYTHON=/usr/bin/python3.8
+ ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8
+
+ # Set the working directory to the user's home directory
+ WORKDIR ${HOME}
+
+ # Upgrade pip and install Python dependencies
+ RUN python3.8 -m pip install --upgrade pip
+ COPY requirements.txt /tmp/requirements.txt
+ RUN python3.8 -m pip install -r /tmp/requirements.txt
+
+ # Copy the application code into the container at /home/jovyan
+ COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+
+ # Expose port for Streamlit
+ EXPOSE 7860
+
+ # Define the entry point for the container
+ ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
pages/Workflow & Model Overview.py ADDED
@@ -0,0 +1,189 @@
+ import streamlit as st
+
+ # Page configuration
+ st.set_page_config(
+     layout="wide",
+     initial_sidebar_state="auto"
+ )
+
+ # Custom CSS for better styling
+ st.markdown("""
+ <style>
+     .main-title {
+         font-size: 36px;
+         color: #4A90E2;
+         font-weight: bold;
+         text-align: center;
+     }
+     .sub-title {
+         font-size: 24px;
+         color: #4A90E2;
+         margin-top: 20px;
+     }
+     .section {
+         background-color: #f9f9f9;
+         padding: 15px;
+         border-radius: 10px;
+         margin-top: 20px;
+     }
+     .section h2 {
+         font-size: 22px;
+         color: #4A90E2;
+     }
+     .section p, .section ul {
+         color: #666666;
+     }
+     .link {
+         color: #4A90E2;
+         text-decoration: none;
+     }
+     .benchmark-table {
+         width: 100%;
+         border-collapse: collapse;
+         margin-top: 20px;
+     }
+     .benchmark-table th, .benchmark-table td {
+         border: 1px solid #ddd;
+         padding: 8px;
+         text-align: left;
+     }
+     .benchmark-table th {
+         background-color: #4A90E2;
+         color: white;
+     }
+     .benchmark-table td {
+         background-color: #f2f2f2;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Title
+ st.markdown('<div class="main-title">Introduction to XLM-RoBERTa Annotators in Spark NLP</div>', unsafe_allow_html=True)
+
+ # Subtitle
+ st.markdown("""
+ <div class="section">
+     <p>XLM-RoBERTa (Cross-lingual Robustly Optimized BERT Approach) is an advanced multilingual model that extends the capabilities of RoBERTa to over 100 languages. Pre-trained on a massive, diverse corpus, XLM-RoBERTa is designed to handle various NLP tasks in a multilingual context, making it ideal for applications that require cross-lingual understanding. Below, we provide an overview of using XLM-RoBERTa for zero-shot classification in Spark NLP:</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown("""<div class="sub-title">Zero-Shot Classification with XLM-RoBERTa</div>""", unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>Zero-shot classification is a technique that allows models to classify text into categories they have never seen during training. This is particularly useful when labeled training data is scarce or when new categories emerge frequently.</p>
+     <p><strong>XLM-RoBERTa</strong> is a multilingual model, making it highly effective for zero-shot classification across various languages. It leverages large-scale cross-lingual pretraining to understand and classify text in multiple languages without requiring language-specific annotated data.</p>
+     <p>Using XLM-RoBERTa for zero-shot classification enables:</p>
+     <ul>
+         <li><strong>Multilingual Understanding:</strong> Classify text across multiple languages without needing language-specific training data.</li>
+         <li><strong>Dynamic Classification:</strong> Adapt to new or emerging categories without retraining the model.</li>
+         <li><strong>Resource Efficiency:</strong> Bypass the need for extensive labeled datasets for each language or category.</li>
+     </ul>
+     <p>Advantages of using XLM-RoBERTa for zero-shot classification in Spark NLP include:</p>
+     <ul>
+         <li><strong>Scalability:</strong> Built on Apache Spark, the solution scales efficiently for processing large datasets.</li>
+         <li><strong>Flexibility:</strong> Easily adapt and integrate with existing Spark pipelines.</li>
+         <li><strong>Cross-Lingual Transfer:</strong> Benefit from XLM-RoBERTa’s cross-lingual transfer capabilities to classify text in various languages without additional fine-tuning.</li>
+         <li><strong>Pretrained Models:</strong> Leverage state-of-the-art pretrained models available in Spark NLP, reducing the need for custom training.</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown("""<div class="sub-title">How to Use XLM-RoBERTa for Zero-Shot Classification in Spark NLP</div>""", unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>Spark NLP offers a straightforward pipeline configuration for zero-shot classification. The following example classifies text into candidate labels the model never encountered during training; thanks to its multilingual pretraining, the same pipeline works across many languages, making it a versatile tool for global NLP applications.</p>
+ </div>""", unsafe_allow_html=True)
+ st.code('''
+ from sparknlp.base import *
+ from sparknlp.annotator import *
+ from pyspark.ml import Pipeline
+
+ document_assembler = DocumentAssembler() \\
+     .setInputCol('text') \\
+     .setOutputCol('document')
+
+ tokenizer = Tokenizer() \\
+     .setInputCols(['document']) \\
+     .setOutputCol('token')
+
+ zeroShotClassifier = XlmRoBertaForZeroShotClassification \\
+     .pretrained('xlm_roberta_large_zero_shot_classifier_xnli_anli', 'xx') \\
+     .setInputCols(['token', 'document']) \\
+     .setOutputCol('class') \\
+     .setCaseSensitive(False) \\
+     .setMaxSentenceLength(512) \\
+     .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"])
+
+ pipeline = Pipeline(stages=[
+     document_assembler,
+     tokenizer,
+     zeroShotClassifier
+ ])
+
+ example = spark.createDataFrame([['I have a problem with my iphone that needs to be resolved asap!!']]).toDF("text")
+ result = pipeline.fit(example).transform(example)
+ result.select("class.result").show(truncate=False)
+ ''', language='python')
+
+ st.text("""
+ +------------------+
+ |result            |
+ +------------------+
+ |["urgent"]        |
+ +------------------+
+ """)
+
+ # Example Output
+ st.markdown("""
+ <div class="section">
+     <p>This pipeline processes the input text and classifies it into one of the candidate labels provided. In the example above, the text is classified as "urgent" out of candidates such as "mobile", "travel", and "technology".</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Model Info Section
+ st.markdown('<div class="sub-title">Choosing the Right Model</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <p>The XLM-RoBERTa model used here is pretrained on large multilingual datasets and fine-tuned for zero-shot classification tasks. It is available in Spark NLP, providing robust performance across different languages without needing task-specific annotated data.</p>
+     <p>For more information about the model, visit the <a class="link" href="https://huggingface.co/xlm-roberta-large-zero-shot-classifier-xnli-anli" target="_blank">XLM-RoBERTa Model Hub</a>.</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # References Section
+ st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://arxiv.org/abs/1911.02116" target="_blank">XLM-R: Cross-lingual Pre-training</a></li>
+         <li><a class="link" href="https://arxiv.org/abs/2008.03415" target="_blank">Zero-Shot Learning with XLM-RoBERTa</a></li>
+         <li><a class="link" href="https://huggingface.co/xlm-roberta-large-zero-shot-classifier-xnli-anli" target="_blank">XLM-RoBERTa Zero-Shot Classifier</a></li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Footer
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+         <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+         <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+         <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+         <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<div class="sub-title">Quick Links</div>', unsafe_allow_html=True)
+
+ st.markdown("""
+ <div class="section">
+     <ul>
+         <li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started</a></li>
+         <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
+         <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
+         <li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
+     </ul>
+ </div>
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ st-annotated-text
+ streamlit-tags
+ pandas
+ numpy
+ spark-nlp
+ pyspark