Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- .streamlit/config.toml +3 -0
- Demo.py +109 -0
- Dockerfile +72 -0
- pages/Workflow & Model Overview.py +168 -0
- requirements.txt +7 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
base="light"
|
3 |
+
primaryColor="#29B4E8"
|
Demo.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Streamlit requires page configuration to be the first st.* call.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# Shared CSS for the demo page, injected once up front.
PAGE_CSS = """
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
"""
st.markdown(PAGE_CSS, unsafe_allow_html=True)
|
34 |
+
|
35 |
+
@st.cache_resource
def init_spark():
    """Start (or reuse) the Spark NLP session.

    Cached with st.cache_resource so the JVM/Spark session is created only
    once per Streamlit server process, not on every script rerun.
    """
    session = sparknlp.start()
    return session
|
38 |
+
|
39 |
+
@st.cache_resource
def create_pipeline(model_name="t5_grammar_error_corrector"):
    """Build the grammar-correction pipeline (DocumentAssembler -> T5).

    Args:
        model_name: name of the pretrained T5 model to load. Previously this
            was hard-coded even though the sidebar offers a model selectbox;
            the default preserves the original behavior for existing callers.

    Returns:
        An unfitted pyspark.ml Pipeline whose output column is "corrections".
    """
    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("documents")

    # "gec:" is the task prefix this T5 checkpoint was fine-tuned on.
    t5 = T5Transformer.pretrained(model_name) \
        .setTask("gec:") \
        .setInputCols(["documents"]) \
        .setMaxOutputLength(200) \
        .setOutputCol("corrections")

    pipeline = Pipeline().setStages([documentAssembler, t5])
    return pipeline
|
53 |
+
|
54 |
+
def fit_data(pipeline, data):
    """Apply the pipeline to a single input string and collect the results.

    NOTE(review): relies on the module-level `spark` session created near the
    bottom of this script — assumes init_spark() has already run by the time
    this is called; confirm if reordering the script.
    """
    input_df = spark.createDataFrame([[data]]).toDF("text")
    predictions = pipeline.fit(input_df).transform(input_df)
    return predictions.select('corrections.result').collect()
|
58 |
+
|
59 |
+
# Sidebar: model selection (currently a single pretrained checkpoint).
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ['t5_grammar_error_corrector'],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Page header.
title = "Correct Sentences Grammar"
sub_title = "This demo uses a text-to-text model fine-tuned to correct grammatical errors when the task is set to “gec:”. It is based on Prithiviraj Damodaran’s Gramformer model."

st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div style="text-align: center; color: #666666;">{sub_title}</div>', unsafe_allow_html=True)

# Reference notebook link in sidebar.
link = """
<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/T5_LINGUISTIC.ipynb#scrollTo=QAZ3vOX_SW7B">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Grammatically incorrect example sentences.
examples = [
    "She don't knows nothing about what's happening in the office.",
    "They was playing soccer yesterday when it start raining heavily.",
    "This car are more faster than that one, but it costed less money.",
    "I seen him go to the store, but he don't buy nothing from there.",
    "We was going to the park but it start raining before we could leave."
]

# Text selection and analysis: a custom sentence, when provided, takes
# precedence over the selected example.
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

text_to_analyze = custom_input if custom_input else selected_text

# BUG FIX: this label previously read "Text to be converted to SQL query:",
# a copy-paste leftover from a different demo — this app corrects grammar.
st.write('Text to be corrected:')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)

# Initialize Spark, build the pipeline, and run the correction.
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)

# Display the corrected sentence (first row, first column of the collected result).
st.write("Predicted Sentence:")
output_text = "".join(output[0][0])
st.markdown(f'<div class="scroll">{output_text}</div>', unsafe_allow_html=True)
|
Dockerfile
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

# Install required packages (Java 8 for Spark NLP, build deps for Python wheels)
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f

# Install Python 3.8 and pip from the deadsnakes PPA.
# BUG FIX: python3.8-distutils is required for `python3.8 -m pip` to work on
# Ubuntu 18.04 — deadsnakes Python builds do not ship distutils by default,
# so pip invocations below would otherwise fail. Also drop apt lists to keep
# the image smaller.
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3.8-distutils python3-pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set up JAVA_HOME for login shells
RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
    && echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile
# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user
USER ${NB_USER}

# Set home and path variables for the user
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set up PySpark to use Python 3.8 for both driver and workers
ENV PYSPARK_PYTHON=/usr/bin/python3.8
ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

# Custom CSS shared by every section of this overview page.
PAGE_STYLE = """
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .sub-title {
            font-size: 24px;
            color: #4A90E2;
            margin-top: 20px;
        }
        .section {
            background-color: #f9f9f9;
            padding: 15px;
            border-radius: 10px;
            margin-top: 20px;
        }
        .section h2 {
            font-size: 22px;
            color: #4A90E2;
        }
        .section p, .section ul {
            color: #666666;
        }
        .link {
            color: #4A90E2;
            text-decoration: none;
        }
    </style>
"""
st.markdown(PAGE_STYLE, unsafe_allow_html=True)
|
36 |
+
|
37 |
+
# Title
|
38 |
+
st.markdown('<div class="main-title">Correct Sentences Grammar</div>', unsafe_allow_html=True)
|
39 |
+
|
40 |
+
# Introduction Section
|
41 |
+
st.markdown("""
|
42 |
+
<div class="section">
|
43 |
+
<p>Ensuring correct grammar in sentences is essential for clear and effective communication. Whether writing an email, an academic paper, or a casual message, proper grammar ensures that your message is understood as intended.</p>
|
44 |
+
<p>This page demonstrates how to implement a grammar correction pipeline using advanced NLP models. We utilize the T5 Transformer model, fine-tuned for grammar error correction, to automatically correct sentences and enhance their grammatical accuracy.</p>
|
45 |
+
</div>
|
46 |
+
""", unsafe_allow_html=True)
|
47 |
+
|
48 |
+
# T5 Transformer Overview
|
49 |
+
st.markdown('<div class="sub-title">Understanding the T5 Transformer for Grammar Correction</div>', unsafe_allow_html=True)
|
50 |
+
|
51 |
+
st.markdown("""
|
52 |
+
<div class="section">
|
53 |
+
<p>The T5 (Text-To-Text Transfer Transformer) model, developed by Google, is a versatile tool for various NLP tasks, including grammar correction. By processing input sentences and applying the appropriate grammar corrections, T5 generates outputs that maintain the original meaning while correcting errors.</p>
|
54 |
+
<p>This is particularly useful for applications in writing assistance, automated editing, and educational tools, where grammatical accuracy is crucial.</p>
|
55 |
+
</div>
|
56 |
+
""", unsafe_allow_html=True)
|
57 |
+
|
58 |
+
# Performance Section
|
59 |
+
st.markdown('<div class="sub-title">Performance and Use Cases</div>', unsafe_allow_html=True)
|
60 |
+
|
61 |
+
st.markdown("""
|
62 |
+
<div class="section">
|
63 |
+
<p>The T5 model has shown strong performance in grammar correction tasks. It consistently produces accurate and contextually appropriate corrections, making it a valuable tool for improving written communication across various settings.</p>
|
64 |
+
<p>This capability is beneficial for students, professionals, and anyone who needs to ensure their writing is grammatically correct. The T5 model’s efficiency in correcting errors makes it a powerful asset for enhancing the quality of written content.</p>
|
65 |
+
</div>
|
66 |
+
""", unsafe_allow_html=True)
|
67 |
+
|
68 |
+
# Implementation Section
|
69 |
+
st.markdown('<div class="sub-title">Implementing Grammar Correction</div>', unsafe_allow_html=True)
|
70 |
+
|
71 |
+
st.markdown("""
|
72 |
+
<div class="section">
|
73 |
+
<p>The following example demonstrates how to implement a grammar correction pipeline using Spark NLP. The pipeline includes a document assembler and the T5 model for performing grammar corrections.</p>
|
74 |
+
</div>
|
75 |
+
""", unsafe_allow_html=True)
|
76 |
+
|
77 |
+
# Implementation example, rendered as a (non-executed) code snippet.
st.code('''
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Initialize Spark NLP
spark = sparknlp.start()

# Define the pipeline stages
documentAssembler = DocumentAssembler()\\
    .setInputCol("text")\\
    .setOutputCol("documents")

t5 = T5Transformer.pretrained("t5_grammar_error_corrector")\\
    .setTask("gec:")\\
    .setInputCols(["documents"])\\
    .setMaxOutputLength(200)\\
    .setOutputCol("corrections")

pipeline = Pipeline().setStages([documentAssembler, t5])

# Input data example
data = spark.createDataFrame([["She don't knows nothing about what's happening in the office."]]).toDF("text")

# Apply the pipeline for grammar correction
result = pipeline.fit(data).transform(data)
result.select("corrections.result").show(truncate=False)
''', language='python')

# Example Output.
# BUG FIX: the ASCII table was malformed — the +---+ borders were narrower
# than the content row, and the header showed "corrections.result" where
# Spark's .show() prints the column name "result". Rebuilt so every row is
# the same width as the longest cell.
st.text("""
+---------------------------------------------------------------+
|result                                                         |
+---------------------------------------------------------------+
|[She doesn't know anything about what's happening in the office.]|
+---------------------------------------------------------------+
""")
|
115 |
+
|
116 |
+
# Model Info Section
|
117 |
+
st.markdown('<div class="sub-title">Choosing the Right T5 Model for Grammar Correction</div>', unsafe_allow_html=True)
|
118 |
+
|
119 |
+
st.markdown("""
|
120 |
+
<div class="section">
|
121 |
+
<p>For correcting grammar errors, we use the model: "t5_grammar_error_corrector". This model is fine-tuned to detect and correct various types of grammatical errors in English sentences.</p>
|
122 |
+
<p>Explore other T5 models tailored for different NLP tasks on the <a class="link" href="https://sparknlp.org/models?annotator=T5Transformer" target="_blank">Spark NLP Models Hub</a> to find the best fit for your specific needs.</p>
|
123 |
+
</div>
|
124 |
+
""", unsafe_allow_html=True)
|
125 |
+
|
126 |
+
# References Section
|
127 |
+
st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
|
128 |
+
|
129 |
+
st.markdown("""
|
130 |
+
<div class="section">
|
131 |
+
<ul>
|
132 |
+
<li><a class="link" href="https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html" target="_blank">Google AI Blog</a>: Exploring Transfer Learning with T5</li>
|
133 |
+
<li><a class="link" href="https://sparknlp.org/models?annotator=T5Transformer" target="_blank">Spark NLP Model Hub</a>: Explore T5 models</li>
|
134 |
+
<li>Model used for Grammar Correction: <a class="link" href="https://sparknlp.org/2022/11/28/t5_grammar_error_corrector_en.html" target="_blank">t5_grammar_error_corrector</a></li>
|
135 |
+
<li><a class="link" href="https://github.com/google-research/text-to-text-transfer-transformer" target="_blank">GitHub</a>: T5 Transformer repository</li>
|
136 |
+
<li><a class="link" href="https://arxiv.org/abs/1910.10683" target="_blank">T5 Paper</a>: Detailed insights from the developers</li>
|
137 |
+
</ul>
|
138 |
+
</div>
|
139 |
+
""", unsafe_allow_html=True)
|
140 |
+
|
141 |
+
# Community & Support Section
|
142 |
+
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
|
143 |
+
|
144 |
+
st.markdown("""
|
145 |
+
<div class="section">
|
146 |
+
<ul>
|
147 |
+
<li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
|
148 |
+
<li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
|
149 |
+
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
|
150 |
+
<li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
|
151 |
+
<li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
|
152 |
+
</ul>
|
153 |
+
</div>
|
154 |
+
""", unsafe_allow_html=True)
|
155 |
+
|
156 |
+
# Quick Links Section
|
157 |
+
st.markdown('<div class="sub-title">Quick Links</div>', unsafe_allow_html=True)
|
158 |
+
|
159 |
+
st.markdown("""
|
160 |
+
<div class="section">
|
161 |
+
<ul>
|
162 |
+
<li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started</a></li>
|
163 |
+
<li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
|
164 |
+
<li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
|
165 |
+
<li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
|
166 |
+
</ul>
|
167 |
+
</div>
|
168 |
+
""", unsafe_allow_html=True)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
st-annotated-text
|
3 |
+
streamlit-tags
|
4 |
+
pandas
|
5 |
+
numpy
|
6 |
+
spark-nlp
|
7 |
+
pyspark
|