Nguyen Thi Dieu Hien
committed
Add files via upload
app.py
CHANGED
@@ -28,7 +28,7 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 
 
 # Set up the Streamlit page
-st.set_page_config(layout='wide')
+st.set_page_config(layout='wide', page_title="News Classifier App", page_icon="📑")
 
 
 # Define variables
@@ -45,7 +45,7 @@ class_names = ['Cong nghe', 'Doi song', 'Giai tri', 'Giao duc', 'Khoa hoc', 'Kin
 class NewsClassifier(nn.Module):
     def __init__(self, n_classes, model_name):
         super(NewsClassifier, self).__init__()
-        # Load a pre-trained
+        # Load a pre-trained model
         self.bert = AutoModel.from_pretrained(model_name)
         # Dropout layer to prevent overfitting
         self.drop = nn.Dropout(p=0.3)
@@ -56,7 +56,7 @@ class NewsClassifier(nn.Module):
         nn.init.normal_(self.fc.bias, 0)
 
     def forward(self, input_ids, attention_mask):
-        # Get the output from the
+        # Get the output from the model
         last_hidden_state, output = self.bert(
             input_ids=input_ids,
             attention_mask=attention_mask,
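
The two hunks above show `NewsClassifier` only in fragments. For orientation, a minimal sketch of how the pieces plausibly fit together; the `fc` head, its weight initialization, and `return_dict=False` are assumptions the diff does not show:

```python
import torch.nn as nn
from transformers import AutoModel

class NewsClassifier(nn.Module):
    def __init__(self, n_classes, model_name):
        super(NewsClassifier, self).__init__()
        # Load a pre-trained encoder, e.g. model_name="vinai/phobert-base"
        self.bert = AutoModel.from_pretrained(model_name)
        # Dropout layer to prevent overfitting
        self.drop = nn.Dropout(p=0.3)
        # Classification head (assumed: a single linear layer)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)  # assumed weight init
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        # return_dict=False (assumed) makes the call return the
        # (last_hidden_state, pooler_output) tuple unpacked in the diff
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # Classify from the pooled representation, with dropout
        return self.fc(self.drop(output))
```
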
@@ -138,6 +138,9 @@ def tokenize_text(text, tokenizer, max_len=256):
         return_tensors='pt',
     )
     return tokenized['input_ids'], tokenized['attention_mask']
+
+
+
 def get_vector_embedding(padded, attention_mask, phobert):
     # Obtain features from BERT
     with torch.no_grad():
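
Neither helper's body is fully visible in this hunk. A sketch of how `tokenize_text` and `get_vector_embedding` plausibly work together, keeping the first ([CLS]-position) vector as the sentence feature as described elsewhere on the page; the exact tokenizer arguments are assumptions:

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert = AutoModel.from_pretrained("vinai/phobert-base")

def tokenize_text(text, tokenizer, max_len=256):
    # Pad/truncate every input to max_len and return PyTorch tensors
    tokenized = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    return tokenized['input_ids'], tokenized['attention_mask']

def get_vector_embedding(padded, attention_mask, phobert):
    # Obtain features from BERT without tracking gradients
    with torch.no_grad():
        last_hidden = phobert(input_ids=padded, attention_mask=attention_mask)[0]
    # Vector at the [CLS] position is used as the sentence embedding
    return last_hidden[:, 0, :]

ids, mask = tokenize_text("Tin tức công nghệ mới nhất", tokenizer, max_len=256)
print(get_vector_embedding(ids, mask, phobert).shape)  # torch.Size([1, 768])
```
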
@@ -237,27 +240,31 @@ def plot_data(train_html_path, test_html_path, val_html_path):
 def main():
 
     #st.title("News Classifier App")
-    activities = ["Introduction", "Text Preprocessing", "Feature Extraction", "Train and Evaluate Models", "Prediction"]
+    activities = ["Introduction", "About the Dataset", "Text Preprocessing", "Feature Extraction", "Train and Evaluate Models", "Prediction"]
     choice = st.sidebar.selectbox("Choose Activity", activities)
 
+    # Dataset
+    if choice == "About the Dataset":
+        st.info("About the Dataset")
+        st.markdown("This dataset consists of Vietnamese news articles collected from various Vietnamese online news portals such as Thanh Nien, VNExpress, BaoMoi, etc. The dataset was originally sourced from a MongoDB dump containing over 20 million articles.")
+        st.markdown("From this large dataset, our team extracted approximately 162,000 articles categorized into 13 distinct categories and split them into training, test and validation sets (70%, 15% and 15% respectively) after preprocessing the data.")
+        st.markdown("Link to dataset: https://github.com/binhvq/news-corpus")
+        st.image("images/sample_data.png", caption="Sample original data", use_column_width=True)
+        summary_df = pd.read_csv("assets/summary_data.csv")
+        st.dataframe(summary_df, hide_index=True, use_container_width=True)
+        train_images = "images/article_by_categories_train_data.html"
+        test_images = "images/article_by_categories_test_data.html"
+        val_images = "images/article_by_categories_val_data.html"
+        plot_data(train_images, test_images, val_images)
+        st.image("images/token_length_distribution.png", caption="Distribution of Token Count per Sentence", use_column_width=True)
+
     # Preprocessing data
     if choice == "Text Preprocessing":
         st.info("Text Preprocessing")
-        preprocessing_task = ["No Options", "
+        preprocessing_task = ["No Options", "Process Text Demo", "Load Preprocessed Data"]
         task_choice = st.selectbox("Choose Task", preprocessing_task)
-
-
-        st.markdown("From this large dataset, our team extracted approximately 162,000 articles categorized into 13 distinct categorie and split into training, test and validation sets after preprocessing the data with 70%, 15% and 15% respectively.")
-        st.markdown("Link to dataset: https://github.com/binhvq/news-corpus")
-        st.image("images/sample_data.png", caption="Sample original data", use_column_width=True)
-        summary_df = pd.read_csv("assets/summary_data.csv")
-        st.dataframe(summary_df)
-        train_images = "images/article_by_categories_train_data.html"
-        test_images = "images/article_by_categories_test_data.html"
-        val_images = "images/article_by_categories_val_data.html"
-        plot_data(train_images, test_images, val_images)
-        st.image("images/token_length_distribution.png", caption="Distribution of Token Count per Sentence", use_column_width=True)
-        elif task_choice == "Process Text Demo":
+
+        if task_choice == "Process Text Demo":
             st.markdown("**Preprocessing Steps:**")
             st.markdown("- Standardize Vietnamese words, convert to lower case")
             st.markdown("- Utilize techniques such as regular expressions to remove unwanted elements: html, links, emails, numbers,...")
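
The `preprocess_text` function these bullets describe is not itself part of the diff. A minimal sketch consistent with the listed steps; the regex patterns and their order are assumptions:

```python
import re

def preprocess_text(text):
    text = text.lower()                                 # normalize case
    text = re.sub(r'<[^>]+>', ' ', text)                # strip html tags
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove links
    text = re.sub(r'\S+@\S+', ' ', text)                # remove emails
    text = re.sub(r'\d+', ' ', text)                    # remove numbers
    return re.sub(r'\s+', ' ', text).strip()            # collapse whitespace

print(preprocess_text("Xem chi tiết tại https://example.com <b>HOT</b> 2024!"))
# -> "xem chi tiết tại hot !"
```
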
@@ -272,7 +279,7 @@ def main():
             st.success(preprocessed_news)
         elif task_choice == "Load Preprocessed Data":
             df = pd.read_json(PREPROCESSED_DATA, encoding='utf-8', lines=True)
-            st.dataframe(df.head(20), use_container_width=True)
+            st.dataframe(df.head(20), use_container_width=True, hide_index=True)
 
     # Feature Extration
     if choice == "Feature Extraction":
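
As the new "About the Dataset" page notes, the preprocessed corpus loaded here was split 70/15/15 into training, validation and test sets. A hedged sketch of such a split; the file path, the 'category' column name, and the use of stratification are assumptions:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical input: the preprocessed corpus with a 'category' column
df = pd.read_json("data/preprocessed_news.json", lines=True)

# 70% train, then halve the remaining 30%: 15% validation, 15% test.
# Stratifying keeps the 13 category proportions similar across the sets.
train_df, rest_df = train_test_split(
    df, test_size=0.30, stratify=df['category'], random_state=42)
val_df, test_df = train_test_split(
    rest_df, test_size=0.50, stratify=rest_df['category'], random_state=42)

print(len(train_df), len(val_df), len(test_df))
```
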
@@ -282,14 +289,22 @@
         task_choice = st.selectbox("Choose Model",feature_extraction_task)
         if task_choice == "PhoBert":
             st.markdown("**Feature Extraction Steps:**")
-            st.markdown("- Tokenize using PhoBert's Tokenizer. Note that when tokenizing we will add two special tokens, [CLS] and [SEP] at the beginning and end of the sentence.")
+            st.markdown("- Tokenize using PhoBert's Tokenizer. Note that when tokenizing we will add two special tokens, [CLS] and [SEP], at the beginning and end of the sentence. [CLS] (Classification Token): this token is added at the beginning of the sentence. It signals to PhoBERT that this is the start of a new sentence and helps the model understand the overall context of the sentence. [SEP] (Separator Token): this token is added at the end of the sentence. It acts as a separator, indicating the end of the input sentence.")
+            st.markdown("""
+            > Why use [CLS] and [SEP]?
+            >
+            > These special tokens help PhoBERT process sentences more effectively:
+            >
+            > - **Contextual Understanding:** [CLS] helps PhoBERT grasp the overall meaning of the sentence.
+            > - **Sentence Boundaries:** [SEP] clearly defines the start and end of each sentence, especially important when processing multiple sentences together.
+            """)
             st.markdown("- Insert the tokenized text sentence into the model with the attention mask. Attention mask helps the model only focus on words in the sentence and ignore words with additional padding. Added words are marked = 0")
             st.markdown("- Take the output and take the first output vector (which is in the special token position [CLS]) as a feature for the sentence to train or predict (depending on the phase).")
             phobert, tokenizer = load_bert()
             text = st.text_area("Enter Text","Type Here")
             if st.button("Execute"):
                 st.subheader("Sentence to ids")
-                padded, attention_mask = tokenize_text(
+                padded, attention_mask = tokenize_text(text.split(), tokenizer, max_len=256)
                 st.write("Padded Sequence:", padded)
                 st.write("Attention Mask:", attention_mask)
 
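
A quick look at what this tokenization step produces. One caveat worth hedging: PhoBERT's RoBERTa-style tokenizer actually names its special tokens `<s>` and `</s>`; they play the [CLS]/[SEP] roles the page describes:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
enc = tokenizer("hôm nay trời đẹp", padding='max_length', max_length=10,
                truncation=True, return_tensors='pt')

print(enc['input_ids'])       # <s> ... </s> followed by padding ids
print(enc['attention_mask'])  # 1 for real tokens, 0 for padding
print(tokenizer.convert_ids_to_tokens(enc['input_ids'][0]))
```
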
@@ -313,8 +328,8 @@
             st.header("Predict")
             processed_news = preprocess_text(news_text)
             predicted_label, confidence_df = predict_label(processed_news, tokenizer, phobert, model, class_names, max_len)
-            st.subheader("Confidence
-            st.dataframe(confidence_df, height=
+            st.subheader("Confidence per Label")
+            st.dataframe(confidence_df, height=500, hide_index=True, use_container_width=True)
             st.subheader("Predicted Label")
             st.success(predicted_label)
 
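
`predict_label` is only called, never defined, in these hunks. A plausible shape, reusing the helpers sketched earlier: softmax over the classifier logits yields the per-label confidence table rendered above. All names, and the split between `phobert` (feature extractor) and `model` (classifier head), are assumptions:

```python
import pandas as pd
import torch
import torch.nn.functional as F

def predict_label(text, tokenizer, phobert, model, class_names, max_len):
    # Assumes the tokenize_text / get_vector_embedding sketches are in scope
    input_ids, attention_mask = tokenize_text(text, tokenizer, max_len)
    features = get_vector_embedding(input_ids, attention_mask, phobert)
    with torch.no_grad():
        probs = F.softmax(model(features), dim=1).squeeze(0)
    confidence_df = pd.DataFrame(
        {'Label': class_names, 'Confidence': probs.tolist()})
    return class_names[int(probs.argmax())], confidence_df
```
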
@@ -326,8 +341,8 @@
             st.info(news_text)
             st.header("Predict")
             df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
-            st.subheader("Confidence
-            st.dataframe(df_confidence, height=
+            st.subheader("Confidence per Label")
+            st.dataframe(df_confidence, height=500, hide_index=True, use_container_width=True)
             st.subheader("Predicted Label")
             st.success(predicted_label)
         if model_choice == "phobertbase":
@@ -338,8 +353,8 @@
             st.info(news_text)
             st.header("Predict")
             df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
-            st.subheader("Confidence
-            st.dataframe(df_confidence, height=
+            st.subheader("Confidence per Label")
+            st.dataframe(df_confidence, height=500, hide_index=True, use_container_width=True)
             st.subheader("Predicted Label")
             st.success(predicted_label)
     if choice == "Train and Evaluate Models":
@@ -353,7 +368,7 @@
     class NewsClassifier(nn.Module):
         def __init__(self, n_classes, model_name):
             super(NewsClassifier, self).__init__()
-            # Load a pre-trained
+            # Load a pre-trained model
            self.bert = AutoModel.from_pretrained(model_name)
             # Dropout layer to prevent overfitting
             self.drop = nn.Dropout(p=0.3)
@@ -364,7 +379,7 @@
             nn.init.normal_(self.fc.bias, 0)
 
         def forward(self, input_ids, attention_mask):
-            # Get the output from the
+            # Get the output from the model
             last_hidden_state, output = self.bert(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -489,17 +504,17 @@
         with col4:
             st.markdown("**BiLSTM with PhoBert feature extraction**")
             bilstm_report = pd.read_csv("assets/classification_report_bilstm_phobertbase.csv")
-            st.dataframe(bilstm_report, height=
+            st.dataframe(bilstm_report, height=500, hide_index=True, use_container_width=True)
 
         with col5:
             st.markdown("**phobertbase**")
             phobertbase_report = pd.read_csv("assets/classification_report_phobertbase.csv")
-            st.dataframe(phobertbase_report, height=
+            st.dataframe(phobertbase_report, height=500, hide_index=True, use_container_width=True)
 
         with col6:
             st.markdown("**longformer-phobertbase**")
             longformer_report = pd.read_csv("assets/classification_report_longformer.csv")
-            st.dataframe(longformer_report, height=
+            st.dataframe(longformer_report, height=500, hide_index=True, use_container_width=True)
     if choice == "Introduction":
         st.markdown(
             """