Nguyen Thi Dieu Hien committed on
Commit
f652bd3
·
unverified ·
1 Parent(s): 29937fc

Add files via upload

Browse files
Files changed (1) hide show
  1. app.py +47 -32
app.py CHANGED
@@ -28,7 +28,7 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
28
 
29
 
30
  # Set up the Streamlit page
31
- st.set_page_config(layout='wide')
32
 
33
 
34
  # Define variables
@@ -45,7 +45,7 @@ class_names = ['Cong nghe', 'Doi song', 'Giai tri', 'Giao duc', 'Khoa hoc', 'Kin
45
  class NewsClassifier(nn.Module):
46
  def __init__(self, n_classes, model_name):
47
  super(NewsClassifier, self).__init__()
48
- # Load a pre-trained BERT model
49
  self.bert = AutoModel.from_pretrained(model_name)
50
  # Dropout layer to prevent overfitting
51
  self.drop = nn.Dropout(p=0.3)
@@ -56,7 +56,7 @@ class NewsClassifier(nn.Module):
56
  nn.init.normal_(self.fc.bias, 0)
57
 
58
  def forward(self, input_ids, attention_mask):
59
- # Get the output from the BERT model
60
  last_hidden_state, output = self.bert(
61
  input_ids=input_ids,
62
  attention_mask=attention_mask,
@@ -138,6 +138,9 @@ def tokenize_text(text, tokenizer, max_len=256):
138
  return_tensors='pt',
139
  )
140
  return tokenized['input_ids'], tokenized['attention_mask']
 
 
 
141
  def get_vector_embedding(padded, attention_mask, phobert):
142
  # Obtain features from BERT
143
  with torch.no_grad():
@@ -237,27 +240,31 @@ def plot_data(train_html_path, test_html_path, val_html_path):
237
  def main():
238
 
239
  #st.title("News Classifier App")
240
- activities = ["Introduction", "Text Preprocessing", "Feature Extraction", "Train and Evaluate Models", "Prediction"]
241
  choice = st.sidebar.selectbox("Choose Activity", activities)
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  # Preprocessing data
244
  if choice == "Text Preprocessing":
245
  st.info("Text Preprocessing")
246
- preprocessing_task = ["No Options", "Data Overview", "Process Text Demo", "Load Preprocessed Data"]
247
  task_choice = st.selectbox("Choose Task", preprocessing_task)
248
- if task_choice == "Data Overview":
249
- st.markdown("This dataset consists of Vietnamese news articles collected from various Vietnamese online news portals such as Thanh Nien, VNExpress, BaoMoi, etc. The dataset was originally sourced from a MongoDB dump containing over 20 million articles.")
250
- st.markdown("From this large dataset, our team extracted approximately 162,000 articles categorized into 13 distinct categorie and split into training, test and validation sets after preprocessing the data with 70%, 15% and 15% respectively.")
251
- st.markdown("Link to dataset: https://github.com/binhvq/news-corpus")
252
- st.image("images/sample_data.png", caption="Sample original data", use_column_width=True)
253
- summary_df = pd.read_csv("assets/summary_data.csv")
254
- st.dataframe(summary_df)
255
- train_images = "images/article_by_categories_train_data.html"
256
- test_images = "images/article_by_categories_test_data.html"
257
- val_images = "images/article_by_categories_val_data.html"
258
- plot_data(train_images, test_images, val_images)
259
- st.image("images/token_length_distribution.png",caption="Distribution of Token Count per Sentence", use_column_width=True)
260
- elif task_choice == "Process Text Demo":
261
  st.markdown("**Preprocessing Steps:**")
262
  st.markdown("- Standardize Vietnamese words, convert to lower case")
263
  st.markdown("- Utilize techniques such as regular expressions to remove unwanted elements: html, links, emails, numbers,...")
@@ -272,7 +279,7 @@ def main():
272
  st.success(preprocessed_news)
273
  elif task_choice == "Load Preprocessed Data":
274
  df = pd.read_json(PREPROCESSED_DATA, encoding='utf-8', lines=True)
275
- st.dataframe(df.head(20), use_container_width=True)
276
 
277
  # Feature Extraction
278
  if choice == "Feature Extraction":
@@ -282,14 +289,22 @@ def main():
282
  task_choice = st.selectbox("Choose Model",feature_extraction_task)
283
  if task_choice == "PhoBert":
284
  st.markdown("**Feature Extraction Steps:**")
285
- st.markdown("- Tokenize using PhoBert's Tokenizer. Note that when tokenizing we will add two special tokens, [CLS] and [SEP] at the beginning and end of the sentence.")
 
 
 
 
 
 
 
 
286
  st.markdown("- Insert the tokenized text sentence into the model with the attention mask. Attention mask helps the model only focus on words in the sentence and ignore words with additional padding. Added words are marked = 0")
287
  st.markdown("- Take the output and take the first output vector (which is in the special token position [CLS]) as a feature for the sentence to train or predict (depending on the phase).")
288
  phobert, tokenizer = load_bert()
289
  text = st.text_area("Enter Text","Type Here")
290
  if st.button("Execute"):
291
  st.subheader("Sentence to ids")
292
- padded, attention_mask = tokenize_text([text], tokenizer, max_len=256)
293
  st.write("Padded Sequence:", padded)
294
  st.write("Attention Mask:", attention_mask)
295
 
@@ -313,8 +328,8 @@ def main():
313
  st.header("Predict")
314
  processed_news = preprocess_text(news_text)
315
  predicted_label, confidence_df = predict_label(processed_news, tokenizer, phobert, model, class_names, max_len)
316
- st.subheader("Confidence Per Label")
317
- st.dataframe(confidence_df, height=600, hide_index=True, use_container_width=True)
318
  st.subheader("Predicted Label")
319
  st.success(predicted_label)
320
 
@@ -326,8 +341,8 @@ def main():
326
  st.info(news_text)
327
  st.header("Predict")
328
  df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
329
- st.subheader("Confidence Per Label")
330
- st.dataframe(df_confidence, height=600, hide_index=True, use_container_width=True)
331
  st.subheader("Predicted Label")
332
  st.success(predicted_label)
333
  if model_choice == "phobertbase":
@@ -338,8 +353,8 @@ def main():
338
  st.info(news_text)
339
  st.header("Predict")
340
  df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
341
- st.subheader("Confidence Per Label")
342
- st.dataframe(df_confidence, height=600, hide_index=True, use_container_width=True)
343
  st.subheader("Predicted Label")
344
  st.success(predicted_label)
345
  if choice == "Train and Evaluate Models":
@@ -353,7 +368,7 @@ def main():
353
  class NewsClassifier(nn.Module):
354
  def __init__(self, n_classes, model_name):
355
  super(NewsClassifier, self).__init__()
356
- # Load a pre-trained BERT model
357
  self.bert = AutoModel.from_pretrained(model_name)
358
  # Dropout layer to prevent overfitting
359
  self.drop = nn.Dropout(p=0.3)
@@ -364,7 +379,7 @@ def main():
364
  nn.init.normal_(self.fc.bias, 0)
365
 
366
  def forward(self, input_ids, attention_mask):
367
- # Get the output from the BERT model
368
  last_hidden_state, output = self.bert(
369
  input_ids=input_ids,
370
  attention_mask=attention_mask,
@@ -489,17 +504,17 @@ def main():
489
  with col4:
490
  st.markdown("**BiLSTM with PhoBert feature extraction**")
491
  bilstm_report = pd.read_csv("assets/classification_report_bilstm_phobertbase.csv")
492
- st.dataframe(bilstm_report, height=600, hide_index=True, use_container_width=True)
493
 
494
  with col5:
495
  st.markdown("**phobertbase**")
496
  phobertbase_report = pd.read_csv("assets/classification_report_phobertbase.csv")
497
- st.dataframe(phobertbase_report, height=600, hide_index=True, use_container_width=True)
498
 
499
  with col6:
500
  st.markdown("**longformer-phobertbase**")
501
  longformer_report = pd.read_csv("assets/classification_report_longformer.csv")
502
- st.dataframe(longformer_report, height=600, hide_index=True, use_container_width=True)
503
  if choice == "Introduction":
504
  st.markdown(
505
  """
 
28
 
29
 
30
  # Set up the Streamlit page
31
+ st.set_page_config(layout='wide', page_title="News Classifier App", page_icon="📑")
32
 
33
 
34
  # Define variables
 
45
  class NewsClassifier(nn.Module):
46
  def __init__(self, n_classes, model_name):
47
  super(NewsClassifier, self).__init__()
48
+ # Load a pre-trained model
49
  self.bert = AutoModel.from_pretrained(model_name)
50
  # Dropout layer to prevent overfitting
51
  self.drop = nn.Dropout(p=0.3)
 
56
  nn.init.normal_(self.fc.bias, 0)
57
 
58
  def forward(self, input_ids, attention_mask):
59
+ # Get the output from the model
60
  last_hidden_state, output = self.bert(
61
  input_ids=input_ids,
62
  attention_mask=attention_mask,
 
138
  return_tensors='pt',
139
  )
140
  return tokenized['input_ids'], tokenized['attention_mask']
141
+
142
+
143
+
144
  def get_vector_embedding(padded, attention_mask, phobert):
145
  # Obtain features from BERT
146
  with torch.no_grad():
 
240
  def main():
241
 
242
  #st.title("News Classifier App")
243
+ activities = ["Introduction", "About the Dataset","Text Preprocessing", "Feature Extraction", "Train and Evaluate Models", "Prediction"]
244
  choice = st.sidebar.selectbox("Choose Activity", activities)
245
 
246
+ # Dataset
247
+ if choice == "About the Dataset":
248
+ st.info("About the Dataset")
249
+ st.markdown("This dataset consists of Vietnamese news articles collected from various Vietnamese online news portals such as Thanh Nien, VNExpress, BaoMoi, etc. The dataset was originally sourced from a MongoDB dump containing over 20 million articles.")
250
+ st.markdown("From this large dataset, our team extracted approximately 162,000 articles categorized into 13 distinct categorie and split into training, test and validation sets after preprocessing the data with 70%, 15% and 15% respectively.")
251
+ st.markdown("Link to dataset: https://github.com/binhvq/news-corpus")
252
+ st.image("images/sample_data.png", caption="Sample original data", use_column_width=True)
253
+ summary_df = pd.read_csv("assets/summary_data.csv")
254
+ st.dataframe(summary_df, hide_index=True, use_container_width=True)
255
+ train_images = "images/article_by_categories_train_data.html"
256
+ test_images = "images/article_by_categories_test_data.html"
257
+ val_images = "images/article_by_categories_val_data.html"
258
+ plot_data(train_images, test_images, val_images)
259
+ st.image("images/token_length_distribution.png",caption="Distribution of Token Count per Sentence", use_column_width=True)
260
+
261
  # Preprocessing data
262
  if choice == "Text Preprocessing":
263
  st.info("Text Preprocessing")
264
+ preprocessing_task = ["No Options", "Process Text Demo", "Load Preprocessed Data"]
265
  task_choice = st.selectbox("Choose Task", preprocessing_task)
266
+
267
+ if task_choice == "Process Text Demo":
 
 
 
 
 
 
 
 
 
 
 
268
  st.markdown("**Preprocessing Steps:**")
269
  st.markdown("- Standardize Vietnamese words, convert to lower case")
270
  st.markdown("- Utilize techniques such as regular expressions to remove unwanted elements: html, links, emails, numbers,...")
 
279
  st.success(preprocessed_news)
280
  elif task_choice == "Load Preprocessed Data":
281
  df = pd.read_json(PREPROCESSED_DATA, encoding='utf-8', lines=True)
282
+ st.dataframe(df.head(20), use_container_width=True, hide_index=True)
283
 
284
  # Feature Extraction
285
  if choice == "Feature Extraction":
 
289
  task_choice = st.selectbox("Choose Model",feature_extraction_task)
290
  if task_choice == "PhoBert":
291
  st.markdown("**Feature Extraction Steps:**")
292
+ st.markdown("- Tokenize using PhoBert's Tokenizer. Note that when tokenizing we will add two special tokens, [CLS] and [SEP] at the beginning and end of the sentence. [CLS] (Classification Token): This token is added at the beginning of the sentence. It signals to PhoBERT that this is the start of a new sentence and helps the model understand the overall context of the sentence. [SEP] (Separator Token): This token is added at the end of the sentence. It acts as a separator, indicating the end of the input sentence.")
293
+ st.markdown("""
294
+ > Why use [CLS] and [SEP]?
295
+ >
296
+ > These special tokens help PhoBERT process sentences more effectively:
297
+ >
298
+ > - **Contextual Understanding:** [CLS] helps PhoBERT grasp the overall meaning of the sentence.
299
+ > - **Sentence Boundaries:** [SEP] clearly defines the start and end of each sentence, especially important when processing multiple sentences together.
300
+ """)
301
  st.markdown("- Insert the tokenized text sentence into the model with the attention mask. Attention mask helps the model only focus on words in the sentence and ignore words with additional padding. Added words are marked = 0")
302
  st.markdown("- Take the output and take the first output vector (which is in the special token position [CLS]) as a feature for the sentence to train or predict (depending on the phase).")
303
  phobert, tokenizer = load_bert()
304
  text = st.text_area("Enter Text","Type Here")
305
  if st.button("Execute"):
306
  st.subheader("Sentence to ids")
307
+ padded, attention_mask = tokenize_text(text.split(), tokenizer, max_len=256)
308
  st.write("Padded Sequence:", padded)
309
  st.write("Attention Mask:", attention_mask)
310
 
 
328
  st.header("Predict")
329
  processed_news = preprocess_text(news_text)
330
  predicted_label, confidence_df = predict_label(processed_news, tokenizer, phobert, model, class_names, max_len)
331
+ st.subheader("Confidence per Label")
332
+ st.dataframe(confidence_df, height=500, hide_index=True, use_container_width=True)
333
  st.subheader("Predicted Label")
334
  st.success(predicted_label)
335
 
 
341
  st.info(news_text)
342
  st.header("Predict")
343
  df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
344
+ st.subheader("Confidence per Label")
345
+ st.dataframe(df_confidence, height=500, hide_index=True, use_container_width=True)
346
  st.subheader("Predicted Label")
347
  st.success(predicted_label)
348
  if model_choice == "phobertbase":
 
353
  st.info(news_text)
354
  st.header("Predict")
355
  df_confidence, predicted_label = infer(news_text, tokenizer, models, class_names, max_len)
356
+ st.subheader("Confidence per Label")
357
+ st.dataframe(df_confidence, height=500, hide_index=True, use_container_width=True)
358
  st.subheader("Predicted Label")
359
  st.success(predicted_label)
360
  if choice == "Train and Evaluate Models":
 
368
  class NewsClassifier(nn.Module):
369
  def __init__(self, n_classes, model_name):
370
  super(NewsClassifier, self).__init__()
371
+ # Load a pre-trained model
372
  self.bert = AutoModel.from_pretrained(model_name)
373
  # Dropout layer to prevent overfitting
374
  self.drop = nn.Dropout(p=0.3)
 
379
  nn.init.normal_(self.fc.bias, 0)
380
 
381
  def forward(self, input_ids, attention_mask):
382
+ # Get the output from the model
383
  last_hidden_state, output = self.bert(
384
  input_ids=input_ids,
385
  attention_mask=attention_mask,
 
504
  with col4:
505
  st.markdown("**BiLSTM with PhoBert feature extraction**")
506
  bilstm_report = pd.read_csv("assets/classification_report_bilstm_phobertbase.csv")
507
+ st.dataframe(bilstm_report, height=500, hide_index=True, use_container_width=True)
508
 
509
  with col5:
510
  st.markdown("**phobertbase**")
511
  phobertbase_report = pd.read_csv("assets/classification_report_phobertbase.csv")
512
+ st.dataframe(phobertbase_report, height=500, hide_index=True, use_container_width=True)
513
 
514
  with col6:
515
  st.markdown("**longformer-phobertbase**")
516
  longformer_report = pd.read_csv("assets/classification_report_longformer.csv")
517
+ st.dataframe(longformer_report, height=500, hide_index=True, use_container_width=True)
518
  if choice == "Introduction":
519
  st.markdown(
520
  """