Commit fe86b7f by Adam-Al-Rahman
Parent: c1810f7

add: lstm model

Files changed (7)
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. app.py +37 -34
  4. models/lstm/x_g85_lstm.keras +3 -0
  5. requirements.txt +3 -0
  6. test.py +4 -0
  7. utils.py +60 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/lstm/x_g85_lstm.keras filter=lfs diff=lfs merge=lfs -text
+
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
app.py CHANGED
@@ -19,9 +19,8 @@ with c1:
 
 # The heading will be on the right.
 with c2:
-
- st.caption("")
- st.title("X_G85 Fake News")
+ st.caption("")
+ st.title("X_G85 Fake News")
 
 # We need to set up session state via st.session_state so that app interactions don't reset the app.
 if "valid_inputs_received" not in st.session_state:
@@ -34,38 +33,32 @@ st.sidebar.write("")
 
 
 # Model selection
- SELECTED_MODEL = st.sidebar.selectbox(
- "Choose a model",
- ("Bert", "Roberta", "Lstm")
- )
+ SELECTED_MODEL = st.sidebar.selectbox("Choose a model", ("Bert", "Roberta", "Lstm"))
 
 if SELECTED_MODEL:
 st.session_state.valid_inputs_received = False
 
 MODEL_INFO = {
- "Bert": """
+ "Bert": """
 #### [BERT base model (uncased)](https://huggingface.co/google-bert/bert-base-uncased)
 Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model is uncased: it does not make a difference between english and English.
 """,
-
- "Roberta": """
+ "Roberta": """
 #### [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
 This model is trained by over 40,000 news from different medias based on the 'roberta-base'. It can give result by simply entering the text of the news less than 500 words(the excess will be truncated automatically).
 """,
-
- "Lstm": """
+ "Lstm": """
 #### [X_G85 Fake News LSTM MODEL](https://huggingface.co/x-g85)
 It is trained on the provided datasets\n
 Notebook: [Fake News using Lstm](https://www.kaggle.com/code/adamalrahman/fake-news-using-lstm)
 """,
- None: "NO MODEL SELECTED"
+ None: "NO MODEL SELECTED",
 }
 
 
-
 model_info_container = st.sidebar.container(border=True)
 model_info_container.markdown("### Model Information\n")
- model_info_container.markdown(MODEL_INFO[SELECTED_MODEL ])
+ model_info_container.markdown(MODEL_INFO[SELECTED_MODEL])
 
 
 copyright_container = st.sidebar.container(border=True)
@@ -78,9 +71,10 @@ copyright_container.markdown("Copyright ©️ 2024 [X_G85](https://huggingface.c
 MainTab, InfoTab = st.tabs(["Main", "Info"])
 
 with InfoTab:
-
 st.subheader("X_G85 Fake News")
- st.markdown("It is fake news detection based on the following models trained on datasets")
+ st.markdown(
+ "It is fake news detection based on the following models trained on datasets"
+ )
 
 st.subheader("Datasets")
 st.markdown(
@@ -97,35 +91,39 @@ with InfoTab:
 """
 - Bert: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
 - Roberta: [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
- """)
+ """
+ )
 st.write("")
 copyright_container = st.container(border=True)
- copyright_container.markdown("Copyright ©️ 2024 [X_G85](https://huggingface.co/x-g85)")
-
+ copyright_container.markdown(
+ "Copyright ©️ 2024 [X_G85](https://huggingface.co/x-g85)"
+ )
 
 
 def MODEL_RESULT(model: str, news: str) -> str | None:
 if model == "Roberta":
 MODEL_jy46604790 = "jy46604790/Fake-News-Bert-Detect"
- classifier = pipeline("text-classification", model=MODEL_jy46604790, tokenizer=MODEL_jy46604790)
+ classifier = pipeline(
+ "text-classification", model=MODEL_jy46604790, tokenizer=MODEL_jy46604790
+ )
 result = classifier(news)
-
+
 if result[0]["label"] == "LABEL_1":
 return "REAL NEWS"
- else:
+ else:
 return "FAKE NEWS"
-
+
 # TODO(Adam-Al-Rahman): Complete the statement
 if model == "Bert":
 pass
 
 if model == "Lstm":
- pass
+ from utils import modelx
 
+ return modelx(arch=model, model_path="models/lstm/x_g85_lstm.keras", text=news)
 
 
 with MainTab:
-
 # Then, we create a intro text for the app, which we wrap in a st.markdown() widget.
 
 st.write("")
@@ -137,15 +135,21 @@ with MainTab:
 with st.form(key="form"):
 pre_defined_news = "Indonesian police have recaptured a U.S. citizen who escaped a week ago from an overcrowded prison on the holiday island of Bali, the jail's second breakout of foreign inmates this year. Cristian Beasley from California was rearrested on Sunday, Badung Police Chief Yudith Satria Hananta said, without providing further details. Beasley was a suspect in crimes related to narcotics but had not been sentenced when he escaped from Kerobokan prison in Bali last week. The 32-year-old is believed to have cut through bars in the ceiling of his cell before scaling a perimeter wall of the prison in an area being refurbished. The Kerobokan prison, about 10 km (six miles) from the main tourist beaches in the Kuta area, often holds foreigners facing drug-related charges. Representatives of Beasley could not immediately be reached for comment. In June, an Australian, a Bulgarian, an Indian, and a Malaysian tunneled to freedom about 12 meters (13 yards) under Kerobokan prison s walls. The Indian and the Bulgarian were caught soon after in neighboring East Timor, but Australian Shaun Edward Davidson and Malaysian Tee Kok King remain at large. Davidson has taunted authorities by saying he was enjoying life in various parts of the world, in purported posts on Facebook. Kerobokan has housed several well-known foreign drug convicts, including Australian Schappelle Corby, whose 12-1/2-year sentence for marijuana smuggling got huge media attention."
 
- news = st.text_area("Enter news to classify",
- pre_defined_news,
- height=200,
- help="Please provide the news that you need to verify for its truthfulness.",
- key="news")
+ news = st.text_area(
+ "Enter news to classify",
+ pre_defined_news,
+ height=200,
+ help="Please provide the news that you need to verify for its truthfulness.",
+ key="news",
+ )
 
 submit_button = st.form_submit_button(label="Submit")
 
- if not news and not submit_button and not st.session_state.valid_inputs_received:
+ if (
+ not news
+ and not submit_button
+ and not st.session_state.valid_inputs_received
+ ):
 st.stop()
 
 elif submit_button and not news:
@@ -159,10 +163,9 @@ with MainTab:
 
 # Default Model: Bert
 MODEL = SELECTED_MODEL if SELECTED_MODEL else "Bert"
- result = MODEL_RESULT(model= MODEL, news=news)
+ result = MODEL_RESULT(model=MODEL, news=news)
 
 if result:
 st.success(f"Result: {result}")
 else:
 st.error(f"{MODEL} model error")
-
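Note: the Bert branch of MODEL_RESULT is still a TODO in this commit. A minimal sketch of how it could later be filled in, mirroring the existing Roberta branch; the checkpoint name below is hypothetical (bert-base-uncased is an MLM checkpoint and would first need a fine-tuned classification head), and the label mapping must match whatever classifier is actually used:

    from transformers import pipeline

    def bert_result(news: str) -> str:
        checkpoint = "x-g85/fake-news-bert"  # hypothetical fine-tuned classifier, not part of this commit
        classifier = pipeline("text-classification", model=checkpoint, tokenizer=checkpoint)
        result = classifier(news)
        # Assumes the classifier marks real news as LABEL_1, like the Roberta model above.
        return "REAL NEWS" if result[0]["label"] == "LABEL_1" else "FAKE NEWS"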
 
models/lstm/x_g85_lstm.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:848dd72be9c5f0f2ac18048eabbe3f50ae027515bd42c86482bbbc30b42500c9
+ size 17299713
requirements.txt CHANGED
@@ -3,3 +3,6 @@ transformers==4.42.3
 torch==2.3.1
 torchaudio==2.3.1
 torchvision==0.18.1
+ numpy==1.26.4
+ tensorflow==2.17.0
+ pandas==2.2.2
test.py ADDED
@@ -0,0 +1,4 @@
+ # Use a pipeline as a high-level helper
+ from transformers import pipeline
+
+ pipe = pipeline("text-classification", model="x-g85/X_G85_LSTM")
utils.py ADDED
@@ -0,0 +1,60 @@
+ """
+ Copyright 2024 X_G85
+ Model Integration Utils
+ -------------------------
+ """
+
+ # Author: Adam-Al-Rahman <[email protected]>
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.text import tokenizer_from_json
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+
+ def tokenizer(arch: str, tokenizer_json: str, text: str, max_length=300):
+ """
+ ::param:: arch: type of model `Lstm` or `Bert`
+ """
+ tokenized_data = None
+ if arch == "Lstm":
+ # Load the tokenizer from the JSON file
+ with open(tokenizer_json) as file:
+ data = file.read()
+ tokenizer = tokenizer_from_json(data)
+
+ # Use the tokenizer to transform test data
+ tokenized_text = tokenizer.texts_to_sequences(text)
+ tokenized_data = pad_sequences(tokenized_text, maxlen=max_length)
+ tokenized_data = tokenized_data.astype(np.float32)
+
+ return tokenized_data
+
+
+ def modelx(
+ arch: str,
+ model_path: str,
+ text: str,
+ tokenizer_json: str = "",
+ batch_size=32,
+ max_length=300,
+ ):
+ model_result = None
+ if tokenizer_json:
+ text = tokenizer(arch, tokenizer_json, text, max_length)
+ else:
+ text = pd.Series(text)
+
+ if arch == "Lstm":
+ model = tf.keras.models.load_model(model_path)
+ model_result = model.predict(text, batch_size=batch_size)
+
+ model_result = tf.squeeze(tf.round(model_result))
+
+ if model_result == 1.0:
+ model_result = "REAL NEWS"
+ elif model_result == 0.0:
+ model_result = "FAKE NEWS"
+
+ return model_result