Commit
·
fe86b7f
1
Parent(s):
c1810f7
add: lstm model
Browse files- .gitattributes +2 -0
- .gitignore +1 -0
- app.py +37 -34
- models/lstm/x_g85_lstm.keras +3 -0
- requirements.txt +3 -0
- test.py +4 -0
- utils.py +60 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
models/lstm/x_g85_lstm.keras filter=lfs diff=lfs merge=lfs -text
|
37 |
+
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__/
|
app.py
CHANGED
@@ -19,9 +19,8 @@ with c1:
|
|
19 |
|
20 |
# The heading will be on the right.
|
21 |
with c2:
|
22 |
-
|
23 |
-
|
24 |
-
st.title("X_G85 Fake News")
|
25 |
|
26 |
# We need to set up session state via st.session_state so that app interactions don't reset the app.
|
27 |
if "valid_inputs_received" not in st.session_state:
|
@@ -34,38 +33,32 @@ st.sidebar.write("")
|
|
34 |
|
35 |
|
36 |
# Model selection
|
37 |
-
SELECTED_MODEL = st.sidebar.selectbox(
|
38 |
-
"Choose a model",
|
39 |
-
("Bert", "Roberta", "Lstm")
|
40 |
-
)
|
41 |
|
42 |
if SELECTED_MODEL:
|
43 |
st.session_state.valid_inputs_received = False
|
44 |
|
45 |
MODEL_INFO = {
|
46 |
-
"Bert":
|
47 |
#### [BERT base model (uncased)](https://huggingface.co/google-bert/bert-base-uncased)
|
48 |
Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model is uncased: it does not make a difference between english and English.
|
49 |
""",
|
50 |
-
|
51 |
-
"Roberta": """
|
52 |
#### [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
|
53 |
This model is trained by over 40,000 news from different medias based on the 'roberta-base'. It can give result by simply entering the text of the news less than 500 words(the excess will be truncated automatically).
|
54 |
""",
|
55 |
-
|
56 |
-
"Lstm": """
|
57 |
#### [X_G85 Fake News LSTM MODEL](https://huggingface.co/x-g85)
|
58 |
It is trained on the provided datasets\n
|
59 |
Notebook: [Fake News using Lstm](https://www.kaggle.com/code/adamalrahman/fake-news-using-lstm)
|
60 |
""",
|
61 |
-
None: "NO MODEL SELECTED"
|
62 |
}
|
63 |
|
64 |
|
65 |
-
|
66 |
model_info_container = st.sidebar.container(border=True)
|
67 |
model_info_container.markdown("### Model Information\n")
|
68 |
-
model_info_container.markdown(MODEL_INFO[SELECTED_MODEL
|
69 |
|
70 |
|
71 |
copyright_container = st.sidebar.container(border=True)
|
@@ -78,9 +71,10 @@ copyright_container.markdown("Copyright ©️ 2024 [X_G85](https://huggingface.c
|
|
78 |
MainTab, InfoTab = st.tabs(["Main", "Info"])
|
79 |
|
80 |
with InfoTab:
|
81 |
-
|
82 |
st.subheader("X_G85 Fake News")
|
83 |
-
st.markdown(
|
|
|
|
|
84 |
|
85 |
st.subheader("Datasets")
|
86 |
st.markdown(
|
@@ -97,35 +91,39 @@ with InfoTab:
|
|
97 |
"""
|
98 |
- Bert: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
|
99 |
- Roberta: [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
|
100 |
-
"""
|
|
|
101 |
st.write("")
|
102 |
copyright_container = st.container(border=True)
|
103 |
-
copyright_container.markdown(
|
104 |
-
|
|
|
105 |
|
106 |
|
107 |
def MODEL_RESULT(model: str, news: str) -> str | None:
|
108 |
if model == "Roberta":
|
109 |
MODEL_jy46604790 = "jy46604790/Fake-News-Bert-Detect"
|
110 |
-
classifier = pipeline(
|
|
|
|
|
111 |
result = classifier(news)
|
112 |
-
|
113 |
if result[0]["label"] == "LABEL_1":
|
114 |
return "REAL NEWS"
|
115 |
-
else:
|
116 |
return "FAKE NEWS"
|
117 |
-
|
118 |
# TODO(Adam-Al-Rahman): Complete the statement
|
119 |
if model == "Bert":
|
120 |
pass
|
121 |
|
122 |
if model == "Lstm":
|
123 |
-
|
124 |
|
|
|
125 |
|
126 |
|
127 |
with MainTab:
|
128 |
-
|
129 |
# Then, we create an intro text for the app, which we wrap in a st.markdown() widget.
|
130 |
|
131 |
st.write("")
|
@@ -137,15 +135,21 @@ with MainTab:
|
|
137 |
with st.form(key="form"):
|
138 |
pre_defined_news = "Indonesian police have recaptured a U.S. citizen who escaped a week ago from an overcrowded prison on the holiday island of Bali, the jail's second breakout of foreign inmates this year. Cristian Beasley from California was rearrested on Sunday, Badung Police Chief Yudith Satria Hananta said, without providing further details. Beasley was a suspect in crimes related to narcotics but had not been sentenced when he escaped from Kerobokan prison in Bali last week. The 32-year-old is believed to have cut through bars in the ceiling of his cell before scaling a perimeter wall of the prison in an area being refurbished. The Kerobokan prison, about 10 km (six miles) from the main tourist beaches in the Kuta area, often holds foreigners facing drug-related charges. Representatives of Beasley could not immediately be reached for comment. In June, an Australian, a Bulgarian, an Indian, and a Malaysian tunneled to freedom about 12 meters (13 yards) under Kerobokan prison s walls. The Indian and the Bulgarian were caught soon after in neighboring East Timor, but Australian Shaun Edward Davidson and Malaysian Tee Kok King remain at large. Davidson has taunted authorities by saying he was enjoying life in various parts of the world, in purported posts on Facebook. Kerobokan has housed several well-known foreign drug convicts, including Australian Schappelle Corby, whose 12-1/2-year sentence for marijuana smuggling got huge media attention."
|
139 |
|
140 |
-
news = st.text_area(
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
145 |
|
146 |
submit_button = st.form_submit_button(label="Submit")
|
147 |
|
148 |
-
if
|
|
|
|
|
|
|
|
|
149 |
st.stop()
|
150 |
|
151 |
elif submit_button and not news:
|
@@ -159,10 +163,9 @@ with MainTab:
|
|
159 |
|
160 |
# Default Model: Bert
|
161 |
MODEL = SELECTED_MODEL if SELECTED_MODEL else "Bert"
|
162 |
-
result = MODEL_RESULT(model=
|
163 |
|
164 |
if result:
|
165 |
st.success(f"Result: {result}")
|
166 |
else:
|
167 |
st.error(f"{MODEL} model error")
|
168 |
-
|
|
|
19 |
|
20 |
# The heading will be on the right.
|
21 |
with c2:
|
22 |
+
st.caption("")
|
23 |
+
st.title("X_G85 Fake News")
|
|
|
24 |
|
25 |
# We need to set up session state via st.session_state so that app interactions don't reset the app.
|
26 |
if "valid_inputs_received" not in st.session_state:
|
|
|
33 |
|
34 |
|
35 |
# Model selection
|
36 |
+
SELECTED_MODEL = st.sidebar.selectbox("Choose a model", ("Bert", "Roberta", "Lstm"))
|
|
|
|
|
|
|
37 |
|
38 |
if SELECTED_MODEL:
|
39 |
st.session_state.valid_inputs_received = False
|
40 |
|
41 |
MODEL_INFO = {
|
42 |
+
"Bert": """
|
43 |
#### [BERT base model (uncased)](https://huggingface.co/google-bert/bert-base-uncased)
|
44 |
Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model is uncased: it does not make a difference between english and English.
|
45 |
""",
|
46 |
+
"Roberta": """
|
|
|
47 |
#### [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
|
48 |
This model is trained by over 40,000 news from different medias based on the 'roberta-base'. It can give result by simply entering the text of the news less than 500 words(the excess will be truncated automatically).
|
49 |
""",
|
50 |
+
"Lstm": """
|
|
|
51 |
#### [X_G85 Fake News LSTM MODEL](https://huggingface.co/x-g85)
|
52 |
It is trained on the provided datasets\n
|
53 |
Notebook: [Fake News using Lstm](https://www.kaggle.com/code/adamalrahman/fake-news-using-lstm)
|
54 |
""",
|
55 |
+
None: "NO MODEL SELECTED",
|
56 |
}
|
57 |
|
58 |
|
|
|
59 |
model_info_container = st.sidebar.container(border=True)
|
60 |
model_info_container.markdown("### Model Information\n")
|
61 |
+
model_info_container.markdown(MODEL_INFO[SELECTED_MODEL])
|
62 |
|
63 |
|
64 |
copyright_container = st.sidebar.container(border=True)
|
|
|
71 |
MainTab, InfoTab = st.tabs(["Main", "Info"])
|
72 |
|
73 |
with InfoTab:
|
|
|
74 |
st.subheader("X_G85 Fake News")
|
75 |
+
st.markdown(
|
76 |
+
"It is fake news detection based on the following models trained on datasets"
|
77 |
+
)
|
78 |
|
79 |
st.subheader("Datasets")
|
80 |
st.markdown(
|
|
|
91 |
"""
|
92 |
- Bert: [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
|
93 |
- Roberta: [jy46604790/Fake-News-Bert-Detect](https://huggingface.co/jy46604790/Fake-News-Bert-Detect)
|
94 |
+
"""
|
95 |
+
)
|
96 |
st.write("")
|
97 |
copyright_container = st.container(border=True)
|
98 |
+
copyright_container.markdown(
|
99 |
+
"Copyright ©️ 2024 [X_G85](https://huggingface.co/x-g85)"
|
100 |
+
)
|
101 |
|
102 |
|
103 |
def MODEL_RESULT(model: str, news: str) -> str | None:
|
104 |
if model == "Roberta":
|
105 |
MODEL_jy46604790 = "jy46604790/Fake-News-Bert-Detect"
|
106 |
+
classifier = pipeline(
|
107 |
+
"text-classification", model=MODEL_jy46604790, tokenizer=MODEL_jy46604790
|
108 |
+
)
|
109 |
result = classifier(news)
|
110 |
+
|
111 |
if result[0]["label"] == "LABEL_1":
|
112 |
return "REAL NEWS"
|
113 |
+
else:
|
114 |
return "FAKE NEWS"
|
115 |
+
|
116 |
# TODO(Adam-Al-Rahman): Complete the statement
|
117 |
if model == "Bert":
|
118 |
pass
|
119 |
|
120 |
if model == "Lstm":
|
121 |
+
from utils import modelx
|
122 |
|
123 |
+
return modelx(arch=model, model_path="models/lstm/x_g85_lstm.keras", text=news)
|
124 |
|
125 |
|
126 |
with MainTab:
|
|
|
127 |
# Then, we create an intro text for the app, which we wrap in a st.markdown() widget.
|
128 |
|
129 |
st.write("")
|
|
|
135 |
with st.form(key="form"):
|
136 |
pre_defined_news = "Indonesian police have recaptured a U.S. citizen who escaped a week ago from an overcrowded prison on the holiday island of Bali, the jail's second breakout of foreign inmates this year. Cristian Beasley from California was rearrested on Sunday, Badung Police Chief Yudith Satria Hananta said, without providing further details. Beasley was a suspect in crimes related to narcotics but had not been sentenced when he escaped from Kerobokan prison in Bali last week. The 32-year-old is believed to have cut through bars in the ceiling of his cell before scaling a perimeter wall of the prison in an area being refurbished. The Kerobokan prison, about 10 km (six miles) from the main tourist beaches in the Kuta area, often holds foreigners facing drug-related charges. Representatives of Beasley could not immediately be reached for comment. In June, an Australian, a Bulgarian, an Indian, and a Malaysian tunneled to freedom about 12 meters (13 yards) under Kerobokan prison s walls. The Indian and the Bulgarian were caught soon after in neighboring East Timor, but Australian Shaun Edward Davidson and Malaysian Tee Kok King remain at large. Davidson has taunted authorities by saying he was enjoying life in various parts of the world, in purported posts on Facebook. Kerobokan has housed several well-known foreign drug convicts, including Australian Schappelle Corby, whose 12-1/2-year sentence for marijuana smuggling got huge media attention."
|
137 |
|
138 |
+
news = st.text_area(
|
139 |
+
"Enter news to classify",
|
140 |
+
pre_defined_news,
|
141 |
+
height=200,
|
142 |
+
help="Please provide the news that you need to verify for its truthfulness.",
|
143 |
+
key="news",
|
144 |
+
)
|
145 |
|
146 |
submit_button = st.form_submit_button(label="Submit")
|
147 |
|
148 |
+
if (
|
149 |
+
not news
|
150 |
+
and not submit_button
|
151 |
+
and not st.session_state.valid_inputs_received
|
152 |
+
):
|
153 |
st.stop()
|
154 |
|
155 |
elif submit_button and not news:
|
|
|
163 |
|
164 |
# Default Model: Bert
|
165 |
MODEL = SELECTED_MODEL if SELECTED_MODEL else "Bert"
|
166 |
+
result = MODEL_RESULT(model=MODEL, news=news)
|
167 |
|
168 |
if result:
|
169 |
st.success(f"Result: {result}")
|
170 |
else:
|
171 |
st.error(f"{MODEL} model error")
|
|
models/lstm/x_g85_lstm.keras
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:848dd72be9c5f0f2ac18048eabbe3f50ae027515bd42c86482bbbc30b42500c9
|
3 |
+
size 17299713
|
requirements.txt
CHANGED
@@ -3,3 +3,6 @@ transformers==4.42.3
|
|
3 |
torch==2.3.1
|
4 |
torchaudio==2.3.1
|
5 |
torchvision==0.18.1
|
|
|
|
|
|
|
|
3 |
torch==2.3.1
|
4 |
torchaudio==2.3.1
|
5 |
torchvision==0.18.1
|
6 |
+
numpy==1.26.4
|
7 |
+
tensorflow==2.17.0
|
8 |
+
pandas==2.2.2
|
test.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use a pipeline as a high-level helper
|
2 |
+
from transformers import pipeline
|
3 |
+
|
4 |
+
# Builds a text-classification pipeline for the `x-g85/X_G85_LSTM` hub model.
# The resulting `pipe` is not used anywhere else in this file.
# NOTE(review): the LSTM added in this commit is a Keras `.keras` file; confirm
# that the hub repo is actually loadable through transformers' `pipeline`.
pipe = pipeline("text-classification", model="x-g85/X_G85_LSTM")
|
utils.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Copyright 2024 X_G85
|
3 |
+
Model Integration Utils
|
4 |
+
-------------------------
|
5 |
+
"""
|
6 |
+
|
7 |
+
# Author: Adam-Al-Rahman <[email protected]>
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import pandas as pd
|
11 |
+
import tensorflow as tf
|
12 |
+
from tensorflow.keras.preprocessing.text import tokenizer_from_json
|
13 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
14 |
+
|
15 |
+
|
16 |
+
def tokenizer(arch: str, tokenizer_json: str, text: str, max_length=300):
    """
    Turn raw text into a padded array of token ids for the given architecture.

    ::param:: arch: type of model `Lstm` or `Bert`; only `Lstm` is handled here
    ::param:: tokenizer_json: path to a Keras tokenizer serialized as JSON
    ::param:: text: a single document, or a list/tuple of documents
    ::param:: max_length: length every sequence is padded / truncated to
    ::return:: float32 array with one row per document, or None when `arch`
               is not `Lstm`
    """
    tokenized_data = None
    if arch == "Lstm":
        # Load the tokenizer from the JSON file
        with open(tokenizer_json, encoding="utf-8") as file:
            keras_tokenizer = tokenizer_from_json(file.read())

        # texts_to_sequences iterates over its argument, so a bare string would
        # be treated as one "document" per character. Wrap it in a list.
        texts = [text] if isinstance(text, str) else list(text)

        # Use the tokenizer to transform the text, then pad to a fixed width
        sequences = keras_tokenizer.texts_to_sequences(texts)
        tokenized_data = pad_sequences(sequences, maxlen=max_length)
        tokenized_data = tokenized_data.astype(np.float32)

    return tokenized_data
|
33 |
+
|
34 |
+
|
35 |
+
def modelx(
    arch: str,
    model_path: str,
    text: str,
    tokenizer_json: str = "",
    batch_size=32,
    max_length=300,
):
    """
    Run the saved model on `text` and map its prediction to a verdict.

    ::param:: arch: type of model; only `Lstm` is supported
    ::param:: model_path: path of the saved Keras model to load
    ::param:: text: news text to classify
    ::param:: tokenizer_json: optional path to a JSON tokenizer; when empty the
              text is wrapped in a pandas Series and passed to the model as is
              (presumably the saved model does its own text vectorization;
              TODO confirm)
    ::param:: batch_size: batch size forwarded to `model.predict`
    ::param:: max_length: padding length forwarded to `tokenizer`
    ::return:: "REAL NEWS", "FAKE NEWS", or None when `arch` is unsupported or
               the rounded prediction is neither 1 nor 0
    """
    # Bail out before any work: previously an unsupported arch reached
    # tf.round(None) and raised instead of reporting "no result".
    if arch != "Lstm":
        return None

    if tokenizer_json:
        model_input = tokenizer(arch, tokenizer_json, text, max_length)
    else:
        model_input = pd.Series(text)

    model = tf.keras.models.load_model(model_path)
    prediction = model.predict(model_input, batch_size=batch_size)

    # Collapse the single prediction to a scalar 0/1 label.
    label = tf.squeeze(tf.round(prediction))

    if label == 1.0:
        return "REAL NEWS"
    if label == 0.0:
        return "FAKE NEWS"

    # Anything else (e.g. a NaN prediction) is not a verdict; return None so
    # the caller does not receive a raw tensor.
    return None
|