Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,7 @@ from datasets import load_dataset
|
|
6 |
from transformers import AutoTokenizer
|
7 |
|
8 |
# Load tokenizer
|
9 |
-
tokenizer = AutoTokenizer.from_pretrained("
|
10 |
|
11 |
# Initialize variables to track stats
|
12 |
user_stats = {
|
@@ -14,19 +14,27 @@ user_stats = {
|
|
14 |
"ntp": {"correct": 0, "total": 0}
|
15 |
}
|
16 |
|
17 |
-
# Function to load and sample from
|
18 |
def load_sample_data(sample_size=100):
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Sample from the dataset
|
22 |
samples = []
|
23 |
for i, example in enumerate(dataset["train"]):
|
24 |
if i >= sample_size:
|
25 |
break
|
26 |
-
#
|
27 |
-
if
|
28 |
# Clean text by removing extra whitespaces
|
29 |
-
text = re.sub(r'\s+', ' ', example[
|
30 |
# Only include longer texts to make the task meaningful
|
31 |
if len(text.split()) > 20:
|
32 |
# Truncate to two sentences
|
@@ -142,12 +150,18 @@ def check_mlm_answer(user_answers):
|
|
142 |
"""Check user MLM answers against the masked tokens."""
|
143 |
global user_stats
|
144 |
|
145 |
-
#
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
# Ensure we have the same number of answers as masks
|
149 |
if len(user_tokens) != len(masked_tokens):
|
150 |
-
return f"Please provide {len(masked_tokens)} answers. You provided {len(user_tokens)}
|
151 |
|
152 |
# Compare each answer
|
153 |
correct = 0
|
@@ -338,10 +352,11 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
|
|
338 |
|
339 |
with gr.Group() as mlm_group:
|
340 |
mlm_answer = gr.Textbox(
|
341 |
-
label="Your MLM answers (separated by
|
342 |
-
placeholder="
|
343 |
lines=1
|
344 |
)
|
|
|
345 |
|
346 |
with gr.Group(visible=False) as ntp_group:
|
347 |
ntp_answer = gr.Textbox(
|
|
|
6 |
from transformers import AutoTokenizer
|
7 |
|
8 |
# Load tokenizer
|
9 |
+
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
10 |
|
11 |
# Initialize variables to track stats
|
12 |
user_stats = {
|
|
|
14 |
"ntp": {"correct": 0, "total": 0}
|
15 |
}
|
16 |
|
17 |
+
# Function to load and sample from the requested dataset
|
18 |
def load_sample_data(sample_size=100):
|
19 |
+
try:
|
20 |
+
# Try to load the requested dataset
|
21 |
+
dataset = load_dataset("mlfoundations/dclm-baseline-1.0-parquet", streaming=True)
|
22 |
+
dataset_field = "text" # Assuming the field name is "text"
|
23 |
+
except Exception as e:
|
24 |
+
print(f"Error loading requested dataset: {e}")
|
25 |
+
# Fallback to cc_news if there's an issue
|
26 |
+
dataset = load_dataset("vblagoje/cc_news", streaming=True)
|
27 |
+
dataset_field = "text"
|
28 |
|
29 |
# Sample from the dataset
|
30 |
samples = []
|
31 |
for i, example in enumerate(dataset["train"]):
|
32 |
if i >= sample_size:
|
33 |
break
|
34 |
+
# Get text from the appropriate field
|
35 |
+
if dataset_field in example and example[dataset_field]:
|
36 |
# Clean text by removing extra whitespaces
|
37 |
+
text = re.sub(r'\s+', ' ', example[dataset_field]).strip()
|
38 |
# Only include longer texts to make the task meaningful
|
39 |
if len(text.split()) > 20:
|
40 |
# Truncate to two sentences
|
|
|
150 |
"""Check user MLM answers against the masked tokens."""
|
151 |
global user_stats
|
152 |
|
153 |
+
# Improved parsing of user answers to better handle different formats
|
154 |
+
# First replace any whitespace around commas with just commas
|
155 |
+
cleaned_answers = re.sub(r'\s*,\s*', ',', user_answers.strip())
|
156 |
+
# Then split by comma or whitespace
|
157 |
+
user_tokens = []
|
158 |
+
for token in re.split(r',|\s+', cleaned_answers):
|
159 |
+
if token: # Only add non-empty tokens
|
160 |
+
user_tokens.append(token.strip().lower())
|
161 |
|
162 |
# Ensure we have the same number of answers as masks
|
163 |
if len(user_tokens) != len(masked_tokens):
|
164 |
+
return f"Please provide {len(masked_tokens)} answers. You provided {len(user_tokens)}.\nFormat: word1, word2, word3"
|
165 |
|
166 |
# Compare each answer
|
167 |
correct = 0
|
|
|
352 |
|
353 |
with gr.Group() as mlm_group:
|
354 |
mlm_answer = gr.Textbox(
|
355 |
+
label="Your MLM answers (separated by commas)",
|
356 |
+
placeholder="word1, word2, word3, etc.",
|
357 |
lines=1
|
358 |
)
|
359 |
+
gr.Markdown("**Example input format:** finding, its, phishing, in, links, 49, and, it")
|
360 |
|
361 |
with gr.Group(visible=False) as ntp_group:
|
362 |
ntp_answer = gr.Textbox(
|