Spaces:

orionweller
/

human-mlm-clm-predictor

Sleeping

App Files Community

orionweller committed 22 days ago

Commit

cbbc299

verified ·

1 Parent(s): 59fd051

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -11

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer
 # Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("answerdotai/modernbert-base")
 # Initialize variables to track stats
 user_stats = {
@@ -14,19 +14,27 @@ user_stats = {
     "ntp": {"correct": 0, "total": 0}
 }
-# Function to load and sample from cc_news dataset
 def load_sample_data(sample_size=100):
-    dataset = load_dataset("vblagoje/cc_news", streaming=True)
     # Sample from the dataset
     samples = []
     for i, example in enumerate(dataset["train"]):
         if i >= sample_size:
             break
-        # Only use text field
-        if "text" in example and example["text"]:
             # Clean text by removing extra whitespaces
-            text = re.sub(r'\s+', ' ', example["text"]).strip()
             # Only include longer texts to make the task meaningful
             if len(text.split()) > 20:
                 # Truncate to two sentences
@@ -142,12 +150,18 @@ def check_mlm_answer(user_answers):
     """Check user MLM answers against the masked tokens."""
     global user_stats
-    # Split user answers by spaces or commas
-    user_tokens = [token.strip().lower() for token in re.split(r'[,\s]+', user_answers)]
     # Ensure we have the same number of answers as masks
     if len(user_tokens) != len(masked_tokens):
-        return f"Please provide {len(masked_tokens)} answers. You provided {len(user_tokens)}."
     # Compare each answer
     correct = 0
@@ -338,10 +352,11 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
     with gr.Group() as mlm_group:
         mlm_answer = gr.Textbox(
-            label="Your MLM answers (separated by spaces or commas)",
-            placeholder="Type your guesses for the masked words",
             lines=1
         )
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(

 from transformers import AutoTokenizer
 # Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 # Initialize variables to track stats
 user_stats = {
     "ntp": {"correct": 0, "total": 0}
 }
+# Function to load and sample from the requested dataset
 def load_sample_data(sample_size=100):
+    try:
+        # Try to load the requested dataset
+        dataset = load_dataset("mlfoundations/dclm-baseline-1.0-parquet", streaming=True)
+        dataset_field = "text"  # Assuming the field name is "text"
+    except Exception as e:
+        print(f"Error loading requested dataset: {e}")
+        # Fallback to cc_news if there's an issue
+        dataset = load_dataset("vblagoje/cc_news", streaming=True)
+        dataset_field = "text"
     # Sample from the dataset
     samples = []
     for i, example in enumerate(dataset["train"]):
         if i >= sample_size:
             break
+        # Get text from the appropriate field
+        if dataset_field in example and example[dataset_field]:
             # Clean text by removing extra whitespaces
+            text = re.sub(r'\s+', ' ', example[dataset_field]).strip()
             # Only include longer texts to make the task meaningful
             if len(text.split()) > 20:
                 # Truncate to two sentences
     """Check user MLM answers against the masked tokens."""
     global user_stats
+    # Improved parsing of user answers to better handle different formats
+    # First replace any whitespace around commas with just commas
+    cleaned_answers = re.sub(r'\s*,\s*', ',', user_answers.strip())
+    # Then split by comma or whitespace
+    user_tokens = []
+    for token in re.split(r',|\s+', cleaned_answers):
+        if token:  # Only add non-empty tokens
+            user_tokens.append(token.strip().lower())
     # Ensure we have the same number of answers as masks
     if len(user_tokens) != len(masked_tokens):
+        return f"Please provide {len(masked_tokens)} answers. You provided {len(user_tokens)}.\nFormat: word1, word2, word3"
     # Compare each answer
     correct = 0
     with gr.Group() as mlm_group:
         mlm_answer = gr.Textbox(
+            label="Your MLM answers (separated by commas)",
+            placeholder="word1, word2, word3, etc.",
             lines=1
         )
+        gr.Markdown("**Example input format:** finding, its, phishing, in, links, 49, and, it")
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(