Spaces:

orionweller
/

human-mlm-clm-predictor

Sleeping

App Files Files Community

orionweller committed 23 days ago

Commit

59fd051

verified ·

1 Parent(s): d1414a2

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -41

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer
 # Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 # Initialize variables to track stats
 user_stats = {
@@ -28,8 +28,13 @@ def load_sample_data(sample_size=100):
             # Clean text by removing extra whitespaces
             text = re.sub(r'\s+', ' ', example["text"]).strip()
             # Only include longer texts to make the task meaningful
-            if len(text.split()) > 50:
-                samples.append(text)
     return samples
@@ -81,8 +86,14 @@ def prepare_ntp_sample(text, cut_ratio=0.3):
     # Tokenize text to ensure reasonable cutting
     tokens = tokenizer.tokenize(text)
     # Calculate cutoff point (70% of tokens if cut_ratio is 0.3)
-    cutoff = int(len(tokens) * (1 - cut_ratio))
     # Get the visible part
     visible_tokens = tokens[:cutoff]
@@ -98,7 +109,7 @@ def prepare_ntp_sample(text, cut_ratio=0.3):
 def get_new_sample(task, mask_ratio=0.15):
     """Get a new text sample based on the task."""
-    global current_sample, masked_text, masked_indices, masked_tokens, original_text
     # Select a random sample
     current_sample = random.choice(data_samples)
@@ -113,6 +124,18 @@ def get_new_sample(task, mask_ratio=0.15):
         # Store original and visible for comparison
         original_text = current_sample
         masked_text = visible_text
         return visible_text
 def check_mlm_answer(user_answers):
@@ -159,52 +182,95 @@ def check_mlm_answer(user_answers):
     return "\n".join(feedback)
 def check_ntp_answer(user_continuation):
-    """Check user NTP answer against the original text."""
-    global user_stats, original_text, masked_text
-    # Get the hidden part of the original text
-    hidden_text = original_text[len(masked_text):].strip()
     user_text = user_continuation.strip()
-    # Tokenize for better comparison
-    hidden_tokens = tokenizer.tokenize(hidden_text)
     user_tokens = tokenizer.tokenize(user_text)
-    # Calculate overlap using first few tokens (more lenient)
-    max_compare = min(10, len(hidden_tokens), len(user_tokens))
-    if max_compare == 0:
-        return "Error: No hidden tokens to compare with."
-    correct = 0
-    for i in range(max_compare):
-        hidden_token = hidden_tokens[i].lower()
-        user_token = user_tokens[i].lower() if i < len(user_tokens) else ""
-        # Remove ## from subword tokens
-        if hidden_token.startswith("##"):
-            hidden_token = hidden_token[2:]
-        if user_token.startswith("##"):
-            user_token = user_token[2:]
-        if user_token == hidden_token:
-            correct += 1
     # Update stats
-    user_stats["ntp"]["correct"] += correct
-    user_stats["ntp"]["total"] += max_compare
-    # Calculate accuracy
-    accuracy = correct / max_compare
-    accuracy_percentage = accuracy * 100
-    feedback = [f"Your prediction accuracy: {correct}/{max_compare} ({accuracy_percentage:.1f}%)"]
-    # Show original continuation
-    feedback.append(f"\nActual continuation:\n{hidden_text}")
-    # Calculate overall stats
-    overall_accuracy = user_stats["ntp"]["correct"] / user_stats["ntp"]["total"] if user_stats["ntp"]["total"] > 0 else 0
     feedback.append(f"\nOverall NTP Accuracy: {user_stats['ntp']['correct']}/{user_stats['ntp']['total']} ({overall_accuracy*100:.1f}%)")
     return "\n".join(feedback)
@@ -279,9 +345,9 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(
-            label="Your NTP continuation",
-            placeholder="Predict how the text continues...",
-            lines=3
         )
     with gr.Row():

 from transformers import AutoTokenizer
 # Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("answerdotai/modernbert-base")
 # Initialize variables to track stats
 user_stats = {
             # Clean text by removing extra whitespaces
             text = re.sub(r'\s+', ' ', example["text"]).strip()
             # Only include longer texts to make the task meaningful
+            if len(text.split()) > 20:
+                # Truncate to two sentences
+                sentences = re.split(r'(?<=[.!?])\s+', text)
+                if len(sentences) >= 2:
+                    # Take only the first two sentences
+                    two_sentence_text = ' '.join(sentences[:2])
+                    samples.append(two_sentence_text)
     return samples
     # Tokenize text to ensure reasonable cutting
     tokens = tokenizer.tokenize(text)
+    # Ensure we have enough tokens
+    if len(tokens) < 5:
+        return text, ""  # Return original if too short
     # Calculate cutoff point (70% of tokens if cut_ratio is 0.3)
+    # But make sure we have at least 3 tokens visible and 1 token hidden
+    cutoff = max(3, int(len(tokens) * (1 - cut_ratio)))
+    cutoff = min(cutoff, len(tokens) - 1)  # Ensure there's at least 1 token to predict
     # Get the visible part
     visible_tokens = tokens[:cutoff]
 def get_new_sample(task, mask_ratio=0.15):
     """Get a new text sample based on the task."""
+    global current_sample, masked_text, masked_indices, masked_tokens, original_text, ntp_state
     # Select a random sample
     current_sample = random.choice(data_samples)
         # Store original and visible for comparison
         original_text = current_sample
         masked_text = visible_text
+        # Reset NTP state for new iteration
+        ntp_state = {
+            "full_text": "",
+            "revealed_text": "",
+            "next_token_idx": 0,
+            "tokens": []
+        }
+        # Prepare for token-by-token prediction
+        prepare_next_token_prediction()
         return visible_text
 def check_mlm_answer(user_answers):
     return "\n".join(feedback)
+# Variable to store NTP state
+ntp_state = {
+    "full_text": "",
+    "revealed_text": "",
+    "next_token_idx": 0,
+    "tokens": []
+}
+def prepare_next_token_prediction():
+    """Prepare for the next token prediction."""
+    global ntp_state, masked_text, original_text
+    # Get the hidden part
+    full_hidden = original_text[len(masked_text):].strip()
+    # Tokenize the hidden part
+    ntp_state["tokens"] = tokenizer.tokenize(full_hidden)
+    ntp_state["full_text"] = full_hidden
+    ntp_state["revealed_text"] = ""
+    ntp_state["next_token_idx"] = 0
+    # Make sure we have tokens to predict
+    if not ntp_state["tokens"]:
+        # If we don't have tokens, get a new sample
+        new_text = get_new_sample("ntp", 0.3)
+        prepare_next_token_prediction()
 def check_ntp_answer(user_continuation):
+    """Check user NTP answer for the next token only."""
+    global user_stats, ntp_state, masked_text
+    # If we haven't set up NTP state yet, do it now
+    if not ntp_state["tokens"]:
+        prepare_next_token_prediction()
+    # No more tokens to predict
+    if ntp_state["next_token_idx"] >= len(ntp_state["tokens"]):
+        # Reset for next round
+        return "You've completed this prediction! Click 'New Sample' for another."
+    # Get the next token to predict
+    next_token = ntp_state["tokens"][ntp_state["next_token_idx"]]
+    # Get user's prediction
     user_text = user_continuation.strip()
+    # Tokenize user's prediction to get their first token
     user_tokens = tokenizer.tokenize(user_text)
+    user_token = user_tokens[0].lower() if user_tokens else ""
+    # Clean up tokens for comparison
+    next_token_clean = next_token.lower()
+    if next_token_clean.startswith("##"):
+        next_token_clean = next_token_clean[2:]
+    if user_token.startswith("##"):
+        user_token = user_token[2:]
+    # Check if correct
+    is_correct = (user_token == next_token_clean)
     # Update stats
+    if is_correct:
+        user_stats["ntp"]["correct"] += 1
+    user_stats["ntp"]["total"] += 1
+    # Reveal this token and prepare for next
+    ntp_state["revealed_text"] += " " + tokenizer.convert_tokens_to_string([next_token])
+    ntp_state["next_token_idx"] += 1
+    # Calculate overall accuracy
+    overall_accuracy = user_stats["ntp"]["correct"] / user_stats["ntp"]["total"] if user_stats["ntp"]["total"] > 0 else 0
+    feedback = []
+    if is_correct:
+        feedback.append(f"✓ Correct! The next token was indeed '{next_token_clean}'")
+    else:
+        feedback.append(f"✗ Not quite. The actual next token was '{next_token_clean}'")
+    # Show progress
+    feedback.append(f"\nRevealed so far: {masked_text}{ntp_state['revealed_text']}")
+    # If there are more tokens, prompt for next
+    if ntp_state["next_token_idx"] < len(ntp_state["tokens"]):
+        feedback.append(f"\nPredict the next token...")
+    else:
+        feedback.append(f"\nPrediction complete! Full text was:\n{original_text}")
+    # Show overall stats
     feedback.append(f"\nOverall NTP Accuracy: {user_stats['ntp']['correct']}/{user_stats['ntp']['total']} ({overall_accuracy*100:.1f}%)")
     return "\n".join(feedback)
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(
+            label="Your Next Token Prediction",
+            placeholder="Predict the next token/word...",
+            lines=1
         )
     with gr.Row():