Spaces:

orionweller
/

human-mlm-clm-predictor

Running

App Files Community

orionweller committed 24 days ago

Commit

565fb95

verified ·

1 Parent(s): ea15511

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -8

app.py CHANGED Viewed

@@ -70,9 +70,9 @@ def prepare_mlm_sample(text, mask_ratio=0.15):
     print(f"Maskable indices count: {len(maskable_indices)}")
     print(f"Mask ratio: {mask_ratio}")
-    # Calculate how many tokens to mask, but ensure at least 1 and at most 8
-    # Use the maskable_indices length with the ratio
-    num_to_mask = max(1, min(8, int(len(maskable_indices) * mask_ratio)))
     print(f"Number of tokens to mask: {num_to_mask}")
     # Randomly select indices to mask
@@ -256,15 +256,26 @@ def prepare_next_token_prediction():
     full_hidden = original_text[len(masked_text):].strip()
     # Tokenize the hidden part
-    ntp_state["tokens"] = tokenizer.tokenize(full_hidden)
     ntp_state["full_text"] = full_hidden
     ntp_state["revealed_text"] = ""
     ntp_state["next_token_idx"] = 0
     # Make sure we have tokens to predict
     if not ntp_state["tokens"]:
-        # If we don't have tokens, get a new sample
-        new_text = get_new_sample("ntp", 0.3)
         prepare_next_token_prediction()
 def check_ntp_answer(user_continuation):
@@ -275,6 +286,12 @@ def check_ntp_answer(user_continuation):
     if not ntp_state["tokens"]:
         prepare_next_token_prediction()
     # No more tokens to predict
     if ntp_state["next_token_idx"] >= len(ntp_state["tokens"]):
         # Reset for next round
@@ -282,6 +299,7 @@ def check_ntp_answer(user_continuation):
     # Get the next token to predict
     next_token = ntp_state["tokens"][ntp_state["next_token_idx"]]
     # Get user's prediction
     user_text = user_continuation.strip()
@@ -289,6 +307,7 @@ def check_ntp_answer(user_continuation):
     # Tokenize user's prediction to get their first token
     user_tokens = tokenizer.tokenize(user_text)
     user_token = user_tokens[0].lower() if user_tokens else ""
     # Clean up tokens for comparison
     next_token_clean = next_token.lower()
@@ -300,6 +319,7 @@ def check_ntp_answer(user_continuation):
     # Check if correct
     is_correct = (user_token == next_token_clean)
     # Update stats
     if is_correct:
@@ -307,7 +327,7 @@ def check_ntp_answer(user_continuation):
     user_stats["ntp"]["total"] += 1
     # Reveal this token and prepare for next
-    ntp_state["revealed_text"] += " " + tokenizer.convert_tokens_to_string([next_token])
     ntp_state["next_token_idx"] += 1
     # Calculate overall accuracy
@@ -320,7 +340,7 @@ def check_ntp_answer(user_continuation):
         feedback.append(f"✗ Not quite. The actual next token was '{next_token_clean}'")
     # Show progress
-    feedback.append(f"\nRevealed so far: {masked_text}{ntp_state['revealed_text']}")
     # If there are more tokens, prompt for next
     if ntp_state["next_token_idx"] < len(ntp_state["tokens"]):

     print(f"Maskable indices count: {len(maskable_indices)}")
     print(f"Mask ratio: {mask_ratio}")
+    # Calculate how many tokens to mask based on the mask ratio
+    # No arbitrary cap - use the actual percentage
+    num_to_mask = max(1, int(len(maskable_indices) * mask_ratio))
     print(f"Number of tokens to mask: {num_to_mask}")
     # Randomly select indices to mask
     full_hidden = original_text[len(masked_text):].strip()
     # Tokenize the hidden part
+    hidden_tokens = tokenizer.tokenize(full_hidden)
+    # Print debug info
+    print(f"NTP State setup:")
+    print(f"  Full text: '{original_text}'")
+    print(f"  Visible text: '{masked_text}'")
+    print(f"  Hidden text: '{full_hidden}'")
+    print(f"  Hidden tokens: {hidden_tokens}")
+    # Set up the NTP state
+    ntp_state["tokens"] = hidden_tokens
     ntp_state["full_text"] = full_hidden
     ntp_state["revealed_text"] = ""
     ntp_state["next_token_idx"] = 0
     # Make sure we have tokens to predict
     if not ntp_state["tokens"]:
+        print("Warning: No tokens to predict, will try another sample")
+        # If we don't have tokens, get a new sample with a higher cut ratio
+        new_text = get_new_sample("ntp", 0.4)  # Use higher cut ratio
         prepare_next_token_prediction()
 def check_ntp_answer(user_continuation):
     if not ntp_state["tokens"]:
         prepare_next_token_prediction()
+    # Print debug info
+    print(f"Current NTP state:")
+    print(f"  Next token index: {ntp_state['next_token_idx']}")
+    print(f"  Total tokens: {len(ntp_state['tokens'])}")
+    print(f"  User input: '{user_continuation}'")
     # No more tokens to predict
     if ntp_state["next_token_idx"] >= len(ntp_state["tokens"]):
         # Reset for next round
     # Get the next token to predict
     next_token = ntp_state["tokens"][ntp_state["next_token_idx"]]
+    print(f"  Expected next token: '{next_token}'")
     # Get user's prediction
     user_text = user_continuation.strip()
     # Tokenize user's prediction to get their first token
     user_tokens = tokenizer.tokenize(user_text)
     user_token = user_tokens[0].lower() if user_tokens else ""
+    print(f"  User's tokenized input: {user_tokens}")
     # Clean up tokens for comparison
     next_token_clean = next_token.lower()
     # Check if correct
     is_correct = (user_token == next_token_clean)
+    print(f"  Comparison: '{user_token}' vs '{next_token_clean}' -> {'Correct' if is_correct else 'Incorrect'}")
     # Update stats
     if is_correct:
     user_stats["ntp"]["total"] += 1
     # Reveal this token and prepare for next
+    ntp_state["revealed_text"] += tokenizer.convert_tokens_to_string([next_token])
     ntp_state["next_token_idx"] += 1
     # Calculate overall accuracy
         feedback.append(f"✗ Not quite. The actual next token was '{next_token_clean}'")
     # Show progress
+    feedback.append(f"\nText so far: {masked_text}{ntp_state['revealed_text']}")
     # If there are more tokens, prompt for next
     if ntp_state["next_token_idx"] < len(ntp_state["tokens"]):