Spaces:

orionweller
/

human-mlm-clm-predictor

Sleeping

App Files Files Community

orionweller committed 17 days ago

Commit

bbed6df

verified ·

1 Parent(s): 8f1d1e1

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -15

app.py CHANGED Viewed

@@ -65,8 +65,8 @@ def prepare_mlm_sample(text, mask_ratio=0.15):
                         if not token.startswith("##") and not token.startswith("[") and not token.endswith("]")
                         and token not in [".", ",", "!", "?", ";", ":", "'", "\"", "-"]]
-    # Calculate how many tokens to mask
-    num_to_mask = max(1, int(len(maskable_indices) * mask_ratio))
     # Randomly select indices to mask
     indices_to_mask = random.sample(maskable_indices, min(num_to_mask, len(maskable_indices)))
@@ -87,6 +87,11 @@ def prepare_mlm_sample(text, mask_ratio=0.15):
     # Convert back to text with masks
     masked_text = tokenizer.convert_tokens_to_string(masked_tokens_list)
     return masked_text, indices_to_mask, original_tokens
 def prepare_ntp_sample(text, cut_ratio=0.3):
@@ -150,18 +155,33 @@ def check_mlm_answer(user_answers):
     """Check user MLM answers against the masked tokens."""
     global user_stats
-    # Improved parsing of user answers to better handle different formats
-    # First replace any whitespace around commas with just commas
-    cleaned_answers = re.sub(r'\s*,\s*', ',', user_answers.strip())
-    # Then split by comma or whitespace
-    user_tokens = []
-    for token in re.split(r',|\s+', cleaned_answers):
-        if token:  # Only add non-empty tokens
-            user_tokens.append(token.strip().lower())
     # Ensure we have the same number of answers as masks
     if len(user_tokens) != len(masked_tokens):
-        return f"Please provide {len(masked_tokens)} answers. You provided {len(user_tokens)}.\nFormat: word1, word2, word3"
     # Compare each answer
     correct = 0
@@ -338,6 +358,9 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
             info="Percentage of tokens to mask (MLM) or text to hide (NTP)"
         )
     sample_text = gr.Textbox(
         label="Text Sample",
         placeholder="Click 'New Sample' to get started",
@@ -351,12 +374,20 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
         reset_button = gr.Button("Reset Stats")
     with gr.Group() as mlm_group:
         mlm_answer = gr.Textbox(
-            label="Your MLM answers (separated by commas)",
-            placeholder="word1, word2, word3, etc.",
             lines=1
         )
-        gr.Markdown("**Example input format:** finding, its, phishing, in, links, 49, and, it")
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(
@@ -372,7 +403,27 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
     # Set up event handlers
     task_radio.change(switch_task, inputs=[task_radio], outputs=[mlm_group, ntp_group])
-    new_button.click(generate_new_sample, inputs=[mask_ratio], outputs=[sample_text, result])
     reset_button.click(reset_stats, inputs=None, outputs=[result])
     check_button.click(

                         if not token.startswith("##") and not token.startswith("[") and not token.endswith("]")
                         and token not in [".", ",", "!", "?", ";", ":", "'", "\"", "-"]]
+    # Calculate how many tokens to mask, but ensure at least 1 and at most 8
+    num_to_mask = max(1, min(8, int(len(maskable_indices) * mask_ratio)))
     # Randomly select indices to mask
     indices_to_mask = random.sample(maskable_indices, min(num_to_mask, len(maskable_indices)))
     # Convert back to text with masks
     masked_text = tokenizer.convert_tokens_to_string(masked_tokens_list)
+    # Print debugging info
+    print(f"Original tokens: {original_tokens}")
+    print(f"Masked indices: {indices_to_mask}")
+    print(f"Number of masks: {len(original_tokens)}")
     return masked_text, indices_to_mask, original_tokens
 def prepare_ntp_sample(text, cut_ratio=0.3):
     """Check user MLM answers against the masked tokens."""
     global user_stats
+    # Print for debugging
+    print(f"Original user input: '{user_answers}'")
+    # Handle the case where input is empty
+    if not user_answers or user_answers.isspace():
+        return "Please provide your answers. No input was detected."
+    # Basic cleanup - trim and lowercase
+    user_answers = user_answers.strip().lower()
+    print(f"After basic cleanup: '{user_answers}'")
+    # Explicit comma-based splitting with protection for empty entries
+    if ',' in user_answers:
+        # Split by commas and strip each item
+        user_tokens = [token.strip() for token in user_answers.split(',')]
+        # Filter out empty tokens
+        user_tokens = [token for token in user_tokens if token]
+    else:
+        # If no commas, split by whitespace
+        user_tokens = [token for token in user_answers.split() if token]
+    print(f"Parsed tokens: {user_tokens}, count: {len(user_tokens)}")
+    print(f"Expected tokens: {masked_tokens}, count: {len(masked_tokens)}")
     # Ensure we have the same number of answers as masks
     if len(user_tokens) != len(masked_tokens):
+        return f"Please provide exactly {len(masked_tokens)} answers (one for each [MASK]). You provided {len(user_tokens)}.\n\nFormat example: word1, word2, word3"
     # Compare each answer
     correct = 0
             info="Percentage of tokens to mask (MLM) or text to hide (NTP)"
         )
+    # Count the visible [MASK] tokens for user reference
+    mask_count = gr.Markdown("**Number of [MASK] tokens to guess: 0**")
     sample_text = gr.Textbox(
         label="Text Sample",
         placeholder="Click 'New Sample' to get started",
         reset_button = gr.Button("Reset Stats")
     with gr.Group() as mlm_group:
+        mlm_instructions = gr.Markdown("""
+        ### MLM Instructions
+        1. For each [MASK] token, provide your guess for the original word.
+        2. Separate your answers with commas.
+        3. Make sure you provide exactly the same number of answers as [MASK] tokens.
+        **Example format:** `word1, word2, word3` or `word1,word2,word3`
+        """)
         mlm_answer = gr.Textbox(
+            label="Your answers (comma-separated)",
+            placeholder="word1, word2, word3",
             lines=1
         )
     with gr.Group(visible=False) as ntp_group:
         ntp_answer = gr.Textbox(
     # Set up event handlers
     task_radio.change(switch_task, inputs=[task_radio], outputs=[mlm_group, ntp_group])
+    # Update the sample text and also update the mask count
+    def new_sample_with_count(mask_ratio_pct, task):
+        ratio = float(mask_ratio_pct) / 100.0
+        sample = get_new_sample(task, ratio)
+        mask_count_text = ""
+        if task == "mlm":
+            count = len(masked_tokens)
+            mask_count_text = f"**Number of [MASK] tokens to guess: {count}**"
+        else:
+            mask_count_text = "**Next Token Prediction mode - guess one token at a time**"
+        return sample, mask_count_text, ""
+    new_button.click(
+        new_sample_with_count,
+        inputs=[mask_ratio, task_radio],
+        outputs=[sample_text, mask_count, result]
+    )
     reset_button.click(reset_stats, inputs=None, outputs=[result])
     check_button.click(