AbstractPhil committed on
Commit 42a1b9e · verified · 1 Parent(s): 2ef5341

Update app.py

Files changed (1):
  app.py (+198 −83)
app.py CHANGED
@@ -205,6 +205,7 @@ def create_strategic_masks(text, tokenizer, strategy="content_words"):
 def symbolic_classification_analysis(text, selected_roles, masking_strategy="content_words", num_predictions=5):
     """
     Perform symbolic classification analysis using MLM prediction
+    FIXED: Now tests what the model actually learned
     """
     if not selected_roles:
         selected_roles = list(symbolic_token_ids.keys())
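
Reviewer note: the dispatch added in the next hunk keys off `symbolic_token_ids`, which this function also uses as its default role list. A minimal sketch of the assumed shape of that module-level mapping (the real roles and ids are defined elsewhere in app.py; the values below are illustrative placeholders only):

    # Assumed shape of the module-level lookup used throughout this diff.
    # Role strings and ids are placeholders, not the Space's actual values.
    symbolic_token_ids = {
        "<subject>": 30522,
        "<lighting>": 30523,
        "<emotion>": 30524,
    }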
@@ -213,86 +214,192 @@ def symbolic_classification_analysis(text, selected_roles, masking_strategy="content_words", num_predictions=5):
         return "Please enter some text to analyze.", "", 0
 
     try:
-        # Create strategically masked input
-        masked_input_ids, attention_mask, original_tokens, mask_positions = create_strategic_masks(
-            text, tokenizer, masking_strategy
-        )
-
-        if not mask_positions:
-            return "No suitable positions found for masking. Try different text or strategy.", "", 0
-
-        # Move to device
-        masked_input_ids = masked_input_ids.to("cuda")
-        attention_mask = attention_mask.to("cuda")
-
-        # Get symbolic predictions
-        predictions = get_symbolic_predictions(
-            masked_input_ids, attention_mask, mask_positions, selected_roles
-        )
-
-        # Build detailed analysis
-        analysis = {
-            "input_text": text,
-            "masking_strategy": masking_strategy,
-            "total_tokens": len(original_tokens),
-            "masked_positions": len(mask_positions),
-            "available_symbolic_roles": len(selected_roles),
-            "analysis_results": []
-        }
-
-        for pred_data in predictions:
-            pos = pred_data["position"]
-            original_token = original_tokens[pos]
-
-            # Show top N predictions
-            top_preds = pred_data["predictions"][:num_predictions]
-
-            position_analysis = {
-                "position": pos,
-                "original_token": original_token,
-                "top_predictions": []
-            }
-
-            for pred in top_preds:
-                position_analysis["top_predictions"].append({
-                    "symbolic_role": pred["token"],
-                    "probability": f"{pred['probability']:.4f}",
-                    "confidence": "High" if pred["probability"] > 0.3 else "Medium" if pred["probability"] > 0.1 else "Low"
-                })
-
-            analysis["analysis_results"].append(position_analysis)
-
-        # Create readable summary
-        summary_lines = []
-        max_prob = 0
-        best_prediction = None
-
-        for result in analysis["analysis_results"]:
-            pos = result["position"]
-            orig = result["original_token"]
-            top_pred = result["top_predictions"][0] if result["top_predictions"] else None
-
-            if top_pred:
-                prob = float(top_pred["probability"])
-                role = top_pred["symbolic_role"]
-                summary_lines.append(
-                    f"Position {pos:2d}: '{orig}' → {role} ({top_pred['probability']}, {top_pred['confidence']})"
-                )
-
-                if prob > max_prob:
-                    max_prob = prob
-                    best_prediction = f"{role} (confidence: {top_pred['confidence']})"
-
-        summary = "\n".join(summary_lines)
-        if best_prediction:
-            summary = f"🎯 Best Match: {best_prediction}\n\n" + summary
-
-        return json.dumps(analysis, indent=2), summary, len(mask_positions)
-
-    except Exception as e:
-        error_msg = f"Error during analysis: {str(e)}"
-        print(error_msg)
-        return error_msg, "", 0
+        # DETECT if input follows training pattern vs needs conversion
+        if any(role in text for role in symbolic_token_ids.keys()):
+            # Input already has symbolic tokens - test descriptive prediction
+            return test_descriptive_prediction(text, selected_roles, num_predictions)
+        else:
+            # Convert input to training-style format and test
+            return test_with_context_injection(text, selected_roles, num_predictions)
+
+    except Exception as e:
+        error_msg = f"Error during analysis: {str(e)}"
+        print(error_msg)
+        return error_msg, "", 0
+
+
+def test_descriptive_prediction(text, selected_roles, num_predictions):
+    """
+    Test what descriptive words the model predicts after symbolic tokens
+    This matches the actual training objective
+    """
+    # Find positions after symbolic tokens
+    tokens = tokenizer.tokenize(text, add_special_tokens=True)
+    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # Find symbolic token positions
+    symbolic_positions = []
+    for i, token in enumerate(tokens):
+        if token in symbolic_token_ids:
+            # Mask the next 1-3 positions after symbolic token
+            for offset in range(1, min(4, len(tokens) - i)):
+                if i + offset < len(tokens) and tokens[i + offset] not in ['[SEP]', '[PAD]']:
+                    symbolic_positions.append({
+                        'mask_pos': i + offset,
+                        'symbolic_token': token,
+                        'original_token': tokens[i + offset]
+                    })
+
+    if not symbolic_positions:
+        return "No symbolic tokens found in input. Try format like: '<subject> a young woman'", "", 0
+
+    # Create masked versions and get predictions
+    results = []
+    for pos_info in symbolic_positions[:5]:  # Limit to 5 positions
+        masked_ids = token_ids.copy()
+        masked_ids[pos_info['mask_pos']] = MASK_ID
+
+        # Get MLM predictions
+        masked_input = torch.tensor([masked_ids]).to("cuda")
+        attention_mask = torch.ones_like(masked_input)
+
+        with torch.no_grad():
+            outputs = full_model(input_ids=masked_input, attention_mask=attention_mask)
+            logits = outputs.logits[0, pos_info['mask_pos']]  # Logits for masked position
+
+        # Get top predictions from full vocabulary
+        probs = F.softmax(logits, dim=-1)
+        top_indices = torch.argsort(probs, descending=True)[:num_predictions]
+
+        predictions = []
+        for idx in top_indices:
+            token_text = tokenizer.convert_ids_to_tokens([idx.item()])[0]
+            prob = probs[idx].item()
+            predictions.append({
+                "token": token_text,
+                "probability": prob
+            })
+
+        results.append({
+            "symbolic_context": pos_info['symbolic_token'],
+            "position": pos_info['mask_pos'],
+            "original_token": pos_info['original_token'],
+            "predictions": predictions
+        })
+
+    # Format results
+    analysis = {
+        "input_text": text,
+        "test_type": "descriptive_prediction",
+        "explanation": "Testing what descriptive words model predicts after symbolic tokens",
+        "results": results
+    }
+
+    summary_lines = ["🎯 Testing Descriptive Prediction (what model actually learned)\n"]
+    for result in results:
+        ctx = result["symbolic_context"]
+        orig = result["original_token"]
+        top_pred = result["predictions"][0]
+
+        summary_lines.append(
+            f"After {ctx}: '{orig}' → '{top_pred['token']}' ({top_pred['probability']:.4f})"
+        )
+
+    summary = "\n".join(summary_lines)
+    return json.dumps(analysis, indent=2), summary, len(results)
+
+
+def test_with_context_injection(text, selected_roles, num_predictions):
+    """
+    Inject symbolic context and test what descriptive words are predicted
+    """
+    results = []
+
+    # Test each selected symbolic role as context
+    for role in selected_roles[:3]:  # Limit to 3 roles for speed
+        # Create training-style context
+        context_text = f"{role} {text}"
+
+        # Tokenize and find good positions to mask
+        tokens = tokenizer.tokenize(context_text, add_special_tokens=True)
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # Find role position and mask next content word
+        role_pos = None
+        for i, token in enumerate(tokens):
+            if token == role:
+                role_pos = i
+                break
+
+        if role_pos is None or role_pos + 2 >= len(tokens):
+            continue
+
+        # Mask position after role (skip articles like "a", "the")
+        mask_pos = role_pos + 1
+        skip_words = {'a', 'an', 'the', 'some', 'this', 'that'}
+        while mask_pos < len(tokens) - 1:
+            current_token = tokens[mask_pos].lower()
+            if current_token not in skip_words and len(current_token) > 2:
+                break
+            mask_pos += 1
+
+        if mask_pos >= len(tokens):
+            continue
+
+        # Create masked input
+        masked_ids = token_ids.copy()
+        original_token = tokens[mask_pos]
+        masked_ids[mask_pos] = MASK_ID
+
+        # Get predictions
+        masked_input = torch.tensor([masked_ids]).to("cuda")
+        attention_mask = torch.ones_like(masked_input)
+
+        with torch.no_grad():
+            outputs = full_model(input_ids=masked_input, attention_mask=attention_mask)
+            logits = outputs.logits[0, mask_pos]
+
+        # Get top predictions
+        probs = F.softmax(logits, dim=-1)
+        top_indices = torch.argsort(probs, descending=True)[:num_predictions]
+
+        predictions = []
+        for idx in top_indices:
+            token_text = tokenizer.convert_ids_to_tokens([idx.item()])[0]
+            prob = probs[idx].item()
+            predictions.append({
+                "token": token_text,
+                "probability": prob
+            })
+
+        results.append({
+            "symbolic_context": role,
+            "position": mask_pos,
+            "original_token": original_token,
+            "context_text": context_text,
+            "predictions": predictions
+        })
+
+    # Format results
+    analysis = {
+        "input_text": text,
+        "test_type": "context_injection",
+        "explanation": "Injected symbolic tokens and tested descriptive predictions",
+        "results": results
+    }
+
+    summary_lines = ["🎯 Testing with Symbolic Context Injection\n"]
+    for result in results:
+        role = result["symbolic_context"]
+        orig = result["original_token"]
+        top_pred = result["predictions"][0]
+
+        summary_lines.append(
+            f"{role} context: '{orig}' → '{top_pred['token']}' ({top_pred['probability']:.4f})"
+        )
+
+    summary = "\n".join(summary_lines)
+    return json.dumps(analysis, indent=2), summary, len(results)
 
 
 def create_manual_mask_analysis(text, mask_positions_str, selected_roles):
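
Reviewer note: both new helpers reduce to the same core step, replace one token with [MASK], run the MLM head, and read the top-k probabilities at that position. A self-contained sketch of that step, using a stock checkpoint since the Space's own `full_model` and `tokenizer` are loaded elsewhere in app.py (the checkpoint name here is an assumption for illustration only):

    import torch
    import torch.nn.functional as F
    from transformers import AutoTokenizer, AutoModelForMaskedLM

    # Stock checkpoint purely for illustration; the Space loads its own model.
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()

    enc = tok("a young woman wearing a blue dress", return_tensors="pt")
    mask_pos = 3  # probe position chosen by the caller ("woman")
    enc["input_ids"][0, mask_pos] = tok.mask_token_id

    with torch.no_grad():
        logits = mlm(**enc).logits[0, mask_pos]  # MLM logits at the masked slot

    probs = F.softmax(logits, dim=-1)
    top = torch.topk(probs, k=5)  # same result as the diff's argsort(...)[:num_predictions]
    for p, idx in zip(top.values, top.indices):
        print(f"{tok.convert_ids_to_tokens([idx.item()])[0]}: {p.item():.4f}")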
@@ -361,7 +468,7 @@ def build_interface():
         txt_input = gr.Textbox(
             label="Input Text",
             lines=4,
-            placeholder="Enter text to analyze for symbolic role classification..."
+            placeholder="Try: '<subject> a young woman wearing elegant dress' or just 'young woman wearing dress'"
         )
 
         with gr.Row():
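
One assumption worth flagging for these placeholder formats: `test_descriptive_prediction` compares raw `tokenizer.tokenize` output against role strings, so a role like `<subject>` only matches if it was registered as a single added token. A sketch of that registration, which app.py presumably performs at load time (checkpoint name illustrative):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
    roles = ["<subject>", "<lighting>", "<emotion>"]
    tok.add_tokens(roles)  # keeps each role atomic in tokenize() output
    print(tok.tokenize("<subject> a young woman"))
    # ['<subject>', 'a', 'young', 'woman']; without add_tokens the role would
    # split into pieces like '<', 'subject', '>' and never match symbolic_token_ids.

(A model consuming the new ids would also need `model.resize_token_embeddings(len(tok))`.)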
@@ -450,23 +557,31 @@ def build_interface():
         )
 
         with gr.Tab("Caption Examples"):
-            gr.Markdown("### 🖼️ Test with Realistic Image Caption Patterns")
+            gr.Markdown("### 🖼️ Test with Training-Style Patterns")
+            gr.Markdown("""
+            **The model was trained to predict descriptive words AFTER symbolic tokens.**
+
+            Test with patterns like:
+            - `<subject> a young woman wearing elegant dress`
+            - `<lighting> soft natural illumination on the scene`
+            - `<emotion> happy expression while posing confidently`
+            """)
 
             example_captions = [
-                "a young woman wearing a blue dress",
-                "the man has short brown hair",
-                "she is wearing red high heels",
-                "the lighting is soft and natural",
-                "her expression shows happiness",
-                "a person standing in confident pose",
-                "wearing elegant silver jewelry",
-                "the fabric has floral pattern"
+                "<subject> a young woman wearing a blue dress",
+                "<lighting> soft natural illumination in the scene",
+                "<emotion> happy expression while posing confidently",
+                "<pose> standing gracefully near the window",
+                "<upper_body_clothing> elegant silk blouse with intricate patterns",
+                "<material> luxurious velvet fabric with rich texture",
+                "<accessory> delicate silver jewelry catching the light",
+                "<surface> polished marble floor reflecting ambient glow"
             ]
 
             for caption in example_captions:
                 with gr.Row():
-                    gr.Textbox(value=caption, label="Example Caption", interactive=False, scale=3)
-                    copy_btn = gr.Button("📋 Copy", scale=1)
+                    gr.Textbox(value=caption, label="Training-Style Example", interactive=False, scale=3)
+                    copy_btn = gr.Button("📋 Test This", scale=1)
 
         # Event handlers
         analyze_btn.click(
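
The new examples exercise both paths through the rewritten entry point: captions that already start with a role token are routed to `test_descriptive_prediction`, and plain captions go through `test_with_context_injection`. A hypothetical smoke test of that dispatch, to be run inside the app's environment where the module-level model and tokenizer are already loaded:

    # Hypothetical smoke test for the new dispatch; not part of the commit.
    for caption in [
        "<subject> a young woman wearing a blue dress",  # role token -> descriptive prediction
        "young woman wearing dress",                     # plain text -> context injection
    ]:
        analysis_json, summary, n = symbolic_classification_analysis(
            caption, selected_roles=["<subject>"], num_predictions=5
        )
        print(summary, f"({n} masked positions)\n")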
 