anamargarida committed on
Commit
b43e4bd
·
verified ·
1 Parent(s): adc4b2b

Rename app_26.py to app_27.py

Browse files
Files changed (1) hide show
  1. app_26.py → app_27.py +67 -2
app_26.py → app_27.py RENAMED
@@ -331,14 +331,79 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
331
 
332
  return text
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
 
336
 
337
  # Apply the tags to the sentence tokens
338
- tagged_sentence1 = add_tags_offset_2(input_text, start_cause1, end_cause1, start_effect1, end_effect1, start_signal, end_signal)
339
- tagged_sentence2 = add_tags_offset_2(input_text, start_cause2, end_cause2, start_effect2, end_effect2, start_signal, end_signal)
340
  return tagged_sentence1, tagged_sentence2
341
 
 
342
 
343
 
344
 
 
331
 
332
  return text
333
 
334
+ import re
335
+
336
def add_tags_offset_3(text, start_cause, end_cause, start_effect, end_effect,
                      start_signal, end_signal, offset_mapping=None):
    """
    Insert <ARG0>/<ARG1>/<SIG0> tags into the original text based on token
    offsets, ensuring correct nesting, avoiding empty tags, preventing
    duplication, and handling punctuation placement.

    Args:
        text (str): The original input text.
        start_cause (int): Start token index of the cause span.
        end_cause (int): End token index of the cause span.
        start_effect (int): Start token index of the effect span.
        end_effect (int): End token index of the effect span.
        start_signal (int, optional): Start token index of the signal span.
        end_signal (int, optional): End token index of the signal span.
        offset_mapping (list of tuple, optional): Maps token indices to
            (start_char, end_char) spans in ``text`` (e.g. a tokenizer's
            ``return_offsets_mapping`` output). The original code read this
            name as a free variable even though the docstring documented it
            as a parameter; it is now an explicit keyword argument so the
            function is self-contained. Callers that relied on an
            enclosing-scope variable must now pass it explicitly.

    Returns:
        str: The modified text with correctly positioned annotated spans.

    Raises:
        TypeError: If ``offset_mapping`` is not supplied.
    """
    if offset_mapping is None:
        # Fail fast with a clear message instead of the NameError the
        # original free-variable lookup would have produced at first use.
        raise TypeError("add_tags_offset_3() requires an offset_mapping "
                        "(token index -> character span) to place tags")

    # Collected as (start_char, end_char, open_tag, close_tag).
    spans = []

    def adjust_start(text, start):
        # Move a span start past leading punctuation/whitespace so a tag
        # never opens immediately before ',', ' ', '.', ';' or ':'.
        while start < len(text) and text[start] in {',', ' ', '.', ';', ':'}:
            start += 1
        return start

    # Only tag non-empty spans: indices present and start strictly < end.
    if start_cause is not None and end_cause is not None and start_cause < end_cause:
        start_cause_char, end_cause_char = offset_mapping[start_cause][0], offset_mapping[end_cause][1]
        spans.append((start_cause_char, end_cause_char, "<ARG0>", "</ARG0>"))

    if start_effect is not None and end_effect is not None and start_effect < end_effect:
        start_effect_char, end_effect_char = offset_mapping[start_effect][0], offset_mapping[end_effect][1]
        start_effect_char = adjust_start(text, start_effect_char)  # skip punctuation
        spans.append((start_effect_char, end_effect_char, "<ARG1>", "</ARG1>"))

    if start_signal is not None and end_signal is not None and start_signal < end_signal:
        start_signal_char, end_signal_char = offset_mapping[start_signal][0], offset_mapping[end_signal][1]
        spans.append((start_signal_char, end_signal_char, "<SIG0>", "</SIG0>"))

    # Insert right-to-left so earlier insertions don't shift later offsets.
    spans.sort(reverse=True, key=lambda x: x[0])

    modified_text = text
    inserted_positions = []

    for start, end, open_tag, close_tag in spans:
        # Account for any tag already inserted at or before this position.
        shift = sum(len(tag) for pos, tag in inserted_positions if pos <= start)
        start += shift
        end += shift

        # Guard against empty tags after adjustment.
        if start < end:
            modified_text = (modified_text[:start] + open_tag +
                             modified_text[start:end] + close_tag +
                             modified_text[end:])
            inserted_positions.append((start, open_tag))
            inserted_positions.append((end + len(open_tag), close_tag))

    return modified_text
398
 
399
 
400
 
401
  # Apply the tags to the sentence tokens
402
+ tagged_sentence1 = add_tags_offset_3(input_text, start_cause1, end_cause1, start_effect1, end_effect1, start_signal, end_signal)
403
+ tagged_sentence2 = add_tags_offset_3(input_text, start_cause2, end_cause2, start_effect2, end_effect2, start_signal, end_signal)
404
  return tagged_sentence1, tagged_sentence2
405
 
406
+
407
 
408
 
409