Rename app_23.py to app_24.py
app_23.py → app_24.py (RENAMED, +48 -3)
@@ -60,7 +60,8 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
         self.pretrained_signal_detector = False

     args = Args()
-    inputs = tokenizer(text, return_tensors="pt")
+    inputs = tokenizer(text, return_offsets_mapping=True, return_tensors="pt")
+
     # Get tokenized words (for reconstruction later)
     word_ids = inputs.word_ids()

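The new tokenizer call above asks for character offsets alongside the token ids. As a rough, standalone illustration (not part of the commit; the checkpoint name is only an assumed example), a fast Hugging Face tokenizer returns an offset_mapping that pairs each token with its (start, end) character span in the input text:

# Illustrative sketch only; assumes any fast tokenizer checkpoint, e.g. bert-base-uncased
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("Heavy rain caused flooding.", return_offsets_mapping=True, return_tensors="pt")
print(inputs["offset_mapping"][0].tolist())
# e.g. [[0, 0], [0, 5], [6, 10], [11, 17], [18, 26], [26, 27], [0, 0]]
# Special tokens ([CLS]/[SEP]) map to (0, 0); every other pair is a character span in the text.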
@@ -133,9 +134,14 @@ def extract_arguments(text, tokenizer, model, beam_search=True):

     tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
     token_ids = inputs["input_ids"][0]
+    offset_mapping = inputs["offset_mapping"][0].tolist()

     for i, (token, word_id) in enumerate(zip(tokens, word_ids)):
         st.write(f"Token {i}: {token}, Word ID: {word_id}")
+
+    st.write("Token & offset:")
+    for i, (token, offset) in enumerate(zip(tokens, offset_mapping)):
+        st.write(f"Token {i}: {token}, Offset: {offset}")


     st.write("Token Positions, IDs, and Corresponding Tokens:")
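A side note on the two debug listings above (the loop below is illustrative and not part of the commit, though the variable names follow the diff): with typical fast tokenizers, special tokens such as [CLS] and [SEP] appear with Word ID None and offset (0, 0), so they can be skipped when only real tokens matter:

# Illustrative only: list non-special tokens with their word ids and character spans
for i, (token, word_id, offset) in enumerate(zip(tokens, word_ids, offset_mapping)):
    if word_id is None:  # special token such as [CLS] or [SEP]
        continue
    st.write(f"Token {i}: {token}, Word ID: {word_id}, Offset: {offset}")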
@@ -250,11 +256,50 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
         # Join tokens back into a string
         return ' '.join(this_space_splitted_tokens)

+    def add_tags_offset(text, start_cause, end_cause, start_effect, end_effect, start_signal, end_signal):
+        """
+        Inserts tags into the original text based on token offsets.
+
+        Args:
+            text (str): The original input text.
+            tokenizer: The tokenizer used for tokenization.
+            start_cause (int): Start token index of the cause span.
+            end_cause (int): End token index of the cause span.
+            start_effect (int): Start token index of the effect span.
+            end_effect (int): End token index of the effect span.
+            start_signal (int, optional): Start token index of the signal span.
+            end_signal (int, optional): End token index of the signal span.
+
+        Returns:
+            str: The modified text with annotated spans.
+        """
+
+
+
+        # Convert token-based indices to character-based indices
+        start_cause_char, end_cause_char = offset_mapping[start_cause][0], offset_mapping[end_cause][1]
+        start_effect_char, end_effect_char = offset_mapping[start_effect][0], offset_mapping[end_effect][1]
+
+        # Insert tags into the original text
+        annotated_text = text[:start_cause_char] + "<ARG0>" + text[start_cause_char:end_cause_char] + "</ARG0>" + text[end_cause_char:start_effect_char] + "<ARG1>" + text[start_effect_char:end_effect_char] + "</ARG1>" + text[end_effect_char:]
+
+        # If signal span exists, insert signal tags
+        if start_signal is not None and end_signal is not None:
+            start_signal_char, end_signal_char = offset_mapping[start_signal][0], offset_mapping[end_signal][1]
+            annotated_text = (
+                annotated_text[:start_signal_char]
+                + "<SIG0>" + annotated_text[start_signal_char:end_signal_char] + "</SIG0>"
+                + annotated_text[end_signal_char:]
+            )
+
+        return annotated_text
+
+


     # Apply the tags to the sentence tokens
-    tagged_sentence1 =
-    tagged_sentence2 =
+    tagged_sentence1 = add_tags_offset(input_text, start_cause1, end_cause1, start_effect1, end_effect1, start_signal, end_signal)
+    tagged_sentence2 = add_tags_offset(input_text, start_cause2, end_cause2, start_effect2, end_effect2, start_signal, end_signal)
     return tagged_sentence1, tagged_sentence2

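For reference, a self-contained sketch of the character-offset tagging idea that the new add_tags_offset implements; the helper name, sentence, and span values below are illustrative and not part of the commit. Insertions are applied right to left so that earlier character offsets stay valid while tags are spliced in:

# Standalone sketch: splice tags into text using character spans taken from offset_mapping entries
def insert_tags(text, spans):
    # spans: list of (start_char, end_char, open_tag, close_tag); processed rightmost-first
    for start, end, open_tag, close_tag in sorted(spans, key=lambda s: s[0], reverse=True):
        text = text[:start] + open_tag + text[start:end] + close_tag + text[end:]
    return text

sentence = "Heavy rain caused flooding in the valley."
print(insert_tags(sentence, [
    (0, 10, "<ARG0>", "</ARG0>"),   # "Heavy rain" (cause)
    (11, 17, "<SIG0>", "</SIG0>"),  # "caused" (signal)
    (18, 41, "<ARG1>", "</ARG1>"),  # "flooding in the valley." (effect)
]))
# <ARG0>Heavy rain</ARG0> <SIG0>caused</SIG0> <ARG1>flooding in the valley.</ARG1>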