Update README.md
Browse files
README.md
CHANGED
@@ -99,7 +99,7 @@ formats = {
|
|
99 |
"sensitivity": """<|im_start|>system\nSensitivity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
100 |
"complexity": """<|im_start|>system\nComplexity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
101 |
"entity_detection": """<|im_start|>system\nEntity Detection<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
102 |
-
"entity_swapping": """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities
|
103 |
}
|
104 |
|
105 |
def model_inference(text, mode="anonymization", max_new_tokens=2028, config=None, entity_mapping=None, return_entities=False, reverse_mapping=False):
|
@@ -288,26 +288,26 @@ def select_entities_based_on_json(prediction_json, entity_json):
|
|
288 |
|
289 |
Example text
|
290 |
```
|
291 |
-
We have a community picnic
|
292 |
```
|
293 |
|
294 |
The sensitivity analysis feature evaluates the sensitivity of a given text, and the complexity feature rates its complexity.
|
295 |
```python
|
296 |
-
text = "We have a community picnic
|
297 |
|
298 |
# Generate sensitivity score
|
299 |
sensitivity_score = model_inference(text, mode="sensitivity")
|
300 |
-
print(f"Sensitivity Score: {sensitivity_score}"
|
301 |
|
302 |
# Generate complexity score
|
303 |
complexity_score = model_inference(text, mode="complexity")
|
304 |
-
print(f"Complexity: {complexity_score}"
|
305 |
```
|
306 |
|
307 |
Output
|
308 |
```
|
309 |
Sensitivity Score: 0
|
310 |
-
Complexity
|
311 |
```
|
312 |
|
313 |
### 3. Anonymization and Re-Anonymization
|
@@ -325,58 +325,20 @@ print(f"Anonymized Text: {anonymized_text}\n")
|
|
325 |
# Restore the original text
|
326 |
anonymized_text, entity_mapping = model_inference(text, mode="anonymization", return_entities=True)
|
327 |
print(f"Entity Mapping:\n{entity_mapping}\n")
|
328 |
-
print(f"Anonymized Text: {anonymized_text}\n")
|
329 |
restored_text = model_inference(anonymized_text, mode="entity_swapping", entity_mapping=entity_mapping, reverse_mapping=True)
|
330 |
print(f"Restored Text: {restored_text}")
|
331 |
```
|
332 |
|
333 |
Output
|
334 |
```
|
335 |
-
Anonymized Text: We have a community picnic
|
336 |
|
337 |
-
|
338 |
-
|
|
|
|
|
339 |
|
340 |
-
|
341 |
-
```python
|
342 |
-
print(f"{json.dumps(entity_mapping, indent=4)}")
|
343 |
-
```
|
344 |
-
Output
|
345 |
-
```
|
346 |
-
{
|
347 |
-
"Greenfield Park": {
|
348 |
-
"TYPE": "LOC",
|
349 |
-
"RANDOM": "Maplewood Park",
|
350 |
-
"GENERAL": [
|
351 |
-
[
|
352 |
-
"Local Park",
|
353 |
-
"3"
|
354 |
-
],
|
355 |
-
[
|
356 |
-
"Public Park",
|
357 |
-
"5"
|
358 |
-
],
|
359 |
-
[
|
360 |
-
"Recreational Area",
|
361 |
-
"7"
|
362 |
-
]
|
363 |
-
]
|
364 |
-
},
|
365 |
-
"11 AM": {
|
366 |
-
"TYPE": "DATETIME",
|
367 |
-
"RANDOM": "1 PM",
|
368 |
-
"GENERAL": [
|
369 |
-
[
|
370 |
-
"Late Morning",
|
371 |
-
"2"
|
372 |
-
],
|
373 |
-
[
|
374 |
-
"A",
|
375 |
-
"4"
|
376 |
-
]
|
377 |
-
]
|
378 |
-
}
|
379 |
-
}
|
380 |
```
|
381 |
|
382 |
Normally you would process the anonymized version with an LLM and then re-anonymize the result to restore the original entities.
|
|
|
99 |
"sensitivity": """<|im_start|>system\nSensitivity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
100 |
"complexity": """<|im_start|>system\nComplexity<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
101 |
"entity_detection": """<|im_start|>system\nEntity Detection<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""",
|
102 |
+
"entity_swapping": """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities:\n{entities}\ntext:\n{text}<|im_end|>\n<|im_start|>assistant\n"""
|
103 |
}
|
104 |
|
105 |
def model_inference(text, mode="anonymization", max_new_tokens=2028, config=None, entity_mapping=None, return_entities=False, reverse_mapping=False):
|
|
|
288 |
|
289 |
Example text
|
290 |
```
|
291 |
+
We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail announcement!
|
292 |
```
|
293 |
|
294 |
The sensitivity analysis feature evaluates the sensitivity of a given text, and the complexity feature rates its complexity.
|
295 |
```python
|
296 |
+
text = "We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail announcement!"
|
297 |
|
298 |
# Generate sensitivity score
|
299 |
sensitivity_score = model_inference(text, mode="sensitivity")
|
300 |
+
print(f"Sensitivity Score: {sensitivity_score}")
|
301 |
|
302 |
# Generate complexity score
|
303 |
complexity_score = model_inference(text, mode="complexity")
|
304 |
+
print(f"Complexity: {complexity_score}")
|
305 |
```
|
306 |
|
307 |
Output
|
308 |
```
|
309 |
Sensitivity Score: 0
|
310 |
+
Complexity: 3
|
311 |
```
|
312 |
|
313 |
### 3. Anonymization and Re-Anonymization
|
|
|
325 |
# Restore the original text
|
326 |
anonymized_text, entity_mapping = model_inference(text, mode="anonymization", return_entities=True)
|
327 |
print(f"Entity Mapping:\n{entity_mapping}\n")
|
|
|
328 |
restored_text = model_inference(anonymized_text, mode="entity_swapping", entity_mapping=entity_mapping, reverse_mapping=True)
|
329 |
print(f"Restored Text: {restored_text}")
|
330 |
```
|
331 |
|
332 |
Output
|
333 |
```
|
334 |
+
Anonymized Text: We have a community picnic at Sunnyvale Park, it is on A Day of the Week at Morning. Write me an e-mail announcement!
|
335 |
|
336 |
+
Entity Mapping:
|
337 |
+
Greenfield Park : Sunnyvale Park
|
338 |
+
thursday : A Day of the Week
|
339 |
+
11 AM : Morning
|
340 |
|
341 |
+
Restored Text: We have a community picnic at Greenfield Park, it is on thursday at 11 AM. Write me an e-mail announcement!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
```
|
343 |
|
344 |
Normally you would process the anonymized version with an LLM and then re-anonymize the result to restore the original entities.
|