Update README.md

README.md (changed)
```diff
@@ -102,7 +102,7 @@ formats = {
     "entity_swapping": """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities:{entities}\ntext:\n{text}<|im_end|>\n<|im_start|>assistant\n"""
 }
 
-def model_inference(text, mode="anonymization", max_new_tokens=50, config=None, entity_mapping=None):
+def model_inference(text, mode="anonymization", max_new_tokens=2028, config=None, entity_mapping=None, return_entities=False, reverse_mapping=False):
     if mode not in formats and mode != "anonymization":
         raise ValueError("Invalid mode. Choose from 'sensitivity', 'complexity', 'entity_detection', 'anonymization'.")
 
```
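For reference, this is how the `entity_swapping` template gets filled before generation; the entity list and input text below are made up purely for illustration:

```python
# Illustrative only: filling the entity_swapping prompt template.
entity_swapping_template = """<|im_start|>system\nEntity Swapping<|im_end|>\n<|im_start|>user\nentities:{entities}\ntext:\n{text}<|im_end|>\n<|im_start|>assistant\n"""

# Hypothetical mapping lines and input text
entities = "John Doe : PERSON_1\nParis : LOCATION_1"
text = "John Doe flew to Paris last week."

prompt = entity_swapping_template.format(entities=entities, text=text)
print(prompt)
```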
```diff
@@ -154,7 +154,6 @@ def model_inference(text, mode="anonymization", max_new_tokens=50, config=None,
         # Step 2: Select entities based on config
         selected_entities = select_entities_based_on_json(detected_entities, config)
         entities_str = "\n".join([f"{entity} : {label}" for entity, label in selected_entities])
-
         # Step 3: Entity swapping for anonymization
         swapping_prompt = formats["entity_swapping"].format(entities=entities_str, text=text)
         swapping_inputs = tokenizer(swapping_prompt, return_tensors="pt").to(device)
```
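As a rough sketch of the data flow (the pairs below are hypothetical), `select_entities_based_on_json` is assumed to yield `(entity, label)` tuples, which step 2 turns into a newline-separated `entity : label` listing:

```python
# Hypothetical output of select_entities_based_on_json(detected_entities, config)
selected_entities = [("John Doe", "PERSON_1"), ("Paris", "LOCATION_1")]

entities_str = "\n".join([f"{entity} : {label}" for entity, label in selected_entities])
print(entities_str)
# John Doe : PERSON_1
# Paris : LOCATION_1
```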
```diff
@@ -168,24 +167,25 @@ def model_inference(text, mode="anonymization", max_new_tokens=50, config=None,
         anonymized_text = tokenizer.decode(swapping_output[0], skip_special_tokens=True)
         anonymized_text = anonymized_text.split("assistant\n", 1)[-1].strip()  # Extract only the assistant's response
 
+        if return_entities:
+            return anonymized_text, entities_str
+
+        return anonymized_text
 
     # Entity Restoration Mode using entity_swapping
     elif mode == "entity_swapping" and entity_mapping:
+        # Reverse the entity mapping
+        if reverse_mapping:
+            reversed_mapping = []
+            for line in entity_mapping.splitlines():
+                if ':' in line:  # Ensure the line contains a colon
+                    left, right = map(str.strip, line.split(":", 1))  # Split and strip spaces
+                    reversed_mapping.append(f"{right} : {left}")  # Reverse and format
+            entity_mapping = "\n".join(reversed_mapping)
 
-        # Combine all replacement mappings for the prompt
-        reversed_entities_str = "\n".join(reversed_entities)
 
         # Create the swapping prompt with the aggregated reversed mappings
-        swapping_prompt = formats["entity_swapping"].format(entities=reversed_entities_str, text=text)
+        swapping_prompt = formats["entity_swapping"].format(entities=entity_mapping, text=text)
         swapping_inputs = tokenizer(swapping_prompt, return_tensors="pt").to(device)
         swapping_output = model.generate(
             **swapping_inputs,
```
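To see what `reverse_mapping=True` does in isolation, the same loop can be run on a made-up mapping string:

```python
# Illustrative only: reversing a hypothetical "original : replacement" mapping.
entity_mapping = "John Doe : PERSON_1\nParis : LOCATION_1"

reversed_mapping = []
for line in entity_mapping.splitlines():
    if ':' in line:  # skip malformed lines
        left, right = map(str.strip, line.split(":", 1))
        reversed_mapping.append(f"{right} : {left}")

print("\n".join(reversed_mapping))
# PERSON_1 : John Doe
# LOCATION_1 : Paris
```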
```diff
@@ -206,7 +206,7 @@ def model_inference(text, mode="anonymization", max_new_tokens=50, config=None,
         model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
         generation_output = model.generate(
             **model_inputs,
-            max_new_tokens=
+            max_new_tokens=5,
             use_cache=True,
             eos_token_id=151645
         )
```
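The hard-coded `eos_token_id=151645` corresponds to the `<|im_end|>` token in Qwen-style tokenizers, which matches the `<|im_start|>`/`<|im_end|>` prompt format above. A small check along these lines (assuming `tokenizer` is already loaded; not part of the README) can catch a tokenizer mismatch:

```python
# Sanity check: make sure the hard-coded eos_token_id is really <|im_end|>.
im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
assert im_end_id == 151645, f"unexpected <|im_end|> id: {im_end_id}"
```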
```diff
@@ -224,7 +224,7 @@ def postprocess_entity_recognition(detection_output: str) -> dict:
     entity_pattern = re.compile(
         r'(?P<entity>[\w\s]+)--(?P<type>[\w]+)--(?P<random>[\w\s]+)--(?P<generalizations>.+)'
     )
-    generalization_pattern = re.compile(r'(\
+    generalization_pattern = re.compile(r'([\w\s]+)::([\w\s]+)')
 
     lines = detection_output.strip().split("\n")
     for line in lines:
```
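For illustration, here is what the two patterns extract from a single detection line; the line itself is hypothetical and only mirrors the `entity--TYPE--replacement--generalizations` layout implied by `entity_pattern`:

```python
import re

entity_pattern = re.compile(
    r'(?P<entity>[\w\s]+)--(?P<type>[\w]+)--(?P<random>[\w\s]+)--(?P<generalizations>.+)'
)
generalization_pattern = re.compile(r'([\w\s]+)::([\w\s]+)')

line = "John Doe--PERSON--Mark Smith--man::4,person::5"  # made-up example line
match = entity_pattern.match(line)
print(match.group("entity"), "|", match.group("type"), "|", match.group("random"))
# John Doe | PERSON | Mark Smith
print(generalization_pattern.findall(match.group("generalizations")))
# [('man', '4'), ('person', '5')]
```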
```diff
@@ -236,8 +236,21 @@ def postprocess_entity_recognition(detection_output: str) -> dict:
 
             generalizations = []
             for gen_match in generalization_pattern.findall(match.group("generalizations")):
+                first, second = gen_match
+
+                # Check if the first part is a digit (score) and swap if needed
+                if first.isdigit() and not second.isdigit():
+                    score = first
+                    label = second
+                    generalizations.append([label.strip(), score.strip()])
+
+                elif not first.isdigit() and second.isdigit():
+                    label = first
+                    score = second
+                    generalizations.append([label.strip(), score.strip()])
+
+
+
 
             output_json[entity_name] = {
                 "TYPE": entity_type,
```
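The digit check exists so that the order inside each pair does not matter; both orderings of a made-up `(label, score)` pair normalize to the same entry:

```python
# Illustrative only: the isdigit() check makes ('man', '4') and ('4', 'man') equivalent.
for first, second in [("man", "4"), ("4", "man")]:
    if first.isdigit() and not second.isdigit():
        score, label = first, second
    elif not first.isdigit() and second.isdigit():
        label, score = first, second
    print([label.strip(), score.strip()])
# ['man', '4']
# ['man', '4']
```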
````diff
@@ -304,11 +317,16 @@ To protect sensitive information, the model detects specific entities in the text
 
 ```python
 # Anonymize the text
-anonymized_text
+anonymized_text = model_inference(text, mode="anonymization")
 print(f"Anonymized Text: {anonymized_text}\n")
+```
 
+```python
 # Restore the original text
+anonymized_text, entity_mapping = model_inference(text, mode="anonymization", return_entities=True)
+print(f"Entity Mapping:\n{entity_mapping}\n")
+print(f"Anonymized Text: {anonymized_text}\n")
+restored_text = model_inference(anonymized_text, mode="entity_swapping", entity_mapping=entity_mapping, reverse_mapping=True)
 print(f"Restored Text: {restored_text}")
 ```
 
````
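Combining the two snippets above, a small helper (hypothetical, not part of the README) keeps the entity mapping next to both versions of the text:

```python
def anonymize_roundtrip(text):
    """Anonymize `text`, keep the entity mapping, and restore the original."""
    anonymized, mapping = model_inference(text, mode="anonymization", return_entities=True)
    restored = model_inference(
        anonymized, mode="entity_swapping", entity_mapping=mapping, reverse_mapping=True
    )
    return anonymized, mapping, restored
```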