File size: 2,259 Bytes
1587277 8d7a1e9 1587277 8d7a1e9 1587277 8d7a1e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import os
import instructor
from pydantic import BaseModel
"""
client = instructor.from_openai(
OpenAI(
base_url="http://localhost:11434/v1",
api_key="ollama",
),
mode=instructor.Mode.JSON,
)
"""
from groq import Groq
# Build the Groq client from the GROQ_API_KEY environment variable and wrap it
# with instructor in a single step, so `client` is the instructor-patched
# client used for structured (response_model) completions.
client = instructor.from_groq(Groq(api_key=os.getenv("GROQ_API_KEY")))

# Model name passed to every completion request.
# NOTE(review): the "llama3.2" fallback looks intended for the disabled
# Ollama/OpenAI client in the string literal above, yet the Groq client is
# constructed regardless of whether the key is set — confirm the intended
# behaviour when GROQ_API_KEY is missing.
if os.getenv("GROQ_API_KEY"):
    llm = 'llama-3.1-8b-instant'
else:
    llm = "llama3.2"
# One PII item reported by the model; PIIExtraction.private_data is a list of
# these. Documented with comments rather than a docstring on purpose.
# NOTE(review): pydantic exposes a class docstring as the schema description,
# so adding one here would presumably change what instructor sends to the
# model — confirm before converting these comments to a docstring.
class PIIData(BaseModel):
    # NOTE(review): presumably the item's ordinal as numbered by the model;
    # PIIExtraction.sanitize() does not read it (it numbers placeholders with
    # enumerate) — confirm the intended use.
    index: int
    # Category label; sanitize() uses it as the prefix of the placeholder
    # "<{data_type}_{i}>".
    data_type: str
    # The literal text found in the document; sanitize() replaces every
    # occurrence of this string.
    pii_value: str
class PIIExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """
    # NOTE(review): the docstring above is kept verbatim on purpose — pydantic
    # exposes a model's docstring as its schema description, so it presumably
    # reaches the LLM through instructor and acts as prompt text.

    # Every PII item the model reported for the document.
    private_data: list[PIIData]
    # Free-text reasoning the model is asked to return alongside the items.
    chain_of_thought: str

    def sanitize(self, content):
        """
        Return *content* with every extracted PII value replaced by a
        placeholder of the form <{data_type}_{i}>, where ``i`` is the item's
        position in ``private_data``.

        Items with an empty ``pii_value`` are skipped, and longer values are
        replaced before shorter ones so that a value which is a substring of
        another cannot leave the longer one partially unmasked.
        """
        # Pair each item with its original position so the placeholder number
        # does not depend on the replacement order chosen below.
        # Empty values are dropped: "abc".replace("", "X") yields "XaXbXcX",
        # i.e. it would inject the placeholder between every character.
        numbered = [
            (position, item)
            for position, item in enumerate(self.private_data)
            if item.pii_value
        ]
        # Longest first; list.sort is stable (also with reverse=True), so
        # equally long values keep their original relative order.
        numbered.sort(key=lambda pair: len(pair[1].pii_value), reverse=True)

        for position, item in numbered:
            content = content.replace(
                item.pii_value, f"<{item.data_type}_{position}>"
            )
        return content
def derisk(content) -> str:
    """Ask the configured LLM to extract PII from *content*.

    Args:
        content: The document text to scan.

    Returns:
        The ``PIIExtraction`` produced by the model, serialised as a JSON
        string with two-space indentation (``model_dump_json(indent=2)``).
        Note that this is a string, not a ``PIIExtraction`` instance.
    """
    system_prompt = "You are a world class international PII scrubbing model, perform data preprocess include standardization, stop word removal, punctuation removal...to enhance signal to noise ratio for name, phone, address, email, id...etc. Extract the PII data from the following document"

    extraction = client.chat.completions.create(
        model=llm,
        response_model=PIIExtraction,
        temperature=0.2,
        messages=[
            {"role": "system", "content": system_prompt},
            # The message content must be the text itself. Writing
            # `{content}` here would be a set literal containing the text,
            # which is not a string and cannot be JSON-serialised.
            {"role": "user", "content": content},
        ],
    )
    return extraction.model_dump_json(indent=2)
if __name__ == '__main__':
    # Demo run: a small sample document containing names, phone numbers and
    # an address is sent through derisk() and the result is printed.
    ESSAY = """
He Hua (Hua Hua) Director
[email protected]
+86-28-83505513
Alternative Address Format:
Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
Best Viewing: Before 9:00 AM during summer hours (7:30 AM-5:00 PM)
Caretaker: Tan Jintao ("Grandpa Tan")
Additional Contacts
Charitable Donations: +86-28-83505513
Dining Reservations: +86-17311072681
"""
    # derisk() already returns the JSON dump of the extraction, so the
    # result is printed as-is.
    extraction_json = derisk(ESSAY)
    print(extraction_json)
    # Leftover from a variant that worked with a `pii_leak` model object:
    # print(pii_leak.model_dump_json(indent=2))
    # print(pii_leak.sanitize(ESSAY))
|