kevinhug committed on
Commit
1587277
·
1 Parent(s): 0f2b5ae
Files changed (3) hide show
  1. app.py +64 -28
  2. knowledge.py +123 -63
  3. pii.py +87 -0
app.py CHANGED
@@ -3,39 +3,36 @@ from rag import rbc_product
3
  from tool import rival_product
4
  from graphrag import reasoning
5
  from knowledge import graph
6
-
7
-
8
-
9
-
10
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
11
  with gr.Tab("RAG"):
12
  gr.Markdown("""
13
- Marketing
14
- ------------
15
- GraphRAG: Models customer-product relationship networks for next-best-action predictions
16
-
17
- DSPy: Optimizes cross-sell/upsell prompt variations through A/B testing
18
-
19
- Risk & Audit
20
  ------------
21
- GraphRAG: Maps transactional relationships into dynamic knowledge graphs to detect multi-layered fraud patterns
22
-
23
- Tool Use: Integrates fraud detection APIs, anomaly scoring models, and regulatory compliance checkers
24
-
25
- DSPy: Optimizes fraud explanation prompts for regulatory reporting
26
-
27
- Other Use Case
28
- ------------
29
- https://huggingface.co/spaces/kevinhug/clientX
30
- https://kevinwkc.github.io/davinci/
31
  """)
32
 
33
 
34
  gr.Markdown("""
35
  Objective: Recommend RBC product based on persona.
36
  ================================================
37
- Retrieval: Public RBC Product Data
38
- Recommend: RBC Product
39
  """)
40
  in_verbatim = gr.Textbox(label="Verbatim")
41
  out_product = gr.Textbox(label="Product")
@@ -50,13 +47,25 @@ https://kevinwkc.github.io/davinci/
50
  btn_recommend=gr.Button("Recommend")
51
  btn_recommend.click(fn=rbc_product, inputs=in_verbatim, outputs=out_product)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  with gr.Tab("Tool Use"):
54
  gr.Markdown("""
55
  Objective: Recommend financial product based on persona for competitive analysis, product feature discovery
56
  ================================================
57
- Retrieval: Public Product Data using Tavily Search
58
-
59
- Recommend: Competition Product
60
  """)
61
  in_verbatim = gr.Textbox(label="Verbatim")
62
  out_product = gr.Textbox(label="Product")
@@ -74,7 +83,7 @@ https://kevinwkc.github.io/davinci/
74
  gr.Markdown("""
75
  Objective: Create a Marketing Plan based on persona.
76
  =======================
77
- Reasoning from context, answering the question
78
  """)
79
 
80
  marketing = """
@@ -145,4 +154,31 @@ Low APR and great customer service. I would highly recommend if you’re looking
145
  btn_recommend = gr.Button("Graph It!")
146
  btn_recommend.click(fn=graph, inputs=in_verbatim, outputs=out_product)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  demo.launch(allowed_paths=["./"])
 
3
  from tool import rival_product
4
  from graphrag import reasoning
5
  from knowledge import graph
6
+ from pii import derisk
7
+
8
+ # Define the Google Analytics script
9
+ head = """
10
+ <!-- Google tag (gtag.js) -->
11
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-SRX9LDVBCW"></script>
12
+ <script>
13
+ window.dataLayer = window.dataLayer || [];
14
+ function gtag(){dataLayer.push(arguments);}
15
+ gtag('js', new Date());
16
+
17
+ gtag('config', 'G-SRX9LDVBCW');
18
+ </script>
19
+ """
20
+
21
+ with gr.Blocks(head=head) as demo:
22
  with gr.Tab("RAG"):
23
  gr.Markdown("""
24
+ Links:
 
 
 
 
 
 
25
  ------------
26
+ - https://huggingface.co/spaces/kevinhug/clientX
27
+ - https://kevinwkc.github.io/davinci/
 
 
 
 
 
 
 
 
28
  """)
29
 
30
 
31
  gr.Markdown("""
32
  Objective: Recommend RBC product based on persona.
33
  ================================================
34
+ - Retrieval: Public RBC Product Data
35
+ - Recommend: RBC Product
36
  """)
37
  in_verbatim = gr.Textbox(label="Verbatim")
38
  out_product = gr.Textbox(label="Product")
 
47
  btn_recommend=gr.Button("Recommend")
48
  btn_recommend.click(fn=rbc_product, inputs=in_verbatim, outputs=out_product)
49
 
50
+ gr.Markdown("""
51
+ Marketing
52
+ ------------
53
+ - GraphRAG: Models customer-product relationship networks for next-best-action predictions
54
+ - DSPy: Optimizes cross-sell/upsell prompt variations through A/B testing
55
+
56
+ Risk & Audit
57
+ ------------
58
+ - GraphRAG: Maps transactional relationships into dynamic knowledge graphs to detect multi-layered fraud patterns
59
+ - Tool Use: Integrates fraud detection APIs, anomaly scoring models, and regulatory compliance checkers
60
+ - DSPy: Optimizes fraud explanation prompts for regulatory reporting
61
+ """)
62
+
63
  with gr.Tab("Tool Use"):
64
  gr.Markdown("""
65
  Objective: Recommend financial product based on persona for competitive analysis, product feature discovery
66
  ================================================
67
+ - Retrieval: Public Product Data using Tavily Search
68
+ - Recommend: Competition Product
 
69
  """)
70
  in_verbatim = gr.Textbox(label="Verbatim")
71
  out_product = gr.Textbox(label="Product")
 
83
  gr.Markdown("""
84
  Objective: Create a Marketing Plan based on persona.
85
  =======================
86
+ - Reasoning from context, answering the question
87
  """)
88
 
89
  marketing = """
 
154
  btn_recommend = gr.Button("Graph It!")
155
  btn_recommend.click(fn=graph, inputs=in_verbatim, outputs=out_product)
156
 
157
+
158
with gr.Tab("pii masking"):
    gr.Markdown("""
Objective: pii data removal
================================================
    """)
    # FIX: label typo "Peronal Info" -> "Personal Info" (user-facing text).
    in_verbatim = gr.Textbox(label="Personal Info")
    out_product = gr.Textbox(label="PII")

    # Pre-canned example the user can click to populate the input box.
    gr.Examples(
        [
            [
                """
He Hua (Hua Hua) Director
+86-28-83505513

Alternative Address Format:
Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
"""
            ]
        ],
        [in_verbatim]
    )
    # derisk (pii.py) extracts PII via the LLM and returns indented JSON,
    # which lands in the "PII" output textbox.
    btn_recommend = gr.Button("Mask PII")
    btn_recommend.click(fn=derisk, inputs=in_verbatim, outputs=out_product)
183
+
184
  demo.launch(allowed_paths=["./"])
knowledge.py CHANGED
@@ -1,33 +1,8 @@
 
1
 
2
  import instructor
3
-
4
- from pydantic import BaseModel, Field
5
- from typing import List
6
  from graphviz import Digraph
7
-
8
- class Node(BaseModel, frozen=True):
9
- """
10
- Node representing concept in the subject domain
11
- """
12
- id: int
13
- label: str = Field(..., description = "description of the concept in the subject domain")
14
- color: str
15
-
16
- class Edge(BaseModel, frozen=True):
17
- """
18
- Edge representing relationship between concepts in the subject domain
19
- """
20
- source: int = Field(..., description = "source representing concept in the subject domain")
21
- target: int = Field(..., description = "target representing concept in the subject domain")
22
- label: str = Field(..., description = "description representing relationship between concepts in the subject domain")
23
- color: str = "black"
24
-
25
- class KnowledgeGraph(BaseModel):
26
- """
27
- graph representation of concepts in the subject domain
28
- """
29
- nodes: List[Node] = Field(..., default_factory=list)
30
- edges: List[Edge] = Field(..., default_factory=list)
31
 
32
  from groq import Groq
33
  import os
@@ -37,7 +12,6 @@ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
37
 
38
  # Enable instructor patches for Groq client
39
  client = instructor.from_groq(client)
40
- llm='llama-3.1-8b-instant' #"llama3.2", #
41
  """
42
  from openai import OpenAI
43
  client = instructor.from_openai(
@@ -48,46 +22,132 @@ client = instructor.from_openai(
48
  mode=instructor.Mode.JSON,
49
  )
50
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def generate_graph(q, input) -> KnowledgeGraph:
52
- return client.chat.completions.create(
53
  model=llm,
54
  max_retries=5,
55
  messages=[
56
- {
57
- "role": "user",
58
- "content": f"Help me understand the following by describing it as a detailed knowledge graph: ### Question: {q} ### Context: {input}",
59
- }
 
 
 
 
 
 
 
 
 
 
60
  ],
61
- response_model=KnowledgeGraph,
62
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
- class Issue(BaseModel):
66
- "Break down Issue as sub issues"
67
- question: str
68
-
69
- class IssuePlan(BaseModel):
70
- "List of Issue"
71
- issue_graph: List[Issue] = []
72
-
73
-
74
- def expandIssue(input) -> IssuePlan:
75
- return client.chat.completions.create(
76
- model=llm,
77
- max_retries=10,
78
- messages=[
79
- {
80
- "role": "system",
81
- "content": "As a Mckinsey Consultant create a framework that relevant to the topic, list all issues.",
82
- }, {
83
- "role": "user",
84
- "content": f"Question: {input}",
85
- },
86
- ],
87
- response_model=IssuePlan,
88
- )
89
  def graph(query):
90
- queryx = expandIssue(query)
91
- ctx = ", ".join([q.question for q in queryx.issue_graph])
92
- graph = generate_graph(query, ctx)
93
- return graph.json()
 
1
+ from typing import List
2
 
3
  import instructor
 
 
 
4
  from graphviz import Digraph
5
+ from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  from groq import Groq
8
  import os
 
12
 
13
  # Enable instructor patches for Groq client
14
  client = instructor.from_groq(client)
 
15
  """
16
  from openai import OpenAI
17
  client = instructor.from_openai(
 
22
  mode=instructor.Mode.JSON,
23
  )
24
  """
25
+ llm = 'llama-3.1-8b-instant' if os.getenv("GROQ_API_KEY") else "llama3.2"
26
+
27
class Node(BaseModel, frozen=True):
    """
    Node representing concept in the subject domain
    """
    # NOTE: Field descriptions (and this docstring) are emitted into the JSON
    # schema that instructor sends to the LLM, so they double as prompt text —
    # do not edit them casually.
    id: int = Field(...,
                    description="unique id of the concept in the subject domain, used for deduplication, design a scheme allows multiple concept")
    label: str = Field(..., description="description of the concept in the subject domain")
    # Default graphviz colour used by KnowledgeGraph.draw().
    color: str = "orange"


class Edge(BaseModel, frozen=True):
    """
    Edge representing relationship between concepts in the subject domain, source depends on target
    """
    # source/target reference Node.id values; frozen=True makes instances
    # hashable so sets can deduplicate them in KnowledgeGraph.update().
    source: int = Field(..., description="source representing concept in the subject domain")
    target: int = Field(..., description="target representing concept in the subject domain")
    label: str = Field(..., description="description representing relationship between concepts in the subject domain")
    # Default graphviz colour used by KnowledgeGraph.draw().
    color: str = "black"
45
+
46
+
47
+ from typing import Optional
48
+
49
+
50
class KnowledgeGraph(BaseModel):
    """
    KnowledgeGraph is graph representation of concepts in the subject domain
    """
    # BUG FIX: the original declared `Field(..., default_factory=list)`.
    # `...` marks the field required (a default), and pydantic rejects
    # combining a default with default_factory at class-definition time.
    nodes: Optional[List[Node]] = Field(default_factory=list)
    edges: Optional[List[Edge]] = Field(default_factory=list)

    def update(self, other: "KnowledgeGraph") -> "KnowledgeGraph":
        """Updates the current graph with the other graph, deduplicating nodes and edges."""
        # nodes/edges are Optional, so treat None as "no entries" rather than
        # crashing on `None + list`. Node/Edge are frozen (hashable), which is
        # what makes set() deduplication possible here.
        return KnowledgeGraph(
            nodes=list(set((self.nodes or []) + (other.nodes or []))),
            edges=list(set((self.edges or []) + (other.edges or []))),
        )

    def draw(self, prefix: str = "knowledge_graph"):
        """Render the graph to `<prefix>.png` using graphviz."""
        dot = Digraph(comment="Knowledge Graph")

        for node in (self.nodes or []):
            dot.node(str(node.id), node.label, color=node.color)

        for edge in (self.edges or []):
            dot.edge(
                str(edge.source), str(edge.target), label=edge.label, color=edge.color
            )
        # NOTE(review): view=True tries to open a desktop image viewer; on a
        # headless host (e.g. HF Spaces) that can raise — confirm whether
        # view=False is intended. Left unchanged to preserve behavior.
        dot.render(prefix, format="png", view=True)
75
+
76
+
77
+ from typing import Iterable
78
+ from textwrap import dedent
79
+
80
+
81
def generate_graph(q, input) -> KnowledgeGraph:
    """
    Describe question *q* (with supporting context *input*) as a KnowledgeGraph.

    Uses the instructor-patched Groq client: passing
    `response_model=KnowledgeGraph` makes the client retry (max_retries=5)
    until the model's output validates against the KnowledgeGraph schema.
    """
    return client.chat.completions.create(
        model=llm,
        max_retries=5,
        messages=[
            {
                "role": "user",
                # NOTE(review): the first line of this literal carries no
                # leading whitespace, so textwrap.dedent finds no common
                # prefix and leaves the indented lines as-is — confirm the
                # intended prompt layout.
                "content": dedent(f"""Help me understand the following by describing it as a detailed knowledge graph:
                ### Question: {q}
                ### Context: {input}
                ### Instruction:
                Generate at least 5 concepts
                Generate at least 3 relationship
                ### Output Format:
                Node with id, label for description of the concept
                Edge with source's id, target's id, label for description of the relationship between source concept and target concept
                """),

            }
        ],
        response_model=KnowledgeGraph)
102
+
103
+
104
class Subissue(BaseModel):
    # One MECE sub-issue: a title plus its bullet points. The Field
    # description feeds the JSON schema instructor shows the LLM.
    subissue_title: str
    point: List[str] = Field(default_factory=list, description="Specific aspect or component of the subissue")


def expandIssue(input) -> Iterable[Subissue]:
    """
    MECE-decompose the question *input* into sub-issues via the LLM.

    Returns an iterable of Subissue models; instructor validates the output
    and retries up to 3 times, and temperature=0.1 keeps the decomposition
    stable across calls.
    """
    response = client.chat.completions.create(
        model=llm,
        max_retries=3,
        response_model=Iterable[Subissue],
        temperature=0.1,
        messages=[

            {
                "role": "user",
                # FIX: prompt typo "which each new point" -> "with each new
                # point" so the formatting instruction reads correctly.
                "content": dedent(f"""
                As a McKinsey Consultant, perform MECE decomposition of the question.
                ### Requirements
                1. Return 3 subissues minimum
                2. Each sub-issue has 3 bullet points, with each new point beginning with a *
                3. Use EXACT format:

                - [Sub-issue 1.1 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]
                - [Sub-issue 1.2 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]
                - [Sub-issue 1.3 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]

                4. return nothing else
                ### Question: {input}
                """),
            },
        ],
    )

    return response
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
def graph(query):
    """
    Decompose *query* into MECE sub-issues, then render question + sub-issues
    as a knowledge graph, returned as a JSON string (displayed by the UI).
    """
    subissues = expandIssue(query)

    # str() flattens the structured sub-issue list into plain-text context
    # for the graph-generation prompt.
    knowledge = generate_graph(query, str(subissues))
    # FIX: renamed the local (it shadowed this function's own name) and use
    # model_dump_json() for consistency with pii.py in this same commit —
    # .json() is the deprecated pydantic-v1 spelling.
    return knowledge.model_dump_json()
pii.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import instructor
4
+ from groq import Groq
5
+ from pydantic import BaseModel
6
+
7
+ # Initialize with API key
8
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
9
+
10
+ # Enable instructor patches for Groq client
11
+ client = instructor.from_groq(client)
12
+ """
13
+ client = instructor.from_openai(
14
+ OpenAI(
15
+ base_url="http://localhost:11434/v1",
16
+ api_key="ollama",
17
+ ),
18
+ mode=instructor.Mode.JSON,
19
+ )
20
+ """
21
+ llm = 'llama-3.1-8b-instant' if os.getenv("GROQ_API_KEY") else "llama3.2"
22
+
23
+
24
class PIIData(BaseModel):
    # One extracted PII item: its position/order in the document, its
    # category (name, phone, address, ...), and the raw matched text.
    index: int
    data_type: str
    pii_value: str


class PIIExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """
    private_data: list[PIIData]
    chain_of_thought: str

    def scrub_data(self, content):
        """
        Iterates over the private data and replaces the value with a placeholder in the form of
        <{data_type}_{i}>
        """

        for i, data in enumerate(self.private_data):
            # BUG FIX: guard against an empty extraction — str.replace("")
            # inserts the placeholder between every character of the document.
            if not data.pii_value:
                continue
            content = content.replace(data.pii_value, f"<{data.data_type}_{i}>")

        return content
47
+
48
+
49
def derisk(content) -> str:
    """
    Extract PII from *content* via the LLM and return the PIIExtraction
    serialized as indented JSON (this string is what the Gradio output
    textbox displays).

    NOTE(review): the original annotated -> PIIExtraction, but the chained
    .model_dump_json() means callers actually receive a str.
    """
    return client.chat.completions.create(
        model=llm,
        response_model=PIIExtraction,
        temperature=0.2,
        messages=[
            {
                "role": "system",
                "content": "You are a world class international PII scrubbing model, perform data preprocess include standardization, stop word removal, punctuation removal...to enhance signal to noise ratio for name, phone, address, email, id...etc. Extract the PII data from the following document",

            }, {
                "role": "user",
                # BUG FIX: was `"content": {content}` — a one-element *set*,
                # not a string; message content must be the raw text.
                "content": content,
            }
        ]).model_dump_json(indent=2)
64
+
65
+
66
+ if __name__ == '__main__':
67
+ ESSAY = """
68
+ He Hua (Hua Hua) Director
69
70
+ +86-28-83505513
71
+
72
+ Alternative Address Format:
73
+ Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
74
+
75
+
76
+ Best Viewing: Before 9:00 AM during summer hours (7:30 AM-5:00 PM)
77
+
78
+ Caretaker: Tan Jintao ("Grandpa Tan")
79
+
80
+ Additional Contacts
81
+ Charitable Donations: +86-28-83505513
82
+ Dining Reservations: +86-17311072681
83
+ """
84
+
85
+ print(derisk(ESSAY))
86
+ # print(pii_leak.model_dump_json(indent=2))
87
+ # print(pii_leak.scrub_data(ESSAY))