kevinhug committed on
Commit
1587277
·
1 Parent(s): 0f2b5ae
Files changed (3) hide show
  1. app.py +64 -28
  2. knowledge.py +123 -63
  3. pii.py +87 -0
app.py CHANGED
@@ -3,39 +3,36 @@ from rag import rbc_product
3
  from tool import rival_product
4
  from graphrag import reasoning
5
  from knowledge import graph
6
-
7
-
8
-
9
-
10
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
11
  with gr.Tab("RAG"):
12
  gr.Markdown("""
13
- Marketing
14
- ------------
15
- GraphRAG: Models customer-product relationship networks for next-best-action predictions
16
-
17
- DSPy: Optimizes cross-sell/upsell prompt variations through A/B testing
18
-
19
- Risk & Audit
20
  ------------
21
- GraphRAG: Maps transactional relationships into dynamic knowledge graphs to detect multi-layered fraud patterns
22
-
23
- Tool Use: Integrates fraud detection APIs, anomaly scoring models, and regulatory compliance checkers
24
-
25
- DSPy: Optimizes fraud explanation prompts for regulatory reporting
26
-
27
- Other Use Case
28
- ------------
29
- https://huggingface.co/spaces/kevinhug/clientX
30
- https://kevinwkc.github.io/davinci/
31
  """)
32
 
33
 
34
  gr.Markdown("""
35
  Objective: Recommend RBC product based on persona.
36
  ================================================
37
- Retrieval: Public RBC Product Data
38
- Recommend: RBC Product
39
  """)
40
  in_verbatim = gr.Textbox(label="Verbatim")
41
  out_product = gr.Textbox(label="Product")
@@ -50,13 +47,25 @@ https://kevinwkc.github.io/davinci/
50
  btn_recommend=gr.Button("Recommend")
51
  btn_recommend.click(fn=rbc_product, inputs=in_verbatim, outputs=out_product)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  with gr.Tab("Tool Use"):
54
  gr.Markdown("""
55
  Objective: Recommend financial product based on persona for competitive analysis, product feature discovery
56
  ================================================
57
- Retrieval: Public Product Data using Tavily Search
58
-
59
- Recommend: Competition Product
60
  """)
61
  in_verbatim = gr.Textbox(label="Verbatim")
62
  out_product = gr.Textbox(label="Product")
@@ -74,7 +83,7 @@ https://kevinwkc.github.io/davinci/
74
  gr.Markdown("""
75
  Objective: Create a Marketing Plan based on persona.
76
  =======================
77
- Reasoning from context, answering the question
78
  """)
79
 
80
  marketing = """
@@ -145,4 +154,31 @@ Low APR and great customer service. I would highly recommend if you’re looking
145
  btn_recommend = gr.Button("Graph It!")
146
  btn_recommend.click(fn=graph, inputs=in_verbatim, outputs=out_product)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  demo.launch(allowed_paths=["./"])
 
3
  from tool import rival_product
4
  from graphrag import reasoning
5
  from knowledge import graph
6
+ from pii import derisk
7
+
8
+ # Define the Google Analytics script
9
+ head = """
10
+ <!-- Google tag (gtag.js) -->
11
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-SRX9LDVBCW"></script>
12
+ <script>
13
+ window.dataLayer = window.dataLayer || [];
14
+ function gtag(){dataLayer.push(arguments);}
15
+ gtag('js', new Date());
16
+
17
+ gtag('config', 'G-SRX9LDVBCW');
18
+ </script>
19
+ """
20
+
21
+ with gr.Blocks(head=head) as demo:
22
  with gr.Tab("RAG"):
23
  gr.Markdown("""
24
+ Links:
 
 
 
 
 
 
25
  ------------
26
+ - https://huggingface.co/spaces/kevinhug/clientX
27
+ - https://kevinwkc.github.io/davinci/
 
 
 
 
 
 
 
 
28
  """)
29
 
30
 
31
  gr.Markdown("""
32
  Objective: Recommend RBC product based on persona.
33
  ================================================
34
+ - Retrieval: Public RBC Product Data
35
+ - Recommend: RBC Product
36
  """)
37
  in_verbatim = gr.Textbox(label="Verbatim")
38
  out_product = gr.Textbox(label="Product")
 
47
  btn_recommend=gr.Button("Recommend")
48
  btn_recommend.click(fn=rbc_product, inputs=in_verbatim, outputs=out_product)
49
 
50
+ gr.Markdown("""
51
+ Marketing
52
+ ------------
53
+ - GraphRAG: Models customer-product relationship networks for next-best-action predictions
54
+ - DSPy: Optimizes cross-sell/upsell prompt variations through A/B testing
55
+
56
+ Risk & Audit
57
+ ------------
58
+ - GraphRAG: Maps transactional relationships into dynamic knowledge graphs to detect multi-layered fraud patterns
59
+ - Tool Use: Integrates fraud detection APIs, anomaly scoring models, and regulatory compliance checkers
60
+ - DSPy: Optimizes fraud explanation prompts for regulatory reporting
61
+ """)
62
+
63
  with gr.Tab("Tool Use"):
64
  gr.Markdown("""
65
  Objective: Recommend financial product based on persona for competitive analysis, product feature discovery
66
  ================================================
67
+ - Retrieval: Public Product Data using Tavily Search
68
+ - Recommend: Competition Product
 
69
  """)
70
  in_verbatim = gr.Textbox(label="Verbatim")
71
  out_product = gr.Textbox(label="Product")
 
83
  gr.Markdown("""
84
  Objective: Create a Marketing Plan based on persona.
85
  =======================
86
+ - Reasoning from context, answering the question
87
  """)
88
 
89
  marketing = """
 
154
  btn_recommend = gr.Button("Graph It!")
155
  btn_recommend.click(fn=graph, inputs=in_verbatim, outputs=out_product)
156
 
157
+
158
with gr.Tab("pii masking"):
    gr.Markdown("""
Objective: pii data removal
================================================
    """)
    # FIX: label typo "Peronal Info" -> "Personal Info" (user-facing text).
    in_verbatim = gr.Textbox(label="Personal Info")
    out_product = gr.Textbox(label="PII")

    # Pre-canned example the user can click to populate the input box.
    gr.Examples(
        [
            [
                """
He Hua (Hua Hua) Director
+86-28-83505513

Alternative Address Format:
Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
"""
            ]
        ],
        [in_verbatim]
    )
    # derisk (pii.py) extracts PII via the LLM and returns indented JSON,
    # which lands in the "PII" output textbox.
    btn_recommend = gr.Button("Mask PII")
    btn_recommend.click(fn=derisk, inputs=in_verbatim, outputs=out_product)
183
+
184
  demo.launch(allowed_paths=["./"])
knowledge.py CHANGED
@@ -1,33 +1,8 @@
 
1
 
2
  import instructor
3
-
4
- from pydantic import BaseModel, Field
5
- from typing import List
6
  from graphviz import Digraph
7
-
8
- class Node(BaseModel, frozen=True):
9
- """
10
- Node representing concept in the subject domain
11
- """
12
- id: int
13
- label: str = Field(..., description = "description of the concept in the subject domain")
14
- color: str
15
-
16
- class Edge(BaseModel, frozen=True):
17
- """
18
- Edge representing relationship between concepts in the subject domain
19
- """
20
- source: int = Field(..., description = "source representing concept in the subject domain")
21
- target: int = Field(..., description = "target representing concept in the subject domain")
22
- label: str = Field(..., description = "description representing relationship between concepts in the subject domain")
23
- color: str = "black"
24
-
25
- class KnowledgeGraph(BaseModel):
26
- """
27
- graph representation of concepts in the subject domain
28
- """
29
- nodes: List[Node] = Field(..., default_factory=list)
30
- edges: List[Edge] = Field(..., default_factory=list)
31
 
32
  from groq import Groq
33
  import os
@@ -37,7 +12,6 @@ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
37
 
38
  # Enable instructor patches for Groq client
39
  client = instructor.from_groq(client)
40
- llm='llama-3.1-8b-instant' #"llama3.2", #
41
  """
42
  from openai import OpenAI
43
  client = instructor.from_openai(
@@ -48,46 +22,132 @@ client = instructor.from_openai(
48
  mode=instructor.Mode.JSON,
49
  )
50
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def generate_graph(q, input) -> KnowledgeGraph:
52
- return client.chat.completions.create(
53
  model=llm,
54
  max_retries=5,
55
  messages=[
56
- {
57
- "role": "user",
58
- "content": f"Help me understand the following by describing it as a detailed knowledge graph: ### Question: {q} ### Context: {input}",
59
- }
 
 
 
 
 
 
 
 
 
 
60
  ],
61
- response_model=KnowledgeGraph,
62
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
- class Issue(BaseModel):
66
- "Break down Issue as sub issues"
67
- question: str
68
-
69
- class IssuePlan(BaseModel):
70
- "List of Issue"
71
- issue_graph: List[Issue] = []
72
-
73
-
74
- def expandIssue(input) -> IssuePlan:
75
- return client.chat.completions.create(
76
- model=llm,
77
- max_retries=10,
78
- messages=[
79
- {
80
- "role": "system",
81
- "content": "As a Mckinsey Consultant create a framework that relevant to the topic, list all issues.",
82
- }, {
83
- "role": "user",
84
- "content": f"Question: {input}",
85
- },
86
- ],
87
- response_model=IssuePlan,
88
- )
89
  def graph(query):
90
- queryx = expandIssue(query)
91
- ctx = ", ".join([q.question for q in queryx.issue_graph])
92
- graph = generate_graph(query, ctx)
93
- return graph.json()
 
1
+ from typing import List
2
 
3
  import instructor
 
 
 
4
  from graphviz import Digraph
5
+ from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  from groq import Groq
8
  import os
 
12
 
13
  # Enable instructor patches for Groq client
14
  client = instructor.from_groq(client)
 
15
  """
16
  from openai import OpenAI
17
  client = instructor.from_openai(
 
22
  mode=instructor.Mode.JSON,
23
  )
24
  """
25
+ llm = 'llama-3.1-8b-instant' if os.getenv("GROQ_API_KEY") else "llama3.2"
26
+
27
class Node(BaseModel, frozen=True):
    """
    Node representing concept in the subject domain
    """
    # NOTE: Field descriptions (and this docstring) are emitted into the JSON
    # schema that instructor sends to the LLM, so they double as prompt text —
    # do not edit them casually.
    id: int = Field(...,
                    description="unique id of the concept in the subject domain, used for deduplication, design a scheme allows multiple concept")
    label: str = Field(..., description="description of the concept in the subject domain")
    # Default graphviz colour used by KnowledgeGraph.draw().
    color: str = "orange"


class Edge(BaseModel, frozen=True):
    """
    Edge representing relationship between concepts in the subject domain, source depends on target
    """
    # source/target reference Node.id values; frozen=True makes instances
    # hashable so sets can deduplicate them in KnowledgeGraph.update().
    source: int = Field(..., description="source representing concept in the subject domain")
    target: int = Field(..., description="target representing concept in the subject domain")
    label: str = Field(..., description="description representing relationship between concepts in the subject domain")
    # Default graphviz colour used by KnowledgeGraph.draw().
    color: str = "black"
45
+
46
+
47
+ from typing import Optional
48
+
49
+
50
class KnowledgeGraph(BaseModel):
    """
    KnowledgeGraph is graph representation of concepts in the subject domain
    """
    # BUG FIX: the original declared `Field(..., default_factory=list)`.
    # `...` marks the field required (a default), and pydantic rejects
    # combining a default with default_factory at class-definition time.
    nodes: Optional[List[Node]] = Field(default_factory=list)
    edges: Optional[List[Edge]] = Field(default_factory=list)

    def update(self, other: "KnowledgeGraph") -> "KnowledgeGraph":
        """Updates the current graph with the other graph, deduplicating nodes and edges."""
        # nodes/edges are Optional, so treat None as "no entries" rather than
        # crashing on `None + list`. Node/Edge are frozen (hashable), which is
        # what makes set() deduplication possible here.
        return KnowledgeGraph(
            nodes=list(set((self.nodes or []) + (other.nodes or []))),
            edges=list(set((self.edges or []) + (other.edges or []))),
        )

    def draw(self, prefix: str = "knowledge_graph"):
        """Render the graph to `<prefix>.png` using graphviz."""
        dot = Digraph(comment="Knowledge Graph")

        for node in (self.nodes or []):
            dot.node(str(node.id), node.label, color=node.color)

        for edge in (self.edges or []):
            dot.edge(
                str(edge.source), str(edge.target), label=edge.label, color=edge.color
            )
        # NOTE(review): view=True tries to open a desktop image viewer; on a
        # headless host (e.g. HF Spaces) that can raise — confirm whether
        # view=False is intended. Left unchanged to preserve behavior.
        dot.render(prefix, format="png", view=True)
75
+
76
+
77
+ from typing import Iterable
78
+ from textwrap import dedent
79
+
80
+
81
def generate_graph(q, input) -> KnowledgeGraph:
    """
    Describe question *q* (with supporting context *input*) as a KnowledgeGraph.

    Uses the instructor-patched Groq client: passing
    `response_model=KnowledgeGraph` makes the client retry (max_retries=5)
    until the model's output validates against the KnowledgeGraph schema.
    """
    return client.chat.completions.create(
        model=llm,
        max_retries=5,
        messages=[
            {
                "role": "user",
                # NOTE(review): the first line of this literal carries no
                # leading whitespace, so textwrap.dedent finds no common
                # prefix and leaves the indented lines as-is — confirm the
                # intended prompt layout.
                "content": dedent(f"""Help me understand the following by describing it as a detailed knowledge graph:
                ### Question: {q}
                ### Context: {input}
                ### Instruction:
                Generate at least 5 concepts
                Generate at least 3 relationship
                ### Output Format:
                Node with id, label for description of the concept
                Edge with source's id, target's id, label for description of the relationship between source concept and target concept
                """),

            }
        ],
        response_model=KnowledgeGraph)
102
+
103
+
104
class Subissue(BaseModel):
    # One MECE sub-issue: a title plus its bullet points. The Field
    # description feeds the JSON schema instructor shows the LLM.
    subissue_title: str
    point: List[str] = Field(default_factory=list, description="Specific aspect or component of the subissue")


def expandIssue(input) -> Iterable[Subissue]:
    """
    MECE-decompose the question *input* into sub-issues via the LLM.

    Returns an iterable of Subissue models; instructor validates the output
    and retries up to 3 times, and temperature=0.1 keeps the decomposition
    stable across calls.
    """
    response = client.chat.completions.create(
        model=llm,
        max_retries=3,
        response_model=Iterable[Subissue],
        temperature=0.1,
        messages=[

            {
                "role": "user",
                # FIX: prompt typo "which each new point" -> "with each new
                # point" so the formatting instruction reads correctly.
                "content": dedent(f"""
                As a McKinsey Consultant, perform MECE decomposition of the question.
                ### Requirements
                1. Return 3 subissues minimum
                2. Each sub-issue has 3 bullet points, with each new point beginning with a *
                3. Use EXACT format:

                - [Sub-issue 1.1 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]
                - [Sub-issue 1.2 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]
                - [Sub-issue 1.3 title]
                  * [point 1]
                  * [point 2]
                  * [point 3]

                4. return nothing else
                ### Question: {input}
                """),
            },
        ],
    )

    return response
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
def graph(query):
    """
    Decompose *query* into MECE sub-issues, then render question + sub-issues
    as a knowledge graph, returned as a JSON string (displayed by the UI).
    """
    subissues = expandIssue(query)

    # str() flattens the structured sub-issue list into plain-text context
    # for the graph-generation prompt.
    knowledge = generate_graph(query, str(subissues))
    # FIX: renamed the local (it shadowed this function's own name) and use
    # model_dump_json() for consistency with pii.py in this same commit —
    # .json() is the deprecated pydantic-v1 spelling.
    return knowledge.model_dump_json()
pii.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import instructor
4
+ from groq import Groq
5
+ from pydantic import BaseModel
6
+
7
+ # Initialize with API key
8
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
9
+
10
+ # Enable instructor patches for Groq client
11
+ client = instructor.from_groq(client)
12
+ """
13
+ client = instructor.from_openai(
14
+ OpenAI(
15
+ base_url="http://localhost:11434/v1",
16
+ api_key="ollama",
17
+ ),
18
+ mode=instructor.Mode.JSON,
19
+ )
20
+ """
21
+ llm = 'llama-3.1-8b-instant' if os.getenv("GROQ_API_KEY") else "llama3.2"
22
+
23
+
24
class PIIData(BaseModel):
    # One extracted PII item: its position/order in the document, its
    # category (name, phone, address, ...), and the raw matched text.
    index: int
    data_type: str
    pii_value: str


class PIIExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """
    private_data: list[PIIData]
    chain_of_thought: str

    def scrub_data(self, content):
        """
        Iterates over the private data and replaces the value with a placeholder in the form of
        <{data_type}_{i}>
        """

        for i, data in enumerate(self.private_data):
            # BUG FIX: guard against an empty extraction — str.replace("")
            # inserts the placeholder between every character of the document.
            if not data.pii_value:
                continue
            content = content.replace(data.pii_value, f"<{data.data_type}_{i}>")

        return content
47
+
48
+
49
def derisk(content) -> str:
    """
    Extract PII from *content* via the LLM and return the PIIExtraction
    serialized as indented JSON (this string is what the Gradio output
    textbox displays).

    NOTE(review): the original annotated -> PIIExtraction, but the chained
    .model_dump_json() means callers actually receive a str.
    """
    return client.chat.completions.create(
        model=llm,
        response_model=PIIExtraction,
        temperature=0.2,
        messages=[
            {
                "role": "system",
                "content": "You are a world class international PII scrubbing model, perform data preprocess include standardization, stop word removal, punctuation removal...to enhance signal to noise ratio for name, phone, address, email, id...etc. Extract the PII data from the following document",

            }, {
                "role": "user",
                # BUG FIX: was `"content": {content}` — a one-element *set*,
                # not a string; message content must be the raw text.
                "content": content,
            }
        ]).model_dump_json(indent=2)
64
+
65
+
66
+ if __name__ == '__main__':
67
+ ESSAY = """
68
+ He Hua (Hua Hua) Director
69
70
+ +86-28-83505513
71
+
72
+ Alternative Address Format:
73
+ Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
74
+
75
+
76
+ Best Viewing: Before 9:00 AM during summer hours (7:30 AM-5:00 PM)
77
+
78
+ Caretaker: Tan Jintao ("Grandpa Tan")
79
+
80
+ Additional Contacts
81
+ Charitable Donations: +86-28-83505513
82
+ Dining Reservations: +86-17311072681
83
+ """
84
+
85
+ print(derisk(ESSAY))
86
+ # print(pii_leak.model_dump_json(indent=2))
87
+ # print(pii_leak.scrub_data(ESSAY))