File size: 18,003 Bytes
f1d6020
 
 
 
9a79fc6
0dcf7a4
c169ce1
9a79fc6
f1d6020
13ebfcb
 
 
a4a4bd1
669119c
13ebfcb
669119c
 
 
 
13ebfcb
 
 
 
 
a4a4bd1
24b7694
 
 
204fe88
0fc5ade
204fe88
da9cbbc
24b7694
 
515fc3a
204fe88
 
 
24b7694
 
 
 
 
 
 
 
 
 
 
 
515fc3a
c1afbf3
da9cbbc
204fe88
da9cbbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246fbe8
 
24b7694
a1d18f3
 
 
 
 
 
 
 
839da70
f1d6020
daa55b3
f1d6020
 
a1d18f3
 
 
b176344
64fc514
bb4a4eb
281e32d
 
bb4a4eb
 
 
 
 
64fc514
2671dc3
 
 
 
64fc514
281e32d
a1d18f3
 
 
b5dc76c
 
 
 
8dc0d1e
b5dc76c
9a79fc6
 
 
a1d18f3
f7250d5
a1d18f3
f7250d5
8a8cea1
f7250d5
 
 
 
 
8a8cea1
 
f7250d5
a1d18f3
93315e4
a1d18f3
64aff5e
a1d18f3
 
281e32d
 
 
87e5105
616d40e
f03da05
a1d18f3
f7250d5
a1d18f3
2e1e611
 
839da70
5b876de
8a8cea1
a1d18f3
ae1178b
a1d18f3
bb4a4eb
 
 
 
7af2ca1
839da70
a1d18f3
 
 
ae1178b
 
 
 
598b013
ae1178b
 
 
 
 
a1d18f3
839da70
 
 
 
33ed4ca
 
6446f6c
ec63e17
9291158
0ad6e99
9291158
33ed4ca
 
 
 
 
2af935d
78932d3
349b99e
 
 
 
78932d3
 
 
349b99e
78932d3
 
839da70
 
 
 
736fdc2
14e9611
 
 
 
 
33ed4ca
839da70
33ed4ca
 
 
 
 
839da70
14e9611
a1d18f3
 
2fd5b7f
 
 
 
 
669119c
a684eb2
669119c
 
 
a684eb2
669119c
 
 
 
b5dc76c
ca8df9e
c559868
f808f68
1616812
 
 
553ab50
 
c1afbf3
553ab50
319b0bc
 
 
 
 
 
389df15
d4a8b37
 
319b0bc
c1afbf3
319b0bc
 
 
 
 
 
553ab50
 
 
 
 
 
 
ae5a310
553ab50
319b0bc
 
ae5a310
4542e54
ae5a310
 
 
 
b5dc76c
c1afbf3
37220b6
ae5a310
553ab50
ae5a310
 
 
 
 
 
37220b6
ae5a310
553ab50
ae5a310
 
 
 
 
 
 
553ab50
ae5a310
 
 
 
 
 
498e3e2
2e09fa8
 
ae5a310
 
 
 
 
 
076eeb7
ae5a310
 
 
13b84a9
 
 
ae5a310
 
 
076eeb7
ae5a310
 
 
076eeb7
ae5a310
 
 
 
 
 
 
 
bec1297
ae5a310
bec1297
ae5a310
 
 
 
 
 
c1afbf3
ae5a310
 
 
 
bec1297
ae5a310
 
a6b6a76
6427a71
 
ca8df9e
b5dc76c
2fd5b7f
bb4a4eb
 
 
7af2ca1
bb4a4eb
a1d18f3
7af2ca1
a1d18f3
ae1178b
 
 
 
 
598b013
ae1178b
 
 
 
 
2d72659
a8ef33b
2d72659
a8ef33b
2d72659
a8ef33b
 
 
2d72659
a8ef33b
 
2d72659
 
 
204fe88
2d72659
87e5105
2d72659
246fbe8
2d72659
 
267d883
2d72659
bb4a4eb
a8ef33b
ccaec75
da9cbbc
bb4a4eb
da9cbbc
8b3f79b
 
 
 
2af935d
8b3f79b
da9cbbc
967792a
8b3f79b
 
967792a
 
da9cbbc
13ebfcb
2af935d
8b3f79b
13ebfcb
 
 
8b3f79b
349b99e
 
 
 
 
 
8b3f79b
 
 
5ffc728
 
8b3f79b
bb4a4eb
ccaec75
13ebfcb
 
 
18fddd0
13ebfcb
 
 
18fddd0
13ebfcb
 
 
18fddd0
13ebfcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d00de8
13ebfcb
0d00de8
13ebfcb
 
 
 
 
 
 
 
 
 
 
 
 
 
031daa2
13ebfcb
c1afbf3
29221b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
import gradio as gr
# from langchain.vectorstores import Chroma


'''
https://huggingface.co/spaces/kevinhug/clientX
https://hits.seeyoufarm.com/
'''

'''
PORTFOLIO OPTIMIZATION
'''
from aiOpt import Asset
from aiSum import Sum
import numpy as np

def summarize(text):
  """Summarize *text* with the project's Sum model wrapper (aiSum)."""
  summarizer = Sum()
  return summarizer.summarize(text)

def optimize(cost, prob, its):
  """Approximate an optimal asset allocation via random-restart search.

  Parameters
  ----------
  cost : str
      Whitespace-separated asset costs, e.g. "4 30 2 3 5".
  prob : str
      Whitespace-separated success probabilities, one per asset.
  its : str | int
      Number of random-restart iterations (coerced with int()).

  Returns
  -------
  Whatever Asset.random_restart returns (project-defined allocation).
  """
  # np.asfarray was deprecated in NumPy 1.25 and removed in 2.0;
  # np.asarray(..., dtype=float) is the supported equivalent.
  costs = np.asarray(cost.split(), dtype=float)
  probs = np.asarray(prob.split(), dtype=float)
  s = Asset(costs, probs)
  return s.random_restart(int(its))

'''
TIME SERIES ANALYTICS
'''

import pandas as pd

import plotly.express as px

def trend(t):
  '''
  Scatter-plot each ticker's price trend against its volume trend.

  The frame is served from a precomputed pickle. It was built offline
  from yfinance data as the difference of 9-bar and 42-bar rolling
  means of Close and Volume, standardized with StandardScaler, keeping
  the latest observation per ticker. Live downloading is intentionally
  disabled here, so *t* is accepted only for interface compatibility
  with the Gradio textbox input.
  '''
  # One row per ticker with standardized Close_MA / Volume_MA columns.
  snapshot = pd.read_pickle("./ts/data.pkl")
  # https://www.gradio.app/docs/plot -- gr.Plot renders a returned Plotly figure.
  figure = px.scatter(snapshot, x="Close_MA", y="Volume_MA", color='ticker')
  figure.update_layout(
    title="Top Right is the Growth Industry",
    xaxis_title="Trend in Price",
    yaxis_title="Trend in Volume",
  )
  return figure

'''
SIMILAR VECTOR DB SEARCH
'''
import chromadb
# Persistent on-disk vector store shipped alongside the app; the "banks"
# collection holds pre-embedded customer-review verbatims used by similar().
client = chromadb.PersistentClient(path="chroma.db")

db = client.get_collection(name="banks")

def similar(issue):
  """Return the 5 nearest stored verbatims to *issue* from the "banks" collection."""
  # Read-only access to the module-level collection; no `global` needed.
  return db.query(query_texts=issue, n_results=5)

'''
FINE TUNE LLM LIKE SCORE
'''
from fastai.text.all import *


import pathlib
# Pickled fastai learner fine-tuned on the banks review dataset.
p=pathlib.Path('./banks_txt_like.pkl').resolve()

'''
NotImplementedError: cannot instantiate ‘WindowsPath’ on your system
'''
import platform
# Workaround for the error above: a learner pickled on one OS embeds that
# OS's concrete Path class, which cannot be instantiated on the other OS.
# Aliasing the foreign Path class to the native one lets unpickling succeed.
# NOTE(review): `plt` here is platform.system(), not matplotlib.pyplot.
plt = platform.system()
if plt == 'Windows':
  pathlib.PosixPath = pathlib.WindowsPath
else:
  pathlib.WindowsPath = pathlib.PosixPath

# Must run after the monkey-patch so the pickle loads on either OS.
learn = load_learner(p)
def like(issue):
  """Predict the fine-tuned "like score" class label for *issue*."""
  # learn.predict returns (decoded label, class index, probabilities);
  # only the decoded label is surfaced to the UI.
  prediction, _idx, _probs = learn.predict(issue)
  return prediction

'''
EXPLAINABLE AI
'''


'''
https://www.gradio.app/docs/interface
'''

# Top-level Gradio layout: an intro header followed by one tab per capability.
with gr.Blocks() as demo:

  '''
  https://hits.seeyoufarm.com/
  https://dash.elfsight.com
  '''
  # Visitor-counter HTML snippet; currently unused (gr.HTML call is disabled below).
  counter="""
  <script src="https://static.elfsight.com/platform/platform.js" data-use-service-core defer></script>
  <div class="elfsight-app-5f3e8eb9-9103-490e-9999-e20aa4157dc7" data-elfsight-app-lazy></div>
  
  ![Visitor Count](https://profile-counter.glitch.me/{YOUR USER}/count.svg)
  """

  # gr.HTML(counter)


  # Landing header: capability overview, contact info, dataset source, hit badge.
  gr.Markdown("""Enhancing Customer Engagement and Operational Efficiency with NLP
  =========
  
  1) Semantic Similarity Document Search (SSDS)
  2) Fine Tune LLM
  3) Trading Analytic: Using Time Series Data to Identify Growth
  4) Portfolio Optimization with cost, probabilities
  5) Explainable AI
  
  #### Data Scientist: Kevin Wong, [email protected], 416-903-7937

  ##### Open source ml bank dataset, 
  __I'm just using a small sample of this data set for demo__
https://www.kaggle.com/datasets/trainingdatapro/20000-customers-reviews-on-banks/?select=Banks.csv

[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2Fkevinhug%2FclientX&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com)
  """)

  # Tab 1: vector-database similarity search over stored review verbatims.
  with gr.Tab("Semantic Similarity Document Search (SSDS)"):
    in_similar = gr.Textbox(placeholder="having credit card problem",
                            label="Issue",
                            info="issue you want to explore about"
                            )
    # Raw chromadb query result rendered as JSON.
    out_similar = gr.JSON(label="Similar Verbatim")

    btn_similar = gr.Button("Find Similar Verbatim")
    btn_similar.click(fn=similar, inputs=in_similar, outputs=out_similar)

    # Clickable example queries that populate the input textbox.
    gr.Examples(
      [
        ["having credit card problem"],
        ["low interest credit card"],
        ["loan"],
        ["upset customer"],
        ["what is the password"],
      ],
      [in_similar]
    )
    # Marketing/use-case copy for this tab.
    gr.Markdown("""
Description:
=======
In today's dynamic financial landscape, the Semantic Similarity Document Search (SSDS) capability is a practical innovation to improve client experience, marketing leads, and sentiment analysis. As a Data Scientist with a decades in the financial industry, I see the value of SSDS in action.

Investment Portfolio Construction/Marketing Leads:
------
To enhance marketing strategies, SSDS identifies market trends and consumer preferences, such as the demand for low-interest credit cards, and GIC. It's a treasure trove for refining our product offerings to the targeted customer according to their credit score, risk appetite, demographic, collateral, capital, and economic conditions, enhancing the lift and efficiency of the recommendation process.

Combining **SingleStore MEMSQL/Kafka structured streaming** for a real-time recommendation for closing sales at the right time in the right channel.

Optimize your Asset Allocation with your objective function and cost function.

### issue:
  - low interest credit card
  - GIC
  
AML/KYC/Compliance/Audit/Cyber Security/Fraud Analytics/Observability:
------
### vite vue chart.js UI demo
https://kevinwkc.github.io/davinci/  

### Proactive Detection: Identify potential fraud threats and vulnerabilities in real-time.

Customer-Centric Approach: Gain insights into customer concerns, allowing us to address them promptly.

#### issue:
  - what is the password

Client Experience:
------
When a client faces a bad experience, SSDS helps us swiftly locate relevant documents to understand and address their concerns, be it credit card issues, late payment fees, or credit score drops.

### issue:
  - having bad client experience
  - having credit card problem
  - late payment fee
  - credit score dropping
  
Sentiments:
------
SSDS tracks customer sentiment, empowering us to swiftly respond to upset customers. It ensures we address their issues promptly, enhancing trust and loyalty.
With no need for jargon, SSDS delivers tangible value to our fintech operations. It's about staying agile, informed, and customer-centric in a rapidly changing financial world.

### issue:   
  - upset customer

  
    """)

    with gr.Accordion("Future Improvement"):
      gr.Markdown("""
    tuning the distance for use case
      """)

  # Tab 2: abstractive summarization of a long verbatim via summarize().
  with gr.Tab("Generative AI Summarization"):
    # The placeholder doubles as a worked example of a long complaint.
    in_sum = gr.Textbox(placeholder="Customer service was terrible. Called the number for accounts and forced to listen to advertisements from their partners with no escape. When it was finally over it just went to a loop with a number to call for more promotional offers. Called a different number and got transferred from a human back to their answering service-- which hung up on me.", lines=7,
                            label="Long Text",
                            info="Summarization"
                            )
    out_sum = gr.Textbox(label="Summarized Verbatim")

    # Fix: button label was "Find Similar Verbatim", copy-pasted from the
    # SSDS tab; this button actually triggers summarization.
    btn_sum = gr.Button("Summarize Verbatim")
    btn_sum.click(fn=summarize, inputs=in_sum, outputs=out_sum)

  # Tab 3: static SHAP/XGBoost interpretation write-up plus the raw dataset.
  with gr.Tab("Explainable AI"):
    # Full housing dataset displayed at the bottom of the tab.
    df=pd.read_csv("./xgb/re.csv")

    # Narrative with pre-rendered SHAP plots served via launch(allowed_paths=...).
    gr.Markdown("""
    


CREDIT DEFAULT RISK INTERPRETATION
=======================
Explain by Context
----------
- Sometimes, understanding why an individual defaults requires shifting to a credit-healthy background, altering the baseline E[f(x) | credit healthy] using interventional feature perturbation ([source](https://arxiv.org/pdf/2006.16234.pdf)).

[UCI Machine Learning Repository - Credit Default Dataset](https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset)

![Credit Record Summary](file=./xgb/credit_record.png)
**Observations from a healthy credit background:**
f(x) in probability for logistic regression objective using XGBoost
- base line at 0.2, indicate explain from healthy credit perspective, why this guy is default
- This individual defaults due to high **PAY_0**, despite PAY_AMT5 .
- PAY_0 represents repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above).

![Credit Data Summary](file=./xgb/credit_data.png)
**Insights from a healthy credit background:**
- Default patterns relate to high **PAY_0/PAY_2** (payment delay) and low **LIMIT_BAL** (lack of liquidity).
- LIMIT_BAL signifies the amount of given credit in NT dollars (includes individual and family/supplementary credit).
- BILL_AMT1 indicates the bill statement amount in September, 2005 (NT dollar).


HOME PRICE INTERPRETATION
=======================
This analysis is derived from an XGBoost regression model designed to predict house prices. The model utilizes features such as **dist_subway, age, lat, long,** and **dist_stores**.

Full dataset at the bottom of this tab

Explain by Dataset
----------
- Below are explanation in typical background E[f(x)]

![Summary](file=./xgb/data.png)

**Key insights:**
- **dist_subway** has a significant impact on pricing when at low values (green).
- **dist_store** demonstrates minimal impact on price.
- Higher age correlates with lower prices while lower age raises prices.



Explain by Feature
----------
![Partial Dependence](file=./xgb/feature.png)

**Observations:**
- Prices spike for **distances lower than 900** based on the function f(x).
- Noteworthy **SHAP value at record[20] around 6500**.

    
Explain by Record
----------
![Force](file=./xgb/record.png)

**Contribution to Price:**
- **dist_subway** holds the largest positive contribution to price.
- **Age** follows as the second significant contributor.

Explain by Instance
----------
![Dependence](file=./xgb/instance.png)

**Insights:**
- Around **500 dist_subway**, there's a potential for both positive and negative impacts on price.
- Overall trend: closer proximity to the subway correlates with higher prices.
- An outlier at **6500 distance** from subway negatively impacts price, despite proximity to stores (dist_stores).
    
![1st Decision Tree](file=./xgb/dtree.png)
*Note:  first decision tree within XGBoost.*

Explain by Top 5 Error Example
===============
![Top 5 Error Data](file=./xgb/error_data.png)

**Top Features for Errors:**
- **dist_subway, age** stands out as the top feature impacting the top 5 errors negatively (for young ages).

![Error Record](file=./xgb/error_record.png)
**Top 1 Error:**
- Notably, lat has positive impact
- old age has a negative impact on pricing (top 1 error).


![Error Feature](file=./xgb/error_feature.png)
**Insight from Errors:**
- Further distance from the subway might positively impact pricing for the top 5 errors at around 700

![Error Instance](file=./xgb/error_instance.png)
**Error Instances:**
- Younger age negatively impacts price, while older age positively impacts price for the top 5 errors.

ML Observability
===============
**Visualization with Context:**
[Tableau Visualization](https://public.tableau.com/app/profile/kevin1619/vizzes)


**Data Validation:**
- Led data validation for a new data source using covariate shift and recall methodology for legacy models with circuit breaker pattern with notification.
- Ensured consistency in feature transformation between dev and prod environments.
- Monitor prediction distribution with precision, recall metric.

**Unit Testing/Acceptance Testing:**
- Led unit testing for models, identified logical errors, and improved campaign lift by 50% for small businesses.

**A/B Testing for Lift:**
- Utilized statistical approaches in A/B testing for small business models, ensuring lift met criteria.
- Setting up baseline model, retain evidence of input, output

**File/Log Mining:**
- Led server observability, leveraging event journey maps to understand server downtimes.

**
**Root Cause Analysis:**
- Proficient in employing Six Sigma methodology to trace root causes with established metrics.
    """)


    gr.DataFrame(df)

  with gr.Tab("Fine Tune LLM"):
    in_like = gr.Textbox(placeholder="having credit card problem"   ,                                             label="Issue",
                            info="issue you want to explore about")
    out_like = gr.Textbox(placeholder="like score in range [2 to 248] from fine tuning data",
                          label="like score",
                          info="like score")

    btn_like = gr.Button("Classify Like Score")
    btn_like.click(fn=like, inputs=in_like, outputs=out_like)

    gr.Examples(
      [
        ["having credit card problem"],
        ["low interest credit card"],
        ["loan"],
        ["upset customer"],
        ["what is the password"],
      ],
      [in_like]
    )
    gr.Markdown("""
Smart Insights: Elevating Customer Engagement Through Sentiment Analysis
=========
As a Data Scientist with a decades of financial industry experience, I recognize the paramount importance of staying closely tuned to our customer's needs and opinions. In this app, Fine Tune LLM, we have shown how fine-tuning a Language Model (LLM) on a custom dataset can provide valuable insights into customer sentiment across crucial areas such as service, sales, point of failure, product, and emerging trends.

Objective:
---------
Our aim is to extract meaningful insights from customer interactions to improve our services, products, and overall customer experience. This analysis will help us understand what our customers are discussing and how they feel about different aspects of our business.

Use Case:
- intervene attrition through incentive

    """)
  with gr.Tab("Trading Analyics"):
    in_ts = gr.Textbox(placeholder="XLE XLV XLY XLK XLF XLP XLI XLRE XLU",
                            label="Ticker",
                            info="Technical Difficult: Currently it only works with these tickers due to data pulling constraints"
                            )
    plot = gr.Plot()
    #plot = gr.Plot(label="Identify Trend/Decline Industry")
    btn_ts = gr.Button("Find Trending Industry")
    btn_ts.click(fn=trend, inputs=in_ts, outputs=plot)

    gr.Markdown("""
Maximizing Trading Efficiency: Personalize Your Asset Allocation for Optimal Growth
=========
The industry life cycle is a useful tool for traders to identify growth and decline industries. It describes the evolution of an industry based on its stages of growth and decline 

#### There are four phases of the industry life cycle: introduction, growth, maturity, and decline 
By identifying growth and decline industries, traders can make informed investment decisions and speed up trading by investing in companies that are likely to experience growth in the future and avoiding companies that are likely to experience a decline in the future.

- Long Trader: buy growth industry
- Short Trader: sell decline industry

#### Personalize objective function and cost function for each trader
- cost function can prevent selecting decline industry

  we can use this to filter out blacklisted firms for compliance
  
  we can increase the cost for highly correlated stock for diversity
  
- objective function can identify potential industry
  

#### Personalize UI to fit each trader
  customize UI for secret sauce formula for stock picking:
  - metric: moving average for the price, moving average for volume, ...etc
  - timeframe chain: monthly, weekly, daily, 4h, 15 min

##### tableau portfolio
https://public.tableau.com/app/profile/kevin1619
  
##### vite vue chart.js UI demo
https://kevinwkc.github.io/davinci/  

#### Personalize Alert with Twilio
The trader can set their price to buy at the right price at the right time, without missing the right entry in a high stress environment

#### Keeping record for compliance 
The System can save data for reporting or compliance purpose

    """)



  with gr.Tab("Portfolio Optimization"):
    in_p_cost = gr.Textbox(value="4 30 2 3 5",
                            label="Cost",
                            info="cost for the asset"
                            )
    in_p_prob = gr.Textbox(value="0.3 0.4 0.5 0.6 0.7",
                            label="Probabilities",
                            info="P(success) for the asset"
                            )
    in_p_its = gr.Textbox(value="10",
                            label="Number of Iteration",
                            info="number of trial for optimal"
                            )
    out_p = gr.Textbox(label="Asset Allocation Approx Optimization using AI")

    btn_p = gr.Button("Optimize Asset Allocation")
    btn_p.click(fn=optimize, inputs=[in_p_cost, in_p_prob, in_p_its], outputs=out_p)

    gr.Markdown("""
Objective: To allocate assets in a way that maximizes expected profit while minimizing costs and risks.

Inputs:
- List of available assets and their associated probabilities and costs.

Outputs:
- Allocation of assets that maximizes expected profit while minimizing costs and risks.

Constraints:

Assume volume is bound by 0 to 100

Objective: max SUM [ vol_s * prob_s - cost_s ]

- The total cost of the allocation must not exceed the available budget.
- The risk level of the allocation must not exceed the maximum allowable risk level.
- The profit margin of the allocation must not fall below the minimum allowable profit margin.

Method:
Using search algorithm to find the approx. optimial allocation

Assumptions:
- The probabilities and costs of the assets are known with certainty.
- The expected return and risk level of the allocation are calculated using historical data and statistical models.


    """)

# allowed_paths whitelists local directories Gradio may serve, so the
# file=./xgb/*.png images in the markdown (and ./ts assets) resolve.
demo.launch(allowed_paths=["./xgb","./ts"])