Mustehson committed on
Commit
98f0179
·
1 Parent(s): 6dda383

Data Summary

Browse files
Files changed (1) hide show
  1. app.py +25 -9
app.py CHANGED
@@ -15,11 +15,15 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
15
 
16
  # Height of the Tabs Text Area
17
  TAB_LINES = 8
18
- # Load Token
 
 
19
  md_token = os.getenv('MD_TOKEN')
20
- # Connect to DB
21
  conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
 
 
22
 
 
23
  models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
24
  "meta-llama/Llama-3.1-70B-Instruct"]
25
 
@@ -35,13 +39,13 @@ for model in models:
35
  continue
36
 
37
  llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
 
38
 
39
-
40
-
41
  prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
42
  prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
43
 
44
-
45
  # Get Databases
46
  def get_schemas():
47
  schemas = conn.execute("""
@@ -67,10 +71,18 @@ def get_data_df(schema):
67
 
68
 
69
  def format_prompt(df):
70
- return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'))
71
- def format_user_prompt(df, user_description):
72
- return prompt_user_input.format_prompt(data=df.head(2).to_json(orient='records'), user_description=user_description)
73
-
 
 
 
 
 
 
 
 
74
 
75
  def process_inputs(inputs) :
76
  print(inputs)
@@ -167,6 +179,10 @@ def statistics(df):
167
  df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
168
 
169
  return df_statistics, df_alerts
 
 
 
 
170
  # Main Function
171
  def main(table):
172
  schema = get_table_schema(table)
 
15
 
16
  # Height of the Tabs Text Area
17
  TAB_LINES = 8
18
+
19
+
20
+ #----------CONNECT TO DATABASE----------
21
  md_token = os.getenv('MD_TOKEN')
 
22
  conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
23
+ #---------------------------------------
24
+
25
 
26
+ #-------LOAD HUGGINGFACE-------
27
  models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
28
  "meta-llama/Llama-3.1-70B-Instruct"]
29
 
 
39
  continue
40
 
41
  llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
42
+ #---------------------------------------
43
 
44
+ #-----LOAD PROMPT FROM LANGCHAIN HUB-----
 
45
  prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
46
  prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
47
 
48
+ #--------------ALL UTILS----------------
49
  # Get Databases
50
  def get_schemas():
51
  schemas = conn.execute("""
 
71
 
72
 
73
  def format_prompt(df):
74
+ summary_df = pd.DataFrame({
75
+ "max": df.max(),
76
+ "min": df.min(),
77
+ "top": df.mode().iloc[0],
78
+ "nunique": df.nunique(),
79
+ "count": df.count(),
80
+ "dtype": df.dtypes.astype(str)
81
+ }).reset_index().rename(columns={"index": "column"})
82
+ return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'),
83
+ summary=summary_df.to_json(orient='records'))
84
+ def format_user_prompt(df):
85
+ return prompt_user_input.format_prompt(data=df.head().to_json(orient='records'))
86
 
87
  def process_inputs(inputs) :
88
  print(inputs)
 
179
  df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
180
 
181
  return df_statistics, df_alerts
182
+ #---------------------------------------
183
+
184
+
185
+
186
  # Main Function
187
  def main(table):
188
  schema = get_table_schema(table)