Spaces:
Sleeping
Sleeping
Mustehson
committed on
Commit
·
98f0179
1
Parent(s):
6dda383
Data Summary
Browse files
app.py
CHANGED
@@ -15,11 +15,15 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
15 |
|
16 |
# Height of the Tabs Text Area
|
17 |
TAB_LINES = 8
|
18 |
-
|
|
|
|
|
19 |
md_token = os.getenv('MD_TOKEN')
|
20 |
-
# Connect to DB
|
21 |
conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
|
|
|
|
|
22 |
|
|
|
23 |
models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
|
24 |
"meta-llama/Llama-3.1-70B-Instruct"]
|
25 |
|
@@ -35,13 +39,13 @@ for model in models:
|
|
35 |
continue
|
36 |
|
37 |
llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
|
|
|
38 |
|
39 |
-
|
40 |
-
|
41 |
prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
|
42 |
prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
|
43 |
|
44 |
-
|
45 |
# Get Databases
|
46 |
def get_schemas():
|
47 |
schemas = conn.execute("""
|
@@ -67,10 +71,18 @@ def get_data_df(schema):
|
|
67 |
|
68 |
|
69 |
def format_prompt(df):
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
def process_inputs(inputs) :
|
76 |
print(inputs)
|
@@ -167,6 +179,10 @@ def statistics(df):
|
|
167 |
df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
|
168 |
|
169 |
return df_statistics, df_alerts
|
|
|
|
|
|
|
|
|
170 |
# Main Function
|
171 |
def main(table):
|
172 |
schema = get_table_schema(table)
|
|
|
15 |
|
16 |
# Height of the Tabs Text Area
|
17 |
TAB_LINES = 8
|
18 |
+
|
19 |
+
|
20 |
+
#----------CONNECT TO DATABASE----------
|
21 |
md_token = os.getenv('MD_TOKEN')
|
|
|
22 |
conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
|
23 |
+
#---------------------------------------
|
24 |
+
|
25 |
|
26 |
+
#-------LOAD HUGGINGFACE-------
|
27 |
models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
|
28 |
"meta-llama/Llama-3.1-70B-Instruct"]
|
29 |
|
|
|
39 |
continue
|
40 |
|
41 |
llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
|
42 |
+
#---------------------------------------
|
43 |
|
44 |
+
#-----LOAD PROMPT FROM LANGCHAIN HUB----
|
|
|
45 |
prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
|
46 |
prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
|
47 |
|
48 |
+
#--------------ALL UTILS----------------
|
49 |
# Get Databases
|
50 |
def get_schemas():
|
51 |
schemas = conn.execute("""
|
|
|
71 |
|
72 |
|
73 |
def format_prompt(df):
|
74 |
+
summary_df = pd.DataFrame({
|
75 |
+
"max": df.max(),
|
76 |
+
"min": df.min(),
|
77 |
+
"top": df.mode().iloc[0],
|
78 |
+
"nunique": df.nunique(),
|
79 |
+
"count": df.count(),
|
80 |
+
"dtype": df.dtypes.astype(str)
|
81 |
+
}).reset_index().rename(columns={"index": "column"})
|
82 |
+
return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'),
|
83 |
+
summary=summary_df.to_json(orient='records'))
|
84 |
+
def format_user_prompt(df):
|
85 |
+
return prompt_user_input.format_prompt(data=df.head().to_json(orient='records'))
|
86 |
|
87 |
def process_inputs(inputs) :
|
88 |
print(inputs)
|
|
|
179 |
df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
|
180 |
|
181 |
return df_statistics, df_alerts
|
182 |
+
#---------------------------------------
|
183 |
+
|
184 |
+
|
185 |
+
|
186 |
# Main Function
|
187 |
def main(table):
|
188 |
schema = get_table_schema(table)
|