File size: 5,928 Bytes
246d201 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
common_hypothesis_features = [
'1-2 sentences',
'surprising finding',
'includes numeric concepts',
'includes categorical concepts',
'includes binary concepts',
]
hypothesis_features = [
['requires within-cluster analysis'],
['requires across-cluster analysis'],
['corresponds to a polynomial relationship of some columns'],
['corresponds to a ratio between some columns'],
['requires temporal analysis'],
['relationship is based on descriptive statistics of some columns'],
['requires concepts based on percentage or percentiles'],
['relationship is only applicable to one cluster in the data and not the others'],
]
column_features = [
[
'must have one target column',
'must have quantifiable columns',
'must have a few categorical columns',
'make sure the categorical column values do not contain special characters',
'include a few distractor columns',
]
]
common_pandas_features = [
'must be executable using python `eval` to create the target column in variable `df` (pandas dataframe)',
"for e.g., df['A']**2 + 3*df['B'] + 9, np.where(df['A'] > 3, 'Yes', 'No'), etc.",
'variables in pandas_expression must be from the existing columns listed above',
'variables in pandas_expression must NOT contain the target column itself',
]
pandas_features = [
['expression is a quadratic polynomial'],
['expression is a cubic polynomial'],
['expression is a ratio of existing columns'],
['expression is derived through logical combination of existing columns'],
# workflow
]
pandas_features = [common_pandas_features + p for p in pandas_features]
common_derived_features = [
'1-2 sentences',
'includes numeric concepts',
'includes categorical concepts',
'includes binary concepts',
]
derived_features = [common_derived_features + h for h in hypothesis_features]
hypothesis_features = [common_hypothesis_features + h for h in hypothesis_features]
PROMPT_HYP = """\
Given a dataset topic and description, generate an interesting hypothesis based on \
the provided instructions. Be creative and come up with an unusual finding.
```json
{
"topic": "%s",
"description": "%s",
"hypothesis_features": %s,
"hypothesis": "..."
}```
Give your answer as a new JSON with the following format:
```json
{
"hypothesis": "..."
}
```"""
PROMPT_COL = """\
Given a dataset topic, its description, and a true hypothesis that can be determined from it, \
generate a list of valid columns based on the provided instructions.
```json
{
"topic": "%s",
"description": "%s",
"hypothesis": "%s",
"column_instructions": %s,
"columns": [
{
"col_name": "...", # should be an "_"-separated string
"description": "...",
"data_type": "...", # should be executable using python's `eval` function. E.g., str, float, int, bool
"data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]}
"is_distractor": true/false, # boolean indicating whether this is a distractor that could cause confusion during data analysis
"is_target": true/false # boolean indicating whether this is the target variable for the hypothesis; at least one column should be the target
},
...
],
"pandas_instructions": %s,
"pandas_equation_for_hypothesis": {
"target_col": "...",
"target_col_type": "...",
"target_col_range": {...},
"independent_cols_in_pandas_expression": [], # list of column names that will be used to derive the target column
"pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc.
}
}```
Give your answer as a new JSON with the "columns" and "pandas_equation_for_hypothesis" keys filled using the following format:
```json
{
"columns": [...],
"pandas_equation_for_hypothesis": {...}
}
```"""
PROMPT_DER = """\
Given a dataset topic, description, a true hypothesis that can be determined from the data, \
and a target column from the dataset, generate a hypothesis for the target column using new independent columns not present in the existing columns.
```json
{
"topic": "%s",
"description": "%s",
"hypothesis": "%s",
"existing_columns": %s,
"target_column": "%s",
"new_to_target_instructions": %s,
"new_to_target_hypothesis": "...", # describe a relationship between new columns that explains the target column
"new_columns_for_target": [ # do not repeat any of the existing columns in the dataset
{
"col_name": "...", # should be an "_"-separated string
"description": "...",
"data_type": "...", # should be executable using python's `eval` function. E.g., str, float, int, bool
"data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]}
},
...
],
"pandas_instructions": %s,
"pandas_equation_for_new_to_target_hypothesis": {
"target_col": "...",
"target_col_type": "...",
"target_col_range": {...},
"independent_cols_in_pandas_expression": [], # list of column names from new_columns_for_target that will be used to derive target_col
"pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc.
}
}```
Give your answer as a new JSON with the "new_to_target_hypothesis", "new_columns_for_target", and \
"pandas_equation_for_new_to_target_hypothesis" keys filled using the following format:
```json
{
"new_to_target_hypothesis": "...",
"new_columns_for_target": [...],
"pandas_equation_for_new_to_target_hypothesis": {...}
}
```"""
|