TQA task bugs fix
app.py CHANGED
@@ -782,7 +782,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
         <div style='font-size: 3rem'>➡️</div>
     </div>
     """
-    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model]for model in model_list]
+    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
    #samples = us.generate_some_samples(input_data["data_path"], row["tbl_name"])
    model_to_send = None if not flag_TQA else model

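Both yield fixes in this commit (here and at line 856 below) only add the missing space in the starred comprehension. For context, the surrounding pattern is a Gradio generator handler: every `yield` has to supply one value per output component wired to the event, in order. A minimal sketch of that contract, with hypothetical component roles inferred from the call:

```python
import gradio as gr

# Hypothetical sketch of the yield-per-output contract used here: the event's
# outputs are five Markdown/Image slots, the metrics dataframe, and one
# dataframe per model, so every yield must produce exactly that many values.
def question_step(load_text, display_question, metrics_conc, predictions_dict, model_list):
    yield (
        gr.Markdown(),                  # status slot, left unchanged this step
        gr.Image(),                     # image slot
        gr.Markdown(load_text),         # loading text
        gr.Markdown(display_question),  # current question
        gr.Markdown(),                  # prediction slot, not ready yet
        metrics_conc,                   # metrics DataFrame
        *[predictions_dict[m] for m in model_list],  # one predictions table per model
    )
```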
@@ -805,12 +805,14 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
    else: task="SP"
    start_time = time.time()
    response = predictor.make_prediction(
-
-
-
-
-
+        question=question,
+        db_schema=db_schema_text,
+        model_name=model,
+        prompt=f"{prompt_to_send}",
+        task=task
    )
+    #if flag_TQA: response = {'response_parsed': "[['Alice'],['Bob'],['Charlie']]", 'cost': 0, 'response': "[['Alice'],['Bob'],['Charlie']]"} # TODO remove this line
+    #else : response = {'response_parsed': "SELECT * FROM 'MyTable'", 'cost': 0, 'response': "SQL_QUERY"}
    end_time = time.time()
    prediction = response['response_parsed']
    price = response['cost']
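The call now passes its inputs to `predictor.make_prediction` as keyword arguments, and the two commented-out lines after it are leftover mocks that document the response shape the handler depends on. A minimal sketch of that shape, assuming only the keys actually read downstream (`response_parsed`, `cost`) plus the `response` field shown in the mock comments:

```python
import time

# Hedged sketch: the handler only relies on these keys of the predictor's return value.
start_time = time.time()
response = {
    "response_parsed": "[['Alice'], ['Bob'], ['Charlie']]",  # parsed answer (TQA) or a SQL string (SP)
    "cost": 0.0,                                             # cost of the API call
    "response": "raw model output",                          # unparsed text, per the mock comments
}
end_time = time.time()

prediction = response["response_parsed"]
price = response["cost"]
elapsed = end_time - start_time  # wall-clock time of the prediction
```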
@@ -853,7 +855,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
    predictions_dict[model] = pd.concat([predictions_dict[model], new_row], ignore_index=True)

    # yield gr.Textbox(), gr.Textbox(prediction), *[predictions_dict[model] for model in input_data["models"]], None
-    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model]for model in model_list]
+    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
    # END
    eval_text = generate_eval_text("Evaluation")
@@ -874,16 +876,16 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
    metrics_df_model['model'] = model
    metrics_conc = pd.concat([metrics_conc, metrics_df_model], ignore_index=True)

+    if 'VES' not in metrics_conc.columns and 'valid_efficency_score' not in metrics_conc.columns:
+        metrics_conc['VES'] = 0
+        metrics_conc['valid_efficency_score'] = 0
+
    if 'valid_efficency_score' not in metrics_conc.columns:
        metrics_conc['valid_efficency_score'] = metrics_conc['VES']

    if 'VES' not in metrics_conc.columns:
        metrics_conc['VES'] = metrics_conc['valid_efficency_score']

-    if 'VES' not in metrics_conc.columns and 'valid_efficency_score' not in metrics_conc.columns:
-        metrics_conc['VES'] = 0
-        metrics_conc['valid_efficency_score'] = 0
-
    eval_text = generate_eval_text("End evaluation")
    yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]

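The reordering is the actual bug fix here: the both-columns-missing case has to be zero-filled before the single-column fallbacks run, otherwise `metrics_conc['valid_efficency_score'] = metrics_conc['VES']` raises a KeyError whenever neither column exists. A standalone sketch of the corrected fallback, wrapped in a hypothetical helper:

```python
import pandas as pd

def ensure_ves_columns(metrics_conc: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper mirroring the committed ordering.
    # 1) Neither column exists: create both with 0 so the copies below are safe.
    if 'VES' not in metrics_conc.columns and 'valid_efficency_score' not in metrics_conc.columns:
        metrics_conc['VES'] = 0
        metrics_conc['valid_efficency_score'] = 0

    # 2) Exactly one of the two exists: mirror it into the missing one.
    if 'valid_efficency_score' not in metrics_conc.columns:
        metrics_conc['valid_efficency_score'] = metrics_conc['VES']
    if 'VES' not in metrics_conc.columns:
        metrics_conc['VES'] = metrics_conc['valid_efficency_score']

    return metrics_conc

# Example: a frame with neither column gets both added, filled with 0.
print(ensure_ves_columns(pd.DataFrame({'model': ['m1']})))
```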
@@ -1004,7 +1006,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
        ]
    )

-
 ##########################################
 #     METRICS VISUALIZATION SECTION      #
 ##########################################
@@ -1796,12 +1797,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
    df = calculate_average_metrics(df, selected_metrics)

    if flag_TQA:
-        df["target_answer"] = df["target_answer"].apply(
-            lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
-        )
-        df["predicted_answer"] = df["predicted_answer"].apply(
-            lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
-        )
+        df["target_answer"] = df["target_answer"] = df["target_answer"].apply(lambda x: "[" + ", ".join(map(str, x)) + "]")

        worst_cases_df = df.groupby(['model', 'tbl_name', 'test_category', 'question', 'target_answer', 'predicted_answer', 'answer', 'sql_tag'])['avg_metric'].mean().reset_index()
    else:
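In the TQA branch, `target_answer` is now rendered as a single bracketed string instead of the removed " - " / "," joined form (the committed line also keeps a redundant double assignment, which is harmless). A small illustration of the two formats on hypothetical sample data:

```python
# Hypothetical sample: one TQA target answer as a list of rows.
x = [['Alice'], ['Bob'], ['Charlie']]

# New formatting (this commit): stringify each row and wrap the whole thing in brackets.
new_format = "[" + ", ".join(map(str, x)) + "]"
print(new_format)  # [['Alice'], ['Bob'], ['Charlie']]

# Removed formatting: comma-join inside each row, then ' - ' between rows.
old_format = " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
print(old_format)  # Alice - Bob - Charlie
```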
@@ -1824,7 +1820,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
                f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
                f"<span style='font-size:16px;'>- <b>Question:</b> {row['question']}</span> \n"
                f"<span style='font-size:16px;'>- <b>Original Answer:</b> `{row['target_answer']}`</span> \n"
-                f"<span style='font-size:16px;'>- <b>Predicted Answer:</b> `{row['predicted_answer']}`</span> \n\n"
+                f"<span style='font-size:16px;'>- <b>Predicted Answer:</b> `{eval(row['predicted_answer'])}`</span> \n\n"
            )

            worst_str.append(entry)
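Since `predicted_answer` is no longer pre-flattened by the removed block above, it reaches this formatting step as a stringified Python list, and `eval()` converts it back into a real list for display. For strings that are plain literals, like the mock `"[['Alice'],['Bob'],['Charlie']]"`, `ast.literal_eval` would be a safer drop-in (assumption: the field always holds a literal), since it refuses anything that is not a literal expression:

```python
import ast

predicted_answer = "[['Alice'],['Bob'],['Charlie']]"  # stringified list, as in the mock response above

as_list = eval(predicted_answer)                    # what the committed line does
as_list_safe = ast.literal_eval(predicted_answer)   # safer equivalent for literal-only strings

assert as_list == as_list_safe == [['Alice'], ['Bob'], ['Charlie']]
```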