franceth committed
Commit 746cc2a · verified · 1 Parent(s): 327ecdf

TQA task bugs fix

Files changed (1): app.py (+15 −19)
app.py CHANGED
@@ -782,7 +782,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
             <div style='font-size: 3rem'>➡️</div>
         </div>
     """
-    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model]for model in model_list]
+    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
     #samples = us.generate_some_samples(input_data["data_path"], row["tbl_name"])
     model_to_send = None if not flag_TQA else model
 
@@ -805,12 +805,14 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
     else: task="SP"
     start_time = time.time()
     response = predictor.make_prediction(
-        question=question,
-        db_schema=db_schema_text,
-        model_name=model,
-        prompt=f"{prompt_to_send}",
-        task=task
+        question=question,
+        db_schema=db_schema_text,
+        model_name=model,
+        prompt=f"{prompt_to_send}",
+        task=task
     )
+    #if flag_TQA: response = {'response_parsed': "[['Alice'],['Bob'],['Charlie']]", 'cost': 0, 'response': "[['Alice'],['Bob'],['Charlie']]"} # TODO remove this line
+    #else : response = {'response_parsed': "SELECT * FROM 'MyTable'", 'cost': 0, 'response': "SQL_QUERY"}
     end_time = time.time()
     prediction = response['response_parsed']
     price = response['cost']
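The two new commented-out lines stage canned responses for offline testing of both branches (they are marked TODO for removal). From the keys this code reads off `response` ('response_parsed', 'cost', 'response') and the keyword arguments passed above, a stub predictor for local testing might look like the sketch below; the class name `StubPredictor` and the non-"SP" branch handling are assumptions, not part of app.py.

```python
# Minimal sketch of a stub predictor for offline testing, assuming only
# what this diff shows: the make_prediction keyword arguments and the
# 'response_parsed' / 'cost' / 'response' keys read from its result.
# The class name StubPredictor is hypothetical.
class StubPredictor:
    def make_prediction(self, question, db_schema, model_name, prompt, task):
        if task != "SP":  # TQA branch; only task="SP" is visible in the diff
            parsed = "[['Alice'],['Bob'],['Charlie']]"
        else:  # SP: text-to-SQL
            parsed = "SELECT * FROM 'MyTable'"
        return {'response_parsed': parsed, 'cost': 0, 'response': parsed}
```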
@@ -853,7 +855,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
     predictions_dict[model] = pd.concat([predictions_dict[model], new_row], ignore_index=True)

     # yield gr.Textbox(), gr.Textbox(prediction), *[predictions_dict[model] for model in input_data["models"]], None
-    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model]for model in model_list]
+    yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
     yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
     # END
     eval_text = generate_eval_text("Evaluation")
@@ -874,16 +876,16 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
     metrics_df_model['model'] = model
     metrics_conc = pd.concat([metrics_conc, metrics_df_model], ignore_index=True)

+    if 'VES' not in metrics_conc.columns and 'valid_efficency_score' not in metrics_conc.columns:
+        metrics_conc['VES'] = 0
+        metrics_conc['valid_efficency_score'] = 0
+
     if 'valid_efficency_score' not in metrics_conc.columns:
         metrics_conc['valid_efficency_score'] = metrics_conc['VES']

     if 'VES' not in metrics_conc.columns:
         metrics_conc['VES'] = metrics_conc['valid_efficency_score']

-    if 'VES' not in metrics_conc.columns and 'valid_efficency_score' not in metrics_conc.columns:
-        metrics_conc['VES'] = 0
-        metrics_conc['valid_efficency_score'] = 0
-
     eval_text = generate_eval_text("End evaluation")
     yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
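This reordering is the substantive fix in the hunk. The zero-fill guard for the case where both columns are missing used to run last, but the two fallbacks above it each index the other column, so with both columns absent the first fallback raised KeyError: 'VES' before the guard was ever reached. Moving the guard first makes the fallbacks safe. A standalone sketch of the fixed logic (the helper name is mine, not the app's):

```python
import pandas as pd

def ensure_ves_columns(metrics: pd.DataFrame) -> pd.DataFrame:
    """Guarantee both 'VES' and 'valid_efficency_score' exist.

    Hypothetical helper mirroring the fixed ordering in app.py: the
    both-missing case must be handled first, otherwise copying one
    column from the other raises KeyError.
    """
    if 'VES' not in metrics.columns and 'valid_efficency_score' not in metrics.columns:
        metrics['VES'] = 0
        metrics['valid_efficency_score'] = 0
    if 'valid_efficency_score' not in metrics.columns:
        metrics['valid_efficency_score'] = metrics['VES']
    if 'VES' not in metrics.columns:
        metrics['VES'] = metrics['valid_efficency_score']
    return metrics

# With both columns absent, the old ordering raised KeyError: 'VES';
# the fixed ordering zero-fills both columns instead.
print(ensure_ves_columns(pd.DataFrame({'model': ['m1']})))
```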
@@ -1004,7 +1006,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
         ]
     )

-
     ##########################################
     #     METRICS VISUALIZATION SECTION      #
     ##########################################
@@ -1796,12 +1797,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
     df = calculate_average_metrics(df, selected_metrics)

     if flag_TQA:
-        df["target_answer"] = df["target_answer"].apply(
-            lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
-        )
-        df["predicted_answer"] = df["predicted_answer"].apply(
-            lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
-        )
+        df["target_answer"] = df["target_answer"].apply(lambda x: "[" + ", ".join(map(str, x)) + "]")

         worst_cases_df = df.groupby(['model', 'tbl_name', 'test_category', 'question', 'target_answer', 'predicted_answer', 'answer', 'sql_tag'])['avg_metric'].mean().reset_index()
     else:
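For TQA, answers arrive as lists of rows. The old code flattened both target and predicted answers with ' - ' between rows and ',' between cells; the new code renders only the target answer, as a single bracketed list (the predicted answer is now handled at display time, in the last hunk below). A small before/after comparison on invented sample data:

```python
rows = [['Alice', 30], ['Bob', 25]]  # hypothetical TQA answer: a list of rows

# Old formatting: rows joined by ' - ', cells by ','
old = " - ".join([",".join(map(str, item)) for item in rows])
print(old)  # Alice,30 - Bob,25

# New formatting: one bracketed, comma-separated list of row items
new = "[" + ", ".join(map(str, rows)) + "]"
print(new)  # [['Alice', 30], ['Bob', 25]]
```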
@@ -1824,7 +1820,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
         f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
         f"<span style='font-size:16px;'>- <b>Question:</b> {row['question']}</span> \n"
         f"<span style='font-size:16px;'>- <b>Original Answer:</b> `{row['target_answer']}`</span> \n"
-        f"<span style='font-size:16px;'>- <b>Predicted Answer:</b> `{row['predicted_answer']}`</span> \n\n"
+        f"<span style='font-size:16px;'>- <b>Predicted Answer:</b> `{eval(row['predicted_answer'])}`</span> \n\n"
     )

     worst_str.append(entry)
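A caution on this last hunk: eval() executes arbitrary Python contained in the stored prediction string. Since the intent is only to turn a string like "[['Alice'],['Bob']]" back into a list for display, ast.literal_eval would achieve the same effect without executing code; a hedged alternative sketch (not what the commit uses):

```python
import ast

def parse_answer(raw: str):
    """Safely parse a stringified answer like "[['Alice'],['Bob']]".

    Hypothetical replacement for the eval() call in the diff:
    literal_eval only accepts Python literals, so a malformed or
    malicious prediction string cannot execute code.
    """
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw  # fall back to showing the raw string

print(parse_answer("[['Alice'],['Bob'],['Charlie']]"))
print(parse_answer("not a literal"))
```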
 