# fdl_extract / utils_assessment.py
import re
from rouge_score import rouge_scorer
import Levenshtein
import pandas as pd
import numpy as np
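# Assessment configuration: for each extracted feature, the reference column it
# is scored against, the names of the scoring and post-processing functions to
# apply (resolved from this module at run time), and the number of folds used
# when splitting the evaluation data.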
feature_assessment_entries = {
    'brand': {
        'name': 'brand',
        'output_column': 'Brand',
        'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name': 'post_processing_brand',
        'k_folds': 3,
    },
    'product_name': {
        'name': 'product_name',
        'output_column': 'Product name',
        'scoring_function_name': 'grade_levenshtein_match',
        # 'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'ingredients': {
        'name': 'ingredients',
        'output_column': 'Ingredients',
        'scoring_function_name': 'grade_rouge_score',
        # 'scoring_function_name': 'grade_levenshtein_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name': 'post_processing_ingredients',
        'k_folds': 3,
    },
    'energy_kj': {
        'name': 'energy_kj',
        'output_column': 'Energy kJ',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'energy_kcal': {
        'name': 'energy_kcal',
        'output_column': 'Energy kcal',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'fat': {
        'name': 'fat',
        'output_column': 'Fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'saturated_fat': {
        'name': 'saturated_fat',
        'output_column': 'Saturated fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'carbohydrates': {
        'name': 'carbohydrates',
        'output_column': 'Carbohydrates',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'sugars': {
        'name': 'sugars',
        'output_column': 'Sugars',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'fibers': {
        'name': 'fibers',
        'output_column': 'Fibers',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'proteins': {
        'name': 'proteins',
        'output_column': 'Proteins',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'salt': {
        'name': 'salt',
        'output_column': 'Salt',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
}
def post_processing_none(string):
    """Return the predicted value unchanged."""
    return string
def post_processing_ingredients(string):
    """Extract the ingredient list from an <ingredients>...</ingredients> block
    and strip a leading 'Ingrediënten:' / 'Ingredients:' label if present."""
    pattern = r"<ingredients>(.*?)</ingredients>"
    # Find all tagged blocks; fall back to the raw string if none is found
    matches = re.findall(pattern, string, re.DOTALL)
    if len(matches) == 0:
        output = string
    else:
        output = matches[0].strip()
    # Both Dutch spellings of the prefix have the same length, so one slice length suffices
    if output.lower().startswith("ingrediënten: ") or output.lower().startswith("ingredienten: "):
        output = output[len("ingrediënten: "):]
    if output.lower().startswith("ingredients: "):
        output = output[len("ingredients: "):]
    return output
def post_processing_brand(brand):
    """Map shorthand brand predictions to the full brand names used in the reference data."""
    if brand.lower() == "boni":
        brand = "Boni Selection"
    elif brand.lower() == "rana":
        brand = "Giovanni Rana"
    elif brand.lower() == "the market":
        brand = "Carrefour The Market"
    elif brand.lower() == "extra":
        brand = "Carrefour Extra"
    return brand
def post_processing_nutritionals(predicted_value):
    """Keep only the first number found in the prediction (e.g. '13 g' -> '13');
    return NaN when no number is present."""
    try:
        predicted_value = re.findall(r"[-+]?\d*\.\d+|\d+", str(predicted_value))[0]
    except IndexError:
        predicted_value = np.nan
    return predicted_value
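# Illustrative behaviour of the post-processing helpers (not executed; the
# sample inputs are hypothetical):
#   post_processing_nutritionals("13 g per 100 g")  -> "13"
#   post_processing_nutritionals("not listed")      -> nan
#   post_processing_brand("boni")                   -> "Boni Selection"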
def grade_levenshtein_match(predicted_value, reference_value):
    """Case-insensitive Levenshtein similarity in [0, 1] on stripped strings."""
    score = Levenshtein.ratio(predicted_value.lower().strip(), reference_value.lower().strip())
    return score
def grade_exact_match(predicted_value, reference_value):
    """Exact match (1/0) after lowercasing, stripping and collapsing whitespace on both sides."""
    reference_value = re.sub(r'\s+', ' ', reference_value.lower().strip())
    predicted_value = re.sub(r'\s+', ' ', predicted_value.lower().strip())
    score = int(predicted_value == reference_value)
    return score
def grade_rouge_score(predicted_value, reference_value):
    """ROUGE-2 F-measure between the prediction and the reference."""
    scorer = rouge_scorer.RougeScorer(['rouge2'])
    # RougeScorer.score expects (target, prediction)
    score = scorer.score(reference_value, predicted_value)['rouge2'].fmeasure
    return score
def grade_numerical(predicted_value, reference_value):
    """Numerical equality (1/0) after casting both values to float.
    Two NaNs count as a match; unparsable values score -1."""
    try:
        if np.isnan(float(predicted_value)) and np.isnan(float(reference_value)):
            score = 1
        else:
            score = int(float(predicted_value) == float(reference_value))
    except (ValueError, TypeError):
        score = -1
    return score
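# Illustrative scores (not executed; the sample inputs are hypothetical):
#   grade_exact_match("Boni Selection", "boni  selection")  -> 1
#   grade_levenshtein_match("bolognese", "bolognaise")      -> ~0.84
#   grade_numerical("12.0", 12)                             -> 1
#   grade_numerical("n/a", 12)   (unparsable prediction)    -> -1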
def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
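    """Build the evaluation frame for one feature.

    Reads the reference data and the feature's extraction output from OUTPUT_DIR,
    applies the configured post-processing and scoring functions, rounds the
    scores, and assigns each row to one of `k_folds` folds (fixed seed 42).
    """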
df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")
df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")
df_features = df_features.merge(df_product_id, on='ID', how='left')
df_eval_data = df_features[
['ID', feature_assessment_entry['output_column'], 'Extracted_Text', 'Price', 'Processing time']].copy()
df_eval_data.rename(columns={feature_assessment_entry['output_column']: 'Reference'}, inplace=True)
df_eval_data.rename(columns={'Extracted_Text': 'Predicted'}, inplace=True)
    # Resolve the configured functions by name from the module namespace
    # (equivalent to the original eval() on the configuration strings, but safer)
    post_processing_function = globals()[feature_assessment_entry['post_processing_function_name']]
    scoring_function = globals()[feature_assessment_entry['scoring_function_name']]
    df_eval_data['Predicted'] = df_eval_data['Predicted'].apply(post_processing_function)
    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: scoring_function(row['Predicted'], row['Reference']), axis=1)
df_eval_data['accuracy_score'] = round(df_eval_data['accuracy_score'], 2)
N = len(df_eval_data)
k = feature_assessment_entry['k_folds']
np.random.seed(42)
df_eval_data['fold'] = np.random.randint(0, k, size=N)
return df_eval_data
def merge_and_save_data(OUTPUT_DIR):
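    """Concatenate the ID and photo columns of the reference data with the
    per-feature evaluation results and write the merged table to
    data_extraction/merged.csv."""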
df_ref_data = pd.read_csv(f"{OUTPUT_DIR}/data_extraction/reference_data.csv")
data_merged = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo']]]
for feature_name in feature_assessment_entries.keys():
df_eval_data = create_eval_data(f'{OUTPUT_DIR}/data_extraction', feature_assessment_entries[feature_name])
df_eval_data = df_eval_data[['Reference', 'Predicted', 'accuracy_score']]
df_eval_data.rename(columns={'Reference': 'Reference_' + feature_name}, inplace=True)
df_eval_data.rename(columns={'Predicted': 'Predicted_' + feature_name}, inplace=True)
df_eval_data.rename(columns={'accuracy_score': 'accuracy_score_' + feature_name}, inplace=True)
data_merged.append(df_eval_data)
data_merged = pd.concat(data_merged, axis=1)
data_merged.to_csv(f"{OUTPUT_DIR}/data_extraction/merged.csv")
return data_merged
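# Minimal usage sketch. The path below is a hypothetical placeholder: the real
# pipeline is expected to pass an OUTPUT_DIR containing a data_extraction/
# folder with reference_data.csv and one <feature>.csv per configured feature.
if __name__ == "__main__":
    OUTPUT_DIR = "output"  # hypothetical path, adjust to the actual run directory
    df_merged = merge_and_save_data(OUTPUT_DIR)
    print(df_merged.head())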