# Spaces: Running Running  -- export/extraction artifact from the original notebook, not code
import re

import Levenshtein
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer
# Per-feature evaluation configuration: which reference column to compare
# against, how to score a prediction, how to post-process the raw model
# output, and how many cross-validation folds to use.
_FEATURE_SPECS = [
    # (name, output_column, scoring_function_name, post_processing_function_name)
    ('brand', 'Brand', 'grade_exact_match', 'post_processing_none'),
    ('product_name', 'Product name', 'grade_levenshtein_match', 'post_processing_none'),
    ('ingredients', 'Ingredients', 'grade_rouge_score', 'post_processing_none'),
    ('energy_kj', 'Energy kJ', 'grade_numerical', 'post_processing_none'),
    ('energy_kcal', 'Energy kcal', 'grade_numerical', 'post_processing_none'),
    ('fat', 'Fat', 'grade_numerical', 'post_processing_nutritionals'),
    ('saturated_fat', 'Saturated fat', 'grade_numerical', 'post_processing_nutritionals'),
    ('carbohydrates', 'Carbohydrates', 'grade_numerical', 'post_processing_nutritionals'),
    ('sugars', 'Sugars', 'grade_numerical', 'post_processing_nutritionals'),
    ('fibers', 'Fibers', 'grade_numerical', 'post_processing_nutritionals'),
    ('proteins', 'Proteins', 'grade_numerical', 'post_processing_nutritionals'),
    ('salt', 'Salt', 'grade_numerical', 'post_processing_nutritionals'),
]
feature_assessment_entries = {
    name: {
        'name': name,
        'output_column': column,
        'scoring_function_name': scorer,
        'post_processing_function_name': post_processor,
        'k_folds': 3,
    }
    for name, column, scorer, post_processor in _FEATURE_SPECS
}
def post_processing_none(string):
    """Identity post-processor: return the model output unchanged."""
    return string
def post_processing_ingredients(string):
    """Extract the ingredient list from a raw model answer.

    Takes the content of the first <ingredients>...</ingredients> tag pair
    (falling back to the whole string when no tags are present), then strips a
    leading Dutch or English "ingredients:" label, case-insensitively.
    """
    tagged = re.findall(r"<ingredients>(.*?)</ingredients>", string, re.DOTALL)
    output = tagged[0].strip() if tagged else string
    # Both Dutch prefixes are 14 characters long, so one slice covers both.
    if output.lower().startswith(("ingrediënten: ", "ingredienten: ")):
        output = output[len("ingrediënten: "):]
    if output.lower().startswith("ingredients: "):
        output = output[len("ingredients: "):]
    return output
def post_processing_brand(brand):
    """Map shorthand store-brand names onto their canonical full names.

    Unknown brands are passed through unchanged; matching is case-insensitive.
    """
    canonical_names = {
        "boni": "Boni Selection",
        "rana": "Giovanni Rana",
        "the market": "Carrefour The Market",
        "extra": "Carrefour Extra",
    }
    return canonical_names.get(brand.lower(), brand)
def post_processing_nutritionals(predicted_value):
    """Pull the first numeric token out of a predicted nutritional value.

    Returns the first integer or decimal substring found in the stringified
    input (e.g. "12.5 g" -> "12.5"); returns NaN when no number is present so
    downstream numerical grading can treat the value as missing.
    """
    # Explicit empty-match check instead of the original bare `except:`,
    # which also swallowed KeyboardInterrupt/SystemExit.
    matches = re.findall(r"[-+]?\d*\.\d+|\d+", str(predicted_value))
    return matches[0] if matches else np.nan
def grade_levenshtein_match(predicted_value, reference_value):
    """Score similarity of prediction vs. reference in [0, 1].

    Both strings are lower-cased and stripped, then compared with
    Levenshtein.ratio (1.0 = identical).
    """
    normalized_prediction = predicted_value.lower().strip()
    normalized_reference = reference_value.lower().strip()
    return Levenshtein.ratio(normalized_prediction, normalized_reference)
def grade_exact_match(predicted_value, reference_value):
    """Binary score: 1 if prediction matches reference exactly after
    normalization, else 0.

    Normalization lower-cases, strips, and collapses internal whitespace runs
    to single spaces. Fix: the whitespace collapse is now applied to BOTH
    sides — previously only the reference was collapsed, so an
    otherwise-identical prediction with doubled spaces scored 0.
    """
    reference_value = re.sub(r'\s+', ' ', reference_value.lower().strip())
    predicted_value = re.sub(r'\s+', ' ', predicted_value.lower().strip())
    return int(predicted_value == reference_value)
def grade_rouge_score(predicted_value, reference_value):
    """Score the prediction against the reference with ROUGE-2 F-measure.

    NOTE(review): RougeScorer.score's signature is (target, prediction);
    arguments here follow the original (predicted, reference) order. The
    F-measure is symmetric under that swap, so the returned value is the
    same — confirm before relying on precision/recall.
    """
    rouge2_result = rouge_scorer.RougeScorer(['rouge2']).score(
        predicted_value, reference_value)['rouge2']
    return rouge2_result.fmeasure
def grade_numerical(predicted_value, reference_value):
    """Score numeric equality after coercing both values to float.

    Returns 1 when both values are NaN (both missing) or numerically equal,
    0 when they differ, and the sentinel -1 when either value cannot be
    parsed as a float (callers treat -1 as "ungradable").
    """
    try:
        predicted = float(predicted_value)
        reference = float(reference_value)
    except (TypeError, ValueError):
        # Non-numeric input: keep the original -1 sentinel. Narrowed from the
        # original bare `except:`, which hid every error class.
        return -1
    if np.isnan(predicted) and np.isnan(reference):
        return 1
    return int(predicted == reference)
def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
    """Build a per-feature evaluation frame with accuracy scores and CV folds.

    Reads `reference_data.csv` and the feature's prediction CSV from
    OUTPUT_DIR, joins them on 'ID', post-processes the predictions, scores
    them against the reference column, and assigns a deterministic random
    fold (seed 42) in [0, k_folds) to each row.

    Returns a DataFrame with columns ID, Reference, Predicted, Price,
    Processing time, accuracy_score, fold.
    """
    df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")
    df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")
    df_features = df_features.merge(df_product_id, on='ID', how='left')

    df_eval_data = df_features[
        ['ID', feature_assessment_entry['output_column'], 'Extracted_Text', 'Price', 'Processing time']].copy()
    df_eval_data.rename(columns={feature_assessment_entry['output_column']: 'Reference',
                                 'Extracted_Text': 'Predicted'}, inplace=True)

    # Resolve the configured function names to module-level callables via
    # globals() instead of eval() — same result for valid names, but no
    # arbitrary-code execution if a config string is ever tampered with.
    post_process = globals()[feature_assessment_entry['post_processing_function_name']]
    score = globals()[feature_assessment_entry['scoring_function_name']]

    df_eval_data['Predicted'] = df_eval_data['Predicted'].apply(post_process)
    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: score(row['Predicted'], row['Reference']), axis=1)
    df_eval_data['accuracy_score'] = round(df_eval_data['accuracy_score'], 2)

    # Seeded so fold assignment is reproducible across runs.
    np.random.seed(42)
    df_eval_data['fold'] = np.random.randint(
        0, feature_assessment_entry['k_folds'], size=len(df_eval_data))
    return df_eval_data
def merge_and_save_data(OUTPUT_DIR):
    """Assemble one wide evaluation table across all features and save it.

    Starts from the photo columns of `reference_data.csv`, appends each
    feature's Reference/Predicted/accuracy_score columns (suffixed with the
    feature name), writes the result to `<OUTPUT_DIR>/data_extraction/merged.csv`,
    and returns the merged DataFrame.
    """
    extraction_dir = f'{OUTPUT_DIR}/data_extraction'
    df_ref_data = pd.read_csv(f"{extraction_dir}/reference_data.csv")
    frames = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo',
                           'Ingredients photo', 'EAN photo']]]
    for feature_name, entry in feature_assessment_entries.items():
        df_eval = create_eval_data(extraction_dir, entry)
        df_eval = df_eval[['Reference', 'Predicted', 'accuracy_score']].rename(
            columns={col: f'{col}_{feature_name}'
                     for col in ('Reference', 'Predicted', 'accuracy_score')})
        frames.append(df_eval)
    data_merged = pd.concat(frames, axis=1)
    data_merged.to_csv(f"{extraction_dir}/merged.csv")
    return data_merged