Yannael commited on
Commit
46f57ce
·
verified ·
1 Parent(s): 19f8e20

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +224 -0
  2. utils_assessment.py +241 -0
  3. utils_data_extraction.py +323 -0
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import os
4
+ import shutil
5
+
6
+ import gradio as gr
7
+
8
+ import utils_data_extraction
9
+ import utils_assessment
10
+
11
+ import importlib
12
+ importlib.reload(utils_data_extraction)
13
+ importlib.reload(utils_assessment)
14
+
15
+ """### Function to load data
16
+
17
+ Data is loaded from a Roamler Excel file, from a sheet called "output".
18
+
19
+ - A subset of the Excel file is taken as reference data, and saved in the `outputs` directory as reference_data.csv
20
+ - A folder for storing photos is created
21
+
22
+ A n_rows parameter can be passed to load a subset of the data.
23
+ """
24
+
25
def load_roamler_excel_file(filepath, n_rows=3):
    """Load a Roamler Excel file and prepare the per-file output folders.

    Reads the 'Output' sheet, keeps the reference columns, saves them to
    ``outputs/<file>/data_extraction/reference_data.csv`` and creates the
    photo folder. Pass ``n_rows=None`` to load the full sheet instead of a
    random sample.

    Returns:
        (df_products, OUTPUT_DIR, df_brand, df_product_name,
         df_ingredients, df_nutritional_values)
    """
    OUTPUT_DIR = 'outputs/' + os.path.basename(filepath)
    # exist_ok=True replaces the explicit os.path.exists() checks.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    DATA_EXTRACTION_DIR = OUTPUT_DIR + '/data_extraction'
    os.makedirs(DATA_EXTRACTION_DIR, exist_ok=True)

    df_review = pd.read_excel(filepath, sheet_name='Output')
    if n_rows is not None:
        # Fixed seed: re-uploading the same file yields the same sample.
        df_review = df_review.sample(n=n_rows, random_state=42)

    df_products = df_review[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo',
                             'Brand', 'Product name', 'Legal name', 'Barcode',
                             'Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins', 'Salt', 'Ingredients',
                             'Nutriscore', 'Allergens',
                             'Quantity per unit']].copy()

    df_products.to_csv(f'{OUTPUT_DIR}/data_extraction/reference_data.csv', index=False)

    PHOTO_DIR = OUTPUT_DIR + '/photos'
    os.makedirs(PHOTO_DIR, exist_ok=True)

    # Pick up any extraction results already saved for this file.
    df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_df_from_folder(OUTPUT_DIR)

    return df_products, OUTPUT_DIR, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
54
+
55
def load_df_from_folder(OUTPUT_DIR):
    """Load previously extracted feature CSVs from OUTPUT_DIR/data_extraction.

    For each feature (brand, product_name, ingredients, nutritional_values)
    an empty DataFrame with the expected columns is returned when the CSV
    does not exist yet, so downstream display code always gets the schema.

    Returns:
        (df_brand, df_product_name, df_ingredients, df_nutritional_values)
    """
    columns = ['ID', 'Extracted_Text', 'Price', 'Processing time']

    def _load(feature_name):
        # One loader for all four features: read the CSV if present,
        # otherwise start from an empty frame with the expected schema.
        path = f'{OUTPUT_DIR}/data_extraction/{feature_name}.csv'
        if os.path.exists(path):
            return pd.read_csv(path)
        return pd.DataFrame(columns=columns)

    return (_load('brand'), _load('product_name'),
            _load('ingredients'), _load('nutritional_values'))
74
+
75
def load_csv_files(archive, OUTPUT_DIR):
    """Copy uploaded extraction CSVs into the data_extraction folder.

    Only the four known feature files are accepted; anything else in the
    upload is ignored. The refreshed DataFrames are then reloaded from disk.
    """
    accepted = {'brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv'}
    destination = f'{OUTPUT_DIR}/data_extraction'

    for uploaded in archive:
        basename = os.path.basename(uploaded)
        print(basename)
        if basename in accepted:
            shutil.copy(uploaded, destination)

    return load_df_from_folder(OUTPUT_DIR)
86
+
87
+ """### Function to save data
88
+
89
+ This function is called when the user clicks on the "Generate data archive" button.
90
+
91
+ It creates a zip of all CSV files of the f'{OUTPUT_DIR}/data_extraction' folder, and return a download button to the archive.
92
+ """
93
+
94
def generate_archive(OUTPUT_DIR):
    """Zip the data_extraction folder and expose a visible download button.

    Called by the "Generate data archive" button. The archive is written
    next to OUTPUT_DIR as '<OUTPUT_DIR>.zip'.
    """
    # shutil.make_archive appends the '.zip' suffix itself.
    archive_name = OUTPUT_DIR
    shutil.make_archive(archive_name, 'zip', f'{OUTPUT_DIR}/data_extraction')

    return gr.DownloadButton(label=f"Download {archive_name}.zip", value=f'{archive_name}.zip', visible=True)
101
+
102
+ """### Gradio UI"""
103
+
104
def toggle_row_visibility(show):
    """Return a gradio update making the target row visible iff `show` is truthy."""
    return gr.update(visible=bool(show))
109
+
110
# Default extraction language; replaced below by the gr.Dropdown component.
language = 'French'

# Custom CSS to set max height for the rows
custom_css = """
.dataframe-wrap {
    max-height: 300px; /* Set the desired height */
    overflow-y: scroll;
}
"""

# Placeholders shown before any Excel file is uploaded.
OUTPUT_DIR_value = ""
dummy_data = df_brand_data = df_product_name_data = df_ingredients_data = df_nutritional_values_data = pd.DataFrame()
#dummy_data, OUTPUT_DIR_value, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_roamler_excel_file("FDL-Datasets3/FR - Review.xlsm", n_rows=3)

with gr.Blocks(css=custom_css) as fdl_data_extraction_ui:

    gr.HTML("<div align='center'><h1>Euroconsumers Food Data Lake</h1>")
    gr.HTML("<div align='center'><h2>Data extraction</h2>")

    # Per-session state holding the output directory of the uploaded file.
    OUTPUT_DIR = gr.State(value=OUTPUT_DIR_value)

    with gr.Row():
        with gr.Column():
            gr.HTML("<h2>Upload Roamler Excel file</h2>")
            load_roamler_excel_file_input = gr.File(label="Upload Roamler Excel file", type="filepath")

    # Hidden until a file is uploaded (see toggle_row_visibility wiring below).
    with gr.Row(visible=False) as dataset_block:
        with gr.Column():
            gr.HTML("<h2>Dataset summary</h2>")

            # Display summary of the dataset - ID, Reference_brand, Reference_product_name, mean_accuracy_score
            with gr.Row(elem_classes="dataframe-wrap"):
                dataframe_component = gr.DataFrame(value=dummy_data, interactive=False)

    with gr.Row(visible=False) as product_detail_block:
        with gr.Column():
            # Section for product details
            gr.HTML("<h1>Data extraction</h1>")

            load_csv_files_input = gr.Files(label="Upload extracted data from CSV files")

            language = gr.Dropdown(label="Select language", choices=["French", "Dutch", "Spanish", "Italian", "Portuguese"], value="French")

            gr.HTML("<h3>Brand</h3>")
            extract_brand_button = gr.Button("Extract brand")
            df_brand = gr.Dataframe(label="Brand data", scale=2,
                                    column_widths=["10%", "60%", "15%", "15%"],
                                    wrap=True, value=df_brand_data)

            gr.HTML("<h3>Product name</h3>")
            extract_product_name_button = gr.Button("Extract product_name")
            df_product_name = gr.Dataframe(label="Product name data", scale=2,
                                           column_widths=["10%", "60%", "15%", "15%"],
                                           wrap=True, value=df_product_name_data)

            gr.HTML("<h3>Ingredients</h3>")
            extract_ingredients_button = gr.Button("Extract ingredients")
            df_ingredients = gr.Dataframe(label="Ingredients data", scale=2,
                                          column_widths=["10%", "60%", "15%", "15%"],
                                          wrap=True, value=df_ingredients_data)

            gr.HTML("<h3>Nutritional values</h3>")
            extract_nutritional_values_button = gr.Button("Extract nutritional values")
            df_nutritional_values = gr.Dataframe(label="Nutritional data", scale=2,
                                                 column_widths=["10%", "60%", "15%", "15%"],
                                                 wrap=True, value=df_nutritional_values_data)

            # Download
            gr.HTML("<h1>Data download</h1>")

            generate_merged_file_button = gr.Button("Generate merged file")
            generate_archive_button = gr.Button("Generate data archive")
            download_button = gr.DownloadButton("Download archive", visible=False)

    ### Control functions

    # Linking the select_dataset change event to update both the gradio DataFrame and product_ids dropdown
    load_roamler_excel_file_input.change(load_roamler_excel_file,
                                         inputs=load_roamler_excel_file_input,
                                         outputs=[dataframe_component, OUTPUT_DIR,
                                                  df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Toggle visibility of the dataset block
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=dataset_block)
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=product_detail_block)

    load_csv_files_input.change(load_csv_files,
                                inputs=[load_csv_files_input, OUTPUT_DIR],
                                outputs=[df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Data extraction: each button runs one prompt over the whole dataset.
    extract_brand_button.click(utils_data_extraction.extract_brand,
                               inputs=[OUTPUT_DIR, dataframe_component, language],
                               outputs=df_brand)

    extract_product_name_button.click(utils_data_extraction.extract_product_name,
                                      inputs=[OUTPUT_DIR, dataframe_component, language],
                                      outputs=df_product_name)

    extract_ingredients_button.click(utils_data_extraction.extract_ingredients,
                                     inputs=[OUTPUT_DIR, dataframe_component, language],
                                     outputs=df_ingredients)

    extract_nutritional_values_button.click(utils_data_extraction.extract_nutritional_values,
                                            inputs=[OUTPUT_DIR, dataframe_component, language],
                                            outputs=df_nutritional_values)

    # NOTE(review): merge_and_save_data's return value is discarded here —
    # presumably only its merged.csv side effect matters; confirm.
    generate_merged_file_button.click(utils_assessment.merge_and_save_data, inputs=OUTPUT_DIR)

    generate_archive_button.click(generate_archive, inputs=OUTPUT_DIR, outputs=download_button)

fdl_data_extraction_ui.launch(debug=True)
222
+
223
+
224
+
utils_assessment.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from rouge_score import rouge_scorer
4
+
5
+ import Levenshtein
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
# Configuration for every assessed feature: the reference column in the Excel
# output, the scoring function used to grade predictions, an optional
# post-processing step applied to predictions first, and the number of
# cross-validation folds. Function names are strings resolved at runtime by
# create_eval_data. (The original keys/values used pointless f-string
# prefixes on constant literals; they are plain strings now.)
feature_assessment_entries = {

    'brand': {
        'name': 'brand',
        'output_column': 'Brand',
        'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_brand',
        'k_folds': 3,
    },

    'product_name': {
        'name': 'product_name',
        'output_column': 'Product name',
        'scoring_function_name': 'grade_levenshtein_match',
        # 'scoring_function_name' : 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'ingredients': {
        'name': 'ingredients',
        'output_column': 'Ingredients',
        'scoring_function_name': 'grade_rouge_score',
        # 'scoring_function_name' : 'grade_levenshtein_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_ingredients',
        'k_folds': 3,
    },

    'energy_kj': {
        'name': 'energy_kj',
        'output_column': 'Energy kJ',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'energy_kcal': {
        'name': 'energy_kcal',
        'output_column': 'Energy kcal',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'fat': {
        'name': 'fat',
        'output_column': 'Fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'saturated_fat': {
        'name': 'saturated_fat',
        'output_column': 'Saturated fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'carbohydrates': {
        'name': 'carbohydrates',
        'output_column': 'Carbohydrates',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'sugars': {
        'name': 'sugars',
        'output_column': 'Sugars',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'fibers': {
        'name': 'fibers',
        'output_column': 'Fibers',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'proteins': {
        'name': 'proteins',
        'output_column': 'Proteins',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'salt': {
        'name': 'salt',
        'output_column': 'Salt',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

}
113
+
114
+
115
def post_processing_none(string):
    """Identity post-processing: return the extracted text unchanged."""
    return string
117
+
118
+
119
def post_processing_ingredients(string):
    """Extract the ingredient list from a raw LLM answer.

    Takes the content of the first <ingredients>...</ingredients> tag when
    present (falling back to the whole string), then strips a leading
    'Ingrediënten:'/'Ingredienten:'/'Ingredients:' label case-insensitively.
    """
    tagged = re.findall(r"<ingredients>(.*?)</ingredients>", string, re.DOTALL)
    output = tagged[0].strip() if tagged else string

    # Both Dutch spellings have the same length, so one slice width suffices.
    if output.lower().startswith(("ingrediënten: ", "ingredienten: ")):
        output = output[len("ingrediënten: "):]
    if output.lower().startswith("ingredients: "):
        output = output[len("ingredients: "):]

    return output
134
+
135
+
136
def post_processing_brand(brand):
    """Canonicalize store-brand shorthands to their full brand names."""
    canonical = {
        "boni": "Boni Selection",
        "rana": "Giovanni Rana",
        "the market": "Carrefour The Market",
        "extra": "Carrefour Extra",
    }
    return canonical.get(brand.lower(), brand)
147
+
148
+
149
def post_processing_nutritionals(predicted_value):
    """Extract the first numeric token (integer or decimal) from an answer.

    Returns the number as a string, or ``np.nan`` when no number is found
    (replaces the original bare ``except`` around an IndexError-prone
    ``[0]`` with an explicit empty-match check).
    """
    matches = re.findall(r"[-+]?\d*\.\d+|\d+", str(predicted_value))
    if not matches:
        # No numeric token at all (e.g. 'value not visible on the photo').
        return np.nan
    return matches[0]
156
+
157
def grade_levenshtein_match(predicted_value, reference_value):
    """Levenshtein similarity in [0, 1] between the normalized strings."""
    normalized_prediction = predicted_value.lower().strip()
    normalized_reference = reference_value.lower().strip()
    return Levenshtein.ratio(normalized_prediction, normalized_reference)
160
+
161
+
162
def grade_exact_match(predicted_value, reference_value):
    """Return 1 if predicted and reference match after normalization, else 0.

    Normalization lower-cases, strips, and collapses internal whitespace on
    BOTH sides. The original collapsed whitespace only in the reference, so
    an otherwise-identical prediction with a double space failed the match.
    """
    def _normalize(value):
        return re.sub(r'\s+', ' ', value.lower().strip())

    return int(_normalize(predicted_value) == _normalize(reference_value))
170
+
171
+
172
def grade_rouge_score(predicted_value, reference_value):
    """ROUGE-2 F-measure between the predicted and reference text."""
    rouge2 = rouge_scorer.RougeScorer(['rouge2']).score(predicted_value, reference_value)['rouge2']
    return rouge2.fmeasure
177
+
178
+
179
def grade_numerical(predicted_value, reference_value):
    """Grade two numeric values.

    Returns:
        1 when both parse to equal floats (or both are NaN — "both missing"
        counts as a match), 0 when they differ, -1 when either value cannot
        be parsed as a float (narrowed from the original bare ``except``).
    """
    try:
        predicted = float(predicted_value)
        reference = float(reference_value)
    except (TypeError, ValueError):
        # Non-numeric input (e.g. 'Failed' or None): flag as unparseable.
        return -1

    if np.isnan(predicted) and np.isnan(reference):
        return 1
    return int(predicted == reference)
189
+
190
+
191
def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
    """Build the evaluation table for one feature.

    Joins the feature's extraction CSV with the reference data, applies the
    configured post-processing to the predictions, scores them against the
    reference column and assigns each row to a random (but seeded) fold.

    Args:
        OUTPUT_DIR: folder containing reference_data.csv and <feature>.csv.
        feature_assessment_entry: one entry of ``feature_assessment_entries``.

    Returns:
        DataFrame with columns ID, Reference, Predicted, Price,
        Processing time, accuracy_score and fold.
    """
    df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")
    df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")

    df_features = df_features.merge(df_product_id, on='ID', how='left')
    df_eval_data = df_features[
        ['ID', feature_assessment_entry['output_column'], 'Extracted_Text', 'Price', 'Processing time']].copy()
    df_eval_data.rename(columns={feature_assessment_entry['output_column']: 'Reference',
                                 'Extracted_Text': 'Predicted'}, inplace=True)

    # Resolve the configured function names from this module's namespace
    # instead of eval(): same dispatch, no arbitrary-code-execution risk.
    post_process = globals()[feature_assessment_entry['post_processing_function_name']]
    score_fn = globals()[feature_assessment_entry['scoring_function_name']]

    df_eval_data['Predicted'] = df_eval_data['Predicted'].apply(post_process)
    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: score_fn(row['Predicted'], row['Reference']), axis=1)
    df_eval_data['accuracy_score'] = round(df_eval_data['accuracy_score'], 2)

    # Seeded fold assignment keeps the k-fold split reproducible.
    np.random.seed(42)
    df_eval_data['fold'] = np.random.randint(0, feature_assessment_entry['k_folds'], size=len(df_eval_data))

    return df_eval_data
217
+
218
+
219
def merge_and_save_data(OUTPUT_DIR):
    """Assemble one wide CSV with reference, prediction and score per feature.

    Reads the reference data plus every per-feature evaluation, prefixes the
    evaluation columns with the feature name, concatenates them side by side
    and writes the result to data_extraction/merged.csv.
    """
    df_ref_data = pd.read_csv(f"{OUTPUT_DIR}/data_extraction/reference_data.csv")

    # First block: the identifying columns from the reference data.
    blocks = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo']]]

    for feature_name, entry in feature_assessment_entries.items():
        df_eval = create_eval_data(f'{OUTPUT_DIR}/data_extraction', entry)
        df_eval = df_eval[['Reference', 'Predicted', 'accuracy_score']].rename(columns={
            'Reference': 'Reference_' + feature_name,
            'Predicted': 'Predicted_' + feature_name,
            'accuracy_score': 'accuracy_score_' + feature_name,
        })
        blocks.append(df_eval)

    data_merged = pd.concat(blocks, axis=1)
    data_merged.to_csv(f"{OUTPUT_DIR}/data_extraction/merged.csv")

    return data_merged
241
+
utils_data_extraction.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ import os
4
+ import time
5
+ import gradio as gr
6
+ import json
7
+
8
+ import google.generativeai as genai
9
+
10
+ from dotenv import load_dotenv
11
# Load variables from a local .env file into the environment (no-op if absent).
load_dotenv()
# NOTE(review): if GOOGLE_API_KEY is unset this passes None to configure —
# API calls would then fail later rather than here; confirm that's intended.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
14
+
15
+
16
+
17
+ ############## Photos ##############
18
+
19
+
20
def download_file(url, save_path):
    """Download `url` to `save_path`, logging (not raising) on failure.

    Best-effort by design: callers loop over many photos and a single bad
    URL should not abort the whole batch.
    """
    try:
        # Send a GET request to the URL. A timeout prevents one dead URL
        # from hanging the whole extraction run indefinitely.
        response = requests.get(url, timeout=60)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Open the specified path in binary-write mode and save the content
            with open(save_path, 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download image. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
34
+
35
def upload_file(photo_path):
    """Upload a local photo to the Gemini Files API and return its handle."""
    return genai.upload_file(photo_path)
40
+
41
+ ###### Data extraction
42
+
43
+ ## Helper function to initialize model
44
+
45
# USD price per token for each supported model (separate input/output rates);
# used by call_llm_gemini to estimate the cost of every request.
price_token={'gemini-1.5-pro-002': {'input': 1.25 / 1000000, 'output': 5 / 1000000}
}

# Disable every Gemini safety category — presumably so benign food-label
# photos are never blocked mid-batch (NOTE(review): confirm this is intended
# for production use).
gemini_safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]
70
+
71
+
72
def load_gemini_model(model_name):
    """Instantiate a Gemini model plus plain-text and JSON generation configs.

    Returns:
        (gemini_model, generation_config, generation_config_json)
    """
    def _make_config(mime_type):
        # Deterministic single-candidate configs that differ only in the
        # response MIME type.
        return genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=4000,
            temperature=0,
            response_mime_type=mime_type,
        )

    generation_config = _make_config("text/plain")
    generation_config_json = _make_config("application/json")

    system_prompt = ["You are a helpful assistant."]
    gemini_model = genai.GenerativeModel(model_name, system_instruction=system_prompt,
                                         safety_settings=gemini_safety_settings)

    return gemini_model, generation_config, generation_config_json
95
+
96
+
97
+
98
+ ##### Call LLM
99
+
100
+
101
def call_llm_gemini(model_instance, model, messages, generation_config):
    """Send `messages` to Gemini and return (text, in_tokens, out_tokens, price).

    `model` is the model-name key into the module-level ``price_token`` table.
    The response text is the sentinel 'Failed' when no usable candidate is
    returned (e.g. a safety block), so callers always get a string back.
    """
    response = model_instance.generate_content(messages,
                                               generation_config=generation_config)

    try:
        response_content = response.text.strip()
    except (ValueError, AttributeError):
        # response.text raises when generation was blocked or produced no
        # candidate; fall back to a sentinel instead of crashing the batch.
        response_content = 'Failed'

    nb_input_tokens = model_instance.count_tokens(messages).total_tokens
    nb_output_tokens = model_instance.count_tokens(response_content).total_tokens
    price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']
    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    return response_content, nb_input_tokens, nb_output_tokens, price
117
+
118
+
119
+
120
+ ##### Prompts
121
+
122
def get_prompt_brand(language):
    """Prompt asking for the brand name only (language-independent; the
    parameter is kept for signature symmetry with the other prompt builders)."""
    return "What is the brand of this product? Answer with the brand name and nothing else."
127
+
128
def get_prompt_product_name(language):
    """Prompt asking for the product name in the requested language."""
    return (f"What is the {language} product name of this product? "
            f"Answer in {language} with the product name and nothing else.")
133
+
134
def get_prompt_ingredients(language):
    """Return the detailed prompt for extracting the `language` ingredients
    list, asking the model to wrap its transcription in <ingredients> tags."""
    prompt=f"""
You will be given an image of a product label or packaging. Your task is to extract the ingredients list from this image, focusing specifically on the {language} language version. Here's how to approach this task:

1. Analyze the provided image

2. Locate the ingredients list on the product label or packaging.

3. Identify the {language} language section of the ingredients list.

4. Extract only the {language} ingredients list. Ignore any ingredients lists in other languages, even if they are present in the image.

5. If there are multiple {language} ingredient lists (e.g., for different flavors or varieties), extract all of them and clearly separate them.

6. Do not include any additional information such as allergen warnings, nutritional information, or preparation instructions, even if they are in {language}.

7. If you cannot find a {language} ingredients list in the image, state that no {language} ingredients list was found.

8. If the image is unclear, state that the image quality is insufficient to extract the ingredients list accurately.

Provide your output in the following format:

<ingredients>
[Insert the extracted {language} ingredients list here, exactly as it appears in the image]
</ingredients>

Remember, include only the text of the {language} ingredients list, nothing else. Do not translate or interpret the ingredients; simply transcribe them as they appear in {language}.
"""

    return prompt
164
+
165
+
166
def get_prompt_nutritional_info():
    """Return the JSON-only prompt for per-100g nutritional extraction.

    Deliberately a plain string (not an f-string): the example JSON below
    contains literal braces.
    """
    prompt = """Extract the following nutritional information from the product image and present it **only** in JSON format, providing only the values per 100g: Energy kJ, Energy kcal, Fat, Saturated fat, Carbohydrates, Sugars, Fibers, Proteins, Salt.

If you can't extract the nutritional information from the image, you need to say why it's the case.

The response should contain **only** the following JSON:

{
"Energy kJ": 1500,
"Energy kcal": 360,
"Fat": 18,
"Saturated fat": 7,
"Carbohydrates": 40,
"Sugars": 25,
"Fibers": 3,
"Proteins": 8,
"Salt": 0.5
}

No additional text or explanation should be included.
"""

    return prompt
190
+
191
+
192
+ ##### Extract data functions
193
+
194
def extract_text_from_picture_baseline(OUTPUT_DIR,
                                       df_product_id,
                                       prompt,
                                       type_photo,
                                       generation_config,
                                       max_entry=None,
                                       progress=None
                                       ):
    """Run one prompt against one photo type for every product row.

    For each of the first `max_entry` rows of `df_product_id`, the photo
    URL in column `type_photo` is downloaded, uploaded to Gemini, and sent
    together with `prompt`. Per-product failures are logged and skipped.

    Returns:
        DataFrame with columns ID, Extracted_Text, Price, Processing time;
        rows for failed products are simply absent.
    """
    outputs = []

    if max_entry is None:
        max_entry = len(df_product_id)

    iterator = progress.tqdm(range(max_entry)) if progress is not None else range(max_entry)
    for i in iterator:

        start_time = time.time()
        # iloc, not loc: the reference data is sampled upstream, so the
        # positional index and the label index need not coincide.
        product = df_product_id.iloc[i]
        product_id = product['ID']

        photo_path = f'{OUTPUT_DIR}/photos/{product_id}_{type_photo}.jpg'

        download_file(url=product[type_photo], save_path=photo_path)

        photo = upload_file(photo_path)

        messages = [photo, prompt]

        try:
            # gemini_model / model are module-level globals set at import time.
            response_content, _, _, price = call_llm_gemini(gemini_model, model, messages, generation_config)

            print(response_content)
            processing_time = time.time() - start_time

            output = [product_id, response_content, round(price, 4), round(processing_time, 2)]
            outputs.append(output)

        except Exception as e:
            # Skip this product but keep processing the rest of the batch;
            # include the exception so failures are diagnosable.
            print(f"Error for ID: {product_id} ({e})")

    df_output = pd.DataFrame(outputs, columns=['ID', 'Extracted_Text', 'Price', 'Processing time'])

    return df_output
236
+
237
+
238
def extract_brand(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the brand from each front photo and persist brand.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_brand(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/brand.csv', index=False)
    return df_output
250
+
251
+
252
def extract_product_name(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the product name from each front photo and persist product_name.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_product_name(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/product_name.csv', index=False)
    return df_output
263
+
264
+
265
def extract_ingredients(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the ingredients list from each ingredients photo and persist ingredients.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_ingredients(language),
        type_photo="Ingredients photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/ingredients.csv', index=False)
    return df_output
276
+
277
+
278
def convert_json_string_to_dict(json_string, record_id):
    """Parse the LLM's nutritional JSON answer into a dict.

    Markdown code fences (``` or ```json) that models sometimes wrap around
    the payload are stripped first — the original had a `clean_string`
    placeholder that never cleaned anything. Falls back to a dict of -1 for
    every expected nutrient key when the string is empty or not valid JSON.
    """
    default_keys = ['Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins',
                    'Salt']

    clean_string = (json_string or "").strip()
    if clean_string.startswith("```"):
        # Strip a markdown code fence around the JSON payload.
        clean_string = clean_string[3:]
        if clean_string.lower().startswith("json"):
            clean_string = clean_string[4:]
        if clean_string.endswith("```"):
            clean_string = clean_string[:-3]
        clean_string = clean_string.strip()

    if not clean_string:
        print(f"ID: {record_id} - La chaîne est vide ou invalide : '{json_string}'")
        return {key: -1 for key in default_keys}

    try:
        return json.loads(clean_string)
    except json.JSONDecodeError:
        print(f"ID: {record_id} - Erreur lors du décodage du JSON : '{json_string}'")
        return {key: -1 for key in default_keys}
293
+
294
def extract_nutritional_values(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract per-100g nutritional values and write one CSV per nutrient.

    Runs the JSON nutritional prompt on each 'Nutritionals photo', saves the
    raw answers to nutritional_values.csv, then parses each answer into a
    dict and fans the values out to <nutrient>.csv files (e.g. fat.csv).
    """
    df_output = extract_text_from_picture_baseline(OUTPUT_DIR, df_product_id,
                                                   get_prompt_nutritional_info(),
                                                   type_photo="Nutritionals photo",
                                                   generation_config=generation_config_json,
                                                   max_entry=None,
                                                   progress=progress)

    # Persist the raw JSON answers before any parsing step can fail.
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/nutritional_values.csv', index=False)

    df_output['Extracted_Text_Json'] = df_output.apply(
        lambda row: convert_json_string_to_dict(row['Extracted_Text'], row['ID']), axis=1)

    # Use the keys of the first parsed dictionary as the reference nutrient set.
    # NOTE(review): .iloc[0] raises on an empty batch — confirm upstream
    # guarantees at least one row.
    keys = list(df_output['Extracted_Text_Json'].iloc[
                    0].keys())

    # One CSV per nutrient, named e.g. saturated_fat.csv, matching the
    # per-feature files consumed by the assessment step.
    for key in keys:
        df_key = df_output[['ID', 'Price', 'Processing time']].copy()
        df_key['Extracted_Text'] = df_output['Extracted_Text_Json'].apply(lambda x: x.get(key, None))
        df_key.to_csv(f"{OUTPUT_DIR}/data_extraction/{key.replace(' ', '_').lower()}.csv", index=False)

    df_output = df_output[['ID', 'Extracted_Text', 'Price', 'Processing time']]

    return df_output
319
+
320
+
321
# Module-level model setup: the extract_* functions above rely on these
# globals (model name must be a key of price_token).
model = 'gemini-1.5-pro-002'

gemini_model, generation_config, generation_config_json = load_gemini_model(model)