Yannael commited on
Commit
46f57ce
·
verified ·
1 Parent(s): 19f8e20

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +224 -0
  2. utils_assessment.py +241 -0
  3. utils_data_extraction.py +323 -0
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import os
4
+ import shutil
5
+
6
+ import gradio as gr
7
+
8
+ import utils_data_extraction
9
+ import utils_assessment
10
+
11
+ import importlib
12
+ importlib.reload(utils_data_extraction)
13
+ importlib.reload(utils_assessment)
14
+
15
+ """### Function to load data
16
+
17
+ Data is loaded from a Roamler Excel file, from a sheet called "output".
18
+
19
+ - A subset of the Excel file is taken as reference data, and saved in the `outputs` directory as reference_data.csv
20
+ - A folder for storing photos is created
21
+
22
+ A n_rows parameter can be passed to load a subset of the data.
23
+ """
24
+
25
def load_roamler_excel_file(filepath, n_rows=3):
    """Load a Roamler Excel file and prepare the per-file output folders.

    Reads the 'Output' sheet, keeps the reference columns, saves them to
    ``outputs/<file>/data_extraction/reference_data.csv`` and creates the
    photo folder. Pass ``n_rows=None`` to load the full sheet instead of a
    random sample.

    Returns:
        (df_products, OUTPUT_DIR, df_brand, df_product_name,
         df_ingredients, df_nutritional_values)
    """
    OUTPUT_DIR = 'outputs/' + os.path.basename(filepath)
    # exist_ok=True replaces the explicit os.path.exists() checks.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    DATA_EXTRACTION_DIR = OUTPUT_DIR + '/data_extraction'
    os.makedirs(DATA_EXTRACTION_DIR, exist_ok=True)

    df_review = pd.read_excel(filepath, sheet_name='Output')
    if n_rows is not None:
        # Fixed seed: re-uploading the same file yields the same sample.
        df_review = df_review.sample(n=n_rows, random_state=42)

    df_products = df_review[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo',
                             'Brand', 'Product name', 'Legal name', 'Barcode',
                             'Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins', 'Salt', 'Ingredients',
                             'Nutriscore', 'Allergens',
                             'Quantity per unit']].copy()

    df_products.to_csv(f'{OUTPUT_DIR}/data_extraction/reference_data.csv', index=False)

    PHOTO_DIR = OUTPUT_DIR + '/photos'
    os.makedirs(PHOTO_DIR, exist_ok=True)

    # Pick up any extraction results already saved for this file.
    df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_df_from_folder(OUTPUT_DIR)

    return df_products, OUTPUT_DIR, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
54
+
55
def load_df_from_folder(OUTPUT_DIR):
    """Load previously extracted feature CSVs from OUTPUT_DIR/data_extraction.

    For each feature (brand, product_name, ingredients, nutritional_values)
    an empty DataFrame with the expected columns is returned when the CSV
    does not exist yet, so downstream display code always gets the schema.

    Returns:
        (df_brand, df_product_name, df_ingredients, df_nutritional_values)
    """
    columns = ['ID', 'Extracted_Text', 'Price', 'Processing time']

    def _load(feature_name):
        # One loader for all four features: read the CSV if present,
        # otherwise start from an empty frame with the expected schema.
        path = f'{OUTPUT_DIR}/data_extraction/{feature_name}.csv'
        if os.path.exists(path):
            return pd.read_csv(path)
        return pd.DataFrame(columns=columns)

    return (_load('brand'), _load('product_name'),
            _load('ingredients'), _load('nutritional_values'))
74
+
75
def load_csv_files(archive, OUTPUT_DIR):
    """Copy uploaded extraction CSVs into the data_extraction folder.

    Only the four known feature files are accepted; anything else in the
    upload is ignored. The refreshed DataFrames are then reloaded from disk.
    """
    accepted = {'brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv'}
    destination = f'{OUTPUT_DIR}/data_extraction'

    for uploaded in archive:
        basename = os.path.basename(uploaded)
        print(basename)
        if basename in accepted:
            shutil.copy(uploaded, destination)

    return load_df_from_folder(OUTPUT_DIR)
86
+
87
+ """### Function to save data
88
+
89
+ This function is called when the user clicks on the "Generate data archive" button.
90
+
91
+ It creates a zip of all CSV files of the f'{OUTPUT_DIR}/data_extraction' folder, and return a download button to the archive.
92
+ """
93
+
94
def generate_archive(OUTPUT_DIR):
    """Zip the data_extraction folder and expose a visible download button.

    Called by the "Generate data archive" button. The archive is written
    next to OUTPUT_DIR as '<OUTPUT_DIR>.zip'.
    """
    # shutil.make_archive appends the '.zip' suffix itself.
    archive_name = OUTPUT_DIR
    shutil.make_archive(archive_name, 'zip', f'{OUTPUT_DIR}/data_extraction')

    return gr.DownloadButton(label=f"Download {archive_name}.zip", value=f'{archive_name}.zip', visible=True)
101
+
102
+ """### Gradio UI"""
103
+
104
def toggle_row_visibility(show):
    """Return a gradio update making the target row visible iff `show` is truthy."""
    return gr.update(visible=bool(show))
109
+
110
# Default extraction language; replaced below by the gr.Dropdown component.
language = 'French'

# Custom CSS to set max height for the rows
custom_css = """
.dataframe-wrap {
    max-height: 300px; /* Set the desired height */
    overflow-y: scroll;
}
"""

# Placeholders shown before any Excel file is uploaded.
OUTPUT_DIR_value = ""
dummy_data = df_brand_data = df_product_name_data = df_ingredients_data = df_nutritional_values_data = pd.DataFrame()
#dummy_data, OUTPUT_DIR_value, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_roamler_excel_file("FDL-Datasets3/FR - Review.xlsm", n_rows=3)

with gr.Blocks(css=custom_css) as fdl_data_extraction_ui:

    gr.HTML("<div align='center'><h1>Euroconsumers Food Data Lake</h1>")
    gr.HTML("<div align='center'><h2>Data extraction</h2>")

    # Per-session state holding the output directory of the uploaded file.
    OUTPUT_DIR = gr.State(value=OUTPUT_DIR_value)

    with gr.Row():
        with gr.Column():
            gr.HTML("<h2>Upload Roamler Excel file</h2>")
            load_roamler_excel_file_input = gr.File(label="Upload Roamler Excel file", type="filepath")

    # Hidden until a file is uploaded (see toggle_row_visibility wiring below).
    with gr.Row(visible=False) as dataset_block:
        with gr.Column():
            gr.HTML("<h2>Dataset summary</h2>")

            # Display summary of the dataset - ID, Reference_brand, Reference_product_name, mean_accuracy_score
            with gr.Row(elem_classes="dataframe-wrap"):
                dataframe_component = gr.DataFrame(value=dummy_data, interactive=False)

    with gr.Row(visible=False) as product_detail_block:
        with gr.Column():
            # Section for product details
            gr.HTML("<h1>Data extraction</h1>")

            load_csv_files_input = gr.Files(label="Upload extracted data from CSV files")

            language = gr.Dropdown(label="Select language", choices=["French", "Dutch", "Spanish", "Italian", "Portuguese"], value="French")

            gr.HTML("<h3>Brand</h3>")
            extract_brand_button = gr.Button("Extract brand")
            df_brand = gr.Dataframe(label="Brand data", scale=2,
                                    column_widths=["10%", "60%", "15%", "15%"],
                                    wrap=True, value=df_brand_data)

            gr.HTML("<h3>Product name</h3>")
            extract_product_name_button = gr.Button("Extract product_name")
            df_product_name = gr.Dataframe(label="Product name data", scale=2,
                                           column_widths=["10%", "60%", "15%", "15%"],
                                           wrap=True, value=df_product_name_data)

            gr.HTML("<h3>Ingredients</h3>")
            extract_ingredients_button = gr.Button("Extract ingredients")
            df_ingredients = gr.Dataframe(label="Ingredients data", scale=2,
                                          column_widths=["10%", "60%", "15%", "15%"],
                                          wrap=True, value=df_ingredients_data)

            gr.HTML("<h3>Nutritional values</h3>")
            extract_nutritional_values_button = gr.Button("Extract nutritional values")
            df_nutritional_values = gr.Dataframe(label="Nutritional data", scale=2,
                                                 column_widths=["10%", "60%", "15%", "15%"],
                                                 wrap=True, value=df_nutritional_values_data)

            # Download
            gr.HTML("<h1>Data download</h1>")

            generate_merged_file_button = gr.Button("Generate merged file")
            generate_archive_button = gr.Button("Generate data archive")
            download_button = gr.DownloadButton("Download archive", visible=False)

    ### Control functions

    # Linking the select_dataset change event to update both the gradio DataFrame and product_ids dropdown
    load_roamler_excel_file_input.change(load_roamler_excel_file,
                                         inputs=load_roamler_excel_file_input,
                                         outputs=[dataframe_component, OUTPUT_DIR,
                                                  df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Toggle visibility of the dataset block
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=dataset_block)
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=product_detail_block)

    load_csv_files_input.change(load_csv_files,
                                inputs=[load_csv_files_input, OUTPUT_DIR],
                                outputs=[df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Data extraction: each button runs one prompt over the whole dataset.
    extract_brand_button.click(utils_data_extraction.extract_brand,
                               inputs=[OUTPUT_DIR, dataframe_component, language],
                               outputs=df_brand)

    extract_product_name_button.click(utils_data_extraction.extract_product_name,
                                      inputs=[OUTPUT_DIR, dataframe_component, language],
                                      outputs=df_product_name)

    extract_ingredients_button.click(utils_data_extraction.extract_ingredients,
                                     inputs=[OUTPUT_DIR, dataframe_component, language],
                                     outputs=df_ingredients)

    extract_nutritional_values_button.click(utils_data_extraction.extract_nutritional_values,
                                            inputs=[OUTPUT_DIR, dataframe_component, language],
                                            outputs=df_nutritional_values)

    # NOTE(review): merge_and_save_data's return value is discarded here —
    # presumably only its merged.csv side effect matters; confirm.
    generate_merged_file_button.click(utils_assessment.merge_and_save_data, inputs=OUTPUT_DIR)

    generate_archive_button.click(generate_archive, inputs=OUTPUT_DIR, outputs=download_button)

fdl_data_extraction_ui.launch(debug=True)
222
+
223
+
224
+
utils_assessment.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from rouge_score import rouge_scorer
4
+
5
+ import Levenshtein
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
# Configuration for every assessed feature: the reference column in the Excel
# output, the scoring function used to grade predictions, an optional
# post-processing step applied to predictions first, and the number of
# cross-validation folds. Function names are strings resolved at runtime by
# create_eval_data. (The original keys/values used pointless f-string
# prefixes on constant literals; they are plain strings now.)
feature_assessment_entries = {

    'brand': {
        'name': 'brand',
        'output_column': 'Brand',
        'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_brand',
        'k_folds': 3,
    },

    'product_name': {
        'name': 'product_name',
        'output_column': 'Product name',
        'scoring_function_name': 'grade_levenshtein_match',
        # 'scoring_function_name' : 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'ingredients': {
        'name': 'ingredients',
        'output_column': 'Ingredients',
        'scoring_function_name': 'grade_rouge_score',
        # 'scoring_function_name' : 'grade_levenshtein_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_ingredients',
        'k_folds': 3,
    },

    'energy_kj': {
        'name': 'energy_kj',
        'output_column': 'Energy kJ',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'energy_kcal': {
        'name': 'energy_kcal',
        'output_column': 'Energy kcal',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'fat': {
        'name': 'fat',
        'output_column': 'Fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'saturated_fat': {
        'name': 'saturated_fat',
        'output_column': 'Saturated fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'carbohydrates': {
        'name': 'carbohydrates',
        'output_column': 'Carbohydrates',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'sugars': {
        'name': 'sugars',
        'output_column': 'Sugars',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'fibers': {
        'name': 'fibers',
        'output_column': 'Fibers',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'proteins': {
        'name': 'proteins',
        'output_column': 'Proteins',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

    'salt': {
        'name': 'salt',
        'output_column': 'Salt',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },

}
113
+
114
+
115
def post_processing_none(string):
    """Identity post-processing: return the extracted text unchanged."""
    return string
117
+
118
+
119
def post_processing_ingredients(string):
    """Extract the ingredient list from a raw LLM answer.

    Takes the content of the first <ingredients>...</ingredients> tag when
    present (falling back to the whole string), then strips a leading
    'Ingrediënten:'/'Ingredienten:'/'Ingredients:' label case-insensitively.
    """
    tagged = re.findall(r"<ingredients>(.*?)</ingredients>", string, re.DOTALL)
    output = tagged[0].strip() if tagged else string

    # Both Dutch spellings have the same length, so one slice width suffices.
    if output.lower().startswith(("ingrediënten: ", "ingredienten: ")):
        output = output[len("ingrediënten: "):]
    if output.lower().startswith("ingredients: "):
        output = output[len("ingredients: "):]

    return output
134
+
135
+
136
def post_processing_brand(brand):
    """Canonicalize store-brand shorthands to their full brand names."""
    canonical = {
        "boni": "Boni Selection",
        "rana": "Giovanni Rana",
        "the market": "Carrefour The Market",
        "extra": "Carrefour Extra",
    }
    return canonical.get(brand.lower(), brand)
147
+
148
+
149
def post_processing_nutritionals(predicted_value):
    """Extract the first numeric token (integer or decimal) from an answer.

    Returns the number as a string, or ``np.nan`` when no number is found
    (replaces the original bare ``except`` around an IndexError-prone
    ``[0]`` with an explicit empty-match check).
    """
    matches = re.findall(r"[-+]?\d*\.\d+|\d+", str(predicted_value))
    if not matches:
        # No numeric token at all (e.g. 'value not visible on the photo').
        return np.nan
    return matches[0]
156
+
157
def grade_levenshtein_match(predicted_value, reference_value):
    """Levenshtein similarity in [0, 1] between the normalized strings."""
    normalized_prediction = predicted_value.lower().strip()
    normalized_reference = reference_value.lower().strip()
    return Levenshtein.ratio(normalized_prediction, normalized_reference)
160
+
161
+
162
def grade_exact_match(predicted_value, reference_value):
    """Return 1 if predicted and reference match after normalization, else 0.

    Normalization lower-cases, strips, and collapses internal whitespace on
    BOTH sides. The original collapsed whitespace only in the reference, so
    an otherwise-identical prediction with a double space failed the match.
    """
    def _normalize(value):
        return re.sub(r'\s+', ' ', value.lower().strip())

    return int(_normalize(predicted_value) == _normalize(reference_value))
170
+
171
+
172
def grade_rouge_score(predicted_value, reference_value):
    """ROUGE-2 F-measure between the predicted and reference text."""
    rouge2 = rouge_scorer.RougeScorer(['rouge2']).score(predicted_value, reference_value)['rouge2']
    return rouge2.fmeasure
177
+
178
+
179
def grade_numerical(predicted_value, reference_value):
    """Grade two numeric values.

    Returns:
        1 when both parse to equal floats (or both are NaN — "both missing"
        counts as a match), 0 when they differ, -1 when either value cannot
        be parsed as a float (narrowed from the original bare ``except``).
    """
    try:
        predicted = float(predicted_value)
        reference = float(reference_value)
    except (TypeError, ValueError):
        # Non-numeric input (e.g. 'Failed' or None): flag as unparseable.
        return -1

    if np.isnan(predicted) and np.isnan(reference):
        return 1
    return int(predicted == reference)
189
+
190
+
191
def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
    """Build the evaluation table for one feature.

    Joins the feature's extraction CSV with the reference data, applies the
    configured post-processing to the predictions, scores them against the
    reference column and assigns each row to a random (but seeded) fold.

    Args:
        OUTPUT_DIR: folder containing reference_data.csv and <feature>.csv.
        feature_assessment_entry: one entry of ``feature_assessment_entries``.

    Returns:
        DataFrame with columns ID, Reference, Predicted, Price,
        Processing time, accuracy_score and fold.
    """
    df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")
    df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")

    df_features = df_features.merge(df_product_id, on='ID', how='left')
    df_eval_data = df_features[
        ['ID', feature_assessment_entry['output_column'], 'Extracted_Text', 'Price', 'Processing time']].copy()
    df_eval_data.rename(columns={feature_assessment_entry['output_column']: 'Reference',
                                 'Extracted_Text': 'Predicted'}, inplace=True)

    # Resolve the configured function names from this module's namespace
    # instead of eval(): same dispatch, no arbitrary-code-execution risk.
    post_process = globals()[feature_assessment_entry['post_processing_function_name']]
    score_fn = globals()[feature_assessment_entry['scoring_function_name']]

    df_eval_data['Predicted'] = df_eval_data['Predicted'].apply(post_process)
    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: score_fn(row['Predicted'], row['Reference']), axis=1)
    df_eval_data['accuracy_score'] = round(df_eval_data['accuracy_score'], 2)

    # Seeded fold assignment keeps the k-fold split reproducible.
    np.random.seed(42)
    df_eval_data['fold'] = np.random.randint(0, feature_assessment_entry['k_folds'], size=len(df_eval_data))

    return df_eval_data
217
+
218
+
219
def merge_and_save_data(OUTPUT_DIR):
    """Assemble one wide CSV with reference, prediction and score per feature.

    Reads the reference data plus every per-feature evaluation, prefixes the
    evaluation columns with the feature name, concatenates them side by side
    and writes the result to data_extraction/merged.csv.
    """
    df_ref_data = pd.read_csv(f"{OUTPUT_DIR}/data_extraction/reference_data.csv")

    # First block: the identifying columns from the reference data.
    blocks = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo']]]

    for feature_name, entry in feature_assessment_entries.items():
        df_eval = create_eval_data(f'{OUTPUT_DIR}/data_extraction', entry)
        df_eval = df_eval[['Reference', 'Predicted', 'accuracy_score']].rename(columns={
            'Reference': 'Reference_' + feature_name,
            'Predicted': 'Predicted_' + feature_name,
            'accuracy_score': 'accuracy_score_' + feature_name,
        })
        blocks.append(df_eval)

    data_merged = pd.concat(blocks, axis=1)
    data_merged.to_csv(f"{OUTPUT_DIR}/data_extraction/merged.csv")

    return data_merged
241
+
utils_data_extraction.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ import os
4
+ import time
5
+ import gradio as gr
6
+ import json
7
+
8
+ import google.generativeai as genai
9
+
10
+ from dotenv import load_dotenv
11
# Load variables from a local .env file into the environment (no-op if absent).
load_dotenv()
# NOTE(review): if GOOGLE_API_KEY is unset this passes None to configure —
# API calls would then fail later rather than here; confirm that's intended.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
14
+
15
+
16
+
17
+ ############## Photos ##############
18
+
19
+
20
def download_file(url, save_path):
    """Download `url` to `save_path`, logging (not raising) on failure.

    Best-effort by design: callers loop over many photos and a single bad
    URL should not abort the whole batch.
    """
    try:
        # Send a GET request to the URL. A timeout prevents one dead URL
        # from hanging the whole extraction run indefinitely.
        response = requests.get(url, timeout=60)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Open the specified path in binary-write mode and save the content
            with open(save_path, 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download image. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
34
+
35
def upload_file(photo_path):
    """Upload a local photo to the Gemini Files API and return its handle."""
    return genai.upload_file(photo_path)
40
+
41
+ ###### Data extraction
42
+
43
+ ## Helper function to initialize model
44
+
45
# USD price per token for each supported model (separate input/output rates);
# used by call_llm_gemini to estimate the cost of every request.
price_token={'gemini-1.5-pro-002': {'input': 1.25 / 1000000, 'output': 5 / 1000000}
}

# Disable every Gemini safety category — presumably so benign food-label
# photos are never blocked mid-batch (NOTE(review): confirm this is intended
# for production use).
gemini_safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]
70
+
71
+
72
def load_gemini_model(model_name):
    """Instantiate a Gemini model plus plain-text and JSON generation configs.

    Returns:
        (gemini_model, generation_config, generation_config_json)
    """
    def _make_config(mime_type):
        # Deterministic single-candidate configs that differ only in the
        # response MIME type.
        return genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=4000,
            temperature=0,
            response_mime_type=mime_type,
        )

    generation_config = _make_config("text/plain")
    generation_config_json = _make_config("application/json")

    system_prompt = ["You are a helpful assistant."]
    gemini_model = genai.GenerativeModel(model_name, system_instruction=system_prompt,
                                         safety_settings=gemini_safety_settings)

    return gemini_model, generation_config, generation_config_json
95
+
96
+
97
+
98
+ ##### Call LLM
99
+
100
+
101
def call_llm_gemini(model_instance, model, messages, generation_config):
    """Send `messages` to Gemini and return (text, in_tokens, out_tokens, price).

    `model` is the model-name key into the module-level ``price_token`` table.
    The response text is the sentinel 'Failed' when no usable candidate is
    returned (e.g. a safety block), so callers always get a string back.
    """
    response = model_instance.generate_content(messages,
                                               generation_config=generation_config)

    try:
        response_content = response.text.strip()
    except (ValueError, AttributeError):
        # response.text raises when generation was blocked or produced no
        # candidate; fall back to a sentinel instead of crashing the batch.
        response_content = 'Failed'

    nb_input_tokens = model_instance.count_tokens(messages).total_tokens
    nb_output_tokens = model_instance.count_tokens(response_content).total_tokens
    price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']
    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    return response_content, nb_input_tokens, nb_output_tokens, price
117
+
118
+
119
+
120
+ ##### Prompts
121
+
122
def get_prompt_brand(language):
    """Prompt asking for the brand name only (language-independent; the
    parameter is kept for signature symmetry with the other prompt builders)."""
    return "What is the brand of this product? Answer with the brand name and nothing else."
127
+
128
def get_prompt_product_name(language):
    """Prompt asking for the product name in the requested language."""
    return (f"What is the {language} product name of this product? "
            f"Answer in {language} with the product name and nothing else.")
133
+
134
def get_prompt_ingredients(language):
    """Return the detailed prompt for extracting the `language` ingredients
    list, asking the model to wrap its transcription in <ingredients> tags."""
    prompt=f"""
You will be given an image of a product label or packaging. Your task is to extract the ingredients list from this image, focusing specifically on the {language} language version. Here's how to approach this task:

1. Analyze the provided image

2. Locate the ingredients list on the product label or packaging.

3. Identify the {language} language section of the ingredients list.

4. Extract only the {language} ingredients list. Ignore any ingredients lists in other languages, even if they are present in the image.

5. If there are multiple {language} ingredient lists (e.g., for different flavors or varieties), extract all of them and clearly separate them.

6. Do not include any additional information such as allergen warnings, nutritional information, or preparation instructions, even if they are in {language}.

7. If you cannot find a {language} ingredients list in the image, state that no {language} ingredients list was found.

8. If the image is unclear, state that the image quality is insufficient to extract the ingredients list accurately.

Provide your output in the following format:

<ingredients>
[Insert the extracted {language} ingredients list here, exactly as it appears in the image]
</ingredients>

Remember, include only the text of the {language} ingredients list, nothing else. Do not translate or interpret the ingredients; simply transcribe them as they appear in {language}.
"""

    return prompt
164
+
165
+
166
def get_prompt_nutritional_info():
    """Return the JSON-only prompt for per-100g nutritional extraction.

    Deliberately a plain string (not an f-string): the example JSON below
    contains literal braces.
    """
    prompt = """Extract the following nutritional information from the product image and present it **only** in JSON format, providing only the values per 100g: Energy kJ, Energy kcal, Fat, Saturated fat, Carbohydrates, Sugars, Fibers, Proteins, Salt.

If you can't extract the nutritional information from the image, you need to say why it's the case.

The response should contain **only** the following JSON:

{
"Energy kJ": 1500,
"Energy kcal": 360,
"Fat": 18,
"Saturated fat": 7,
"Carbohydrates": 40,
"Sugars": 25,
"Fibers": 3,
"Proteins": 8,
"Salt": 0.5
}

No additional text or explanation should be included.
"""

    return prompt
190
+
191
+
192
+ ##### Extract data functions
193
+
194
def extract_text_from_picture_baseline(OUTPUT_DIR,
                                       df_product_id,
                                       prompt,
                                       type_photo,
                                       generation_config,
                                       max_entry=None,
                                       progress=None
                                       ):
    """Run one prompt against one photo type for every product row.

    For each of the first `max_entry` rows of `df_product_id`, the photo
    URL in column `type_photo` is downloaded, uploaded to Gemini, and sent
    together with `prompt`. Per-product failures are logged and skipped.

    Returns:
        DataFrame with columns ID, Extracted_Text, Price, Processing time;
        rows for failed products are simply absent.
    """
    outputs = []

    if max_entry is None:
        max_entry = len(df_product_id)

    iterator = progress.tqdm(range(max_entry)) if progress is not None else range(max_entry)
    for i in iterator:

        start_time = time.time()
        # iloc, not loc: the reference data is sampled upstream, so the
        # positional index and the label index need not coincide.
        product = df_product_id.iloc[i]
        product_id = product['ID']

        photo_path = f'{OUTPUT_DIR}/photos/{product_id}_{type_photo}.jpg'

        download_file(url=product[type_photo], save_path=photo_path)

        photo = upload_file(photo_path)

        messages = [photo, prompt]

        try:
            # gemini_model / model are module-level globals set at import time.
            response_content, _, _, price = call_llm_gemini(gemini_model, model, messages, generation_config)

            print(response_content)
            processing_time = time.time() - start_time

            output = [product_id, response_content, round(price, 4), round(processing_time, 2)]
            outputs.append(output)

        except Exception as e:
            # Skip this product but keep processing the rest of the batch;
            # include the exception so failures are diagnosable.
            print(f"Error for ID: {product_id} ({e})")

    df_output = pd.DataFrame(outputs, columns=['ID', 'Extracted_Text', 'Price', 'Processing time'])

    return df_output
236
+
237
+
238
def extract_brand(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the brand from each front photo and persist brand.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_brand(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/brand.csv', index=False)
    return df_output
250
+
251
+
252
def extract_product_name(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the product name from each front photo and persist product_name.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_product_name(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/product_name.csv', index=False)
    return df_output
263
+
264
+
265
def extract_ingredients(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the ingredients list from each ingredients photo and persist ingredients.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_ingredients(language),
        type_photo="Ingredients photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/ingredients.csv', index=False)
    return df_output
276
+
277
+
278
def convert_json_string_to_dict(json_string, record_id):
    """Parse the LLM's nutritional JSON answer into a dict.

    Markdown code fences (``` or ```json) that models sometimes wrap around
    the payload are stripped first — the original had a `clean_string`
    placeholder that never cleaned anything. Falls back to a dict of -1 for
    every expected nutrient key when the string is empty or not valid JSON.
    """
    default_keys = ['Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins',
                    'Salt']

    clean_string = (json_string or "").strip()
    if clean_string.startswith("```"):
        # Strip a markdown code fence around the JSON payload.
        clean_string = clean_string[3:]
        if clean_string.lower().startswith("json"):
            clean_string = clean_string[4:]
        if clean_string.endswith("```"):
            clean_string = clean_string[:-3]
        clean_string = clean_string.strip()

    if not clean_string:
        print(f"ID: {record_id} - La chaîne est vide ou invalide : '{json_string}'")
        return {key: -1 for key in default_keys}

    try:
        return json.loads(clean_string)
    except json.JSONDecodeError:
        print(f"ID: {record_id} - Erreur lors du décodage du JSON : '{json_string}'")
        return {key: -1 for key in default_keys}
293
+
294
def extract_nutritional_values(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract per-100g nutritional values and write one CSV per nutrient.

    Runs the JSON nutritional prompt on each 'Nutritionals photo', saves the
    raw answers to nutritional_values.csv, then parses each answer into a
    dict and fans the values out to <nutrient>.csv files (e.g. fat.csv).
    """
    df_output = extract_text_from_picture_baseline(OUTPUT_DIR, df_product_id,
                                                   get_prompt_nutritional_info(),
                                                   type_photo="Nutritionals photo",
                                                   generation_config=generation_config_json,
                                                   max_entry=None,
                                                   progress=progress)

    # Persist the raw JSON answers before any parsing step can fail.
    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/nutritional_values.csv', index=False)

    df_output['Extracted_Text_Json'] = df_output.apply(
        lambda row: convert_json_string_to_dict(row['Extracted_Text'], row['ID']), axis=1)

    # Use the keys of the first parsed dictionary as the reference nutrient set.
    # NOTE(review): .iloc[0] raises on an empty batch — confirm upstream
    # guarantees at least one row.
    keys = list(df_output['Extracted_Text_Json'].iloc[
                    0].keys())

    # One CSV per nutrient, named e.g. saturated_fat.csv, matching the
    # per-feature files consumed by the assessment step.
    for key in keys:
        df_key = df_output[['ID', 'Price', 'Processing time']].copy()
        df_key['Extracted_Text'] = df_output['Extracted_Text_Json'].apply(lambda x: x.get(key, None))
        df_key.to_csv(f"{OUTPUT_DIR}/data_extraction/{key.replace(' ', '_').lower()}.csv", index=False)

    df_output = df_output[['ID', 'Extracted_Text', 'Price', 'Processing time']]

    return df_output
319
+
320
+
321
# Module-level model setup: the extract_* functions above rely on these
# globals (model name must be a key of price_token).
model = 'gemini-1.5-pro-002'

gemini_model, generation_config, generation_config_json = load_gemini_model(model)