Spaces:

Saving-Willy
/

saving-willy-dev

Sleeping

App Files Files Community

vancauwe commited on Feb 4

Commit

8ccb11f

unverified ·

2 Parent(s): 41bbd4a 8e4ef44

Merge pull request #29 from sdsc-ordes/feat/stateful-workflow

Browse files

Files changed (17) hide show

.github/workflows/python-pycov-onPR.yml +39 -0
requirements.txt +2 -1
src/classifier/classifier_image.py +202 -8
src/classifier_image.py +0 -70
src/hf_push_observations.py +74 -8
src/input/input_handling.py +370 -72
src/input/input_observation.py +190 -43
src/input/input_validator.py +28 -9
src/main.py +149 -76
src/maps/obs_map.py +2 -2
src/utils/metadata_handler.py +15 -5
src/utils/st_logs.py +11 -0
src/utils/workflow_state.py +108 -0
src/utils/workflow_ui.py +48 -0
src/whale_viewer.py +3 -0
tests/test_input_handling.py +2 -5
tests/test_whale_viewer.py +1 -3

.github/workflows/python-pycov-onPR.yml ADDED Viewed

	@@ -0,0 +1,39 @@

+# This workflow will install dependencies, create coverage tests and run Pytest Coverage Commentator
+# For more information see: https://github.com/coroo/pytest-coverage-commentator
+name: pytest-coverage-in-PR
+on:
+  pull_request:
+    branches:
+      - '*'
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+        contents: write
+        pull-requests: write
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        if [ -f tests/requirements.txt ]; then pip install -r tests/requirements.txt; fi
+    - name: Build coverage files for mishakav commenter action
+      run: |
+        pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=src tests/ | tee pytest-coverage.txt
+        echo "working dir:" && pwd
+        echo "files in cwd:" && ls -ltr
+    - name: Pytest coverage comment
+      uses: MishaKav/pytest-coverage-comment@main
+      with:
+        pytest-coverage-path: ./pytest-coverage.txt
+        junitxml-path: ./pytest.xml
+    #- name: Comment coverage
+    #  uses: coroo/[email protected]

requirements.txt CHANGED Viewed

@@ -10,7 +10,8 @@ streamlit_folium==0.23.1
 # backend
 datasets==3.0.2
 # running ML models

 # backend
 datasets==3.0.2
+## FSM
+transitions==0.9.2
 # running ML models

src/classifier/classifier_image.py CHANGED Viewed

@@ -10,13 +10,207 @@ import whale_viewer as viewer
 from hf_push_observations import push_observations
 from utils.grid_maker import gridder
 from utils.metadata_handler import metadata2md
-def cetacean_classify(cetacean_classifier):
     """Cetacean classifier using the saving-willy model from Saving Willy Hugging Face space.
     For each image in the session state, classify the image and display the top 3 predictions.
     Args:
         cetacean_classifier ([type]):  saving-willy model from Saving Willy Hugging Face space
     """
     images = st.session_state.images
     observations = st.session_state.observations
     hashes = st.session_state.image_hashes
@@ -33,25 +227,25 @@ def cetacean_classify(cetacean_classifier):
             observation = observations[hash].to_dict()
             # run classifier model on `image`, and persistently store the output
             out = cetacean_classifier(image) # get top 3 matches
-            st.session_state.whale_prediction1 = out['predictions'][0]
-            st.session_state.classify_whale_done = True
-            msg = f"[D]2 classify_whale_done: {st.session_state.classify_whale_done}, whale_prediction1: {st.session_state.whale_prediction1}"
             g_logger.info(msg)
             # dropdown for selecting/overriding the species prediction
-            if not st.session_state.classify_whale_done:
                 selected_class = st.sidebar.selectbox("Species", viewer.WHALE_CLASSES,
                                                                 index=None, placeholder="Species not yet identified...",
                                                                 disabled=True)
             else:
-                pred1 = st.session_state.whale_prediction1
                 # get index of pred1 from WHALE_CLASSES, none if not present
                 print(f"[D] pred1: {pred1}")
                 ix = viewer.WHALE_CLASSES.index(pred1) if pred1 in viewer.WHALE_CLASSES else None
                 selected_class = st.selectbox(f"Species for observation {str(o)}", viewer.WHALE_CLASSES, index=ix)
             observation['predicted_class'] = selected_class
-            if selected_class != st.session_state.whale_prediction1:
                 observation['class_overriden'] = selected_class
             st.session_state.public_observation = observation
@@ -70,4 +264,4 @@ def cetacean_classify(cetacean_classifier):
             for i in range(len(whale_classes)):
                 viewer.display_whale(whale_classes, i)
         o += 1
-        col = (col + 1) % row_size

 from hf_push_observations import push_observations
 from utils.grid_maker import gridder
 from utils.metadata_handler import metadata2md
+from input.input_observation import InputObservation
+def init_classifier_session_states() -> None:
+    '''
+    Initialise the session state variables used in classification
+    '''
+    if "classify_whale_done" not in st.session_state:
+        st.session_state.classify_whale_done = {}
+    if "whale_prediction1" not in st.session_state:
+        st.session_state.whale_prediction1 = {}
+def add_classifier_header() -> None:
+    """
+    Add brief explainer text about cetacean classification to the tab
+    """
+    st.markdown("""
+                *Run classifer to identify the species of cetean on the uploaded image.
+                Once inference is complete, the top three predictions are shown.
+                You can override the prediction by selecting a species from the dropdown.*""")
+# func to just run classification, store results.
+def cetacean_just_classify(cetacean_classifier):
+    """
+    Infer cetacean species for all observations in the session state.
+    - this function runs the classifier, and stores results in the session state.
+    - the top 3 predictions are stored in the observation object, which is retained
+      in st.session_state.observations
+    - to display results use cetacean_show_results() or cetacean_show_results_and_review()
+    Args:
+        cetacean_classifier ([type]):  saving-willy model from Saving Willy Hugging Face space
+    """
+    images = st.session_state.images
+    #observations = st.session_state.observations
+    hashes = st.session_state.image_hashes
+    for hash in hashes:
+        image = images[hash]
+        # run classifier model on `image`, and persistently store the output
+        out = cetacean_classifier(image) # get top 3 matches
+        st.session_state.whale_prediction1[hash] = out['predictions'][0]
+        st.session_state.classify_whale_done[hash] = True
+        st.session_state.observations[hash].set_top_predictions(out['predictions'][:])
+        msg = f"[D]2 classify_whale_done for {hash}: {st.session_state.classify_whale_done[hash]}, whale_prediction1: {st.session_state.whale_prediction1[hash]}"
+        g_logger.info(msg)
+        if st.session_state.MODE_DEV_STATEFUL:
+            st.write(f"*[D] Observation {hash} classified as {st.session_state.whale_prediction1[hash]}*")
+# func to show results and allow review
+def cetacean_show_results_and_review() -> None:
+    """
+    Present classification results and allow user to review and override the prediction.
+    - for each observation in the session state, displays the image, summarised
+      metadata, and the top 3 predictions.
+    - allows user to override the prediction by selecting a species from the dropdown.
+    - the selected species is stored in the observation object, which is retained in
+      st.session_state.observations
+    """
+    images = st.session_state.images
+    observations = st.session_state.observations
+    hashes = st.session_state.image_hashes
+    batch_size, row_size, page = gridder(hashes)
+    grid = st.columns(row_size)
+    col = 0
+    o = 1
+    for hash in hashes:
+        image = images[hash]
+        #observation = observations[hash].to_dict()
+        _observation:InputObservation = observations[hash]
+        with grid[col]:
+            st.image(image, use_column_width=True)
+            # dropdown for selecting/overriding the species prediction
+            if not st.session_state.classify_whale_done[hash]:
+                selected_class = st.sidebar.selectbox("Species", viewer.WHALE_CLASSES,
+                                                                index=None, placeholder="Species not yet identified...",
+                                                                disabled=True)
+            else:
+                pred1 = st.session_state.whale_prediction1[hash]
+                # get index of pred1 from WHALE_CLASSES, none if not present
+                print(f"[D] {o:3} pred1: {pred1:30} | {hash}")
+                ix = viewer.WHALE_CLASSES.index(pred1) if pred1 in viewer.WHALE_CLASSES else None
+                selected_class = st.selectbox(f"Species for observation {str(o)}", viewer.WHALE_CLASSES, index=ix)
+            _observation.set_selected_class(selected_class)
+            #observation['predicted_class'] = selected_class
+            # this logic is now in the InputObservation class automatially
+            #if selected_class != st.session_state.whale_prediction1[hash]:
+            #    observation['class_overriden'] = selected_class # TODO: this should be boolean!
+            # store the elements of the observation that will be transmitted (not image)
+            observation = _observation.to_dict()
+            st.session_state.public_observations[hash] = observation
+            #st.button(f"Upload observation {str(o)} to THE INTERNET!", on_click=push_observations)
+            # TODO: the metadata only fills properly if `validate` was clicked.
+            st.markdown(metadata2md(hash, debug=True))
+            msg = f"[D] full observation after inference: {observation}"
+            g_logger.debug(msg)
+            print(msg)
+            # TODO: add a link to more info on the model, next to the button.
+            whale_classes = observations[hash].top_predictions
+            # render images for the top 3 (that is what the model api returns)
+            n = len(whale_classes)
+            st.markdown(f"**Top {n} Predictions for observation {str(o)}**")
+            for i in range(n):
+                viewer.display_whale(whale_classes, i)
+        o += 1
+        col = (col + 1) % row_size
+# func to just present results
+def cetacean_show_results():
+    """
+    Present classification results that may be pushed to the online dataset.
+    - for each observation in the session state, displays the image, summarised
+      metadata, the top 3 predictions, and the selected species (which may have
+      been manually selected, or the top prediction accepted).
+    """
+    images = st.session_state.images
+    observations = st.session_state.observations
+    hashes = st.session_state.image_hashes
+    batch_size, row_size, page = gridder(hashes)
+    grid = st.columns(row_size)
+    col = 0
+    o = 1
+    for hash in hashes:
+        image = images[hash]
+        observation = observations[hash].to_dict()
+        with grid[col]:
+            st.image(image, use_column_width=True)
+            # # dropdown for selecting/overriding the species prediction
+            # if not st.session_state.classify_whale_done[hash]:
+            #     selected_class = st.sidebar.selectbox("Species", viewer.WHALE_CLASSES,
+            #                                                     index=None, placeholder="Species not yet identified...",
+            #                                                     disabled=True)
+            # else:
+            #     pred1 = st.session_state.whale_prediction1[hash]
+            #     # get index of pred1 from WHALE_CLASSES, none if not present
+            #     print(f"[D] pred1: {pred1}")
+            #     ix = viewer.WHALE_CLASSES.index(pred1) if pred1 in viewer.WHALE_CLASSES else None
+            #     selected_class = st.selectbox(f"Species for observation {str(o)}", viewer.WHALE_CLASSES, index=ix)
+            # observation['predicted_class'] = selected_class
+            # if selected_class != st.session_state.whale_prediction1[hash]:
+            #     observation['class_overriden'] = selected_class # TODO: this should be boolean!
+            # st.session_state.public_observation = observation
+            #st.button(f"Upload observation {str(o)} to THE INTERNET!", on_click=push_observations)
+            #
+            st.markdown(metadata2md(hash, debug=True))
+            msg = f"[D] full observation after inference: {observation}"
+            g_logger.debug(msg)
+            print(msg)
+            # TODO: add a link to more info on the model, next to the button.
+            whale_classes = observations[hash].top_predictions
+            # render images for the top 3 (that is what the model api returns)
+            n = len(whale_classes)
+            st.markdown(f"**Top {n} Predictions for observation {str(o)}**")
+            for i in range(n):
+                viewer.display_whale(whale_classes, i)
+        o += 1
+        col = (col + 1) % row_size
+# func to do all in one
+def cetacean_classify_show_and_review(cetacean_classifier):
     """Cetacean classifier using the saving-willy model from Saving Willy Hugging Face space.
     For each image in the session state, classify the image and display the top 3 predictions.
     Args:
         cetacean_classifier ([type]):  saving-willy model from Saving Willy Hugging Face space
     """
+    raise DeprecationWarning("This function is deprecated. Use individual steps instead")
     images = st.session_state.images
     observations = st.session_state.observations
     hashes = st.session_state.image_hashes
             observation = observations[hash].to_dict()
             # run classifier model on `image`, and persistently store the output
             out = cetacean_classifier(image) # get top 3 matches
+            st.session_state.whale_prediction1[hash] = out['predictions'][0]
+            st.session_state.classify_whale_done[hash] = True
+            msg = f"[D]2 classify_whale_done for {hash}: {st.session_state.classify_whale_done[hash]}, whale_prediction1: {st.session_state.whale_prediction1[hash]}"
             g_logger.info(msg)
             # dropdown for selecting/overriding the species prediction
+            if not st.session_state.classify_whale_done[hash]:
                 selected_class = st.sidebar.selectbox("Species", viewer.WHALE_CLASSES,
                                                                 index=None, placeholder="Species not yet identified...",
                                                                 disabled=True)
             else:
+                pred1 = st.session_state.whale_prediction1[hash]
                 # get index of pred1 from WHALE_CLASSES, none if not present
                 print(f"[D] pred1: {pred1}")
                 ix = viewer.WHALE_CLASSES.index(pred1) if pred1 in viewer.WHALE_CLASSES else None
                 selected_class = st.selectbox(f"Species for observation {str(o)}", viewer.WHALE_CLASSES, index=ix)
             observation['predicted_class'] = selected_class
+            if selected_class != st.session_state.whale_prediction1[hash]:
                 observation['class_overriden'] = selected_class
             st.session_state.public_observation = observation
             for i in range(len(whale_classes)):
                 viewer.display_whale(whale_classes, i)
         o += 1
+        col = (col + 1) % row_size

src/classifier_image.py DELETED Viewed

@@ -1,70 +0,0 @@
-import streamlit as st
-import logging
-import os
-# get a global var for logger accessor in this module
-LOG_LEVEL = logging.DEBUG
-g_logger = logging.getLogger(__name__)
-g_logger.setLevel(LOG_LEVEL)
-from grid_maker import gridder
-import hf_push_observations as sw_push_obs
-import utils.metadata_handler as meta_handler
-import whale_viewer as sw_wv
-def cetacean_classify(cetacean_classifier, tab_inference):
-    files = st.session_state.files
-    images = st.session_state.images
-    observations = st.session_state.observations
-    batch_size, row_size, page = gridder(files)
-    grid = st.columns(row_size)
-    col = 0
-    for file in files:
-        image = images[file.name]
-        with grid[col]:
-            st.image(image, use_column_width=True)
-            observation = observations[file.name].to_dict()
-            # run classifier model on `image`, and persistently store the output
-            out = cetacean_classifier(image) # get top 3 matches
-            st.session_state.whale_prediction1 = out['predictions'][0]
-            st.session_state.classify_whale_done = True
-            msg = f"[D]2 classify_whale_done: {st.session_state.classify_whale_done}, whale_prediction1: {st.session_state.whale_prediction1}"
-            g_logger.info(msg)
-            # dropdown for selecting/overriding the species prediction
-            if not st.session_state.classify_whale_done:
-                selected_class = st.sidebar.selectbox("Species", sw_wv.WHALE_CLASSES,
-                                                                index=None, placeholder="Species not yet identified...",
-                                                                disabled=True)
-            else:
-                pred1 = st.session_state.whale_prediction1
-                # get index of pred1 from WHALE_CLASSES, none if not present
-                print(f"[D] pred1: {pred1}")
-                ix = sw_wv.WHALE_CLASSES.index(pred1) if pred1 in sw_wv.WHALE_CLASSES else None
-                selected_class = tab_inference.selectbox("Species", sw_wv.WHALE_CLASSES, index=ix)
-            observation['predicted_class'] = selected_class
-            if selected_class != st.session_state.whale_prediction1:
-                observation['class_overriden'] = selected_class
-            st.session_state.public_observation = observation
-            st.button(f"Upload observation for {file.name} to THE INTERNET!", on_click=sw_push_obs.push_observations)
-            # TODO: the metadata only fills properly if `validate` was clicked.
-            st.markdown(meta_handler.metadata2md())
-            msg = f"[D] full observation after inference: {observation}"
-            g_logger.debug(msg)
-            print(msg)
-            # TODO: add a link to more info on the model, next to the button.
-            whale_classes = out['predictions'][:]
-            # render images for the top 3 (that is what the model api returns)
-            #with tab_inference:
-            st.title(f"Species detected for {file.name}")
-            for i in range(len(whale_classes)):
-                sw_wv.display_whale(whale_classes, i)
-        col = (col + 1) % row_size

src/hf_push_observations.py CHANGED Viewed

@@ -1,15 +1,82 @@
-from streamlit.delta_generator import DeltaGenerator
-import streamlit as st
-from huggingface_hub import HfApi
 import json
 import tempfile
 import logging
 # get a global var for logger accessor in this module
 LOG_LEVEL = logging.DEBUG
 g_logger = logging.getLogger(__name__)
 g_logger.setLevel(LOG_LEVEL)
 def push_observations(tab_log:DeltaGenerator=None):
     """
     Push the observations to the Hugging Face dataset
@@ -20,17 +87,16 @@ def push_observations(tab_log:DeltaGenerator=None):
             push any observation since generating the logger)
     """
     # we get the observation from session state: 1 is the dict 2 is the image.
     # first, lets do an info display (popup)
     metadata_str = json.dumps(st.session_state.public_observation)
     st.toast(f"Uploading observations: {metadata_str}", icon="🦭")
-    tab_log = st.session_state.tab_log
-    if tab_log is not None:
-        tab_log.info(f"Uploading observations: {metadata_str}")
     # get huggingface api
-    import os
     token = os.environ.get("HF_TOKEN", None)
     api = HfApi(token=token)
@@ -53,4 +119,4 @@ def push_observations(tab_log:DeltaGenerator=None):
     # msg = f"observation attempted tx to repo happy walrus: {rv}"
     g_logger.info(msg)
     st.info(msg)

+import os
 import json
 import tempfile
 import logging
+from streamlit.delta_generator import DeltaGenerator
+import streamlit as st
+from huggingface_hub import HfApi, CommitInfo
 # get a global var for logger accessor in this module
 LOG_LEVEL = logging.DEBUG
 g_logger = logging.getLogger(__name__)
 g_logger.setLevel(LOG_LEVEL)
+def push_observation(image_hash:str, api:HfApi, enable_push:False) -> CommitInfo:
+    '''
+    push one observation to the Hugging Face dataset
+    '''
+    # get the observation
+    observation = st.session_state.public_observations.get(image_hash)
+    if observation is None:
+        msg = f"Could not find observation with hash {image_hash}"
+        g_logger.error(msg)
+        st.error(msg)
+        return None
+    # convert to json
+    metadata_str = json.dumps(observation) # doesn't work yet, TODO
+    st.toast(f"Uploading observation: {metadata_str}", icon="🦭")
+    g_logger.info(f"Uploading observation: {metadata_str}")
+    # write to temp file so we can send it (why is this not using context mgr?)
+    f = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
+    f.write(metadata_str)
+    f.close()
+    #st.info(f"temp file: {f.name} with metadata written...")
+    path_in_repo = f"metadata/{observation['author_email']}/{observation['image_md5']}.json"
+    msg = f"fname: {f.name} | path: {path_in_repo}"
+    print(msg)
+    st.warning(msg)
+    if enable_push:
+        rv = api.upload_file(
+            path_or_fileobj=f.name,
+            path_in_repo=path_in_repo,
+            repo_id="Saving-Willy/temp_dataset",
+            repo_type="dataset",
+        )
+        print(rv)
+        msg = f"observation attempted tx to repo happy walrus: {rv}"
+        g_logger.info(msg)
+        st.info(msg)
+    else:
+        rv = None # temp don't send anything
+    return rv
+def push_all_observations(enable_push:bool=False):
+    '''
+    open an API connection to Hugging Face, and push all observation one by one
+    '''
+    # get huggingface api
+    token = os.environ.get("HF_TOKEN", None)
+    api = HfApi(token=token)
+    # iterate over the list of observations
+    for hash in st.session_state.public_observations.keys():
+        rv = push_observation(hash, api, enable_push=enable_push)
 def push_observations(tab_log:DeltaGenerator=None):
     """
     Push the observations to the Hugging Face dataset
             push any observation since generating the logger)
     """
+    raise DeprecationWarning("This function is deprecated. Use push_all_observations instead.")
     # we get the observation from session state: 1 is the dict 2 is the image.
     # first, lets do an info display (popup)
     metadata_str = json.dumps(st.session_state.public_observation)
     st.toast(f"Uploading observations: {metadata_str}", icon="🦭")
+    g_logger.info(f"Uploading observations: {metadata_str}")
     # get huggingface api
     token = os.environ.get("HF_TOKEN", None)
     api = HfApi(token=token)
     # msg = f"observation attempted tx to repo happy walrus: {rv}"
     g_logger.info(msg)
     st.info(msg)

src/input/input_handling.py CHANGED Viewed

@@ -1,14 +1,17 @@
 import datetime
 import logging
 import streamlit as st
 from streamlit.delta_generator import DeltaGenerator
 import cv2
 import numpy as np
 from input.input_observation import InputObservation
-from input.input_validator import get_image_datetime, is_valid_email, is_valid_number
 m_logger = logging.getLogger(__name__)
 m_logger.setLevel(logging.INFO)
@@ -23,99 +26,394 @@ allowed_image_types = ['jpg', 'jpeg', 'png', 'webp']
 # an arbitrary set of defaults so testing is less painful...
 # ideally we add in some randomization to the defaults
 spoof_metadata = {
-    "latitude": 23.5,
     "longitude": 44,
     "author_email": "[email protected]",
     "date": None,
     "time": None,
 }
-def setup_input(
-    viewcontainer: DeltaGenerator=None,
-    _allowed_image_types: list=None, ) -> InputObservation:
     """
-    Sets up the input interface for uploading an image and entering metadata.
-    It provides input fields for an image upload, lat/lon, author email, and date-time.
-    In the ideal case, the image metadata will be used to populate location and datetime.
-    Parameters:
-        viewcontainer (DeltaGenerator, optional): The Streamlit container to use for the input interface. Defaults to st.sidebar.
-        _allowed_image_types (list, optional): List of allowed image file types for upload. Defaults to allowed_image_types.
     Returns:
-        InputObservation: An object containing the uploaded image and entered metadata.
     """
-    if viewcontainer is None:
-        viewcontainer = st.sidebar
-    if _allowed_image_types is None:
-        _allowed_image_types = allowed_image_types
-    viewcontainer.title("Input image and data")
-    # 1. Input the author email
-    author_email = viewcontainer.text_input("Author Email", spoof_metadata.get('author_email', ""))
-    if author_email and not is_valid_email(author_email):
-        viewcontainer.error("Please enter a valid email address.")
-    # 2. Image Selector
-    uploaded_files = viewcontainer.file_uploader("Upload an image", type=allowed_image_types, accept_multiple_files=True)
     observations = {}
-    images = {}
-    image_hashes =[]
-    if uploaded_files is not None:
-        for file in uploaded_files:
-            viewcontainer.title(f"Metadata for {file.name}")
-            # Display the uploaded image
-            # load image using cv2 format, so it is compatible with the ML models
-            file_bytes = np.asarray(bytearray(file.read()), dtype=np.uint8)
-            filename = file.name
-            image = cv2.imdecode(file_bytes, 1)
-            # Extract and display image date-time
-            image_datetime = None  # For storing date-time from image
-            image_datetime = get_image_datetime(file)
-            m_logger.debug(f"image date extracted as {image_datetime} (from {uploaded_files})")
-            # 3. Latitude Entry Box
-            latitude = viewcontainer.text_input("Latitude for "+filename, spoof_metadata.get('latitude', ""))
-            if latitude and not is_valid_number(latitude):
-                viewcontainer.error("Please enter a valid latitude (numerical only).")
-                m_logger.error(f"Invalid latitude entered: {latitude}.")
-            # 4. Longitude Entry Box
-            longitude = viewcontainer.text_input("Longitude for "+filename, spoof_metadata.get('longitude', ""))
-            if longitude and not is_valid_number(longitude):
-                viewcontainer.error("Please enter a valid longitude (numerical only).")
-                m_logger.error(f"Invalid latitude entered: {latitude}.")
-            # 5. Date/time
-            ## first from image metadata
-            if image_datetime is not None:
-                time_value = datetime.datetime.strptime(image_datetime, '%Y:%m:%d %H:%M:%S').time()
-                date_value = datetime.datetime.strptime(image_datetime, '%Y:%m:%d %H:%M:%S').date()
-            else:
-                time_value = datetime.datetime.now().time()  # Default to current time
-                date_value = datetime.datetime.now().date()
-            ## if not, give user the option to enter manually
-            date_option = st.sidebar.date_input("Date for "+filename, value=date_value)
-            time_option = st.sidebar.time_input("Time for "+filename, time_value)
-            observation = InputObservation(image=file, latitude=latitude, longitude=longitude,
-                                        author_email=author_email, date=image_datetime, time=None,
-                                        date_option=date_option, time_option=time_option)
-            image_hash = observation.to_dict()["image_md5"]
-            observations[image_hash] = observation
-            images[image_hash] = image
-            image_hashes.append(image_hash)
-    st.session_state.images = images
-    st.session_state.files = uploaded_files
-    st.session_state.observations = observations
-    st.session_state.image_hashes = image_hashes

+from typing import List, Tuple
 import datetime
 import logging
+import hashlib
 import streamlit as st
 from streamlit.delta_generator import DeltaGenerator
+from streamlit.runtime.uploaded_file_manager import UploadedFile
 import cv2
 import numpy as np
 from input.input_observation import InputObservation
+from input.input_validator import get_image_datetime, is_valid_email, is_valid_number, get_image_latlon
 m_logger = logging.getLogger(__name__)
 m_logger.setLevel(logging.INFO)
 # an arbitrary set of defaults so testing is less painful...
 # ideally we add in some randomization to the defaults
 spoof_metadata = {
+    "latitude": 0.5,
     "longitude": 44,
     "author_email": "[email protected]",
     "date": None,
     "time": None,
 }
+def check_inputs_are_set(empty_ok:bool=False, debug:bool=False) -> bool:
     """
+    Checks if all expected inputs have been entered
+    Implementation: via the Streamlit session state.
+    Args:
+        empty_ok (bool): If True, returns True if no inputs are set. Default is False.
+        debug (bool): If True, prints and logs the status of each expected input key. Default is False.
+    Returns:
+        bool: True if all expected input keys are set, False otherwise.
+    """
+    image_hashes = st.session_state.image_hashes
+    if len(image_hashes) == 0:
+        return empty_ok
+    exp_input_key_stubs = ["input_latitude", "input_longitude", "input_date", "input_time"]
+    #exp_input_key_stubs = ["input_latitude", "input_longitude", "input_author_email", "input_date", "input_time",
+    vals = []
+    # the author_email is global/one-off - no hash extension.
+    if "input_author_email" in st.session_state:
+        val = st.session_state["input_author_email"]
+        vals.append(val)
+        if debug:
+            msg = f"{'input_author_email':15}, {(val is not None):8}, {val}"
+            m_logger.debug(msg)
+            print(msg)
+    for image_hash in image_hashes:
+        for stub in exp_input_key_stubs:
+            key = f"{stub}_{image_hash}"
+            val = None
+            if key in st.session_state:
+                val = st.session_state[key]
+            # handle cases where it is defined but empty
+            # if val is a string and empty, set to None
+            if isinstance(val, str) and not val:
+                val = None
+            # if val is a list and empty, set to None (not sure what UI elements would return a list?)
+            if isinstance(val, list) and not val:
+                val = None
+            # number 0 is ok - possibly. could be on the equator, e.g.
+            vals.append(val)
+            if debug:
+                msg = f"{key:15}, {(val is not None):8}, {val}"
+                m_logger.debug(msg)
+                print(msg)
+    return all([v is not None for v in vals])
+def buffer_uploaded_files():
+    """
+    Cache what the file_uploader currently holds in the session state.
+    Reads the uploaded files from `st.session_state.file_uploader_data`, and
+    for each one decodes the image and computes its hash.
+    Writes the following keys to `st.session_state`:
+    - `images`: dict mapping image hashes to image data (numpy arrays)
+    - `files`: the uploaded files exactly as provided by the uploader
+    - `image_hashes`: list of image hashes, in upload order
+    - `image_filenames`: list of filenames, in upload order
+    """
+    # Only data that needs no further user input is buffered here (image,
+    # hash, filename). The per-file metadata widgets are built by a separate
+    # function, because widgets created inside a callback disappear.
+    # The UploadedFile objects carry a file_id; it is not persistent between
+    # sessions, so the content hash is used as the key instead.
+    pending_files = st.session_state.file_uploader_data
+    names = []
+    hashes = []
+    decoded = {}
+    for position, upload in enumerate(pending_files):
+        name:str = upload.name
+        print(f"[D] processing {position}th file {name}. {upload.file_id} {upload.type} {upload.size}")
+        # decoding and hashing both need the file contents, so they share one read
+        pixels, digest = load_file_and_hash(upload)
+        names.append(name)
+        hashes.append(digest)
+        decoded[digest] = pixels
+    st.session_state.images = decoded
+    st.session_state.files = pending_files
+    st.session_state.image_hashes = hashes
+    st.session_state.image_filenames = names
+def load_file_and_hash(file:UploadedFile) -> Tuple[np.ndarray, str]:
+    """
+    Loads an image file and computes its MD5 hash.
+    Since both operations require reading the full file contents, they are done
+    together for efficiency.
+    Args:
+        file (UploadedFile): The uploaded file to be processed.
     Returns:
+        Tuple[np.ndarray, str]: A tuple containing the decoded image as a NumPy array and the MD5 hash of the file's contents.
+    """
+    # read once, then derive both the hash and the decoded image from the same bytes
+    _bytes = file.read()
+    # MD5 is used here as a content identifier (dict key / dedup), not for security
+    image_hash = hashlib.md5(_bytes).hexdigest()
+    # np.frombuffer views the bytes directly, avoiding the bytearray + asarray copies
+    # NOTE(review): cv2.imdecode presumably yields None for undecodable data - callers should confirm
+    image: np.ndarray = cv2.imdecode(np.frombuffer(_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
+    return (image, image_hash)
+def metadata_inputs_one_file(file:UploadedFile, image_hash:str, dbg_ix:int=0) -> InputObservation:
     """
+    Creates and parses metadata inputs for a single file
+    Adds latitude, longitude, date and time widgets for the file to an
+    expander (inside `container_metadata_inputs`, or the sidebar when that is
+    None). Defaults come from the image metadata when available, otherwise
+    from `spoof_metadata` (lat/lon) or the current clock (date/time).
+    Args:
+        file (UploadedFile): The uploaded file for which metadata is being handled.
+        image_hash (str): The hash of the image.
+        dbg_ix (int, optional): Debug index to differentiate data in each input group. Defaults to 0.
+    Returns:
+        InputObservation: An object containing the metadata and other information for the input file.
+    """
+    # dbg_ix is a hack to have different data in each input group, checking persistence
+    if st.session_state.container_metadata_inputs is not None:
+        _viewcontainer = st.session_state.container_metadata_inputs
+    else:
+        _viewcontainer = st.sidebar
+        m_logger.warning("[W] `container_metadata_inputs` is None, using sidebar")
+    author_email = st.session_state["input_author_email"]
+    filename = file.name
+    image_datetime_raw = get_image_datetime(file)
+    latitude0, longitude0 = get_image_latlon(file)
+    msg = f"[D] {filename}: lat, lon from image metadata: {latitude0}, {longitude0}"
+    m_logger.debug(msg)
+    if latitude0 is None: # get some default values if not found in exifdata
+        latitude0:float = spoof_metadata.get('latitude', 0) + dbg_ix
+    if longitude0 is None:
+        longitude0:float = spoof_metadata.get('longitude', 0) - dbg_ix
+    image = st.session_state.images.get(image_hash, None)
+    # add the UI elements
+    viewcontainer = _viewcontainer.expander(f"Metadata for {file.name}", expanded=True)
+    # TODO: use session state so any changes are persisted within session -- currently I think
+    # we are going to take the defaults over and over again -- if the user adjusts coords, or date, it will get lost
+    # - it is a bit complicated, if no values change, they persist (the widget definition: params, name, key, etc)
+    #   even if the code is re-run. but if the value changes, it is lost.
+    # 3. Latitude Entry Box
+    latitude = viewcontainer.text_input(
+        "Latitude for " + filename,
+        latitude0,
+        key=f"input_latitude_{image_hash}")
+    if latitude and not is_valid_number(latitude):
+        viewcontainer.error("Please enter a valid latitude (numerical only).")
+        m_logger.error(f"Invalid latitude entered: {latitude}.")
+    # 4. Longitude Entry Box
+    longitude = viewcontainer.text_input(
+        "Longitude for " + filename,
+        longitude0,
+        key=f"input_longitude_{image_hash}")
+    if longitude and not is_valid_number(longitude):
+        viewcontainer.error("Please enter a valid longitude (numerical only).")
+        # report the longitude here (previously logged the latitude by mistake)
+        m_logger.error(f"Invalid longitude entered: {longitude}.")
+    # 5. Date/time
+    ## first from image metadata; parse once and derive both date and time from it
+    image_datetime = None
+    if image_datetime_raw is not None:
+        try:
+            image_datetime = datetime.datetime.strptime(image_datetime_raw, '%Y:%m:%d %H:%M:%S')
+        except ValueError:
+            # a malformed timestamp should not abort the whole input form
+            m_logger.warning(f"[W] {filename}: could not parse image datetime '{image_datetime_raw}', using current time")
+    if image_datetime is None:
+        image_datetime = datetime.datetime.now()  # Default to current date and time
+    date_value = image_datetime.date()
+    time_value = image_datetime.time()
+    ## either way, give user the option to enter manually (or correct, e.g. if camera has no rtc clock)
+    date = viewcontainer.date_input("Date for "+filename, value=date_value, key=f"input_date_{image_hash}")
+    time = viewcontainer.time_input("Time for "+filename, time_value, key=f"input_time_{image_hash}")
+    observation = InputObservation(image=image, latitude=latitude, longitude=longitude,
+                                author_email=author_email, image_datetime_raw=image_datetime_raw,
+                                date=date, time=time,
+                                uploaded_file=file, image_md5=image_hash
+                                )
+    return observation
+def _setup_dynamic_inputs() -> None:
+    """
+    Build the per-file metadata widgets and refresh the stored observations.
+    Works from the files and hashes already buffered in the session state.
+    For each file an InputObservation is created from the current widget
+    values; an observation already stored under the same image hash is kept
+    when it compares equal to the fresh one, otherwise it is replaced (and
+    the differences are printed). The resulting dict, keyed by image hash,
+    overwrites `st.session_state.observations`.
+    """
+    buffered_files = st.session_state.files
+    buffered_hashes = st.session_state.image_hashes
     observations = {}
+    for idx, upload in enumerate(buffered_files):
+        digest = buffered_hashes[idx]
+        fresh = metadata_inputs_one_file(upload, digest, idx)
+        previous = st.session_state.observations.get(digest, None)
+        if previous is None:
+            m_logger.debug(f"[D] {idx}th observation is new (image_hash not seen before). Storing")
+            observations[digest] = fresh
+            continue
+        if previous == fresh:
+            # keep the stored object so anything attached to it later survives the rerun
+            m_logger.debug(f"[D] {idx}th observation is the same as before. retaining")
+            observations[digest] = previous
+            continue
+        m_logger.debug(f"[D] {idx}th observation is different from before. updating")
+        observations[digest] = fresh
+        fresh.show_diff(previous)
+    st.session_state.observations = observations
+def _setup_oneoff_inputs() -> None:
+    '''
+    Render the inputs that exist once for the whole batch of files
+    - author email (stored under the key `input_author_email`)
+    - file uploader accepting multiple files (key `file_uploader_data`),
+      whose on_change callback buffers the uploaded files
+    Both are placed in the container held in
+    `st.session_state.container_file_uploader`.
+    '''
+    with st.session_state.container_file_uploader:
+        # 1. Input the author email
+        default_email = spoof_metadata.get('author_email', "")
+        email_value = st.text_input("Author Email", default_email, key="input_author_email")
+        email_is_bad = bool(email_value) and not is_valid_email(email_value)
+        if email_is_bad:
+            st.error("Please enter a valid email address.")
+        # 2. Image Selector
+        accepted_types = ["png", 'jpg', 'jpeg', 'webp']
+        st.file_uploader(
+            "Upload one or more images",
+            type=accepted_types,
+            accept_multiple_files=True,
+            key="file_uploader_data",
+            on_change=buffer_uploaded_files)
+def setup_input() -> None:
+    '''
+    Entry point for the user input handling (files and metadata)
+    First the one-off inputs are rendered: the author email and the image
+    uploader (whose callback buffers the files). Then, for every buffered
+    image, the lat/lon and date/time inputs are rendered.
+    All results live in the Streamlit session state; nothing is returned.
+    '''
+    # one-off widgets first: the uploader callback fills the buffers ...
+    _setup_oneoff_inputs()
+    # ... which the per-file widgets are then generated from
+    _setup_dynamic_inputs()
+def init_input_container_states() -> None:
+    '''
+    Make sure the layout-container slots used by the input handling exist
+    Each missing slot is created in the session state with the value None;
+    slots that are already present are left untouched.
+    '''
+    for slot in ("container_file_uploader", "container_metadata_inputs"):
+        if slot not in st.session_state:
+            st.session_state[slot] = None
+def init_input_data_session_states() -> None:
+    '''
+    Create the session state variables used by the input handling
+    Every key that is not present yet gets a fresh empty container;
+    existing values are never overwritten.
+    '''
+    # TODO: ideally just use image_hashes, but need a unique key for the ui elements
+    # to track the user input phase; and these are created before the hash is generated.
+    # NOTE(review): `files` starts out as a dict, while buffer_uploaded_files later
+    # stores the uploader's file list under the same key - confirm which is intended.
+    empty_defaults = (
+        ("image_hashes", list),
+        ("image_filenames", list),
+        ("observations", dict),
+        ("images", dict),
+        ("files", dict),
+        ("public_observations", dict),
+    )
+    for state_key, make_empty in empty_defaults:
+        if state_key not in st.session_state:
+            st.session_state[state_key] = make_empty()
+def add_input_UI_elements() -> None:
+    '''
+    Lay out the containers that will later hold the user input elements
+    The containers are created up front and stored in the session state so
+    that the on-page order stays fixed even though the widgets inside them
+    are created in a different order.
+    '''
+    st.divider()
+    st.title("Input image and data")
+    # container (with a coloured border) for the file uploader/other one-off inputs
+    uploader_css = '<style>.st-key-container_file_uploader_id { border: 1px solid skyblue; border-radius: 5px; }</style>'
+    st.markdown(uploader_css, unsafe_allow_html=True)
+    st.session_state.container_file_uploader = st.container(border=True, key="container_file_uploader_id")
+    # container (with a coloured border) for the dynamic metadata inputs
+    metadata_css = '<style>.st-key-container_metadata_inputs_id { border: 1px solid lightgreen; border-radius: 5px; }</style>'
+    st.markdown(metadata_css, unsafe_allow_html=True)
+    metadata_box = st.container(border=True, key="container_metadata_inputs_id")
+    metadata_box.write("Metadata Inputs... wait for file upload ")
+    st.session_state.container_metadata_inputs = metadata_box
+def dbg_show_observation_hashes() -> None:
+    """
+    Debug helper: list every stored observation with its hash.
+    - shows the observation count, then one markdown bullet per observation
+      giving its hash, instance id and number of predictions.
+    - renders into the current container; not intended for the final app.
+    """
+    # a debug: we seem to be losing the whale classes?
+    stored = st.session_state.observations
+    st.write(f"[D] num observations: {len(stored)}")
+    bullet_lines = [
+        f"- [D] observation {digest} ({obs._inst_id}) has {len(obs.top_predictions)} predictions\n"
+        for digest, obs in stored.items()
+    ]
+    st.markdown("".join(bullet_lines))

src/input/input_observation.py CHANGED Viewed

@@ -1,13 +1,18 @@
 import hashlib
 from input.input_validator import generate_random_md5
 # autogenerated class to hold the input data
 class InputObservation:
     """
     A class to hold an input observation and associated metadata
     Attributes:
-        image (Any):
             The image associated with the observation.
         latitude (float):
             The latitude where the observation was made.
@@ -15,16 +20,16 @@ class InputObservation:
             The longitude where the observation was made.
         author_email (str):
             The email of the author of the observation.
-        date (str):
-            The date when the observation was made.
-        time (str):
-            The time when the observation was made.
-        date_option (str):
-            Additional date option for the observation.
-        time_option (str):
-            Additional time option for the observation.
-        uploaded_filename (Any):
-            The uploaded filename associated with the observation.
     Methods:
         __str__():
@@ -35,8 +40,8 @@ class InputObservation:
             Checks if two observations are equal.
         __ne__(other):
             Checks if two observations are not equal.
-        __hash__():
-            Returns the hash of the observation.
         to_dict():
             Converts the observation to a dictionary.
         from_dict(data):
@@ -44,66 +49,208 @@ class InputObservation:
         from_input(input):
             Creates an observation from another input observation.
     """
-    def __init__(self, image=None, latitude=None, longitude=None,
-                 author_email=None, date=None, time=None, date_option=None, time_option=None,
-                 uploaded_filename=None):
         self.image = image
         self.latitude = latitude
         self.longitude = longitude
         self.author_email = author_email
         self.date = date
         self.time = time
-        self.date_option = date_option
-        self.time_option = time_option
-        self.uploaded_filename = uploaded_filename
     def __str__(self):
-        return f"Observation: {self.image}, {self.latitude}, {self.longitude}, {self.author_email}, {self.date}, {self.time}, {self.date_option}, {self.time_option}, {self.uploaded_filename}"
     def __repr__(self):
-        return f"Observation: {self.image}, {self.latitude}, {self.longitude}, {self.author_email}, {self.date}, {self.time}, {self.date_option}, {self.time_option}, {self.uploaded_filename}"
     def __eq__(self, other):
-        return (self.image == other.image and self.latitude == other.latitude and self.longitude == other.longitude and
-                self.author_email == other.author_email and self.date == other.date and self.time == other.time and
-                self.date_option == other.date_option and self.time_option == other.time_option and self.uploaded_filename == other.uploaded_filename)
     def __ne__(self, other):
         return not self.__eq__(other)
-    def __hash__(self):
-        return hash((self.image, self.latitude, self.longitude, self.author_email, self.date, self.time, self.date_option, self.time_option, self.uploaded_filename))
     def to_dict(self):
         return {
             #"image": self.image,
-            "image_filename": self.uploaded_filename.name if self.uploaded_filename else None,
-            "image_md5": hashlib.md5(self.uploaded_filename.read()).hexdigest() if self.uploaded_filename else generate_random_md5(),
             "latitude": self.latitude,
             "longitude": self.longitude,
             "author_email": self.author_email,
-            "date": self.date,
-            "time": self.time,
-            "date_option": str(self.date_option),
-            "time_option": str(self.time_option),
-            "uploaded_filename": self.uploaded_filename
         }
     @classmethod
     def from_dict(cls, data):
-        return cls(data["image"], data["latitude"], data["longitude"], data["author_email"], data["date"], data["time"], data["date_option"], data["time_option"], data["uploaded_filename"])
     @classmethod
     def from_input(cls, input):
-        return cls(input.image, input.latitude, input.longitude, input.author_email, input.date, input.time, input.date_option, input.time_option, input.uploaded_filename)
-    @staticmethod
-    def from_input(input):
-        return InputObservation(input.image, input.latitude, input.longitude, input.author_email, input.date, input.time, input.date_option, input.time_option, input.uploaded_filename)
-    @staticmethod
-    def from_dict(data):
-        return InputObservation(data["image"], data["latitude"], data["longitude"], data["author_email"], data["date"], data["time"], data["date_option"], data["time_option"], data["uploaded_filename"])

 import hashlib
 from input.input_validator import generate_random_md5
+from numpy import ndarray
+from streamlit.runtime.uploaded_file_manager import UploadedFile
+import datetime
 # autogenerated class to hold the input data
 class InputObservation:
     """
     A class to hold an input observation and associated metadata
     Attributes:
+        image (ndarray):
             The image associated with the observation.
         latitude (float):
             The latitude where the observation was made.
+        longitude (float):
             The longitude where the observation was made.
         author_email (str):
             The email of the author of the observation.
+        image_datetime_raw (str):
+            The datetime extracted from the observation file
+        date (datetime.date):
+            Date of the observation
+        time (datetime.time):
+            Time of the observation
+        uploaded_file (UploadedFile):
+            The uploaded file associated with the observation.
+        image_md5 (str):
+            The MD5 hash of the image associated with the observation.
+        top_predictions (list):
+            Read-only; the predictions stored via set_top_predictions().
+        selected_class:
+            Read-only; the class stored via set_selected_class().
+        class_overriden (bool):
+            Read-only; True once a class other than the top prediction was selected.
     Methods:
         __str__():
+            Returns a short, unlabelled summary of the observation.
+        __repr__():
+            Returns a labelled summary of the observation.
+        __eq__(other):
             Checks if two observations are equal.
         __ne__(other):
             Checks if two observations are not equal.
+        show_diff(other):
+            Shows the differences between two observations.
         to_dict():
             Converts the observation to a dictionary.
         from_dict(data):
+            Creates an observation from a dictionary.
         from_input(input):
             Creates an observation from another input observation.
     """
+    # running count of constructed instances; used to give each one a debug id
+    _inst_count = 0
+    def __init__(
+        self, image:ndarray=None, latitude:float=None, longitude:float=None,
+        author_email:str=None, image_datetime_raw:str=None,
+        date:datetime.date=None,
+        time:datetime.time=None,
+        uploaded_file:UploadedFile=None, image_md5:str=None):
+        """
+        Store the observation data and assign a per-instance debug id.
+        Raises:
+            ValueError: if `image_md5` is None (the hash is required).
+        """
         self.image = image
         self.latitude = latitude
         self.longitude = longitude
         self.author_email = author_email
+        self.image_datetime_raw = image_datetime_raw
         self.date = date
         self.time = time
+        self.uploaded_file = uploaded_file
+        self.image_md5 = image_md5
+        # attributes that get set after predictions/processing
+        self._top_predictions = []
+        self._selected_class = None
+        self._class_overriden = False
+        InputObservation._inst_count += 1
+        self._inst_id = InputObservation._inst_count
+        #dbg - temporarily give up if hash is not provided
+        if self.image_md5 is None:
+            raise ValueError(f"Image MD5 hash is required - {self._inst_id:3}.")
+    def set_top_predictions(self, top_predictions:list):
+        """Store the predictions and select the first one, if there is any."""
+        self._top_predictions = top_predictions
+        if len(top_predictions) > 0:
+            self.set_selected_class(top_predictions[0])
+    def set_selected_class(self, selected_class:str):
+        """Store the selected class; flag an override when it differs from the top prediction."""
+        self._selected_class = selected_class
+        # without predictions there is no top prediction to compare against
+        # (indexing the empty list used to raise IndexError)
+        if len(self._top_predictions) > 0 and selected_class != self._top_predictions[0]:
+            self.set_class_overriden(True)
+    def set_class_overriden(self, class_overriden:bool):
+        """Set the override flag."""
+        self._class_overriden = class_overriden
+    # add getters for the top_predictions, selected_class and class_overriden
+    @property
+    def top_predictions(self):
+        return self._top_predictions
+    @property
+    def selected_class(self):
+        return self._selected_class
+    @property
+    def class_overriden(self):
+        return self._class_overriden
+    def assign_image_md5(self):
+        """Deprecated: always raises, the hash is a required constructor argument."""
+        # the former body after this raise was unreachable and has been removed
+        raise DeprecationWarning("This method is deprecated. hash is a required constructor argument.")
+    @staticmethod
+    def _images_equal(image_a, image_b) -> bool:
+        """Compare two images (ndarray or None) without ambiguous array truth values."""
+        if image_a is None or image_b is None:
+            # `ndarray == None` is an elementwise comparison, so test identity instead
+            return image_a is None and image_b is None
+        if image_a.shape != image_b.shape:
+            return False
+        return bool((image_a == image_b).all())
     def __str__(self):
+        _im_str = "None" if self.image is None else f"image dims: {self.image.shape}"
+        return (
+            f"Observation: {_im_str}, {self.latitude}, {self.longitude}, "
+            f"{self.author_email}, {self.image_datetime_raw}, {self.date}, "
+            f"{self.time}, {self.uploaded_file}, {self.image_md5}"
+        )
     def __repr__(self):
+        _im_str = "None" if self.image is None else f"image dims: {self.image.shape}"
+        return (
+            f"Observation: "
+            f"Image: {_im_str}, "
+            f"Latitude: {self.latitude}, "
+            f"Longitude: {self.longitude}, "
+            f"Author Email: {self.author_email}, "
+            f"raw timestamp: {self.image_datetime_raw}, "
+            f"Date: {self.date}, "
+            f"Time: {self.time}, "
+            f"Uploaded Filename: {self.uploaded_file}, "
+            f"Image MD5 hash: {self.image_md5}"
+        )
     def __eq__(self, other):
+        """
+        Compare all input attributes except `time` (and the per-instance id).
+        The prediction-related attributes are not part of the comparison.
+        """
+        # TODO: ensure this covers all the attributes (some have been added?)
+        # - except inst_id which is unique
+        equality = (
+            self._images_equal(self.image, other.image) and
+            self.latitude == other.latitude and
+            self.longitude == other.longitude and
+            self.author_email == other.author_email and
+            self.image_datetime_raw == other.image_datetime_raw and
+            self.date == other.date and
+            # temporarily skip time, it is followed by the clock and that is always different
+            #self.time == other.time and
+            self.uploaded_file == other.uploaded_file and
+            self.image_md5 == other.image_md5
+            )
+        return equality
+    def show_diff(self, other):
+        """
+        Print the differences between two observations.
+        Only attributes that differ are listed, preceded by a summary line;
+        unlike __eq__, `time` is included here.
+        """
+        differences = []
+        if self.image is None or other.image is None:
+            if not (self.image is None and other.image is None):
+                differences.append(f"   Image is different. (types mismatch: {type(self.image)} vs {type(other.image)})")
+        elif self.image.shape != other.image.shape:
+            # elementwise comparison is only meaningful for equal shapes
+            differences.append(f"   Image is different. (shape mismatch: {self.image.shape} vs {other.image.shape})")
+        else:
+            cnt = (self.image != other.image).sum()
+            if cnt:
+                differences.append(f"   Image is different: {cnt} different pixels.")
+        if self.latitude != other.latitude:
+            differences.append(f"   Latitude is different. (self: {self.latitude}, other: {other.latitude})")
+        if self.longitude != other.longitude:
+            differences.append(f"   Longitude is different. (self: {self.longitude}, other: {other.longitude})")
+        if self.author_email != other.author_email:
+            differences.append(f"   Author email is different. (self: {self.author_email}, other: {other.author_email})")
+        if self.image_datetime_raw != other.image_datetime_raw:
+            differences.append(f"   Raw timestamp is different. (self: {self.image_datetime_raw}, other: {other.image_datetime_raw})")
+        if self.date != other.date:
+            differences.append(f"   Date is different. (self: {self.date}, other: {other.date})")
+        if self.time != other.time:
+            differences.append(f"   Time is different. (self: {self.time}, other: {other.time})")
+        if self.uploaded_file != other.uploaded_file:
+            differences.append("   Uploaded filename is different.")
+        if self.image_md5 != other.image_md5:
+            differences.append("   Image MD5 hash is different.")
+        if differences:
+            print(f"Observations have {len(differences)} differences:")
+            for diff in differences:
+                print(diff)
+        else:
+            print("Observations are the same.")
     def __ne__(self, other):
         return not self.__eq__(other)
     def to_dict(self):
+        """Return the observation as a dict; the image and uploaded file objects are left out."""
         return {
             #"image": self.image,
+            "image_filename": self.uploaded_file.name if self.uploaded_file else None,
+            "image_md5": self.image_md5,
             "latitude": self.latitude,
             "longitude": self.longitude,
             "author_email": self.author_email,
+            "image_datetime_raw": self.image_datetime_raw,
+            "date": str(self.date),
+            "time": str(self.time),
+            "selected_class": self._selected_class,
+            "top_prediction": self._top_predictions[0] if len(self._top_predictions) else None,
+            "class_overriden": self._class_overriden,
+            #"uploaded_file": self.uploaded_file # can't serialize this in json, not sent to dataset anyway.
         }
     @classmethod
     def from_dict(cls, data):
+        """
+        Create an observation from a dict; missing keys become None.
+        Note that dicts produced by to_dict() carry no "image" or
+        "uploaded_file" entry and hold date/time as strings.
+        Raises:
+            ValueError: if the dict has no "image_md5" value.
+        """
+        return cls(
+            image=data.get("image"),
+            latitude=data.get("latitude"),
+            longitude=data.get("longitude"),
+            author_email=data.get("author_email"),
+            image_datetime_raw=data.get("image_datetime_raw"),
+            date=data.get("date"),
+            time=data.get("time"),
+            uploaded_file=data.get("uploaded_file"),
+            # the constructor parameter is `image_md5` (was passed as `image_hash`)
+            image_md5=data.get("image_md5")
+        )
     @classmethod
     def from_input(cls, input):
+        """Create a new observation carrying the same input attributes as `input`."""
+        return cls(
+            image=input.image,
+            latitude=input.latitude,
+            longitude=input.longitude,
+            author_email=input.author_email,
+            image_datetime_raw=input.image_datetime_raw,
+            date=input.date,
+            time=input.time,
+            uploaded_file=input.uploaded_file,
+            # both the parameter and the attribute are named `image_md5`
+            image_md5=input.image_md5
+        )

src/input/input_validator.py CHANGED Viewed

@@ -1,22 +1,33 @@
 import random
 import string
 import hashlib
 import re
-import streamlit as st
 from fractions import Fraction
 from PIL import Image
 from PIL import ExifTags
 from streamlit.runtime.uploaded_file_manager import UploadedFile
-def generate_random_md5():
     # Generate a random string
-    random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=16))
     # Encode the string and compute its MD5 hash
     md5_hash = hashlib.md5(random_string.encode()).hexdigest()
     return md5_hash
 def is_valid_number(number:str) -> bool:
     """
     Check if the given string is a valid number (int or float, sign ok)
@@ -30,6 +41,7 @@ def is_valid_number(number:str) -> bool:
     pattern = r'^[-+]?[0-9]*\.?[0-9]+$'
     return re.match(pattern, number) is not None
 # Function to validate email address
 def is_valid_email(email:str) -> bool:
     """
@@ -41,11 +53,14 @@ def is_valid_email(email:str) -> bool:
     Returns:
         bool: True if the email address is valid, False otherwise.
     """
-    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
     return re.match(pattern, email) is not None
 # Function to extract date and time from image metadata
-def get_image_datetime(image_file):
     """
     Extracts the original date and time from the EXIF metadata of an uploaded image file.
@@ -69,6 +84,7 @@ def get_image_datetime(image_file):
          # TODO: add to logger
     return None
 def decimal_coords(coords:tuple, ref:str) -> Fraction:
     """
     Converts coordinates from degrees, minutes, and seconds to decimal degrees.
@@ -96,8 +112,9 @@ def decimal_coords(coords:tuple, ref:str) -> Fraction:
     return decimal_degrees
-#def get_image_latlon(image_file: UploadedFile) -> tuple[float, float] | None:
-def get_image_latlon(image_file: UploadedFile) :
     """
     Extracts the latitude and longitude from the EXIF metadata of an uploaded image file.
@@ -123,4 +140,6 @@ def get_image_latlon(image_file: UploadedFile) :
                 return lat, lon
     except Exception as e: # FIXME: what types of exception?
-         st.warning(f"Could not extract latitude and longitude from image metadata. (file: {str(image_file)}")

+from typing import Tuple, Union
 import random
 import string
 import hashlib
 import re
 from fractions import Fraction
 from PIL import Image
 from PIL import ExifTags
+import streamlit as st
 from streamlit.runtime.uploaded_file_manager import UploadedFile
+def generate_random_md5(length:int=16) -> str:
+    """
+    Generate a random MD5 hash.
+    The hash is derived from a random alphanumeric string; it uses the
+    `random` module and is therefore not suitable as a security token.
+    Args:
+        length (int): The length of the random string to generate. Default is 16.
+    Returns:
+        str: The MD5 hash of the generated random string.
+    """
      # Generate a random string
+    # random.choices takes the sample size as `k`; honour the `length` argument
+    random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
      # Encode the string and compute its MD5 hash
      md5_hash = hashlib.md5(random_string.encode()).hexdigest()
      return md5_hash
 def is_valid_number(number:str) -> bool:
     """
     Check if the given string is a valid number (int or float, sign ok)
     pattern = r'^[-+]?[0-9]*\.?[0-9]+$'
     return re.match(pattern, number) is not None
 # Function to validate email address
 def is_valid_email(email:str) -> bool:
     """
     Returns:
         bool: True if the email address is valid, False otherwise.
     """
+    #pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+    # do not allow starting with a +
+    pattern = r'^[a-zA-Z0-9_]+[a-zA-Z0-9._%+-]*@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
     return re.match(pattern, email) is not None
 # Function to extract date and time from image metadata
+def get_image_datetime(image_file:UploadedFile) -> Union[str, None]:
     """
     Extracts the original date and time from the EXIF metadata of an uploaded image file.
          # TODO: add to logger
     return None
 def decimal_coords(coords:tuple, ref:str) -> Fraction:
     """
     Converts coordinates from degrees, minutes, and seconds to decimal degrees.
     return decimal_degrees
+#def get_image_latlon(image_file: UploadedFile) : # if it is still not working
+#def get_image_latlon(image_file: UploadedFile) -> Tuple[float, float] | None: # Python >=3.10
+def get_image_latlon(image_file: UploadedFile) -> Union[Tuple[float, float], None]: # 3.6 <= Python < 3.10
     """
     Extracts the latitude and longitude from the EXIF metadata of an uploaded image file.
                 return lat, lon
     except Exception as e: # FIXME: what types of exception?
+         st.warning(f"Could not extract latitude and longitude from image metadata. (file: {str(image_file)}")
+    return None, None

src/main.py CHANGED Viewed

@@ -9,17 +9,24 @@ from streamlit_folium import st_folium
 from transformers import pipeline
 from transformers import AutoModelForImageClassification
-from maps.obs_map import add_header_text
 from datasets import disable_caching
 disable_caching()
 import whale_gallery as gallery
 import whale_viewer as viewer
-from input.input_handling import setup_input
 from maps.alps_map import present_alps_map
 from maps.obs_map import present_obs_map
-from utils.st_logs import setup_logging, parse_log_buffer
-from classifier.classifier_image import cetacean_classify
 from classifier.classifier_hotdog import hotdog_classify
@@ -34,6 +41,11 @@ data_files = "data/train-00000-of-00001.parquet"
 USE_BASIC_MAP = False
 DEV_SIDEBAR_LIB = True
 # get a global var for logger accessor in this module
 LOG_LEVEL = logging.DEBUG
 g_logger = logging.getLogger(__name__)
@@ -42,33 +54,13 @@ g_logger.setLevel(LOG_LEVEL)
 st.set_page_config(layout="wide")
 # initialise various session state variables
-if "handler" not in st.session_state:
-    st.session_state['handler'] = setup_logging()
-if "image_hashes" not in st.session_state:
-    st.session_state.image_hashes = []
-if "observations" not in st.session_state:
-    st.session_state.observations = {}
-if "images" not in st.session_state:
-    st.session_state.images = {}
-if "files" not in st.session_state:
-    st.session_state.files = {}
-if "public_observation" not in st.session_state:
-    st.session_state.public_observation = {}
-if "classify_whale_done" not in st.session_state:
-    st.session_state.classify_whale_done = False
-if "whale_prediction1" not in st.session_state:
-    st.session_state.whale_prediction1 = None
-if "tab_log" not in st.session_state:
-    st.session_state.tab_log = None
 def main() -> None:
     """
@@ -100,29 +92,22 @@ def main() -> None:
     # Streamlit app
     tab_inference, tab_hotdogs, tab_map, tab_coords, tab_log, tab_gallery = \
         st.tabs(["Cetecean classifier", "Hotdog classifier", "Map", "*:gray[Dev:coordinates]*", "Log", "Beautiful cetaceans"])
-    st.session_state.tab_log = tab_log
     # create a sidebar, and parse all the input (returned as `observations` object)
-    setup_input(viewcontainer=st.sidebar)
-    if 0:## WIP
-        # goal of this code is to allow the user to override the ML prediction, before transmitting an observations
-        predicted_class = st.sidebar.selectbox("Predicted Class", viewer.WHALE_CLASSES)
-        override_prediction = st.sidebar.checkbox("Override Prediction")
-        if override_prediction:
-            overridden_class = st.sidebar.selectbox("Override Class", viewer.WHALE_CLASSES)
-            st.session_state.observations['class_overriden'] = overridden_class
-        else:
-            st.session_state.observations['class_overriden'] = None
     with tab_map:
         # visual structure: a couple of toggles at the top, then the map including a
         # dropdown for tileset selection.
-        add_header_text()
         tab_map_ui_cols = st.columns(2)
         with tab_map_ui_cols[0]:
             show_db_points = st.toggle("Show Points from DB", True)
@@ -180,43 +165,128 @@ def main() -> None:
             gallery.render_whale_gallery(n_cols=4)
-    # Display submitted observation
-    if st.sidebar.button("Validate"):
-        # create a dictionary with the submitted observation
-        tab_log.info(f"{st.session_state.observations}")
-        df = pd.DataFrame(st.session_state.observations, index=[0])
-        with tab_coords:
-            st.table(df)
-    # inside the inference tab, on button press we call the model (on huggingface hub)
-    # which will be run locally.
-    # - the model predicts the top 3 most likely species from the input image
-    # - these species are shown
-    # - the user can override the species prediction using the dropdown
-    # - an observation is uploaded if the user chooses.
-    tab_inference.markdown("""
-                *Run classifer to identify the species of cetean on the uploaded image.
-                Once inference is complete, the top three predictions are shown.
-                You can override the prediction by selecting a species from the dropdown.*""")
-    if tab_inference.button("Identify with cetacean classifier"):
-        #pipe = pipeline("image-classification", model="Saving-Willy/cetacean-classifier", trust_remote_code=True)
-        cetacean_classifier = AutoModelForImageClassification.from_pretrained("Saving-Willy/cetacean-classifier",
-                                                                            revision=classifier_revision,
-                                                                            trust_remote_code=True)
-        if st.session_state.images is None:
-            # TODO: cleaner design to disable the button until data input done?
-            st.info("Please upload an image first.")
-        else:
-            cetacean_classify(cetacean_classifier)
     # inside the hotdog tab, on button press we call a 2nd model (totally unrelated at present, just for demo
     # purposes, an hotdog image classifier) which will be run locally.
@@ -240,6 +310,9 @@ def main() -> None:
             hotdog_classify(pipeline_hot_dog, tab_hotdogs)
 if __name__ == "__main__":
     main()

 from transformers import pipeline
 from transformers import AutoModelForImageClassification
+from maps.obs_map import add_obs_map_header
+from classifier.classifier_image import add_classifier_header
 from datasets import disable_caching
 disable_caching()
 import whale_gallery as gallery
 import whale_viewer as viewer
+from input.input_handling import setup_input, check_inputs_are_set
+from input.input_handling import init_input_container_states, add_input_UI_elements, init_input_data_session_states
+from input.input_handling import dbg_show_observation_hashes
 from maps.alps_map import present_alps_map
 from maps.obs_map import present_obs_map
+from utils.st_logs import parse_log_buffer, init_logging_session_states
+from utils.workflow_ui import refresh_progress_display, init_workflow_viz, init_workflow_session_states
+from hf_push_observations import push_all_observations
+from classifier.classifier_image import cetacean_just_classify, cetacean_show_results_and_review, cetacean_show_results, init_classifier_session_states
 from classifier.classifier_hotdog import hotdog_classify
 USE_BASIC_MAP = False
 DEV_SIDEBAR_LIB = True
+# one toggle for all the extra debug text
+if "MODE_DEV_STATEFUL" not in st.session_state:
+    st.session_state.MODE_DEV_STATEFUL = False
 # get a global var for logger accessor in this module
 LOG_LEVEL = logging.DEBUG
 g_logger = logging.getLogger(__name__)
 st.set_page_config(layout="wide")
 # initialise various session state variables
+init_logging_session_states() # logging init should be early
+init_workflow_session_states()
+init_input_data_session_states()
+init_input_container_states()
+init_workflow_viz()
+init_classifier_session_states()
 def main() -> None:
     """
     # Streamlit app
     tab_inference, tab_hotdogs, tab_map, tab_coords, tab_log, tab_gallery = \
         st.tabs(["Cetecean classifier", "Hotdog classifier", "Map", "*:gray[Dev:coordinates]*", "Log", "Beautiful cetaceans"])
+    # put this early so the progress indicator is at the top (also refreshed at end)
+    refresh_progress_display()
     # create a sidebar, and parse all the input (returned as `observations` object)
+    with st.sidebar:
+        # layout handling
+        add_input_UI_elements()
+        # input elements (file upload, text input, etc)
+        setup_input()
     with tab_map:
         # visual structure: a couple of toggles at the top, then the map including a
         # dropdown for tileset selection.
+        add_obs_map_header()
         tab_map_ui_cols = st.columns(2)
         with tab_map_ui_cols[0]:
             show_db_points = st.toggle("Show Points from DB", True)
             gallery.render_whale_gallery(n_cols=4)
+    # state handling re data_entry phases
+    # 0. no data entered yet -> display the file uploader thing
+    # 1. we have some images, but not all the metadata fields are done -> validate button shown, disabled
+    # 2. all data entered -> validate button enabled
+    # 3. validation button pressed, validation done -> enable the inference button.
+    #    - at this point do we also want to disable changes to the metadata selectors?
+    #    anyway, simple first.
+    if st.session_state.workflow_fsm.is_in_state('doing_data_entry'):
+        # can we advance state? - only when all inputs are set for all uploaded files
+        all_inputs_set = check_inputs_are_set(debug=True, empty_ok=False)
+        if all_inputs_set:
+            st.session_state.workflow_fsm.complete_current_state()
+            # -> data_entry_complete
+        else:
+            # button, disabled; no state change yet.
+            st.sidebar.button(":gray[*Validate*]", disabled=True, help="Please fill in all fields.")
+    if st.session_state.workflow_fsm.is_in_state('data_entry_complete'):
+        # can we advance state? - only when the validate button is pressed
+        if st.sidebar.button(":white_check_mark:[**Validate**]"):
+            # create a dictionary with the submitted observation
+            tab_log.info(f"{st.session_state.observations}")
+            df = pd.DataFrame([obs.to_dict() for obs in st.session_state.observations.values()])
+            #df = pd.DataFrame(st.session_state.observations, index=[0])
+            with tab_coords:
+                st.table(df)
+            # there doesn't seem to be any actual validation here?? TODO: find validator function (each element is validated by the input box, but is there something at the whole image level?)
+            # hmm, maybe it should actually just be "I'm done with data entry"
+            st.session_state.workflow_fsm.complete_current_state()
+            # -> data_entry_validated
+    # state handling re inference phases (tab_inference)
+    # 3. validation button pressed, validation done -> enable the inference button.
+    # 4. inference button pressed -> ML started. | let's cut this one out, since it would only
+    #      make sense if we did it as an async action
+    # 5. ML done -> show results, and manual validation options
+    # 6. manual validation done -> enable the upload buttons
+    #
+    with tab_inference:
+        # inside the inference tab, on button press we call the model (on huggingface hub)
+        # which will be run locally.
+        # - the model predicts the top 3 most likely species from the input image
+        # - these species are shown
+        # - the user can override the species prediction using the dropdown
+        # - an observation is uploaded if the user chooses.
+        if st.session_state.MODE_DEV_STATEFUL:
+            dbg_show_observation_hashes()
+        add_classifier_header()
+        # if we are before data_entry_validated, show the button, disabled.
+        if not st.session_state.workflow_fsm.is_in_state_or_beyond('data_entry_validated'):
+            tab_inference.button(":gray[*Identify with cetacean classifier*]", disabled=True,
+                                help="Please validate inputs before proceeding",
+                                key="button_infer_ceteans")
+        if st.session_state.workflow_fsm.is_in_state('data_entry_validated'):
+            # show the button, enabled. If pressed, we start the ML model (And advance state)
+            if tab_inference.button("Identify with cetacean classifier"):
+                cetacean_classifier = AutoModelForImageClassification.from_pretrained(
+                    "Saving-Willy/cetacean-classifier",
+                    revision=classifier_revision,
+                    trust_remote_code=True)
+                cetacean_just_classify(cetacean_classifier)
+                st.session_state.workflow_fsm.complete_current_state()
+                # trigger a refresh too (refreshing the progress indicator means the script reruns and
+                # we can enter the next state - visualising the results / review)
+                # ok it doesn't if done programmatically. maybe interacting with the button? check docs.
+                refresh_progress_display()
+                #TODO: validate this doesn't harm performance adversely.
+                st.rerun()
+        elif st.session_state.workflow_fsm.is_in_state('ml_classification_completed'):
+            # show the results, and allow manual validation
+            st.markdown("""### Inference results and manual validation/adjustment """)
+            if st.session_state.MODE_DEV_STATEFUL:
+                s = ""
+                for k, v in st.session_state.whale_prediction1.items():
+                    s += f"* Image {k}: {v}\n"
+                st.markdown(s)
+            # add a button to advance the state
+            if st.button("Confirm species predictions", help="Confirm that all species are selected correctly"):
+                st.session_state.workflow_fsm.complete_current_state()
+                # -> manual_inspection_completed
+                st.rerun()
+            cetacean_show_results_and_review()
+        elif st.session_state.workflow_fsm.is_in_state('manual_inspection_completed'):
+            # show the ML results, and allow the user to upload the observation
+            st.markdown("""### Inference Results (after manual validation) """)
+            if st.button("Upload all observations to THE INTERNET!"):
+                # let this go through to the push_all func, since it just reports to log for now.
+                push_all_observations(enable_push=False)
+                st.session_state.workflow_fsm.complete_current_state()
+                # -> data_uploaded
+                st.rerun()
+            cetacean_show_results()
+        elif st.session_state.workflow_fsm.is_in_state('data_uploaded'):
+            # the data has been sent. Lets show the observations again
+            # but no buttons to upload (or greyed out ok)
+            st.markdown("""### Observation(s) uploaded - thank you!""")
+            cetacean_show_results()
+            st.divider()
+            #df = pd.DataFrame(st.session_state.observations, index=[0])
+            df = pd.DataFrame([obs.to_dict() for obs in st.session_state.observations.values()])
+            st.table(df)
+            # didn't decide what the next state is here - I think we are in the terminal state.
+            #st.session_state.workflow_fsm.complete_current_state()
     # inside the hotdog tab, on button press we call a 2nd model (totally unrelated at present, just for demo
     # purposes, an hotdog image classifier) which will be run locally.
             hotdog_classify(pipeline_hot_dog, tab_hotdogs)
+    # after all other processing, we can show the stage/state
+    refresh_progress_display()
 if __name__ == "__main__":
     main()

src/maps/obs_map.py CHANGED Viewed

@@ -192,8 +192,8 @@ def present_obs_map(dataset_id:str = "Saving-Willy/Happywhale-kaggle",
     return st_data
-def add_header_text() -> None:
     """
     Add brief explainer text to the tab
     """
-    st.write("A map showing the observations in the dataset, with markers colored by species.")

     return st_data
def add_obs_map_header() -> None:
    """
    Write the one-line explainer for the observations map into the current
    Streamlit container.
    """
    header_text = "A map showing the observations in the dataset, with markers colored by species."
    st.write(header_text)

src/utils/metadata_handler.py CHANGED Viewed

@@ -1,16 +1,26 @@
 import streamlit as st
-def metadata2md() -> str:
     """Get metadata from cache and return as markdown-formatted key-value list
     Returns:
         str: Markdown-formatted key-value list of metadata
     """
     markdown_str = "\n"
-    keys_to_print = ["latitude","longitude","author_email","date","time"]
-    for key, value in st.session_state.public_observation.items():
-            if key in keys_to_print:
-                markdown_str += f"- **{key}**: {value}\n"
     return markdown_str

 import streamlit as st
def metadata2md(image_hash:str, debug:bool=False) -> str:
    """Get metadata from cache and return as markdown-formatted key-value list

    The observation stored under `image_hash` in
    `st.session_state.public_observations` is rendered as one markdown bullet
    per selected field, in the order the fields appear in the observation.
    If no observation is stored for the hash, only the leading newline is
    returned.

    Args:
        image_hash (str): The hash of the image to get metadata for
        debug (bool, optional): Whether to print additional fields.

    Returns:
        str: Markdown-formatted key-value list of metadata
    """
    keys_to_print = ["author_email", "latitude", "longitude", "date", "time"]
    if debug:
        # The first key was previously misspelled "iamge_md5", so the image
        # hash could never be listed in debug mode.
        # NOTE(review): assumes the observation dict spells this key
        # `image_md5` - confirm against the observation's dict export.
        keys_to_print += ["image_md5", "selected_class", "top_prediction", "class_overriden"]

    observation = st.session_state.public_observations.get(image_hash, {})
    bullets = [
        f"- **{key}**: {value}\n"
        for key, value in observation.items()
        if key in keys_to_print
    ]
    # the string always starts with a newline, as before
    return "\n" + "".join(bullets)

src/utils/st_logs.py CHANGED Viewed

@@ -100,6 +100,16 @@ class StreamlitLogHandler(logging.Handler):
         self.log_area.empty()  # Clear previous logs
         self.buffer.clear()
 # Set up logging to capture all info level logs from the root logger
 @st.cache_resource
 def setup_logging(level:int=logging.INFO, buffer_len:int=15) -> StreamlitLogHandler:
@@ -126,6 +136,7 @@ def setup_logging(level:int=logging.INFO, buffer_len:int=15) -> StreamlitLogHand
     #    st.session_state['handler'] = handler
     return handler
 def parse_log_buffer(log_contents: deque) -> List[dict]:
     """
     Convert log buffer to a list of dictionaries for use with a streamlit datatable.

         self.log_area.empty()  # Clear previous logs
         self.buffer.clear()
def init_logging_session_states():
    """
    Make sure the session state holds a log handler.

    If no `handler` entry exists yet in the session state, the handler
    returned by `setup_logging()` is stored under that key; otherwise the
    session state is left untouched.
    """
    if "handler" in st.session_state:
        return
    st.session_state['handler'] = setup_logging()
 # Set up logging to capture all info level logs from the root logger
 @st.cache_resource
 def setup_logging(level:int=logging.INFO, buffer_len:int=15) -> StreamlitLogHandler:
     #    st.session_state['handler'] = handler
     return handler
 def parse_log_buffer(log_contents: deque) -> List[dict]:
     """
     Convert log buffer to a list of dictionaries for use with a streamlit datatable.

src/utils/workflow_state.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from transitions import Machine
+from typing import List
# ANSI escape sequences used to colour console output
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
OKCYAN = '\033[96m'
FAIL = '\033[91m'
ENDC = '\033[0m'  # appended after a message by _cprint

# Ordered list of workflow states; each state can only be completed into the
# one that follows it.
FSM_STATES = ['doing_data_entry', 'data_entry_complete', 'data_entry_validated',
              #'ml_classification_started',
              'ml_classification_completed',
              'manual_inspection_completed', 'data_uploaded']


class WorkflowFSM:
    """
    Linear workflow state machine: an ordered sequence of states that can only
    be advanced one step at a time, from each state to its successor.

    The current state is read from `self.state`, which this class never
    assigns itself; it is expected to be provided by the `transitions`
    `Machine` created in `__init__` with `model=self`.
    """

    def __init__(self, state_sequence: List[str]):
        """
        Build the machine and one `complete_<state>` transition per state.

        Args:
            state_sequence (List[str]): State names in workflow order. The
                first entry is the initial state; the last entry gets no
                completion transition.
        """
        self.state_sequence = state_sequence
        # state name -> position in the sequence, used for ordering checks
        self.state_dict = {state: i for i, state in enumerate(state_sequence)}

        # Create state machine
        self.machine = Machine(
            model=self,
            states=state_sequence,
            initial=state_sequence[0],
        )

        # For each state (except the last), add a completion transition to the next state
        for i in range(len(state_sequence) - 1):
            current_state = state_sequence[i]
            next_state = state_sequence[i + 1]

            self.machine.add_transition(
                trigger=f'complete_{current_state}',
                source=current_state,
                dest=next_state,
                conditions=[f'is_in_{current_state}']
            )

            # Dynamically add the condition method named above. The state is
            # bound as a default argument so every lambda keeps its own state
            # instead of the loop's final value.
            setattr(self, f'is_in_{current_state}',
                    lambda s=current_state: self.is_in_state(s))

        # Add callbacks for logging
        self.machine.before_state_change = self._log_transition
        self.machine.after_state_change = self._post_transition

    def is_in_state(self, state_name: str) -> bool:
        """Check if we're currently in the specified state"""
        return self.state == state_name

    def complete_current_state(self) -> bool:
        """
        Signal that the current state is complete.

        Looks up the `complete_<current state>` trigger and calls it.

        Returns:
            bool: True if the trigger reported a state transition, False if
                there is no trigger for the current state (e.g. the final
                state), the trigger reported no transition, or it raised.
        """
        trigger_func = getattr(self, f'complete_{self.state}', None)
        if trigger_func is None:
            # no completion trigger registered for this state
            return False

        try:
            # report what the trigger says rather than assuming success
            return bool(trigger_func())
        except Exception:
            # best effort: a failed transition is reported, not raised.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            return False

    def is_in_state_or_beyond(self, state_name: str) -> bool:
        """
        Check if we have reached or passed the specified state.

        Compares the position of `state_name` in the sequence with the
        position of the current state.

        Args:
            state_name (str): The state to compare against.

        Returns:
            bool: True if the current state is `state_name` or a later one.

        Raises:
            ValueError: If `state_name` is not one of the machine's states.
        """
        if state_name not in self.state_dict:
            raise ValueError(f"Invalid state: {state_name}")

        return self.state_dict[state_name] <= self.state_dict[self.state]

    @property
    def current_state(self) -> str:
        """Get the current state name"""
        return self.state

    @property
    def current_state_index(self) -> int:
        """Get the current state index"""
        return self.state_dict[self.state]

    @property
    def num_states(self) -> int:
        """Get the total number of states in the sequence"""
        return len(self.state_sequence)

    def _log_transition(self):
        """Callback registered as `before_state_change`: report the state being left."""
        # TODO: use logger, not printing.
        self._cprint(f"[FSM] -> Transitioning from {self.current_state}")

    def _post_transition(self):
        """Callback registered as `after_state_change`: report the state entered."""
        # TODO: use logger, not printing.
        self._cprint(f"[FSM] -| Transitioned to {self.current_state}")

    def _cprint(self, msg:str, color:str=OKCYAN):
        """Print colored message"""
        print(f"{color}{msg}{ENDC}")

src/utils/workflow_ui.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import streamlit as st
+from utils.workflow_state import WorkflowFSM, FSM_STATES
def init_workflow_session_states():
    """
    Make sure the session state holds the workflow state machine.

    If no `workflow_fsm` entry exists yet, a new `WorkflowFSM` built from
    `FSM_STATES` is stored under that key; an existing machine is left as is.
    """
    if "workflow_fsm" in st.session_state:
        return
    # create and init the state machine
    st.session_state.workflow_fsm = WorkflowFSM(FSM_STATES)
def refresh_progress_display() -> None:
    """
    Updates the workflow progress display in the Streamlit sidebar.

    Reads the current position from `st.session_state.workflow_fsm` and writes
    a status line into the first placeholder of
    `st.session_state.disp_progress` and a progress bar into the second.
    """
    fsm = st.session_state.workflow_fsm
    # index of the final state, i.e. the number of steps to complete
    last_index = fsm.num_states - 1
    current_index = fsm.current_state_index
    status = f"*Progress: {current_index}/{last_index}. Current: {fsm.current_state}.*"
    # a single-state workflow has no steps to take: show it as complete
    # instead of dividing by zero
    fraction = current_index / last_index if last_index > 0 else 1.0

    with st.sidebar:
        st.session_state.disp_progress[0].markdown(status)
        st.session_state.disp_progress[1].progress(fraction)
def init_workflow_viz(debug:bool=True) -> None:
    """
    Set up the streamlit elements for visualising the workflow progress.

    Stores two empty sidebar placeholders in `st.session_state.disp_progress`
    and, optionally, adds a sidebar button that calls
    `refresh_progress_display` when clicked. Note: The button is mainly a
    development aid.

    Args:
        debug (bool): If True, include the manual refresh button. Default is True.
    """
    # NOTE(review): the guard tests the key "progress" while the placeholders
    # are stored under `disp_progress`; nothing in this function sets
    # "progress", so the guard looks like it never skips - confirm whether
    # re-creating the placeholders on every run is intended.
    if "progress" in st.session_state:
        return

    with st.sidebar:
        # add progress indicator placeholders to session_state
        st.session_state.disp_progress = [st.empty(), st.empty()]
        if debug:
            # add button to sidebar, with the callback to refresh the progress display
            st.sidebar.button("Refresh Progress", on_click=refresh_progress_display)

src/whale_viewer.py CHANGED Viewed

@@ -115,6 +115,9 @@ def format_whale_name(whale_class:str) -> str:
     Returns:
         str: The formatted whale name with spaces instead of underscores and each word capitalized.
     """
     whale_name = whale_class.replace("_", " ").title()
     return whale_name

     Returns:
         str: The formatted whale name with spaces instead of underscores and each word capitalized.
     """
+    if not isinstance(whale_class, str):
+        raise TypeError("whale_class should be a string.")
     whale_name = whale_class.replace("_", " ").title()
     return whale_name

tests/test_input_handling.py CHANGED Viewed

@@ -51,9 +51,6 @@ def test_is_valid_email_invalid():
     assert not is_valid_email("[email protected].")
     assert not is_valid_email("a@[email protected]")
-# not sure how xfails come through the CI pipeline yet.
-# maybe better to just comment out this stuff until pipeline is setup, then can check /extend
-@pytest.mark.xfail(reason="Bug identified, but while setting up CI having failing tests causes more headache")
 def test_is_valid_email_invalid_plus():
     assert not is_valid_email("[email protected]")
     assert not is_valid_email("[email protected]")
@@ -143,7 +140,7 @@ def test_get_image_latlon():
     # missing GPS loc
     f2 = test_data_pth / 'cakes_no_exif_gps.jpg'
-    assert get_image_latlon(f2) == None
     # missing datetime -> expect gps not affected
     f3 = test_data_pth / 'cakes_no_exif_datetime.jpg'
@@ -151,7 +148,7 @@ def test_get_image_latlon():
 # tests for get_image_latlon with empty file
 def test_get_image_latlon_empty():
-    assert get_image_latlon("") == None
 # tests for decimal_coords
 # - without input, py raises TypeError

     assert not is_valid_email("[email protected].")
     assert not is_valid_email("a@[email protected]")
 def test_is_valid_email_invalid_plus():
     assert not is_valid_email("[email protected]")
     assert not is_valid_email("[email protected]")
     # missing GPS loc
     f2 = test_data_pth / 'cakes_no_exif_gps.jpg'
+    assert get_image_latlon(f2) == (None, None)
     # missing datetime -> expect gps not affected
     f3 = test_data_pth / 'cakes_no_exif_datetime.jpg'
 # tests for get_image_latlon with empty file
 def test_get_image_latlon_empty():
+    assert get_image_latlon("") == (None, None)
 # tests for decimal_coords
 # - without input, py raises TypeError

tests/test_whale_viewer.py CHANGED Viewed

@@ -40,11 +40,9 @@ def test_format_whale_name_empty():
     assert format_whale_name("") == ""
 # testing with the wrong datatype
-# we should get a TypeError - currently it fails with a AttributeError
-@pytest.mark.xfail
 def test_format_whale_name_none():
     with pytest.raises(TypeError):
         format_whale_name(None)
-# display_whale requires UI to test it.

     assert format_whale_name("") == ""
 # testing with the wrong datatype
 def test_format_whale_name_none():
     with pytest.raises(TypeError):
         format_whale_name(None)
+# display_whale requires UI to test it.