JVice committed on
Commit
0560487
·
1 Parent(s): 85d1a2a

updated for committing to user database file path

Browse files
Files changed (1) hide show
  1. streamlit-app.py +343 -0
streamlit-app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ st.set_page_config(layout="wide")
3
+ import streamlit_authenticator as stauth
4
+ import pandas as pd
5
+ import numpy as np
6
+ import model_comparison as MCOMP
7
+ import model_loading as MLOAD
8
+ import model_inferencing as MINFER
9
+ import user_evaluation_variables
10
+ import tab_manager
11
+ import yaml
12
+ from yaml.loader import SafeLoader
13
+ from PIL import Image
14
+ AUTHENTICATOR = None
15
+ TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
16
+ USER_LOGGED_IN = False
17
+ USER_DATABASE_PATH = './data/user_database.yaml'
18
def create_new_user(authenticator, users):
    """Render the registration form and persist the user database on success.

    Args:
        authenticator: streamlit_authenticator.Authenticate instance.
        users: credentials dict loaded from USER_DATABASE_PATH; mutated in
            place by the authenticator when a new user is registered.
    """
    try:
        # register_user() adds the new account to `users` in place.
        if authenticator.register_user('Register user', preauthorization=False):
            st.success('User registered successfully')
            # Persist only after a successful registration so a failed or
            # untouched form does not rewrite the database on every rerun.
            with open(USER_DATABASE_PATH, 'w') as file:
                yaml.dump(users, file, default_flow_style=False)
    except Exception as e:
        st.error(e)
26
def forgot_password(authenticator, users):
    """Render the 'Forgot password' widget and persist the regenerated password.

    Args:
        authenticator: streamlit_authenticator.Authenticate instance.
        users: credentials dict loaded from USER_DATABASE_PATH; the
            authenticator stores the new hashed password in it.
    """
    try:
        username_of_forgotten_password, _email, _new_random_password = authenticator.forgot_password(
            'Forgot password')
        if username_of_forgotten_password:
            st.success('New password to be sent securely')
            # Random password should be transferred to user securely
            # Persist only when a new password was actually generated so an
            # untouched form does not rewrite the database on every rerun.
            with open(USER_DATABASE_PATH, 'w') as file:
                yaml.dump(users, file, default_flow_style=False)
    except Exception as e:
        st.error(e)
37
def update_account_details(authenticator, users):
    """Render the 'Update user details' widget and persist changes on success.

    Args:
        authenticator: streamlit_authenticator.Authenticate instance.
        users: credentials dict loaded from USER_DATABASE_PATH; mutated in
            place by the authenticator when details are updated.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.update_user_details(st.session_state["username"], 'Update user details'):
                st.success('Entries updated successfully')
                # Persist only after a successful update; previously the file
                # was rewritten on every rerun, even when unauthenticated.
                with open(USER_DATABASE_PATH, 'w') as file:
                    yaml.dump(users, file, default_flow_style=False)
        except Exception as e:
            st.error(e)
46
def reset_password(authenticator, users):
    """Render the 'Reset password' widget and persist the new password on success.

    Args:
        authenticator: streamlit_authenticator.Authenticate instance.
        users: credentials dict loaded from USER_DATABASE_PATH; the
            authenticator stores the new hashed password in it.
    """
    if st.session_state["authentication_status"]:
        try:
            if authenticator.reset_password(st.session_state["username"], 'Reset password'):
                st.success('Password modified successfully')
                # Persist only after a successful reset; previously the file
                # was rewritten unconditionally on every rerun.
                with open(USER_DATABASE_PATH, 'w') as file:
                    yaml.dump(users, file, default_flow_style=False)
        except Exception as e:
            st.error(e)
55
def user_login_create():
    """Render the sidebar login/register/account-details tabs and authenticate.

    Loads the user database from USER_DATABASE_PATH, builds the module-level
    AUTHENTICATOR, and drives the login / registration / password-recovery UI.

    Returns:
        bool: the module-level USER_LOGGED_IN flag (True after a successful
        login this rerun).
    """
    global AUTHENTICATOR
    global TBYB_LOGO
    global USER_LOGGED_IN
    with open(USER_DATABASE_PATH) as file:
        users = yaml.load(file, Loader=SafeLoader)
    AUTHENTICATOR = stauth.Authenticate(
        users['credentials'],
        users['cookie']['name'],
        users['cookie']['key'],
        users['cookie']['expiry_days'],
        users['preauthorized']
    )
    with st.sidebar:
        st.image(TBYB_LOGO, width=70)
        loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])

        with loginTab:
            # login() returns (name, status, username); status is True on
            # success, False on bad credentials, None before any attempt.
            name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
            if authentication_status:
                AUTHENTICATOR.logout('Logout', 'main')
                st.write(f'Welcome *{name}*')
                user_evaluation_variables.USERNAME = username
                USER_LOGGED_IN = True
            elif authentication_status is False:
                st.error('Username/password is incorrect')
                forgot_password(AUTHENTICATOR, users)
            elif authentication_status is None:
                st.warning('Please enter your username and password')
                forgot_password(AUTHENTICATOR, users)
        if not authentication_status:
            with registerTab:
                create_new_user(AUTHENTICATOR, users)
        else:
            with detailsTab:
                st.write('**Username:** ', username)
                st.write('**Name:** ', name)
                st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
                # update_account_details(AUTHENTICATOR, users)
                reset_password(AUTHENTICATOR, users)

    return USER_LOGGED_IN
99
def setup_page_banner():
    """Render the centered TBYB logo, app title and subtitle.

    Note: the original implementation also looped over all nine columns
    rebinding the loop variable to None, which is a no-op in Python and
    has been removed.
    """
    # Nine equal-width columns; placing the logo in the middle one centers it.
    columns = st.columns(9)
    with columns[4]:
        st.image(TBYB_LOGO, use_column_width=True)
    st.title('Try Before You Bias (TBYB)')
    st.write('*A Quantitative T2I Bias Evaluation Tool*')
109
def setup_how_to():
    """Render the collapsible step-by-step 'How to Use' guide."""
    guide = st.expander("How to Use")
    login_and_setup_steps = ("1. Login to your TBYB Account using the bar on the right\n"
                             "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
    guide.write(login_and_setup_steps)
    guide.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
    test_step = "3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n"
    guide.write(test_step)
    guide.image(Image.open('./assets/lykon_corgi.png'))
    evaluation_steps = ("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
                        " to evaluate your model once it has been loaded\n"
                        "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n"
                        "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n"
                        "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the "
                        " '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
                        "8. For any questions or to report any bugs/issues. Please contact [email protected].\n")
    guide.write(evaluation_steps)
123
+
124
def setup_additional_information_tab(tab):
    """Populate the 'Additional Information' tab with the paper summary.

    Fixes over the original: the trapezoidal-rule term in the $B_D$ formula
    read $n_{i=1}$ and now reads $n_{i+1}$, and LaTeX backslashes are
    consistently escaped (`\\\\Sigma`, `\\\\mathcal`, ...) so the string
    literals contain no invalid escape sequences. Rendered output is
    otherwise unchanged.

    Args:
        tab: the Streamlit tab container to render into.
    """
    with tab:
        st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
        st.markdown(
            """
            *Based on the article of the same name available here --PAPER HYPERLINK--

            Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian

            This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
            implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
            all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
            that others share their evaluations as we look to further the discussion on transparency and reliability
            of T2I models.

            """)

        st.header('2. A (very) Brief Summary')
        st.image(Image.open('./assets/TBYB_flowchart.png'))
        st.markdown(
            """
            Bias in text-to-image models can propagate unfair social representations and could be exploited to
            aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
            methods focused on social biases. So, we proposed a bias evaluation methodology that considered
            general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
            """
        )
        st.markdown(
            """
            We proposed three novel metrics to quantify T2I model biases:
            1. Distribution Bias - $B_D$
            2. Jaccard Hallucination - $H_J$
            3. Generative Miss Rate - $M_G$

            Open the appropriate drop-down menu to understand the logic and inspiration behind metric.
            """
        )
        c1, c2, c3 = st.columns(3)
        with c1:
            with st.expander("Distribution Bias - $B_D$"):
                # FIX: trapezoidal term corrected from n_{i=1} to n_{i+1}.
                st.markdown(
                    """
                    Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
                    in the context of T2I models, using AuC allows us to define the distribution of objects that have been
                    detected in generated output image scenes.

                    So, everytime an object is detected in a scene, we update a dictionary (which is available for
                    download after running an evaluation). After evaluating a full set of images, you can use this
                    information to determine what objects appear more frequently than others.

                    After all images are evaluated, we sort the objects in descending order and normalize the data. We
                    then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:

                    $B_D = \\Sigma_{i=1}^M\\frac{n_i+n_{i+1}}{2}$

                    So, if a user conducts a task-oriented study on biases related to **dogs** using a model
                    that was heavily biased using pictures of animals in the wild. You might find that after running
                    evaluations, the most common objects detected were trees and grass - even if these objects weren't
                    specified in the prompt. This would result in a very low $B_D$ in comparison to a model that for
                    example was trained on images of dogs and animals in various different scenarios $\\rightarrow$
                    which would result in a *higher* $B_D$ in comparison.
                    """
                )
        with c2:
            with st.expander("Jaccard Hallucination - $H_J$"):
                st.markdown(
                    """
                    Hallucination is a very common phenomena that is discussed in relation to generative AI, particularly
                    in relation to some of the most popular large language models. Depending on where you look, hallucinations
                    can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
                    that we echo in our bias evaluations.

                    Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
                    T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
                    specified. This indicates that there could be an innate shift in bias in the model, causing it to
                    add or omit certain objects.

                    Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
                    hallucination. Then, we considered the Jaccard similarity coefficient, which
                    measures the similarity *and* diversity of two sets of objects/samples - defining this as
                    Jaccard Hallucination - $H_J$.

                    Simply put, we define the set of objects detected in the input prompt and then detect the objects in
                    the corresponding output image. Then, we determine the intersect over union. For a model, we
                    calculate the average $H_J$ across generated images using:

                    $H_J = \\frac{\\Sigma_{i=0}^{N-1}1-\\frac{\\mathcal{X}_i\\cap\\mathcal{Y}_i}{\\mathcal{X}_i\\cup\\mathcal{Y}_i}}{N}$

                    """
                )
        with c3:
            with st.expander("Generative Miss Rate - $M_G$"):
                st.markdown(
                    """
                    Whenever fairness and trust are discussed in the context of machine learning and AI systems,
                    performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
                    of evaluating bias, we thought that it would be important to see if there was a correlation
                    between bias and performance (as we predicted). And while the other metrics do evaluate biases
                    in terms of misalignment, they do not consider the relationship between bias and performance.

                    We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
                    as a model becomes more biased, it will begin to diverge away from the intended target and so, the
                    miss rate of the generative model will increase as a result. This was a major consideration when
                    designing this metric.

                    We use the CLIP model as a binary classifier, differentiating between two classes:
                    - the prompt used to generate the image
                    - **NOT** the prompt

                    Through our experiments on intentionally-biased T2I models, we found that there was a clear
                    relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
                    how badly model performances have been affected by their biases.
                    """
                )
        st.header('3. TBYB Constraints')
        st.markdown(
            """
            While we have attempted to design a comprehensive, automated bias evaluation tool. We must acknowledge that
            in its infancy, TBYB has some constraints:
            - We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
            promise that all T2I models will work - if you run into any issues that you think should be possible, feel
            free to reach out!
            - Currently, a model_index.json file is required to load models and use them with TBYB, we will look to
            address other models in future works
            - TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
            - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
            models in the future.
            - Download, generation, inference and evaluation times are all hardware dependent.

            Keep in mind that these constraints may be removed or added to any time.
            """)
        st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
        st.markdown(
            """
            Given this application is used for the assessment of T2I biases and relies on
            pre-trained models available on HuggingFace, we are not responsible for any content generated
            by public-facing models that have been used to generate images using this application.

            TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
            insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
            representations of marginalised groups, please address your concerns to the model providers.


            However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
            beneficial to the TBYB community to share evaluations of biased T2I models!

            We share no association with HuggingFace \U0001F917, we only use their services as a model repository,
            given their growth in popularity in the computer science community recently.


            For further questions/queries or if you want to simply strike a conversation,
            please reach out to Jordan Vice at: [email protected]""")
276
+
277
# --- Top-level Streamlit script: re-executed from the top on every rerun ---
setup_page_banner()
setup_how_to()


if user_login_create():
    # Authenticated: expose the full evaluation workflow as six tabs.
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
                                                  "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
    setup_additional_information_tab(tab6)

    # PLASTER THE LOGO EVERYWHERE
    # Placeholder copy shown in each tab until a model is loaded via Setup.
    tab2.subheader("General Bias Evaluation")
    tab2.write("Waiting for \U0001F527 Setup to be complete...")
    tab3.subheader("Task-Oriented Bias Evaluation")
    tab3.write("Waiting for \U0001F527 Setup to be complete...")
    tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
    tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
    MCOMP.initialise_page(tab4)
    tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
    tab5.write("Waiting for \U0001F527 Setup to be complete...")

    with tab1:
        # Model selection form: the user supplies a HuggingFace repo id.
        with st.form("model_definition_form", clear_on_submit=True):
            modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
                                    'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
            submitted1 = st.form_submit_button("Submit")
            if modelID:
                with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
                    modelLoaded = MLOAD.check_if_model_exists(modelID)
                    if modelLoaded is not None:
                        # st.write("Located " + modelID + " model_index.json file")
                        st.write("Located " + modelID)

                        modelType = MLOAD.get_model_info(modelLoaded)
                        if modelType is not None:
                            st.write("Model is of Type: ", modelType)

                            if submitted1:
                                # Import only on explicit submit; the pipeline is
                                # stored on the model_inferencing module so other
                                # tabs can reach it.
                                MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
                                if MINFER.TargetModel is not None:
                                    st.write("Text-to-image pipeline looks like this:")
                                    st.write(MINFER.TargetModel)
                                    user_evaluation_variables.MODEL = modelID
                                    user_evaluation_variables.MODEL_TYPE = modelType
                    else:
                        st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
                                 ' Please check that that HuggingFace repo ID is valid.'
                                 ' For more help, please see the "How to Use" Tab above.', icon="🚨")
        if modelID:
            # Optional smoke test: generate a single image with the loaded model.
            with st.form("example_image_gen_form", clear_on_submit=True):
                testPrompt = st.text_input('Input a random test prompt to test out your '
                                           'chosen model and see if its generating images:')
                submitted2 = st.form_submit_button("Submit")
                if testPrompt and submitted2:
                    with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
                        testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
                        st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
                        st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
                        Otherwise, feel free to load up a different model and run it again''')

    if MINFER.TargetModel is not None:
        # A pipeline is loaded: unlock the evaluation tabs.
        tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
else:
    # Not logged in: clear cached evaluation state from any previous session.
    MCOMP.databaseDF = None
    user_evaluation_variables.reset_variables('general')
    user_evaluation_variables.reset_variables('task-oriented')
    st.write('')
    st.warning('Log in or register your email to get started! ', icon="⚠️")