	k means clustering
app.py CHANGED
@@ -1,6 +1,5 @@
 ### LIBRARIES ###
 # # Data
-from matplotlib.pyplot import legend
 import numpy as np
 import pandas as pd
 import torch
@@ -10,11 +9,15 @@ from math import floor
 from datasets import load_dataset
 from collections import defaultdict
 from transformers import AutoTokenizer
+pd.options.display.float_format = '${:,.2f}'.format

 # Analysis
 # from gensim.models.doc2vec import Doc2Vec
 # from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
-
+import nltk
+from nltk.cluster import KMeansClusterer
+import scipy.spatial.distance as sdist
+from scipy.spatial import distance_matrix
 # nltk.download('punkt') #make sure that punkt is downloaded

 # App & Visualization
@@ -23,11 +26,11 @@ import altair as alt
 import plotly.graph_objects as go
 from streamlit_vega_lite import altair_component

+
+
 # utils
 from random import sample
-from 
-import os
-
+# from PIL import Image


 def down_samp(embedding):
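Note on the new pd.options.display.float_format line above: it only changes how floats are rendered when pandas objects are printed, not the stored values. A minimal standalone sketch (not part of the commit, the example frame is made up):

import pandas as pd

pd.options.display.float_format = '${:,.2f}'.format   # every float repr goes through this formatter
df = pd.DataFrame({'loss': [0.1234, 2.5]})
print(df)   # the loss column now prints as $0.12 and $2.50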
@@ -61,12 +64,14 @@ def down_samp(embedding):
 def data_comparison(df):
     # set up a dropdown select bindinf
     # input_dropdown = alt.binding_select(options=['Negative Sentiment','Positive Sentiment'])
-
-
+        #data_kmeans['distance_from_centroid'] = data_kmeans.apply(distance_from_centroid, axis=1)
+
+    selection = alt.selection_multi(fields=['cluster','label'])
+    color = alt.condition(alt.datum.slice == 'high-loss', alt.Color('cluster:N', scale = alt.Scale(domain=df.cluster.tolist())), alt.value("lightgray"))
     # color = alt.condition(selection, 
-    #
-    #
-    #
+    #                        alt.Color('cluster:Q', legend=None),
+    #                         # scale = alt.Scale(domain = pop_domain,range=color_range)),
+    #                         alt.value('lightgray'))
     opacity = alt.condition(selection, alt.value(0.7), alt.value(0.25))

     # basic chart
@@ -75,7 +80,7 @@ def data_comparison(df):
         y=alt.Y('y', axis=None),
         color=color,
         shape=alt.Shape('label', scale=alt.Scale(range=['circle', 'diamond'])),
-        tooltip=['slice','content','label','pred'],
+        tooltip=['cluster','slice','content','label','pred'],
         opacity=opacity
     ).properties(
         width=1500,
@@ -83,28 +88,21 @@ def data_comparison(df):
     ).interactive()

     legend = alt.Chart(df).mark_point().encode(
-        y=alt.Y('
+        y=alt.Y('cluster:O', axis=alt.Axis(orient='right'), title=""),
         x=alt.X("label"),
         shape=alt.Shape('label', scale=alt.Scale(
-
-        color=color
+        range=['circle', 'diamond']), legend=None),
+        color=color,
     ).add_selection(
         selection
     )
-
-    layered = 
+
+    layered = scatter |legend 

     layered = layered.configure_axis(
         grid=False
     ).configure_view(
         strokeOpacity=0
-    ).configure_legend(
-        strokeColor='gray',
-        fillColor='#EEEEEE',
-        padding=10,
-        cornerRadius=10,
-        orient='top-right'
-
     )

     return layered
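The chart changes above follow Altair's selection-plus-condition pattern: a multi-selection bound to the legend chart drives opacity, while a datum predicate keeps cluster colours only for the high-loss slice, and the scatter and legend are concatenated side by side. A minimal sketch of the same pattern on toy data (Altair 4-style API; the data values and column contents below are illustrative, not taken from the repo):

import altair as alt
import pandas as pd

# Toy stand-in for the UMAP scatter data: x/y coordinates, cluster ids, slice tags.
df = pd.DataFrame({
    'x': [0.1, 0.5, 0.9, 0.3], 'y': [0.2, 0.8, 0.4, 0.6],
    'cluster': [0, 1, 2, 0], 'label': ['pos', 'neg', 'pos', 'neg'],
    'slice': ['high-loss', 'high-loss', 'low-loss', 'low-loss']})

selection = alt.selection_multi(fields=['cluster', 'label'])          # toggled by clicks on the legend chart
color = alt.condition(alt.datum.slice == 'high-loss',                 # colour only the high-loss points
                      alt.Color('cluster:N'), alt.value('lightgray'))
opacity = alt.condition(selection, alt.value(0.7), alt.value(0.25))   # fade out unselected points

scatter = alt.Chart(df).mark_point().encode(
    x='x', y='y', color=color, opacity=opacity,
    tooltip=['cluster', 'slice', 'label']).interactive()

legend = alt.Chart(df).mark_point().encode(
    y=alt.Y('cluster:O', title=''), x='label', color=color).add_selection(selection)

chart = (scatter | legend).configure_axis(grid=False)                 # scatter and clickable legend side by side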
@@ -166,7 +164,36 @@ def get_data(spotlight, emb):
     return pd.concat([pd.DataFrame(np.transpose(np.vstack([dataset[:num_examples]['content'], 
                    dataset[:num_examples]['label'], preds, losses])), columns=['content', 'label', 'pred', 'loss']), embeddings], axis=1)

+@st.cache(ttl=600)
+def clustering(data,num_clusters):
+
+    X = np.array(data['embedding'].tolist())
+
+    kclusterer = KMeansClusterer(
+        num_clusters, distance=nltk.cluster.util.cosine_distance,
+        repeats=25,avoid_empty_clusters=True)
+
+    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
+
+    data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
+    data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
+
+
+    return data, assigned_clusters
+
+def kmeans(df, num_clusters=3):
+    data_hl = df.loc[df['slice'] == 'high-loss']
+    data_kmeans,clusters = clustering(data_hl,num_clusters)
+    merged = pd.merge(df, data_kmeans, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
+    merged.drop(merged.filter(regex='_y$').columns.tolist(),axis=1,inplace=True)
+    merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
+    return merged

+@st.cache(ttl=600)
+def distance_from_centroid(row):
+    return sdist.norm(row['embedding'] - row['centroid'].tolist())
+
+@st.cache(ttl=600)
 def topic_distribution(weights, smoothing=0.01):
     topic_frequencies = defaultdict(float)
     topic_frequencies_spotlight = defaultdict(float)
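The new clustering helper above runs k-means with cosine distance over the embedding vectors via NLTK, then records each point's cluster id and centroid. A minimal standalone sketch of that call pattern on random vectors (the array shape, repeats value, and the plain L2 distance at the end are illustrative, not from the commit):

import numpy as np
import nltk
from nltk.cluster import KMeansClusterer

X = np.random.RandomState(0).rand(20, 8)                  # stand-in for the sentence embeddings

kclusterer = KMeansClusterer(
    3,                                                     # number of clusters
    distance=nltk.cluster.util.cosine_distance,            # cosine rather than Euclidean k-means
    repeats=5, avoid_empty_clusters=True)

assigned = kclusterer.cluster(X, assign_clusters=True)     # one cluster id per row of X
centroids = kclusterer.means()                             # one mean vector per cluster

# Distance of each point from its assigned centroid (plain L2 norm here).
dists = [np.linalg.norm(x - centroids[c]) for x, c in zip(X, assigned)]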
@@ -196,15 +223,10 @@ def topic_distribution(weights, smoothing=0.01):

 if __name__ == "__main__":
     ### STREAMLIT APP CONGFIG ###
-    os.system("pip --ignore-installed streamlit ")
     st.set_page_config(layout="wide", page_title="Error Slice Analysis")

-
-
-    lcol, rcol = st.columns([2, 3])
+    lcol, rcol = st.columns([2, 2])
     # ******* loading the mode and the data
-    with st.sidebar:
-        st.title('Error Analysis')
     dataset = st.sidebar.selectbox(
         "Dataset",
         ["amazon_polarity", "squad", "movielens", "waterbirds"],
@@ -221,15 +243,19 @@ if __name__ == "__main__":
         index=0
     )

-    loss_quantile = st.sidebar.
-        "Loss Quantile",
-        [0.98, 0.95, 0.9, 0.8, 0.75],
-        index = 1
+    loss_quantile = st.sidebar.slider(
+        "Loss Quantile", min_value=0.0, max_value=1.0,step=0.1,value=0.95
     )
+
+    run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
+
+    num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
+
     ### LOAD DATA AND SESSION VARIABLES ###
-
-
-
+    data = pd.read_parquet('./assets/data/amazon_polarity.test.parquet')
+    embedding_umap = data[['x','y']]
+    emb_df = pd.read_parquet('./assets/data/amazon_test_emb.parquet')
+    data_df = pd.DataFrame([data['content'], data['label'], data['pred'], data['loss'], emb_df['embedding'], data['x'], data['y']]).transpose()
     if "user_data" not in st.session_state:
         st.session_state["user_data"] = data_df
     if "selected_slice" not in st.session_state:
@@ -237,26 +263,30 @@ if __name__ == "__main__":
     if "embedding" not in st.session_state:
         st.session_state["embedding"] = embedding_umap

+    data_df['loss'] = data_df['loss'].astype(float)
+    losses = data_df['loss']
+    high_loss = losses.quantile(loss_quantile)
+    data_df['slice'] = 'high-loss'
+    data_df['slice'] = data_df['slice'].where(data_df['loss'] > high_loss, 'low-loss')
+
+    if run_kmeans == 'True':
+        merged = kmeans(data_df,num_clusters=num_clusters)
     with lcol:
         st.markdown('<h3>Error Slices</h3>',unsafe_allow_html=True)
-        dataframe = 
+        dataframe = merged[['content', 'label', 'pred', 'loss', 'cluster']].sort_values(
             by=['loss'], ascending=False)
         table_html = dataframe.to_html(
-            columns=['content', 'label', 'pred', 'loss'], max_rows=
+            columns=['content', 'label', 'pred', 'loss', 'cluster'], max_rows=50)
         # table_html = table_html.replace("<th>", '<th align="left">')  # left-align the headers
         st.write(dataframe)
-        st.markdown('<h3>Word Distribution in Error Slice</h3>', unsafe_allow_html=True)
-        commontokens = frequent_tokens(data_df, tokenizer, loss_quantile=loss_quantile)
-        st.write(commontokens)
     # st_aggrid.AgGrid(dataframe)
     # table_html = dataframe.to_html(columns=['content', 'label', 'pred', 'loss'], max_rows=100)
     # table_html = table_html.replace("<th>", '<th align="left">')  # left-align the headers
     # st.write(table_html)

-    with rcol:
-
-
-
-
-
-        quant_panel(data_df)
+    with rcol:
+        st.markdown('<h3>Word Distribution in Error Slice</h3>', unsafe_allow_html=True)
+        commontokens = frequent_tokens(merged, tokenizer, loss_quantile=loss_quantile)
+        st.write(commontokens)
+
+    quant_panel(merged)
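Putting the new main-block pieces together: rows whose loss sits above the chosen quantile are tagged 'high-loss', and only that slice is passed to the k-means step. A minimal sketch of the quantile slicing on synthetic losses (column and variable names mirror the commit, the data itself is made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({'loss': np.random.RandomState(0).exponential(size=100)})

loss_quantile = 0.95
high_loss = df['loss'].quantile(loss_quantile)

df['slice'] = 'high-loss'
df['slice'] = df['slice'].where(df['loss'] > high_loss, 'low-loss')   # keep 'high-loss' only above the threshold

print(df['slice'].value_counts())   # roughly 5 high-loss rows vs 95 low-loss rows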
