Spaces:

Brand24
/

mms_benchmark

Runtime error

App Files Files Community

wscode committed on Jun 7, 2023

Commit

4a15609

1 Parent(s): 329333f

add language statistics, make inferring categorical for numeric types optional, make slider integer if df dtype is integer

Browse files

Files changed (5) hide show

.gitignore +2 -1
data/language_stats.parquet +3 -0
filter_dataframe.py +27 -6
pages/2_Language_Statistics.py +22 -0
pages/3_Dataset_Statistics.py +0 -0

.gitignore CHANGED Viewed

@@ -159,4 +159,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-*.ipynb

 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.ipynb
+*.code-workspace

data/language_stats.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e52fca5ff80ab2c16ba8bbd99244f7cbe5e2a988f45443fe48c1b2a176f98e9c
+size 9087

filter_dataframe.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit.components.v1 as components
 import pandas as pd
@@ -7,14 +8,16 @@ from pandas.api.types import (
     is_categorical_dtype,
     is_datetime64_any_dtype,
     is_numeric_dtype,
     is_object_dtype,
 )
-def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Adds a UI on top of a dataframe to let viewers filter columns
     Args:
         df (pd.DataFrame): Original dataframe
     Returns:
         pd.DataFrame: Filtered dataframe
@@ -45,17 +48,35 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             left, right = st.columns((1, 20))
             left.write("↳")
             # Treat columns with < 10 unique values as categorical
-            if is_categorical_dtype(df[column]) or df[column].nunique() < 10:
                 user_cat_input = right.multiselect(
                     f"Values for {column}",
                     df[column].unique(),
                     default=list(df[column].unique()),
                 )
                 df = df[df[column].isin(user_cat_input)]
-            elif is_numeric_dtype(df[column]):
-                _min = float(df[column].min())
-                _max = float(df[column].max())
-                step = (_max - _min) / 100
                 user_num_input = right.slider(
                     f"Values for {column}",
                     _min,

+# https://blog.streamlit.io/auto-generate-a-dataframe-filtering-ui-in-streamlit-with-filter_dataframe/
 import streamlit.components.v1 as components
 import pandas as pd
     is_categorical_dtype,
     is_datetime64_any_dtype,
     is_numeric_dtype,
+    is_integer_dtype,
     is_object_dtype,
 )
+def filter_dataframe(df: pd.DataFrame, numeric_as_categorical: bool = True) -> pd.DataFrame:
     """
     Adds a UI on top of a dataframe to let viewers filter columns
     Args:
         df (pd.DataFrame): Original dataframe
+        numeric_as_categorical (bool, optional): Whether to treat numeric columns with a low number of unique values (fewer than 10) as categorical. Defaults to True.
     Returns:
         pd.DataFrame: Filtered dataframe
             left, right = st.columns((1, 20))
             left.write("↳")
             # Treat columns with < 10 unique values as categorical
+            low_nunique = df[column].nunique() < 10
+            is_categorical = is_categorical_dtype(df[column])
+            is_numeric = is_numeric_dtype(df[column])
+            treat_as_categorical = False
+            if is_categorical:
+                treat_as_categorical = True
+            elif low_nunique:
+                if is_numeric:
+                    treat_as_categorical = numeric_as_categorical
+                else:
+                    treat_as_categorical = True
+            if treat_as_categorical:
                 user_cat_input = right.multiselect(
                     f"Values for {column}",
                     df[column].unique(),
                     default=list(df[column].unique()),
                 )
                 df = df[df[column].isin(user_cat_input)]
+            elif is_numeric:
+                if is_integer_dtype(df[column]):
+                    _min = int(df[column].min())
+                    _max = int(df[column].max())
+                    step = 1
+                else:
+                    _min = float(df[column].min())
+                    _max = float(df[column].max())
+                    step = (_max - _min) / 100
                 user_num_input = right.slider(
                     f"Values for {column}",
                     _min,

pages/2_Language_Statistics.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import streamlit as st
+import pandas as pd
+from filter_dataframe import filter_dataframe
+@st.cache_data
+def get_language_stats_df():
+    return pd.read_parquet("data/language_stats.parquet")
+st.set_page_config(page_title="Language Statistics", page_icon="📈")
+st.markdown("# Language Statistics")
+st.sidebar.header("Language Statistics")
+st.write(
+    """TODO: Description"""
+)
+df = get_language_stats_df()
+st.dataframe(filter_dataframe(df, numeric_as_categorical=False))

pages/3_Dataset_Statistics.py ADDED Viewed

File without changes