inspect_web_clusters

Running

File size: 2,499 Bytes

8aac646
 
 
 
 
 
b19b634
64b11d2
b19b634
 
 
 
64b11d2
b19b634
64b11d2
b19b634
 
85a8c20
b19b634
85a8c20
64b11d2
85a8c20
 
 
6881bc0
 
85a8c20
 
 
 
 
8aac646
 
b19b634
f5985dd
2865184
dab4dfa
2865184
dab4dfa
85a8c20
6881bc0
85a8c20
 
 
18967bf
64b11d2
b19b634
8aac646
b19b634
c6c5724
 
 
 
 
8aac646
c6c5724
 
 
 
 
 
 
 
2b533cd

import streamlit as st
from datasets import load_dataset
import os 

# Hugging Face access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Page chrome: wide layout, browser-tab title, and the on-page heading.
st.set_page_config(
    layout="wide",
    page_title="FW Clusters inspection",
)
st.title("FW clusters inspection (free topics)")

# Introductory text describing how the clusters and their scores were produced.
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering). 

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. 

Additionally, the model was tasked with finding the topic of each cluster. 
""")


@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the cluster dataset and keep only rows matching the score filters.

    The result is cached by ``st.cache_data``.

    Args:
        min_score: Lowest educational score to keep (inclusive).
        max_score: Highest educational score to keep (inclusive).
        show_special: When true, keep ONLY rows whose ``educational_score``
            cannot be converted to an int, and ignore the score range.

    Returns:
        The filtered ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``.
    """
    def _keep(row):
        # Only the int() conversion is expected to fail (None, '' or other
        # non-numeric values); such rows count as "special".
        try:
            score = int(row['educational_score'])
        except (ValueError, TypeError):
            return show_special
        if show_special:
            # Rows with a valid score are hidden in "special only" mode.
            return False
        return min_score <= score <= max_score

    clusters = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return clusters.filter(_keep)

st.subheader("Cluster information")

# Three side-by-side controls: minimum score, maximum score, and a toggle that
# switches to clusters whose educational score is undefined.
col_1, col_2, col_3 = st.columns(3)
min_value = col_1.slider(
    'Select minimum educational score',
    min_value=1,
    max_value=10,
    value=1,
    key='min_score',
)
max_value = col_2.slider(
    'Select maximum educational score',
    min_value=1,
    max_value=10,
    value=10,
    key='max_score',
)
show_special = col_3.checkbox(
    'Show only clusters with undefined educational score',
    value=False,
)
    
# Load data based on slider values and checkbox status
ds = load_data(min_value, max_value, show_special)

# The topic list must exist before the selectbox that displays it. Sorting
# gives a deterministic option order (set iteration order is arbitrary);
# key=str keeps the sort safe if a category is not a string (e.g. None).
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)

# Keep only the clusters labelled with the chosen topic. When there are no
# categories the selectbox yields no selection and the filter result is empty,
# which the "No files found" branch below handles.
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    files = selected_cluster[index_cluster]["examples"]

    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty cluster would otherwise ask number_input for max_value=-1,
        # which is below min_value=0 and is rejected by Streamlit.
        with col_2:
            st.markdown("No files found in this cluster, choose another one.")
else:
    st.markdown("No files found, change the cluster.")