File size: 2,460 Bytes
8aac646 b19b634 64b11d2 b19b634 64b11d2 b19b634 64b11d2 b19b634 85a8c20 b19b634 85a8c20 64b11d2 85a8c20 8aac646 b19b634 85a8c20 2865184 dab4dfa 2865184 dab4dfa 85a8c20 18967bf 64b11d2 b19b634 8aac646 b19b634 c6c5724 8aac646 c6c5724 2b533cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import streamlit as st
from datasets import load_dataset
import os
# Optional Hugging Face access token taken from the environment; None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# Page-level configuration is issued before any other Streamlit element.
st.set_page_config(layout="wide", page_title="FW Clusters inspection")
st.title("FW clusters inspection (free topics)")

# Introductory blurb rendered as markdown right under the title.
intro_text = """
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.
Additionally, the model was tasked with finding the topic of each cluster.
"""
st.markdown(intro_text)
@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the cluster dataset and keep the clusters matching the score filter.

    Args:
        min_score: Lowest educational score (inclusive) to keep.
        max_score: Highest educational score (inclusive) to keep.
        show_special: When true, also keep clusters whose
            ``educational_score`` cannot be converted to an integer
            (e.g. ``None`` or ``''``).

    Returns:
        The filtered ``train`` split of
        ``HuggingFaceTB/FW_clusters_free_topics``.
    """
    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    def filter_func(x):
        try:
            score = int(x['educational_score'])
        except (ValueError, TypeError):
            # Undefined score (None, '' or otherwise non-numeric): keep the
            # cluster only when the user asked for those.
            return show_special
        # A defined score is governed by the selected range alone; the
        # show_special flag must not let out-of-range clusters through.
        return min_score <= score <= max_score

    return ds.filter(filter_func)
# --- Cluster browser -------------------------------------------------------
# Filter widgets on one row, then a topic selector, then a cluster/example
# picker that renders the chosen example as markdown.
st.subheader("Cluster information")

# Three widgets need three columns (two sliders and one checkbox).
col_1, col_2, col_3 = st.columns(3)
with col_1:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_2:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')
with col_3:
    show_special = st.checkbox('Show clusters with undefined educational score', False)

# Load data based on slider values and checkbox status
ds = load_data(min_value, max_value, show_special)

# The topic list has to exist before the selectbox that offers it. Sorting
# (by string form, so mixed/None values cannot break the comparison) keeps the
# option order stable instead of depending on set iteration order.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )
    files = selected_cluster[index_cluster]["examples"]
    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )
        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty example list would give number_input a max_value of -1,
        # below its min_value of 0; show the fallback message instead.
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")