File size: 2,198 Bytes
8aac646 b19b634 8aac646 b19b634 8aac646 b19b634 8aac646 b19b634 8aac646 b19b634 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import streamlit as st
from datasets import load_dataset
import os
# Hugging Face access token read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Introductory text rendered right below the page title.
_INTRO_MARKDOWN = """
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).
Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics seperately. Hence the `Select Category Type` dropdown in our interface.
"""

# Page configuration is issued before any other Streamlit element.
st.set_page_config(layout="wide", page_title="FW Clusters inspection")
st.title("FW clusters inspection")
st.markdown(_INTRO_MARKDOWN)
@st.cache_data
def load_data(educational_topic):
    """Load the cluster dataset, optionally narrowed by educational verdict.

    Args:
        educational_topic: When equal to ``'Yes'`` or ``'No'``, only rows whose
            ``is_topic_educational`` field equals that value are kept. Any
            other value returns the full ``train`` split unfiltered.

    Returns:
        The (possibly filtered) dataset returned by ``load_dataset``.
    """
    full_split = load_dataset(
        "HuggingFaceTB/FW_clusters_under_afaik_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    # Guard clause: anything other than an explicit Yes/No means "no filter".
    if educational_topic not in ('Yes', 'No'):
        return full_split

    return full_split.filter(
        lambda row: row['is_topic_educational'] == educational_topic
    )
def _unique_categories(ds, category_type):
    """Return the distinct ``category`` values of rows with the given ``category_type``."""
    filtered_ds = ds.filter(lambda x: x['category_type'] == category_type)
    # Sort for a stable option order across runs (set iteration order of
    # strings is not stable between processes); key=str tolerates a None value.
    return sorted(set(filtered_ds['category']), key=str)


@st.cache_data
def _cached_unique_categories(_ds, category_type, fingerprint):
    """Cached variant of ``_unique_categories``.

    ``_ds`` is excluded from Streamlit's cache key because of its leading
    underscore, so ``fingerprint`` is what tells different datasets apart.
    """
    return _unique_categories(_ds, category_type)


def get_categories_by_type(_ds, category_type):
    """List the categories present in ``_ds`` for one category type.

    Args:
        _ds: Dataset with ``category_type`` and ``category`` columns.
        category_type: Value of ``category_type`` to keep.

    Returns:
        A sorted list of the distinct ``category`` values among matching rows.

    Note:
        Caching this function directly would key the cache on ``category_type``
        alone (underscore-prefixed parameters are not hashed by
        ``st.cache_data``), so a differently filtered dataset would get the
        categories of whichever dataset was seen first. The dataset
        fingerprint is therefore passed as an extra cache key.
    """
    # `_fingerprint` is the content hash kept by Hugging Face `datasets`;
    # when the object does not expose one, skip caching rather than risk
    # returning a stale result.
    fingerprint = getattr(_ds, "_fingerprint", None)
    if fingerprint is None:
        return _unique_categories(_ds, category_type)
    return _cached_unique_categories(_ds, category_type, fingerprint)
# --- Cluster browser: three selectors narrow the dataset to one category, ---
# --- then a numeric input picks which row's examples to display.         ---
st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)

with col_1:
    educational_topic = st.selectbox(
        'Are the topics deemed educational by the LLM?', ["Yes", "No"]
    )
    ds = load_data(educational_topic)

with col_2:
    category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
    selected_category_type = st.selectbox("Select Category Type", category_types)

with col_3:
    categories = get_categories_by_type(ds, selected_category_type)
    # With an empty option list the selectbox returns None.
    selected_category = st.selectbox("Select Category", categories)

selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Row count equals the length of the "examples" column, without
# materializing the whole column just to count it.
n_samples = len(selected_cluster)

if n_samples == 0:
    # Without this guard number_input would get max_value=-1 (< min_value)
    # and the lookup below would index an empty dataset.
    st.warning("No samples are available for the current selection.")
else:
    # Select sample index; the label shows the real inclusive upper bound.
    index_example = st.number_input(
        f"Index of a sample: 0 - {n_samples - 1}",
        min_value=0,
        max_value=n_samples - 1,
        value=0,
        step=1,
    )
    # Fetch only the chosen row instead of loading the full column.
    sample = selected_cluster[index_example]["examples"]
    st.markdown(sample)
|