# Streamlit app for browsing samples of the
# "HuggingFaceTB/synthetic_textbooks_subset" dataset: pick a data source,
# pick a sample index, and view that sample's prompt and generated textbook.
import os

import streamlit as st
from datasets import load_dataset

# Hugging Face access token read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

st.set_page_config(page_title="Synthetic textbooks inspection", layout="wide")
st.title("Synthetic textbooks inspection")
st.markdown("Inspection of synthetic textbooks generated by `Falcon-180B-chat`")


@st.cache_data()
def load_data(source="all"):
    """Load the train split of the textbooks dataset, optionally filtered.

    Args:
        source: Value of the ``source`` column to keep. The default ``"all"``
            disables filtering and returns the whole split.

    Returns:
        The loaded dataset, restricted to rows whose ``source`` field equals
        ``source`` unless ``source`` is ``"all"``.
    """
    # NOTE(review): newer `datasets` releases deprecate `use_auth_token` in
    # favour of `token` -- confirm against the pinned version before changing.
    ds = load_dataset(
        "HuggingFaceTB/synthetic_textbooks_subset",
        split="train",
        use_auth_token=HF_TOKEN,
    )
    if source != "all":
        ds = ds.filter(lambda x: x["source"] == source)
    return ds


source = st.selectbox(
    "Data source",
    ['all', 'wikihow', 'khan_academy', 'stanford_courses', 'rw_wikihow', 'rw_stanford'],
)
samples = load_data(source)
n_samples = len(samples)

# With zero rows the index widget below would get max_value=-1 < min_value=0,
# which Streamlit rejects; report the empty selection and end this run instead.
if n_samples == 0:
    st.warning(f"No samples found for source: {source}")
    st.stop()

index = st.number_input(
    f"Index of the sample (out of {n_samples}):",
    min_value=0,
    max_value=n_samples - 1,
    value=0,
    step=1,
)
st.markdown(f"Displaying source: {source}")

# Display the sample chosen through the widget. The original indexed with an
# undefined name (`index_example`), which raised NameError on every run.
st.subheader("Prompt")
st.markdown(samples[index]["prompt"])
st.subheader("Textbook")
st.markdown(samples[index]['textbook'])