|
import streamlit as st |
|
from datasets import load_dataset |
|
import os |
|
|
|
# Hugging Face access token read from the environment; None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# The same text is used for the browser tab title and the page heading.
_PAGE_TITLE = "Synthetic textbooks inspection"

st.set_page_config(page_title=_PAGE_TITLE, layout="wide")
st.title(_PAGE_TITLE)
st.markdown("Inspection of synthetic textbooks generated by `Falcon-180B-chat`")
|
|
|
@st.cache_data()
def load_data(source="all"):
    """Load the train split of the synthetic textbooks subset.

    Args:
        source: Value to match against each row's "source" field. The
            default "all" returns the split unfiltered.

    Returns:
        The loaded dataset, restricted to rows whose "source" equals
        ``source`` unless ``source`` is "all".
    """
    dataset = load_dataset(
        "HuggingFaceTB/synthetic_textbooks_subset",
        split="train",
        use_auth_token=HF_TOKEN,
    )
    # Guard clause: nothing to filter when every source is requested.
    if source == "all":
        return dataset
    return dataset.filter(lambda row: row["source"] == source)
|
|
|
|
|
# Let the user choose which source to inspect; "all" skips filtering in
# load_data.
source = st.selectbox(
    "Data source",
    ["all", "wikihow", "khan_academy", "stanford_courses", "rw_wikihow", "rw_stanford"],
)
samples = load_data(source)
n_samples = len(samples)

if n_samples == 0:
    # With no rows, max_value below would be -1 (less than min_value=0),
    # which st.number_input rejects; report the situation and end this run.
    st.warning(f"No samples found for source: {source}")
    st.stop()

index = st.number_input(
    f"Index of the sample (out of {n_samples}):",
    min_value=0,
    max_value=n_samples - 1,
    value=0,
    step=1,
)
st.markdown(f"Displaying source: {source}")

# Fix: the original indexed with the undefined name `index_example`, raising
# NameError on every run; the widget value is bound to `index`.
sample = samples[index]

st.subheader("Prompt")
st.markdown(sample["prompt"])

st.subheader("Textbook")
st.markdown(sample["textbook"])