import streamlit as st
import json
from datasets import load_dataset

st.set_page_config(page_title="Kaggle Notebooks inspection", layout="wide")

st.markdown("<h1 style='text-align: center; color: #00BFFF;'>Kaggle Notebooks inspection 🔍</h1>", unsafe_allow_html=True)

st.markdown("""
Here you can inspect Kaggle notebooks that were converted to Python scripts and deduplicated.

In the sidebar, you can choose to display:
- the **dataset description and title** when they exist; this information was already available in the Kaggle dataset
- only files for which we **retrieved extra information on the datasets being loaded** in the notebook using the Kaggle API (e.g., column names, types, summary...), which makes up about 8% of the dataset.
Multiple CSV files may be loaded in the same notebook; we use the delimiters `<start_description>` and `<end_description>` to separate them.
""")

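# Load the Kaggle scripts dataset and keep only notebooks matching the sidebar
# filters (minimum upvotes, minimum token count, and optionally only those with
# retrieved data info). Cached so the dataset is not reloaded on every rerun.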
@st.cache()
def load_data(upvote=0, size=0, has_data_info=False):
    ds = load_dataset("loubnabnl/kaggle_scripts_final_wdata", split="train")
    if has_data_info:
        ds = ds.filter(lambda x: x["has_data_info"])
    ds = ds.filter(lambda x: x["upvotes"] >= upvote and x["script_nb_tokens"] >= size)
    return ds

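# Build a markdown snippet with the notebook title and upvote count, plus the
# associated Kaggle dataset title/description when "dataset_versions" is present.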
def show_extra_info(e):
    kv = json.loads(e["kversion"])[0]
    try:
        data_v = json.loads(e["dataset_versions"])[0]
    except (TypeError, ValueError, IndexError):
        data_v = ""
    if data_v:
        data_title = data_v["Title"]
        description = data_v["Description"] if str(data_v["Description"]) != 'nan' else "<empty_description>"
        data_text = f"<br>**📚 Dataset description:**<br>Title: **{data_title}**, described as: {description}."
    else:
        data_text = ""
    
    text = f"The title of the notebook is: **{kv['Title']}** and it has **{kv['TotalVotes']} ⬆️ upvotes**.{data_text}"
    return text

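# Sidebar: filtering controls, display toggles, and sample selection.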
st.sidebar.header('Notebook Filters')
vote = st.sidebar.slider("Minimum notebook ⬆️ upvotes", min_value=0, max_value=100, step=1, value=0)
size = st.sidebar.slider("Length of the notebook in number of tokens", min_value=0, max_value=15_000, step=1000, value=0)

st.sidebar.header('Display Settings')
show_data_metadata = st.sidebar.checkbox("Show associated (not necessarily retrieved) data Title and Description", value=True)
show_only_files_with_data = st.sidebar.checkbox("Show only files for which we retrieved dataset information", value=False)

samples = load_data(vote, size, show_only_files_with_data)
st.sidebar.header('Sample Selection')
index_example = st.sidebar.number_input(f"Choose a sample from the existing {len(samples)} notebooks:", min_value=0, max_value=max(0, len(samples)-1), value=0, step=1)
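
# Guard against an empty result (e.g., when the filters exclude every notebook);
# otherwise samples[index_example] below would raise an IndexError.
if len(samples) == 0:
    st.warning("No notebooks match the current filters.")
    st.stop()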

if show_data_metadata:
    st.markdown(f'<h2 style="color:blue;">Kaggle dataset description:</h2>', unsafe_allow_html=True)
    st.markdown(show_extra_info(samples[index_example]), unsafe_allow_html=True)

if samples[index_example]["has_data_info"]:
    st.markdown(f'<h2 style="color:blue;">Retrieved data information:</h2>', unsafe_allow_html=True)
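    # "retreived_data_description" is kept as-is; it presumably matches the column name in the source dataset.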
    st.code(samples[index_example]["retreived_data_description"])
st.markdown(f'<h2 style="color:blue;">Notebook {index_example} converted to script:</h2>', unsafe_allow_html=True)
st.code(samples[index_example]["script"])