import os import pprint as pp from collections import OrderedDict, defaultdict import diff_viewer import pandas as pd import streamlit as st from datasets import load_from_disk DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("DATASET_DIR_PATH_BEFORE_CLEAN_SELECT") OPERATION_TYPES = [ "Applied filter", "Applied deduplication function", "Applied map function", ] MAX_LEN_DS_CHECKS = os.getenv("MAX_LEN_DS_CHECKS") def get_ds(ds_path): ds = load_from_disk(ds_path) return ds def next_idx(idx: int): idx += 1 return idx % len(st.session_state["ds"]) def previous_idx(idx: int): idx -= 1 return idx % len(st.session_state["ds"]) def on_click_next(): st.session_state["idx_1"] = next_idx(st.session_state["idx_1"]) st.session_state["idx_2"] = next_idx(st.session_state["idx_2"]) def on_click_previous(): st.session_state["idx_1"] = previous_idx(st.session_state["idx_1"]) st.session_state["idx_2"] = previous_idx(st.session_state["idx_2"]) def on_ds_change(ds_path): st.session_state["ds"] = get_ds(ds_path) st.session_state["idx_1"] = 0 st.session_state["idx_2"] = 1 if len(st.session_state["ds"]) > 1 else 0 st.session_state["ds_name"] = ds_path st.session_state["ds_max_docs"] = len(st.session_state["ds"]) def get_log_stats_df(raw_log): data = OrderedDict( { "Order": [], "Name": [], "Initial number of samples": [], "Final number of samples": [], "Initial size in bytes": [], "Final size in bytes": [], } ) metric_dict = defaultdict(lambda: {}) order = 0 for line in raw_log.split("\n"): for metric_name in list(data.keys()) + OPERATION_TYPES: if metric_name == "Name" or metric_name == "Order": continue if metric_name not in line: continue if ( metric_name == "Removed percentage" and "Removed percentage in bytes" in line ): continue if ( metric_name == "Deduplicated percentage" and "Deduplicated percentage in bytes" in line ): continue value = line.split(metric_name)[1].split(" ")[1] if metric_name in OPERATION_TYPES: operation_name = value metric_dict[operation_name]["Order"] = order order += 1 continue assert ( metric_name not in metric_dict[operation_name] ), f"operation_name: {operation_name}\n\nvalue: {value}\n\nmetric_dict: {pp.pformat(metric_dict)} \n\nmetric_name: {metric_name} \n\nline: {line}" metric_dict[operation_name][metric_name] = value for name, data_dict in metric_dict.items(): for metric_name in data.keys(): if metric_name == "Name": data[metric_name].append(name) continue data[metric_name].append(data_dict[metric_name]) df = pd.DataFrame(data) df.rename( { "Initial size in bytes": "Initial size (GB)", "Final size in bytes": "Final size (GB)", }, axis=1, inplace=True, ) df["% samples removed"] = ( ( df["Initial number of samples"].astype(float) - df["Final number of samples"].astype(float) ) / df["Initial number of samples"].astype(float) * 100 ) df["Size (GB) % removed"] = ( (df["Initial size (GB)"].astype(float) - df["Final size (GB)"].astype(float)) / df["Initial size (GB)"].astype(float) * 100 ) return df def get_logs_stats(log_path): with open(log_path) as f: raw_log = f.read() try: df = get_log_stats_df(raw_log) st.dataframe(df) except Exception as e: st.write(e) st.write("Subset of the logs:") subcontent = [ line for line in raw_log.split("\n") if "INFO - __main__" in line and "Examples of" not in line and "Examples n°" not in line ] st.write(subcontent) def meta_component(idx_key: str = "idx_1"): if "meta" not in st.session_state["ds"][st.session_state[idx_key]]: return with st.expander("See meta field of the example"): meta = st.session_state["ds"][st.session_state["idx_1"]]["meta"] st.write(meta) def filter_page(): index_example = st.number_input("Index of the chosen example", min_value=0, max_value=st.session_state["ds_max_docs"] -1, value=0, step=1) st.session_state["idx_1"] = index_example st.session_state["idx_2"] = next_idx(index_example) idx_1 = st.session_state["idx_1"] idx_2 = st.session_state["idx_2"] text_1 = st.session_state["ds"][idx_1]["text"] text_2 = st.session_state["ds"][idx_2]["text"] st.markdown( f"