File size: 2,470 Bytes
461c45d
 
 
2be75e8
 
 
 
461c45d
2be75e8
 
 
 
 
 
 
 
461c45d
 
 
2be75e8
 
 
 
461c45d
 
2be75e8
 
 
461c45d
 
2be75e8
 
 
461c45d
 
2be75e8
461c45d
2be75e8
 
 
461c45d
2be75e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461c45d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
import json
import pandas as pd
from datasets import load_dataset

# Page chrome: the same title is used for the page config and the heading.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.title("The Stack data Inspection")

# Statistics table; the code below relies on its "language" and "extension"
# columns (further count columns are read later in the script).
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Group the extensions under their language, keeping the CSV row order both
# for the languages (first appearance) and for the extensions of each one.
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)


@st.cache()
def load_data(language, ext):
    """Load the inspection samples for one language/extension pair.

    Args:
        language: Language name, used as the first component of the data
            directory inside the dataset repository.
        ext: File extension, used as the second component of the data
            directory.

    Returns:
        The "train" split of ``loubnabnl/the-stack-inspection-data`` read
        from ``data/<language>/<ext>``, as returned by ``load_dataset``.
    """
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )

# Two narrow columns hold the selectors; the third, wider column is not used
# in this section.
language_col, ext_col, _unused_col = st.columns([1, 1, 4])
with language_col:
    chosen_language = st.selectbox(
        label="Select a programming language",
        options=all_languages,
        index=0,
    )
with ext_col:
    # The extension choices depend on the language picked just above.
    chosen_ext = st.selectbox(
        label="Select an extension",
        options=tags[chosen_language],
        index=0,
    )

samples = load_data(chosen_language, chosen_ext)
max_docs = len(samples)
# Attach each row's position as an "idx" column so that the positions of the
# non-lexable rows can still be read after filtering.
samples = samples.add_column("idx", range(max_docs))
indexes_not_lexed = samples.filter(lambda row: not row["lexable"])["idx"]

# info about extension
st.markdown("### Information about the extension:")
# Look the statistics up once, and match on the language as well as the
# extension: matching on the extension alone would pick the first row of a
# different language whenever two languages list the same extension.
ext_stats = df[(df["language"] == chosen_language) & (df["extension"] == chosen_ext)]
low_alphanum_count = ext_stats["low_alphanum_count"].values[0]
long_lines_count = ext_stats["long_lines_count"].values[0]
non_lexable_count = ext_stats["non_lexable_count"].values[0]
text = (
    f"Extension {chosen_ext} has {max_docs} files, "
    f"{low_alphanum_count} with very low alphanumeric ratio, "
    f"{long_lines_count} with very long lines, "
    f"and {non_lexable_count} are not lexable. "
    f"These files are at indexes: {indexes_not_lexed}."
)
st.markdown(text)

# Narrow column for the numeric picker; the second column is left unused here.
col_1, col_2 = st.columns([2, 4])
with col_1:
    # Valid positions run from 0 to max_docs - 1 inclusive.
    index_example = st.number_input(f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:", min_value=0, max_value=max_docs-1, value=0, step=1)

st.write(f"Example chosen:{index_example}")
# info about the chosen example
example = samples[index_example]
st.markdown("#### Information about the chosen example:")
# Each phrase must be driven by the flag it describes in the sentence below:
# text_alpha fills the "very low alphanumeric ratio" slot, so it reads
# 'low_alphanum'; text_lines fills the "very long lines" slot, so it reads
# 'long_lines'.
text_alpha = "**has**" if example['low_alphanum'] else "doesn't have"
text_lines = "**has**" if example['long_lines'] else "doesn't have"
text_lexer = "is" if example['lexable'] else "**isn't**"


st.markdown(f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
    {text_lines} very long lines,  and {text_lexer} lexable.")

st.markdown("#### File content:")

# The selected language name is passed to st.code as its `language` argument.
st.code(example["content"], language=chosen_language)