File size: 2,283 Bytes
c7cc986
ff82de9
0f6cf6a
30585ac
833a07b
c7cc986
 
 
b75c1aa
 
360bd52
c7cc986
360bd52
c7cc986
 
0f6cf6a
 
 
 
 
360bd52
c7cc986
30585ac
b75c1aa
c7cc986
 
b75c1aa
360bd52
 
 
30585ac
 
 
0f6cf6a
 
30585ac
0f6cf6a
 
48aa054
18adb0d
 
30585ac
18adb0d
 
833a07b
 
30585ac
 
 
18adb0d
c7cc986
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import base64  # Add base64 for encoding
import html
import json
import os

import streamlit as st
from datasets import load_dataset, concatenate_datasets

def load_and_combine_datasets():
    """Load the 'train' split of each source dataset and concatenate them.

    The datasets are loaded in a fixed order (python codes, streamlit
    issues, streamlit docs) and passed to ``concatenate_datasets`` in that
    same order.

    Returns:
        The object returned by ``concatenate_datasets`` for the three splits.
    """
    dataset_names = (
        'flytech/python-codes-25k',
        "andfanilo/streamlit-issues",
        "sai-lohith/streamlit_docs",
    )
    train_splits = [load_dataset(name, split='train') for name in dataset_names]
    return concatenate_datasets(train_splits)

def save_combined_dataset_as_jsonl(combined_dataset, file_path):
    """Write every example of *combined_dataset* to *file_path* as JSON Lines.

    Args:
        combined_dataset: Iterable of JSON-serializable examples.
        file_path: Destination path; the file is created or overwritten
            and written as UTF-8.

    Each example becomes one line of JSON (non-ASCII characters are kept
    verbatim rather than escaped), terminated by a newline.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in combined_dataset
        )

def _jsonl_file_name(raw_name):
    """Return a safe ``<name>.jsonl`` file name derived from user input."""
    # Keep only the final path component so input such as "../x" or an
    # absolute path cannot place the file outside the working directory.
    stem = os.path.basename(raw_name.strip().replace("\\", "/"))
    if stem in ("", ".", ".."):
        stem = "combined_dataset"
    return f"{stem}.jsonl"


def _jsonl_download_link(payload, file_name):
    """Return an HTML anchor embedding *payload* (bytes) as a base64 data URI."""
    b64 = base64.b64encode(payload).decode("ascii")
    # The file name comes from user input and ends up inside an HTML
    # attribute rendered with unsafe_allow_html, so it must be escaped.
    safe_name = html.escape(file_name, quote=True)
    return f'<a href="data:file/jsonl;base64,{b64}" download="{safe_name}">Download JSONL File</a>'


def main():
    """Render the Streamlit page for viewing, saving and downloading the dataset.

    The page shows the first ten rows of the combined dataset, asks for an
    output name, and offers two buttons: one writes the dataset as JSONL
    into the current working directory, the other writes it and then
    renders a download link with the file contents embedded as base64.
    """
    st.title("Combined Dataset Viewer and Downloader")

    # Load and combine datasets
    combined_dataset = load_and_combine_datasets()

    # Display a subset of the combined dataset
    st.write("Subset of Combined Dataset:", combined_dataset[:10])

    # Take input for output dataset name
    output_dataset_name = st.text_input("Enter output dataset name (without extension):", "combined_dataset")

    # Both buttons target the same sanitized file in the working directory.
    file_name = _jsonl_file_name(output_dataset_name)
    file_path = os.path.join(os.getcwd(), file_name)

    # Add option to save the combined dataset as JSONL
    if st.button("Save Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write(f"Combined dataset saved as JSONL file: {file_path}")

    # Add option to download the JSONL file
    if st.button("Download Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write("Download the combined dataset as JSONL file:")
        # The file is written as UTF-8, so it must be read back as UTF-8;
        # relying on the platform default breaks on non-UTF-8 locales.
        with open(file_path, "r", encoding="utf-8") as f:
            bytes_data = f.read().encode("utf-8")
        st.markdown(_jsonl_download_link(bytes_data, file_name), unsafe_allow_html=True)

# Entry point: call main() only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()