Spaces:
Sleeping
Sleeping
File size: 2,624 Bytes
c7cc986 ff82de9 0f6cf6a 30585ac 08e2876 24b4364 c7cc986 b75c1aa 360bd52 c7cc986 360bd52 c7cc986 24b4364 0f6cf6a 24b4364 0f6cf6a 360bd52 c7cc986 30585ac b75c1aa c7cc986 b75c1aa 360bd52 30585ac 0f6cf6a 30585ac 0f6cf6a 48aa054 18adb0d 30585ac 18adb0d 24b4364 30585ac 08e2876 18adb0d c7cc986 24b4364 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import streamlit as st
from datasets import load_dataset, concatenate_datasets
import json
import os
import base64
from datetime import datetime
def load_and_combine_datasets():
    """Load the three source datasets and return them as one dataset.

    The ``train`` split of each dataset is loaded with ``load_dataset`` and
    the results are passed, in the order listed below, to
    ``concatenate_datasets``.

    Returns:
        The object returned by ``concatenate_datasets`` for the three splits.
    """
    # Order matters: it is the order of the rows in the combined dataset.
    source_ids = (
        'flytech/python-codes-25k',
        "andfanilo/streamlit-issues",
        "sai-lohith/streamlit_docs",
    )
    train_splits = [load_dataset(source_id, split='train') for source_id in source_ids]
    return concatenate_datasets(train_splits)
def datetime_serializer(o):
    """Serialize ``datetime`` objects for use as a ``json.dump`` ``default`` hook.

    Args:
        o: An object the JSON encoder could not serialize on its own.

    Returns:
        The datetime formatted as ``'YYYY-MM-DD HH:MM:SS'``.

    Raises:
        TypeError: If ``o`` is not a ``datetime``. Falling through and
            returning ``None`` would make the encoder write ``null`` for every
            unsupported value, silently dropping data.
    """
    if isinstance(o, datetime):
        return o.strftime('%Y-%m-%d %H:%M:%S')
    # Follow the json ``default`` contract: signal unsupported types explicitly.
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
def save_combined_dataset_as_jsonl(combined_dataset, file_path):
    """Write every example of ``combined_dataset`` to ``file_path`` as JSON Lines.

    Args:
        combined_dataset: An iterable of JSON-serializable examples.
        file_path: Destination path; the file is created or overwritten and
            written as UTF-8.

    Each example becomes one line of JSON. Non-ASCII characters are written
    as-is, and values the encoder cannot handle are passed to
    ``datetime_serializer``.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False, default=datetime_serializer) + '\n'
            for record in combined_dataset
        )
def main():
    """Render the Streamlit page for viewing, saving and downloading the dataset.

    The page loads the combined dataset, shows its first ten rows, asks for an
    output name, and offers two buttons: one writes the dataset to a ``.jsonl``
    file in the current working directory, the other writes the same file and
    then exposes it through an HTML data link and a Streamlit download button.

    The output name comes from a free-text input, so it is reduced to a bare
    file name before being used in a filesystem path, and it is HTML-escaped
    before being placed in the download link.
    """
    import html  # local import: only needed to escape the download link

    st.title("Combined Dataset Viewer and Downloader")

    # Load and combine datasets
    combined_dataset = load_and_combine_datasets()

    # Display a subset of the combined dataset
    st.write("Subset of Combined Dataset:", combined_dataset[:10])

    # Take input for output dataset name
    output_dataset_name = st.text_input("Enter output dataset name (without extension):", "combined_dataset")

    # The name is user-controlled: drop any directory components (both slash
    # styles) so the file cannot be written outside the working directory,
    # and fall back to the default when nothing usable remains.
    normalized_name = (output_dataset_name or "").replace("\\", "/").strip()
    safe_name = os.path.basename(normalized_name) or "combined_dataset"
    file_name = f"{safe_name}.jsonl"
    file_path = os.path.join(os.getcwd(), file_name)

    # Add option to save the combined dataset as JSONL
    if st.button("Save Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write(f"Combined dataset saved as JSONL file: {file_path}")

    # Add option to download the JSONL file
    if st.button("Download Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write("Download the combined dataset as JSONL file:")
        with open(file_path, "rb") as f:
            bytes_data = f.read()
        b64 = base64.b64encode(bytes_data).decode()
        # Escape the file name: the markup below is rendered as raw HTML, so
        # an unescaped quote in the name could break out of the attribute.
        download_attr = html.escape(file_name, quote=True)
        href = f'<a href="data:file/jsonl;base64,{b64}" download="{download_attr}">Download JSONL File</a>'
        st.markdown(href, unsafe_allow_html=True)

        # Provide download button
        st.download_button(label="Click to Download",
                           data=bytes_data,
                           file_name=file_name,
                           mime="application/jsonl")
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()
|