Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,978 Bytes
a431d31 fcb283e 26cde09 a431d31 9205f25 b0c1665 2610e3a fcb283e a431d31 7666b36 914615f a431d31 a166ec2 2cce3db c4eee53 a431d31 26cde09 2cce3db fcb283e a431d31 24e8ad5 26cde09 79a7769 b0c1665 c4eee53 5ee220a 2610e3a b0c1665 2610e3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
from datasets import load_dataset
import streamlit as st
from huggingface_hub import hf_hub_download
import gzip
import json
import time
@st.experimental_memo
def load_all_usernames():
    """Fetch and parse the username-to-repository index for The Stack.

    Downloads ``username_to_repo.json.gz`` from the
    ``bigcode/the-stack-username-to-repo`` dataset repository on the
    Hugging Face Hub, decompresses it and parses it as UTF-8 JSON.

    The function is wrapped in ``st.experimental_memo`` so the download
    and parse are not repeated on every script rerun.

    Returns:
        The parsed JSON document. Callers index it by username and
        iterate the value, so it is presumably a dict mapping a GitHub
        username to a list of repository names (verify against the
        dataset).
    """
    index_path = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename="username_to_repo.json.gz",
        repo_type="dataset",
    )
    # Text mode lets gzip handle the UTF-8 decoding before JSON parsing.
    with gzip.open(index_path, "rt", encoding="utf-8") as handle:
        return json.load(handle)
# --- Page header and introductory copy -------------------------------------
st.image("./banner.png", use_column_width=True)
st.markdown("**_The Stack is an open governance interface between the AI community and the open source community._**")
st.title("Am I in The Stack?")
st.markdown("As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 3.1 TB dataset of permissively licensed source code in 30 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.")
st.markdown("This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).")

# --- Lookup ------------------------------------------------------------------
# Index keyed by username; each value is the collection of that user's repos.
usernames = load_all_usernames()

# Strip surrounding whitespace so a pasted " name " still matches its key.
username = st.text_input("Your GitHub Username:").strip()

# Create the button before testing `username`. Calling st.button() on the
# right-hand side of `username or ...` would be skipped whenever the text box
# is non-empty, making the button vanish from the page after the first entry.
check_clicked = st.button("Check!")

if username:
    repos = usernames.get(username)
    if repos is not None:
        repo_word = "repository" if len(repos) == 1 else "repositories"
        st.markdown(f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:")
        for repo in repos:
            st.markdown(f"`{repo}`")
    else:
        st.markdown("**No**, your code is not in The Stack.")
elif check_clicked:
    # An empty username is not a real lookup; ask for input instead of
    # reporting a misleading "No".
    st.warning("Please enter a GitHub username.")