File size: 1,633 Bytes
cd37af8
 
 
 
 
 
 
 
 
 
9d3e113
 
 
 
 
 
 
 
cd37af8
55c9088
cd37af8
 
 
 
 
 
 
 
 
55c9088
cd37af8
 
 
 
 
 
 
 
 
 
 
 
 
 
56e420d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import streamlit as st
import pandas as pd
from filter_dataframe import filter_dataframe


@st.cache_data
def get_language_stats_df():
    """Load the statistics table stored in ``data/datasets_stats.parquet``.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    loaded frame instead of re-reading the file each time it is called.

    Returns:
        The parquet file's contents as a ``pandas.DataFrame``.
    """
    stats_path = "data/datasets_stats.parquet"
    return pd.read_parquet(stats_path)

# BibTeX entry for the MMS corpus paper itself. The leading backslash keeps
# the literal from starting with an empty line.
_MMS_CITATION = """\
@misc{augustyniak2023massively,
      title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark}, 
      author={Łukasz Augustyniak and Szymon Woźniak and Marcin Gruza and Piotr Gramacki and Krzysztof Rajda and Mikołaj Morzy and Tomasz Kajdanowicz},
      year={2023},
      eprint={2306.07902},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}"""

# Delimiter placed between exported entries: a "%" comment line made of
# 90 dashes, followed by one blank line.
CITATION_SEPARATOR = f"% {'-' * 90}\n\n"

def export_citations(df: pd.DataFrame) -> str:
    """Render a citation export for the MMS corpus and the datasets in ``df``.

    The MMS corpus entry always comes first. After it, every distinct
    citation found in ``df`` is emitted once, preceded by a
    ``% Datasets: ...`` comment naming the datasets that share it. Entries
    are ordered by that dataset list and delimited by ``CITATION_SEPARATOR``.

    Citation text is passed through without parsing, except that trailing
    whitespace is normalised to a single newline so the separator that
    follows always starts on its own line.

    Args:
        df: Frame with an ``original_dataset`` column (dataset names) and a
            ``citation`` column (citation text). Rows whose citation is
            missing are skipped.

    Returns:
        The complete export as one string.
    """
    # A row without a citation has nothing to export. groupby() would discard
    # it silently (dropna=True is its default); dropping here makes it explicit.
    cited = df[["original_dataset", "citation"]].dropna(subset=["citation"])

    # Strip trailing whitespace before grouping so citations that differ only
    # by a trailing newline/space collapse into a single entry.
    cited = cited.assign(citation=cited["citation"].astype(str).str.rstrip())

    grouped = (
        cited.groupby("citation")["original_dataset"]
        # dict.fromkeys de-duplicates names while keeping first-seen order,
        # so a dataset that occupies several rows is listed only once.
        .agg(lambda names: ", ".join(dict.fromkeys(names)))
        .reset_index(name="datasets")
        .sort_values(by="datasets")
    )

    # Each entry ends with exactly one newline; without it the separator would
    # be glued to the closing brace of the preceding citation.
    entries = [
        f"% Datasets: {datasets}\n{citation}\n"
        for datasets, citation in zip(grouped["datasets"], grouped["citation"])
    ]
    joined_entries = CITATION_SEPARATOR.join(entries)
    return f"% MMS corpus citation\n{_MMS_CITATION}\n{CITATION_SEPARATOR}{joined_entries}"


# --- Page body: runs top to bottom as a Streamlit script. ---

# Page metadata: title and icon for this page.
st.set_page_config(page_title="Dataset statistics", page_icon="📈")

# Visible page heading.
st.markdown("# Dataset statistics")

# Full statistics table loaded from the parquet file.
df = get_language_stats_df()

# filter_dataframe comes from the local filter_dataframe module; presumably it
# renders filter controls and returns the matching subset of rows — verify
# against that module. The result is what gets displayed and exported below.
df_filter = filter_dataframe(df)
st.dataframe(df_filter)


# Build the export only when the button is pressed, and only for the rows
# that are in the filtered frame.
if st.button("Export citations"):
    val = export_citations(df_filter)
    # Shown as a code block so the text keeps its line breaks.
    st.code(val, language="latex")