Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Files Community

Mariusz Kossakowski committed on Aug 31, 2022

Commit

b24d167

1 Parent(s): f10673c

Add two dataframes to the punctuation restoration dataset

Browse files

Files changed (1) hide show

clarin_datasets/punctuation_restoration_dataset.py +67 -1

clarin_datasets/punctuation_restoration_dataset.py CHANGED Viewed

@@ -8,6 +8,7 @@ from clarin_datasets.dataset_to_show import DatasetToShow
 class PunctuationRestorationDataset(DatasetToShow):
     def __init__(self):
         DatasetToShow.__init__(self)
         self.dataset_name = "clarin-pl/2021-punctuation-restoration"
         self.description = """
         Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
@@ -41,6 +42,71 @@ class PunctuationRestorationDataset(DatasetToShow):
         self.data_dict = {
             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
         }
     def show_dataset(self):
-        pass

 class PunctuationRestorationDataset(DatasetToShow):
     def __init__(self):
         DatasetToShow.__init__(self)
+        self.data_dict_named = None
         self.dataset_name = "clarin-pl/2021-punctuation-restoration"
         self.description = """
         Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
         self.data_dict = {
             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
         }
+        self.data_dict_named = {}
+        for subset in self.subsets:
+            references = raw_dataset[subset]["tags"]
+            references_named = [
+                [
+                    raw_dataset[subset].features["tags"].feature.names[label]
+                    for label in labels
+                ]
+                for labels in references
+            ]
+            self.data_dict_named[subset] = pd.DataFrame(
+                {
+                    "tokens": self.data_dict[subset]["tokens"],
+                    "tags": references_named,
+                }
+            )
     def show_dataset(self):
+        header = st.container()
+        description = st.container()
+        dataframe_head = st.container()
+        class_distribution = st.container()
+        with header:
+            st.title(self.dataset_name)
+        with description:
+            st.header("Dataset description")
+            st.write(self.description)
+        full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
+        with dataframe_head:
+            st.header("First 10 observations of the chosen subset")
+            subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
+            df_to_show = self.data_dict[subset_to_show].head(10)
+            st.dataframe(df_to_show)
+            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
+        class_distribution_dict = {}
+        for subset in self.subsets:
+            all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
+            all_labels_from_subset = [
+                x
+                for subarray in all_labels_from_subset
+                for x in subarray
+                if x != "O"
+            ]
+            all_labels_from_subset = pd.Series(all_labels_from_subset)
+            class_distribution_dict[subset] = (
+                all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
+            )
+        class_distribution_df = pd.merge(
+            class_distribution_dict["train"],
+            class_distribution_dict["test"],
+            on="class",
+        )
+        with class_distribution:
+            st.header("Class distribution in each subset (without 'O')")
+            st.dataframe(class_distribution_df)
+            st.text_area(
+                label="LaTeX code", value=class_distribution_df.style.to_latex()
+            )