Mariusz Kossakowski committed on
Commit
b24d167
·
1 Parent(s): f10673c

Add two dataframes to the punctuation restoration dataset

Browse files
clarin_datasets/punctuation_restoration_dataset.py CHANGED
@@ -8,6 +8,7 @@ from clarin_datasets.dataset_to_show import DatasetToShow
8
  class PunctuationRestorationDataset(DatasetToShow):
9
  def __init__(self):
10
  DatasetToShow.__init__(self)
 
11
  self.dataset_name = "clarin-pl/2021-punctuation-restoration"
12
  self.description = """
13
  Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
@@ -41,6 +42,71 @@ class PunctuationRestorationDataset(DatasetToShow):
41
  self.data_dict = {
42
  subset: raw_dataset[subset].to_pandas() for subset in self.subsets
43
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def show_dataset(self):
46
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  class PunctuationRestorationDataset(DatasetToShow):
9
  def __init__(self):
10
  DatasetToShow.__init__(self)
11
+ self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/2021-punctuation-restoration"
13
  self.description = """
14
  Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
 
42
  self.data_dict = {
43
  subset: raw_dataset[subset].to_pandas() for subset in self.subsets
44
  }
45
+ self.data_dict_named = {}
46
+ for subset in self.subsets:
47
+ references = raw_dataset[subset]["tags"]
48
+ references_named = [
49
+ [
50
+ raw_dataset[subset].features["tags"].feature.names[label]
51
+ for label in labels
52
+ ]
53
+ for labels in references
54
+ ]
55
+ self.data_dict_named[subset] = pd.DataFrame(
56
+ {
57
+ "tokens": self.data_dict[subset]["tokens"],
58
+ "tags": references_named,
59
+ }
60
+ )
61
 
62
  def show_dataset(self):
63
+ header = st.container()
64
+ description = st.container()
65
+ dataframe_head = st.container()
66
+ class_distribution = st.container()
67
+
68
+ with header:
69
+ st.title(self.dataset_name)
70
+
71
+ with description:
72
+ st.header("Dataset description")
73
+ st.write(self.description)
74
+
75
+ full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
76
+
77
+ with dataframe_head:
78
+ st.header("First 10 observations of the chosen subset")
79
+ subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
80
+ df_to_show = self.data_dict[subset_to_show].head(10)
81
+ st.dataframe(df_to_show)
82
+ st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
83
+
84
+ class_distribution_dict = {}
85
+ for subset in self.subsets:
86
+ all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
87
+ all_labels_from_subset = [
88
+ x
89
+ for subarray in all_labels_from_subset
90
+ for x in subarray
91
+ if x != "O"
92
+ ]
93
+ all_labels_from_subset = pd.Series(all_labels_from_subset)
94
+ class_distribution_dict[subset] = (
95
+ all_labels_from_subset.value_counts(normalize=True)
96
+ .sort_index()
97
+ .reset_index()
98
+ .rename({"index": "class", 0: subset}, axis="columns")
99
+ )
100
+
101
+ class_distribution_df = pd.merge(
102
+ class_distribution_dict["train"],
103
+ class_distribution_dict["test"],
104
+ on="class",
105
+ )
106
+
107
+ with class_distribution:
108
+ st.header("Class distribution in each subset (without 'O')")
109
+ st.dataframe(class_distribution_df)
110
+ st.text_area(
111
+ label="LaTeX code", value=class_distribution_df.style.to_latex()
112
+ )