File size: 2,252 Bytes
9f7f573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90966f7
9f7f573
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from datasets import load_dataset
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow


class KpwrNerDataset(DatasetToShow):
    """Streamlit view of the ``clarin-pl/kpwr-ner`` dataset.

    Holds the dataset identifier and a human-readable description, loads
    the data through ``datasets.load_dataset`` and renders a title plus the
    description on a Streamlit page.
    """

    def __init__(self) -> None:
        """Initialise the base class, then set the dataset name and description."""
        # super() instead of naming the parent explicitly, so the call keeps
        # working if the base class is renamed or the hierarchy changes.
        super().__init__()
        self.dataset_name = "clarin-pl/kpwr-ner"
        # Text shown verbatim by show_dataset() via st.write().
        self.description = """
        KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka 
        Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories 
        of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 (
        originally 120). During corpus creation, texts were annotated by humans from various sources, covering many 
        domains and genres. 

        Tasks (input, output and metrics)
        Named entity recognition (NER) - tagging entities in text with their corresponding type.
        
        Input ('tokens' column): sequence of tokens
        
        Output ('ner' column): sequence of predicted tokens’ classes in BIO notation (82 possible classes, described 
        in detail in the annotation guidelines) 
        
        example:
        
        [‘Roboty’, ‘mają’, ‘kilkanaście’, ‘lat’, ‘i’, ‘pochodzą’, ‘z’, ‘USA’, ‘,’, ‘Wysokie’, ‘napięcie’, ‘jest’, 
        ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, 
        ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, 
        ‘B-nam_loc_gpe_country’, ‘O’]
        """

    def load_data(self) -> None:
        """Load the dataset and store one pandas DataFrame per subset.

        Calls ``load_dataset(self.dataset_name)`` and fills ``self.data_dict``
        with ``{subset: DataFrame}`` for every name in ``self.subsets``.
        """
        raw_dataset = load_dataset(self.dataset_name)
        # NOTE(review): self.subsets is not set in this class; presumably it is
        # provided by DatasetToShow -- confirm against the base class.
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self) -> None:
        """Render the dataset name as a title followed by its description."""
        header = st.container()
        description = st.container()
        # NOTE(review): this container is created but never populated in this
        # method; presumably reserved for a data preview. The call is kept so
        # the page layout stays unchanged -- confirm or remove.
        dataframe_head = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)