File size: 4,173 Bytes
519b419
8c99444
519b419
 
 
 
 
 
 
 
 
 
 
 
 
8c99444
519b419
 
8c99444
 
 
519b419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c99444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519b419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding:utf-8 -*-
import zipfile
from io import BytesIO
import re
from zipfile import ZipFile
import os
from pathlib import Path


from cassis import load_typesystem, load_cas_from_xmi

from n4a_analytics_lib.st_components import st_pb


class Project:
    """Load an annotation project archive and collect its named entities.

    On construction the archive is scanned for a ``TypeSystem.xml`` file and
    for XMI documents; every XMI document is then parsed with cassis and the
    covered text and ``value`` of each ``NamedEntity`` annotation is stored
    in ``annotations``.

    Attributes:
        zip_project: the zip container (a path / file-like object, or an
            already opened ``zipfile.ZipFile``) holding XMI and typesystem.
        remote: flag selecting, together with ``type``, the direct-read path
            for an already opened ``ZipFile``.
        type: ``'iaa'`` or ``'global'``.
        documents: source name recorded for each XMI document.
        xmi_documents: decoded (UTF-8) XMI content, parallel to ``documents``.
        typesystem: typesystem loaded from ``TypeSystem.xml`` (``None`` when
            no typesystem was loaded).
        annotators: for ``'iaa'`` projects, the XMI member names without
            their extension.
        annotations: mapping of the form
            ``{"Filename.xmi": {"mentions": [...], "labels": [...]}, ...}``.
    """

    # UIMA type name of the annotations collected by ``extract_ne``.
    _NE_TYPE = 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity'
    # Archive member that holds the typesystem definition.
    _TYPESYSTEM_NAME = 'TypeSystem.xml'
    # Archive members of interest for each supported project type.
    # Raw strings keep ``\.`` a regex escape instead of an invalid string escape.
    _MEMBER_PATTERNS = {
        "global": re.compile(r'.*curation/.*/(?!\._).*zip$'),
        "iaa": re.compile(r'.*xm[il]$'),
    }

    def __init__(self, zip_project, type, remote):
        """Read the archive and extract the named entities.

        Args:
            zip_project: zip container that contains XMI and typesystem.
            type: ``'iaa'`` or ``'global'``.
            remote: when truthy, ``zip_project`` is a ``zipfile.ZipFile`` and
                ``type`` is ``'global'``, the archive is read directly as a
                flat collection of ``.xmi`` members plus ``TypeSystem.xml``.

        Raises:
            ValueError: if the archive has to be scanned by pattern and
                ``type`` is neither ``'global'`` nor ``'iaa'``.
        """
        # zip container that contains XMI and typesystem
        self.zip_project = zip_project

        self.remote = remote

        # 'iaa' or 'global'
        self.type = type

        # store source filename
        self.documents = []
        # store XMI representation
        self.xmi_documents = []
        # store typesystem file
        self.typesystem = None

        # set annotators
        self.annotators = []
        # set annotations, keyed by source name:
        # {"Filename.xmi": {"mentions": [], "labels": []}, ...}
        self.annotations = {}

        if isinstance(self.zip_project, zipfile.ZipFile) and self.remote and self.type == "global":
            self._load_remote_global()
        else:
            self._load_local_archive()

        # NOTE(review): extract_ne is a generator wrapped by ``st_pb``; the
        # decorator presumably drives it (progress bar) — confirm.
        self.extract_ne()

    def _load_remote_global(self):
        """Collect typesystem and XMI members from the already opened zip."""
        archive = self.zip_project
        for fp in archive.namelist():
            # Loaded lazily so that an empty archive does not raise.
            if self.typesystem is None:
                self.typesystem = load_typesystem(BytesIO(archive.read(self._TYPESYSTEM_NAME)))
            if fp.endswith('.xmi'):
                self.documents.append(fp)
                # ZipFile.read opens and closes the member handle itself.
                self.xmi_documents.append(archive.read(fp).decode("utf-8"))

    def _load_local_archive(self):
        """Open ``zip_project`` and dispatch every matching member by type."""
        with ZipFile(self.zip_project) as project_zip:
            pattern = self._MEMBER_PATTERNS.get(self.type)
            if pattern is None:
                raise ValueError(
                    f"Unsupported project type {self.type!r}; expected 'global' or 'iaa'"
                )
            for fp in project_zip.namelist():
                if not pattern.match(fp):
                    continue
                if self.type == "global":
                    self._load_curation_zip(project_zip, fp)
                else:
                    self._load_iaa_member(project_zip, fp)

    def _load_curation_zip(self, project_zip, fp):
        """Read the XMI members of one nested curation zip (``'global'``)."""
        with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
            if self.typesystem is None:
                self.typesystem = load_typesystem(
                    BytesIO(annotation_zip.read(self._TYPESYSTEM_NAME))
                )
            # The source name is the folder that contains the nested zip.
            source_name = Path(fp).parent.name
            for member in annotation_zip.namelist():
                if member.endswith('.xmi'):
                    # store source filename
                    self.documents.append(source_name)
                    # store XMI representation
                    self.xmi_documents.append(annotation_zip.read(member).decode("utf-8"))

    def _load_iaa_member(self, project_zip, fp):
        """Handle one ``.xml`` / ``.xmi`` member of an ``'iaa'`` archive."""
        if self.typesystem is None and fp.endswith('.xml'):
            # The first .xml member seen triggers loading of TypeSystem.xml
            # from the archive root, whatever that member's own name is.
            self.typesystem = load_typesystem(BytesIO(project_zip.read(self._TYPESYSTEM_NAME)))
        elif fp.endswith('.xmi'):
            # store source filename
            self.documents.append(fp)
            # set annotators
            self.annotators.append(os.path.splitext(fp)[0])
            # store XMI representation
            self.xmi_documents.append(project_zip.read(fp).decode("utf-8"))

    @st_pb
    def extract_ne(self):
        """Parse every XMI document and collect its named entities.

        For each document, ``annotations[src]`` receives the covered text
        (``"mentions"``) and the ``value`` (``"labels"``) of every
        ``NamedEntity`` annotation.

        Yields:
            ``(progress, src, doc_flag)`` where ``progress`` is the fraction
            of documents processed so far, ``src`` the document's source name
            and ``doc_flag`` is ``False`` when processing the document raised.
        """
        total = len(self.documents)
        pairs = zip(self.xmi_documents, self.documents)
        for count, (xmi, src) in enumerate(pairs, start=1):
            doc_flag = True
            try:
                cas = load_cas_from_xmi(xmi, typesystem=self.typesystem)
                self.annotations[src] = {
                    "mentions": [],
                    "labels": [],
                }
                for ne in cas.select(self._NE_TYPE):
                    self.annotations[src]["mentions"].append(ne.get_covered_text())
                    self.annotations[src]["labels"].append(ne.value)
            except Exception:
                # Best effort: a document that cannot be processed is reported
                # through doc_flag. KeyboardInterrupt / SystemExit are not
                # swallowed.
                doc_flag = False

            yield count / total, src, doc_flag