File size: 3,153 Bytes
2654a92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
def getIndex(inputData):
    '''
    Return the index of a container node.

    - For a dict (or dict subclass such as OrderedDict): the
      list of its keys, in iteration order.
    - For a list (or list subclass): range(len(inputData)).
    - For anything else (str, None, numbers, ...): 0.

    Every "nothing to iterate" result is falsy (empty key
    list, empty range, or 0), so callers can use the return
    value directly as a truth test.
    '''
    # isinstance instead of an exact type comparison so that
    # mapping/sequence subclasses are not mistaken for leaves.
    if isinstance(inputData, dict):
        return list(inputData)
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0

def getSonNodes(nodeData, nodeName):
    '''
    Return the direct children of a node as a list of
    (child, name) tuples.

    - For a list (or list subclass), every element is a
      child and inherits the parent's nodeName.
    - For a dict (or dict subclass such as OrderedDict),
      every value is a child and is named by its own key.
    - For any other value there are no children and an
      empty list is returned.
    '''
    # isinstance instead of an exact type comparison so that
    # container subclasses are traversed rather than dropped.
    if isinstance(nodeData, list):
        return [(child, nodeName) for child in nodeData]
    if isinstance(nodeData, dict):
        return [(child, childName) for childName, child in nodeData.items()]
    return []

def docRead(sonData, sonName):
    '''
    Render a single node as text.

    - A node whose type is exactly str is returned with a
      newline appended.
    - Any other node is passed, together with sonName, to
      recRead, and recRead's result is returned.
    '''
    # Guard clause: everything that is not a plain string is
    # delegated to the recursive reader.
    if type(sonData) is not str:
        return recRead(sonData, sonName)
    return sonData + '\n'

def recRead(data, key):
    '''
    Walk a nested node and return the text collected from it.

    - If getIndex(data) is falsy, an empty string is returned.
    - Otherwise each (child, name) pair from getSonNodes is
      visited in order:
        * a child named 'abstract' first contributes the
          line 'ABSTRACT';
        * children named in the skip list contribute nothing
          and are not descended into;
        * children named in the text list are rendered with
          docRead;
        * all other children are rendered by a recursive
          call to recRead.
    - The contributions are concatenated in visiting order.
    '''
    skipNames = ('ref', 'figure', 'idno', 'listBibl', 'note')
    textNames = ('head', '#text', 'p', 'surname')

    if not getIndex(data):
        return ''

    pieces = []
    for child, name in getSonNodes(data, key):
        if name == 'abstract':
            pieces.append('ABSTRACT\n')
        if name in skipNames:
            continue
        reader = docRead if name in textNames else recRead
        pieces.append(reader(child, name))
    return ''.join(pieces)

def splitBody(article):
    '''
    Split an article string into 'head', 'body' and 'tail'
    parts.

    The article is split on newlines. Lines are collected
    into the current part, starting with 'head'. A line that
    equals 'ABSTRACT' or 'REFERENCE' (case-insensitively)
    advances to the next part and is itself stored as the
    first line of that part.

    Returns a dict mapping part name to its list of lines.
    A part that received no lines has no key in the result.
    '''
    parts = ['head', 'body', 'tail']
    lastPart = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        if block.upper() in ('ABSTRACT', 'REFERENCE'):
            # Clamp at the final part: a third (or later)
            # marker line used to push the pointer past the
            # end of parts and raise IndexError.
            pointer = min(pointer + 1, lastPart)
        ans.setdefault(parts[pointer], []).append(block)
    return ans

def bodyMerge(article):
    '''
    Group the lines of article['body'] under their headings.

    A line counts as a heading when it is non-empty, shorter
    than 32 characters and made up only of ASCII letters,
    digits and '.'. Every other line is appended to the list
    of the most recent heading.

    Lines that appear before the first heading are collected
    under the empty-string key '' (which can never be a real
    heading, since headings are non-empty).

    Returns a tuple (ans, keyList):
    - ans maps each heading to the list of lines below it.
      A heading that occurs again starts a fresh list,
      replacing the earlier one.
    - keyList holds the headings in order of appearance,
      including repeats.
    '''
    # Built once; it does not change between lines.
    allowed = set('0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
    ans = dict()
    keyList = []
    key = None
    for block in article['body']:
        if 0 < len(block) < 32 and set(block) <= allowed:
            key = block
            keyList.append(key)
            ans[key] = []
        else:
            if key is None:
                # No heading seen yet: previously this raised
                # UnboundLocalError; keep the text under ''.
                key = ''
                keyList.append(key)
                ans[key] = []
            ans[key].append(block)
    return ans, keyList