com3dian committed on
Commit
2654a92
·
verified ·
1 Parent(s): 2b3c5ca

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +106 -0
utils.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def getIndex(inputData):
    '''
    Return the indices that can be used to subscript inputData.

    Returns:
    - for a dict (or dict subclass such as OrderedDict): a list
      of its keys.
    - for a list (or list subclass): range(len(inputData)).
    - for anything else: 0.

    The result is falsy for empty containers and for non-container
    values, so it can double as a "has children" test.
    '''
    # isinstance (rather than an exact type comparison) so that mapping
    # and sequence subclasses are indexed instead of being treated as leaves.
    if isinstance(inputData, dict):
        return list(inputData)
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0
11
+
12
def getSonNodes(nodeData, nodeName):
    '''
    Given a nodeData object and a nodeName string, return a list
    of (child, name) tuples for the children of the node.

    Returns:
    - for a list (or list subclass): every element paired with
      nodeName, since list items carry no name of their own.
    - for a dict (or dict subclass): every value paired with its key.
    - for anything else: an empty list.
    '''
    # Iterate the container directly instead of looking children up by
    # index; isinstance also accepts subclasses such as OrderedDict.
    if isinstance(nodeData, list):
        return [(child, nodeName) for child in nodeData]
    if isinstance(nodeData, dict):
        return [(child, name) for name, child in nodeData.items()]
    return []
28
+
29
def docRead(sonData, sonName):
    '''
    Turn a keyword node into text.

    Returns:
    - sonData followed by a newline when sonData is exactly a str.
    - otherwise whatever recRead(sonData, sonName) produces for
      the nested structure.
    '''
    # Anything that is not a plain string is handed back to the
    # recursive reader.
    if type(sonData) is not str:
        return recRead(sonData, sonName)
    return sonData + '\n'
48
+
49
def recRead(data, key):
    '''
    Walk a nested structure and collect its text into one string.

    Notes:
    - Returns '' when getIndex(data) is falsy.
    - Children come from getSonNodes(data, key) as (child, name)
      pairs and are handled by name:
        * 'abstract' first emits an 'ABSTRACT' marker line, then
          is processed like any other name.
        * names in the skip list are ignored entirely.
        * names in the keyword list are rendered by docRead.
        * every other name is traversed recursively.
    '''
    skipped = ('ref', 'figure', 'idno', 'listBibl', 'note')
    rendered = ('head', '#text', 'p', 'surname')

    if not getIndex(data):
        return ''

    pieces = []
    for child, name in getSonNodes(data, key):
        if name == 'abstract':
            pieces.append('ABSTRACT\n')
        if name in skipped:
            continue
        # Keyword nodes are rendered directly; everything else is descended into.
        reader = docRead if name in rendered else recRead
        pieces.append(reader(child, name))
    return ''.join(pieces)
78
+
79
def splitBody(article):
    '''
    Split article text into 'head', 'body' and 'tail' line lists.

    The text is split on newlines. Every line equal (ignoring case)
    to 'ABSTRACT' or 'REFERENCE' advances to the next part, and the
    marker line itself is stored as the first line of that part.
    Once 'tail' is reached, further marker lines stay in 'tail'.

    Returns:
    - a dict mapping each part name that received at least one
      line to the list of its lines, in order.
    '''
    parts = ['head', 'body', 'tail']
    lastPart = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        if block.upper() in ('ABSTRACT', 'REFERENCE'):
            # Clamp: a third (or later) marker must not index past 'tail'.
            pointer = min(pointer + 1, lastPart)
        ans.setdefault(parts[pointer], []).append(block)
    return ans
93
+
94
def bodyMerge(article):
    '''
    Group the lines of article['body'] under their section headings.

    A line is treated as a heading when it is non-empty, shorter
    than 32 characters and consists only of ASCII letters, digits
    and '.'. Every other line is appended to the most recent heading.

    Lines that appear before the first heading are collected under
    the empty-string key '' (which no real heading can use), so they
    are kept instead of raising an error.

    Returns:
    - a tuple (ans, keyList): ans maps each heading to the list of
      lines that follow it, keyList holds the headings in order of
      appearance.

    Note: a heading that occurs again starts a fresh, empty list in
    ans and is appended to keyList again.
    '''
    # Built once; the character set does not change between lines.
    allowed = frozenset(
        '0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
    )
    ans = dict()
    keyList = []
    key = None
    for block in article['body']:
        if 0 < len(block) < 32 and set(block) <= allowed:
            key = block
            keyList.append(key)
            ans[key] = []
            continue
        if key is None:
            # Content before any heading: file it under '' rather than
            # failing on an unbound key.
            key = ''
            keyList.append(key)
            ans[key] = []
        ans[key].append(block)
    return ans, keyList