Spaces:
Sleeping
Sleeping
def getIndex(inputData):
    '''
    Get the input index for lists and dicts.

    Returns:
    - a list of the keys if inputData is a dict (or a dict subclass);
    - range(len(inputData)) if inputData is a list (or a list subclass);
    - 0 for any other input, so the result is falsy for non-containers
      (and also for empty containers).
    '''
    # isinstance rather than an exact type comparison, so that dict/list
    # subclasses (e.g. collections.OrderedDict) are indexed instead of
    # silently falling through to 0.
    if isinstance(inputData, dict):
        return list(inputData.keys())
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0
def getSonNodes(nodeData, nodeName):
    '''
    Given a nodeData object and a nodeName string,
    returns a list of tuples containing the child
    nodes of the given node and their corresponding
    names.

    - For a list, every item is a child and keeps the
      name of the list itself (nodeName).
    - For a dict, every value is a child and is named
      by its key.
    - For anything else, an empty list is returned.
    '''
    # isinstance rather than an exact type comparison, so that list/dict
    # subclasses (e.g. collections.OrderedDict) are traversed as well.
    if isinstance(nodeData, list):
        return [(item, nodeName) for item in nodeData]
    if isinstance(nodeData, dict):
        return [(value, name) for name, value in nodeData.items()]
    return []
def docRead(sonData, sonName):
    '''
    Render one node as text.

    Given a sonData object and the sonName it is stored
    under, returns its string representation:
    - a plain string is returned with a trailing newline;
    - anything else is handed to recRead together with
      sonName, and recRead's result is returned.
    '''
    if type(sonData) is str:
        return sonData + '\n'
    return recRead(sonData, sonName)
def recRead(data, key):
    '''
    Recursively flatten a nested dict/list structure into text.

    Notes:
    - data is expected to be a dictionary or list; for
      anything getIndex reports as falsy (including empty
      containers) an empty string is returned.
    - Children are obtained from getSonNodes(data, key).
    - A child named 'abstract' first emits an 'ABSTRACT'
      marker line.
    - Children whose name is a stop word are skipped.
    - Children whose name is a keyword are rendered with
      docRead; every other child is traversed with recRead.
    '''
    skipped = ('ref', 'figure', 'idno', 'listBibl', 'note')
    rendered = ('head', '#text', 'p', 'surname')

    if not getIndex(data):
        return ''

    pieces = []
    for child, name in getSonNodes(data, key):
        if name == 'abstract':
            pieces.append('ABSTRACT\n')
        if name in skipped:
            continue
        # Keyword nodes are rendered directly; others are only descended into.
        reader = docRead if name in rendered else recRead
        pieces.append(reader(child, name))
    return ''.join(pieces)
def splitBody(article):
    '''
    Split article text into 'head', 'body' and 'tail' sections.

    The text is split on newlines. Every line equal
    (case-insensitively) to 'ABSTRACT' or 'REFERENCE' moves
    on to the next section, and the marker line itself is
    stored as the first line of that new section.

    Returns a dict mapping the section names that actually
    occurred to their lists of lines.
    '''
    parts = ['head', 'body', 'tail']
    last = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        if block.upper() in ('ABSTRACT', 'REFERENCE'):
            # Clamp at the last section: a third or later marker line must
            # not push the pointer past 'tail' (previously an IndexError).
            pointer = min(pointer + 1, last)
        ans.setdefault(parts[pointer], []).append(block)
    return ans
def bodyMerge(article):
    '''
    Group the lines of article['body'] under their headings.

    A block counts as a heading when it is 1 to 31 characters
    long and consists only of ASCII letters, digits and '.'.
    Every other block is appended to the list of the most
    recent heading.

    Returns a tuple (ans, keyList):
    - ans maps each heading to the list of blocks following it;
    - keyList holds the headings in order of appearance.

    Notes:
    - Blocks that appear before the first heading are stored
      under the empty-string key '' (also added to keyList).
    - A heading that occurs again restarts its list and is
      appended to keyList again.
    - Raises KeyError if article has no 'body' entry.
    '''
    # Built once; the set of characters a heading may contain.
    allowed = set('0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
    ans = dict()
    keyList = []
    key = None
    for block in article['body']:
        if 0 < len(block) < 32 and set(block) <= allowed:
            key = block
            keyList.append(key)
            ans[key] = []
            continue
        if key is None:
            # No heading seen yet: open a preamble bucket instead of
            # failing on an unbound key.
            key = ''
            keyList.append(key)
            ans[key] = []
        ans[key].append(block)
    return ans, keyList