doc-to-slides / utils.py
com3dian's picture
Create utils.py
2654a92 verified
raw
history blame
3.15 kB
def getIndex(inputData):
    '''
    Return the indices that can be used to subscript inputData.

    Returns:
    - a list of the keys when inputData is exactly a dict,
    - a range over the positions when inputData is exactly a list,
    - 0 for anything else (including dict/list subclasses, because
      the check compares the exact type).
    '''
    kind = type(inputData)
    if kind is list:
        return range(len(inputData))
    if kind is dict:
        # Iterating a dict yields its keys in insertion order.
        return list(inputData)
    # Falsy sentinel meaning "nothing to index into".
    return 0
def getSonNodes(nodeData, nodeName):
    '''
    List the direct children of a node as (child, name) tuples.

    - For a list, every element is paired with nodeName, i.e. the
      elements inherit the name of the list that holds them.
    - For a dict, every value is paired with its own key.
    - Any other type has no children and yields an empty list.

    The checks compare the exact type, so dict/list subclasses are
    treated as having no children.
    '''
    kind = type(nodeData)
    if kind is list:
        return [(child, nodeName) for child in nodeData]
    if kind is dict:
        return [(child, tag) for tag, child in nodeData.items()]
    return []
def docRead(sonData, sonName):
    '''
    Turn a node into text.

    Returns:
    - sonData followed by a newline when sonData is exactly a str.
    - Otherwise whatever recRead(sonData, sonName) builds for the
      nested structure.
    '''
    if type(sonData) is str:
        # Leaf text: one line of output per string.
        return sonData + '\n'
    # Anything else is handed to the recursive reader.
    return '' + recRead(sonData, sonName)
def recRead(data, key):
    '''
    Recursively flatten a nested dict/list structure into text.

    data is the node to read and key is the name it was reached
    under. Each child (as reported by getSonNodes) is handled by
    its name:
    - 'abstract' first emits an 'ABSTRACT' marker line, then is
      processed like any other name below.
    - names in the skip list are dropped together with everything
      beneath them.
    - names in the text list are rendered through docRead.
    - every other name is descended into with recRead.

    Returns the concatenated text, or '' when getIndex reports
    nothing to iterate (empty container or non-container).
    '''
    skipped = ('ref', 'figure', 'idno', 'listBibl', 'note')
    textual = ('head', '#text', 'p', 'surname')

    if not getIndex(data):
        return ''

    pieces = []
    for child, tag in getSonNodes(data, key):
        if tag == 'abstract':
            # Marker line written ahead of the abstract's own content.
            pieces.append('ABSTRACT\n')
        if tag in skipped:
            continue
        reader = docRead if tag in textual else recRead
        pieces.append(reader(child, tag))
    return ''.join(pieces)
def splitBody(article):
    '''
    Split an article string into 'head', 'body' and 'tail' line lists.

    The text is cut on newlines. Lines are collected under 'head'
    until a line equal to 'ABSTRACT' or 'REFERENCE' (compared
    case-insensitively, without stripping whitespace) is met; that
    marker line and the following lines go to 'body', and the next
    marker line starts 'tail'.

    Any further marker lines stay in 'tail': the section pointer is
    clamped at the last part so extra markers no longer raise
    IndexError.

    Returns a dict mapping each part that received at least one line
    to its list of lines; parts that were never reached are absent.
    '''
    parts = ['head', 'body', 'tail']
    last = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        # Advance to the next part on a marker line, but never past 'tail'.
        if block.upper() in ('ABSTRACT', 'REFERENCE') and pointer < last:
            pointer += 1
        ans.setdefault(parts[pointer], []).append(block)
    return ans
def bodyMerge(article):
body = article['body']
ans = dict()
keyList = []
for block in body:
allowed = set('0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
if set(block) <= allowed and len(block) < 32 and len(block) > 0:
key = block
keyList.append(key)
ans[key] = []
else:
ans[key].append(block)
return ans, keyList