# Node names whose whole subtree is skipped by recRead.
_STOP_NAMES = ('ref', 'figure', 'idno', 'listBibl', 'note')
# Node names whose string values recRead emits as lines of text.
_TEXT_NAMES = ('head', '#text', 'p', 'surname')
# Part names used by splitBody, in the order they are entered.
_ARTICLE_PARTS = ('head', 'body', 'tail')
# A line made only of these characters (letters, digits, '.') and shorter
# than 32 characters is treated as a section heading by bodyMerge.
_HEADING_CHARS = frozenset(
    '0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
)


def getIndex(inputData):
    '''
    Return the indices that can be used to subscript inputData.

    Returns:
    - a list of the keys if inputData is a dict (or dict subclass),
    - a range over the positions if inputData is a list (or list subclass),
    - 0 for any other kind of object.
    '''
    if isinstance(inputData, dict):
        return list(inputData)
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0


def getSonNodes(nodeData, nodeName):
    '''
    Return the direct children of nodeData as (child, name) tuples.

    - For a list, every element is paired with nodeName, i.e. list items
      inherit the name of the list itself.
    - For a dict, every value is paired with its own key.
    - For anything else, an empty list is returned.
    '''
    if isinstance(nodeData, list):
        return [(child, nodeName) for child in nodeData]
    if isinstance(nodeData, dict):
        return [(child, name) for name, child in nodeData.items()]
    return []


def docRead(sonData, sonName):
    '''
    Return the text representation of one node.

    Returns:
    - sonData followed by a newline if sonData is a string,
    - otherwise the result of recRead(sonData, sonName).
    '''
    if isinstance(sonData, str):
        return sonData + '\n'
    return recRead(sonData, sonName)


def recRead(data, key):
    '''
    Recursively flatten a nested dict/list structure into text.

    Every child of data is visited in order:
    - a child named 'abstract' first emits the marker line 'ABSTRACT',
    - children named in _STOP_NAMES are skipped together with their subtree,
    - children named in _TEXT_NAMES are rendered through docRead,
    - all other children are descended into recursively.

    Objects that are neither dict nor list (or are empty) yield ''.
    key is the name under which data was found; it is inherited by the
    elements of data when data is a list.
    '''
    pieces = []
    for son, father in getSonNodes(data, key):
        # The marker is emitted before the stop-name test, matching the
        # order of the checks: 'abstract' is not a stop name, so its
        # content is still read below.
        if father == 'abstract':
            pieces.append('ABSTRACT\n')
        if father in _STOP_NAMES:
            continue
        if father in _TEXT_NAMES:
            pieces.append(docRead(son, father))
        else:
            pieces.append(recRead(son, father))
    return ''.join(pieces)


def splitBody(article):
    '''
    Split article text into 'head', 'body' and 'tail' lists of lines.

    The text is split on newlines. Lines are collected under 'head' until
    a line equal to 'ABSTRACT' or 'REFERENCE' (case-insensitive) is seen;
    each such marker line advances to the next part and is stored as the
    first line of that part. Once 'tail' is reached, further marker lines
    stay in 'tail' instead of running past the last part.

    Returns a dict containing only the parts that received at least one
    line.
    '''
    sections = {}
    pointer = 0
    last = len(_ARTICLE_PARTS) - 1
    for block in article.split('\n'):
        if block.upper() in ('ABSTRACT', 'REFERENCE'):
            # Clamp so that a third marker cannot index past 'tail'.
            pointer = min(pointer + 1, last)
        sections.setdefault(_ARTICLE_PARTS[pointer], []).append(block)
    return sections


def bodyMerge(article):
    '''
    Group the lines of article['body'] under their section headings.

    A line counts as a heading when it is non-empty, shorter than 32
    characters and consists only of ASCII letters, digits and '.'.
    Every other line is appended to the most recent heading.

    - Lines that appear before any heading are collected under the
      empty-string key '' (which can never be a real heading).
    - A heading that occurs more than once keeps accumulating into the
      same entry instead of discarding the text gathered earlier.
    - A missing 'body' entry yields an empty result.

    Returns a tuple (sections, keyList) where sections maps each heading
    to its list of lines and keyList holds the headings in order of first
    appearance.
    '''
    sections = {}
    keyList = []
    key = ''  # bucket for text that precedes the first heading
    for block in article.get('body', []):
        is_heading = 0 < len(block) < 32 and set(block) <= _HEADING_CHARS
        if is_heading:
            key = block
        if key not in sections:
            sections[key] = []
            keyList.append(key)
        if not is_heading:
            sections[key].append(block)
    return sections, keyList