Spaces:

ml6team
/

doc-to-slides

Sleeping

App Files Files Community

com3dian committed on Jun 4, 2024

Commit

2654a92

verified ·

1 Parent(s): 2b3c5ca

Create utils.py

Browse files

Files changed (1) hide show

utils.py +106 -0

utils.py ADDED Viewed

	@@ -0,0 +1,106 @@

def getIndex(inputData):
    '''
    Return the values that can be used to subscript inputData.

    - dict (or dict subclass): a list of its keys.
    - list (or list subclass): range(len(inputData)).
    - anything else: 0, a falsy sentinel meaning "not a container".

    Empty containers also yield a falsy result (an empty list or an
    empty range), so callers can use the return value as a truth test
    for "has children".
    '''
    # isinstance (rather than an exact type match) so that subclasses such
    # as collections.OrderedDict are still recognised as containers.
    if isinstance(inputData, dict):
        return list(inputData.keys())
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0
def getSonNodes(nodeData, nodeName):
    '''
    List the direct children of a node as (child, name) tuples.

    - list: every element is a child and inherits the parent's
      nodeName, because list elements carry no name of their own.
    - dict: every value is a child, named by its key.
    - anything else: no children, an empty list is returned.

    Subclasses of list and dict are treated like their base type.
    '''
    if isinstance(nodeData, list):
        return [(child, nodeName) for child in nodeData]
    if isinstance(nodeData, dict):
        return [(child, childName) for childName, child in nodeData.items()]
    return []
def docRead(sonData, sonName):
    '''
    Render one node as text.

    A plain string is returned as-is with a trailing newline.
    Any other value is handed to recRead together with sonName,
    and whatever recRead produces is returned unchanged.
    '''
    if type(sonData) is str:
        return sonData + '\n'
    return recRead(sonData, sonName)
def recRead(data, key):
    '''
    Walk a nested structure and collect its text into one string.

    - data is expected to be a dict or a list; for anything that
      getIndex reports as empty or not a container, '' is returned.
    - Children come from getSonNodes(data, key), each paired with
      a name.
    - A child named 'abstract' first contributes the marker line
      'ABSTRACT\\n'.
    - Children whose name is in the skip list contribute nothing.
    - Children whose name is in the text list are rendered with
      docRead; all remaining children are traversed recursively
      with recRead.
    '''
    skipNames = ['ref', 'figure', 'idno', 'listBibl', 'note']
    textNames = ['head', '#text', 'p', 'surname']

    if not getIndex(data):
        return ''

    pieces = []
    for child, name in getSonNodes(data, key):
        if name == 'abstract':
            pieces.append('ABSTRACT\n')
        if name in skipNames:
            continue
        reader = docRead if name in textNames else recRead
        pieces.append(reader(child, name))
    return ''.join(pieces)
def splitBody(article):
    '''
    Split article text into 'head', 'body' and 'tail' line lists.

    The text is split on newlines. Each line that equals 'ABSTRACT'
    or 'REFERENCE' (case-insensitive) moves on to the next part and
    is itself stored as the first line of that part. Either marker
    advances the part, whichever comes first.

    Returns a dict mapping part name to its list of lines. A part
    that received no lines has no key in the result.

    Once 'tail' is reached, further marker lines stay in 'tail'
    instead of advancing past the last part.
    '''
    parts = ['head', 'body', 'tail']
    lastPart = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        if block.upper() in ('ABSTRACT', 'REFERENCE'):
            # Clamp: a third (or later) marker must not index past 'tail'.
            pointer = min(pointer + 1, lastPart)
        ans.setdefault(parts[pointer], []).append(block)
    return ans
def bodyMerge(article):
    '''
    Group the lines of article['body'] under their section headings.

    A line counts as a heading when it is 1 to 31 characters long and
    consists only of ASCII letters, digits and '.'. Every other line
    (including blank lines) is appended to the most recent heading.

    Returns a tuple (ans, keyList):
    - ans maps each heading to the list of lines that follow it.
    - keyList holds the headings in the order they were encountered.

    Lines that appear before the first heading are kept under the
    empty-string key ''. A real heading is never empty, so that key
    cannot collide with one.

    NOTE: a heading that occurs more than once resets its list in ans
    and is appended to keyList again.

    Raises KeyError if article has no 'body' entry.
    '''
    body = article['body']
    # Built once; the set of allowed heading characters never changes.
    allowed = set('0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
    ans = dict()
    keyList = []
    key = None
    for block in body:
        if 0 < len(block) < 32 and set(block) <= allowed:
            key = block
            keyList.append(key)
            ans[key] = []
        else:
            if key is None:
                # No heading seen yet: open the '' bucket so the text is
                # kept instead of failing on an unbound key.
                key = ''
                keyList.append(key)
                ans[key] = []
            ans[key].append(block)
    return ans, keyList