File size: 2,059 Bytes
6c945f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*-coding:utf-8 -*-

import re
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
from build_index.parser.base import BaseParser


class HTMLParser(BaseParser):
    """Parser that splits an HTML file into title-led text chunks."""

    def parse_file(self, file, min_chunk_length=25):
        """Parse an HTML file and return its text grouped under titles.

        The file is partitioned with ``partition_html`` and converted to a
        list of dicts with ``convert_to_isd``. Each element's text is
        normalised, then elements are grouped so that every chunk starts at
        an element whose ``type`` is ``'Title'`` and runs until the next
        title. Elements that appear before the first title form their own
        leading chunk.

        Args:
            file: Path of the HTML file to read (opened as UTF-8 text).
            min_chunk_length: Minimum total number of characters (summed
                over the cleaned strings in a chunk) a chunk must have to
                be kept. Defaults to 25.

        Returns:
            A list of strings, one per retained chunk, where the texts of
            the chunk's elements are joined with single spaces.
        """
        with open(file, "r", encoding="utf-8") as fp:
            elements = partition_html(file=fp)
            isd = convert_to_isd(elements)

        # Start with one open chunk so that text preceding the first title
        # is still collected; if it stays empty it is filtered out below.
        chunks = [[]]
        for isd_el in isd:
            # Drop every non-ASCII character (lossy, kept from the original
            # behaviour).
            text = isd_el['text'].encode("ascii", "ignore").decode()
            # NOTE(review): remove_dup_space / remove_empty_line are not
            # defined in this class; presumably inherited from BaseParser.
            text = self.remove_dup_space(text)
            text = self.remove_empty_line(text)
            # clean() returns the cleaned string rather than mutating its
            # argument, so the result has to be kept.
            text = clean(
                text,
                extra_whitespace=True,
                dashes=True,
                bullets=True,
                trailing_punctuation=True,
            )

            # A title opens a new chunk; everything else joins the current one.
            if isd_el['type'] == 'Title':
                chunks.append([])
            chunks[-1].append(text)

        # Build a new list instead of removing from `chunks` while iterating
        # over it: in-place removal makes the iterator skip the chunk that
        # follows each removed one.
        return [
            " ".join(chunk)
            for chunk in chunks
            if sum(len(text) for text in chunk) >= min_chunk_length
        ]