File size: 2,059 Bytes
6c945f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# -*-coding:utf-8 -*-
import re
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
from build_index.parser.base import BaseParser
class HTMLParser(BaseParser):
def parse_file(self, file):
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
isd_el['text'] = self.remove_dup_space(isd_el['text'])
isd_el['text'] = self.remove_empty_line(isd_el['text'])
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
# Each Chunk can be thought of as an individual set of data, which can be sent to the model
# Where Each Title is grouped together with the data under it
Chunks = [[]]
final_chunks = list(list())
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks
|