FinDoc / build_index /parser /html_parser.py
xl2533's picture
initial
6c945f2
raw
history blame
2.06 kB
# -*-coding:utf-8 -*-
import re
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
from build_index.parser.base import BaseParser
class HTMLParser(BaseParser):
def parse_file(self, file):
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
isd_el['text'] = self.remove_dup_space(isd_el['text'])
isd_el['text'] = self.remove_empty_line(isd_el['text'])
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
# Each Chunk can be thought of as an individual set of data, which can be sent to the model
# Where Each Title is grouped together with the data under it
Chunks = [[]]
final_chunks = list(list())
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks