# -*- coding: utf-8 -*-
"""Parse documents in different formats."""
import re
import abc
from typing import Union, List

# Code-point translation table used by ``BaseParser.full2half``:
#   * U+3000 (ideographic space) -> U+0020 (ASCII space)
#   * U+FF01..U+FF5E (full-width ASCII variants) -> the same character
#     shifted down by 0xFEE0, i.e. U+0021..U+007E.
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5E + 1)}
)


class BaseParser(abc.ABC):
    """Base class for document parsers.

    Provides a few static text-normalisation helpers and the ``parse_file``
    hook that concrete parsers are expected to override.
    """

    def __init__(self, config=None):
        """Store *config* on the instance as ``self.config``.

        Args:
            config: Arbitrary parser configuration; kept as-is (default None).
        """
        self.config = config

    @staticmethod
    def full2half(text: str) -> str:
        """Convert full-width characters in *text* to their half-width form.

        U+3000 becomes an ASCII space and U+FF01..U+FF5E are shifted down by
        0xFEE0; every other character is returned unchanged.

        Args:
            text: The string to convert.

        Returns:
            A new string with the full-width characters replaced.
        """
        # One C-level pass instead of per-character string concatenation.
        return text.translate(_FULL_TO_HALF_TABLE)

    @staticmethod
    def remove_dup_space(text: str) -> str:
        """Remove every run of whitespace from *text*.

        NOTE(review): despite the name, whitespace runs are deleted entirely
        (replaced by the empty string), not collapsed to a single space. This
        is the existing behaviour and is kept for callers; confirm it is the
        intended one.

        Args:
            text: The string to clean.

        Returns:
            *text* with all characters matched by ``\\s`` removed.
        """
        # The pattern has no '^', '$' or '.', so MULTILINE/DOTALL flags
        # would have no effect and are omitted.
        return re.sub(r'\s{1,}', '', text)

    @staticmethod
    def remove_empty_line(text: str) -> str:
        """Remove every newline character from *text*.

        NOTE(review): despite the name, all ``\\n`` characters are removed,
        which joins all lines together rather than dropping only the empty
        ones. This is the existing behaviour and is kept for callers; confirm
        it is the intended one.

        Args:
            text: The string to clean.

        Returns:
            *text* with all ``\\n`` characters removed.
        """
        # The pattern has no '^', '$' or '.', so MULTILINE/DOTALL flags
        # would have no effect and are omitted.
        return re.sub(r'\n', '', text)

    def parse_file(self, file):
        """Parse *file*; must be overridden by subclasses.

        Args:
            file: The file to parse (type is defined by the subclass).

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError