|
|
|
""" |
|
Parse documents in different formats.
|
""" |
|
import re |
|
import abc |
|
from typing import Union, List |
|
|
|
|
|
class BaseParser(abc.ABC):
    """Base class for document parsers.

    Stores an optional configuration object and provides static
    text-normalisation helpers. ``parse_file`` is a placeholder that
    raises ``NotImplementedError``; subclasses are expected to override it.
    """

    # Code point -> replacement table consumed by ``str.translate``.
    # Full-width forms U+FF01..U+FF5E shift down by 0xFEE0 onto
    # U+0021..U+007E, and the ideographic space U+3000 becomes U+0020.
    _FULL_TO_HALF = {cp: cp - 0xFEE0 for cp in range(0xFF01, 0xFF5F)}
    _FULL_TO_HALF[0x3000] = 0x20

    # One or more consecutive whitespace characters; compiled once so the
    # helper does not have to look the pattern up on every call.
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self, config=None):
        """Create a parser.

        Args:
            config: Optional configuration object, stored unchanged on
                ``self.config``. This class does not read it.
        """
        self.config = config

    @staticmethod
    def full2half(text: str) -> str:
        """Convert full-width characters in ``text`` to half-width ones.

        U+3000 is mapped to an ASCII space and every character in
        U+FF01..U+FF5E is mapped to the code point 0xFEE0 below it. All
        other characters are returned unchanged.

        Args:
            text: The string to normalise.

        Returns:
            A new string with the mapping applied.
        """
        # A single translate pass replaces the per-character ``+=`` loop,
        # which could degrade to quadratic time on long inputs.
        return text.translate(BaseParser._FULL_TO_HALF)

    @staticmethod
    def remove_dup_space(text: str) -> str:
        """Delete every run of whitespace from ``text``.

        NOTE(review): despite the name, whitespace is removed entirely
        rather than collapsed to a single space. That is what the original
        implementation did, so it is preserved here; confirm with callers
        before changing it.

        Args:
            text: The string to process.

        Returns:
            ``text`` with all characters matched by the regex ``\\s``
            removed.
        """
        return BaseParser._WHITESPACE_RUN.sub('', text)

    @staticmethod
    def remove_empty_line(text: str) -> str:
        """Delete every newline character from ``text``.

        NOTE(review): despite the name, this removes all ``\\n`` characters
        (joining every line together), not only empty lines. The original
        behaviour is preserved; confirm with callers before changing it.

        Args:
            text: The string to process.

        Returns:
            ``text`` with all ``\\n`` characters removed.
        """
        return text.replace('\n', '')

    def parse_file(self, file):
        """Parse ``file``; not implemented in the base class.

        Args:
            file: The input to parse. Its expected type is not defined
                here and is left to the overriding subclass.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|
|