xl2533's picture
initial
6c945f2
raw
history blame
893 Bytes
# -*-coding:utf-8 -*-
"""
解析不同格式的文档
"""
import re
import abc
from typing import Union, List
class BaseParser(abc.ABC):
    """Base class for document parsers, bundling text-normalisation helpers.

    The helpers are static methods, so they can be used without an instance.
    ``parse_file`` raises ``NotImplementedError`` in this base class.
    """

    # Translation table for ``full2half``: the ideographic space (U+3000)
    # becomes an ASCII space, and every full-width form in U+FF01..U+FF5E is
    # shifted down by 0xFEE0 onto its ASCII counterpart (U+0021..U+007E).
    _FULL2HALF_TABLE = {
        0x3000: 0x20,
        **{code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)},
    }

    def __init__(self, config=None):
        """Store ``config`` on the instance; it is not interpreted here."""
        self.config = config

    @staticmethod
    def full2half(text: str) -> str:
        """Return ``text`` with full-width characters converted to half-width.

        U+3000 is mapped to U+0020 and each code point in U+FF01..U+FF5E is
        mapped to the code point 0xFEE0 below it. All other characters are
        returned unchanged.
        """
        # A single str.translate pass replaces the per-character ``+=`` loop,
        # which degrades to quadratic time on long inputs.
        return text.translate(BaseParser._FULL2HALF_TABLE)

    @staticmethod
    def remove_dup_space(text: str) -> str:
        """Return ``text`` with every run of whitespace removed.

        NOTE(review): despite the name, whitespace runs are deleted outright
        rather than collapsed to a single space ("a  b" -> "ab"). That
        behaviour is preserved here because callers may rely on it; confirm
        whether collapsing was the real intent.
        """
        # No regex flags: MULTILINE/DOTALL only affect ``^``, ``$`` and ``.``,
        # none of which appear in this pattern.
        return re.sub(r'\s+', '', text)

    @staticmethod
    def remove_empty_line(text: str) -> str:
        """Return ``text`` with every newline character removed.

        NOTE(review): despite the name, this strips all ``\\n`` characters and
        therefore joins every line together, not only the empty ones. The
        behaviour is preserved as-is; confirm the intent against callers.
        """
        return text.replace('\n', '')

    def parse_file(self, file):
        """Parse ``file``; not implemented in the base class.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError