File size: 893 Bytes
6c945f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# -*-coding:utf-8 -*-
"""
解析不同格式的文档
"""
import re
import abc
from typing import Union, List
class BaseParser(abc.ABC):
    """Base class for document parsers with shared text-cleanup helpers.

    The base ``parse_file`` raises ``NotImplementedError``; concrete parsers
    are expected to override it.
    """

    # Table for ``str.translate``: the full-width forms U+FF01..U+FF5E sit
    # exactly 0xFEE0 code points above their ASCII counterparts, and the
    # ideographic space U+3000 maps to the ASCII space U+0020.
    _FULL2HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
    _FULL2HALF_TABLE[0x3000] = 0x20

    def __init__(self, config=None):
        """Store the parser configuration.

        Args:
            config: Configuration object kept as-is on ``self.config``;
                defaults to ``None``.
        """
        self.config = config

    @staticmethod
    def full2half(text: str) -> str:
        """Convert full-width characters in *text* to their half-width form.

        U+3000 becomes an ASCII space and U+FF01..U+FF5E are shifted down by
        0xFEE0; every other character is returned unchanged.

        Args:
            text: The string to normalise.

        Returns:
            A new string with the full-width characters replaced.
        """
        # One C-level pass instead of repeated ``s += chr(...)`` concatenation.
        return text.translate(BaseParser._FULL2HALF_TABLE)

    @staticmethod
    def remove_dup_space(text: str) -> str:
        """Delete every run of whitespace from *text*.

        NOTE: despite the name, whitespace is removed entirely (including
        single spaces, tabs and newlines), not collapsed to one space. This
        behaviour is kept unchanged for existing callers.

        Args:
            text: The string to clean.

        Returns:
            *text* with all whitespace characters removed.
        """
        return re.sub(r'\s+', '', text)

    @staticmethod
    def remove_empty_line(text: str) -> str:
        """Delete every newline character from *text*.

        NOTE: despite the name, all ``\\n`` characters are removed, so the
        remaining lines are joined together rather than only blank lines
        being dropped. This behaviour is kept unchanged for existing callers.

        Args:
            text: The string to clean.

        Returns:
            *text* with all ``\\n`` characters removed.
        """
        return text.replace('\n', '')

    def parse_file(self, file):
        """Parse *file*; must be overridden by subclasses.

        Args:
            file: The input to parse; its expected type is defined by the
                overriding subclass.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|