File size: 3,090 Bytes
072f9dd 96a1a44 072f9dd e6acaf6 072f9dd e6acaf6 072f9dd 96a1a44 e6acaf6 96a1a44 6224edc e6acaf6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie
# Families of heading/bullet regexes. Each inner list is one numbering
# convention; bullets_category() counts, per family, how many sections start
# with any of its patterns (re.match anchors at the start of the string).
BULLET_PATTERN = [[
# Family 0: Chinese ordinal headings "第…编/部分", "第…章", "第…节", "第…条",
# plus parenthesised Chinese numerals.
r"第[零一二三四五六七八九十百]+(编|部分)",
r"第[零一二三四五六七八九十百]+章",
r"第[零一二三四五六七八九十百]+节",
r"第[零一二三四五六七八九十百]+条",
r"[\((][零一二三四五六七八九十百]+[\))]",
], [
# Family 1: Arabic-digit outline numbering with one to four dotted levels.
# NOTE(review): `{,n}` also allows zero digits, so the first pattern matches
# any text that merely starts with ".", " " or "、" — confirm this is intended.
r"[0-9]{,3}[\. 、]",
r"[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
# Family 2: Chinese chapter/section headings mixed with bare Chinese numerals
# followed by a space or "、", and parenthesised Chinese or Arabic numerals.
r"第[零一二三四五六七八九十百]+章",
r"第[零一二三四五六七八九十百]+节",
r"[零一二三四五六七八九十百]+[ 、]",
r"[\((][零一二三四五六七八九十百]+[\))]",
r"[\((][0-9]{,2}[\))]",
] ,[
# Family 3: English headings: "PART <word>", "Chapter <Roman numeral>",
# "Section <n>", "Article <n>".
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
r"Chapter (I+V?|VI*|XI|IX|X)",
r"Section [0-9]+",
r"Article [0-9]+"
]
]
def bullets_category(sections):
    """Pick the BULLET_PATTERN family that matches the most sections.

    For every pattern family, each section counts at most once: it scores a
    hit when any pattern of the family matches at the start of the section.

    Args:
        sections: iterable of strings; it is walked once per pattern family.

    Returns:
        Index into BULLET_PATTERN of the family with the highest hit count.
        Ties go to the earliest family; -1 when no family matched anything.
    """
    hit_counts = []
    for family in BULLET_PATTERN:
        matched = 0
        for section in sections:
            if any(re.match(pattern, section) for pattern in family):
                matched += 1
        hit_counts.append(matched)

    best_index, best_count = -1, 0
    for index, count in enumerate(hit_counts):
        # Strict comparison keeps the first family on ties and ignores
        # families with zero hits.
        if count > best_count:
            best_index, best_count = index, count
    return best_index
def is_english(texts):
    """Decide whether a collection of text snippets is predominantly English.

    A snippet counts as English when, after stripping surrounding whitespace,
    it starts with at least two ASCII letters.

    Args:
        texts: sized collection of strings.

    Returns:
        True when strictly more than 80% of the snippets count as English.
        False otherwise, including for an empty collection (which previously
        raised ZeroDivisionError).
    """
    total = len(texts)
    if total == 0:
        # Nothing to judge; avoid dividing by zero below.
        return False
    english = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return english / total > 0.8
def tokenize(d, t, eng):
    """Store text `t` and its token fields in the mapping `d` (in place).

    Args:
        d: mutable mapping that receives the fields.
        t: the text to index.
        eng: truthy to use the English path, falsy for the huqie path.

    Always sets d["content_with_weight"] to the unmodified text.
    English path: removes a hyphen sitting between two lowercase letters,
    splits with word_tokenize, stems every token with stemmer.stem and stores
    the space-joined stems in d["content_ltks"].
    Other path: stores huqie.qie(t) in d["content_ltks"] and
    huqie.qieqie(...) of that value in d["content_sm_ltks"].
    """
    d["content_with_weight"] = t

    if not eng:
        d["content_ltks"] = huqie.qie(t)
        # NOTE(review): "content_sm_ltks" is only written on this non-English
        # path; the English path leaves it untouched — confirm this is intended.
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
        return

    dehyphenated = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
    stems = [stemmer.stem(word) for word in word_tokenize(dehyphenated)]
    d["content_ltks"] = " ".join(stems)
def remove_contents_table(sections, eng=False):
    """Strip table-of-contents style blocks out of `sections`, in place.

    A block starts at a section whose text (the part before any "@@", with
    spaces removed) is one of the known headings: "contents", "目录", "目次",
    "table of contents", "致谢" or "acknowledge", compared case-insensitively.
    The heading, any blank sections after it and the first listed entry are
    removed. The first entry's prefix is then searched for among the next 128
    sections; when found, everything before that section is removed as well,
    on the assumption that it is where the listing ends and the body begins.

    Args:
        sections: list whose items are either strings or sequences whose
            first element is the text. Modified in place.
        eng: when truthy the entry prefix is its first two space-separated
            words; otherwise its first three characters.

    Returns:
        None.
    """
    # Spaces are removed before matching, so "table of contents" is spelled
    # without spaces here.
    heading_re = r"(contents|目录|目次|tableofcontents|致谢|acknowledge)$"

    def text_at(idx):
        # Sections may be plain strings or (text, ...) sequences.
        item = sections[idx]
        return (item if isinstance(item, str) else item[0]).strip()

    i = 0
    while i < len(sections):
        heading = re.sub(r"( | |\u3000)+", "", text_at(i).split("@@")[0])
        # The flag belongs to re.match; handing it to re.sub positionally
        # would be taken as `count` and leave the match case-sensitive.
        if not re.match(heading_re, heading, re.IGNORECASE):
            i += 1
            continue

        sections.pop(i)  # the heading itself

        # Drop blank sections between the heading and the first entry.
        while i < len(sections) and not text_at(i):
            sections.pop(i)
        if i >= len(sections):
            # Nothing but blanks followed the heading; popping again here
            # would raise IndexError.
            break

        first_entry = text_at(i)
        prefix = " ".join(first_entry.split(" ")[:2]) if eng else first_entry[:3]
        sections.pop(i)  # the first listed entry
        if i >= len(sections):
            break

        # Look a bounded distance ahead for the section that repeats the first
        # entry's prefix. The prefix is raw document text, so it is compared
        # literally rather than compiled as a regular expression.
        for j in range(i, min(i + 128, len(sections))):
            if text_at(j).startswith(prefix):
                del sections[i:j]
                break