import re

from nltk import word_tokenize

from rag.nlp import stemmer, huqie

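# Heading/bullet regexes grouped by numbering style, used to guess how a
# document is sectioned: [0] Chinese legal style (第...编/部分/章/节/条),
# [1] decimal outlines (1., 1.1, 1.1.1, ...), [2] Chinese book headings
# (第...章/节, 一、, (一)), [3] English headings (PART/Chapter/Section/Article).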
BULLET_PATTERN = [[
        r"第[零一二三四五六七八九十百]+(编|部分)",
        r"第[零一二三四五六七八九十百]+章",
        r"第[零一二三四五六七八九十百]+节",
        r"第[零一二三四五六七八九十百]+条",
        r"[\((][零一二三四五六七八九十百]+[\))]",
    ], [
        r"[0-9]{,3}[\. 、]",
        r"[0-9]{,2}\.[0-9]{,2}",
        r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
        r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    ], [
        r"第[零一二三四五六七八九十百]+章",
        r"第[零一二三四五六七八九十百]+节",
        r"[零一二三四五六七八九十百]+[ 、]",
        r"[\((][零一二三四五六七八九十百]+[\))]",
        r"[\((][0-9]{,2}[\))]",
    ], [
        r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
        r"Chapter (I+V?|VI*|XI|IX|X)",
        r"Section [0-9]+",
        r"Article [0-9]+"
    ]
]


def bullets_category(sections):
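    """Detect which BULLET_PATTERN group the section headings follow.

    For each group, count the sections whose start matches one of its
    patterns; return the index of the best-matching group, or -1 if no
    section matches any pattern.
    """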
    hits = [0] * len(BULLET_PATTERN)
    for i, pro in enumerate(BULLET_PATTERN):
        for sec in sections:
            for p in pro:
                if re.match(p, sec):
                    hits[i] += 1
                    break
    maximum = 0
    res = -1
    for i, h in enumerate(hits):
        if h <= maximum:
            continue
        res = i
        maximum = h
    return res

def is_english(texts):
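    """Return True if more than 80% of the lines start with an ASCII word of 2+ letters."""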
    if not texts:
        return False
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    return eng / len(texts) > 0.8

def tokenize(d, t, eng):
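    """Fill the chunk dict d with tokenized fields derived from text t.

    The raw text is kept under "content_with_weight". English text is
    de-hyphenated, word-tokenized and stemmed into "content_ltks"; otherwise
    huqie produces "content_ltks" plus fine-grained "content_sm_ltks".
    """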
    d["content_with_weight"] = t
    if eng:
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        d["content_ltks"] = huqie.qie(t)
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


def remove_contents_table(sections, eng=False):
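    """Strip a table-of-contents block from sections (strings or (text, tag) tuples) in place.

    When a heading such as "contents"/"目录" is found, it and the following
    TOC entries are dropped, up to the first later section (within the next
    128 entries) that repeats the prefix of the first TOC entry, i.e. where
    the document body resumes.
    """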
    i = 0

    def get(i):
        # sections may hold plain strings or (text, layout_tag) tuples
        return (sections[i] if isinstance(sections[i], str) else sections[i][0]).strip()

    while i < len(sections):
        title = re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0])
        if not re.match(r"(contents|目录|目次|tableofcontents|致谢|acknowledge)$",
                        title, re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        if i >= len(sections):
            break
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(re.escape(prefix), get(j)):
                continue
            # section j repeats the first TOC entry's prefix, i.e. the body
            # resumes here; drop the TOC entries in between
            for _ in range(i, j):
                sections.pop(i)
            break
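

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical sample data (not part of the
    # module's API): guess the heading style of a few English sections and
    # check the language heuristic.
    sample = [
        "Chapter I Introduction",
        "Section 1 Scope",
        "Article 1 These rules apply to public documents.",
    ]
    print(bullets_category(sample))  # expected: 3, the English heading group
    print(is_english(sample))        # expected: True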