import random
import re

from nltk import word_tokenize

from rag.nlp import stemmer, huqie
from rag.utils import num_tokens_from_string

from .pdf_parser import HuParser as PdfParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser


# Heading/bullet pattern families, each ordered from chapter level down to
# item level: 0) Chinese book/legal style, 1) numeric outline style,
# 2) Chinese-numeral outline style, 3) English part/chapter style.
BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\(（][零一二三四五六七八九十百]+[\)）]",
], [
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\(（][零一二三四五六七八九十百]+[\)）]",
    r"[\(（][0-9]{,2}[\)）]",
], [
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+",
]]


def random_choices(arr, k):
    # Sample k items (with replacement, per random.choices), capping k at
    # the array length.
    k = min(len(arr), k)
    return random.choices(arr, k=k)


def bullets_category(sections):
    # Score every pattern family in BULLET_PATTERN against the section
    # strings and return the index of the family with the most matches,
    # or -1 when nothing matches.
    hits = [0] * len(BULLET_PATTERN)
    for i, pro in enumerate(BULLET_PATTERN):
        for sec in sections:
            for p in pro:
                if re.match(p, sec):
                    hits[i] += 1
                    break
    maximum = 0
    res = -1
    for i, h in enumerate(hits):
        if h <= maximum:
            continue
        res = i
        maximum = h
    return res
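
# A minimal usage sketch (made-up section strings):
# bullets_category(["Section 1", "Section 2", "Article 3"]) matches every
# line against the English family and returns 3, its index in BULLET_PATTERN.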


def is_english(texts):
    # Treat the batch as English when over 80% of the lines start with a
    # word of two or more ASCII letters.
    if not texts:
        return False
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    return eng / len(texts) > 0.8
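
# For example, is_english(["Hello world", "Second line"]) returns True,
# while is_english(["你好", "世界"]) returns False.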


def tokenize(d, t, eng):
    # Fill a document dict with the raw text plus coarse- and fine-grained
    # token fields used for indexing.
    d["content_with_weight"] = t
    if eng:
        # Re-join words hyphenated across line breaks, then stem.
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        d["content_ltks"] = huqie.qie(t)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


def remove_contents_table(sections, eng=False):
    # Remove a "contents"/"目录" heading and the table-of-contents entries
    # that follow it, in place.
    def get(i):
        return (sections[i] if isinstance(sections[i], str) else sections[i][0]).strip()

    i = 0
    while i < len(sections):
        # Strip ordinary, no-break and full-width spaces before matching the
        # heading, case-insensitively.
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                        re.sub(r"[ \u00a0\u3000]+", "", get(i).split("@@")[0]),
                        re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        # Scan ahead for the line whose text repeats the first entry's prefix,
        # i.e. where the real content starts, and drop everything before it.
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break
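
# The list is edited in place: a "Contents"/"目录" heading and the
# table-of-contents lines after it disappear, up to the point where the
# first entry's prefix re-appears as body text.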


def make_colon_as_title(sections):
    # Promote a short clause that ends with a colon into a synthetic
    # "title" section inserted just before the section it introduces.
    if not sections:
        return []
    if isinstance(sections[0], str):
        return sections
    i = 0
    while i < len(sections):
        txt, layout = sections[i]
        i += 1
        txt = txt.split("@")[0].strip()
        if not txt:
            continue
        if txt[-1] not in ":：":
            continue
        # Work on the reversed string so the colon-terminated clause comes
        # first: arr[0] is that clause, arr[1] the delimiter that ended it.
        txt = txt[::-1]
        arr = re.split(r"([。？！!?;；]| \.)", txt)
        # Only clauses shorter than 32 characters qualify as titles.
        if len(arr) < 2 or len(arr[0]) >= 32:
            continue
        sections.insert(i - 1, (arr[0][::-1], "title"))
        i += 1
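
# Usage sketch (made-up input): for a section ("第一部分讲完了。下面举例说明:", ""),
# the short trailing clause is promoted, inserting ("下面举例说明:", "title")
# just before it.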


def hierarchical_merge(bull, sections, depth):
    # Group sections into chunks, attaching to every section the chain of
    # headings above it, using pattern family BULLET_PATTERN[bull].
    if not sections or bull < 0:
        return []
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]
    # Drop empty sections and purely numeric lines (e.g. page numbers).
    sections = [(t, o) for t, o in sections
                if t and len(t.split("@")[0].strip()) > 1
                and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
    bullets_size = len(BULLET_PATTERN[bull])
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
            return False
        if len(txt) >= 128:
            return True
        return re.search(r"[,;，。；！!]", txt)

    # levels[j] collects the indices of sections whose text matches the j-th
    # bullet pattern; layout-detected titles and plain text fall into the
    # last two buckets.
    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()) and not not_title(txt):
                levels[j].append(i)
                break
        else:
            if re.search(r"(title|head)", layout):
                levels[bullets_size].append(i)
            else:
                levels[bullets_size + 1].append(i)
    sections = [t for t, _ in sections]

    def binary_search(arr, target):
        # Index of the largest element smaller than target. arr holds
        # section indices from a different level, so target never equals
        # an element of arr.
        if not arr:
            return -1
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
            elif target < arr[i]:
                e = i
            else:
                assert False
        return s

    cks = []
    visited = [False] * len(sections)
    levels = levels[::-1]
    for i, arr in enumerate(levels[:depth]):
        for j in arr:
            if visited[j]:
                continue
            visited[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            # Walk up the heading hierarchy, collecting the nearest heading
            # that precedes this section at every higher level.
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0:
                    continue
                if jj > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])
            for ii in cks[-1]:
                visited[ii] = True
    # Re-order every chunk top-down: ancestors first, the section itself last.
    for i in range(len(cks)):
        cks[i] = [sections[j] for j in cks[i][::-1]]

    return cks
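
# Usage sketch (made-up sections, numeric-outline family bull=1, depth=2):
#   hierarchical_merge(1, ["第1章 总则", "1. 范围", "这是正文。"], 2)
# returns a single chunk, ["第1章 总则", "1. 范围", "这是正文。"], i.e. the
# body text grouped under the headings above it.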


def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
    # Greedy merge: split sections into sentences on delimiter characters
    # and append them to the current chunk until it exceeds chunk_token_num.
    if not sections:
        return []
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]
    cks = [""]
    tk_nums = [0]

    def add_chunk(t, pos):
        nonlocal cks, tk_nums, delimiter
        tnum = num_tokens_from_string(t)
        if tnum < 8:
            pos = ""
        if tk_nums[-1] > chunk_token_num:
            # Current chunk is full; start a new one.
            cks.append(t + pos)
            tk_nums.append(tnum)
        else:
            cks[-1] += t + pos
            tk_nums[-1] += tnum

    for sec, pos in sections:
        s, e = 0, 1
        while e < len(sec):
            if sec[e] in delimiter:
                add_chunk(sec[s: e + 1], pos)
                s = e + 1
                e = s + 1
            else:
                e += 1
        if s < e:
            add_chunk(sec[s: e], pos)

    return cks
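
# Usage sketch (made-up strings):
#   naive_merge(["第一句。第二句。", "第三句。"], chunk_token_num=128)
# keeps appending sentences to the current chunk until its token count
# exceeds chunk_token_num, so this short input comes back as one chunk.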