Spaces:

retopara
/

ragflow

Build error

ragflow / rag /app /__init__.py

KevinHuSh

Add Q&A and Book, fix task running bugs (#50)

e6acaf6 over 1 year ago

3.09 kB

	import re

	from nltk import word_tokenize

	from rag.nlp import stemmer, huqie

	BULLET_PATTERN = [[
	r"第[零一二三四五六七八九十百]+(编\|部分)",
	r"第[零一二三四五六七八九十百]+章",
	r"第[零一二三四五六七八九十百]+节",
	r"第[零一二三四五六七八九十百]+条",
	r"[$（][零一二三四五六七八九十百]+[$）]",
	], [
	r"[0-9]{,3}[\. 、]",
	r"[0-9]{,2}\.[0-9]{,2}",
	r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
	r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
	], [
	r"第[零一二三四五六七八九十百]+章",
	r"第[零一二三四五六七八九十百]+节",
	r"[零一二三四五六七八九十百]+[ 、]",
	r"[$（][零一二三四五六七八九十百]+[$）]",
	r"[$（][0-9]{,2}[$）]",
	] ,[
	r"PART (ONE\|TWO\|THREE\|FOUR\|FIVE\|SIX\|SEVEN\|EIGHT\|NINE\|TEN)",
	r"Chapter (I+V?\|VI*\|XI\|IX\|X)",
	r"Section [0-9]+",
	r"Article [0-9]+"
	]
	]


	def bullets_category(sections):
	global BULLET_PATTERN
	hits = [0] * len(BULLET_PATTERN)
	for i, pro in enumerate(BULLET_PATTERN):
	for sec in sections:
	for p in pro:
	if re.match(p, sec):
	hits[i] += 1
	break
	maxium = 0
	res = -1
	for i,h in enumerate(hits):
	if h <= maxium:continue
	res = i
	maxium = h
	return res

	def is_english(texts):
	eng = 0
	for t in texts:
	if re.match(r"[a-zA-Z]{2,}", t.strip()):
	eng += 1
	if eng / len(texts) > 0.8:
	return True
	return False

	def tokenize(d, t, eng):
	d["content_with_weight"] = t
	if eng:
	t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
	d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
	else:
	d["content_ltks"] = huqie.qie(t)
	d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


	def remove_contents_table(sections, eng=False):
	i = 0
	while i < len(sections):
	def get(i):
	nonlocal sections
	return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
	if not re.match(r"(contents\|目录\|目次\|table of contents\|致谢\|acknowledge)$", re.sub(r"( \| \|\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
	i += 1
	continue
	sections.pop(i)
	if i >= len(sections): break
	prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
	while not prefix:
	sections.pop(i)
	if i >= len(sections): break
	prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
	sections.pop(i)
	if i >= len(sections) or not prefix: break
	for j in range(i, min(i+128, len(sections))):
	if not re.match(prefix, get(j)):
	continue
	for _ in range(i, j):sections.pop(i)
	break