Add markdown support for QA parser (#1180)
Browse files

### What problem does this PR solve?
Add markdown support for QA parser
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- rag/app/qa.py +54 -3
rag/app/qa.py
CHANGED
|
@@ -145,6 +145,10 @@ def beAdoc(d, q, a, eng):
|
|
| 145 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 146 |
return d
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
| 150 |
"""
|
|
@@ -214,6 +218,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 214 |
|
| 215 |
return res
|
| 216 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
|
|
|
| 217 |
pdf_parser = Pdf()
|
| 218 |
count = 0
|
| 219 |
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
|
@@ -225,10 +230,58 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
| 225 |
count += 1
|
| 226 |
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
| 227 |
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
|
| 230 |
raise NotImplementedError(
|
| 231 |
-
"Excel
|
| 232 |
|
| 233 |
|
| 234 |
if __name__ == "__main__":
|
|
@@ -236,6 +289,4 @@ if __name__ == "__main__":
|
|
| 236 |
|
| 237 |
def dummy(prog=None, msg=""):
|
| 238 |
pass
|
| 239 |
-
import json
|
| 240 |
-
|
| 241 |
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
|
|
|
| 145 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 146 |
return d
|
| 147 |
|
| 148 |
+
def mdQuestionLevel(s):
    """Split a markdown line into its heading level and heading text.

    The level is the number of consecutive ``#`` characters at the very
    start of ``s`` (0 when the line does not start with ``#``). No upper
    bound is applied here, so a line with seven or more leading hashes
    reports that count as-is.

    Args:
        s: A single line of text.

    Returns:
        A ``(level, text)`` tuple, where ``text`` is ``s`` with the leading
        ``#`` run removed and any whitespace that follows it stripped from
        the left. For a level-0 line only leading whitespace is removed.
    """
    # Counting via lstrip avoids the regex; the previous `re.match(r'#*', s)`
    # always matched (the pattern accepts the empty string), which made its
    # "no match" fallback branch unreachable.
    without_hashes = s.lstrip('#')
    level = len(s) - len(without_hashes)
    return level, without_hashes.lstrip()
|
| 151 |
+
|
| 152 |
|
| 153 |
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
| 154 |
"""
|
|
|
|
| 218 |
|
| 219 |
return res
|
| 220 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 221 |
+
callback(0.1, "Start to parse.")
|
| 222 |
pdf_parser = Pdf()
|
| 223 |
count = 0
|
| 224 |
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
|
|
|
| 230 |
count += 1
|
| 231 |
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
| 232 |
return res
|
| 233 |
+
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
| 234 |
+
callback(0.1, "Start to parse.")
|
| 235 |
+
txt = ""
|
| 236 |
+
if binary:
|
| 237 |
+
encoding = find_codec(binary)
|
| 238 |
+
txt = binary.decode(encoding, errors="ignore")
|
| 239 |
+
else:
|
| 240 |
+
with open(filename, "r") as f:
|
| 241 |
+
while True:
|
| 242 |
+
l = f.readline()
|
| 243 |
+
if not l:
|
| 244 |
+
break
|
| 245 |
+
txt += l
|
| 246 |
+
lines = txt.split("\n")
|
| 247 |
+
comma, tab = 0, 0
|
| 248 |
+
last_question, last_answer = "", ""
|
| 249 |
+
question_stack, level_stack = [], []
|
| 250 |
+
code_block = False
|
| 251 |
+
level_index = [-1] * 7
|
| 252 |
+
for index, l in enumerate(lines):
|
| 253 |
+
if not l.strip():
|
| 254 |
+
continue
|
| 255 |
+
if l.strip().startswith('```'):
|
| 256 |
+
code_block = not code_block
|
| 257 |
+
question_level, question = 0, ''
|
| 258 |
+
if not code_block:
|
| 259 |
+
question_level, question = mdQuestionLevel(l)
|
| 260 |
+
|
| 261 |
+
if not question_level or question_level > 6: # not a question
|
| 262 |
+
last_answer = f'{last_answer}\n{l}'
|
| 263 |
+
else: # is a question
|
| 264 |
+
if last_answer:
|
| 265 |
+
sum_question = ('\n').join(question_stack)
|
| 266 |
+
if sum_question:
|
| 267 |
+
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
| 268 |
+
last_answer = ''
|
| 269 |
+
|
| 270 |
+
i = question_level
|
| 271 |
+
while question_stack and i <= level_stack[-1]:
|
| 272 |
+
question_stack.pop()
|
| 273 |
+
level_stack.pop()
|
| 274 |
+
question_stack.append(question)
|
| 275 |
+
level_stack.append(question_level)
|
| 276 |
+
if last_answer:
|
| 277 |
+
sum_question = ('\n').join(question_stack)
|
| 278 |
+
if sum_question:
|
| 279 |
+
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
| 280 |
+
return res
|
| 281 |
|
| 282 |
|
| 283 |
raise NotImplementedError(
|
| 284 |
+
"Excel, csv(txt), pdf and markdown format files are supported.")
|
| 285 |
|
| 286 |
|
| 287 |
if __name__ == "__main__":
|
|
|
|
| 289 |
|
| 290 |
def dummy(prog=None, msg=""):
|
| 291 |
pass
|
|
|
|
|
|
|
| 292 |
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|