File size: 4,287 Bytes
e6acaf6
 
 
 
 
407b252
e6acaf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e0a689
 
 
 
 
 
 
 
 
 
 
 
e6acaf6
5e0a689
 
 
 
 
e6acaf6
407b252
e6acaf6
5e0a689
 
e6acaf6
 
 
 
5e0a689
 
e6acaf6
 
 
 
 
5e0a689
 
e6acaf6
5e0a689
 
e6acaf6
 
 
 
 
 
51482f3
e6acaf6
 
 
 
 
5e0a689
e6acaf6
 
 
 
 
 
 
 
 
 
 
5e0a689
 
e6acaf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e0a689
 
e6acaf6
 
5e0a689
e6acaf6
5e0a689
51482f3
e6acaf6
51482f3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import random
import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.parser import is_english, random_choices
from rag.nlp import huqie, stemmer


class Excel(object):
    """Extract question/answer pairs from the rows of an Excel workbook.

    For every row of every sheet, the first two non-empty cells are taken as
    the question and the answer. After a call, ``self.is_english`` holds the
    result of ``is_english`` applied to a sample of the extracted questions.
    """

    def __call__(self, fnm, binary=None, callback=None):
        """Parse a workbook and return its Q&A pairs.

        Args:
            fnm: Path of the workbook; only used when ``binary`` is falsy.
            binary: Raw workbook bytes; takes precedence over ``fnm``.
            callback: Optional ``callback(progress, message)`` progress hook.

        Returns:
            A list of ``(question, answer)`` string tuples, in sheet and row
            order. Rows with fewer than two non-empty cells are skipped and
            reported through ``callback``.
        """
        # The parameter is optional, so fall back to a hook that does nothing
        # instead of failing on the first progress report.
        report = callback if callback is not None else (lambda prog, msg: None)

        wb = load_workbook(BytesIO(binary)) if binary else load_workbook(fnm)
        # Total row count across sheets, used to scale the progress fraction.
        total = sum(1 for name in wb.sheetnames for _ in wb[name].rows)

        res, fails = [], []

        def summary():
            # Keep a separator after the count so that it cannot run into the
            # failure count (e.g. "999" + "3 failure" reading as "9993").
            msg = "Extract Q&A: {}. ".format(len(res))
            if fails:
                msg += "{} failure, line: {}...".format(
                    len(fails), ",".join(fails[:3]))
            return msg

        for sheetname in wb.sheetnames:
            for i, row in enumerate(wb[sheetname].rows):
                pair = []
                for cell in row:
                    value = cell.value
                    # Only truly empty cells are skipped; falsy-but-real
                    # values such as the number 0 are valid content.
                    if value is None or value == "":
                        continue
                    pair.append(str(value))
                    if len(pair) == 2:
                        break
                if len(pair) == 2:
                    res.append((pair[0], pair[1]))
                    # Report only when a new pair was added, so that failed
                    # rows do not re-trigger the same progress update.
                    if len(res) % 999 == 0:
                        report(len(res) * 0.6 / total, summary())
                else:
                    # Row numbers are 1-based within their own sheet.
                    fails.append(str(i + 1))

        report(0.6, summary())
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
        return res


# A leading question/answer label followed by one or more separators. The
# separator class accepts tab, ASCII colon, fullwidth colon (U+FF1A, written as
# an escape so it survives re-encoding of this file) and space.
_QA_PREFIX_RE = re.compile(
    r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:\uFF1A ]+",
    flags=re.IGNORECASE)


def rmPrefix(txt):
    """Return ``txt`` stripped of surrounding whitespace and a leading label.

    A label is one of the question/answer markers in ``_QA_PREFIX_RE``
    (matched case-insensitively) followed by at least one tab, colon,
    fullwidth colon or space. Text without such a label is returned with only
    the surrounding whitespace removed.

    NOTE(review): because a bare space counts as a separator and matching is
    case-insensitive, text starting with the article "A " also loses that
    word. This is existing behavior and is left unchanged here.
    """
    return _QA_PREFIX_RE.sub("", txt.strip())


def beAdoc(d, q, a, eng):
    """Fill document dict ``d`` with the fields for one Q&A pair; return it.

    ``content_with_weight`` receives the labelled question and the labelled
    answer joined by a tab, each first passed through ``rmPrefix``.
    ``content_ltks`` is built from the raw question ``q`` only: stemmed
    ``word_tokenize`` tokens joined by spaces when ``eng`` is truthy,
    otherwise ``huqie.qie(q)``. Only in the non-English case is
    ``content_sm_ltks`` set as well, from ``huqie.qieqie``.
    """
    if eng:
        question_label, answer_label = "Question: ", "Answer: "
    else:
        question_label, answer_label = "问题:", "回答:"

    labelled_question = question_label + rmPrefix(q)
    labelled_answer = answer_label + rmPrefix(a)
    d["content_with_weight"] = labelled_question + "\t" + labelled_answer

    if not eng:
        tokens = huqie.qie(q)
        d["content_ltks"] = tokens
        d["content_sm_ltks"] = huqie.qieqie(tokens)
        return d

    d["content_ltks"] = " ".join(map(stemmer.stem, word_tokenize(q)))
    return d


def chunk(filename, binary=None, callback=None, **kwargs):
    """Split a Q&A file into one document dict per question/answer pair.

    Two input kinds are recognised by file extension (case-insensitive):

    * ``.xls`` / ``.xlsx``: parsed with ``Excel``.
    * ``.txt`` / ``.csv``: UTF-8 text with one pair per line, question and
      answer separated by a tab. Fields of at most one character are
      dropped, and a line must then have exactly two fields to count.

    Args:
        filename: File name; selects the parser and, when ``binary`` is
            falsy, is the path that is read.
        binary: Optional raw file content, used instead of reading
            ``filename``.
        callback: Optional ``callback(progress, message)`` progress hook.
        **kwargs: Accepted for call compatibility; unused here.

    Returns:
        A list of dicts as produced by ``beAdoc``.

    Raises:
        NotImplementedError: If the extension is not one of the above.
    """
    # The parameter is optional, so fall back to a hook that does nothing
    # instead of failing on the first progress report.
    report = callback if callback is not None else (lambda prog, msg: None)

    def summary(done, fails):
        # Keep a separator after the count so that it cannot run into the
        # failure count (e.g. "999" + "3 failure" reading as "9993").
        msg = "Extract Q&A: {}. ".format(done)
        if fails:
            msg += "{} failure, line: {}...".format(
                len(fails), ",".join(fails[:3]))
        return msg

    res = []
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        report(0.1, "Start to parse.")
        excel_parser = Excel()
        # is_english is only set once the parser call has returned, which
        # happens before the first loop iteration.
        for q, a in excel_parser(filename, binary, report):
            res.append(beAdoc({}, q, a, excel_parser.is_english))
        return res

    if re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        report(0.1, "Start to parse.")
        if binary:
            txt = binary.decode("utf-8")
        else:
            # Same encoding as the in-memory path, independent of the locale.
            with open(filename, "r", encoding="utf-8") as f:
                txt = f.read()

        lines = txt.split("\n")
        eng = is_english([rmPrefix(l) for l in lines[:100]])
        fails = []
        for i, line in enumerate(lines):
            if not line.strip():
                # Blank lines (e.g. after the final newline) are not data and
                # must not be reported as failed pairs.
                continue
            arr = [field for field in line.split("\t") if len(field) > 1]
            if len(arr) != 2:
                # 1-based line numbers, matching the Excel parser's reports.
                fails.append(str(i + 1))
                continue
            res.append(beAdoc({}, arr[0], arr[1], eng))
            if len(res) % 999 == 0:
                report(len(res) * 0.6 / len(lines), summary(len(res), fails))

        report(0.6, summary(len(res), fails))
        return res

    raise NotImplementedError(
        "file type not supported yet(xlsx, txt, csv supported)")


if __name__ == "__main__":
    import sys

    # Command-line entry point: chunk the file named by the first argument
    # and discard all progress updates.
    chunk(sys.argv[1], callback=lambda progress, message: None)