from qwen_agent.schema import RefMaterial
from qwen_agent.utils.utils import get_keyword_by_llm, get_split_word


class SimilaritySearch:
    """Keyword-based retrieval: pick the chunks of one reference
    document that best match a query, within a token budget."""

    def run(self, line, query, max_token=4000, keyword_agent=None):
        """Select the parts of one reference document that are most
        relevant to the query, within a token budget.

        Args:
            line: dict with 'url' and 'raw'; 'raw' is a list of pages,
                each a dict with 'page_content' and 'token' (its token count).
            query: the user query to match against.
            max_token: token budget for the returned text.
            keyword_agent: optional LLM used to extract query keywords.

        Returns:
            A RefMaterial dict: {'url': ..., 'text': [...]}.
        """
        content = line['raw']
        if isinstance(content, str):
            # Normalize a plain string into the expected page format,
            # using the word count as a rough token estimate.
            content = [{
                'page_content': x,
                'token': len(get_split_word(x))
            } for x in content.split('\n') if x.strip()]
        if not content:
            return RefMaterial(url=line['url'], text=[]).to_dict()

        tokens = [page['token'] for page in content]
        all_tokens = sum(tokens)
        if all_tokens <= max_token:
            # The whole document fits within the budget: return it as-is.
            print('use full ref: ', all_tokens)
            return RefMaterial(
                url=line['url'],
                text=[page['page_content'] for page in content]).to_dict()

        wordlist = get_keyword_by_llm(query, keyword_agent)
        print('wordlist: ', wordlist)
        if not wordlist:
            return RefMaterial(url=line['url'], text=[]).to_dict()

        # Score every page against the query keywords, best first.
        sims = [(i, self.filter_section(page, wordlist))
                for i, page in enumerate(content)]
        sims.sort(key=lambda item: item[1], reverse=True)
        assert len(sims) > 0

        res = []
        max_sims = sims[0][1]
        if max_sims != 0:
            # Always keep the first pages (typically title and abstract),
            # regardless of their scores.
            manual = 2
            for i in range(min(manual, len(content))):
                res.append(content[i]['page_content'])
                max_token -= tokens[i]
            for idx, sim in sims:
                if idx < manual:
                    continue  # already added above
                page = content[idx]
                print('select: ', idx, sim)
                if max_token < tokens[idx]:
                    # Budget exhausted: keep only a prefix of this page.
                    # The 0.2 factor is a conservative tokens-to-characters
                    # safety margin; clamp at 0 in case the budget already
                    # went negative on the manually kept pages.
                    use_rate = max(0.0, (max_token / tokens[idx]) * 0.2)
                    res.append(page['page_content']
                               [:int(len(page['page_content']) * use_rate)])
                    break
                res.append(page['page_content'])
                max_token -= tokens[idx]

        return RefMaterial(url=line['url'], text=res).to_dict()

    def filter_section(self, page, wordlist):
        """Score one page by its keyword overlap with the query wordlist."""
        text = page['page_content']
        page_list = get_split_word(text)
        return self.jaccard_similarity(wordlist, page_list)

    def jaccard_similarity(self, list1, list2):
        """Set-overlap score between two word lists.

        Despite the name, this returns the raw intersection size rather
        than the normalized Jaccard index, so that long pages are not
        penalized for their length.
        """
        s1 = set(list1)
        s2 = set(list2)
        return len(s1 & s2)
        # Normalized variant: return len(s1 & s2) / len(s1 | s2)
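

# A minimal usage sketch with hypothetical data. The 'raw' page format
# (a list of dicts with 'page_content' and a 'token' count) is inferred
# from run() above. With a small document the full text is returned and
# no keyword_agent is touched; over-budget documents additionally need a
# keyword_agent for get_keyword_by_llm to extract query keywords.
if __name__ == '__main__':
    doc = {
        'url': 'https://example.com/paper',  # hypothetical source
        'raw': [
            {'page_content': 'Transformers rely on self-attention.', 'token': 6},
            {'page_content': 'Training ran on eight GPUs for a week.', 'token': 9},
        ],
    }
    searcher = SimilaritySearch()
    ref = searcher.run(doc, query='how does attention work?', max_token=4000)
    print(ref)  # -> {'url': 'https://example.com/paper', 'text': [...]}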