File size: 9,536 Bytes
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
from typing import List, Dict, Optional, Tuple
import requests
from logging import Logger

from common.configuration import SemanticChunk
from common.configuration import SegmentationSearch
from common.configuration import SummaryChunks
from common.configuration import FilterChunks
from common.configuration import RocksNNSearch
from common.configuration import PeopleChunks
from common.configuration import SearchGroupComposition


def aggregate_answers(vector_answer: Optional[Dict] = None,
                      people_answer: Optional[List] = None,
                      chunks_answer: Optional[List] = None,
                      groups_answer: Optional[List] = None,
                      rocks_nn_answer: Optional[List] = None,
                      segmentation_answer: Optional[List] = None) -> Dict:
    """Aggregate raw search responses from several indexes into one answer dict.

    Args:
        vector_answer: chunk dict from the vector (semantic) index, or None.
        people_answer: list of ES-style hits (each carrying an '_source' dict)
            describing people, or None.
        chunks_answer: chunk dict from the keyword/chunk index, or None.
        groups_answer: list of ES-style hits for a group; only the first hit is used.
        rocks_nn_answer: list of ES-style hits with 'division_name' and
            'company_name' in '_source'; only the first hit is used.
        segmentation_answer: list of ES-style hits for segmentation; first hit used.

    Returns:
        Dict with any of the keys 'doc_chunks', 'people_search', 'groups_search',
        'rocks_nn_search', 'segmentation_search'; each key is present only when
        the corresponding input was provided (and non-empty, for the
        single-hit sources).
    """
    answer = {}
    if vector_answer is not None or chunks_answer is not None:
        answer['doc_chunks'] = combine_answer([vector_answer, chunks_answer])
    if people_answer is not None:
        # Preserved `is not None`: an empty list still yields an empty
        # 'people_search' entry, as before.
        answer['people_search'] = [PeopleChunks(**hit['_source']) for hit in people_answer]
    # For the single-hit sources below, guard against an empty list as well as
    # None: `is not None` alone would let `[...][0]` raise IndexError on `[]`.
    if groups_answer:
        answer['groups_search'] = SearchGroupComposition(**groups_answer[0]['_source'])
    if rocks_nn_answer:
        answer['rocks_nn_search'] = RocksNNSearch(
            division=rocks_nn_answer[0]['_source']['division_name'],
            company_name=rocks_nn_answer[0]['_source']['company_name'])
    if segmentation_answer:
        answer['segmentation_search'] = SegmentationSearch(**segmentation_answer[0]['_source'])

    return answer


def combine_answer(answers):
    """Merge chunk dicts from several search sources, grouped by document.

    Args:
        answers: iterable of chunk dicts (or None entries, which are skipped);
            each dict maps a key to a chunk record with at least 'doc_name',
            'title', 'id' and 'start_index_paragraph' fields.

    Returns:
        List of FilterChunks, one per distinct document name; a chunk whose
        'start_index_paragraph' was already seen is appended only when its
        document is new.
    """
    combined = []
    seen_docs = []      # document names, parallel to `combined`
    seen_offsets = []   # every start_index_paragraph encountered so far
    for source in answers:
        if source is None:
            continue
        for record in source.values():
            doc_name = record["doc_name"]
            if doc_name in seen_docs:
                # Known document: add the chunk only at a new paragraph offset.
                if record['start_index_paragraph'] not in seen_offsets:
                    position = seen_docs.index(doc_name)
                    combined[position].chunks.append(SemanticChunk(**record))
            else:
                # First time we see this document: start a new FilterChunks.
                combined.append(FilterChunks(
                    id=str(record['id']),
                    filename=doc_name,
                    title=record["title"],
                    chunks=[SemanticChunk(**record)]))
                seen_docs.append(doc_name)
            seen_offsets.append(record['start_index_paragraph'])
    return combined


def preprocessed_chunks(answer_chunks: SummaryChunks, llm_host_tokens: str, logger: Logger) -> str:
    """Render aggregated search results into one numbered-document prompt string.

    Each source (doc chunks, people, groups, ROCKS NN, segmentation) becomes a
    'Документ: [N]' section; sections are separated by a '\\\\' delimiter line.
    Document chunks are truncated once a rough token estimate exceeds 20k.

    Args:
        answer_chunks: aggregated search results to format.
        llm_host_tokens: URL of a token-counting endpoint; currently unused
            (see TODO below — a character-count heuristic is used instead).
        logger: logger used to report when the token budget is exceeded.

    Returns:
        The assembled prompt text, with stray glyphs normalized to '-'.
    """
    output_text = ''
    count = 0           # number of "documents" emitted so far (used for [N] labels)
    count_tokens = 0
    if answer_chunks.doc_chunks is not None:
        for doc in answer_chunks.doc_chunks:
            output_text += f'Документ: [{count + 1}]\n'
            # Prefer the human-readable title; fall back to the filename.
            if doc.title != 'unknown':
                output_text += f'Название документа: {doc.title}\n'
            else:
                output_text += f'Название документа: {doc.filename}\n'
            for chunk in doc.chunks:
                if len(chunk.other_info):
                    output_text += '...\n'
                    for i in chunk.other_info:
                        # NOTE(review): the first replace() argument renders as an
                        # empty string here — str.replace('', '-') interleaves '-'
                        # between every character; possibly a lost private-use
                        # glyph (cf. '\uf02d' handled at the end) — confirm.
                        output_text += f'{i}'.replace('', '-')
                    output_text += '...\n'
                else:
                    output_text += '...\n'
                    output_text += f'{chunk.text_answer}'
                    output_text += '...\n'
                count_tokens = len(output_text) * 2  # rough heuristic: ~2 tokens per character
                # TODO: deepinfra has no token-count endpoint; a tokenizer needs to be passed through.
                #len(requests.post(url=f'{llm_host_tokens}', json={"content": output_text}).json()['tokens'])
                if count_tokens > 20000:
                    logger.info('Количество токенов превысило значение 20k! Оставшиеся чанки отброшены!')
                    break

            # Budget exhausted: close the current section and stop adding documents.
            if count_tokens > 20000:
                output_text += '\n\\\n\n'
                count += 1
                break

            output_text += '\n\\\n\n'
            count += 1

    if answer_chunks.people_search is not None:
        for doc in answer_chunks.people_search:
            output_text += f'Документ: [{count + 1}]\n'
            output_text += f'Название документа: Информация о сотруднике {doc.person_name}\n'
            output_text += f'Информация о сотруднике {doc.person_name}\n'
            # NOTE: 'organizatinal_structure' is the (misspelled) field name on the model.
            if doc.organizatinal_structure is not None:
                for organizatinal_structure in doc.organizatinal_structure:
                    output_text += '[\n'
                    if organizatinal_structure.position != 'undefined':
                        output_text += f'Должность: {organizatinal_structure.position}'
                    if organizatinal_structure.leads is not None:
                        output_text += f'\nРуководит следующими сотрудниками:\n'
                        for lead in organizatinal_structure.leads:
                            if lead.person != "undefined":
                                output_text += f'{lead.person}\n'
                    if organizatinal_structure.subordinates is not None:
                        if organizatinal_structure.subordinates.person_name != "undefined":
                            output_text += f'Руководителем {doc.person_name} является {organizatinal_structure.subordinates.person_name}'
                    output_text += '\n]\n'

            if doc.business_processes is not None:
                # Singular vs plural wording depending on the number of processes.
                if len(doc.business_processes) >= 2:
                    output_text += f'Отвечает за Бизнес процессы:\n'
                else:
                    output_text += f'Отвечает за Бизнес процесс: '
                for process in doc.business_processes:
                    output_text += f'{process.processes_name}\n'
            if doc.business_curator is not None:
                output_text += 'Является Бизнес-куратором (РОКС НН):\n'
                for curator in doc.business_curator:
                    output_text += f'{curator.company_name}\n'
            if doc.groups is not None:
                output_text += '\nВходит в состав групп, комитетов, координационных советов (КО):\n'
                for group in doc.groups:
                    # Normalize the plural role label "Члены" to singular "Член".
                    if 'Члены' in group.position_in_group:
                        output_text += f'{group.group_name}. Должность внутри группы: {group.position_in_group.replace("Члены", "Член")}\n'
                    else:
                        output_text += f'{group.group_name}. Должность внутри группы: {group.position_in_group}\n'
            output_text += f'\n\\\n\n'
            count += 1

    if answer_chunks.groups_search is not None:
        output_text += f'Документ: [{count + 1}]\n'
        output_text += f'Название документа: Информация о группе\n'
        output_text += f'Название группы: {answer_chunks.groups_search.group_name}\n'
        # Emit a table header only when there is more than one member row.
        if len(answer_chunks.groups_search.group_composition) > 1:
            output_text += f'\t ФИО \t\t\t| Должность внутри группы\n'
        for person_data in answer_chunks.groups_search.group_composition:
            if 'Члены' in person_data.position_in_group:
                output_text += f'{person_data.person_name:<{20}}| {person_data.position_in_group.replace("Члены", "Член")}\n'
            else:
                output_text += f'{person_data.person_name:<{20}}| {person_data.position_in_group}\n'
        output_text += f'\n\\\n\n'
        count += 1

    if answer_chunks.rocks_nn_search is not None:
        output_text += f'Документ: [{count + 1}]\n'
        output_text += f'Название документа: Информация о {answer_chunks.rocks_nn_search.division}\n'
        output_text += f'Название документа: В РОКС НН {answer_chunks.rocks_nn_search.division} входят:\n'
        for company_name in answer_chunks.rocks_nn_search.company_name:
            output_text += f'{company_name}\n'
        output_text += f'\n\\\n\n'
        count += 1

    if answer_chunks.segmentation_search is not None:
        output_text += f'Документ: [{count + 1}]\n'
        output_text += f'Название документа: {answer_chunks.segmentation_search.segmentation_model}\n'
        output_text += f'Название документа: В {answer_chunks.segmentation_search.segmentation_model} входят:\n'
        for company_name in answer_chunks.segmentation_search.company_name:
            output_text += f'{company_name}\n'
        output_text += f'\n\\\n\n'
        count += 1

    # Normalize PDF-extraction bullet glyphs to '-'.
    # NOTE(review): the second replace() pattern renders as an empty string —
    # str.replace('', '-') would interleave '-' between every character of the
    # whole output; likely another lost private-use glyph — confirm the literal.
    output_text = output_text.replace('\uf02d', '-').replace('', '-')
    return output_text