dmibor committed
Commit 61cffc1 · 1 Parent(s): cd39c4a

Main search and question-answering code

Files changed (3)
  1. __pycache__/search_core.cpython-312.pyc +0 -0
  2. app.py +43 -71
  3. search_core.py +632 -0
__pycache__/search_core.cpython-312.pyc ADDED
Binary file (28.2 kB).
app.py CHANGED
@@ -4,87 +4,59 @@ import torch
  import numpy as np
  import os
  import glob
- from pathlib import Path
- from PIL import Image
- import chromadb
- import boto3
- import botocore
- from io import BytesIO
+ from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
+ import pandas as pd
+ import json

+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model= None
+ #try:
+     #model, preprocess = clip.load("ViT-B/32", device=device)
+ #except:
+     #st.write("Exception loading model")
+
+
+ # Setting page layout
  st.set_page_config(
-     page_title="Поиск изображений Google Open Images по текстовому запросу",
+     page_title="Поиск по публикации/вопросы-ответы",
      page_icon="🤖",
      layout="wide",
      initial_sidebar_state="expanded"
  )
-
- BUCKET_NAME = 'open-images-dataset'
- DOWNLOAD_FOLDER='ds_download'
- SPLIT='validation'
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model= None
- preprocess = None
- try:
-     model, preprocess = clip.load("ViT-B/32", device=device)
- except:
-     st.write("Exception loading model")
-
- collection_path = "image_embeddings_collection.chroma"
- chroma_client = chromadb.PersistentClient(path=collection_path)
- image_collection= None
- try:
-     image_collection = chroma_client.get_or_create_collection("image" , metadata={"hnsw:space": "cosine"})
- except:
-     st.write("Exception loading collection")
-
- #if os.path.isdir(DOWNLOAD_FOLDER)==False:
-     #os.mkdir(DOWNLOAD_FOLDER)
-
- num_embeddings = image_collection.count()
-
  # Main page heading
- st.title("Поиск изображений Google Open Images по текстовому запросу")
+ #st.title("Поиск изображений Google Open Images по текстовому запросу")

+ searchStarted= False
+ qaStarted= False
  # Sidebar
- st.sidebar.header("Настройки поиска")
+ with st.sidebar:
+     tab1, tab2 = st.sidebar.tabs(["Поиск по публикации", "Вопросы-ответы"])
+
+     with tab1:
+         st.header("Поиск по публикации")
+         search_input = st.text_input(label='Введите запрос:', value='аккумуляторная батарея')
+         searchStarted = st.button('Искать')
+
+     with tab2:
+         st.header("Вопросы-ответы")
+         qa_input = st.text_input(label='Введите вопрос:', value='Какой ресурс до первого ремонта?')
+         qaStarted = st.button('Узнать ответ')
+
+ if searchStarted==True:
+     st.header("Результаты поиска")
+     search_result= search_query_all(search_input)
+     df = pd.DataFrame(pd.json_normalize(search_result))
+     df.columns=['Параграф модуля данных', 'Код МД']
+     st.table(df)
+ if qaStarted==True:
+     st.header("Ответ")
+     mode_string = 'strict'
+     model_string = '1'
+     answer= answer_question(qa_input, mode_string, model_string)
+     df = pd.DataFrame(pd.json_normalize(answer))
+     df.columns=['Уверенность', 'Ответ', 'Код МД']
+     st.table(df)


- st.sidebar.write(f"Количество изображений в БД: {num_embeddings}")
- text_input = st.sidebar.text_input(label='Введите запрос:', value='kite in the sky')
- search_files_cnt = int(st.sidebar.slider(label="Количество изображений", min_value=1, max_value=10, value=2))
- searchStarted = st.sidebar.button('Искать')

- col1, col2 = st.columns(2)
- if searchStarted==True:
-     text_embedding = clip.tokenize(text_input).to(device)
-     text_features = model.encode_text(text_embedding).detach().cpu().numpy()
-     result = image_collection.query(text_features, n_results=search_files_cnt)
-     bucket = boto3.resource('s3',
-                             config=botocore.config.Config(
-                                 signature_version=botocore.UNSIGNED)).Bucket(BUCKET_NAME)
-     cnt=0
-     for i in result['metadatas'][0]:
-         try:
-             filename= Path(i['name'])
-             image_id= filename.with_suffix('')
-             filepath= i['path']
-             #down_file_path= os.path.join(DOWNLOAD_FOLDER, f'{image_id}.jpg')
-             #bucket.download_file(f'{SPLIT}/{image_id}.jpg', down_file_path)
-             #img = Image.open(down_file_path)
-             object_key= f'{SPLIT}/{image_id}.jpg'
-             image_data = BytesIO()
-             bucket.download_fileobj(object_key, image_data)
-             image_data.seek(0)
-             img = Image.open(image_data)
-             col_ref= col1
-             if ((cnt+1) % 2) == 0:
-                 col_ref= col2
-             with col_ref:
-                 st.write('image_id:', image_id)
-                 st.write('distance:', result['distances'][0][cnt])
-                 st.image(img, use_column_width=True)
-         except botocore.exceptions.ClientError as exception:
-             st.write(str(exception))
-         cnt=cnt+1

 
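For context, search_query_all and answer_question return plain lists of dicts, which app.py flattens with pd.json_normalize before handing them to st.table. A minimal sketch of the shapes involved, using placeholder values rather than real data from the publication:

import pandas as pd

# Placeholder return values, shaped like the dicts built in search_core.py.
search_result = [{"text": "<data module paragraph>", "DMC": "DMC-<data module code>"}]
answer = [{"score": 0.87, "answer": "<extracted answer>", "DMC": "DMC-<data module code>"}]

# The same normalization app.py applies before st.table().
search_table = pd.DataFrame(pd.json_normalize(search_result))
search_table.columns = ["Параграф модуля данных", "Код МД"]
answer_table = pd.DataFrame(pd.json_normalize(answer))
answer_table.columns = ["Уверенность", "Ответ", "Код МД"]
print(search_table, answer_table, sep="\n")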
search_core.py ADDED
@@ -0,0 +1,632 @@
+ import os
+ import lxml.etree as ET
+ import pandas as pd
+
+ import numpy as np
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ import spacy
+ from tqdm import tqdm
+ import configparser
+ import pickle
+ import re
+ from transformers import pipeline
+ import torch
+ from tqdm import tqdm
+
+ import Stemmer
+ #stemmer= Stemmer.Stemmer('ru')#russian
+ stemmer= Stemmer.Stemmer('en')#english
+ import json
+
+ #exclude_tags=['graphic', 'figure']
+ #include_tags=['note', 'notePara', 'para']
+
+ exclude_tags=['graphic']
+ include_tags=['note', 'notePara', 'para', 'title', 'warningAndCautionPara', 'techName', 'infoName']
+ add_colon_tags=['title', 'techName']
+ make_lower_parent_tags=['listItemDefinition']
+ PARSE_PATHS=['//dmodule/content[last()]/procedure[last()]/preliminaryRqmts[last()]',
+              '//dmodule/content[last()]/procedure[last()]/mainProcedure[last()]',
+              '//dmodule/content[last()]/description[last()]',
+              '//dmodule/content[last()]/crew[last()]/crewRefCard[last()]/crewDrill[last()]',
+              '//dmodule/identAndStatusSection[last()]/dmAddress[last()]/dmAddressItems[last()]/dmTitle[last()]']
+
+ PERSCENTAGE_IN_RATIO=0.5
+ THRESHOLD=0.1
+
+ global nlp, tokenizer_search, tokenizer_qa, device
+ global search_df, qa_df, SEARCH_DATA
+ global index_data_loaded, qa_index_data_loaded, qa_model_initialized
+ global qa_model, qa_model_num
+
+ PUBLICATION_DEMO_RU_PATH="publications/Demo publication in Russian"
+ PUBLICATION_DEMO_EN_PATH="publications/Bike Data Set for Release number 5.0"
+ PUBLICATION_PATH=PUBLICATION_DEMO_RU_PATH
+ TOKENIZER_SEARCH_FILENAME='tokenizer_search.pickle'
+ TOKENIZER_QA_FILENAME='tokenizer_qa.pickle'
+ INDEX_FOLDER= PUBLICATION_PATH+ os.sep+ "index"
+ #print('INDEX_FOLDER:', INDEX_FOLDER)
+ TOKENIZER_SEARCH_PATH= INDEX_FOLDER+ os.sep+ TOKENIZER_SEARCH_FILENAME
+ TOKENIZER_QA_PATH= INDEX_FOLDER+ os.sep+ TOKENIZER_QA_FILENAME
+ #print('TOKENIZER_SEARCH_PATH:', TOKENIZER_SEARCH_PATH)
+ PUBLICATION_LANGUAGE="ru"
+
+ nlp=None
+ search_df=None
+ qa_df=None
+ index_data_loaded=False
+ qa_index_data_loaded=False
+ SEARCH_DATA= None
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ qa_model_initialized=False
+
+
+ def get_xpath_one(tree, xpath):
+     res = tree.xpath(xpath)
+     if res:
+         return res[0]
+
+ def get_dmc(doc):
+     dmc=""
+     node= get_xpath_one(doc, '//dmCode')
+     dmc='DMC-'+'-'.join([node.get('modelIdentCode'), \
+                          node.get('itemLocationCode'), \
+                          node.get('systemCode'), \
+                          node.get('subSystemCode')+node.get('subSubSystemCode'), \
+                          node.get('assyCode'),\
+                          node.get('disassyCode')+node.get('disassyCodeVariant'),\
+                          node.get('infoCode')+node.get('infoCodeVariant'),\
+                          node.get('systemDiffCode')])
+
+     #print('dmc: ', dmc)
+     return dmc
+
+ def is_float(string):
+     if string.replace(".", "").replace(",", "").replace("+", "").replace("-", "").isnumeric():
+         return True
+     else:
+         return False
+
+
+ def stringify_children(node, texts, pis, excludeDigits=True):
+     s = node.text
+     if (s != None) and (s.isspace()==False):
+         if excludeDigits:
+             if is_float(s)==False:
+                 texts.add(s)
+         else:
+             texts.add(s)
+     for child in node:
+         if child.tag not in exclude_tags:
+             if child not in pis:
+                 stringify_children(child, texts, pis)
+     return
+
+ def stringify_children_incl(node, texts, pis, make_lower=False):
+     ET.strip_tags(node, 'internalRef')
+     ET.strip_tags(node, 'emphasis')
+     s = node.text
+     if s and make_lower==True:
+         s= s.lower()
+     if s and node.tag in add_colon_tags:
+         s=s+':'
+     #print('s', s)
+     clear_s= clear_text(s)
+     if (s != None) and (s.isspace()==False) and (clear_s!='') and (clear_s):
+         print('s:', s)
+         print('clear_text(s):', clear_text(s))
+         texts.append(s)
+
+     for child in node:
+         #print('child.tag:', child.tag)
+         if (len(child.getchildren())>0) or (child.tag in include_tags):
+             if (child not in pis) and (child.tag not in exclude_tags):
+                 make_lower=False
+                 if node.tag in make_lower_parent_tags:
+                     make_lower=True
+                 stringify_children_incl(child, texts, pis, make_lower)
+     return
+
+ def clear_text(text):
+     #print('clear_text!')
+     clean_text = re.sub(r'(?:(?!\u0301)[\W\d_])+', ' ', str(text).lower())
+     return clean_text
+
+ def lemmatize_and_stemm(df_r):
+     global nlp
+     #print('lemmatize_and_stemm!')
+     disabled_pipes = [ "parser", "ner"]
+     if PUBLICATION_LANGUAGE=="ru":
+         nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
+     else:
+         nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
+
+     lemm_texts = []
+     stem_texts=[]
+
+     for doc in tqdm(nlp.pipe(df_r['lemm_text'].values, disable = disabled_pipes), total=df_r.shape[0]):
+         lemm_text = " ".join([i.lemma_ for i in doc])
+         lemm_texts.append(lemm_text)
+         stem_text = " ".join([stemmer.stemWord(i.text) for i in doc])
+         stem_texts.append(stem_text)
+
+     df_r['lemm_text']= lemm_texts
+     df_r['stem_text']= stem_texts
+     df_r=df_r.drop_duplicates()
+     #print('lemmatization and stemming success!')
+     return
+
+ def tokenize_text(df_r, save_filename):
+     #global tokenizer_search
+     #print('tokenize_text!')
+
+     #try:
+         #with open('tokenizer.pickle', 'rb') as handle:
+             #tokenizer = pickle.load(handle)
+         #print('tokenizer loaded from file')
+     #except Exception as e:
+     tokenizer = Tokenizer(oov_token='<oov>')
+     print('tokenizer created')
+
+     texts= pd.concat([df_r['lemm_text'],df_r['stem_text']])
+     tokenizer.fit_on_texts(texts)
+     total_words = len(tokenizer.word_index) + 1
+     print("Total number of words: ", total_words)
+     with open(save_filename, 'wb') as handle:
+         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
+     return tokenizer
+
+ def make_final_index(df_r, tokenizer, index_filename='search_index.csv', includePlainText=True):
+     print('make_final_index!')
+     tokens=[]
+     labels=[]
+     dmcs=[]
+     texts=[]
+     for index, row in tqdm(df_r.iterrows()):
+         #print('row:', row)
+         text= row['text']
+         lemm_token= tokenizer.texts_to_sequences([row['lemm_text']])[0]
+         stem_token= tokenizer.texts_to_sequences([row['stem_text']])[0]
+         dmc= row['DMC']
+         #print(str(row['label_enc'])+':'+dmc)
+         tokens.append(lemm_token)
+         labels.append(row['label_enc'])
+         dmcs.append(dmc)
+         texts.append(text)
+         tokens.append(stem_token)
+         labels.append(row['label_enc'])
+         dmcs.append(dmc)
+         texts.append(text)
+     columns= ['tokens', 'labels', 'DMC']
+     data= {'tokens': tokens, 'labels': labels, 'DMC': dmcs}
+     if includePlainText==True:
+         columns= ['tokens', 'labels', 'text', 'DMC']
+         data= {'tokens': tokens, 'labels': labels, 'text': texts, 'DMC': dmcs}
+     s_df= pd.DataFrame(columns=columns, data= data)
+     s_df= s_df.loc[s_df.astype(str).drop_duplicates().index]
+     print('final index info:')
+     print(s_df.info())
+     s_df.to_csv(index_filename, sep=';', index=False)
+     #print(f'results saved to {index_filename}')
+     return s_df
+
+ def make_search_index(path):
+     global nlp, tokenizer_search, search_df, index_data_loaded
+     #print('make_search_index!')
+     directory= path.replace('"', '')
+     #print(f'path: {directory}')
+     df_r= pd.DataFrame(columns=['text'])
+
+     for file in os.listdir(directory):
+         filename = file#os.fsdecode(file)
+         if 'PMC' in filename:
+             continue
+         #print('filename: ', filename)
+         if filename.lower().endswith(".xml")==False:
+             continue
+         filepath= directory+ os.sep+ filename
+         print('filepath:', filepath)
+
+         doc= ET.parse(filepath)
+         dmc= get_dmc(doc)
+
+         file_texts=set()
+         pis = doc.xpath("//processing-instruction()")
+         for node in doc.xpath('//dmodule'):
+             stringify_children(node, file_texts, pis)
+
+         #print('file_texts:', file_texts)
+         df= pd.DataFrame(columns=['text'], data= file_texts)
+         df['DMC']= dmc
+         df_r= pd.concat([df_r, df], ignore_index=True)
+     print('parsing results:')
+     print(df_r.info())
+     #PARSING_INDEX_FILENAME='strings_with_DMC.csv'
+     #print(f'parsing results saved to: {PARSING_INDEX_FILENAME}')
+     #df_r.to_csv(PARSING_INDEX_FILENAME, index=False, sep = ';')
+
+     df_r['lemm_text']=df_r['text'].apply(clear_text)
+     lemmatize_and_stemm(df_r)
+     df_r= df_r.reset_index(drop=True)
+     df_r['label_enc']= df_r.index
+     tokenizer_search= tokenize_text(df_r, TOKENIZER_SEARCH_PATH)
+     #print('tokenizer before make_final_index:', tokenizer_search)
+     search_df= make_final_index(df_r, tokenizer_search)
+     index_data_loaded= True
+     return len(search_df)
+
+ def make_search_index_qa(path):
+     global nlp, tokenizer_qa, qa_df, qa_index_data_loaded
+     #print('make_search_index_qa!')
+     directory= path.replace('"', '')
+     #print(f'path: {directory}')
+     df_r= pd.DataFrame(columns=['text'])
+
+     for file in os.listdir(directory):
+         filename = file#os.fsdecode(file)
+         if 'PMC' in filename:
+             continue
+         #print('filename: ', filename)
+         if filename.lower().endswith(".xml")==False:
+             continue
+         filepath= directory+ os.sep+ filename
+         #print('filepath:', filepath)
+
+         doc= ET.parse(filepath)
+         dmc= get_dmc(doc)
+
+         paths= PARSE_PATHS
+
+         pis = doc.xpath("//processing-instruction()")
+         for pi in pis:
+             if pi.getparent()!=None:
+                 ET.strip_tags(pi.getparent(), pi.tag)
+
+         cntr=1
+         for expr in paths:
+             try:
+                 x_path_result = doc.xpath(expr)
+             except ET.XPathEvalError:
+                 continue
+
+             if not x_path_result:
+                 continue
+             file_texts=[]
+             dmc_with_chapter= f'{dmc}({cntr})'
+             for node in x_path_result:#doc.xpath(expr):
+                 stringify_children_incl(node, file_texts, pis)
+             cntr=cntr+1
+             #print('file_texts:',file_texts)
+             #print('file_texts len:',len(file_texts))
+             if len(file_texts)==0:
+                 continue
+             concat_texts=[' \n '.join(file_texts)]
+             #print('file_texts:', file_texts)
+
+             #df= pd.DataFrame(columns=['text'], data= file_texts)
+             df= pd.DataFrame(columns=['text'], data= concat_texts)
+             df['DMC']= dmc_with_chapter
+             df_r= pd.concat([df_r, df], ignore_index=True)
+     #print('parsing results:')
+     #print(df_r.info())
+     #PARSING_INDEX_FILENAME='strings_with_DMC.csv'
+     #print('parsing results saved to: {PARSING_INDEX_FILENAME}')
+     #df_r.to_csv(PARSING_INDEX_FILENAME, index=False, sep = ';')
+
+     df_r['lemm_text']=df_r['text'].apply(clear_text)
+     lemmatize_and_stemm(df_r)
+     df_r= df_r.reset_index(drop=True)
+     df_r['label_enc']= df_r.index
+     tokenizer_qa= tokenize_text(df_r, TOKENIZER_QA_PATH)
+     qa_df= make_final_index(df_r, tokenizer_qa, index_filename='qa_index.csv')
+     qa_index_data_loaded= True
+     return len(qa_df)
+
+ def convert2list(string):
+     x = json.loads(string)
+     lst=[]
+     for n in x:
+         #print(x)
+         lst.append(int(n))
+     return lst
+
+ def load_index_data():
+     global nlp, tokenizer_search, search_df, index_data_loaded
+     print('load_index_data!')
+     #spacy
+     disabled_pipes = [ "parser", "ner"]
+     if PUBLICATION_LANGUAGE=="ru":
+         nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
+     else:
+         nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
+     #print('spacy loaded:', nlp)
+     #tokenizer
+     with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
+         tokenizer_search = pickle.load(handle)
+     #print('tokenizer loaded:', tokenizer)
+     #index
+     search_index_path= INDEX_FOLDER+os.sep+'search_index.csv'
+     search_df= pd.read_csv(search_index_path, sep=';')
+     print('index file loaded:', search_df.info())
+     search_df['tokens']= search_df['tokens'].apply(convert2list)
+     index_data_loaded= True
+     return nlp, tokenizer_search, search_df
+
+ def load_index_data_qa():
+     global nlp, tokenizer_qa, qa_df, qa_index_data_loaded
+     #print('load_index_data_qa!')
+     #spacy
+     disabled_pipes = [ "parser", "ner"]
+     if PUBLICATION_LANGUAGE=="ru":
+         nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
+     else:
+         nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
+     print('spacy loaded:', nlp)
+     #tokenizer
+     with open(TOKENIZER_QA_PATH, 'rb') as handle:
+         tokenizer_qa = pickle.load(handle)
+     #print('tokenizer loaded:', tokenizer_qa)
+     #index
+     qa_index_path= INDEX_FOLDER+os.sep+'qa_index.csv'
+     qa_df= pd.read_csv(qa_index_path, sep=';')
+     #print('index qa file loaded:', qa_df.info())
+     qa_df['tokens']= qa_df['tokens'].apply(convert2list)
+     qa_index_data_loaded= True
+     return nlp, tokenizer_qa, qa_df
+
+ def customIsIn(x , tokens):
+     result= False
+     cnt_in=0
+     for val in x:
+         if val in tokens:
+             cnt_in+=1
+     PERSCENTAGE_IN= cnt_in/len(tokens)
+     if PERSCENTAGE_IN>=PERSCENTAGE_IN_RATIO:
+         return True
+     return result
+
+ def get_lemmed_stemmed_text(text):
+     global nlp
+     #print('nlp loaded or not:', nlp)
+     if PUBLICATION_LANGUAGE=="ru":
+         spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
+     else:
+         spacy_stopwords = nlp.Defaults.stop_words #english
+     #print('spacy_stopwords:', spacy_stopwords)
+     doc = nlp(clear_text(text))
+     # Remove stop words
+     doc_cleared = [token for token in doc if not token.is_stop]
+     #print('doc_cleared:', doc_cleared)
+     lemm_text = " ".join([i.lemma_ for i in doc_cleared if not i.lemma_ in spacy_stopwords])
+     print(f'lemm_text: {lemm_text}')
+     stem_text = " ".join([stemmer.stemWord(i.text) for i in doc_cleared if not stemmer.stemWord(i.text) in spacy_stopwords])
+     print(f'stem_text: {stem_text}')
+     return lemm_text, stem_text
+
+ def search_query_any(query, df=None, tokenizer=None):
+     global SEARCH_DATA, search_df, index_data_loaded
+     print('search_query_any!')
+     print(f'query: {query}')
+     if index_data_loaded==False:
+         load_index_data()
+     SEARCH_DATA= df
+     if df is None:
+         if index_data_loaded==False:
+             load_index_data()
+         SEARCH_DATA=search_df
+     lemm_text, stem_text= get_lemmed_stemmed_text(query)
+     if tokenizer==None:
+         tokenizer= tokenizer_search
+     token_list = tokenizer.texts_to_sequences([lemm_text])[0]
+     #print(f'token_list: {token_list}')
+     token_list_stem = tokenizer.texts_to_sequences([stem_text])[0]
+     #print(f'token_list stem: {token_list_stem}')
+
+     mask1 = SEARCH_DATA.tokens.apply(lambda x: customIsIn(x, token_list))
+     indexes1= SEARCH_DATA[mask1]['labels'].unique()
+     mask2= SEARCH_DATA.tokens.apply(lambda x: customIsIn(x, token_list_stem))
+     indexes2= SEARCH_DATA[mask2]['labels'].unique()
+     indexes= np.concatenate((indexes1, indexes2), axis=None)
+     results_df= SEARCH_DATA[SEARCH_DATA['labels'].isin(indexes)].drop(['tokens', 'labels'], axis=1)
+     results_df= results_df.drop_duplicates()
+     result=[]
+     regex = re.compile(r'\([^)]*\)')
+     for index, row in results_df.iterrows():
+         text= row['text']
+         dmc= row['DMC']
+         dmc= re.sub(regex, '', dmc)
+         result.append({'text': text, 'DMC':dmc})
+     return result
+
+ def search_query_all(query, df=None, tokenizer=None):
+     global SEARCH_DATA, search_df, index_data_loaded
+     print('search_query_all!')
+     print(f'query: {query}')
+     SEARCH_DATA= df
+     if df is None:
+         if index_data_loaded==False:
+             load_index_data()
+         SEARCH_DATA=search_df
+     print('SEARCH_DATA:', SEARCH_DATA.head())
+
+     print('nlp loaded or not:', nlp)
+
+     doc = nlp(clear_text(query))
+     lemm_text, stem_text= get_lemmed_stemmed_text(query)
+     if tokenizer==None:
+         tokenizer= tokenizer_search
+     token_list = tokenizer.texts_to_sequences([lemm_text])[0]
+     print(f'token_list: {token_list}')
+     token_list_stem = tokenizer.texts_to_sequences([stem_text])[0]
+     print(f'token_list stem: {token_list_stem}')
+
+     mask1= SEARCH_DATA['tokens'].map(set(token_list).issubset)
+     mask2= SEARCH_DATA['tokens'].map(set(token_list_stem).issubset)
+     indexes1= SEARCH_DATA[mask1]['labels'].unique()
+     indexes2= SEARCH_DATA[mask2]['labels'].unique()
+     indexes= np.concatenate((indexes1, indexes2), axis=None)
+     results_df= SEARCH_DATA[SEARCH_DATA['labels'].isin(indexes)].drop(['tokens', 'labels'], axis=1)
+     results_df= results_df.drop_duplicates()
+     result=[]
+     regex = re.compile(r'\([^)]*\)')
+     for index, row in results_df.iterrows():
+         text= row['text']
+         dmc= row['DMC']
+         dmc= re.sub(regex, '', dmc)
+         result.append({'text': text, 'DMC':dmc})
+     return result
+
+ def concat_by_DMC(s_df):
+     #print('concat_by_DMC!')
+     #print(s_df.head())
+     # merge the lemmatized and stemmed halves of the dataset
+     concat_tokens=[]
+     for label in s_df['labels'].unique():
+         tokens_lists= s_df[s_df['labels']==label]['tokens'].to_list()
+         joined_lst=[]
+         for lst in tokens_lists:
+             joined_lst+= lst
+         concat_tokens.append(joined_lst)
+     #print(concat_tokens[:5])
+     df= s_df.drop('tokens', axis=1)
+     df= df.drop_duplicates()
+     df['tokens']=concat_tokens
+
+     # merge texts and tokens by DMC
+     concat_tokens=[]
+     DMCs=[]
+     texts=[]
+     for dmc_code in df['DMC'].unique():
+         DMCs.append(dmc_code)
+         # merge the token lists belonging to one data module (DMC)
+         tokens_lists= df[df['DMC']==dmc_code]['tokens'].to_list()
+         joined_token_lst=[]
+         for lst in tokens_lists:
+             joined_token_lst+= lst
+         concat_tokens.append(joined_token_lst)
+         # merge the texts
+         text_list= df[df['DMC']==dmc_code]['text'].to_list()
+         concat_text=' \n '.join(str(txt) for txt in text_list)
+         texts.append(concat_text)
+     #print('concat_tokens',len(concat_tokens))
+     #print('DMCs',len(DMCs))
+     #print('texts',len(texts))
+     df= pd.DataFrame(columns=['DMC'], data=DMCs)
+     df['text']= texts
+     df['tokens']= concat_tokens
+     df['labels']= df.index
+     #print(df.head())
+     return df
+
+
+ def initialize_qa_model(model):
+     global qa_df, qa_model, qa_model_num, qa_model_initialized
+     qa_model_num= model
+     print('initialize_qa_model!')
+     if model==1 or str(model)=="1":
+         qa_model= pipeline("question-answering", "dmibor/ietm_search_and_qa", device=device)
+         print('initialized model number 1!')
+     else:  # model==2 (the base model)
+         qa_model= pipeline("question-answering", "timpal0l/mdeberta-v3-base-squad2", device=device)
+         print('initialized model number 2!')
+     if qa_index_data_loaded==False:
+         load_index_data_qa()
+     #print('len(qa_df)', len(qa_df))
+     qa_df= concat_by_DMC(qa_df)
+     #qa_df.to_csv('concat_index.csv', sep=';', index=False)
+     #print('concat_by_DMC len(qa_df)', len(qa_df))
+     qa_model_initialized=True
+
+ def get_best_and_longest_result(model_results, threshold, mode):
+     print('get_best_and_longest_result!')
+     print('mode:', mode)
+     best_result=None
+     longest_result=None
+     if(type(model_results)!= list):
+         return best_result, longest_result
+     best_result= model_results[0]
+     best_result_answer= best_result['answer']
+     print('best_result_answer: ',best_result_answer)
+     best_answer_cleaned= (re.sub(r"[\W\d_]+$", "", best_result_answer)).strip()
+     print('best_answer_cleaned: ',best_answer_cleaned)
+     longest_answer=''
+     longest_answer_len= len(best_answer_cleaned)
+     longest_result= best_result
+     print("type(mode)", type(mode))
+     print("mode=='strict'", mode=='strict')
+     print("mode==\"strict\"", mode=="strict")
+     if mode=='strict':
+         return best_result, longest_result
+     if best_result['score']>=threshold:
+         print('best_result_answer: ',best_answer_cleaned)
+         print('best_result score:', best_result['score'])
+         for result in model_results:
+             answer= result['answer']
+             answer_cleaned= re.sub(r"[\W\d_]+$", "", answer).strip()
+             #print('answer_cleaned: ',answer_cleaned)
+             if best_answer_cleaned in answer_cleaned:
+                 if len(answer_cleaned)>longest_answer_len:
+                     print('new longest answer: ',answer_cleaned)
+                     print('longest score:', result['score'])
+                     print()
+                     longest_answer= answer_cleaned
+                     longest_answer_len= len(answer_cleaned)
+                     longest_result= result
+     #print('longest_answer:' , longest_answer)
+     return best_result, longest_result
+
+ def find_answer(question, context, threshold, max_answer_len=1000, top_k=20, verbose=True, mode='strict'):
+     print('find_answer!')
+     print('mode:', mode)
+     found_answer=False
+     #print('qa_model', qa_model)
+     model_results= qa_model(question = question, context = context, max_answer_len=max_answer_len, top_k=top_k)
+     #print(model_result)
+     best_result, longest_result= get_best_and_longest_result(model_results, threshold, mode)
+     #print('longest_result', longest_result)
+     if best_result['score']>=threshold:
+         longest_answer= longest_result['answer']
+         answer_cleaned= re.sub(r"[\W\d_]+$", "", longest_answer).strip()
+         if verbose==True:
+             print(f"Ответ (уверенность= {round(best_result['score'], 2)}): {answer_cleaned}")
+         longest_result['answer']= answer_cleaned
+         found_answer=True
+     if found_answer==False and verbose==True:
+         print('Ответ не найден!')
+     model_result= best_result
+     model_result['answer']= longest_result['answer']
+     return model_result
+
+ def answer_question(question, mode, model=1):
+     global qa_model_initialized, qa_model_num, tokenizer_qa
+     print('answer_question!')
+     if qa_model_initialized==False or model!= qa_model_num:
+         initialize_qa_model(model)
+     print(f'question: {question}')
+     print(f'mode: {mode}')
+     print(f'model: {qa_model}')
+
+     filtered_index= search_query_all(question, qa_df, tokenizer_qa)
+     threshold= THRESHOLD
+     #print('filtered_index все слова:', len(filtered_index))
+     if len(filtered_index)<1:
+         filtered_index= search_query_any(question, qa_df, tokenizer_qa)
+         threshold= THRESHOLD
+         #print('filtered_index любое слово:', len(filtered_index))
+
+     found_answer=False
+     best_answer=""
+     best_score=0
+     best_DMC=""
+
+     regex = re.compile(r'\([^)]*\)')
+     for indx in filtered_index:
+         result= find_answer(question, indx['text'], threshold=threshold, max_answer_len=1000, top_k=20, verbose=False, mode=mode)
+         if result['score']>best_score:
+             best_answer= result['answer']
+             best_score= result['score']
+             best_DMC= indx['DMC']
+             best_DMC= re.sub(regex, '', best_DMC)
+     result= [{'score': best_score, 'answer': best_answer, 'DMC': best_DMC}]
+     return result
+
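For reference, a minimal sketch (not part of the commit) of driving the new search_core API outside of Streamlit. It assumes the demo publication directory that PUBLICATION_PATH points at, an existing index/ subfolder for the pickled tokenizers, and the module's heavyweight dependencies (spaCy ru_core_news_sm, TensorFlow, transformers, PyStemmer) being installed:

from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question

# Assumed layout: the S1000D data modules (*.xml) live directly in this folder.
publication_dir = "publications/Demo publication in Russian"

# One-off indexing: parses the data modules, lemmatizes/stems the text, fits the
# tokenizers, writes search_index.csv / qa_index.csv (working directory) and the
# pickled tokenizers (the publication's index/ folder), and fills the in-memory globals.
print(make_search_index(publication_dir))     # rows in the paragraph-level search index
print(make_search_index_qa(publication_dir))  # rows in the chapter-level QA index

# Full-text search: every query token must occur in a paragraph's token list.
for hit in search_query_all("аккумуляторная батарея"):
    print(hit["DMC"], hit["text"][:80])

# Extractive QA over the concatenated data-module texts.
print(answer_question("Какой ресурс до первого ремонта?", mode="strict", model=1))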