dmibor committed on
Commit
216e1e1
·
1 Parent(s): 61cffc1

added dependencies

Browse files
__pycache__/search_core.cpython-312.pyc CHANGED
Binary files a/__pycache__/search_core.cpython-312.pyc and b/__pycache__/search_core.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,21 +1,8 @@
1
  import streamlit as st
2
- import clip
3
- import torch
4
- import numpy as np
5
- import os
6
- import glob
7
  from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
8
  import pandas as pd
9
  import json
10
 
11
- device = "cuda" if torch.cuda.is_available() else "cpu"
12
- model= None
13
- #try:
14
- #model, preprocess = clip.load("ViT-B/32", device=device)
15
- #except:
16
- #st.write("Exception loading model")
17
-
18
-
19
  # Setting page layout
20
  st.set_page_config(
21
  page_title="Поиск по публикации/вопросы-ответы",
 
1
  import streamlit as st
 
 
 
 
 
2
  from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
3
  import pandas as pd
4
  import json
5
 
 
 
 
 
 
 
 
 
6
  # Setting page layout
7
  st.set_page_config(
8
  page_title="Поиск по публикации/вопросы-ответы",
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+
4
+
search_core.py CHANGED
@@ -14,8 +14,7 @@ import torch
14
  from tqdm import tqdm
15
 
16
  import Stemmer
17
- #stemmer= Stemmer.Stemmer('ru')#russian
18
- stemmer= Stemmer.Stemmer('en')#english
19
  import json
20
 
21
  #exclude_tags=['graphic', 'figure']
@@ -133,13 +132,15 @@ def clear_text(text):
133
  return clean_text
134
 
135
  def lemmatize_and_stemm(df_r):
136
- global nlp
137
  #print('lemmatize_and_stemm!')
138
  disabled_pipes = [ "parser", "ner"]
139
  if PUBLICATION_LANGUAGE=="ru":
140
- nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
 
141
  else:
142
- nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
 
143
 
144
  lemm_texts = []
145
  stem_texts=[]
@@ -336,9 +337,11 @@ def load_index_data():
336
  #spacy
337
  disabled_pipes = [ "parser", "ner"]
338
  if PUBLICATION_LANGUAGE=="ru":
339
- nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
 
340
  else:
341
- nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
 
342
  #print('spacy loaded:', nlp)
343
  #tokenizer
344
  with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
@@ -353,14 +356,16 @@ def load_index_data():
353
  return nlp, tokenizer_search, search_df
354
 
355
  def load_index_data_qa():
356
- global nlp, tokenizer_qa, qa_df, qa_index_data_loaded
357
  #print('load_index_data_qa!')
358
  #spacy
359
  disabled_pipes = [ "parser", "ner"]
360
  if PUBLICATION_LANGUAGE=="ru":
361
- nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
 
362
  else:
363
- nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
 
364
  print('spacy loaded:', nlp)
365
  #tokenizer
366
  with open(TOKENIZER_QA_PATH, 'rb') as handle:
@@ -386,12 +391,14 @@ def customIsIn(x , tokens):
386
  return result
387
 
388
  def get_lemmed_stemmed_text(text):
389
- global nlp
390
  #print('nlp loaded or not:', nlp)
391
  if PUBLICATION_LANGUAGE=="ru":
392
  spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
 
393
  else:
394
  spacy_stopwords = nlp.Defaults.stop_words #english
 
395
  #print('spacy_stopwords:', spacy_stopwords)
396
  doc = nlp(clear_text(text))
397
  # Remove stop words
@@ -404,7 +411,7 @@ def get_lemmed_stemmed_text(text):
404
  return lemm_text, stem_text
405
 
406
  def search_query_any(query, df=None, tokenizer=None):
407
- global SEARCH_DATA, search_df, index_data_loaded
408
  print('search_query_any!')
409
  print(f'query: {query}')
410
  if index_data_loaded==False:
 
14
  from tqdm import tqdm
15
 
16
  import Stemmer
17
+ global stemmer
 
18
  import json
19
 
20
  #exclude_tags=['graphic', 'figure']
 
132
  return clean_text
133
 
134
  def lemmatize_and_stemm(df_r):
135
+ global nlp, stemmer
136
  #print('lemmatize_and_stemm!')
137
  disabled_pipes = [ "parser", "ner"]
138
  if PUBLICATION_LANGUAGE=="ru":
139
+ nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
140
+ stemmer= Stemmer.Stemmer('ru')#russian
141
  else:
142
+ nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
143
+ stemmer= Stemmer.Stemmer('en')#english
144
 
145
  lemm_texts = []
146
  stem_texts=[]
 
337
  #spacy
338
  disabled_pipes = [ "parser", "ner"]
339
  if PUBLICATION_LANGUAGE=="ru":
340
+ nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
341
+ stemmer= Stemmer.Stemmer('ru')#russian
342
  else:
343
+ nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
344
+ stemmer= Stemmer.Stemmer('en')#english
345
  #print('spacy loaded:', nlp)
346
  #tokenizer
347
  with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
 
356
  return nlp, tokenizer_search, search_df
357
 
358
  def load_index_data_qa():
359
+ global nlp, tokenizer_qa, qa_df, qa_index_data_loaded, stemmer
360
  #print('load_index_data_qa!')
361
  #spacy
362
  disabled_pipes = [ "parser", "ner"]
363
  if PUBLICATION_LANGUAGE=="ru":
364
+ nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
365
+ stemmer= Stemmer.Stemmer('ru')#russian
366
  else:
367
+ nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
368
+ stemmer= Stemmer.Stemmer('en')#english
369
  print('spacy loaded:', nlp)
370
  #tokenizer
371
  with open(TOKENIZER_QA_PATH, 'rb') as handle:
 
391
  return result
392
 
393
  def get_lemmed_stemmed_text(text):
394
+ global nlp, stemmer
395
  #print('nlp loaded or not:', nlp)
396
  if PUBLICATION_LANGUAGE=="ru":
397
  spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
398
+ stemmer= Stemmer.Stemmer('ru')#russian
399
  else:
400
  spacy_stopwords = nlp.Defaults.stop_words #english
401
+ stemmer= Stemmer.Stemmer('en')#english
402
  #print('spacy_stopwords:', spacy_stopwords)
403
  doc = nlp(clear_text(text))
404
  # Remove stop words
 
411
  return lemm_text, stem_text
412
 
413
  def search_query_any(query, df=None, tokenizer=None):
414
+ global SEARCH_DATA, search_df, index_data_loaded, stemmer
415
  print('search_query_any!')
416
  print(f'query: {query}')
417
  if index_data_loaded==False: