Spaces:
Sleeping
Sleeping
added dependencies
Browse files- __pycache__/search_core.cpython-312.pyc +0 -0
- app.py +0 -13
- requirements.txt +4 -0
- search_core.py +19 -12
__pycache__/search_core.cpython-312.pyc
CHANGED
Binary files a/__pycache__/search_core.cpython-312.pyc and b/__pycache__/search_core.cpython-312.pyc differ
|
|
app.py
CHANGED
@@ -1,21 +1,8 @@
|
|
1 |
import streamlit as st
|
2 |
-
import clip
|
3 |
-
import torch
|
4 |
-
import numpy as np
|
5 |
-
import os
|
6 |
-
import glob
|
7 |
from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
|
8 |
import pandas as pd
|
9 |
import json
|
10 |
|
11 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
12 |
-
model= None
|
13 |
-
#try:
|
14 |
-
#model, preprocess = clip.load("ViT-B/32", device=device)
|
15 |
-
#except:
|
16 |
-
#st.write("Exception loading model")
|
17 |
-
|
18 |
-
|
19 |
# Setting page layout
|
20 |
st.set_page_config(
|
21 |
page_title="Поиск по публикации/вопросы-ответы",
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
2 |
from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
|
3 |
import pandas as pd
|
4 |
import json
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
# Setting page layout
|
7 |
st.set_page_config(
|
8 |
page_title="Поиск по публикации/вопросы-ответы",
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
torch
|
3 |
+
|
4 |
+
|
search_core.py
CHANGED
@@ -14,8 +14,7 @@ import torch
|
|
14 |
from tqdm import tqdm
|
15 |
|
16 |
import Stemmer
|
17 |
-
|
18 |
-
stemmer= Stemmer.Stemmer('en')#english
|
19 |
import json
|
20 |
|
21 |
#exclude_tags=['graphic', 'figure']
|
@@ -133,13 +132,15 @@ def clear_text(text):
|
|
133 |
return clean_text
|
134 |
|
135 |
def lemmatize_and_stemm(df_r):
|
136 |
-
global nlp
|
137 |
#print('lemmatize_and_stemm!')
|
138 |
disabled_pipes = [ "parser", "ner"]
|
139 |
if PUBLICATION_LANGUAGE=="ru":
|
140 |
-
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
|
|
141 |
else:
|
142 |
-
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
|
|
143 |
|
144 |
lemm_texts = []
|
145 |
stem_texts=[]
|
@@ -336,9 +337,11 @@ def load_index_data():
|
|
336 |
#spacy
|
337 |
disabled_pipes = [ "parser", "ner"]
|
338 |
if PUBLICATION_LANGUAGE=="ru":
|
339 |
-
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
|
|
340 |
else:
|
341 |
-
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
|
|
342 |
#print('spacy loaded:', nlp)
|
343 |
#tokenizer
|
344 |
with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
|
@@ -353,14 +356,16 @@ def load_index_data():
|
|
353 |
return nlp, tokenizer_search, search_df
|
354 |
|
355 |
def load_index_data_qa():
|
356 |
-
global nlp, tokenizer_qa, qa_df, qa_index_data_loaded
|
357 |
#print('load_index_data_qa!')
|
358 |
#spacy
|
359 |
disabled_pipes = [ "parser", "ner"]
|
360 |
if PUBLICATION_LANGUAGE=="ru":
|
361 |
-
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
|
|
362 |
else:
|
363 |
-
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
|
|
364 |
print('spacy loaded:', nlp)
|
365 |
#tokenizer
|
366 |
with open(TOKENIZER_QA_PATH, 'rb') as handle:
|
@@ -386,12 +391,14 @@ def customIsIn(x , tokens):
|
|
386 |
return result
|
387 |
|
388 |
def get_lemmed_stemmed_text(text):
|
389 |
-
global nlp
|
390 |
#print('nlp loaded or not:', nlp)
|
391 |
if PUBLICATION_LANGUAGE=="ru":
|
392 |
spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
|
|
|
393 |
else:
|
394 |
spacy_stopwords = nlp.Defaults.stop_words #english
|
|
|
395 |
#print('spacy_stopwords:', spacy_stopwords)
|
396 |
doc = nlp(clear_text(text))
|
397 |
# Remove stop words
|
@@ -404,7 +411,7 @@ def get_lemmed_stemmed_text(text):
|
|
404 |
return lemm_text, stem_text
|
405 |
|
406 |
def search_query_any(query, df=None, tokenizer=None):
|
407 |
-
global SEARCH_DATA, search_df, index_data_loaded
|
408 |
print('search_query_any!')
|
409 |
print(f'query: {query}')
|
410 |
if index_data_loaded==False:
|
|
|
14 |
from tqdm import tqdm
|
15 |
|
16 |
import Stemmer
|
17 |
+
global stemmer
|
|
|
18 |
import json
|
19 |
|
20 |
#exclude_tags=['graphic', 'figure']
|
|
|
132 |
return clean_text
|
133 |
|
134 |
def lemmatize_and_stemm(df_r):
|
135 |
+
global nlp, stemmer
|
136 |
#print('lemmatize_and_stemm!')
|
137 |
disabled_pipes = [ "parser", "ner"]
|
138 |
if PUBLICATION_LANGUAGE=="ru":
|
139 |
+
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
140 |
+
stemmer= Stemmer.Stemmer('ru')#russian
|
141 |
else:
|
142 |
+
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
143 |
+
stemmer= Stemmer.Stemmer('en')#english
|
144 |
|
145 |
lemm_texts = []
|
146 |
stem_texts=[]
|
|
|
337 |
#spacy
|
338 |
disabled_pipes = [ "parser", "ner"]
|
339 |
if PUBLICATION_LANGUAGE=="ru":
|
340 |
+
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
341 |
+
stemmer= Stemmer.Stemmer('ru')#russian
|
342 |
else:
|
343 |
+
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
344 |
+
stemmer= Stemmer.Stemmer('en')#english
|
345 |
#print('spacy loaded:', nlp)
|
346 |
#tokenizer
|
347 |
with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
|
|
|
356 |
return nlp, tokenizer_search, search_df
|
357 |
|
358 |
def load_index_data_qa():
|
359 |
+
global nlp, tokenizer_qa, qa_df, qa_index_data_loaded, stemmer
|
360 |
#print('load_index_data_qa!')
|
361 |
#spacy
|
362 |
disabled_pipes = [ "parser", "ner"]
|
363 |
if PUBLICATION_LANGUAGE=="ru":
|
364 |
+
nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
|
365 |
+
stemmer= Stemmer.Stemmer('ru')#russian
|
366 |
else:
|
367 |
+
nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
|
368 |
+
stemmer= Stemmer.Stemmer('en')#english
|
369 |
print('spacy loaded:', nlp)
|
370 |
#tokenizer
|
371 |
with open(TOKENIZER_QA_PATH, 'rb') as handle:
|
|
|
391 |
return result
|
392 |
|
393 |
def get_lemmed_stemmed_text(text):
|
394 |
+
global nlp, stemmer
|
395 |
#print('nlp loaded or not:', nlp)
|
396 |
if PUBLICATION_LANGUAGE=="ru":
|
397 |
spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
|
398 |
+
stemmer= Stemmer.Stemmer('ru')#russian
|
399 |
else:
|
400 |
spacy_stopwords = nlp.Defaults.stop_words #english
|
401 |
+
stemmer= Stemmer.Stemmer('en')#english
|
402 |
#print('spacy_stopwords:', spacy_stopwords)
|
403 |
doc = nlp(clear_text(text))
|
404 |
# Remove stop words
|
|
|
411 |
return lemm_text, stem_text
|
412 |
|
413 |
def search_query_any(query, df=None, tokenizer=None):
|
414 |
+
global SEARCH_DATA, search_df, index_data_loaded, stemmer
|
415 |
print('search_query_any!')
|
416 |
print(f'query: {query}')
|
417 |
if index_data_loaded==False:
|