Spaces:

dmibor
/

ietm_search_and_qa

Sleeping

App Files Community

dmibor committed on Oct 30, 2024

Commit

216e1e1

1 Parent(s): 61cffc1

added dependencies

Browse files

Files changed (4) hide show

__pycache__/search_core.cpython-312.pyc +0 -0
app.py +0 -13
requirements.txt +4 -0
search_core.py +19 -12

__pycache__/search_core.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/search_core.cpython-312.pyc and b/__pycache__/search_core.cpython-312.pyc differ

app.py CHANGED Viewed

@@ -1,21 +1,8 @@
 import streamlit as st
-import clip
-import torch
-import numpy as np
-import os
-import glob
 from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
 import pandas as pd
 import json
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model= None
-#try:
-    #model, preprocess = clip.load("ViT-B/32", device=device)
-#except:
-    #st.write("Exception loading model")
 # Setting page layout
 st.set_page_config(
     page_title="Поиск по публикации/вопросы-ответы",

 import streamlit as st
 from search_core import make_search_index, make_search_index_qa, search_query_all, answer_question
 import pandas as pd
 import json
 # Setting page layout
 st.set_page_config(
     page_title="Поиск по публикации/вопросы-ответы",

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ streamlit
2	+ torch
3	+
4	+

search_core.py CHANGED Viewed

@@ -14,8 +14,7 @@ import torch
 from tqdm import tqdm
 import Stemmer
-#stemmer= Stemmer.Stemmer('ru')#russian
-stemmer= Stemmer.Stemmer('en')#english
 import json
 #exclude_tags=['graphic', 'figure']
@@ -133,13 +132,15 @@ def clear_text(text):
     return clean_text
 def lemmatize_and_stemm(df_r):
-    global nlp
     #print('lemmatize_and_stemm!')
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
-        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
     else:
-        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
     lemm_texts = []
     stem_texts=[]
@@ -336,9 +337,11 @@ def load_index_data():
     #spacy
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
-        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
     else:
-        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
     #print('spacy loaded:', nlp)
     #tokenizer
     with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
@@ -353,14 +356,16 @@ def load_index_data():
     return nlp, tokenizer_search, search_df
 def load_index_data_qa():
-    global nlp, tokenizer_qa, qa_df, qa_index_data_loaded
     #print('load_index_data_qa!')
     #spacy
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
-        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)#english - en_core_web_sm
     else:
-        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)#russian - ru_core_news_sm
     print('spacy loaded:', nlp)
     #tokenizer
     with open(TOKENIZER_QA_PATH, 'rb') as handle:
@@ -386,12 +391,14 @@ def customIsIn(x , tokens):
     return result
 def get_lemmed_stemmed_text(text):
-    global nlp
      #print('nlp loaded or not:', nlp)
     if PUBLICATION_LANGUAGE=="ru":
         spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
     else:
         spacy_stopwords = nlp.Defaults.stop_words #english
     #print('spacy_stopwords:', spacy_stopwords)
     doc = nlp(clear_text(text))
     # Remove stop words
@@ -404,7 +411,7 @@ def get_lemmed_stemmed_text(text):
     return lemm_text, stem_text
 def search_query_any(query, df=None, tokenizer=None):
-    global SEARCH_DATA, search_df, index_data_loaded
     print('search_query_any!')
     print(f'query: {query}')
     if index_data_loaded==False:

 from tqdm import tqdm
 import Stemmer
+global stemmer
 import json
 #exclude_tags=['graphic', 'figure']
     return clean_text
 def lemmatize_and_stemm(df_r):
+    global nlp, stemmer
     #print('lemmatize_and_stemm!')
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
+        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('ru')#russian
     else:
+        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('en')#english
     lemm_texts = []
     stem_texts=[]
     #spacy
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
+        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('ru')#russian
     else:
+        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('en')#english
     #print('spacy loaded:', nlp)
     #tokenizer
     with open(TOKENIZER_SEARCH_PATH, 'rb') as handle:
     return nlp, tokenizer_search, search_df
 def load_index_data_qa():
+    global nlp, tokenizer_qa, qa_df, qa_index_data_loaded, stemmer
     #print('load_index_data_qa!')
     #spacy
     disabled_pipes = [ "parser",  "ner"]
     if PUBLICATION_LANGUAGE=="ru":
+        nlp = spacy.load('ru_core_news_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('ru')#russian
     else:
+        nlp = spacy.load('en_core_web_sm', disable=disabled_pipes)
+        stemmer= Stemmer.Stemmer('en')#english
     print('spacy loaded:', nlp)
     #tokenizer
     with open(TOKENIZER_QA_PATH, 'rb') as handle:
     return result
 def get_lemmed_stemmed_text(text):
+    global nlp, stemmer
      #print('nlp loaded or not:', nlp)
     if PUBLICATION_LANGUAGE=="ru":
         spacy_stopwords = spacy.lang.ru.stop_words.STOP_WORDS #russian
+        stemmer= Stemmer.Stemmer('ru')#russian
     else:
         spacy_stopwords = nlp.Defaults.stop_words #english
+        stemmer= Stemmer.Stemmer('en')#english
     #print('spacy_stopwords:', spacy_stopwords)
     doc = nlp(clear_text(text))
     # Remove stop words
     return lemm_text, stem_text
 def search_query_any(query, df=None, tokenizer=None):
+    global SEARCH_DATA, search_df, index_data_loaded, stemmer
     print('search_query_any!')
     print(f'query: {query}')
     if index_data_loaded==False: