# text-matching/utils.py
import re

import nltk
import spacy
import streamlit as st
from keybert import KeyBERT
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# The NLTK stopword list must be downloaded once before preprocess() can run.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# @st.cache_data
# def load_nlp():
# nlp =
@st.cache_resource
def load_autotoken():
    """Load the tokenizer matching the facebook/bart-large-mnli checkpoint."""
    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    return autotok

@st.cache_resource
def load_keyword_model():
    """Load a KeyBERT model for keyword extraction."""
    kw_model = KeyBERT()
    return kw_model

@st.cache_resource
def load_embedder():
    """Load a sentence-transformers model for dense embeddings."""
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return embedder
def create_nest_sentences(document: str, token_max_length=1023):
    """Split a document into lists of sentences, each list staying under
    token_max_length tokens so it fits the model's context window."""
    nested = []
    sent = []
    length = 0
    tokenizer = load_autotoken()
    # Heuristic sentence splitter: break on '.' or '?' followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
        length += len(tokens_in_sentence)
        if length < token_max_length:
            sent.append(sentence)
        else:
            if sent:
                nested.append(sent)
            # Start the next chunk with the overflowing sentence and count its
            # tokens, rather than resetting the running length to zero.
            sent = [sentence]
            length = len(tokens_in_sentence)
    if sent:
        nested.append(sent)
    return nested
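
# A hedged usage sketch: chunking a long article so each piece fits the
# 1024-token context of facebook/bart-large-mnli (long_text is hypothetical):
#   chunks = create_nest_sentences(long_text, token_max_length=1023)
#   pieces = [" ".join(chunk) for chunk in chunks]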

def preprocess(text: str) -> str:
    """Lowercase, strip common punctuation, and remove English stopwords."""
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
    """Extract unigram and bigram keyphrases with KeyBERT and return the
    keyword strings (scores are discarded)."""
    atomic_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 1), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 2), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    # Each extraction is a (keyphrase, score) tuple; keep only the phrases.
    final_topics = [extraction[0] for extraction in atomic_extractions + complex_extractions]
    return final_topics
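
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the Streamlit app): the sample
    # text below is illustrative, and the first run downloads model weights.
    sample = (
        "Topic modelling groups documents by the themes they discuss. "
        "Sentence embeddings map text into a vector space where related "
        "sentences land close together, which makes similarity search and "
        "clustering straightforward."
    )
    kw_model = load_keyword_model()
    print(generate_keywords(kw_model, sample))
    chunks = create_nest_sentences(sample, token_max_length=1023)
    print(f"{len(chunks)} chunk(s)")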