# text-matching/utils.py
import re

import streamlit as st
from keybert import KeyBERT
from transformers import AutoTokenizer


def create_nest_sentences(document: str, token_max_length: int = 1024):
    """Group consecutive sentences into chunks whose token count stays under token_max_length."""
    nested = []
    sent = []
    length = 0
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    # Split on sentence boundaries: '.' or '?' followed by spaces and a capital
    # letter; the lookbehind avoids splitting after single-letter initials.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        # Count tokens with the Hugging Face tokenizer (includes special tokens).
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)["input_ids"]
        length += len(tokens_in_sentence)
        if length < token_max_length:
            sent.append(sentence)
        else:
            # Current chunk is full: emit it and start a new one with this sentence,
            # carrying over its token count rather than resetting to zero.
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)
    if sent:
        nested.append(sent)
    return nested
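

# Usage sketch (illustrative; not part of the original module): join each chunk
# back into a passage that fits the 1024-token BART-MNLI context window, e.g.
#
#   chunks = create_nest_sentences(long_text, token_max_length=1024)
#   passages = [" ".join(chunk) for chunk in chunks]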


@st.cache_resource
def load_keyword_model():
    # Cache the KeyBERT model across Streamlit reruns; cache_resource is the
    # recommended decorator for unserializable objects such as ML models.
    kw_model = KeyBERT()
    return kw_model


def keyword_gen(kw_model, sequence: str):
    """Extract up to 10 diverse keyphrases (1-2 words) from the given text."""
    keywords = kw_model.extract_keywords(
        sequence,
        keyphrase_ngram_range=(1, 2),  # single words and two-word phrases
        stop_words='english',
        use_mmr=True,                  # Maximal Marginal Relevance for diverse results
        diversity=0.5,
        top_n=10,
    )
    return keywords
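

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming the module is run directly as a
    # script; the sample text below is an illustrative assumption, not taken
    # from the original app.
    sample = (
        "Streamlit turns Python scripts into shareable web apps. "
        "KeyBERT uses BERT embeddings to pull keyphrases out of text. "
        "Together they can drive a simple text-matching demo."
    )
    kw_model = load_keyword_model()
    for chunk in create_nest_sentences(sample):
        print(keyword_gen(kw_model, " ".join(chunk)))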