import streamlit as st
from keybert import KeyBERT
from transformers import AutoTokenizer
import re


def create_nest_sentences(document: str, token_max_length: int = 1024):
    """Split a document into lists of sentences, each list staying under token_max_length tokens."""
    nested = []
    sent = []
    length = 0
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    # Split on sentence boundaries: a period or question mark followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        # Count tokens with the Hugging Face tokenizer (input_ids includes special tokens).
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sent.append(sentence)
        else:
            # Current chunk is full: store it and start a new chunk with this sentence,
            # carrying its token count forward so the running total stays accurate.
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)

    if sent:
        nested.append(sent)
    return nested


@st.cache_resource  # cache the model object itself; cache_data would try to serialize it
def load_keyword_model():
    kw_model = KeyBERT()
    return kw_model


def keyword_gen(kw_model, sequence: str):
    # Extract up to 10 diverse 1-2 word keyphrases using Maximal Marginal Relevance.
    keywords = kw_model.extract_keywords(
        sequence,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        use_mmr=True,
        diversity=0.5,
        top_n=10,
    )
    return keywords
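

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how these helpers fit together in a Streamlit app:
# the document is chunked so each chunk stays under the BART tokenizer's
# 1024-token limit, then KeyBERT extracts keyphrases per chunk. The sample
# text below is hypothetical; run with `streamlit run <this_file>.py`.
if __name__ == "__main__":
    sample_document = (
        "Transformers have become the dominant architecture in NLP. "
        "They rely on self-attention to model long-range dependencies. "
        "KeyBERT uses BERT embeddings to extract keyphrases from text."
    )

    kw_model = load_keyword_model()
    for chunk in create_nest_sentences(sample_document, token_max_length=1024):
        chunk_text = " ".join(chunk)
        st.write(keyword_gen(kw_model, chunk_text))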