Spaces:
Build error
Build error
import streamlit as st | |
from keybert import KeyBERT | |
from transformers import AutoTokenizer | |
import re | |
def create_nest_sentences(document: str, token_max_length=1024, tokenizer=None):
    """Split ``document`` into sentences and group them into token-bounded chunks.

    Newlines are replaced by spaces, the text is split on sentence boundaries
    (a ``.`` or ``?`` followed by spaces and an uppercase letter), and
    consecutive sentences are packed into chunks whose summed token count
    stays below ``token_max_length``.

    Args:
        document: The raw text to split.
        token_max_length: Exclusive upper bound on the summed token count of
            a chunk. A single sentence that reaches the bound on its own
            still gets a chunk to itself (it is never dropped or truncated).
        tokenizer: Optional callable used to tokenize one sentence. It is
            invoked as ``tokenizer(text, truncation=False, padding=False)``
            and ``len(result[0])`` is taken as the token count. When omitted,
            the ``facebook/bart-large-mnli`` tokenizer is loaded; pass one in
            to avoid reloading it on every call.

    Returns:
        A list of chunks, each chunk being a non-empty list of sentence
        strings in document order.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    nested = []
    sent = []
    length = 0
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' '))

    for sentence in sentences:
        n_tokens = len(tokenizer(str(sentence), truncation=False, padding=False)[0])

        # Close the current chunk *before* adding a sentence that would push
        # it to the limit. The `sent` guard prevents emitting an empty chunk
        # when the first sentence of a chunk is already over the limit.
        if sent and length + n_tokens >= token_max_length:
            nested.append(sent)
            sent = []
            length = 0

        # The running total always includes the sentence just added, so a
        # freshly started chunk is charged for its first sentence.
        sent.append(sentence)
        length += n_tokens

    if sent:
        nested.append(sent)
    return nested
def load_keyword_model():
    """Create and return a ``KeyBERT`` instance built with its default arguments."""
    return KeyBERT()
def keyword_gen(kw_model, sequence: str):
    """Extract keywords from ``sequence`` using ``kw_model``.

    Args:
        kw_model: Object exposing an ``extract_keywords`` method (e.g. the
            model returned by ``load_keyword_model``).
        sequence: Text to extract keywords from.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for the fixed
        extraction settings below.
    """
    # Fixed extraction settings: 1- to 2-word phrases, English stop words,
    # MMR enabled with diversity 0.5, and the top 10 results requested.
    extraction_options = {
        "keyphrase_ngram_range": (1, 2),
        "stop_words": 'english',
        "use_mmr": True,
        "diversity": 0.5,
        "top_n": 10,
    }
    return kw_model.extract_keywords(sequence, **extraction_options)