import streamlit as st
from keybert import KeyBERT
from transformers import AutoTokenizer
import re


def create_nest_sentences(document: str, token_max_length=1024):
    """Split a document into lists of sentences, each list staying under the token limit."""
    nested = []
    sent = []
    length = 0
    # Tokenizer is only used for counting tokens; it matches the downstream BART model.
    # It is reloaded on every call; consider caching it if this becomes a bottleneck.
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    # Naive sentence splitter: break on sentence-ending punctuation followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(sentence, truncation=False, padding=False)['input_ids']
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sent.append(sentence)
        else:
            # Current chunk is full: store it and start a new chunk with this sentence,
            # counting this sentence's tokens toward the new chunk (not resetting to zero).
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)

    if sent:
        nested.append(sent)
    return nested

@st.cache_resource  # cache the model object itself instead of pickling it with st.cache_data
def load_keyword_model():
    """Load the KeyBERT model once and reuse it across Streamlit reruns."""
    kw_model = KeyBERT()
    return kw_model


def keyword_gen(kw_model, sequence: str):
    """Extract up to 10 keyphrases (1-2 words each) from the given text."""
    keywords = kw_model.extract_keywords(
        sequence,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        use_mmr=True,        # maximal marginal relevance trades off relevance vs. diversity
        diversity=0.5,
        top_n=10,
    )
    return keywords
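

# Illustrative usage sketch (an assumption, not part of the original app wiring):
# chunk a long document so each piece fits under the BART tokenizer limit, then run
# KeyBERT on each chunk. The sample text below is a placeholder.
if __name__ == "__main__":
    kw_model = load_keyword_model()
    sample_text = (
        "Streamlit makes it simple to turn Python scripts into web apps. "
        "KeyBERT uses BERT embeddings to extract keywords and keyphrases from text."
    )
    for chunk in create_nest_sentences(sample_text, token_max_length=1024):
        print(keyword_gen(kw_model, " ".join(chunk)))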