File size: 2,042 Bytes
d9ce745
 
232a10d
d9ce745
 
28e14c5
 
 
 
 
 
 
d9ce745
232a10d
 
 
 
 
 
 
 
 
d9ce745
28e14c5
 
 
 
 
 
d9ce745
 
 
232a10d
d9ce745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232a10d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import streamlit as st
from keybert import KeyBERT
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import re
import spacy
from sentence_transformers import SentenceTransformer

# TODO(review): unfinished spaCy pipeline loader — complete (e.g. spacy.load(...)) or remove.
# @st.cache_data
# def load_nlp():
#   nlp = 


@st.cache_resource
def load_autotoken():
  """Load the BART-MNLI tokenizer once per session.

  Returns:
    transformers.AutoTokenizer: tokenizer used for token counting in
    create_nest_sentences.

  Note: st.cache_resource is the Streamlit cache intended for global,
  unserializable resources such as tokenizers/models; st.cache_data
  would attempt to serialize (copy) the return value on every access.
  """
  autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
  return autotok

@st.cache_resource
def load_keyword_model():
  """Load a KeyBERT keyword-extraction model once per session.

  Returns:
    keybert.KeyBERT: model consumed by generate_keywords.

  Note: st.cache_resource (not st.cache_data) is the correct cache for
  ML models — it stores the object itself instead of serializing a copy.
  """
  kw_model = KeyBERT()
  return kw_model

@st.cache_resource
def load_embedder():
  """Load the MiniLM sentence embedder once per session.

  Returns:
    sentence_transformers.SentenceTransformer: 'all-MiniLM-L6-v2' model.

  Note: st.cache_resource (not st.cache_data) is the correct cache for
  ML models — it stores the object itself instead of serializing a copy.
  """
  embedder = SentenceTransformer('all-MiniLM-L6-v2')
  return embedder

def create_nest_sentences(document: str, token_max_length: int = 1023) -> list:
  """Split *document* into groups ("nests") of consecutive sentences whose
  combined token count stays under *token_max_length*.

  Args:
    document: raw text; newlines are flattened to spaces before splitting.
    token_max_length: per-nest token budget (default 1023, i.e. just under
      BART's 1024-token input window).

  Returns:
    list[list[str]]: sentence groups suitable for a length-limited model.
  """
  nested = []
  sent = []
  length = 0
  tokenizer = load_autotoken()

  # Heuristic sentence splitter: break on '.'/'?' followed by whitespace and a
  # capital letter; the [^A-Z] lookbehind avoids splitting after initials
  # such as "U.S.".
  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
    tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
    length += len(tokens_in_sentence)

    if length < token_max_length:
      sent.append(sentence)
    else:
      if sent:  # don't emit an empty nest if the very first sentence overflows
        nested.append(sent)
      sent = [sentence]
      # BUG FIX: restart the running count at this sentence's token count,
      # not 0 — resetting to 0 ignored the carried-over sentence and let
      # later nests exceed token_max_length.
      length = len(tokens_in_sentence)

  if sent:
    nested.append(sent)
  return nested

def preprocess(text) -> str:
    """Lowercase *text*, strip common punctuation, and remove English stopwords.

    Args:
      text: input string.

    Returns:
      The surviving words joined by single spaces (also collapses whitespace).
    """
    stop_words = set(stopwords.words("english"))
    # str.translate deletes all punctuation characters in one C-level pass
    # (same set the original filtered char-by-char).
    text = text.lower().translate(str.maketrans('', '', '!.,?:;"\'-()'))
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
    """Extract candidate topic keywords from *document* using a KeyBERT model.

    Runs extraction twice — unigrams only, then uni/bigrams — with Max Sum
    Distance candidate selection, and returns the keyword strings from both
    passes in order (scores are dropped).

    Args:
      kw_model: object exposing KeyBERT's extract_keywords(document, ...)
        returning (keyword, score) pairs.
      document: text to extract keywords from.

    Returns:
      list[str]: up to 20 keywords (10 unigram + 10 uni/bigram).
    """
    atomic_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 1), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 2), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    # Each extraction is a (keyword, score) pair; keep only the keyword text.
    return [kw for kw, _score in atomic_extractions + complex_extractions]