import re

import streamlit as st
import nltk
from nltk.corpus import stopwords
from keybert import KeyBERT
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import spacy

# Make sure the NLTK stopword list is available before preprocess() is called.
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords", quiet=True)

# @st.cache_data
# def load_nlp():
#     nlp =

# Model/tokenizer loaders are cached with st.cache_resource, which is meant for
# heavyweight objects (models, tokenizers) created once per session, rather than
# st.cache_data, which tries to serialize the return value.
@st.cache_resource
def load_autotoken():
    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    return autotok

@st.cache_resource
def load_keyword_model():
    kw_model = KeyBERT()
    return kw_model

@st.cache_resource
def load_embedder():
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return embedder

def create_nest_sentences(document: str, token_max_length: int = 1023) -> list:
    """Split a document into groups of sentences, each group staying under the
    token limit (1023 leaves headroom under BART's 1024-token cap)."""
    nested = []
    sent = []
    length = 0
    tokenizer = load_autotoken()
    # Naive sentence splitter: break on ". " / "? " followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        # Hugging Face tokenizer; the length of input_ids approximates the sentence's token cost.
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)["input_ids"]
        length += len(tokens_in_sentence)
        if length < token_max_length:
            sent.append(sentence)
        else:
            # Current chunk is full: emit it and start a new chunk with this sentence,
            # counting its tokens toward the new chunk rather than resetting to zero.
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)
    if sent:
        nested.append(sent)
    return nested

def preprocess(text: str) -> str:
    """Lowercase the text, strip common punctuation, and drop English stopwords."""
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    text = ''.join(c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')'))
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
    """Extract unigram and bigram keyphrases with KeyBERT (Max Sum Distance)
    and return the combined list of phrases."""
    atomic_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 1), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 2), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    # extract_keywords returns (phrase, score) pairs; keep only the phrases.
    final_topics = [phrase for phrase, _ in atomic_extractions]
    final_topics += [phrase for phrase, _ in complex_extractions]
    return final_topics
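
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module): one
# plausible way to wire these helpers into a Streamlit page. The widget label
# and the `user_text` variable below are hypothetical assumptions, not taken
# from the source.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    user_text = st.text_area("Paste a document to analyze")
    if user_text:
        chunks = create_nest_sentences(user_text)        # token-bounded sentence groups
        st.write(f"Split into {len(chunks)} chunk(s) under the token limit.")
        kw_model = load_keyword_model()
        topics = generate_keywords(kw_model, user_text)  # unigram + bigram keyphrases
        st.write("Candidate topics:", topics)
        st.write("Preprocessed text:", preprocess(user_text))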