Spaces:
Build error
Build error
import streamlit as st | |
from keybert import KeyBERT | |
from nltk.corpus import stopwords | |
from transformers import AutoTokenizer | |
import re | |
import spacy | |
from sentence_transformers import SentenceTransformer | |
# @st.cache_data | |
# def load_nlp(): | |
# nlp = | |
def load_autotoken():
    """Return the pretrained tokenizer for ``facebook/bart-large-mnli``.

    Every call goes through ``AutoTokenizer.from_pretrained`` again; nothing
    is cached at this level.
    """
    return AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
def load_keyword_model():
    """Return a new ``KeyBERT`` instance built with its default arguments."""
    return KeyBERT()
def load_embedder():
    """Return a ``SentenceTransformer`` built from ``all-MiniLM-L6-v2``."""
    return SentenceTransformer('all-MiniLM-L6-v2')
def create_nest_sentences(document: str, token_max_length=1023, tokenizer=None):
    """Split ``document`` into sentences and group them into token-bounded chunks.

    Newlines are replaced by spaces, the text is split on sentence boundaries
    (``.`` or ``?`` followed by spaces and an upper-case letter), and the
    sentences are packed greedily, in order, into chunks whose summed token
    count stays below ``token_max_length``.

    Args:
        document: Text to split.
        token_max_length: Exclusive upper bound on the summed per-sentence
            token counts of one chunk.
        tokenizer: Optional callable used to tokenize each sentence. It is
            called as ``tokenizer(sentence, truncation=False, padding=False)``
            and ``len(result[0])`` is taken as the sentence's token count.
            When omitted, ``load_autotoken()`` is called to obtain one, so
            callers that already hold a tokenizer can pass it in and avoid
            reloading it on every call.

    Returns:
        A list of chunks, each chunk being a non-empty list of sentence
        strings. A single sentence that on its own reaches the limit is
        placed in a chunk by itself (it is not truncated).
    """
    if tokenizer is None:
        tokenizer = load_autotoken()

    nested = []
    sent = []
    length = 0
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]
        sentence_length = len(tokens_in_sentence)
        # Close the current chunk before it would reach the limit. The
        # ``sent`` guard keeps an over-long first sentence from emitting an
        # empty chunk.
        if sent and length + sentence_length >= token_max_length:
            nested.append(sent)
            sent = []
            length = 0
        # The sentence that opens a new chunk counts towards that chunk's
        # budget, so later chunks obey the same limit as the first one.
        sent.append(sentence)
        length += sentence_length
    if sent:
        nested.append(sent)
    return nested
def preprocess(text) -> str:
    """Normalize ``text`` for downstream keyword processing.

    The text is lower-cased, a fixed set of punctuation characters is removed,
    and the remaining whitespace-separated words that appear in NLTK's English
    stop-word list are dropped. The surviving words are joined by single
    spaces.
    """
    english_stop_words = set(stopwords.words("english"))
    punctuation = ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)
    kept_words = (word for word in cleaned.split() if word not in english_stop_words)
    return " ".join(kept_words)
def generate_keywords(kw_model, document: str) -> list:
    """Collect single-word and up-to-two-word key phrases for ``document``.

    ``kw_model.extract_keywords`` is called twice with the same settings
    except for ``keyphrase_ngram_range``: once with ``(1, 1)`` and once with
    ``(1, 2)``. Only the first element of each returned item (the phrase) is
    kept.

    Args:
        kw_model: Object exposing ``extract_keywords`` (e.g. the model
            returned by ``load_keyword_model``).
        document: Text to extract key phrases from.

    Returns:
        The phrases from the ``(1, 1)`` call followed by those from the
        ``(1, 2)`` call, with repeated phrases removed (first occurrence
        wins).
    """
    shared_options = dict(stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
    atomic_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 1), **shared_options
    )
    complex_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 2), **shared_options
    )
    phrases = (
        extraction[0]
        for extraction in (*atomic_extractions, *complex_extractions)
    )
    # The (1, 2) range also covers single words, so the second call can repeat
    # phrases from the first; dict.fromkeys drops repeats while keeping order.
    return list(dict.fromkeys(phrases))