# NOTE(review): the lines that were here were file-viewer scrape residue
# (build status, file size, commit hashes, line-number gutter), not code —
# converted to this comment so the module parses.
import streamlit as st
from keybert import KeyBERT
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import re
import spacy
from sentence_transformers import SentenceTransformer
# @st.cache_data
# def load_nlp():
# nlp =
@st.cache_resource
def load_autotoken():
    """Load and cache the BART-large-MNLI tokenizer.

    Uses ``st.cache_resource`` (not ``st.cache_data``): a tokenizer is a
    global, unserializable resource, and ``cache_data`` would attempt to
    copy/pickle it on every access.

    Returns:
        The Hugging Face tokenizer for ``facebook/bart-large-mnli``.
    """
    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    return autotok
@st.cache_resource
def load_keyword_model():
    """Load and cache a KeyBERT keyword-extraction model.

    Uses ``st.cache_resource`` (not ``st.cache_data``) because the model is
    an unserializable global resource that should be created once and shared.

    Returns:
        KeyBERT: a keyword extraction model (default embedding backend).
    """
    kw_model = KeyBERT()
    return kw_model
@st.cache_resource
def load_embedder():
    """Load and cache the sentence-embedding model.

    Uses ``st.cache_resource`` (not ``st.cache_data``) because the model is
    an unserializable global resource that should be created once and shared.

    Returns:
        SentenceTransformer: the ``all-MiniLM-L6-v2`` sentence encoder.
    """
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return embedder
def create_nest_sentences(document: str, token_max_length: int = 1023) -> list:
    """Split *document* into lists ("nests") of sentences, each nest staying
    under *token_max_length* tokens as counted by the BART tokenizer.

    Args:
        document: Raw text; newlines are treated as spaces.
        token_max_length: Token budget per nest (1023 fits BART's 1024 limit).

    Returns:
        list[list[str]]: consecutive sentence groups, in document order.
    """
    nested = []
    sent = []
    length = 0
    tokenizer = load_autotoken()
    # Heuristic sentence split: break after '.'/'?' (not preceded by an
    # all-caps abbreviation pattern) when followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]  # hugging face transformer tokenizer
        length += len(tokens_in_sentence)
        if length < token_max_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            # BUGFIX: start the running count at this sentence's token count,
            # not 0 — resetting to 0 under-counted every subsequent chunk and
            # could let a nest exceed token_max_length.
            length = len(tokens_in_sentence)
    if sent:
        nested.append(sent)
    return nested
def preprocess(text) -> str:
    """Lowercase *text*, strip common punctuation, and drop English stopwords.

    Args:
        text: Raw input string.

    Returns:
        The remaining words joined by single spaces.
    """
    stop_words = set(stopwords.words("english"))
    # str.translate removes all listed punctuation in one C-level pass
    # (same character set as before: ! . , ? : ; " ' - ( ) ).
    text = text.lower().translate(str.maketrans('', '', "!.,?:;\"'-()"))
    return " ".join(w for w in text.split() if w not in stop_words)
def generate_keywords(kw_model, document: str) -> list:
    """Extract keyword phrases from *document* using a KeyBERT-style model.

    Runs two extraction passes — unigrams, then 1-to-2-grams — with Max Sum
    Similarity candidate diversification, and returns just the phrase strings.

    Args:
        kw_model: Object exposing ``extract_keywords`` (e.g. KeyBERT), which
            returns ``(phrase, score)`` pairs.
        document: Text to extract keywords from.

    Returns:
        list[str]: up to 20 phrases — unigram results first, then 1-2-gram
        results (duplicates preserved, matching the original behavior).
    """
    atomic_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 1), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(
        document, keyphrase_ngram_range=(1, 2), stop_words=None,
        use_maxsum=True, nr_candidates=20, top_n=10)
    # Each extraction is a (phrase, score) pair; keep only the phrases.
    return [extraction[0] for extraction in atomic_extractions + complex_extractions]