import streamlit as st
from keybert import KeyBERT
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import re
import spacy
from sklearn.cluster import KMeans, AgglomerativeClustering
import numpy as np
from sentence_transformers import SentenceTransformer

# sentence-transformers model used both for the embeddings and as the KeyBERT backbone
MODEL = 'all-MiniLM-L6-v2'

def load_autotoken():
    # Tokenizer for the BART MNLI checkpoint, used only to count tokens per sentence
    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    return autotok


def load_keyword_model():
    # KeyBERT keyword extractor backed by the sentence-transformers embedder
    sentence_model = load_model()
    kw_model = KeyBERT(model=sentence_model)
    return kw_model


def load_model():
    embedder = SentenceTransformer(MODEL)
    return embedder

def create_nest_sentences(document: str, token_max_length=1023):
    # Split a document into nested lists of sentences, each list staying under
    # the tokenizer's token limit so a chunk can be fed to the model safely.
    nested = []
    sent = []
    length = 0
    tokenizer = load_autotoken()

    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']  # Hugging Face tokenizer token ids
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            length = len(tokens_in_sentence)  # start the new chunk with this sentence's token count

    if sent:
        nested.append(sent)
    return nested

def preprocess(text) -> str:
    # Lowercase, strip punctuation, and drop English stopwords
    # (requires the NLTK stopwords corpus: nltk.download('stopwords')).
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
    # Extract single-word keywords and 1-2 word keyphrases with KeyBERT's
    # max-sum diversification (use_maxsum), then merge both candidate lists.
    atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)

    final_topics = []
    for extraction in atomic_extractions:
        final_topics.append(extraction[0])
    for extraction in complex_extractions:
        final_topics.append(extraction[0])
    return final_topics

def cluster_based_on_topics(embedder, text1: str, text2: str, num_clusters: int = 2):
    # num_clusters is currently unused: the number of clusters is determined
    # by the agglomerative clustering distance threshold below.
    nlp = spacy.load("en_core_web_sm")

    # Preprocess and tokenize the texts
    doc1 = nlp(preprocess(text1))
    doc2 = nlp(preprocess(text2))

    # Extract sentences from the texts
    sentences1 = [sent.text for sent in doc1.sents]
    sentences2 = [sent.text for sent in doc2.sents]
    all_sentences = sentences1 + sentences2

    # Persist the extracted sentences for inspection
    with open('insight1_sent.txt', 'w') as f:
        for item in sentences1:
            f.write("%s\n" % item)
    with open('insight2_sent.txt', 'w') as f:
        for item in sentences2:
            f.write("%s\n" % item)

    # Generate sentence embeddings for each sentence
    sentence_embeddings1 = embedder.encode(sentences1)
    sentence_embeddings2 = embedder.encode(sentences2)
    all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)

    # Normalize the embeddings to unit length
    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)

    # Perform agglomerative clustering, letting the distance threshold set the cluster count
    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
    clustering_model.fit(all_embeddings)
    cluster_assignment = clustering_model.labels_

    # Group the original sentences by their assigned cluster
    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences.setdefault(cluster_id, []).append(all_sentences[sentence_id])
    return clustered_sentences
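

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): shows how the
# helpers above might be wired together on two placeholder texts. Assumes the
# NLTK stopwords corpus and the spaCy 'en_core_web_sm' model are installed,
# e.g. `python -m nltk.downloader stopwords` and
# `python -m spacy download en_core_web_sm`.
if __name__ == "__main__":
    text_a = (
        "Electric vehicles cut tailpipe emissions in dense cities. "
        "Battery pack costs have fallen sharply over the last decade. "
        "Automakers are investing heavily in charging networks and software platforms."
    )
    text_b = (
        "Public charging infrastructure is expanding along major highways. "
        "Grid operators expect electricity demand to rise as adoption grows. "
        "Government incentives and falling prices continue to accelerate electric vehicle sales."
    )

    # Chunk a document into sentence groups that fit under the BART token limit.
    print("Chunks:", create_nest_sentences(text_a, token_max_length=1023))

    # Keyword extraction: the texts should contain enough distinct words for nr_candidates=20.
    kw_model = load_keyword_model()
    print("Keywords A:", generate_keywords(kw_model, text_a))
    print("Keywords B:", generate_keywords(kw_model, text_b))

    # Cluster the sentences of both texts by topic (also writes insight*_sent.txt).
    embedder = load_model()
    for cluster_id, sentences in cluster_based_on_topics(embedder, text_a, text_b).items():
        print(f"Cluster {cluster_id}: {sentences}")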