import streamlit as st
from keybert import KeyBERT
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import re
import spacy
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sentence_transformers import SentenceTransformer

MODEL = 'all-MiniLM-L6-v2'

@st.cache_resource
def load_autotoken():
  # Tokenizer used only to count tokens when chunking documents
  autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
  return autotok

@st.cache_resource
def load_keyword_model():
  sentence_model = load_model()
  kw_model = KeyBERT(model=sentence_model)
  return kw_model

@st.cache_resource
def load_model():
  embedder = SentenceTransformer(MODEL)
  return embedder

def create_nest_sentences(document: str, token_max_length=1023):
  """Split a document into lists of sentences, where each list fits within the token limit."""
  nested = []
  sent = []
  length = 0
  tokenizer = load_autotoken()

  # Split on sentence boundaries: end punctuation followed by a space and a capital letter
  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
    tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)["input_ids"]  # Hugging Face transformer tokenizer
    length += len(tokens_in_sentence)

    if length < token_max_length:
      sent.append(sentence)
    else:
      nested.append(sent)
      sent = [sentence]
      length = len(tokens_in_sentence)  # the sentence that starts the new chunk still counts toward its budget

  if sent:
    nested.append(sent)
  return nested
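
# Hypothetical usage sketch: chunk a long document so each piece fits the
# 1024-token context of facebook/bart-large-mnli (variable names are illustrative):
#
#   chunks = create_nest_sentences(long_report_text, token_max_length=1023)
#   pieces = [" ".join(chunk) for chunk in chunks]  # each piece is model-safe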

def preprocess(text) -> str:
  stop_words = set(stopwords.words("english"))  # requires nltk.download('stopwords') to have been run
  text = text.lower()
  text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
  words = text.split()
  words = [w for w in words if w not in stop_words]
  return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
  # Extract single-word and one-to-two-word keyphrases, diversified via Max Sum Distance
  atomic_extractions = kw_model.extract_keywords(
    document, keyphrase_ngram_range=(1, 1), stop_words=None,
    use_maxsum=True, nr_candidates=20, top_n=10)
  complex_extractions = kw_model.extract_keywords(
    document, keyphrase_ngram_range=(1, 2), stop_words=None,
    use_maxsum=True, nr_candidates=20, top_n=10)
  # extract_keywords returns (keyphrase, score) pairs; keep only the phrases
  final_topics = [phrase for phrase, _ in atomic_extractions]
  final_topics += [phrase for phrase, _ in complex_extractions]
  return final_topics
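
# Hypothetical usage sketch (the loaders are the cached helpers defined above):
#
#   kw_model = load_keyword_model()
#   topics = generate_keywords(kw_model, preprocess(raw_text))
#   # topics holds up to 20 keyphrases: the top unigrams followed by the top uni/bigrams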

def cluster_based_on_topics(embedder, text1: str, text2: str, num_clusters: int = 2):
  # num_clusters is kept for API compatibility but is unused: the distance
  # threshold below lets agglomerative clustering choose the cluster count
  nlp = spacy.load("en_core_web_sm")

  # Sentence-split the raw texts first (preprocess() strips the punctuation
  # spaCy needs to find sentence boundaries), then preprocess each sentence
  doc1 = nlp(text1)
  doc2 = nlp(text2)

  sentences1 = [preprocess(sent.text) for sent in doc1.sents]
  sentences2 = [preprocess(sent.text) for sent in doc2.sents]
  all_sentences = sentences1 + sentences2

  # Debug dump of the extracted sentences
  with open('insight1_sent.txt', 'w') as f:
    for item in sentences1:
      f.write("%s\n" % item)

  with open('insight2_sent.txt', 'w') as f:
    for item in sentences2:
      f.write("%s\n" % item)

  # Generate sentence embeddings for each sentence
  sentence_embeddings1 = embedder.encode(sentences1)
  sentence_embeddings2 = embedder.encode(sentences2)
  all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)

  # Normalize the embeddings to unit length so Euclidean distance tracks cosine distance
  all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)

  # Perform agglomerative clustering; the distance threshold determines how many clusters emerge
  clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
  clustering_model.fit(all_embeddings)
  cluster_assignment = clustering_model.labels_

  # Group sentences by their assigned cluster id
  clustered_sentences = {}
  for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(all_sentences[sentence_id])

  return clustered_sentences
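
# Minimal end-to-end sketch, assuming this module backs a Streamlit page that
# compares two pasted texts. The page title, widget labels, and layout below
# are illustrative assumptions, not part of the original app.
if __name__ == "__main__":
  st.title("Topic comparison")
  text1 = st.text_area("First text")
  text2 = st.text_area("Second text")

  if st.button("Compare") and text1 and text2:
    kw_model = load_keyword_model()
    embedder = load_model()

    st.subheader("Keywords")
    st.write(generate_keywords(kw_model, preprocess(text1)))
    st.write(generate_keywords(kw_model, preprocess(text2)))

    st.subheader("Sentence clusters")
    for cluster_id, sents in cluster_based_on_topics(embedder, text1, text2).items():
      st.markdown(f"**Cluster {cluster_id}**")
      for s in sents:
        st.write(s)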