Keane Moraes committed
Commit 1981c78 · 1 Parent(s): d87b50e

adding prompts and generation

Files changed (6)
  1. app.py +32 -12
  2. clustering.py +0 -2
  3. generation.py +3 -0
  4. insights.prompt +17 -0
  5. prompter/insights_33.prompt +21 -0
  6. utils.py +119 -78
app.py CHANGED
@@ -1,24 +1,44 @@
 import streamlit as st
-from topics import Insights
-
-import time
+from topics import TopicModelling
+import mdforest
+import utils
 
 st.title("Drop the first document")
 file1 = st.file_uploader("Upload a file", type=["md", "txt"], key="first")
 st.title("Drop the second document")
 file2 = st.file_uploader("Upload a file", type=["md", "txt"], key="second")
 
+topics = {}
+results = {}
+
 if file1 is not None and file2 is not None:
 
+    input_text1 = file1.read().decode("utf-8")
+    input_text2 = file2.read().decode("utf-8")
+
+    cleaned_text1 = mdforest.clean_markdown(input_text1)
+    cleaned_text2 = mdforest.clean_markdown(input_text2)
+
     st.title("Generating insights")
+
     with st.spinner('Generating insights...'):
-        insight1 = Insights(file1.read().decode("utf-8"))
-        insight2 = Insights(file2.read().decode("utf-8"))
-        st.write(insight1.generate_topics())
-        st.write(insight2.generate_topics())
-        st.write(insight1.text)
-        st.write(insight2.text)
-        embed1 = insight1.generate_embeddings()
-        embed2 = insight2.generate_embeddings()
+
+        insight1 = TopicModelling(cleaned_text1)
+        insight2 = TopicModelling(cleaned_text2)
+
+        keywords1, concepts1 = insight1.generate_topics()
+        topics['insight1'] = [keywords1, concepts1]
+        keywords2, concepts2 = insight2.generate_topics()
+        topics['insight2'] = [keywords2, concepts2]
+        st.success('Done!')
 
-        st.success('Done!')
+    with st.spinner("Flux capacitor is fluxing..."):
+        embedder = utils.load_model()
+        clutered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2)
+        print(clutered)
+        st.success("Done!")
+
+    with st.spinner("Polishing up"):
+        results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clutered)
+        st.write(results)
+        st.success("Done!")
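app.py now builds everything from a `TopicModelling` class imported from topics.py, which is not part of this commit. The sketch below is only an assumed shape of that interface, inferred from how app.py unpacks `generate_topics()` into `(keywords, concepts)` and from the KeyBERT extraction in `utils.generate_keywords`; the internals are hypothetical.

```python
# Hypothetical sketch of the TopicModelling interface app.py expects.
# topics.py is not in this commit, so the internals below are assumptions.
from keybert import KeyBERT

class TopicModelling:
    def __init__(self, text: str):
        self.text = text
        self.kw_model = KeyBERT()

    def generate_topics(self):
        # app.py unpacks this as (keywords, concepts).
        keywords = [kw for kw, _ in self.kw_model.extract_keywords(
            self.text, keyphrase_ngram_range=(1, 1), top_n=10)]
        concepts = [kw for kw, _ in self.kw_model.extract_keywords(
            self.text, keyphrase_ngram_range=(1, 2), top_n=10)]
        return keywords, concepts
```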
clustering.py DELETED
@@ -1,2 +0,0 @@
-import spacy
-import pandas as pd
generation.py ADDED
@@ -0,0 +1,3 @@
+import openai
+
+def
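The new generation.py is committed as a stub that stops at a bare `def`. A minimal sketch of the kind of completion helper it could hold, reusing the legacy `openai.Completion` call and the same parameters that `utils.generate_insights` uses below; the function name and signature here are hypothetical, not part of the commit.

```python
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

# Hypothetical helper; mirrors the parameters used in utils.generate_insights.
def generate(prompt: str, max_tokens: int = 1000) -> str:
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.5,
        top_p=1,
        max_tokens=max_tokens,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    return response["choices"][0]["text"]
```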
insights.prompt ADDED
@@ -0,0 +1,17 @@
+You are a highly intelligent bot that is tasked with common ideas between documents. The following are two documents that have been topic modelled and have been clustered based on concepts.
+
+The name for document 1 is : {{name1}}
+
+The name for document 2 is : {{name2}}
+
+The topics for document 1 is : {{topic1}}
+
+The topics for document 2 is : {{topic2}}
+
+The more complex concepts in document 1 is : {{complex1}}
+
+The more complex concepts in document 2 is : {{complex2}}
+
+The sentences in one of the clusters is : {{sentences}}
+
+From the sentences and topics above, explain the common idea between the documents and write a paragraph about it and give me 3 new concepts that are linked to this idea.
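The `{{...}}` markers are plain placeholders; `utils.generate_insights` (see the utils.py diff below) fills them with `str.replace` before calling the model. A condensed illustration of that substitution, with made-up values standing in for the real names, topics, and cluster sentences:

```python
# Illustration only: the values below are placeholders, not real output.
template = open("insights.prompt", "r").read()
filled = (
    template
    .replace("{{name1}}", "document_one.md")
    .replace("{{name2}}", "document_two.md")
    .replace("{{topic1}}", ",".join(["keyword_a", "keyword_b"]))
    .replace("{{topic2}}", ",".join(["keyword_c", "keyword_d"]))
    .replace("{{complex1}}", ",".join(["concept phrase a"]))
    .replace("{{complex2}}", ",".join(["concept phrase b"]))
    .replace("{{sentences}}", "\n".join(["sentence one", "sentence two"]))
)
print(filled)
```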
prompter/insights_33.prompt ADDED
@@ -0,0 +1,21 @@
+You are a highly intelligent bot that is tasked with common ideas between documents. The following are two documents that have been topic modelled and have been clustered based on concepts.
+
+The name for document 1 is : AI tutors will be held back by culture - by Henrik Karlsson.md
+
+The name for document 2 is : The Stability of Beliefs.md
+
+The topics for document 1 is : bull,picasso,education,ai,chilean,bull 1945,the bull,of bull,prize bull,bull to
+
+The topics for document 2 is : belief,beliefs,philosophy,epistemological,philosophic,science belief,scientific beliefs,beliefs ensconced,beliefs of,certain beliefs
+
+The more complex concepts in document 1 is : picasso lithographs bull,story bull bruce,bull culture necessary,lithographs bull 1945,bull didn know
+
+The more complex concepts in document 2 is : beliefs michael polanyi,beliefs held scientists,belief science declared,1951 scientific beliefs,michael polanyi essay
+
+The sentences in one of the clusters is : # key takeaways --- # transcript ## excerpt gpt-4, khan academy, wolfram alpha - we're seeing progress ai tools learning.
+demo state art ai tutoring capabilities, watch video march 14 salman khan khan academy demonstrates system built top gpt-4. video, khan uses ai model socratic tutor.
+gpt-4 occasionally hallucinates answers true.
+models improving faster anticipated, gpt-4 already scores top 10 percent university exams.
+march 23, nine days khan demo:ed tutoring system, openai partnered wolfram released plugin gives gpt-4 ability things like: way fluidly interacting information, shaping dialogue, immensely powerful.
+
+From the sentences and topics above, explain the common idea between the documents and write a paragraph about it and give me 3 new concepts that are linked to this idea.
utils.py CHANGED
@@ -2,7 +2,8 @@ import streamlit as st
 from keybert import KeyBERT
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer
-import re
+import os, re
+import openai
 import spacy
 from sklearn.cluster import KMeans, AgglomerativeClustering
 import numpy as np
@@ -12,8 +13,8 @@ MODEL = 'all-MiniLM-L6-v2'
 
 @st.cache_data
 def load_autotoken():
-    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
-    return autotok
+    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
+    return autotok
 
 @st.cache_data
 def load_keyword_model():
@@ -23,85 +24,125 @@ def load_keyword_model():
 
 @st.cache_data
 def load_model():
-    embedder = SentenceTransformer(MODEL)
-    return embedder
+    embedder = SentenceTransformer(MODEL)
+    return embedder
 
 def create_nest_sentences(document:str, token_max_length = 1023):
-    nested = []
-    sent = []
-    length = 0
-    tokenizer = load_autotoken()
-
-    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
-        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
-        length += len(tokens_in_sentence)
-
-        if length < token_max_length:
-            sent.append(sentence)
-        else:
-            nested.append(sent)
-            sent = [sentence]
-            length = 0
-
-    if sent:
-        nested.append(sent)
-    return nested
+    nested = []
+    sent = []
+    length = 0
+    tokenizer = load_autotoken()
+
+    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
+        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
+        length += len(tokens_in_sentence)
+
+        if length < token_max_length:
+            sent.append(sentence)
+        else:
+            nested.append(sent)
+            sent = [sentence]
+            length = 0
+
+    if sent:
+        nested.append(sent)
+    return nested
 
 def preprocess(text) -> str:
-    stop_words = set(stopwords.words("english"))
-    text = text.lower()
-    text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
-    words = text.split()
-    words = [w for w in words if not w in stop_words]
-    return " ".join(words)
+    stop_words = set(stopwords.words("english"))
+    text = text.lower()
+    # text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
+    words = text.split()
+    words = [w for w in words if not w in stop_words]
+    return " ".join(words)
 
 def generate_keywords(kw_model, document: str) -> list:
-    atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
-    complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
-    final_topics = []
-    for extraction in atomic_extractions:
-        final_topics.append(extraction[0])
-    for extraction in complex_extractions:
-        final_topics.append(extraction[0])
-    return final_topics
-
-def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters:int = 2):
-    nlp = spacy.load("en_core_web_sm")
-
-    # Preprocess and tokenize the texts
-    doc1 = nlp(preprocess(text1))
-    doc2 = nlp(preprocess(text2))
+    atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+    complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+    final_topics = []
+    for extraction in atomic_extractions:
+        final_topics.append(extraction[0])
+    for extraction in complex_extractions:
+        final_topics.append(extraction[0])
+    return final_topics
+
+def cluster_based_on_topics(embedder, text1:str, text2:str):
+    nlp = spacy.load("en_core_web_sm")
+
+    # Preprocess and tokenize the texts
+    doc1 = nlp(preprocess(text1))
+    doc2 = nlp(preprocess(text2))
 
-    # Extract sentences from the texts
-    sentences1 = [sent.text for sent in doc1.sents]
-    sentences2 = [sent.text for sent in doc2.sents]
-    all_sentences = sentences1 + sentences2
-
-    with open('insight1_sent.txt', 'w') as f:
-        for item in sentences1:
-            f.write("%s\n" % item)
-
-    with open('insight2_sent.txt', 'w') as f:
-        for item in sentences2:
-            f.write("%s\n" % item)
-
-    # Generate sentence embeddings for each sentence
-    sentence_embeddings1 = embedder.encode(sentences1)
-    sentence_embeddings2 = embedder.encode(sentences2)
-    all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
-
-    # Normalize the embeddings to unit length
-    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
-
-    # Perform kmean clustering
-    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
-    clustering_model.fit(all_embeddings)
-    cluster_assignment = clustering_model.labels_
-
-    clustered_sentences = {}
-    for sentence_id, cluster_id in enumerate(cluster_assignment):
-        if cluster_id not in clustered_sentences:
-            clustered_sentences[cluster_id] = []
-        clustered_sentences[cluster_id].append(all_sentences[sentence_id])
-
-    return clustered_sentences
+    # Extract sentences from the texts
+    sentences1 = [sent.text for sent in doc1.sents]
+    sentences2 = [sent.text for sent in doc2.sents]
+    all_sentences = sentences1 + sentences2
+
+    # Generate sentence embeddings for each sentence
+    sentence_embeddings1 = embedder.encode(sentences1)
+    sentence_embeddings2 = embedder.encode(sentences2)
+    all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
+
+    # Normalize the embeddings to unit length
+    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
+
+    # Perform agglomerative clustering
+    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
+    clustering_model.fit(all_embeddings)
+    cluster_assignment = clustering_model.labels_
+
+    clustered_sentences = {}
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        if cluster_id not in clustered_sentences:
+            clustered_sentences[cluster_id] = []
+        clustered_sentences[cluster_id].append(all_sentences[sentence_id])
+
+    return clustered_sentences
+
+
+def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, clusters) -> list:
+
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    PROMPT = open("insights.prompt", "r").read()
+
+    print(topics)
+
+    PROMPT = PROMPT.replace("{{name1}}", name1)
+    PROMPT = PROMPT.replace("{{name2}}", name2)
+
+    PROMPT = PROMPT.replace("{{topic1}}", ",".join(topics['insight1'][0]))
+    PROMPT = PROMPT.replace("{{topic2}}", ",".join(topics['insight2'][0]))
+
+    PROMPT = PROMPT.replace("{{complex1}}", ",".join(topics['insight1'][1]))
+    PROMPT = PROMPT.replace("{{complex2}}", ",".join(topics['insight2'][1]))
+
+    final_insights = []
+
+    for cluster_id, sentences in clusters.items():
+
+        PROMPT = PROMPT.replace("{{sentences}}", "\n".join(sentences))
+
+        with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
+            f.write(PROMPT)
+
+        # Generate insights for each cluster
+        response = openai.Completion.create(
+            model="text-davinci-003",
+            prompt=PROMPT,
+            temperature=0.5,
+            top_p=1,
+            max_tokens=1000,
+            frequency_penalty=0.0,
+            presence_penalty=0.0,
+        )
+
+        text = response['choices'][0]['text']
+        with open(f"prompter/insights_{cluster_id}.txt", "a") as f:
+            f.write(text)
+
+        final_insights.append(text)
+
+    return final_insights
+
+
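One detail worth flagging in `generate_insights`: `{{sentences}}` is substituted into the already-filled `PROMPT` inside the loop, so after the first cluster the placeholder is gone and later clusters silently reuse the first cluster's sentences. Below is a sketch of a per-cluster substitution that keeps the shared template intact; the helper name is illustrative and not part of the commit.

```python
def build_cluster_prompts(topics: dict, name1: str, name2: str, clusters: dict) -> dict:
    # Illustrative helper: fill the per-document fields once, then substitute the
    # sentences on a fresh copy per cluster so {{sentences}} is always present.
    base = open("insights.prompt", "r").read()
    replacements = {
        "{{name1}}": name1,
        "{{name2}}": name2,
        "{{topic1}}": ",".join(topics["insight1"][0]),
        "{{topic2}}": ",".join(topics["insight2"][0]),
        "{{complex1}}": ",".join(topics["insight1"][1]),
        "{{complex2}}": ",".join(topics["insight2"][1]),
    }
    for placeholder, value in replacements.items():
        base = base.replace(placeholder, value)
    return {
        cluster_id: base.replace("{{sentences}}", "\n".join(sentences))
        for cluster_id, sentences in clusters.items()
    }
```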