Spaces: Build error
Keane Moraes committed · 1981c78
Parent(s): d87b50e

adding prompts and generation
Browse files
- app.py +32 -12
- clustering.py +0 -2
- generation.py +3 -0
- insights.prompt +17 -0
- prompter/insights_33.prompt +21 -0
- utils.py +119 -78
app.py
CHANGED

@@ -1,24 +1,44 @@
 import streamlit as st
-from topics import Insights
-
-import
+from topics import TopicModelling
+import mdforest
+import utils
 
 st.title("Drop the first document")
 file1 = st.file_uploader("Upload a file", type=["md", "txt"], key="first")
 st.title("Drop the second document")
 file2 = st.file_uploader("Upload a file", type=["md", "txt"], key="second")
 
+topics = {}
+results = {}
+
 if file1 is not None and file2 is not None:
+
+    input_text1 = file1.read().decode("utf-8")
+    input_text2 = file2.read().decode("utf-8")
+
+    cleaned_text1 = mdforest.clean_markdown(input_text1)
+    cleaned_text2 = mdforest.clean_markdown(input_text2)
 
     st.title("Generating insights")
+
     with st.spinner('Generating insights...'):
-        insight1 = Insights(file1.read().decode("utf-8"))
-        insight2 = Insights(file2.read().decode("utf-8"))
-        st.write(insight1.generate_topics())
-        st.write(insight2.generate_topics())
-        st.write(insight1.text)
-        st.write(insight2.text)
-        embed1 = insight1.generate_embeddings()
-        embed2 = insight2.generate_embeddings()
-
+        insight1 = TopicModelling(cleaned_text1)
+        insight2 = TopicModelling(cleaned_text2)
+
+        keywords1, concepts1 = insight1.generate_topics()
+        topics['insight1'] = [keywords1, concepts1]
+        keywords2, concepts2 = insight2.generate_topics()
+        topics['insight2'] = [keywords2, concepts2]
+        st.success('Done!')
+
+    with st.spinner("Flux capacitor is fluxing..."):
+        embedder = utils.load_model()
+        clustered = utils.cluster_based_on_topics(embedder, cleaned_text1, cleaned_text2)
+        print(clustered)
+        st.success("Done!")
+
+    with st.spinner("Polishing up"):
+        results = utils.generate_insights(topics, file1.name, file2.name, cleaned_text1, cleaned_text2, clustered)
+        st.write(results)
+        st.success("Done!")
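The new app.py wires the whole pipeline together: clean both uploads with mdforest, topic-model each document, cluster sentences across the pair, then prompt the model once per cluster. A minimal sketch of the same flow outside Streamlit, assuming the repo's TopicModelling class and utils helpers behave as shown in this commit (the file paths and the `run_pipeline` name are hypothetical):

```python
# Hypothetical driver for the app.py pipeline, without the Streamlit UI.
import mdforest
import utils
from topics import TopicModelling

def run_pipeline(path1: str, path2: str) -> list:
    # Read and clean both markdown documents
    with open(path1, encoding="utf-8") as f1, open(path2, encoding="utf-8") as f2:
        cleaned1 = mdforest.clean_markdown(f1.read())
        cleaned2 = mdforest.clean_markdown(f2.read())

    # Topic-model each document: [keywords, complex concepts] per document
    topics = {
        "insight1": list(TopicModelling(cleaned1).generate_topics()),
        "insight2": list(TopicModelling(cleaned2).generate_topics()),
    }

    # Embed and cluster sentences across both documents, then prompt per cluster
    embedder = utils.load_model()
    clusters = utils.cluster_based_on_topics(embedder, cleaned1, cleaned2)
    return utils.generate_insights(topics, path1, path2, cleaned1, cleaned2, clusters)

if __name__ == "__main__":
    print(run_pipeline("doc1.md", "doc2.md"))
```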
clustering.py
DELETED

@@ -1,2 +0,0 @@
-import spacy
-import pandas as pd
generation.py
ADDED

@@ -0,0 +1,3 @@
+import openai
+
+def
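generation.py lands as a stub: an openai import and a bare `def`. The intended function body is not recoverable from this commit; a hedged guess at where it is headed, mirroring the Completion call that utils.generate_insights makes below (the name `generate` and its signature are my assumptions, not the author's code):

```python
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

# Hypothetical completion wrapper; this commit only contains the import and a bare "def".
def generate(prompt: str) -> str:
    # Same legacy Completion parameters that utils.generate_insights uses
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.5,
        top_p=1,
        max_tokens=1000,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    return response["choices"][0]["text"]
```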
insights.prompt
ADDED

@@ -0,0 +1,17 @@
+You are a highly intelligent bot that is tasked with common ideas between documents. The following are two documents that have been topic modelled and have been clustered based on concepts.
+
+The name for document 1 is : {{name1}}
+
+The name for document 2 is : {{name2}}
+
+The topics for document 1 is : {{topic1}}
+
+The topics for document 2 is : {{topic2}}
+
+The more complex concepts in document 1 is : {{complex1}}
+
+The more complex concepts in document 2 is : {{complex2}}
+
+The sentences in one of the clusters is : {{sentences}}
+
+From the sentences and topics above, explain the common idea between the documents and write a paragraph about it and give me 3 new concepts that are linked to this idea.
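The template's `{{placeholder}}` markers are filled with plain str.replace in utils.generate_insights below; a quick sketch of the convention (the values here are illustrative):

```python
# Fill the {{...}} markers the way utils.generate_insights does (values are illustrative).
prompt = open("insights.prompt", "r").read()
for marker, value in {
    "{{name1}}": "doc1.md",
    "{{name2}}": "doc2.md",
    "{{topic1}}": "topic a,topic b",
    "{{topic2}}": "topic c,topic d",
    "{{complex1}}": "complex concept a",
    "{{complex2}}": "complex concept b",
    "{{sentences}}": "sentence one.\nsentence two.",
}.items():
    prompt = prompt.replace(marker, value)
print(prompt)
```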
prompter/insights_33.prompt
ADDED

@@ -0,0 +1,21 @@
+You are a highly intelligent bot that is tasked with common ideas between documents. The following are two documents that have been topic modelled and have been clustered based on concepts.
+
+The name for document 1 is : AI tutors will be held back by culture - by Henrik Karlsson.md
+
+The name for document 2 is : The Stability of Beliefs.md
+
+The topics for document 1 is : bull,picasso,education,ai,chilean,bull 1945,the bull,of bull,prize bull,bull to
+
+The topics for document 2 is : belief,beliefs,philosophy,epistemological,philosophic,science belief,scientific beliefs,beliefs ensconced,beliefs of,certain beliefs
+
+The more complex concepts in document 1 is : picasso lithographs bull,story bull bruce,bull culture necessary,lithographs bull 1945,bull didn know
+
+The more complex concepts in document 2 is : beliefs michael polanyi,beliefs held scientists,belief science declared,1951 scientific beliefs,michael polanyi essay
+
+The sentences in one of the clusters is : # key takeaways --- # transcript ## excerpt gpt-4, khan academy, wolfram alpha - we're seeing progress ai tools learning.
+demo state art ai tutoring capabilities, watch video march 14 salman khan khan academy demonstrates system built top gpt-4. video, khan uses ai model socratic tutor.
+gpt-4 occasionally hallucinates answers true.
+models improving faster anticipated, gpt-4 already scores top 10 percent university exams.
+march 23, nine days khan demo:ed tutoring system, openai partnered wolfram released plugin gives gpt-4 ability things like: way fluidly interacting information, shaping dialogue, immensely powerful.
+
+From the sentences and topics above, explain the common idea between the documents and write a paragraph about it and give me 3 new concepts that are linked to this idea.
utils.py
CHANGED

@@ -2,7 +2,8 @@ import streamlit as st
 from keybert import KeyBERT
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer
-import re
+import os, re
+import openai
 import spacy
 from sklearn.cluster import KMeans, AgglomerativeClustering
 import numpy as np

@@ -12,8 +13,8 @@ MODEL = 'all-MiniLM-L6-v2'
 
 @st.cache_data
 def load_autotoken():
+    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
+    return autotok
 
 @st.cache_data
 def load_keyword_model():

@@ -23,85 +24,125 @@ def load_keyword_model():
 
 @st.cache_data
 def load_model():
+    embedder = SentenceTransformer(MODEL)
+    return embedder
 
 def create_nest_sentences(document:str, token_max_length = 1023):
+    nested = []
+    sent = []
+    length = 0
+    tokenizer = load_autotoken()
+
+    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
+        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]  # hugging face transformer tokenizer
+        length += len(tokens_in_sentence)
+
+        if length < token_max_length:
+            sent.append(sentence)
+        else:
+            nested.append(sent)
+            sent = [sentence]
+            length = 0
+
+    if sent:
+        nested.append(sent)
+    return nested
 
 def preprocess(text) -> str:
+    stop_words = set(stopwords.words("english"))
+    text = text.lower()
+    # text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
+    words = text.split()
+    words = [w for w in words if not w in stop_words]
+    return " ".join(words)
 
 def generate_keywords(kw_model, document: str) -> list:
+    atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+    complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+    final_topics = []
+    for extraction in atomic_extractions:
+        final_topics.append(extraction[0])
+    for extraction in complex_extractions:
+        final_topics.append(extraction[0])
+    return final_topics
 
-def cluster_based_on_topics(embedder, text1:str, text2:str
+def cluster_based_on_topics(embedder, text1:str, text2:str):
+    nlp = spacy.load("en_core_web_sm")
+
+    # Preprocess and tokenize the texts
+    doc1 = nlp(preprocess(text1))
+    doc2 = nlp(preprocess(text2))
+
     # Extract sentences from the texts
     sentences1 = [sent.text for sent in doc1.sents]
     sentences2 = [sent.text for sent in doc2.sents]
     all_sentences = sentences1 + sentences2
 
-    with open('insight1_sent.txt', 'w') as f:
-        for item in sentences1:
-            f.write("%s\n" % item)
-
-    with open('insight2_sent.txt', 'w') as f:
-        for item in sentences2:
-            f.write("%s\n" % item)
-
     # Generate sentence embeddings for each sentence
     sentence_embeddings1 = embedder.encode(sentences1)
     sentence_embeddings2 = embedder.encode(sentences2)
     all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
 
     # Normalize the embeddings to unit length
     all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
 
-    # Perform kmean clustering
+    # Perform agglomerative clustering
     clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
     clustering_model.fit(all_embeddings)
     cluster_assignment = clustering_model.labels_
 
     clustered_sentences = {}
     for sentence_id, cluster_id in enumerate(cluster_assignment):
         if cluster_id not in clustered_sentences:
             clustered_sentences[cluster_id] = []
         clustered_sentences[cluster_id].append(all_sentences[sentence_id])
 
     return clustered_sentences
+
+
+def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, clusters) -> list:
+
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    PROMPT = open("insights.prompt", "r").read()
+
+    print(topics)
+
+    PROMPT = PROMPT.replace("{{name1}}", name1)
+    PROMPT = PROMPT.replace("{{name2}}", name2)
+
+    PROMPT = PROMPT.replace("{{topic1}}", ",".join(topics['insight1'][0]))
+    PROMPT = PROMPT.replace("{{topic2}}", ",".join(topics['insight2'][0]))
+
+    PROMPT = PROMPT.replace("{{complex1}}", ",".join(topics['insight1'][1]))
+    PROMPT = PROMPT.replace("{{complex2}}", ",".join(topics['insight2'][1]))
+
+    final_insights = []
+
+    for cluster_id, sentences in clusters.items():
+
+        PROMPT = PROMPT.replace("{{sentences}}", "\n".join(sentences))
+
+        with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
+            f.write(PROMPT)
+
+        # Generate insights for each cluster
+        response = openai.Completion.create(
+            model="text-davinci-003",
+            prompt=PROMPT,
+            temperature=0.5,
+            top_p=1,
+            max_tokens=1000,
+            frequency_penalty=0.0,
+            presence_penalty=0.0,
+        )
+
+        text = response['choices'][0]['text']
+        with open(f"prompter/insights_{cluster_id}.txt", "a") as f:
+            f.write(text)
+
+        final_insights.append(text)
+
+    return final_insights
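One thing worth flagging in generate_insights: the loop substitutes `{{sentences}}` into PROMPT itself, so the marker is consumed on the first iteration and every later cluster silently reuses the first cluster's sentences (the per-cluster .prompt files record that stale text too). A minimal sketch of a fix, keeping the header-filled template immutable and substituting into a per-cluster copy:

```python
# Keep the template (names/topics/concepts already filled) and substitute per cluster.
template = PROMPT

final_insights = []
for cluster_id, sentences in clusters.items():
    cluster_prompt = template.replace("{{sentences}}", "\n".join(sentences))

    with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
        f.write(cluster_prompt)

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=cluster_prompt,
        temperature=0.5,
        top_p=1,
        max_tokens=1000,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    final_insights.append(response["choices"][0]["text"])
```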
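For a sense of what cluster_based_on_topics produces, the clustering step can be run on toy sentences; a self-contained sketch using the same MODEL constant and AgglomerativeClustering settings as utils.py (the example sentences are made up):

```python
# Toy run of the clustering step: normalized MiniLM embeddings + agglomerative clustering.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # MODEL in utils.py
sentences = [
    "cats purr when they are content.",
    "dogs bark at strangers.",
    "stock markets fell sharply today.",
    "equity indices dropped on weak earnings.",
]

embeddings = embedder.encode(sentences)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# n_clusters=None lets distance_threshold decide how many clusters emerge
labels = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit_predict(embeddings)

clustered = {}
for sentence, label in zip(sentences, labels):
    clustered.setdefault(label, []).append(sentence)
print(clustered)
```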