Update functions.py
functions.py CHANGED  (+35 / -41)
@@ -102,68 +102,61 @@ def sentiment_pipe(earnings_text):
     earnings_sentiment = sent_pipe(earnings_sentences)

     return earnings_sentiment, earnings_sentences
-
+
 @st.experimental_memo(suppress_st_warning=True)
-def
-    '''
-
+def clean_text(text):
+    '''Clean all text'''
+
     text = text.encode("ascii", "ignore").decode() # unicode
     text = re.sub(r"https*\S+", " ", text) # url
     text = re.sub(r"@\S+", " ", text) # mentions
     text = re.sub(r"#\S+", " ", text) # hastags
     text = re.sub(r"\s{2,}", " ", text) # over spaces

-
-
+    return text
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_long_text(text,threshold,window_size=3):
+    '''Preprocess text and chunk for semantic search and sentiment analysis'''

-    #
-
+    #Convert cleaned text into sentences
+    sentences = sent_tokenize(text)

-
-
+    out = []
+
+    #Limit the length of each sentence to a threshold
+    for chunk in sentences:
+        if len(chunk.split()) < threshold:
+            out.append(chunk)
+        else:
+            words = chunk.split()
+            num = int(len(words)/threshold)
+            for i in range(0,num*threshold+1,threshold):
+                out.append(' '.join(words[i:threshold+i]))

-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n',' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    #We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    #Smaller value: Context from other sentences might get lost
-    #Lager values: More context from the paragraph remains, but results are longer
-    window_size = window_size
     passages = []
-
+
+    #Combine sentences into a window of size window_size
+    for paragraph in [out]:
         for start_idx in range(0, len(paragraph), window_size):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
+
     return passages
-
-@st.experimental_memo(suppress_st_warning=True)
-def chunk_and_preprocess_text(text):
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):

-    """Chunk text longer than
+    """Chunk text longer than n tokens for summarization"""

-
-    text = re.sub(r"https*\S+", " ", text) # url
-    text = re.sub(r"@\S+", " ", text) # mentions
-    text = re.sub(r"#\S+", " ", text) # hastags
-    text = re.sub(r"\s{2,}", " ", text) # over spaces
-
-    article = nlp(text)
-    sentences = [i.text for i in list(article.sents)]
+    sentences = sent_tokenize(text)

     current_chunk = 0
     chunks = []

     for sentence in sentences:
         if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <=
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
                 chunks[current_chunk].extend(sentence.split(" "))
             else:
                 current_chunk += 1
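For context, the two helpers introduced in this hunk are meant to be chained: clean_text() strips URLs, mentions, hashtags and extra whitespace, and chunk_long_text() turns the cleaned text into short passages for semantic search and sentiment scoring. A minimal usage sketch follows; the sample string, the threshold of 80 words and the window_size of 3 are illustrative assumptions, not values taken from this commit.

# Illustrative only: assumes nltk's punkt data is available for sent_tokenize,
# and that clean_text / chunk_long_text are the helpers defined in this file.
raw = "Q3 revenue grew 12%. See https://example.com/ir for details. #earnings"

cleaned = clean_text(raw)                                        # urls/mentions/hashtags removed, spaces squeezed
passages = chunk_long_text(cleaned, threshold=80, window_size=3)
# Sentences longer than 80 words are split into 80-word pieces,
# then grouped into passages of up to window_size sentences each.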
@@ -174,7 +167,8 @@ def chunk_and_preprocess_text(text):
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])

-    return chunks
+    return chunks
+

 def summary_downloader(raw_text):

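On the summarization path, chunk_and_preprocess_text(text, thresh=500) now re-tokenizes with sent_tokenize and packs sentences into chunks of roughly at most thresh words. Continuing the sketch above, the intended call pattern might look like this; the 500-word value is the default from the diff, while treating sum_pipe (from load_models()) as a Hugging Face summarization pipeline is an assumption.

# Illustrative only: sum_pipe is assumed to be a transformers summarization pipeline.
chunks = chunk_and_preprocess_text(cleaned, thresh=500)
summaries = [sum_pipe(chunk)[0]["summary_text"] for chunk in chunks]
full_summary = " ".join(summaries)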
@@ -318,4 +312,4 @@ def fin_ext(text):

 nlp = get_spacy()
 sent_pipe, sum_pipe, ner_pipe, cross_encoder = load_models()
-sbert = load_sbert('all-MiniLM-
+sbert = load_sbert('all-MiniLM-L12-v2')
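The final hunk only changes which SentenceTransformer checkpoint is passed to load_sbert (the old model name is cut off in the diff above). Assuming load_sbert simply wraps sentence_transformers.SentenceTransformer, the passages produced by chunk_long_text would feed a semantic search step roughly like this; the query string is illustrative and none of this code is part of the commit.

# Illustrative only: assumes load_sbert returns a sentence_transformers.SentenceTransformer.
from sentence_transformers import SentenceTransformer, util

sbert = SentenceTransformer("all-MiniLM-L12-v2")
corpus_embeddings = sbert.encode(passages, convert_to_tensor=True)
query_embedding = sbert.encode("What was said about margins?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)  # cross_encoder could re-rank these hits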