Update functions.py
functions.py CHANGED  (+35 / -41)
@@ -102,68 +102,61 @@ def sentiment_pipe(earnings_text):
     earnings_sentiment = sent_pipe(earnings_sentences)

     return earnings_sentiment, earnings_sentences
-
+
 @st.experimental_memo(suppress_st_warning=True)
-def
-    '''
-
+def clean_text(text):
+    '''Clean all text'''
+
     text = text.encode("ascii", "ignore").decode() # unicode
     text = re.sub(r"https*\S+", " ", text) # url
     text = re.sub(r"@\S+", " ", text) # mentions
     text = re.sub(r"#\S+", " ", text) # hastags
     text = re.sub(r"\s{2,}", " ", text) # over spaces

-
-
+    return text
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_long_text(text,threshold,window_size=3):
+    '''Preprocess text and chunk for semantic search and sentiment analysis'''

-    #
-
+    #Convert cleaned text into sentences
+    sentences = sent_tokenize(text)

-
-
+    out = []
+
+    #Limit the length of each sentence to a threshold
+    for chunk in sentences:
+        if len(chunk.split()) < threshold:
+            out.append(chunk)
+        else:
+            words = chunk.split()
+            num = int(len(words)/threshold)
+            for i in range(0,num*threshold+1,threshold):
+                out.append(' '.join(words[i:threshold+i]))

-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n',' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    #We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    #Smaller value: Context from other sentences might get lost
-    #Lager values: More context from the paragraph remains, but results are longer
-    window_size = window_size
     passages = []
-
+
+    #Combine sentences into a window of size window_size
+    for paragraph in [out]:
         for start_idx in range(0, len(paragraph), window_size):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
+
     return passages
-
-@st.experimental_memo(suppress_st_warning=True)
-def chunk_and_preprocess_text(text):
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):

-    """Chunk text longer than
+    """Chunk text longer than n tokens for summarization"""

-
-    text = re.sub(r"https*\S+", " ", text) # url
-    text = re.sub(r"@\S+", " ", text) # mentions
-    text = re.sub(r"#\S+", " ", text) # hastags
-    text = re.sub(r"\s{2,}", " ", text) # over spaces
-
-    article = nlp(text)
-    sentences = [i.text for i in list(article.sents)]
+    sentences = sent_tokenize(text)

     current_chunk = 0
     chunks = []

     for sentence in sentences:
         if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <=
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
                 chunks[current_chunk].extend(sentence.split(" "))
             else:
                 current_chunk += 1
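For context, the two helpers introduced in this hunk are meant to be chained: clean_text() strips URLs, mentions, hashtags and extra whitespace, and chunk_long_text() turns the cleaned text into short passages for semantic search and sentiment scoring. A minimal usage sketch follows; the sample string, the threshold of 80 words and the window_size of 3 are illustrative assumptions, not values taken from this commit.

# Illustrative only: assumes nltk's punkt data is available for sent_tokenize,
# and that clean_text / chunk_long_text are the helpers defined in this file.
raw = "Q3 revenue grew 12%. See https://example.com/ir for details. #earnings"

cleaned = clean_text(raw)                                        # urls/mentions/hashtags removed, spaces squeezed
passages = chunk_long_text(cleaned, threshold=80, window_size=3)
# Sentences longer than 80 words are split into 80-word pieces,
# then grouped into passages of up to window_size sentences each.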
@@ -174,7 +167,8 @@ def chunk_and_preprocess_text(text):
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])

-    return chunks
+    return chunks
+

 def summary_downloader(raw_text):

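On the summarization path, chunk_and_preprocess_text(text, thresh=500) now re-tokenizes with sent_tokenize and packs sentences into chunks of roughly at most thresh words. Continuing the sketch above, the intended call pattern might look like this; the 500-word value is the default from the diff, while treating sum_pipe (from load_models()) as a Hugging Face summarization pipeline is an assumption.

# Illustrative only: sum_pipe is assumed to be a transformers summarization pipeline.
chunks = chunk_and_preprocess_text(cleaned, thresh=500)
summaries = [sum_pipe(chunk)[0]["summary_text"] for chunk in chunks]
full_summary = " ".join(summaries)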
@@ -318,4 +312,4 @@ def fin_ext(text):

 nlp = get_spacy()
 sent_pipe, sum_pipe, ner_pipe, cross_encoder = load_models()
-sbert = load_sbert('all-MiniLM-
+sbert = load_sbert('all-MiniLM-L12-v2')
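The final hunk only changes which SentenceTransformer checkpoint is passed to load_sbert (the old model name is cut off in the diff above). Assuming load_sbert simply wraps sentence_transformers.SentenceTransformer, the passages produced by chunk_long_text would feed a semantic search step roughly like this; the query string is illustrative and none of this code is part of the commit.

# Illustrative only: assumes load_sbert returns a sentence_transformers.SentenceTransformer.
from sentence_transformers import SentenceTransformer, util

sbert = SentenceTransformer("all-MiniLM-L12-v2")
corpus_embeddings = sbert.encode(passages, convert_to_tensor=True)
query_embedding = sbert.encode("What was said about margins?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)  # cross_encoder could re-rank these hits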