Spaces:

MarMont
/

MARITESS

Sleeping

App Files Files Community

MarMont committed on Jun 26, 2023

Commit

e2bd7bd

1 Parent(s): 997565c

Transfer code

Browse files

Files changed (4) hide show

app.py +339 -1
katip-december.csv +0 -0
requirements.txt +13 -0
stopwords-tl.json +1 -0

app.py CHANGED Viewed

@@ -1,7 +1,345 @@
 import gradio as gr
 def greet(name):
     return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

+# Required Libraries
+#Base and Cleaning
+import json
+import requests
+import pandas as pd
+import numpy as np
+import emoji
+import regex
+import re
+import string
+from collections import Counter
+import tqdm
+from operator import itemgetter
+#Visualizations
+import plotly.express as px
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pyLDAvis.gensim
+import chart_studio
+import chart_studio.plotly as py
+import chart_studio.tools as tls
+#Natural Language Processing (NLP)
+import spacy
+import gensim
+import json
+from spacy.tokenizer import Tokenizer
+from gensim.corpora import Dictionary
+from gensim.models.ldamulticore import LdaMulticore
+from gensim.models.coherencemodel import CoherenceModel
+from gensim.parsing.preprocessing import STOPWORDS as SW
+from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.model_selection import GridSearchCV
+from pprint import pprint
+from wordcloud import STOPWORDS
+from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
 import gradio as gr
def give_emoji_free_text(text):
    """
    Drops every whitespace-separated token of a tweet that contains an emoji.

    The whole token is removed, not only the emoji character, so a word
    with an emoji glued to it disappears entirely. Because the text is split
    and re-joined, runs of whitespace collapse to single spaces.

    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    # A set de-duplicates the emoji found in the text and makes the
    # per-character membership test below constant time.
    emoji_chars = {char for char in text if char in emoji.EMOJI_DATA}
    kept_words = [
        word for word in text.split()
        if not any(char in emoji_chars for char in word)
    ]
    return ' '.join(kept_words)
def url_free_text(text):
    """
    Returns the text with every run of non-whitespace characters that starts
    with "http" deleted. Surrounding whitespace is left untouched.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of lowercase word tokens.

    Cleaning steps, each applied to the result of the previous one:
        1. remove URLs (any run of non-whitespace starting with "http")
        2. remove every character that is not an ASCII letter, a digit or
           whitespace (this covers all punctuation)
        3. remove words that contain a digit
        4. lowercase the text and split it on whitespace

    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # URLs must go first: once punctuation is stripped they would survive as
    # ordinary-looking words.
    cleaned = re.sub(r"http\S+", "", text)
    # Whitespace is kept (not only the space character) so that words
    # separated by newlines or tabs are not glued together.
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned)
    # Remove words containing numbers
    cleaned = re.sub(r"\w*\d\w*", "", cleaned)
    # Make text lowercase and split it
    return cleaned.lower().split()
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Trains one LDA model per candidate topic count and scores each with the
    'c_v' coherence measure.

    Args:
        dictionary: token-id mapping, used both as the model's ``id2word``
            and as the coherence model's ``dictionary``.
        corpus: corpus the LDA models are trained on.
        texts: tokenised documents handed to the coherence model.
        limit (int): exclusive upper bound of the topic counts to try.
        start (int): first topic count to try.
        step (int): increment between topic counts.
    Returns:
        tuple: ``(model_list, coherence_values)``; entry ``j`` of both lists
        belongs to ``start + j * step`` topics. Both lists are empty when the
        range is empty.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Use the dictionary passed by the caller; the function has no other
        # id-to-word mapping in scope.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
def compute_coherence_values2(corpus, dictionary, k, a, b, texts=None):
    """
    Trains a single LDA model with the given hyper-parameters and returns its
    'c_v' coherence score.

    Args:
        corpus: bag-of-words corpus the model is trained on.
        dictionary: token-id mapping, used both as the model's ``id2word``
            and as the coherence model's ``dictionary``.
        k (int): number of topics.
        a: value passed as the model's ``alpha``.
        b: value passed as the model's ``eta``.
        texts: optional tokenised documents for the coherence model. When
            omitted, they are rebuilt from ``corpus`` through ``dictionary``.
    Returns:
        The coherence value reported by the coherence model.
    """
    if texts is None:
        # Expand every (token_id, count) pair back into tokens so the
        # function does not depend on any module-level data.
        # NOTE(review): the rebuilt documents lose the original word order;
        # confirm this is acceptable for the coherence measure in use.
        texts = [
            [dictionary[token_id] for token_id, count in bow for _ in range(int(count))]
            for bow in corpus
        ]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                alpha=a,
                                                eta=b,
                                                per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()
def assignTopic(l):
    """
    Returns the first element of the entry in ``l`` whose second element is
    largest (for (topic, weight) pairs: the topic with the highest weight).
    Ties go to the earliest entry.
    """
    heaviest = max(l, key=lambda pair: pair[1])
    return heaviest[0]
def get_topic_value(row, i):
    """
    Returns the weight stored for topic ``i`` in a list of
    (topic_id, weight) pairs.

    Pairs are matched on their topic id, not on their position: a row may
    list only some of the topics, in which case position ``i`` does not hold
    topic ``i`` (or does not exist at all).

    Args:
        row: sequence of (topic_id, weight) pairs.
        i: topic id to look up.
    Returns:
        The weight paired with topic ``i``, or 0.0 when the row does not
        list that topic.
    """
    for pair in row:
        if pair[0] == i:
            return pair[1]
    return 0.0
# Module-level placeholder frame; dataframeProcessing builds and returns its
# own local frame and does not modify this one.
df = pd.DataFrame()


def dataframeProcessing(dataset):
    """
    Runs the tweet topic-modelling pipeline and returns the annotated frame.

    Steps:
        1. read 'katip-december.csv' (needs the columns 'tweet' and
           'language') and keep the rows whose language is 'en'
        2. clean the tweets (emoji, URLs, stop words), lemmatize and tokenize
        3. pick the topic count in 2..9 with the best c_v coherence
        4. grid-search alpha/eta for that topic count and save the scores to
           'lda_tuning_results_new.csv'
        5. train the final LDA model, attach each tweet's topic distribution
           and dominant topic, and print up to five top tweets per topic

    Args:
        dataset: currently unused; the input is always read from
            'katip-december.csv'.
    Returns:
        pandas.DataFrame: the English tweets that received at least one
        topic, with the added columns 'emoji_free_tweets', 'url_free_tweets',
        'tokens', 'tokens_back_to_text', 'lemmas', 'lemmas_back_to_text',
        'lemma_tokens', 'topic' and 'max_topic'.
    """
    # Stop words: wordcloud defaults + Tagalog list from disk + extra fillers.
    with open('stopwords-tl.json') as f:
        tlStopwords = json.load(f)
    stopwords = set(STOPWORDS)
    stopwords.update(tlStopwords)
    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])

    df = pd.read_csv('katip-december.csv')
    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)
    # Keep English tweets only; reset_index() returns a fresh frame, so the
    # column assignments below never write into a slice.
    df = df[df['language'].isin(['en'])].reset_index()

    # Tweets free of emoji's, then free of url's
    df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)
    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

    # Load spacy
    nlp = spacy.load('en_core_web_lg')
    tokenizer = Tokenizer(nlp.vocab)
    # Custom stopwords added to spacy's defaults and the set built above
    custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
    stop_words = nlp.Defaults.stop_words.union(custom_stopwords)
    stop_words.update(stopwords)

    tokens = []
    for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
        tokens.append([token.text.lower() for token in doc
                       if token.text.lower() not in stop_words])
    # Makes tokens column
    df['tokens'] = tokens
    # Make tokens a string again
    df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

    def get_lemmas(text):
        '''Used to lemmatize the processed tweets'''
        return [token.lemma_ for token in nlp(text)
                if not token.is_stop and not token.is_punct and token.pos_ != 'PRON']

    df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
    # Make lemmas a string again
    df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
    # Apply tokenizer
    df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

    # Create a id2word dictionary and drop the extremes
    id2word = Dictionary(df['lemma_tokens'])
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))
    # Creating a corpus object
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Baseline model with a fixed topic count, printed for reference only.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True)
    pprint(lda_model.print_topics())

    # Choose the topic count with the highest coherence.
    first_k = 2
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                            texts=df['lemma_tokens'],
                                                            start=first_k,
                                                            limit=10,
                                                            step=1)
    # Entry j of coherence_values belongs to first_k + j topics.
    num_topics = coherence_values.index(max(coherence_values)) + first_k

    # Grid search over alpha/eta on 75% and 100% of the corpus.
    alpha = [0.05, 0.1, 0.5, 1, 5, 10]
    beta = [0.05, 0.1, 0.5, 1, 5, 10]
    num_of_docs = len(corpus)
    corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
                   corpus]
    corpus_title = ['75% Corpus', '100% Corpus']
    model_results = {'Validation_Set': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }
    # The bar total is the real number of runs, so it finishes at 100%.
    total_runs = len(corpus_sets) * len(alpha) * len(beta)
    with tqdm.tqdm(total=total_runs) as pbar:
        for title, corpus_set in zip(corpus_title, corpus_sets):
            for a in alpha:
                for b in beta:
                    cv = compute_coherence_values2(corpus=corpus_set, dictionary=id2word, k=num_topics, a=a, b=b)
                    model_results['Validation_Set'].append(title)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
    params_df = pd.DataFrame(model_results)
    params_df.to_csv('lda_tuning_results_new.csv', index=False)

    # Best alpha/eta according to the runs on the full corpus.
    params_df = params_df[params_df['Validation_Set'] == '100% Corpus']
    max_params = params_df.loc[params_df['Coherence'].idxmax()]
    max_alpha = max_params['Alpha']
    max_beta = max_params['Beta']

    # The final model uses the selected topic count, so it matches the
    # tuning runs above and the per-topic loop below.
    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                      id2word=id2word,
                                                      num_topics=num_topics,
                                                      random_state=100,
                                                      chunksize=200,
                                                      passes=10,
                                                      alpha=max_alpha,
                                                      eta=max_beta,
                                                      per_word_topics=True)
    for topic in lda_model_final.show_topics(num_words=10):
        print(topic)

    # Topic distribution per tweet, ordered by topic id; corpus has one
    # bag-of-words per row of df, in the same order.
    df['topic'] = [sorted(lda_model_final.get_document_topics(bow)) for bow in corpus]
    df = df[df['topic'].map(len) > 0].copy()
    df['max_topic'] = df['topic'].map(assignTopic)

    # Print the tweets with the highest weight for each topic.
    for i in range(num_topics):
        tweets = df.loc[df['max_topic'] == i].copy()
        tweets['topic'] = tweets['topic'].apply(get_topic_value, args=(i,))
        tweets_sorted = tweets.sort_values('topic', ascending=False)
        # De-duplicate while keeping the ranking, then take the top five.
        rep_tweets = tweets_sorted['original_tweets'].drop_duplicates().tolist()
        print('Topic ', i)
        print(rep_tweets[:5])
    return df
 def greet(name):
     """Return the string "Hello ", the given name and "!!" concatenated."""
     return "Hello " + name + "!!"
# dataframeProcessing takes one positional argument, so the interface declares
# one input component; the value entered there is currently ignored by it.
iface = gr.Interface(fn=dataframeProcessing, inputs="text", outputs=gr.Dataframe(headers=['original_tweets', 'max_topic']))
 iface.launch()

katip-december.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+emoji==1.7.0
+pandas-profiling==2.*
+plotly==4.*
+spacy>=3.0.0,<4.0.0
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl
+pyldavis
+gensim
+chart_studio
+autopep8
+transformers
+sentencepiece
+bert-extractive-summarizer
+tqdm

stopwords-tl.json ADDED Viewed

	@@ -0,0 +1 @@

+ ["akin","aking","ako","alin","am","amin","aming","ang","ano","anumang","apat","at","atin","ating","ay","bababa","bago","bakit","bawat","bilang","dahil","dalawa","dapat","din","dito","doon","gagawin","gayunman","ginagawa","ginawa","ginawang","gumawa","gusto","habang","hanggang","hindi","huwag","iba","ibaba","ibabaw","ibig","ikaw","ilagay","ilalim","ilan","inyong","isa","isang","itaas","ito","iyo","iyon","iyong","ka","kahit","kailangan","kailanman","kami","kanila","kanilang","kanino","kanya","kanyang","kapag","kapwa","karamihan","katiyakan","katulad","kaya","kaysa","ko","kong","kulang","kumuha","kung","laban","lahat","lamang","likod","lima","maaari","maaaring","maging","mahusay","makita","marami","marapat","masyado","may","mayroon","mga","minsan","mismo","mula","muli","na","nabanggit","naging","nagkaroon","nais","nakita","namin","napaka","narito","nasaan","ng","ngayon","ni","nila","nilang","nito","niya","niyang","noon","o","pa","paano","pababa","paggawa","pagitan","pagkakaroon","pagkatapos","palabas","pamamagitan","panahon","pangalawa","para","paraan","pareho","pataas","pero","pumunta","pumupunta","sa","saan","sabi","sabihin","sarili","sila","sino","siya","tatlo","tayo","tulad","tungkol","una","walang"]