import os
import pickle

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from gradio.components import JSON, HighlightedText, Textbox
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def lowercasing(samples):
    """Lower-case every document in place and return the list."""
    for idx, sample in enumerate(tqdm(samples)):
        samples[idx] = sample.lower()
    return samples


def punctuation_removal(samples):
    """Strip leading/trailing punctuation from every word, in place."""
    # Non-exhaustive; it is unclear whether punctuation should be treated as
    # significant. Punctuation inside words is left alone.
    for i, sample in enumerate(tqdm(samples)):
        words = sample.split()
        for j, word in enumerate(words):
            words[j] = word.strip(" .!?@#&():;,'/\\")
        samples[i] = " ".join(words)
    return samples


def get_wordnet_pos(word):
    """Map a POS tag to the first character that lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(samples):
    """Lemmatize every word, choosing the WordNet form from its POS tag."""
    wnl = WordNetLemmatizer()
    for i, sample in enumerate(tqdm(samples)):
        words = sample.split()
        for j, word in enumerate(words):
            # pos_tag() is called one word at a time, so tagging is slow and
            # has no sentence context.
            words[j] = wnl.lemmatize(word, get_wordnet_pos(word))
        samples[i] = " ".join(words)
    return samples


def predict(text):
    """Label each sentence with its dominant LDA topic and VADER compound
    sentiment, and return the top words of every topic that appears."""
    raw_sentences = sent_tokenize(text)
    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)
    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        vs = analyzer.polarity_scores(raw)
        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()
        res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
        present_topics.add(topic)
    topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
    return [res, topics]


# Load every JSON-lines review file in the working directory.
json_files = [name for name in os.listdir(".") if name.endswith(".json")]
df = pd.concat([pd.read_json(path_or_buf=name, lines=True) for name in json_files])

n_features = 1000  # unused below: the fitted vectorizer is loaded from disk
n_components = 10  # number of LDA topics

# Corpora needed by pos_tag, WordNetLemmatizer, and sent_tokenize.
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")

# The CountVectorizer was fitted offline; load it pre-fitted.
with open("vectorizer.pkl", "rb") as f:
    tf_vectorizer = pickle.load(f)

# Fit LDA on all reviews of a single product, dropping non-string (NaN) rows.
product_id = "B009MA34NY"
lda_samples = [r for r in df[df["asin"] == product_id]["reviewText"] if isinstance(r, str)]
lda_samples = lowercasing(lda_samples)
lda_samples = punctuation_removal(lda_samples)
lda_samples = lemmatize(lda_samples)
documents = tf_vectorizer.transform(lda_samples)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_offset=50.0,
    random_state=0,
)
lda.fit(documents)

# Top ten terms per topic, used for the JSON legend shown beside the highlights.
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = []
for topic in lda.components_:
    top_features_ind = topic.argsort()[:-11:-1]  # ten highest-weight terms, descending
    topic_words.append([feature_names[i] for i in top_features_ind])
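# Quick sanity check (illustrative sketch; safe to remove): print each topic's
# top terms so the "Topic N" labels used in the UI can be interpreted from the
# console.
for topic_idx, words in enumerate(topic_words):
    print(f"Topic {topic_idx + 1}: {', '.join(words)}")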
analyzer = SentimentIntensityAnalyzer()

# HighlightedText needs a color for every label string predict() can emit, so
# enumerate "Topic i (s)" for each topic and every two-decimal sentiment value
# in [-1, 1].
sentiment_vals = np.linspace(-1.0, 1.0, num=201)
colors = {
    1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown",
    6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow",
}
color_map = {}
for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})

gr.Interface(
    fn=predict,
    inputs=Textbox(placeholder="Enter review here...", lines=5),
    # .style(color_map=...) is the Gradio 3.x API.
    outputs=[HighlightedText().style(color_map=color_map), JSON()],
    examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor. I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."],
    ],
).launch()
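# Possible extension (sketch, assuming the working directory is writable):
# cache the fitted model next to vectorizer.pkl so the LDA is not refit on
# every start-up. Left commented out because launch() above blocks until the
# server is shut down.
#
# with open("lda.pkl", "wb") as f:
#     pickle.dump(lda, f)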