import os
import pickle

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from gradio.components import HighlightedText, JSON, Textbox
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer  # class of the pickled vectorizer
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def lowercasing(lda_samples):
    """Lowercase every sample in place."""
    for idx, sample in tqdm(enumerate(lda_samples)):
        lda_samples[idx] = sample.lower()
    return lda_samples

def punctuation_removal(lda_samples):
    """Strip punctuation from the ends of each word, in place.

    The character set is non-exhaustive (it is unclear whether punctuation
    should be treated as significant), and punctuation inside words is
    deliberately left untouched.
    """
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            _sample[j] = word.strip(" .!?@#&():;,'/\\")
        lda_samples[i] = " ".join(_sample)
    return lda_samples
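
# Illustrative behavior (not executed): only leading/trailing punctuation is
# removed, so punctuation_removal(["Great shoes!"]) -> ["Great shoes"], while
# inner punctuation such as the apostrophe in "don't" survives.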

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)
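
# Illustrative example (tags come from NLTK's averaged-perceptron tagger, so
# the exact tag can vary by NLTK version and data):
#   get_wordnet_pos("running")  # typically tagged "VBG" -> wordnet.VERB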

def lemmatize(lda_samples):
    """Lemmatize every word of every sample in place, using POS-aware tags."""
    wnl = WordNetLemmatizer()
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            tag = get_wordnet_pos(word)
            _sample[j] = wnl.lemmatize(word, tag)
        lda_samples[i] = " ".join(_sample)
    return lda_samples

def predict(text):
    """Tag each sentence of a review with its dominant LDA topic and its
    VADER compound sentiment score."""
    raw_sentences = sent_tokenize(text)
    # Run the same preprocessing pipeline the topic model was trained on.
    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)
    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        # Sentiment is scored on the raw sentence (VADER uses casing and
        # punctuation); topic inference runs on the processed one.
        vs = analyzer.polarity_scores(raw)
        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()
        res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
        present_topics.add(topic)
    topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
    return [res, topics]
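
# Usage sketch (hypothetical input): `predict` reads the module-level
# `analyzer`, `lda`, `tf_vectorizer`, and `topic_words` built below, so it
# can only run once those exist.
#   highlights, topics = predict("Great grip. The sole wore out fast.")
#   highlights -> [(sentence, "Topic N (compound)"), ...]
#   topics     -> {"N": "word1, word2, ..."} for each topic that appeared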

# Build the review corpus from every newline-delimited JSON file in the
# working directory.
json_files = [pos_json for pos_json in os.listdir(".") if pos_json.endswith(".json")]
dfs = []
for f in json_files:
    dfs.append(pd.read_json(path_or_buf=f, lines=True))
df = pd.concat(dfs)

n_features = 1000
n_components = 10

nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")

# The term-frequency vectorizer was fitted offline and shipped with the app.
with open("vectorizer.pkl", "rb") as f:
    tf_vectorizer = pickle.load(f)

# Restrict the corpus to one product's reviews and preprocess them.
product_id = "B009MA34NY"
lda_samples = list(filter(lambda x: isinstance(x, str), df[df["asin"] == product_id]["reviewText"]))
lda_samples = lowercasing(lda_samples)
lda_samples = punctuation_removal(lda_samples)
lda_samples = lemmatize(lda_samples)
documents = tf_vectorizer.transform(lda_samples)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_offset=50.0,
    random_state=0,
)
lda.fit(documents)
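
# Optional (not part of the original app): cache the fitted model next to the
# vectorizer so a restart can skip retraining; the "lda.pkl" filename is an
# assumption.
# with open("lda.pkl", "wb") as f:
#     pickle.dump(lda, f)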

feature_names = tf_vectorizer.get_feature_names_out()

# Leftover smoke test of the preprocessing pipeline on a single review; its
# results are not used anywhere below.
raw_reviews = list(filter(lambda x: isinstance(x, str), df[df["asin"] == product_id]["reviewText"]))
raw_sentences = sent_tokenize(raw_reviews[337])
processed_sentences = raw_sentences[:]
processed_sentences = lowercasing(processed_sentences)
processed_sentences = punctuation_removal(processed_sentences)
processed_sentences = lemmatize(processed_sentences)

# Top-10 words per topic, used to describe topics in the JSON output.
topic_words = []
for topic in lda.components_:
    top_features_ind = topic.argsort()[:-10 - 1:-1]
    topic_words.append([feature_names[i] for i in top_features_ind])

analyzer = SentimentIntensityAnalyzer()

# HighlightedText labels embed the rounded compound score, so every possible
# label string "Topic i (score)" must be mapped to its topic's color up front.
sentiment_vals = np.linspace(-1.0, 1.0, num=201)
colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown",
          6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}
color_map = {}
for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})

gr.Interface(
    fn=predict,
    inputs=Textbox(placeholder="Enter review here...", lines=5),
    outputs=[HighlightedText().style(color_map=color_map), JSON()],
    examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor. I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."]
    ],
).launch()