import os
import pickle

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from gradio.components import JSON, HighlightedText, Textbox
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def lowercasing(lda_samples):
    # Lower-case every sample, in place.
    for idx, sample in tqdm(enumerate(lda_samples)):
        lda_samples[idx] = sample.lower()
    return lda_samples


def punctuation_removal(lda_samples):
    # Strip leading/trailing punctuation from each word, in place.
    # Non-exhaustive; unclear whether punctuation should be treated as
    # significant. Punctuation inside words is deliberately kept.
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            _sample[j] = word.strip(" .!?@#&():;,'/\\")
        lda_samples[i] = " ".join(_sample)
    return lda_samples
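
# Quick illustration: punctuation_removal(["Great, shoes!"]) -> ["Great shoes"]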


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
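
# Illustration (assuming NLTK's default perceptron tagger): "running" in
# isolation is tagged VBG, so get_wordnet_pos("running") -> wordnet.VERB.
# Note each word is tagged without sentence context, so tags can be rough.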


def lemmatize(lda_samples):
    # POS-aware WordNet lemmatization of each word, in place.
    wnl = WordNetLemmatizer()
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            _sample[j] = wnl.lemmatize(word, get_wordnet_pos(word))
        lda_samples[i] = " ".join(_sample)
    return lda_samples
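
# e.g. lemmatize(["the shoes are running"]) should give roughly
# ["the shoe be run"] (exact output depends on the per-word POS tags).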


def predict(text):
    # Split the review into sentences, preprocess them the same way as the
    # training corpus, then label each sentence with its most likely LDA
    # topic and its VADER compound sentiment score.
    raw_sentences = sent_tokenize(text)

    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)

    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        vs = analyzer.polarity_scores(raw)  # sentiment on the raw sentence
        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()

        res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
        present_topics.add(topic)

    topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
    return [res, topics]
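
# Illustration of the return shape with hypothetical scores and topics:
#   predict("Great grip. The sole wore out fast.")
#   -> [[("Great grip.", "Topic 3 (0.62)"),
#        ("The sole wore out fast.", "Topic 7 (-0.34)")],
#       {"3": "grip, sole, ...", "7": "wear, out, ..."}]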


# Load every line-delimited JSON review file in the working directory.
json_files = [name for name in os.listdir(".") if name.endswith(".json")]
df = pd.concat(pd.read_json(path_or_buf=f, lines=True) for f in json_files)
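
# Assumption: these are Amazon-style review dumps, so df has at least the
# 'asin' (product id) and 'reviewText' columns used below.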


n_features = 1000  # vocabulary size (presumably used when vectorizer.pkl was fitted)
n_components = 10  # number of LDA topics

# Resources needed by sent_tokenize, pos_tag and WordNetLemmatizer.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

with open("vectorizer.pkl", "rb") as f:
    tf_vectorizer = pickle.load(f)
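
# vectorizer.pkl is assumed to hold a CountVectorizer fitted on the same
# preprocessed review corpus. A hypothetical sketch of how it might have
# been produced (not part of this app):
#
#     tf_vectorizer = CountVectorizer(max_features=n_features, stop_words="english")
#     tf_vectorizer.fit(lda_samples)
#     with open("vectorizer.pkl", "wb") as f:
#         pickle.dump(tf_vectorizer, f)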

# Restrict the corpus to one product's reviews and drop non-string entries
# (e.g. NaN for empty reviews).
product_id = 'B009MA34NY'
lda_samples = [r for r in df[df['asin'] == product_id]['reviewText'] if isinstance(r, str)]
lda_samples = lowercasing(lda_samples)
lda_samples = punctuation_removal(lda_samples)
lda_samples = lemmatize(lda_samples)
documents = tf_vectorizer.transform(lda_samples)


lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",  # learning_offset below only applies to online updates
    learning_offset=50.0,
    random_state=0,
)
lda.fit(documents)

# Top 10 words per topic, used to describe topics in the JSON output.
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = []
for topic in lda.components_:
    top_features_ind = topic.argsort()[:-11:-1]  # indices of the 10 largest weights
    topic_words.append([feature_names[i] for i in top_features_ind])
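
# topic_words is a list of n_components word lists, e.g. (hypothetical):
#   topic_words[0] == ["shoe", "fit", "size", ...]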

analyzer = SentimentIntensityAnalyzer()


# HighlightedText matches labels to colors by exact string, so precompute an
# entry for every label predict() can emit: each topic paired with every
# possible two-decimal compound score in [-1.0, 1.0].
sentiment_vals = np.linspace(-1.0, 1.0, num=201)
colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown",
          6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}
color_map = {}
for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})

gr.Interface(
    fn=predict,
    inputs=Textbox(placeholder="Enter review here...", lines=5),
    outputs=[HighlightedText().style(color_map=color_map), JSON()],
    examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor.  I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."]
    ],
).launch(share=True)