Commit afdf195
Parent(s): d17c06f

Add app.py

app.py ADDED
@@ -0,0 +1,155 @@
import os

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from gradio.components import HighlightedText, JSON, Textbox
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
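

# --- Text preprocessing helpers ---
# Each helper mutates the given list of strings in place and also returns it,
# so the steps below can be applied one after another.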
def lowercasing(lda_samples):
    for idx, sample in tqdm(enumerate(lda_samples)):
        lda_samples[idx] = sample.lower()
    return lda_samples


def punctuation_removal(lda_samples):
    # Non-exhaustive; not sure if we want to treat punctuation as significant.
    # Only strips punctuation from the ends of words, not from inside them.
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            _sample[j] = word.strip(" .!?@#&():;,'/\\")
        lda_samples[i] = " ".join(_sample)
    return lda_samples


def get_wordnet_pos(word):
    """Map a POS tag to the first character that lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(lda_samples):
    wnl = WordNetLemmatizer()
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            tag = get_wordnet_pos(word)
            _sample[j] = wnl.lemmatize(word, tag)
        lda_samples[i] = " ".join(_sample)
    return lda_samples
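

# Gradio handler: VADER scores each raw sentence for sentiment while the
# fitted LDA model (built at module level below) assigns the preprocessed
# sentence to its most likely topic; the emitted labels match the color_map
# defined further down.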
def predict(text):
    raw_sentences = sent_tokenize(text)

    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)

    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        vs = analyzer.polarity_scores(raw)
        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()

        res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
        present_topics.add(topic)

    topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
    return [res, topics]
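

# Load every .json file in the working directory; these are presumably
# line-delimited review dumps with 'reviewText' and 'asin' columns.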
json_files = [pos_json for pos_json in os.listdir(".") if pos_json.endswith('.json')]

dfs = []
for f in json_files:
    dfs.append(pd.read_json(path_or_buf=f, lines=True))

df = pd.concat(dfs)

n_features = 1000
n_components = 10

# Corpora for nltk.pos_tag, the WordNet lemmatizer, and sent_tokenize.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  # sent_tokenize needs the punkt sentence models
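
# Preprocess the full corpus once so the vectorizer below sees the same
# normalised text as the per-product samples. The per-word pos_tag call in
# get_wordnet_pos makes this step slow.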
preprocessing_samples = list(filter(lambda x: isinstance(x, str), df['reviewText']))
preprocessing_samples = lowercasing(preprocessing_samples)
preprocessing_samples = punctuation_removal(preprocessing_samples)
preprocessing_samples = lemmatize(preprocessing_samples)
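
# Document-term matrix over the whole corpus. max_df=0.01 drops any term that
# appears in more than 1% of documents (an aggressive cut on top of the
# built-in English stop-word list); min_df=2 drops terms that occur in only
# one document.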
tf_vectorizer = CountVectorizer(
    max_df=0.01, min_df=2, max_features=n_features, stop_words="english"
)
tf = tf_vectorizer.fit_transform(preprocessing_samples)
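
# Fit LDA on the reviews of a single hard-coded product (by ASIN), reusing
# the corpus-wide vocabulary learned by tf_vectorizer.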
product_id = 'B009MA34NY'
lda_samples = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
lda_samples = lowercasing(lda_samples)
lda_samples = punctuation_removal(lda_samples)
lda_samples = lemmatize(lda_samples)
documents = tf_vectorizer.transform(lda_samples)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_offset=50.0,
    random_state=0,
)
lda.fit(documents)

tf_feature_names = tf_vectorizer.get_feature_names_out()

# Scratch pass over one hard-coded review; nothing below reads these
# module-level results (predict() recomputes its own per request).
raw_reviews = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
raw_sentences = sent_tokenize(raw_reviews[337])

processed_sentences = raw_sentences[:]
processed_sentences = lowercasing(processed_sentences)
processed_sentences = punctuation_removal(processed_sentences)
processed_sentences = lemmatize(processed_sentences)
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = []
for topic in lda.components_:
    # Indices of the ten highest-weight terms for this topic.
    top_features_ind = topic.argsort()[:-11:-1]
    topic_words.append([feature_names[i] for i in top_features_ind])

analyzer = SentimentIntensityAnalyzer()
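
# Every label predict() can emit has the form "Topic {i} ({score})" with the
# compound score rounded to two decimals, so enumerating
# np.linspace(-1, 1, 201) pre-registers a color for each possible label.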
sentiment_vals = np.linspace(-1.0, 1.0, num=201)
color_map = {}
colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown",
          6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}
for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})
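
# Build the UI: HighlightedText colors each sentence by its topic label (the
# label also carries the VADER compound score), and JSON lists the top words
# for every topic that appears. Note: .style(color_map=...) is the Gradio 3.x
# API; newer releases take color_map as a HighlightedText constructor argument.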
gr.Interface(
    fn=predict,
    inputs=Textbox(placeholder="Enter review here...", lines=5),
    outputs=[HighlightedText().style(color_map=color_map), JSON()],
    examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor. I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."],
    ],
).launch(share=True)