defyingentropy committed
Commit afdf195 · 1 Parent(s): d17c06f

Add app.py

Files changed (1):
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
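+ # Gradio demo: splits a product review into sentences, then labels each
+ # sentence with its most likely LDA topic and its VADER sentiment score.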
+ import os
+ 
+ import gradio as gr
+ import nltk
+ import numpy as np
+ import pandas as pd
+ from gradio.components import Textbox, HighlightedText, JSON
+ from nltk import sent_tokenize
+ from nltk.corpus import wordnet
+ from nltk.stem import WordNetLemmatizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from sklearn.feature_extraction.text import CountVectorizer
+ from tqdm import tqdm
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ 
+ 
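+ # Text-normalization helpers. Each mutates the list it receives in place
+ # and returns it.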
+ def lowercasing(lda_samples):
+     for idx, sample in tqdm(enumerate(lda_samples)):
+         lda_samples[idx] = sample.lower()
+     return lda_samples
+ 
+ 
+ def punctuation_removal(lda_samples):
+     # Non-exhaustive; punctuation may or may not be worth treating as significant.
+     # Only strips punctuation from the ends of words, not from inside them.
+     for i, sample in tqdm(enumerate(lda_samples)):
+         _sample = sample.split()
+         for j, word in enumerate(_sample):
+             _sample[j] = word.strip(" .!?@#&():;,'/\\")
+         sample = " ".join(_sample)
+         lda_samples[i] = sample
+     return lda_samples
+ 
+ 
+ def get_wordnet_pos(word):
+     """Map a POS tag to the first character lemmatize() accepts."""
+     tag = nltk.pos_tag([word])[0][1][0].upper()
+     tag_dict = {
+         "J": wordnet.ADJ,
+         "N": wordnet.NOUN,
+         "V": wordnet.VERB,
+         "R": wordnet.ADV,
+     }
+     return tag_dict.get(tag, wordnet.NOUN)
+ 
+ 
+ def lemmatize(lda_samples):
+     wnl = WordNetLemmatizer()
+     for i, sample in tqdm(enumerate(lda_samples)):
+         _sample = sample.split()
+         for j, word in enumerate(_sample):
+             tag = get_wordnet_pos(word)
+             _sample[j] = wnl.lemmatize(word, tag)
+         lda_samples[i] = " ".join(_sample)
+     return lda_samples
+ 
+ 
+ def predict(text):
+     raw_sentences = sent_tokenize(text)
+ 
+     processed_sentences = raw_sentences[:]
+     processed_sentences = lowercasing(processed_sentences)
+     processed_sentences = punctuation_removal(processed_sentences)
+     processed_sentences = lemmatize(processed_sentences)
+ 
+     res = []
+     present_topics = set()
+     for raw, processed in zip(raw_sentences, processed_sentences):
+         vs = analyzer.polarity_scores(raw)
+         probs = lda.transform(tf_vectorizer.transform([processed]))[0]
+         topic = probs.argmax()
+ 
+         res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
+         present_topics.add(topic)
+ 
+     topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
+     return [res, topics]
+ 
+ 
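+ # Load every JSON-lines file in the working directory into one DataFrame.
+ # The files are expected to be Amazon-style review dumps with at least
+ # 'reviewText' and 'asin' columns.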
+ json_files = [pos_json for pos_json in os.listdir(".") if pos_json.endswith('.json')]
+ 
+ dfs = []
+ for f in json_files:
+     dfs.append(pd.read_json(path_or_buf=f, lines=True))
+ 
+ df = pd.concat(dfs)
+ 
+ 
+ n_features = 1000
+ n_components = 10
+ 
+ # 'punkt' is needed by sent_tokenize.
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ 
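+ # Fit the vectorizer vocabulary on the full review corpus. max_df=0.01
+ # drops any term that appears in more than 1% of reviews, so only fairly
+ # distinctive words survive.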
+ preprocessing_samples = list(filter(lambda x: isinstance(x, str), df['reviewText']))
+ preprocessing_samples = lowercasing(preprocessing_samples)
+ preprocessing_samples = punctuation_removal(preprocessing_samples)
+ preprocessing_samples = lemmatize(preprocessing_samples)
+ 
+ tf_vectorizer = CountVectorizer(
+     max_df=0.01, min_df=2, max_features=n_features, stop_words="english"
+ )
+ tf = tf_vectorizer.fit_transform(preprocessing_samples)
+ 
+ 
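+ # The topic model itself is fit only on the reviews of a single product.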
+ product_id = 'B009MA34NY'
+ lda_samples = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
+ lda_samples = lowercasing(lda_samples)
+ lda_samples = punctuation_removal(lda_samples)
+ lda_samples = lemmatize(lda_samples)
+ documents = tf_vectorizer.transform(lda_samples)
+ 
+ 
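+ # max_iter=5 keeps start-up quick; raising it would likely give the model
+ # more time to converge at the cost of a slower launch.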
+ lda = LatentDirichletAllocation(
+     n_components=n_components,
+     max_iter=5,
+     learning_offset=50.0,
+     random_state=0,
+ )
+ lda.fit(documents)
+ 
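+ # Collect the ten highest-weighted words of each topic; these label the
+ # topics in the JSON output.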
+ feature_names = tf_vectorizer.get_feature_names_out()
+ topic_words = []
+ for topic in lda.components_:
+     top_features_ind = topic.argsort()[: -10 - 1 : -1]
+     topic_words.append([feature_names[i] for i in top_features_ind])
+ 
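+ # VADER is applied to the raw sentence text, since it relies on casing and
+ # punctuation cues that the LDA preprocessing strips out.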
+ analyzer = SentimentIntensityAnalyzer()
+ 
+ 
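+ # HighlightedText needs a fixed label -> color mapping, so pre-generate one
+ # key per (topic, rounded compound score) pair; rounding to two decimals
+ # keeps these keys in sync with the labels built in predict().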
+ sentiment_vals = np.linspace(-1.0, 1.0, num=201)
+ color_map = {}
+ colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown", 6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}
+ for i, color in colors.items():
+     color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})
+ 
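+ # Wire everything into the UI: highlighted sentences plus a JSON legend of
+ # each topic's top words.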
+ gr.Interface(
+     fn=predict,
+     inputs=Textbox(placeholder="Enter review here...", lines=5),
+     outputs=[HighlightedText().style(color_map=color_map), JSON()],
+     examples=[
+         ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor. I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
+         ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
+         ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."],
+     ],
+ ).launch(share=True)