Commit afdf195
Parent(s): d17c06f

Add app.py

app.py ADDED
@@ -0,0 +1,155 @@
import os

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from gradio.components import HighlightedText, JSON, Textbox
from nltk import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
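

# --- Text preprocessing helpers ---
# Each helper mutates the given list of strings in place and also returns it,
# so the steps below can be applied one after another.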
def lowercasing(lda_samples):
    for idx, sample in tqdm(enumerate(lda_samples)):
        lda_samples[idx] = sample.lower()
    return lda_samples


def punctuation_removal(lda_samples):
    # Non-exhaustive; not sure if we want to treat punctuation as significant.
    # Only strips punctuation from the ends of words, not from inside them.
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            _sample[j] = word.strip(" .!?@#&():;,'/\\")
        lda_samples[i] = " ".join(_sample)
    return lda_samples


def get_wordnet_pos(word):
    """Map a POS tag to the first character that lemmatize() accepts."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(lda_samples):
    wnl = WordNetLemmatizer()
    for i, sample in tqdm(enumerate(lda_samples)):
        _sample = sample.split()
        for j, word in enumerate(_sample):
            tag = get_wordnet_pos(word)
            _sample[j] = wnl.lemmatize(word, tag)
        lda_samples[i] = " ".join(_sample)
    return lda_samples
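

# Gradio handler: VADER scores each raw sentence for sentiment while the
# fitted LDA model (built at module level below) assigns the preprocessed
# sentence to its most likely topic; the emitted labels match the color_map
# defined further down.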
def predict(text):
    raw_sentences = sent_tokenize(text)

    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)

    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        vs = analyzer.polarity_scores(raw)
        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()

        res.append((raw, f"Topic {topic + 1} ({round(vs['compound'], 2)})"))
        present_topics.add(topic)

    topics = {str(i + 1): ", ".join(topic_words[i]) for i in sorted(present_topics)}
    return [res, topics]
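

# Load every .json file in the working directory; these are presumably
# line-delimited review dumps with 'reviewText' and 'asin' columns.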
json_files = [pos_json for pos_json in os.listdir(".") if pos_json.endswith('.json')]

dfs = []
for f in json_files:
    dfs.append(pd.read_json(path_or_buf=f, lines=True))

df = pd.concat(dfs)

n_features = 1000
n_components = 10

# Corpora for nltk.pos_tag, the WordNet lemmatizer, and sent_tokenize.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  # sent_tokenize needs the punkt sentence models
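
# Preprocess the full corpus once so the vectorizer below sees the same
# normalised text as the per-product samples. The per-word pos_tag call in
# get_wordnet_pos makes this step slow.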
preprocessing_samples = list(filter(lambda x: isinstance(x, str), df['reviewText']))
preprocessing_samples = lowercasing(preprocessing_samples)
preprocessing_samples = punctuation_removal(preprocessing_samples)
preprocessing_samples = lemmatize(preprocessing_samples)
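
# Document-term matrix over the whole corpus. max_df=0.01 drops any term that
# appears in more than 1% of documents (an aggressive cut on top of the
# built-in English stop-word list); min_df=2 drops terms that occur in only
# one document.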
tf_vectorizer = CountVectorizer(
    max_df=0.01, min_df=2, max_features=n_features, stop_words="english"
)
tf = tf_vectorizer.fit_transform(preprocessing_samples)
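
# Fit LDA on the reviews of a single hard-coded product (by ASIN), reusing
# the corpus-wide vocabulary learned by tf_vectorizer.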
product_id = 'B009MA34NY'
lda_samples = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
lda_samples = lowercasing(lda_samples)
lda_samples = punctuation_removal(lda_samples)
lda_samples = lemmatize(lda_samples)
documents = tf_vectorizer.transform(lda_samples)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_offset=50.0,
    random_state=0,
)
lda.fit(documents)

tf_feature_names = tf_vectorizer.get_feature_names_out()

# Scratch pass over one hard-coded review; nothing below reads these
# module-level results (predict() recomputes its own per request).
raw_reviews = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
raw_sentences = sent_tokenize(raw_reviews[337])

processed_sentences = raw_sentences[:]
processed_sentences = lowercasing(processed_sentences)
processed_sentences = punctuation_removal(processed_sentences)
processed_sentences = lemmatize(processed_sentences)
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = []
for topic in lda.components_:
    # Indices of the ten highest-weight terms for this topic.
    top_features_ind = topic.argsort()[:-11:-1]
    topic_words.append([feature_names[i] for i in top_features_ind])

analyzer = SentimentIntensityAnalyzer()
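
# Every label predict() can emit has the form "Topic {i} ({score})" with the
# compound score rounded to two decimals, so enumerating
# np.linspace(-1, 1, 201) pre-registers a color for each possible label.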
sentiment_vals = np.linspace(-1.0, 1.0, num=201)
color_map = {}
colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown",
          6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}
for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val, 2)})": color for val in sentiment_vals})
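
# Build the UI: HighlightedText colors each sentence by its topic label (the
# label also carries the VADER compound score), and JSON lists the top words
# for every topic that appears. Note: .style(color_map=...) is the Gradio 3.x
# API; newer releases take color_map as a HighlightedText constructor argument.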
gr.Interface(
    fn=predict,
    inputs=Textbox(placeholder="Enter review here...", lines=5),
    outputs=[HighlightedText().style(color_map=color_map), JSON()],
    examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor. I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."],
    ],
).launch(share=True)