ashok2216 committed on
Commit 7901fc5 · verified · 1 Parent(s): 2b53a6a

Upload 3 files

Files changed (3)
  1. app.py +25 -0
  2. data_cleaning.py +35 -0
  3. data_integration.py +88 -0
app.py ADDED
@@ -0,0 +1,25 @@
+ import streamlit as st
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from transformers import pipeline
+
+ # "sentiment-analysis" is a pipeline task name, so it is passed positionally, not as model=
+ sentiment_model = pipeline("sentiment-analysis")
+
+ st.write('Hi')
+
+ # df is assumed to hold the scraped reviews (data_integration.py) already
+ # cleaned into a 'clean_text' column (data_cleaning.py); see the sketch below
+ sentiments = []
+ for text in df['clean_text']:
+     # the default sentiment-analysis model labels outputs 'POSITIVE' / 'NEGATIVE'
+     if sentiment_model(text)[0]['label'] == 'POSITIVE':
+         output = 'Positive'
+     else:
+         output = 'Negative'
+     sentiments.append(output)
+
+ df['sentiments'] = sentiments
+ fig, ax = plt.subplots()
+ sns.countplot(x=df['sentiments'], ax=ax)
+ st.pyplot(fig)  # seaborn draws on a Matplotlib figure; Streamlit needs st.pyplot to show it
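
Note that app.py reads a DataFrame df with a 'clean_text' column that this commit never builds. A minimal sketch of the missing glue, assuming the two modules below are importable from the same directory and that the URL placeholder is replaced with a real product-reviews page:

import pandas as pd
from data_integration import scrape_all_pages
from data_cleaning import preprocess

url = "https://www.amazon.in/.../product-reviews/...?reviewerType=all_reviews"  # placeholder, not a real URL
df = pd.DataFrame(scrape_all_pages(url))
# reviews without a body come back as None, so fill before cleaning
df['clean_text'] = df['content'].fillna('').apply(preprocess)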
data_cleaning.py ADDED
@@ -0,0 +1,35 @@
+ import re
+
+ import nltk
+ nltk.download('stopwords')
+ nltk.download('punkt')
+ from nltk.corpus import stopwords
+ from nltk import word_tokenize
+ from nltk.stem.snowball import SnowballStemmer
+
+
+ def normalize(text):
+     return text.lower()
+
+ def remove_stopwords(text):
+     # keep only alphanumeric tokens that are not English stopwords
+     list_stopwords = stopwords.words("english")
+     final_text = ' '.join(a for a in word_tokenize(text) if a not in list_stopwords and a.isalnum())
+     return final_text
+
+ def removenumbers(text):
+     re_num = r"\d+"  # raw string, so '\d' is not an invalid escape sequence
+     text = re.sub(re_num, "", text)
+     return text
+
+ def stem_text(text):
+     stemmer = SnowballStemmer("english")
+     t = ' '.join(stemmer.stem(a) for a in word_tokenize(text))
+     return t
+
+ def preprocess(text):
+     text = normalize(text)
+     text = remove_stopwords(text)
+     text = removenumbers(text)
+     text = stem_text(text)
+     return text
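
A quick sanity check of the whole chain (lowercase → drop stopwords and punctuation → strip digits → stem); the exact output may vary slightly across NLTK versions:

from data_cleaning import preprocess

print(preprocess("This product has 2 GREAT cameras!"))
# roughly: 'product great camera'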
data_integration.py ADDED
@@ -0,0 +1,88 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+
+ custom_headers = {
+     "Accept-Language": "en-GB,en;q=0.9",
+     "Accept-Encoding": "gzip, deflate, br",
+     "Cache-Control": "max-age=0",
+     "Connection": "keep-alive",
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+ }
+
+ def get_soup(url):
+     response = requests.get(url, headers=custom_headers)
+
+     if response.status_code != 200:
+         print("Error in getting webpage")
+         print(f"Error: {response.status_code} - {response.reason}")
+         raise SystemExit(1)  # abort cleanly instead of relying on the site-module exit() builtin
+
+     soup = BeautifulSoup(response.text, "lxml")
+     return soup
+
+ def get_reviews(soup):
+     review_elements = soup.select("div.review")
+
+     scraped_reviews = []
+
+     for review in review_elements:
+         r_author_element = review.select_one("span.a-profile-name")
+         r_author = r_author_element.text if r_author_element else None
+
+         r_rating_element = review.select_one("i.review-rating")
+         r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None
+
+         r_title_element = review.select_one("a.review-title")
+         r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
+         r_title = r_title_span_element.text if r_title_span_element else None
+
+         r_content_element = review.select_one("span.review-text")
+         r_content = r_content_element.text if r_content_element else None
+
+         r_date_element = review.select_one("span.review-date")
+         r_date = r_date_element.text if r_date_element else None
+
+         r_verified_element = review.select_one("span.a-size-mini")
+         r_verified = r_verified_element.text if r_verified_element else None
+
+         r_image_element = review.select_one("img.review-image-tile")
+         r_image = r_image_element.attrs["src"] if r_image_element else None
+
+         r = {
+             "author": r_author,
+             "rating": r_rating,
+             "title": r_title,
+             "content": r_content,
+             "date": r_date,
+             "verified": r_verified,
+             "image_url": r_image,
+         }
+
+         scraped_reviews.append(r)
+
+     return scraped_reviews
+
+ def scrape_all_pages(url):
+     all_reviews = []
+
+     page_number = 1
+     while True:
+         soup = get_soup(f"{url}&pageNumber={page_number}")
+         reviews = get_reviews(soup)
+
+         if not reviews:  # break the loop if no reviews are found on this page
+             break
+
+         all_reviews.extend(reviews)
+         page_number += 1
+
+     return all_reviews
+
+ # Example usage:
+ # url = "https://www.amazon.in/OnePlus-Nord-Pastel-128GB-Storage/product-reviews/B0BY8JZ22K/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
+ # all_reviews = scrape_all_pages(url)
+
+ # Convert to DataFrame for further analysis
+ # df = pd.DataFrame(all_reviews)
+ # df
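
One detail worth noting for the analysis step: get_reviews leaves rating as text like "4.0 " once "out of 5 stars" is stripped, so a numeric conversion along these lines (a sketch, reusing the commented-out df above) is usually needed:

df['rating'] = pd.to_numeric(df['rating'].str.strip(), errors='coerce')
print(df['rating'].describe())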