# amazon-sentiment-analysis / data_integration.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Request headers that mimic a regular desktop browser session.
custom_headers = {
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}
def get_soup(url):
    # Fetch the page and return it parsed as a BeautifulSoup document.
    response = requests.get(url, headers=custom_headers)
    if response.status_code != 200:
        # Abort on a non-200 response.
        print("Error in getting webpage")
        print(f"Error: {response.status_code} - {response.reason}")
        exit(-1)
    soup = BeautifulSoup(response.text, "lxml")
    return soup
def get_reviews(soup):
    # Extract the fields of every review block on the page into a list of dicts.
    review_elements = soup.select("div.review")
    scraped_reviews = []
    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None

        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = r_title_span_element.text if r_title_span_element else None

        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span.a-size-mini")
        r_verified = r_verified_element.text if r_verified_element else None

        r_image_element = review.select_one("img.review-image-tile")
        r_image = r_image_element.attrs["src"] if r_image_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified,
            "image_url": r_image,
        }
        scraped_reviews.append(r)
    return scraped_reviews
def scrape_all_pages(url):
    # Walk through the paginated review listing, collecting reviews from each page.
    all_reviews = []
    page_number = 1
    while True:
        soup = get_soup(f"{url}&pageNumber={page_number}")
        reviews = get_reviews(soup)
        if not reviews:  # Stop once a page yields no reviews
            break
        all_reviews.extend(reviews)
        page_number += 1
    return all_reviews
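
# A minimal sketch (not part of the original script) of how the same pagination
# loop could be paced with a short pause between pages to reduce the chance of
# being throttled. The function name scrape_all_pages_politely and the default
# 2-second delay are assumptions for illustration only.
import time

def scrape_all_pages_politely(url, delay_seconds=2):
    all_reviews = []
    page_number = 1
    while True:
        soup = get_soup(f"{url}&pageNumber={page_number}")
        reviews = get_reviews(soup)
        if not reviews:
            break
        all_reviews.extend(reviews)
        page_number += 1
        time.sleep(delay_seconds)  # assumed pause between page requests
    return all_reviews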
# # Example usage:
# url = "https://www.amazon.in/OnePlus-Nord-Pastel-128GB-Storage/product-reviews/B0BY8JZ22K/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
# all_reviews = scrape_all_pages(url)
# # Convert to DataFrame for further analysis
# df = pd.DataFrame(all_reviews)
# df
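# # A hedged follow-up sketch: once the reviews sit in the DataFrame above, the
# # raw text fields can be tidied and the result saved for downstream sentiment
# # analysis. The column names match the dicts built in get_reviews; the output
# # filename "amazon_reviews.csv" is an assumption for illustration.
# for col in ["author", "rating", "title", "content", "date", "verified"]:
#     df[col] = df[col].str.strip()                                # drop surrounding whitespace
# df["rating"] = pd.to_numeric(df["rating"], errors="coerce")      # e.g. "4.0 " -> 4.0
# df.to_csv("amazon_reviews.csv", index=False)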