# Amazon product-review scraper: downloads review pages with `requests`,
# parses them with BeautifulSoup, and collects each review as a dict
# (convertible to a pandas DataFrame — see example usage at the bottom).
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Request headers that imitate a desktop Chrome browser so the target
# site serves the regular HTML page instead of a bot-block response.
custom_headers = {
    "Accept-language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}
def get_soup(url, timeout=30):
    """Download *url* and parse the response into a BeautifulSoup tree.

    Args:
        url: Fully-qualified URL of the page to fetch.
        timeout: Seconds to wait for the server; without one,
            ``requests.get`` can hang indefinitely. Defaults to 30.

    Returns:
        A ``BeautifulSoup`` document built from the response HTML
        using the ``lxml`` parser.

    Exits:
        Terminates the process (status -1, matching the original
        behavior) when the server replies with a non-200 status.
        ``sys.exit`` is used instead of the ``exit`` builtin, which is
        installed by the ``site`` module and not guaranteed to exist
        (e.g. under ``python -S``).
    """
    import sys  # local import so the module's top-level imports stay untouched

    response = requests.get(url, headers=custom_headers, timeout=timeout)
    if response.status_code != 200:
        print("Error in getting webpage")
        print(f"Error: {response.status_code} - {response.reason}")
        sys.exit(-1)
    soup = BeautifulSoup(response.text, "lxml")
    return soup
def get_reviews(soup):
    """Extract every review card from a parsed Amazon review page.

    Args:
        soup: BeautifulSoup document of a review-listing page, as
            returned by ``get_soup``.

    Returns:
        A list of dicts with keys ``author``, ``rating``, ``title``,
        ``content``, ``date``, ``verified`` and ``image_url``. A field
        is ``None`` when the corresponding element is absent from the
        review card. Text fields are whitespace-stripped.
    """

    def _text(element):
        # Stripped text of an element, or None when the element is missing.
        return element.text.strip() if element else None

    scraped_reviews = []
    for review in soup.select("div.review"):
        rating = _text(review.select_one("i.review-rating"))
        if rating is not None:
            # "4.0 out of 5 stars" -> "4.0". The original replace() left a
            # trailing space behind; strip it off.
            rating = rating.replace("out of 5 stars", "").strip()

        title_element = review.select_one("a.review-title")
        # The visible title text lives in the class-less <span> inside the link.
        title_span = title_element.select_one("span:not([class])") if title_element else None

        image_element = review.select_one("img.review-image-tile")

        scraped_reviews.append({
            "author": _text(review.select_one("span.a-profile-name")),
            "rating": rating,
            "title": _text(title_span),
            "content": _text(review.select_one("span.review-text")),
            "date": _text(review.select_one("span.review-date")),
            "verified": _text(review.select_one("span.a-size-mini")),
            "image_url": image_element.attrs["src"] if image_element else None,
        })
    return scraped_reviews
def scrape_all_pages(url, max_pages=None):
    """Walk the paginated review listing and collect every review.

    Args:
        url: Review-listing URL. ``&pageNumber=N`` is appended, so the
            URL must already contain a query string (as Amazon review
            URLs do).
        max_pages: Optional hard cap on the number of pages fetched.
            ``None`` (the default) keeps the original walk-until-empty
            behavior; setting it guards against unbounded scraping.

    Returns:
        A flat list of all review dicts produced by ``get_reviews``.
    """
    all_reviews = []
    page_number = 1
    while max_pages is None or page_number <= max_pages:
        soup = get_soup(f"{url}&pageNumber={page_number}")
        reviews = get_reviews(soup)
        if not reviews:
            # An empty page means we've walked past the last review page.
            break
        all_reviews.extend(reviews)
        page_number += 1
    return all_reviews
# # Example usage:
# url = "https://www.amazon.in/OnePlus-Nord-Pastel-128GB-Storage/product-reviews/B0BY8JZ22K/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
# all_reviews = scrape_all_pages(url)
# # Convert to DataFrame for further analysis
# df = pd.DataFrame(all_reviews)
# df