import requests
from bs4 import BeautifulSoup
import pandas as pd

custom_headers = {
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}


def get_soup(url):
    """Fetch a page and return its parsed HTML, exiting on a non-200 response."""
    response = requests.get(url, headers=custom_headers)
    if response.status_code != 200:
        print("Error in getting webpage")
        print(f"Error: {response.status_code} - {response.reason}")
        exit(-1)
    soup = BeautifulSoup(response.text, "lxml")
    return soup


def get_reviews(soup):
    """Extract the review fields from a single review-listing page."""
    review_elements = soup.select("div.review")
    scraped_reviews = []

    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None

        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = r_title_span_element.text if r_title_span_element else None

        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span.a-size-mini")
        r_verified = r_verified_element.text if r_verified_element else None

        r_image_element = review.select_one("img.review-image-tile")
        r_image = r_image_element.attrs["src"] if r_image_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified,
            "image_url": r_image,
        }

        scraped_reviews.append(r)

    return scraped_reviews


def scrape_all_pages(url):
    """Walk through the paginated review listing until a page returns no reviews."""
    all_reviews = []
    page_number = 1

    while True:
        soup = get_soup(f"{url}&pageNumber={page_number}")
        reviews = get_reviews(soup)
        if not reviews:
            # Break the loop if no reviews are found on this page
            break
        all_reviews.extend(reviews)
        page_number += 1

    return all_reviews


# Example usage:
# url = "https://www.amazon.in/OnePlus-Nord-Pastel-128GB-Storage/product-reviews/B0BY8JZ22K/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
# all_reviews = scrape_all_pages(url)
#
# # Convert to DataFrame for further analysis
# df = pd.DataFrame(all_reviews)
# df
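
# Optional follow-up sketch (not part of the original script): once the reviews are
# collected in a DataFrame, they can be saved for later analysis. The file name
# "amazon_reviews.csv" is only an illustrative choice.
# df.to_csv("amazon_reviews.csv", index=False)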