# booktoread / parsing .py
# Vladislawoo's picture
# Rename parsing (1).py to parsing .py
# e3f4a13
# raw
# history blame
# 2.76 kB
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
def text(links):
    """Return the whitespace-stripped text of the first element in *links*.

    Args:
        links: Iterable of objects exposing a ``text`` string attribute
            (for example a BeautifulSoup result set).

    Returns:
        The stripped ``text`` of the first element, or ``None`` when
        *links* yields nothing.
    """
    # Only the first element is ever used, so return straight from the loop.
    for elem in links:
        return elem.text.strip()
    # An empty iterable has no first element; report that explicitly instead
    # of failing on an unbound local variable.
    return None
# Crawl the biblio-globus.ru catalogue (categories -> subcategories ->
# paginated listings -> product pages) and collect one row per book.
BASE_URL = 'https://www.biblio-globus.ru'
# Seconds to wait for a response. requests never times out on its own, so a
# single stalled connection would otherwise hang the whole crawl.
REQUEST_TIMEOUT = 30
# Characters removed from the annotation text before it is stored.
# NOTE(review): this also drops every Latin 'm' from the annotation —
# confirm that is intended and not a typo.
ANNOTATION_JUNK = ['\n', '\r', '\t', 'm', '\xa0']


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with lxml, or None on a network error."""
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable page should not abort a crawl of the whole
        # catalogue; report it and let the caller skip this page.
        tqdm.write(f'Skipping {page_url}: {error}')
        return None
    return BeautifulSoup(response.text, 'lxml')


# The root catalogue page is required: if it cannot be fetched there is
# nothing to crawl, so any error here is allowed to propagate.
url = 'https://www.biblio-globus.ru/catalog/categories'
catalog = requests.get(url, timeout=REQUEST_TIMEOUT)
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

df = []  # accumulated rows, each ordered like `columns`
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']
n = 1  # sequence number of the next checkpoint CSV

for link in tqdm(list_categories):
    category_anchor = link.find('a')
    if category_anchor is None or not category_anchor.get('href'):
        # A list item without a usable link cannot be followed.
        continue
    category_soup = _fetch_soup(BASE_URL + category_anchor['href'])
    if category_soup is None:
        continue
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    for sub in tqdm(list_subcategories):
        subcategory_href = sub.get('href')
        if not subcategory_href:
            continue
        # The subcategory id is the last path segment of its link.
        subcategory_id = subcategory_href.split('/')[-1]

        page = 1
        while True:
            subcategory_url = f'{BASE_URL}/catalog/category?id={subcategory_id}&page={page}&sort=0'
            subcategory_soup = _fetch_soup(subcategory_url)
            if subcategory_soup is None:
                break
            subcategory_links = subcategory_soup.find_all('div', class_='text')
            if not subcategory_links:
                # A listing page with no product entries ends the pagination.
                break

            for product in subcategory_links:
                product_anchor = product.find('a')
                if product_anchor is None or not product_anchor.get('href'):
                    continue
                product_url = BASE_URL + product_anchor['href']
                product_soup = _fetch_soup(product_url)
                if product_soup is None:
                    continue

                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation is not None:
                    annotation = ''.join(
                        symbol
                        for symbol in product_annotation.text
                        if symbol not in ANNOTATION_JUNK
                    )
                    # Keep only the part before the 'Характеристики' heading.
                    annotation = annotation.split('Характеристики', 1)[0].strip()
                else:
                    annotation = None

                # The remaining fields are read from the page's JSON-LD block.
                try:
                    product_json = product_soup.find('script', type='application/ld+json')
                    dict_json = json.loads(product_json.text)
                except (AttributeError, json.JSONDecodeError):
                    # No JSON-LD block, or one that does not parse: skip it.
                    continue
                if not isinstance(dict_json, dict):
                    continue

                # Absent fields are stored as None rather than raising, so a
                # product lacking e.g. an author does not stop the crawl.
                author_info = dict_json.get('author')
                author = author_info.get('name') if isinstance(author_info, dict) else None
                title = dict_json.get('name')
                image = dict_json.get('image')
                genre = dict_json.get('genre')
                df.append([product_url, image, author, title, annotation, genre])

            page += 1

    # Checkpoint: write everything collected so far to a numbered CSV.
    # NOTE(review): assumed to run once per top-level category — confirm
    # against the intended checkpoint frequency.
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1