Spaces:
Sleeping
Sleeping
Commit
·
8492c80
1
Parent(s):
a3a661b
Delete parsing.py
Browse files- parsing.py +0 -74
parsing.py
DELETED
@@ -1,74 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import requests
|
3 |
-
import pandas as pd
|
4 |
-
from tqdm import tqdm
|
5 |
-
from bs4 import BeautifulSoup
|
6 |
-
|
7 |
-
|
8 |
-
def text(links):
    """Return the stripped text of the first element in *links*.

    Args:
        links: Iterable of objects that expose a ``text`` string attribute.

    Returns:
        The first element's ``text`` with leading and trailing whitespace
        removed, or ``None`` when *links* yields no elements.
    """
    # Only the first element is ever used, so return straight from the loop.
    for elem in links:
        return elem.text.strip()
    # Empty input: previously `result` was never bound here, which raised
    # UnboundLocalError; report "nothing found" explicitly instead.
    return None
|
14 |
-
|
15 |
-
|
16 |
-
# Crawl the biblio-globus.ru catalogue: category list -> subcategories ->
# paginated product listings -> individual product pages. One row per product
# is appended to `df`, which is periodically dumped to `data{n}.csv`.
#
# NOTE(review): the loop nesting below was reconstructed from a copy that had
# lost its indentation. The CSV snapshot is assumed to be written once per
# subcategory -- confirm against the original file.

# Seconds to wait on any single HTTP request. Without a timeout,
# `requests.get` can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Characters removed from the annotation text before it is stored.
# NOTE(review): the plain letter 'm' is in this list in the original code and
# is kept here to preserve output; it looks unintentional -- confirm.
ANNOTATION_DROPPED_CHARS = ['\n', '\r', '\t', 'm', '\xa0']

url = 'https://www.biblio-globus.ru/catalog/categories'
catalog = requests.get(url, timeout=REQUEST_TIMEOUT)
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

df = []
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']


for link in tqdm(list_categories):

    # A list item without a usable anchor cannot be followed; skip it
    # instead of failing on `None['href']`.
    category_anchor = link.find('a')
    if category_anchor is None or not category_anchor.get('href'):
        continue

    category_url = 'https://www.biblio-globus.ru' + category_anchor['href']
    category_page = requests.get(category_url, timeout=REQUEST_TIMEOUT)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    # Snapshot counter; it restarts for every category, as in the original.
    n = 1
    for sub in tqdm(list_subcategories):

        # The subcategory id is the last path segment of the link.
        subcategory_id = sub['href'].split('/')[-1]

        page = 1
        while True:

            subcategiry_url = f'https://www.biblio-globus.ru/catalog/category?id={subcategory_id}&page={page}&sort=0'
            subcategiry_page = requests.get(subcategiry_url, timeout=REQUEST_TIMEOUT)
            subcategiry_soup = BeautifulSoup(subcategiry_page.text, 'lxml')
            subcategiry_links = subcategiry_soup.find_all('div', class_='text')
            # A listing page with no product entries ends the pagination.
            if not subcategiry_links:
                break

            for product in subcategiry_links:
                # Skip matched <div class="text"> elements that hold no link.
                product_anchor = product.find('a')
                if product_anchor is None or not product_anchor.get('href'):
                    continue

                product_url = 'https://www.biblio-globus.ru' + product_anchor['href']
                product_page = requests.get(product_url, timeout=REQUEST_TIMEOUT)
                product_soup = BeautifulSoup(product_page.text, 'lxml')
                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation:
                    annotation = ''.join(
                        symbol
                        for symbol in product_annotation.text
                        if symbol not in ANNOTATION_DROPPED_CHARS
                    )
                    # Keep only the part before the 'Характеристики' heading.
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # Product metadata comes from the page's JSON-LD block. Any
                # page whose block is absent, unparsable, or lacks one of the
                # expected fields is skipped, so a single odd product cannot
                # abort the whole crawl.
                try:
                    product_json = product_soup.find('script', type='application/ld+json')
                    dict_json = json.loads(product_json.text)
                    author = dict_json['author']['name']
                    title = dict_json['name']
                    image = dict_json['image']
                    genre = dict_json['genre']
                except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
                    continue

                df.append([product_url, image, author, title, annotation, genre])
            page += 1

        # `df` is never cleared, so each snapshot holds every row collected
        # so far, not only the rows of the subcategory just finished.
        data = pd.DataFrame(df, columns=columns)
        data.to_csv(f'data{n}.csv', index=False)
        n += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|