Vladislawoo committed on
Commit
8492c80
·
1 Parent(s): a3a661b

Delete parsing.py

Browse files
Files changed (1) hide show
  1. parsing.py +0 -74
parsing.py DELETED
@@ -1,74 +0,0 @@
1
- import json
2
- import requests
3
- import pandas as pd
4
- from tqdm import tqdm
5
- from bs4 import BeautifulSoup
6
-
7
-
8
def text(links):
    """Return the stripped text of the first element of *links*.

    Args:
        links: Iterable of objects exposing a string ``text`` attribute.

    Returns:
        The first element's ``text`` with leading and trailing whitespace
        removed, or ``None`` when *links* yields no elements.
    """
    for elem in links:
        # Only the first element is of interest; return straight away.
        return elem.text.strip()

    # Empty input: previously this path raised UnboundLocalError because
    # ``result`` was never assigned.
    return None
14
-
15
-
16
# Crawl the biblio-globus.ru catalogue: for every category listed on the
# catalogue page, walk its subcategories page by page, open each product page
# and collect one row of metadata per product. After each subcategory the rows
# gathered so far are written to ``data{n}.csv``.
#
# NOTE(review): the loop nesting below was reconstructed from a view of this
# script in which indentation was not preserved -- confirm that the CSV dump
# belongs at the end of each subcategory iteration.

# Seconds to wait for a server response. Without an explicit timeout
# ``requests`` waits indefinitely, so a single stalled connection would hang
# the entire crawl.
REQUEST_TIMEOUT = 30

# Characters removed from the annotation text.
# NOTE(review): 'm' drops every Latin "m" from annotations, which looks
# unintended; kept as-is pending confirmation.
ANNOTATION_NOISE = ('\n', '\r', '\t', 'm', '\xa0')

url = 'https://www.biblio-globus.ru/catalog/categories'
catalog = requests.get(url, timeout=REQUEST_TIMEOUT)
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

# Accumulated rows; never cleared, so every CSV written below contains all
# products collected up to that point.
df = []
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']


for link in tqdm(list_categories):

    category_url = 'https://www.biblio-globus.ru' + link.find('a')['href']
    category_page = requests.get(category_url, timeout=REQUEST_TIMEOUT)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    # NOTE(review): the file counter restarts for every category, so files
    # written for an earlier category are overwritten by later ones.
    n = 1
    for sub in tqdm(list_subcategories):

        # The subcategory id is the last path segment of the link.
        subcategory_id = sub['href'].split('/')[-1]

        page = 1
        while True:

            subcategory_url = f'https://www.biblio-globus.ru/catalog/category?id={subcategory_id}&page={page}&sort=0'
            subcategory_page = requests.get(subcategory_url, timeout=REQUEST_TIMEOUT)
            subcategory_soup = BeautifulSoup(subcategory_page.text, 'lxml')
            subcategory_links = subcategory_soup.find_all('div', class_='text')
            # A listing page without product entries ends the pagination.
            if not subcategory_links:
                break

            for product in subcategory_links:
                product_url = 'https://www.biblio-globus.ru' + product.find('a')['href']
                product_page = requests.get(product_url, timeout=REQUEST_TIMEOUT)
                product_soup = BeautifulSoup(product_page.text, 'lxml')
                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation:
                    annotation = ''.join(
                        symbol
                        for symbol in product_annotation.text
                        if symbol not in ANNOTATION_NOISE
                    )
                    # Keep only the text preceding the 'Характеристики' heading.
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # Products without a parsable JSON-LD block are skipped.
                try:
                    product_json = product_soup.find('script', type='application/ld+json')
                    dict_json = json.loads(product_json.text)
                except (AttributeError, json.JSONDecodeError):
                    continue

                # Likewise skip products whose JSON-LD lacks an expected field
                # or has a different shape, instead of aborting the whole
                # crawl on a single malformed page.
                try:
                    author = dict_json['author']['name']
                    title = dict_json['name']
                    image = dict_json['image']
                    genre = dict_json['genre']
                except (KeyError, TypeError):
                    continue

                df.append([product_url, image, author, title, annotation, genre])
            page += 1

        data = pd.DataFrame(df, columns=columns)
        data.to_csv(f'data{n}.csv', index=False)
        n += 1