Spaces:
Sleeping
Sleeping
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def text(links):
    """Return the stripped text of the first element in *links*.

    Args:
        links: Iterable of objects that expose a ``.text`` string attribute.

    Returns:
        The first element's ``.text`` with leading and trailing whitespace
        removed, or ``None`` when *links* yields no elements.
    """
    # Returning from inside the loop means an empty iterable falls through to
    # the explicit ``None`` below instead of hitting an unbound local name.
    for elem in links:
        return elem.text.strip()
    return None
# Crawl the biblio-globus.ru catalogue: visit every category listed on the
# catalogue page, every subcategory of each category and every result page of
# each subcategory, collect one row per product, and write the rows gathered
# so far to a numbered CSV file.
#
# NOTE(review): the original indentation of this script was lost, so the
# nesting below is reconstructed. In particular the CSV checkpoint is assumed
# to run once per top-level category -- confirm against the author's intent.

BASE_URL = 'https://www.biblio-globus.ru'

# Seconds to wait for each HTTP request. Without a timeout a single stalled
# connection blocks the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

# Characters deleted from the annotation text before it is stored.
# NOTE(review): 'm' removes every Latin "m" from the annotation. It is kept
# exactly as in the original, but it looks unintentional -- confirm.
ANNOTATION_STRIP_TABLE = str.maketrans('', '', '\n\r\tm\xa0')

url = 'https://www.biblio-globus.ru/catalog/categories'
catalog = requests.get(url, timeout=REQUEST_TIMEOUT)
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

df = []  # accumulated product rows; never cleared, so every CSV is cumulative
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']
n = 1  # suffix of the next CSV checkpoint file

for link in tqdm(list_categories):
    category_anchor = link.find('a')
    if category_anchor is None or not category_anchor.get('href'):
        # A list item without a link has nothing to crawl.
        continue

    # urljoin copes with both root-relative and absolute hrefs.
    category_url = urljoin(BASE_URL, category_anchor['href'])
    category_page = requests.get(category_url, timeout=REQUEST_TIMEOUT)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    for sub in tqdm(list_subcategories):
        subcategory_href = sub.get('href')
        if not subcategory_href:
            continue
        # The subcategory id is the last path segment of the link.
        subcategory_id = subcategory_href.split('/')[-1]

        page = 1
        while True:
            subcategory_url = (
                f'https://www.biblio-globus.ru/catalog/category'
                f'?id={subcategory_id}&page={page}&sort=0'
            )
            subcategory_page = requests.get(subcategory_url, timeout=REQUEST_TIMEOUT)
            subcategory_soup = BeautifulSoup(subcategory_page.text, 'lxml')
            subcategory_links = subcategory_soup.find_all('div', class_='text')
            if not subcategory_links:
                # A page without product entries ends the pagination.
                break

            for product in subcategory_links:
                product_anchor = product.find('a')
                if product_anchor is None or not product_anchor.get('href'):
                    # Not every matching <div> necessarily wraps a product link.
                    continue

                product_url = urljoin(BASE_URL, product_anchor['href'])
                product_page = requests.get(product_url, timeout=REQUEST_TIMEOUT)
                product_soup = BeautifulSoup(product_page.text, 'lxml')

                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation is not None:
                    annotation = product_annotation.text.translate(ANNOTATION_STRIP_TABLE)
                    # Keep only the text that precedes the "Характеристики" heading.
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # Product metadata is read from the page's JSON-LD block;
                # products without a parseable block are skipped.
                product_json = product_soup.find('script', type='application/ld+json')
                if product_json is None:
                    continue
                try:
                    dict_json = json.loads(product_json.text)
                except json.JSONDecodeError:
                    continue
                if not isinstance(dict_json, dict):
                    continue

                # A missing field becomes None instead of aborting the whole
                # crawl with KeyError/TypeError on one incomplete product.
                author_info = dict_json.get('author')
                author = author_info.get('name') if isinstance(author_info, dict) else None
                title = dict_json.get('name')
                image = dict_json.get('image')
                genre = dict_json.get('genre')

                df.append([product_url, image, author, title, annotation, genre])

            page += 1

    # Checkpoint: dump everything collected so far after each category.
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1