# modules/parser.py
"""
Module for parsing products from morele.net
"""

import requests
import time
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed


class MoreleParser:
"""Парсер для morele.net"""
|
||
|
||
def __init__(self, config):
|
||
self.config = config
|
||
self.session = requests.Session()
|
||
self.logger = logging.getLogger(__name__)
|
||
|
||
# Настройка сессии
|
||
self.session.headers.update({
|
||
'User-Agent': config.get('parsing.user_agent'),
|
||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||
'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
|
||
'Accept-Encoding': 'gzip, deflate',
|
||
'Connection': 'keep-alive',
|
||
'Upgrade-Insecure-Requests': '1',
|
||
})
|
||
|
||
    def parse_category(self, category_url):
        """Parses all products from a category."""
        self.logger.info(f"Starting to parse category: {category_url}")

        products = []
        page = 1
        max_pages = self.config.get('parsing.max_pages', 100)

        while page <= max_pages:
            self.logger.debug(f"Parsing page {page}")

            page_url = self._get_page_url(category_url, page)
            page_products = self._parse_category_page(page_url)

            if not page_products:
                self.logger.info(f"Page {page} is empty, stopping category parsing")
                break

            products.extend(page_products)
            page += 1

            # Pause between requests
            time.sleep(self.config.get('parsing.delay_between_requests', 1))

        self.logger.info(f"Found {len(products)} products in the category")
        return products
    def _get_page_url(self, base_url, page):
        """Builds the URL for a specific page."""
        if page == 1:
            return base_url

        # Check whether the URL already contains query parameters
        separator = '&' if '?' in base_url else '?'
        return f"{base_url}{separator}page={page}"
    def _parse_category_page(self, page_url):
        """Parses the products on a single category page."""
        try:
            response = self._make_request(page_url)
            if not response:
                return []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the product cards
            product_cards = soup.find_all('div', class_='cat-product')
            products = []

            # Parse the product pages concurrently
            with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
                futures = []

                for card in product_cards:
                    product_url = self._extract_product_url(card)
                    if product_url:
                        future = executor.submit(self._parse_product_page, product_url)
                        futures.append(future)

                for future in as_completed(futures):
                    try:
                        product = future.result()
                        if product:
                            products.append(product)
                    except Exception as e:
                        self.logger.error(f"Error while parsing a product: {e}")

            return products

        except Exception as e:
            self.logger.error(f"Error while parsing page {page_url}: {e}")
            return []
    def _extract_product_url(self, card):
        """Extracts the product URL from a card."""
        try:
            link = card.find('a', href=True)
            if link:
                href = link['href']
                if not href.startswith('http'):
                    href = urljoin('https://www.morele.net', href)
                return href
        except Exception as e:
            self.logger.error(f"Error while extracting product URL: {e}")

        return None
    def _parse_product_page(self, product_url):
        """Parses a product detail page."""
        try:
            response = self._make_request(product_url)
            if not response:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the product data
            product = {
                'url': product_url,
                'id': self._extract_product_id(product_url),
                'title': self._extract_title(soup),
                'price': self._extract_price(soup),
                'availability': self._extract_availability(soup),
                'description': self._extract_description(soup),
                'attributes': self._extract_attributes(soup),
                'category': self._extract_category(soup),
                'images': self._extract_images(soup),
                'brand': self._extract_brand(soup),
                'model': self._extract_model(soup),
                'sku': self._extract_sku(soup),
                'parsed_at': time.time()
            }

            # Generate a hash used to detect changes
            product['content_hash'] = self._generate_content_hash(product)

            return product

        except Exception as e:
            self.logger.error(f"Error while parsing product {product_url}: {e}")
            return None
    def _extract_product_id(self, url):
        """Extracts the product ID from the URL."""
        # Look for a numeric ID in the URL
        match = re.search(r'/(\d+)-', url)
        if match:
            return match.group(1)

        # If none is found, fall back to a hash of the URL
        return hashlib.md5(url.encode()).hexdigest()[:10]
    def _extract_title(self, soup):
        """Extracts the product title."""
        selectors = [
            'h1.prod-name',
            'h1[data-test="product-name"]',
            'h1.product-title',
            '.product-name h1',
            'h1'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return "No title"
    def _extract_price(self, soup):
        """Extracts the product price."""
        selectors = [
            '.price-new',
            '.price-main',
            '[data-test="product-price"]',
            '.product-price .price',
            '.price'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                price_text = element.get_text(strip=True)
                # Extract the numeric value; strip all whitespace first, including
                # non-breaking spaces used as thousands separators in Polish prices
                cleaned = re.sub(r'\s+', '', price_text)
                price_match = re.search(r'(\d+[,.]?\d*)', cleaned)
                if price_match:
                    price_str = price_match.group(1).replace(',', '.')
                    try:
                        return float(price_str)
                    except ValueError:
                        continue

        return 0.0
    def _extract_availability(self, soup):
        """Extracts availability information."""
        selectors = [
            '.availability',
            '[data-test="product-availability"]',
            '.product-availability',
            '.stock-info'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                availability_text = element.get_text(strip=True).lower()

                # The site text is Polish, so the keyword matching stays Polish
                if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
                    return 'in stock'
                elif any(word in availability_text for word in ['brak', 'niedostępny']):
                    return 'out of stock'
                else:
                    return availability_text

        return 'unknown'
    def _extract_description(self, soup):
        """Extracts the product description."""
        selectors = [
            '.product-description',
            '[data-test="product-description"]',
            '.prod-description',
            '.description',
            '.product-details .description'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Strip HTML tags and collapse extra whitespace
                description = element.get_text(separator=' ', strip=True)
                return re.sub(r'\s+', ' ', description)

        return ""
    def _extract_attributes(self, soup):
        """Extracts the product specifications."""
        attributes = {}

        # Various selectors for specification sections
        specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))

        for section in specs_sections:
            # Look for key-value pairs
            rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))

            for row in rows:
                # Try to find the name and the value
                name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
                value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))

                if name_elem and value_elem:
                    name = name_elem.get_text(strip=True)
                    value = value_elem.get_text(strip=True)

                    if name and value and len(name) < 100:
                        attributes[name] = value

        return attributes
    def _extract_category(self, soup):
        """Extracts the product category."""
        # Look for breadcrumbs
        breadcrumb_selectors = [
            '.breadcrumb',
            '.breadcrumbs',
            '[data-test="breadcrumb"]',
            '.navigation-path'
        ]

        for selector in breadcrumb_selectors:
            breadcrumb = soup.select_one(selector)
            if breadcrumb:
                links = breadcrumb.find_all('a')
                if len(links) > 1:  # Skip the "Home" link
                    return links[-1].get_text(strip=True)

        return "Uncategorized"
    def _extract_images(self, soup):
        """Extracts the product images."""
        images = []

        # Selectors for images
        img_selectors = [
            '.product-gallery img',
            '.product-images img',
            '[data-test="product-image"]',
            '.gallery img',
            '.product-photo img'
        ]

        for selector in img_selectors:
            imgs = soup.select(selector)
            for img in imgs:
                src = img.get('src') or img.get('data-src') or img.get('data-lazy')
                if src:
                    if not src.startswith('http'):
                        src = urljoin('https://www.morele.net', src)

                    # Skip small images (icons, thumbnails) and duplicates
                    if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
                        images.append(src)

        return images[:10]  # Limit the number of images
    def _extract_brand(self, soup):
        """Extracts the product brand."""
        selectors = [
            '[data-test="product-brand"]',
            '.product-brand',
            '.brand',
            '.manufacturer'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""
    def _extract_model(self, soup):
        """Extracts the product model."""
        selectors = [
            '[data-test="product-model"]',
            '.product-model',
            '.model'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""
    def _extract_sku(self, soup):
        """Extracts the product SKU."""
        selectors = [
            '[data-test="product-sku"]',
            '.product-sku',
            '.sku',
            '.article-number',
            '.product-code'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""
    def _generate_content_hash(self, product):
        """Generates a hash of the product content, used to detect changes."""
        content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
        return hashlib.md5(content.encode('utf-8')).hexdigest()
    def _make_request(self, url, retries=None):
        """Performs an HTTP request with retries."""
        if retries is None:
            retries = self.config.get('parsing.max_retries', 3)

        for attempt in range(retries + 1):
            try:
                response = self.session.get(
                    url,
                    timeout=self.config.get('parsing.timeout', 30)
                )

                if response.status_code == 200:
                    return response
                elif response.status_code == 429:  # Too Many Requests
                    wait_time = (attempt + 1) * 5
                    self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    self.logger.warning(f"HTTP {response.status_code} for {url}")

            except requests.RequestException as e:
                self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")

            if attempt < retries:
                time.sleep((attempt + 1) * 2)

        self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
        return None
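
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The project presumably provides its own
# config object; the minimal _DictConfig below merely mimics the interface
# this module relies on, config.get('dotted.key', default). The 'parsing.*'
# keys mirror the lookups made above, and the category URL is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class _DictConfig:
        """Minimal stand-in for the project's config: a flat dict with dotted keys."""

        def __init__(self, values):
            self._values = values

        def get(self, key, default=None):
            return self._values.get(key, default)

    logging.basicConfig(level=logging.INFO)

    config = _DictConfig({
        'parsing.user_agent': 'Mozilla/5.0 (compatible; MoreleParser/1.0)',
        'parsing.max_pages': 2,
        'parsing.delay_between_requests': 1,
        'parsing.concurrent_requests': 3,
        'parsing.max_retries': 3,
        'parsing.timeout': 30,
    })

    parser = MoreleParser(config)
    # Placeholder category URL; substitute a real morele.net category page.
    items = parser.parse_category('https://www.morele.net/kategoria/przyklad/')
    print(f"Parsed {len(items)} products")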