# modules/parser.py
"""
Модуль для парсинга товаров с morele.net
"""
import requests
import time
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed


class MoreleParser:
    """Parser for morele.net."""

    def __init__(self, config):
        self.config = config
        self.session = requests.Session()
        self.logger = logging.getLogger(__name__)
        # Configure the shared HTTP session with browser-like headers
        self.session.headers.update({
            'User-Agent': config.get('parsing.user_agent'),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def parse_category(self, category_url):
        """Parse all products from a category."""
        self.logger.info(f"Starting to parse category: {category_url}")
        products = []
        page = 1
        max_pages = self.config.get('parsing.max_pages', 100)
        while page <= max_pages:
            self.logger.debug(f"Parsing page {page}")
            page_url = self._get_page_url(category_url, page)
            page_products = self._parse_category_page(page_url)
            if not page_products:
                self.logger.info(f"Page {page} is empty, finishing category parsing")
                break
            products.extend(page_products)
            page += 1
            # Pause between requests
            time.sleep(self.config.get('parsing.delay_between_requests', 1))
        self.logger.info(f"Found {len(products)} products in the category")
        return products

    def _get_page_url(self, base_url, page):
        """Build the URL for a specific page of the category."""
        if page == 1:
            return base_url
        # Check whether the URL already has query parameters
        separator = '&' if '?' in base_url else '?'
        return f"{base_url}{separator}page={page}"

    def _parse_category_page(self, page_url):
        """Parse the products from a single category page."""
        try:
            response = self._make_request(page_url)
            if not response:
                return []
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find the product cards
            product_cards = soup.find_all('div', class_='cat-product')
            products = []
            # Parse product detail pages concurrently
            with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
                futures = []
                for card in product_cards:
                    product_url = self._extract_product_url(card)
                    if product_url:
                        future = executor.submit(self._parse_product_page, product_url)
                        futures.append(future)
                for future in as_completed(futures):
                    try:
                        product = future.result()
                        if product:
                            products.append(product)
                    except Exception as e:
                        self.logger.error(f"Error while parsing a product: {e}")
            return products
        except Exception as e:
            self.logger.error(f"Error while parsing page {page_url}: {e}")
            return []

    def _extract_product_url(self, card):
        """Extract the product URL from a listing card."""
        try:
            link = card.find('a', href=True)
            if link:
                href = link['href']
                if not href.startswith('http'):
                    href = urljoin('https://www.morele.net', href)
                return href
        except Exception as e:
            self.logger.error(f"Error while extracting a product URL: {e}")
        return None

    def _parse_product_page(self, product_url):
        """Parse a product detail page."""
        try:
            response = self._make_request(product_url)
            if not response:
                return None
            soup = BeautifulSoup(response.content, 'html.parser')
            # Extract the product data
            product = {
                'url': product_url,
                'id': self._extract_product_id(product_url),
                'title': self._extract_title(soup),
                'price': self._extract_price(soup),
                'availability': self._extract_availability(soup),
                'description': self._extract_description(soup),
                'attributes': self._extract_attributes(soup),
                'category': self._extract_category(soup),
                'images': self._extract_images(soup),
                'brand': self._extract_brand(soup),
                'model': self._extract_model(soup),
                'sku': self._extract_sku(soup),
                'parsed_at': time.time()
            }
            # Generate a hash used to detect changes
            product['content_hash'] = self._generate_content_hash(product)
            return product
        except Exception as e:
            self.logger.error(f"Error while parsing product {product_url}: {e}")
            return None

    def _extract_product_id(self, url):
        """Extract the product ID from the URL."""
        # Look for a numeric ID in the URL
        match = re.search(r'/(\d+)-', url)
        if match:
            return match.group(1)
        # Fall back to a hash of the URL if no ID is found
        return hashlib.md5(url.encode()).hexdigest()[:10]

    def _extract_title(self, soup):
        """Extract the product title."""
        selectors = [
            'h1.prod-name',
            'h1[data-test="product-name"]',
            'h1.product-title',
            '.product-name h1',
            'h1'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return "Без названия"  # "untitled"

    def _extract_price(self, soup):
        """Extract the product price."""
        selectors = [
            '.price-new',
            '.price-main',
            '[data-test="product-price"]',
            '.product-price .price',
            '.price'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                price_text = element.get_text(strip=True)
                # Extract the numeric value
                price_match = re.search(r'([\d\s]+[,.]?\d*)', price_text.replace(' ', ''))
                if price_match:
                    price_str = price_match.group(1).replace(' ', '').replace(',', '.')
                    try:
                        return float(price_str)
                    except ValueError:
                        continue
        return 0.0

    def _extract_availability(self, soup):
        """Extract availability information."""
        selectors = [
            '.availability',
            '[data-test="product-availability"]',
            '.product-availability',
            '.stock-info'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                availability_text = element.get_text(strip=True).lower()
                # The Polish keywords are matched against the page text
                if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
                    return 'в наличии'  # "in stock"
                elif any(word in availability_text for word in ['brak', 'niedostępny']):
                    return 'нет в наличии'  # "out of stock"
                else:
                    return availability_text
        return 'неизвестно'  # "unknown"

    def _extract_description(self, soup):
        """Extract the product description."""
        selectors = [
            '.product-description',
            '[data-test="product-description"]',
            '.prod-description',
            '.description',
            '.product-details .description'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Strip HTML tags and collapse extra whitespace
                description = element.get_text(separator=' ', strip=True)
                return re.sub(r'\s+', ' ', description)
        return ""

    def _extract_attributes(self, soup):
        """Extract product specifications."""
        attributes = {}
        # Various selectors for specification sections
        specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))
        for section in specs_sections:
            # Look for key-value rows
            rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))
            for row in rows:
                # Try to find the attribute name and value
                name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
                value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))
                if name_elem and value_elem:
                    name = name_elem.get_text(strip=True)
                    value = value_elem.get_text(strip=True)
                    if name and value and len(name) < 100:
                        attributes[name] = value
        return attributes

    def _extract_category(self, soup):
        """Extract the product category."""
        # Look for the breadcrumb trail
        breadcrumb_selectors = [
            '.breadcrumb',
            '.breadcrumbs',
            '[data-test="breadcrumb"]',
            '.navigation-path'
        ]
        for selector in breadcrumb_selectors:
            breadcrumb = soup.select_one(selector)
            if breadcrumb:
                links = breadcrumb.find_all('a')
                if len(links) > 1:  # Skip the "Home" link
                    return links[-1].get_text(strip=True)
        return "Без категории"  # "no category"

    def _extract_images(self, soup):
        """Extract product images."""
        images = []
        # Selectors for image elements
        img_selectors = [
            '.product-gallery img',
            '.product-images img',
            '[data-test="product-image"]',
            '.gallery img',
            '.product-photo img'
        ]
        for selector in img_selectors:
            imgs = soup.select(selector)
            for img in imgs:
                src = img.get('src') or img.get('data-src') or img.get('data-lazy')
                if src:
                    if not src.startswith('http'):
                        src = urljoin('https://www.morele.net', src)
                    # Filter out small images and duplicates
                    if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
                        images.append(src)
        return images[:10]  # Limit the number of images

    def _extract_brand(self, soup):
        """Extract the product brand."""
        selectors = [
            '[data-test="product-brand"]',
            '.product-brand',
            '.brand',
            '.manufacturer'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return ""

    def _extract_model(self, soup):
        """Extract the product model."""
        selectors = [
            '[data-test="product-model"]',
            '.product-model',
            '.model'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return ""

    def _extract_sku(self, soup):
        """Extract the product SKU (article number)."""
        selectors = [
            '[data-test="product-sku"]',
            '.product-sku',
            '.sku',
            '.article-number',
            '.product-code'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return ""

    def _generate_content_hash(self, product):
        """Generate a hash of the product content, used to detect changes."""
        content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
        return hashlib.md5(content.encode('utf-8')).hexdigest()
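
    # Sketch of how the hash can drive change detection downstream (the names
    # below are illustrative and not part of this module):
    #     if product['content_hash'] != previously_stored_hash:
    #         handle_change(product)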

    def _make_request(self, url, retries=None):
        """Perform an HTTP GET request with retries."""
        if retries is None:
            retries = self.config.get('parsing.max_retries', 3)
        for attempt in range(retries + 1):
            try:
                response = self.session.get(
                    url,
                    timeout=self.config.get('parsing.timeout', 30)
                )
                if response.status_code == 200:
                    return response
                elif response.status_code == 429:  # Too Many Requests
                    wait_time = (attempt + 1) * 5
                    self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    self.logger.warning(f"HTTP {response.status_code} for {url}")
            except requests.RequestException as e:
                self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")
                if attempt < retries:
                    time.sleep((attempt + 1) * 2)
        self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
        return None
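

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module). It
# assumes only what this file itself uses: a config object exposing
# get('dotted.key', default). _DemoConfig and the category URL below are
# hypothetical stand-ins for the project's real config and a real category.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _DemoConfig:
        """Dict-backed stand-in for the project's config object."""

        def __init__(self, values):
            self._values = values

        def get(self, key, default=None):
            return self._values.get(key, default)

    logging.basicConfig(level=logging.INFO)
    demo_config = _DemoConfig({
        'parsing.user_agent': 'Mozilla/5.0 (compatible; MoreleScraperDemo/0.1)',
        'parsing.max_pages': 1,
        'parsing.delay_between_requests': 1,
        'parsing.concurrent_requests': 2,
        'parsing.max_retries': 3,
        'parsing.timeout': 30,
    })

    parser = MoreleParser(demo_config)
    # Replace with a real morele.net category URL before running.
    products = parser.parse_category('https://www.morele.net/kategoria/example')
    print(f"Parsed {len(products)} products")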