first commit
modules/parser.py (new file, 402 additions)
@@ -0,0 +1,402 @@
# modules/parser.py
"""
Module for parsing products from morele.net
"""

import requests
import time
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed


class MoreleParser:
    """Parser for morele.net"""

    def __init__(self, config):
        self.config = config
        self.session = requests.Session()
        self.logger = logging.getLogger(__name__)

        # Session setup
        self.session.headers.update({
            'User-Agent': config.get('parsing.user_agent'),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def parse_category(self, category_url):
        """Parses all products from a category"""
        self.logger.info(f"Starting to parse category: {category_url}")

        products = []
        page = 1
        max_pages = self.config.get('parsing.max_pages', 100)

        while page <= max_pages:
            self.logger.debug(f"Parsing page {page}")

            page_url = self._get_page_url(category_url, page)
            page_products = self._parse_category_page(page_url)

            if not page_products:
                self.logger.info(f"Page {page} is empty, finishing category parsing")
                break

            products.extend(page_products)
            page += 1

            # Pause between requests
            time.sleep(self.config.get('parsing.delay_between_requests', 1))

        self.logger.info(f"Found {len(products)} products in the category")
        return products

    def _get_page_url(self, base_url, page):
        """Builds the URL for a specific page"""
        if page == 1:
            return base_url

        # Check whether the URL already has query parameters
        separator = '&' if '?' in base_url else '?'
        return f"{base_url}{separator}page={page}"

    def _parse_category_page(self, page_url):
        """Parses products from a single category page"""
        try:
            response = self._make_request(page_url)
            if not response:
                return []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find product cards
            product_cards = soup.find_all('div', class_='cat-product')
            products = []

            # Parse product pages concurrently
            with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
                futures = []

                for card in product_cards:
                    product_url = self._extract_product_url(card)
                    if product_url:
                        future = executor.submit(self._parse_product_page, product_url)
                        futures.append(future)

                for future in as_completed(futures):
                    try:
                        product = future.result()
                        if product:
                            products.append(product)
                    except Exception as e:
                        self.logger.error(f"Error while parsing a product: {e}")

            return products

        except Exception as e:
            self.logger.error(f"Error while parsing page {page_url}: {e}")
            return []

    def _extract_product_url(self, card):
        """Extracts the product URL from a card"""
        try:
            link = card.find('a', href=True)
            if link:
                href = link['href']
                if not href.startswith('http'):
                    href = urljoin('https://www.morele.net', href)
                return href
        except Exception as e:
            self.logger.error(f"Error while extracting a product URL: {e}")

        return None

    def _parse_product_page(self, product_url):
        """Parses a product detail page"""
        try:
            response = self._make_request(product_url)
            if not response:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the product data
            product = {
                'url': product_url,
                'id': self._extract_product_id(product_url),
                'title': self._extract_title(soup),
                'price': self._extract_price(soup),
                'availability': self._extract_availability(soup),
                'description': self._extract_description(soup),
                'attributes': self._extract_attributes(soup),
                'category': self._extract_category(soup),
                'images': self._extract_images(soup),
                'brand': self._extract_brand(soup),
                'model': self._extract_model(soup),
                'sku': self._extract_sku(soup),
                'parsed_at': time.time()
            }

            # Generate a hash used to detect changes
            product['content_hash'] = self._generate_content_hash(product)

            return product

        except Exception as e:
            self.logger.error(f"Error while parsing product {product_url}: {e}")
            return None

    def _extract_product_id(self, url):
        """Extracts the product ID from the URL"""
        # Look for a numeric ID in the URL
        match = re.search(r'/(\d+)-', url)
        if match:
            return match.group(1)

        # If none is found, fall back to a hash of the URL
        return hashlib.md5(url.encode()).hexdigest()[:10]
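    # Illustrative example (hypothetical URL): for
    # "https://www.morele.net/produkt/12345-example-product" the pattern
    # r'/(\d+)-' captures "12345", the first digit run that follows a slash and
    # precedes a hyphen; URLs without such a segment get the first 10 hex
    # characters of their MD5 digest as a stable fallback ID.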

    def _extract_title(self, soup):
        """Extracts the product title"""
        selectors = [
            'h1.prod-name',
            'h1[data-test="product-name"]',
            'h1.product-title',
            '.product-name h1',
            'h1'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return "Untitled"

    def _extract_price(self, soup):
        """Extracts the product price"""
        selectors = [
            '.price-new',
            '.price-main',
            '[data-test="product-price"]',
            '.product-price .price',
            '.price'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                price_text = element.get_text(strip=True)
                # Extract the numeric value
                price_match = re.search(r'([\d\s]+[,.]?\d*)', price_text.replace(' ', ''))
                if price_match:
                    price_str = price_match.group(1).replace(' ', '').replace(',', '.')
                    try:
                        return float(price_str)
                    except ValueError:
                        continue

        return 0.0
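    # Illustrative example (hypothetical markup): a price rendered as
    # "1299,00 zł" is reduced to "1299,00" by the regex, the comma is turned
    # into a decimal point, and float("1299.00") yields 1299.0. Prices that
    # cannot be normalised this way fall through to the 0.0 default.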

    def _extract_availability(self, soup):
        """Extracts availability information"""
        selectors = [
            '.availability',
            '[data-test="product-availability"]',
            '.product-availability',
            '.stock-info'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                availability_text = element.get_text(strip=True).lower()

                if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
                    return 'in stock'
                elif any(word in availability_text for word in ['brak', 'niedostępny']):
                    return 'out of stock'
                else:
                    return availability_text

        return 'unknown'

    def _extract_description(self, soup):
        """Extracts the product description"""
        selectors = [
            '.product-description',
            '[data-test="product-description"]',
            '.prod-description',
            '.description',
            '.product-details .description'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Strip HTML tags and collapse extra whitespace
                description = element.get_text(separator=' ', strip=True)
                return re.sub(r'\s+', ' ', description)

        return ""

    def _extract_attributes(self, soup):
        """Extracts the product specifications"""
        attributes = {}

        # Various selectors for specification sections
        specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))

        for section in specs_sections:
            # Look for key-value pairs
            rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))

            for row in rows:
                # Try to find the name and the value
                name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
                value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))

                if name_elem and value_elem:
                    name = name_elem.get_text(strip=True)
                    value = value_elem.get_text(strip=True)

                    if name and value and len(name) < 100:
                        attributes[name] = value

        return attributes
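    # Illustrative example (hypothetical specification table): a row whose name
    # cell reads "Procesor" and whose value cell reads "Intel Core i5-12400F"
    # produces the entry {"Procesor": "Intel Core i5-12400F"}; names longer
    # than 100 characters are skipped as likely parsing noise.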

    def _extract_category(self, soup):
        """Extracts the product category"""
        # Look for breadcrumbs
        breadcrumb_selectors = [
            '.breadcrumb',
            '.breadcrumbs',
            '[data-test="breadcrumb"]',
            '.navigation-path'
        ]

        for selector in breadcrumb_selectors:
            breadcrumb = soup.select_one(selector)
            if breadcrumb:
                links = breadcrumb.find_all('a')
                if len(links) > 1:  # Skip the "Home" crumb
                    return links[-1].get_text(strip=True)

        return "Uncategorized"

    def _extract_images(self, soup):
        """Extracts product images"""
        images = []

        # Selectors for images
        img_selectors = [
            '.product-gallery img',
            '.product-images img',
            '[data-test="product-image"]',
            '.gallery img',
            '.product-photo img'
        ]

        for selector in img_selectors:
            imgs = soup.select(selector)
            for img in imgs:
                src = img.get('src') or img.get('data-src') or img.get('data-lazy')
                if src:
                    if not src.startswith('http'):
                        src = urljoin('https://www.morele.net', src)

                    # Filter out icons and thumbnails
                    if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
                        images.append(src)

        return images[:10]  # Limit the number of images

    def _extract_brand(self, soup):
        """Extracts the product brand"""
        selectors = [
            '[data-test="product-brand"]',
            '.product-brand',
            '.brand',
            '.manufacturer'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""

    def _extract_model(self, soup):
        """Extracts the product model"""
        selectors = [
            '[data-test="product-model"]',
            '.product-model',
            '.model'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""

    def _extract_sku(self, soup):
        """Extracts the product SKU (article number)"""
        selectors = [
            '[data-test="product-sku"]',
            '.product-sku',
            '.sku',
            '.article-number',
            '.product-code'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return ""

    def _generate_content_hash(self, product):
        """Generates a hash of the product content, used to detect changes"""
        content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
        return hashlib.md5(content.encode('utf-8')).hexdigest()
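    # The hash covers title, price, availability and description, so a later
    # run can detect changes by comparing a stored hash with a freshly computed
    # one (a design note about intended use, not behaviour enforced here).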

    def _make_request(self, url, retries=None):
        """Performs an HTTP request with retries"""
        if retries is None:
            retries = self.config.get('parsing.max_retries', 3)

        for attempt in range(retries + 1):
            try:
                response = self.session.get(
                    url,
                    timeout=self.config.get('parsing.timeout', 30)
                )

                if response.status_code == 200:
                    return response
                elif response.status_code == 429:  # Too Many Requests
                    wait_time = (attempt + 1) * 5
                    self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    self.logger.warning(f"HTTP {response.status_code} for {url}")

            except requests.RequestException as e:
                self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")

            if attempt < retries:
                time.sleep((attempt + 1) * 2)

        self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
        return None
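

# Minimal usage sketch (not part of the parser itself): it assumes a config
# object exposing get(dotted_key, default=None), which is how the class reads
# its settings above. The _DemoConfig shim and the category URL below are
# hypothetical and only illustrate how the class might be driven.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    class _DemoConfig:
        """Tiny stand-in for the real config object (illustration only)."""

        _values = {
            'parsing.user_agent': 'Mozilla/5.0 (compatible; DemoParser/1.0)',
            'parsing.max_pages': 2,
            'parsing.delay_between_requests': 1,
            'parsing.concurrent_requests': 2,
            'parsing.max_retries': 3,
            'parsing.timeout': 30,
        }

        def get(self, key, default=None):
            return self._values.get(key, default)

    parser = MoreleParser(_DemoConfig())
    # Hypothetical category URL; substitute a real morele.net category page.
    items = parser.parse_category("https://www.morele.net/kategoria/laptopy-31/")
    for item in items[:5]:
        print(item['id'], item['title'], item['price'], item['availability'])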