first commit

2025-06-18 21:22:55 +03:00
commit ad4d215f04
22 changed files with 3762 additions and 0 deletions

modules/parser.py (new file, 402 lines)

@@ -0,0 +1,402 @@
# modules/parser.py
"""
Module for parsing products from morele.net
"""
import requests
import time
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
class MoreleParser:
"""Парсер для morele.net"""
def __init__(self, config):
self.config = config
self.session = requests.Session()
self.logger = logging.getLogger(__name__)
# Configure the session
self.session.headers.update({
'User-Agent': config.get('parsing.user_agent'),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
})
def parse_category(self, category_url):
"""Парсит все товары из категории"""
self.logger.info(f"Начинаем парсинг категории: {category_url}")
products = []
page = 1
max_pages = self.config.get('parsing.max_pages', 100)
while page <= max_pages:
self.logger.debug(f"Parsing page {page}")
page_url = self._get_page_url(category_url, page)
page_products = self._parse_category_page(page_url)
if not page_products:
self.logger.info(f"Страница {page} пуста, завершаем парсинг категории")
break
products.extend(page_products)
page += 1
# Pause between requests
time.sleep(self.config.get('parsing.delay_between_requests', 1))
self.logger.info(f"Найдено {len(products)} товаров в категории")
return products
def _get_page_url(self, base_url, page):
"""Формирует URL для конкретной страницы"""
if page == 1:
return base_url
# Check whether the URL already has query parameters
separator = '&' if '?' in base_url else '?'
return f"{base_url}{separator}page={page}"
def _parse_category_page(self, page_url):
"""Парсит товары с одной страницы категории"""
try:
response = self._make_request(page_url)
if not response:
return []
soup = BeautifulSoup(response.content, 'html.parser')
# Find product cards
product_cards = soup.find_all('div', class_='cat-product')
products = []
# Parse product pages concurrently using a thread pool
with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
futures = []
for card in product_cards:
product_url = self._extract_product_url(card)
if product_url:
future = executor.submit(self._parse_product_page, product_url)
futures.append(future)
for future in as_completed(futures):
try:
product = future.result()
if product:
products.append(product)
except Exception as e:
self.logger.error(f"Ошибка при парсинге товара: {e}")
return products
except Exception as e:
self.logger.error(f"Ошибка при парсинге страницы {page_url}: {e}")
return []
def _extract_product_url(self, card):
"""Извлекает URL товара из карточки"""
try:
link = card.find('a', href=True)
if link:
href = link['href']
if not href.startswith('http'):
href = urljoin('https://www.morele.net', href)
return href
except Exception as e:
self.logger.error(f"Ошибка при извлечении URL товара: {e}")
return None
def _parse_product_page(self, product_url):
"""Парсит детальную страницу товара"""
try:
response = self._make_request(product_url)
if not response:
return None
soup = BeautifulSoup(response.content, 'html.parser')
# Extract product data
product = {
'url': product_url,
'id': self._extract_product_id(product_url),
'title': self._extract_title(soup),
'price': self._extract_price(soup),
'availability': self._extract_availability(soup),
'description': self._extract_description(soup),
'attributes': self._extract_attributes(soup),
'category': self._extract_category(soup),
'images': self._extract_images(soup),
'brand': self._extract_brand(soup),
'model': self._extract_model(soup),
'sku': self._extract_sku(soup),
'parsed_at': time.time()
}
# Generate a content hash used to detect changes
product['content_hash'] = self._generate_content_hash(product)
return product
except Exception as e:
self.logger.error(f"Ошибка при парсинге товара {product_url}: {e}")
return None
def _extract_product_id(self, url):
"""Извлекает ID товара из URL"""
# Ищем числовой ID в URL
match = re.search(r'/(\d+)-', url)
if match:
return match.group(1)
# If not found, fall back to a hash of the URL
return hashlib.md5(url.encode()).hexdigest()[:10]
def _extract_title(self, soup):
"""Извлекает название товара"""
selectors = [
'h1.prod-name',
'h1[data-test="product-name"]',
'h1.product-title',
'.product-name h1',
'h1'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return "Без названия"
def _extract_price(self, soup):
"""Извлекает цену товара"""
selectors = [
'.price-new',
'.price-main',
'[data-test="product-price"]',
'.product-price .price',
'.price'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
price_text = element.get_text(strip=True)
# Extract the numeric value (prices may use regular or non-breaking spaces as separators)
price_match = re.search(r'([\d\s]+[,.]?\d*)', price_text)
if price_match:
price_str = re.sub(r'\s+', '', price_match.group(1)).replace(',', '.')
try:
return float(price_str)
except ValueError:
continue
return 0.0
def _extract_availability(self, soup):
"""Извлекает информацию о наличии"""
selectors = [
'.availability',
'[data-test="product-availability"]',
'.product-availability',
'.stock-info'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
availability_text = element.get_text(strip=True).lower()
if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
return 'в наличии'
elif any(word in availability_text for word in ['brak', 'niedostępny']):
return 'нет в наличии'
else:
return availability_text
return 'неизвестно'
def _extract_description(self, soup):
"""Извлекает описание товара"""
selectors = [
'.product-description',
'[data-test="product-description"]',
'.prod-description',
'.description',
'.product-details .description'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
# Strip HTML tags and collapse extra whitespace
description = element.get_text(separator=' ', strip=True)
return re.sub(r'\s+', ' ', description)
return ""
def _extract_attributes(self, soup):
"""Извлекает характеристики товара"""
attributes = {}
# Various selectors for specification sections
specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))
for section in specs_sections:
# Look for key-value pairs
rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))
for row in rows:
# Try to find the attribute name and its value
name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))
if name_elem and value_elem:
name = name_elem.get_text(strip=True)
value = value_elem.get_text(strip=True)
if name and value and len(name) < 100:
attributes[name] = value
return attributes
def _extract_category(self, soup):
"""Извлекает категорию товара"""
# Ищем хлебные крошки
breadcrumb_selectors = [
'.breadcrumb',
'.breadcrumbs',
'[data-test="breadcrumb"]',
'.navigation-path'
]
for selector in breadcrumb_selectors:
breadcrumb = soup.select_one(selector)
if breadcrumb:
links = breadcrumb.find_all('a')
if len(links) > 1: # Skip the "Home" crumb
return links[-1].get_text(strip=True)
return "Без категории"
def _extract_images(self, soup):
"""Извлекает изображения товара"""
images = []
# Selectors for images
img_selectors = [
'.product-gallery img',
'.product-images img',
'[data-test="product-image"]',
'.gallery img',
'.product-photo img'
]
for selector in img_selectors:
imgs = soup.select(selector)
for img in imgs:
src = img.get('src') or img.get('data-src') or img.get('data-lazy')
if src:
if not src.startswith('http'):
src = urljoin('https://www.morele.net', src)
# Filter out small images (icons, thumbnails)
if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
images.append(src)
return images[:10] # Limit the number of images
def _extract_brand(self, soup):
"""Извлекает бренд товара"""
selectors = [
'[data-test="product-brand"]',
'.product-brand',
'.brand',
'.manufacturer'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _extract_model(self, soup):
"""Извлекает модель товара"""
selectors = [
'[data-test="product-model"]',
'.product-model',
'.model'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _extract_sku(self, soup):
"""Извлекает артикул товара"""
selectors = [
'[data-test="product-sku"]',
'.product-sku',
'.sku',
'.article-number',
'.product-code'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _generate_content_hash(self, product):
"""Генерирует хеш содержимого товара для определения изменений"""
content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
return hashlib.md5(content.encode('utf-8')).hexdigest()
def _make_request(self, url, retries=None):
"""Выполняет HTTP запрос с повторными попытками"""
if retries is None:
retries = self.config.get('parsing.max_retries', 3)
for attempt in range(retries + 1):
try:
response = self.session.get(
url,
timeout=self.config.get('parsing.timeout', 30)
)
if response.status_code == 200:
return response
elif response.status_code == 429: # Too Many Requests
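# Linear backoff: wait 5s, 10s, 15s, ... before retrying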
wait_time = (attempt + 1) * 5
self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
time.sleep(wait_time)
else:
self.logger.warning(f"HTTP {response.status_code} for {url}")
except requests.RequestException as e:
self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")
if attempt < retries:
time.sleep((attempt + 1) * 2)
self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
return None
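
For reference, a minimal usage sketch of the parser above. The DictConfig helper, the config keys, and the example category URL are assumptions for illustration; only the dotted-key get(key, default=None) interface that MoreleParser calls is taken from the code itself, and the project's real config module (elsewhere in this commit, not shown here) may differ.

# Minimal usage sketch (illustrative, not part of the commit).
# DictConfig is a hypothetical stand-in exposing only the dotted-key
# get(key, default=None) interface that MoreleParser expects.
import logging

class DictConfig:
    def __init__(self, values):
        self._values = values  # flat dict keyed by dotted paths, e.g. 'parsing.timeout'

    def get(self, key, default=None):
        return self._values.get(key, default)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    config = DictConfig({
        'parsing.user_agent': 'Mozilla/5.0 (X11; Linux x86_64)',
        'parsing.max_pages': 2,
        'parsing.delay_between_requests': 1,
        'parsing.concurrent_requests': 3,
        'parsing.timeout': 30,
        'parsing.max_retries': 3,
    })
    parser = MoreleParser(config)
    # Example category URL; substitute a real morele.net category listing.
    products = parser.parse_category('https://www.morele.net/kategoria/karty-graficzne-12/')
    for product in products[:5]:
        print(product['title'], product['price'], product['availability'])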