first commit

This commit is contained in:
2025-06-18 21:22:55 +03:00
commit ad4d215f04
22 changed files with 3762 additions and 0 deletions

4
modules/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
# modules/__init__.py
"""
Модули парсера morele.net
"""

362
modules/admin.py Normal file
View File

@@ -0,0 +1,362 @@
# modules/admin.py
"""
Простая веб-админка для управления парсером
"""
from flask import Flask, render_template_string, request, redirect, url_for, flash, jsonify
import logging
from datetime import datetime, timedelta
class AdminPanel:
"""Простая веб-админка"""
def __init__(self, config, storage):
    """Create the Flask application and register the admin routes.

    Args:
        config: Settings object exposing ``get(key, default)``.
        storage: Storage backend queried by the route handlers.
    """
    import secrets  # local import: only needed for the fallback key below

    self.config = config
    self.storage = storage
    self.app = Flask(__name__)
    # A key committed to the repository lets anyone forge the signed session
    # cookie. Prefer a configured key; otherwise generate a random one for
    # this process (pending flash messages are then lost on restart).
    self.app.secret_key = config.get('admin.secret_key', None) or secrets.token_hex(32)
    self._setup_routes()
def _setup_routes(self):
    """Register all URL handlers of the admin panel on ``self.app``."""

    @self.app.route('/')
    def index():
        """Dashboard: active categories and parsing stats for the last 7 days."""
        categories = self.storage.get_active_categories()
        stats = self.storage.get_parsing_stats(7)
        return render_template_string(self.INDEX_TEMPLATE,
                                      categories=categories,
                                      stats=stats)

    @self.app.route('/categories')
    def categories():
        """Category management page."""
        categories = self.storage.get_active_categories()
        return render_template_string(self.CATEGORIES_TEMPLATE, categories=categories)

    @self.app.route('/add_category', methods=['POST'])
    def add_category():
        """Create a category from the submitted ``name`` and ``url`` fields."""
        name = request.form.get('name')
        url = request.form.get('url')
        if name and url:
            try:
                self.storage.add_category(name, url)
                flash('Категория добавлена успешно', 'success')
            except Exception as e:
                flash(f'Ошибка при добавлении категории: {e}', 'error')
        else:
            flash('Заполните все поля', 'error')
        return redirect(url_for('categories'))

    # NOTE(review): this changes state on a plain GET request. Moving it to
    # POST also requires changing the link in CATEGORIES_TEMPLATE.
    @self.app.route('/deactivate_category/<int:category_id>')
    def deactivate_category(category_id):
        """Deactivate the category with the given id."""
        try:
            self.storage.deactivate_category(category_id)
            flash('Категория деактивирована', 'success')
        except Exception as e:
            flash(f'Ошибка: {e}', 'error')
        return redirect(url_for('categories'))

    @self.app.route('/products')
    def products():
        """Show one page of products (50 per page, selected by ``?page=N``)."""
        per_page = 50
        # ``type=int`` makes a malformed ?page= value fall back to the
        # default instead of raising ValueError (HTTP 500); clamp to >= 1.
        page = max(request.args.get('page', 1, type=int), 1)
        start = (page - 1) * per_page
        products = self.storage.get_products_for_feed()[start:start + per_page]
        return render_template_string(self.PRODUCTS_TEMPLATE, products=products)

    @self.app.route('/api/stats')
    def api_stats():
        """Return parsing statistics for the last 30 days as JSON."""
        stats = self.storage.get_parsing_stats(30)
        return jsonify(stats)
def run(self, host='127.0.0.1', port=5000):
    """Start the Flask server on ``host``:``port`` with debug mode disabled."""
    listen_options = {'host': host, 'port': port, 'debug': False}
    self.app.run(**listen_options)
# HTML templates
# Dashboard page rendered by the ``index`` route with the context variables
# ``categories`` and ``stats``; only the first 10 entries of ``stats`` are listed.
INDEX_TEMPLATE = '''
<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Morele.net Parser - Админка</title>
<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
.container { max-width: 1200px; margin: 0 auto; }
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.nav { margin-bottom: 20px; }
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
.nav a:hover { background: #0056b3; }
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; }
.stat-item { text-align: center; padding: 20px; background: #f8f9fa; border-radius: 8px; }
.stat-number { font-size: 2em; font-weight: bold; color: #007bff; }
table { width: 100%; border-collapse: collapse; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background: #f8f9fa; }
.status-active { color: #28a745; font-weight: bold; }
.status-inactive { color: #dc3545; }
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>🔍 Morele.net Parser - Админка</h1>
<div class="nav">
<a href="{{ url_for('index') }}">Главная</a>
<a href="{{ url_for('categories') }}">Категории</a>
<a href="{{ url_for('products') }}">Товары</a>
</div>
</div>
<div class="card">
<h2>📊 Статистика</h2>
<div class="stats">
<div class="stat-item">
<div class="stat-number">{{ categories|length }}</div>
<div>Активных категорий</div>
</div>
<div class="stat-item">
<div class="stat-number">{{ stats|length }}</div>
<div>Сессий парсинга за неделю</div>
</div>
</div>
</div>
<div class="card">
<h2>📋 Последние сессии парсинга</h2>
{% if stats %}
<table>
<thead>
<tr>
<th>Дата</th>
<th>Категория</th>
<th>Найдено</th>
<th>Новых</th>
<th>Обновлено</th>
<th>Ошибок</th>
</tr>
</thead>
<tbody>
{% for stat in stats[:10] %}
<tr>
<td>{{ stat.completed_at }}</td>
<td>{{ stat.category_url }}</td>
<td>{{ stat.products_found }}</td>
<td>{{ stat.products_new }}</td>
<td>{{ stat.products_updated }}</td>
<td>{{ stat.errors_count }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p>Пока нет данных о парсинге</p>
{% endif %}
</div>
</div>
</body>
</html>
'''
# Category management page rendered by the ``categories`` route with the
# context variable ``categories``. It shows flashed messages, a form that
# POSTs to ``add_category`` and a table with a ``deactivate_category`` link
# per row.
CATEGORIES_TEMPLATE = '''
<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Категории - Morele.net Parser</title>
<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
.container { max-width: 1200px; margin: 0 auto; }
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.nav { margin-bottom: 20px; }
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
.nav a:hover { background: #0056b3; }
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.form-group { margin-bottom: 15px; }
.form-group label { display: block; margin-bottom: 5px; font-weight: bold; }
.form-group input { width: 100%; padding: 10px; border: 1px solid #ddd; border-radius: 4px; box-sizing: border-box; }
.btn { background: #28a745; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
.btn:hover { background: #218838; }
.btn-danger { background: #dc3545; }
.btn-danger:hover { background: #c82333; }
table { width: 100%; border-collapse: collapse; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background: #f8f9fa; }
.alert { padding: 15px; margin-bottom: 20px; border-radius: 4px; }
.alert-success { background: #d4edda; border: 1px solid #c3e6cb; color: #155724; }
.alert-error { background: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>📁 Управление категориями</h1>
<div class="nav">
<a href="{{ url_for('index') }}">Главная</a>
<a href="{{ url_for('categories') }}">Категории</a>
<a href="{{ url_for('products') }}">Товары</a>
</div>
</div>
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
{% for category, message in messages %}
<div class="alert alert-{{ category }}">{{ message }}</div>
{% endfor %}
{% endif %}
{% endwith %}
<div class="card">
<h2> Добавить категорию</h2>
<form method="POST" action="{{ url_for('add_category') }}">
<div class="form-group">
<label for="name">Название категории:</label>
<input type="text" id="name" name="name" required>
</div>
<div class="form-group">
<label for="url">URL категории на morele.net:</label>
<input type="url" id="url" name="url" required placeholder="https://www.morele.net/kategoria/...">
</div>
<button type="submit" class="btn">Добавить категорию</button>
</form>
</div>
<div class="card">
<h2>📋 Список категорий</h2>
{% if categories %}
<table>
<thead>
<tr>
<th>ID</th>
<th>Название</th>
<th>URL</th>
<th>Дата добавления</th>
<th>Действия</th>
</tr>
</thead>
<tbody>
{% for category in categories %}
<tr>
<td>{{ category.id }}</td>
<td>{{ category.name }}</td>
<td><a href="{{ category.url }}" target="_blank">{{ category.url[:50] }}...</a></td>
<td>{{ category.created_at }}</td>
<td>
<a href="{{ url_for('deactivate_category', category_id=category.id) }}"
class="btn btn-danger"
onclick="return confirm('Уверены, что хотите деактивировать эту категорию?')">
Деактивировать
</a>
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p>Пока нет добавленных категорий. Добавьте первую категорию выше.</p>
{% endif %}
</div>
</div>
</body>
</html>
'''
# Product list page rendered by the ``products`` route with the context
# variable ``products``. Each row reads ``local_images``, ``title_ua``,
# ``price``, ``availability``, ``category``, ``updated_at`` and ``url``.
# NOTE(review): ``product.updated_at[:16]`` presumably expects a string
# timestamp - confirm against the storage layer.
PRODUCTS_TEMPLATE = '''
<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Товары - Morele.net Parser</title>
<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
.container { max-width: 1400px; margin: 0 auto; }
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.nav { margin-bottom: 20px; }
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
.nav a:hover { background: #0056b3; }
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
table { width: 100%; border-collapse: collapse; font-size: 14px; }
th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
th { background: #f8f9fa; position: sticky; top: 0; }
.product-title { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.product-image { width: 50px; height: 50px; object-fit: cover; border-radius: 4px; }
.price { font-weight: bold; color: #28a745; }
.status-available { color: #28a745; }
.status-unavailable { color: #dc3545; }
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>📦 Товары</h1>
<div class="nav">
<a href="{{ url_for('index') }}">Главная</a>
<a href="{{ url_for('categories') }}">Категории</a>
<a href="{{ url_for('products') }}">Товары</a>
</div>
</div>
<div class="card">
<h2>📋 Список товаров</h2>
{% if products %}
<p>Показано товаров: {{ products|length }}</p>
<table>
<thead>
<tr>
<th>Изображение</th>
<th>Название (UA)</th>
<th>Цена (PLN)</th>
<th>Наличие</th>
<th>Категория</th>
<th>Обновлено</th>
<th>Ссылка</th>
</tr>
</thead>
<tbody>
{% for product in products %}
<tr>
<td>
{% if product.local_images %}
<img src="{{ product.local_images[0] }}" class="product-image" alt="Product image">
{% else %}
<div style="width: 50px; height: 50px; background: #f8f9fa; border-radius: 4px;"></div>
{% endif %}
</td>
<td class="product-title" title="{{ product.title_ua }}">{{ product.title_ua }}</td>
<td class="price">{{ "%.2f"|format(product.price) }} PLN</td>
<td class="{% if 'наличии' in product.availability %}status-available{% else %}status-unavailable{% endif %}">
{{ product.availability }}
</td>
<td>{{ product.category }}</td>
<td>{{ product.updated_at[:16] }}</td>
<td><a href="{{ product.url }}" target="_blank">Открыть</a></td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p>Пока нет товаров. Запустите парсинг для получения товаров.</p>
{% endif %}
</div>
</div>
</body>
</html>
'''

210
modules/feed_generator.py Normal file
View File

@@ -0,0 +1,210 @@
# modules/feed_generator.py
"""
Модуль для генерации YML фида для Prom.ua
"""
import xml.etree.ElementTree as ET
from xml.dom import minidom
import requests
import logging
from datetime import datetime
from pathlib import Path
class FeedGenerator:
"""Генератор YML фида для Prom.ua"""
def __init__(self, config, storage):
self.config = config
self.storage = storage
self.logger = logging.getLogger(__name__)
self.output_path = Path(config.get('feed.output_path', 'feeds/prom_feed.yml'))
self.shop_name = config.get('feed.shop_name', 'Ваш магазин')
self.company = config.get('feed.company', 'Ваша компания')
self.currency = config.get('feed.currency', 'UAH')
self.margin_percent = config.get('feed.margin_percent', 10)
# Создаём директорию для фидов
self.output_path.parent.mkdir(parents=True, exist_ok=True)
def generate_yml_feed(self):
    """Build the YML catalogue from stored products and save it.

    Does nothing (besides logging a warning) when storage returns no
    products. A product whose offer cannot be built is logged and skipped,
    so it never aborts the whole feed.
    """
    self.logger.info("Начинаем генерацию YML фида...")
    products = self.storage.get_products_for_feed()
    if not products:
        self.logger.warning("Нет товаров для генерации фида")
        return

    pln_to_uah_rate = self._get_exchange_rate()

    # Root element and shop description.
    yml_catalog = ET.Element('yml_catalog', date=datetime.now().strftime('%Y-%m-%d %H:%M'))
    shop = ET.SubElement(yml_catalog, 'shop')
    ET.SubElement(shop, 'name').text = self.shop_name
    ET.SubElement(shop, 'company').text = self.company
    ET.SubElement(shop, 'url').text = self.config.get('feed.shop_url', 'https://example.com')

    # Single currency entry with rate 1 (prices are already converted).
    currencies = ET.SubElement(shop, 'currencies')
    ET.SubElement(currencies, 'currency', id=self.currency, rate='1')

    categories_elem = ET.SubElement(shop, 'categories')
    categories_map = self._build_categories(products, categories_elem)

    offers = ET.SubElement(shop, 'offers')
    for product in products:
        try:
            offer = self._create_offer(product, pln_to_uah_rate, categories_map)
        except Exception as e:
            # ``.get``: indexing a missing 'id' here would raise inside the
            # handler and escape it, aborting the whole feed.
            self.logger.error(f"Ошибка при создании оффера для товара {product.get('id')}: {e}")
            continue
        if offer is not None:
            offers.append(offer)

    self._save_xml(yml_catalog)
    self.logger.info(f"YML фид сгенерирован: {self.output_path}")
    self.logger.info(f"Товаров в фиде: {len(offers)}")
def _get_exchange_rate(self):
"""Получает курс PLN к UAH"""
rate_setting = self.config.get('feed.pln_to_uah_rate', 'auto')
if isinstance(rate_setting, (int, float)):
return float(rate_setting)
try:
# Пытаемся получить актуальный курс
response = requests.get(
'https://api.exchangerate-api.com/v4/latest/PLN',
timeout=10
)
response.raise_for_status()
data = response.json()
rate = data['rates'].get('UAH', 10.0) # Fallback курс
self.logger.info(f"Получен курс PLN/UAH: {rate}")
return rate
except Exception as e:
self.logger.error(f"Ошибка получения курса валют: {e}")
return 10.0 # Fallback курс
def _build_categories(self, products, categories_elem):
"""Строит список категорий"""
categories = set()
for product in products:
if product.get('category'):
categories.add(product['category'])
categories_map = {}
category_id = 1
for category_name in sorted(categories):
category_elem = ET.SubElement(categories_elem, 'category', id=str(category_id))
category_elem.text = category_name
categories_map[category_name] = category_id
category_id += 1
return categories_map
def _create_offer(self, product, pln_to_uah_rate, categories_map):
"""Создаёт элемент offer для товара"""
# Проверяем обязательные поля
if not product.get('title_ua') or not product.get('price'):
return None
# Рассчитываем цену в UAH с наценкой
price_pln = float(product['price'])
price_uah = price_pln * pln_to_uah_rate
price_uah_with_margin = price_uah * (1 + self.margin_percent / 100)
price_uah_final = round(price_uah_with_margin, 2)
# Создаём элемент offer
offer = ET.Element('offer', id=str(product['external_id']), available='true')
# Основные поля
ET.SubElement(offer, 'name').text = product['title_ua']
ET.SubElement(offer, 'price').text = str(price_uah_final)
ET.SubElement(offer, 'currencyId').text = self.currency
# Категория
if product.get('category') and product['category'] in categories_map:
ET.SubElement(offer, 'categoryId').text = str(categories_map[product['category']])
# Изображения
if product.get('local_images'):
for img_path in product['local_images'][:10]: # Ограничиваем количество
# Преобразуем локальный путь в URL
img_url = self._local_path_to_url(img_path)
if img_url:
ET.SubElement(offer, 'picture').text = img_url
# Описание
description = product.get('description_ua', '').strip()
if description:
description_elem = ET.SubElement(offer, 'description')
description_elem.text = description[:3000] # Ограничиваем длину
# Атрибуты товара
if product.get('brand'):
ET.SubElement(offer, 'vendor').text = product['brand']
if product.get('model'):
ET.SubElement(offer, 'model').text = product['model']
if product.get('sku'):
ET.SubElement(offer, 'vendorCode').text = product['sku']
# Характеристики
if product.get('attributes_ua'):
for attr_name, attr_value in product['attributes_ua'].items():
if attr_name and attr_value and len(str(attr_value)) < 100:
param = ET.SubElement(offer, 'param', name=str(attr_name))
param.text = str(attr_value)
# Наличие
stock_status = product.get('availability', '').lower()
if 'нет' in stock_status or 'недоступ' in stock_status:
offer.set('available', 'false')
# URL товара
if product.get('url'):
ET.SubElement(offer, 'url').text = product['url']
return offer
def _local_path_to_url(self, local_path):
"""Преобразует локальный путь в URL"""
# Здесь нужно настроить базовый URL вашего сервера
base_url = self.config.get('feed.images_base_url', 'https://yoursite.com/')
if base_url.endswith('/'):
base_url = base_url[:-1]
# Убираем начальную часть пути
relative_path = str(local_path).replace('\\', '/')
if relative_path.startswith('./'):
relative_path = relative_path[2:]
return f"{base_url}/{relative_path}"
def _save_xml(self, root):
"""Сохраняет XML в файл с красивым форматированием"""
rough_string = ET.tostring(root, encoding='unicode')
reparsed = minidom.parseString(rough_string)
pretty_xml = reparsed.toprettyxml(indent=" ", encoding='utf-8')
with open(self.output_path, 'wb') as f:
f.write(pretty_xml)

126
modules/image_downloader.py Normal file
View File

@@ -0,0 +1,126 @@
# modules/image_downloader.py
"""
Модуль для загрузки изображений
"""
import os
import requests
import hashlib
import logging
from pathlib import Path
from urllib.parse import urlparse
from PIL import Image
import mimetypes
class ImageDownloader:
"""Загрузчик изображений"""
def __init__(self, config):
self.config = config
self.logger = logging.getLogger(__name__)
self.download_path = Path(config.get('images.download_path', 'images'))
self.max_size_mb = config.get('images.max_size_mb', 10)
self.allowed_formats = config.get('images.allowed_formats', ['jpg', 'jpeg', 'png', 'webp'])
self.quality = config.get('images.quality', 85)
# Создаём директорию для изображений
self.download_path.mkdir(parents=True, exist_ok=True)
def download_product_images(self, image_urls, product_id):
    """Download every image of a product into its own sub-folder.

    Args:
        image_urls: iterable of image URLs.
        product_id: identifier used as the folder name.

    Returns:
        List of local file paths (as strings) of the images that were saved.
    """
    target_dir = self.download_path / str(product_id)
    target_dir.mkdir(exist_ok=True)

    saved = []
    for index, image_url in enumerate(image_urls):
        try:
            stored = self._download_image(image_url, target_dir, f"img_{index}")
            if not stored:
                continue
            saved.append(str(stored))
        except Exception as e:
            self.logger.error(f"Failed to download image {image_url}: {e}")
    return saved
def _download_image(self, url, save_dir, filename_prefix):
    """Download one image and return the path of the saved file.

    Args:
        url: image URL.
        save_dir: ``Path`` of the folder to save into.
        filename_prefix: prefix of the generated file name.

    Returns:
        Path of the (new or already existing) file, or None when the image
        is too large, has a disallowed format or could not be fetched.
    """
    max_bytes = self.max_size_mb * 1024 * 1024
    filepath = None
    created = False  # True once this call has started writing the file
    try:
        # The context manager releases the streamed connection on every
        # exit path, including the early returns below.
        with requests.get(url, timeout=30, stream=True) as response:
            response.raise_for_status()

            content_length = response.headers.get('content-length')
            if content_length and int(content_length) > max_bytes:
                self.logger.warning(f"Image too large: {url}")
                return None

            # Drop parameters ("image/jpeg; charset=binary") that would
            # make the MIME lookup fail.
            content_type = response.headers.get('content-type', '').split(';')[0].strip().lower()
            extension = mimetypes.guess_extension(content_type) if content_type else None
            if not extension:
                # Fall back to the extension in the URL, then to .jpg.
                path_ext = Path(urlparse(url).path).suffix.lower()
                extension = path_ext if path_ext in ['.jpg', '.jpeg', '.png', '.webp'] else '.jpg'

            format_name = extension[1:].lower()
            if format_name not in self.allowed_formats:
                self.logger.warning(f"Unsupported format {format_name}: {url}")
                return None

            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            filepath = save_dir / f"{filename_prefix}_{url_hash}{extension}"
            if filepath.exists():
                return filepath  # already downloaded earlier

            created = True
            written = 0
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    written += len(chunk)
                    # Enforce the limit even without a Content-Length header.
                    if written > max_bytes:
                        raise ValueError(f"image exceeds {self.max_size_mb} MB")
                    f.write(chunk)

        self._optimize_image(filepath)
        self.logger.debug(f"Downloaded image: {filepath}")
        return filepath
    except Exception as e:
        self.logger.error(f"Error downloading image {url}: {e}")
        if created and filepath is not None:
            # Do not leave a truncated file that a later run would take
            # for a finished download.
            try:
                filepath.unlink()
            except OSError:
                pass
        return None
def _optimize_image(self, filepath):
    """Flatten transparency, cap the size at 1200x1200 and re-save in place.

    The file is always re-encoded as JPEG with ``self.quality``, whatever
    its extension is. Errors are logged and otherwise ignored.
    """
    try:
        with Image.open(filepath) as img:
            if img.mode in ('RGBA', 'LA', 'P'):
                # Convert to RGBA first so the alpha channel is honoured
                # for every source mode ('LA' used to be pasted without
                # a mask, losing its transparency).
                rgba = img.convert('RGBA')
                background = Image.new('RGB', rgba.size, (255, 255, 255))
                background.paste(rgba, mask=rgba.split()[-1])
                img = background

            max_size = (1200, 1200)
            if img.size[0] > max_size[0] or img.size[1] > max_size[1]:
                # ``Image.Resampling`` only exists in newer Pillow releases;
                # older ones expose LANCZOS on the module itself.
                resampling = getattr(Image, 'Resampling', Image)
                img.thumbnail(max_size, resampling.LANCZOS)

            img.save(filepath, 'JPEG', quality=self.quality, optimize=True)
    except Exception as e:
        self.logger.error(f"Error optimizing image {filepath}: {e}")

402
modules/parser.py Normal file
View File

@@ -0,0 +1,402 @@
# modules/parser.py
"""
Модуль для парсинга товаров с morele.net
"""
import requests
import time
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
class MoreleParser:
"""Парсер для morele.net"""
def __init__(self, config):
    """Create the HTTP session used for all requests.

    Args:
        config: Settings object exposing ``get(key, default)``.
    """
    self.config = config
    self.session = requests.Session()
    self.logger = logging.getLogger(__name__)

    # Browser-like default headers sent with every request.
    default_headers = {
        'User-Agent': config.get('parsing.user_agent'),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    self.session.headers.update(default_headers)
def parse_category(self, category_url):
    """Collect the products of a category, page by page.

    Stops at the first page that yields no products or after
    ``parsing.max_pages`` pages.

    Returns:
        List of parsed product dicts.
    """
    self.logger.info(f"Начинаем парсинг категории: {category_url}")
    collected = []
    page_limit = self.config.get('parsing.max_pages', 100)
    current = 1
    while current <= page_limit:
        self.logger.debug(f"Парсинг страницы {current}")
        found = self._parse_category_page(self._get_page_url(category_url, current))
        if not found:
            self.logger.info(f"Страница {current} пуста, завершаем парсинг категории")
            break
        collected += found
        current += 1
        # Pause between page requests.
        time.sleep(self.config.get('parsing.delay_between_requests', 1))
    self.logger.info(f"Найдено {len(collected)} товаров в категории")
    return collected
def _get_page_url(self, base_url, page):
"""Формирует URL для конкретной страницы"""
if page == 1:
return base_url
# Проверяем, есть ли уже параметры в URL
separator = '&' if '?' in base_url else '?'
return f"{base_url}{separator}page={page}"
def _parse_category_page(self, page_url):
"""Парсит товары с одной страницы категории"""
try:
response = self._make_request(page_url)
if not response:
return []
soup = BeautifulSoup(response.content, 'html.parser')
# Ищем карточки товаров
product_cards = soup.find_all('div', class_='cat-product')
products = []
# Используем многопоточность для парсинга товаров
with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
futures = []
for card in product_cards:
product_url = self._extract_product_url(card)
if product_url:
future = executor.submit(self._parse_product_page, product_url)
futures.append(future)
for future in as_completed(futures):
try:
product = future.result()
if product:
products.append(product)
except Exception as e:
self.logger.error(f"Ошибка при парсинге товара: {e}")
return products
except Exception as e:
self.logger.error(f"Ошибка при парсинге страницы {page_url}: {e}")
return []
def _extract_product_url(self, card):
"""Извлекает URL товара из карточки"""
try:
link = card.find('a', href=True)
if link:
href = link['href']
if not href.startswith('http'):
href = urljoin('https://www.morele.net', href)
return href
except Exception as e:
self.logger.error(f"Ошибка при извлечении URL товара: {e}")
return None
def _parse_product_page(self, product_url):
"""Парсит детальную страницу товара"""
try:
response = self._make_request(product_url)
if not response:
return None
soup = BeautifulSoup(response.content, 'html.parser')
# Извлекаем данные товара
product = {
'url': product_url,
'id': self._extract_product_id(product_url),
'title': self._extract_title(soup),
'price': self._extract_price(soup),
'availability': self._extract_availability(soup),
'description': self._extract_description(soup),
'attributes': self._extract_attributes(soup),
'category': self._extract_category(soup),
'images': self._extract_images(soup),
'brand': self._extract_brand(soup),
'model': self._extract_model(soup),
'sku': self._extract_sku(soup),
'parsed_at': time.time()
}
# Генерируем хеш для определения изменений
product['content_hash'] = self._generate_content_hash(product)
return product
except Exception as e:
self.logger.error(f"Ошибка при парсинге товара {product_url}: {e}")
return None
def _extract_product_id(self, url):
"""Извлекает ID товара из URL"""
# Ищем числовой ID в URL
match = re.search(r'/(\d+)-', url)
if match:
return match.group(1)
# Если не найден, используем хеш URL
return hashlib.md5(url.encode()).hexdigest()[:10]
def _extract_title(self, soup):
"""Извлекает название товара"""
selectors = [
'h1.prod-name',
'h1[data-test="product-name"]',
'h1.product-title',
'.product-name h1',
'h1'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return "Без названия"
def _extract_price(self, soup):
"""Извлекает цену товара"""
selectors = [
'.price-new',
'.price-main',
'[data-test="product-price"]',
'.product-price .price',
'.price'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
price_text = element.get_text(strip=True)
# Извлекаем числовое значение
price_match = re.search(r'([\d\s]+[,.]?\d*)', price_text.replace(' ', ''))
if price_match:
price_str = price_match.group(1).replace(' ', '').replace(',', '.')
try:
return float(price_str)
except ValueError:
continue
return 0.0
def _extract_availability(self, soup):
"""Извлекает информацию о наличии"""
selectors = [
'.availability',
'[data-test="product-availability"]',
'.product-availability',
'.stock-info'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
availability_text = element.get_text(strip=True).lower()
if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
return 'в наличии'
elif any(word in availability_text for word in ['brak', 'niedostępny']):
return 'нет в наличии'
else:
return availability_text
return 'неизвестно'
def _extract_description(self, soup):
"""Извлекает описание товара"""
selectors = [
'.product-description',
'[data-test="product-description"]',
'.prod-description',
'.description',
'.product-details .description'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
# Удаляем HTML теги и лишние пробелы
description = element.get_text(separator=' ', strip=True)
return re.sub(r'\s+', ' ', description)
return ""
def _extract_attributes(self, soup):
"""Извлекает характеристики товара"""
attributes = {}
# Различные селекторы для характеристик
specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))
for section in specs_sections:
# Ищем пары ключ-значение
rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))
for row in rows:
# Пытаемся найти название и значение
name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))
if name_elem and value_elem:
name = name_elem.get_text(strip=True)
value = value_elem.get_text(strip=True)
if name and value and len(name) < 100:
attributes[name] = value
return attributes
def _extract_category(self, soup):
"""Извлекает категорию товара"""
# Ищем хлебные крошки
breadcrumb_selectors = [
'.breadcrumb',
'.breadcrumbs',
'[data-test="breadcrumb"]',
'.navigation-path'
]
for selector in breadcrumb_selectors:
breadcrumb = soup.select_one(selector)
if breadcrumb:
links = breadcrumb.find_all('a')
if len(links) > 1: # Пропускаем "Главная"
return links[-1].get_text(strip=True)
return "Без категории"
def _extract_images(self, soup):
"""Извлекает изображения товара"""
images = []
# Селекторы для изображений
img_selectors = [
'.product-gallery img',
'.product-images img',
'[data-test="product-image"]',
'.gallery img',
'.product-photo img'
]
for selector in img_selectors:
imgs = soup.select(selector)
for img in imgs:
src = img.get('src') or img.get('data-src') or img.get('data-lazy')
if src:
if not src.startswith('http'):
src = urljoin('https://www.morele.net', src)
# Фильтруем маленькие изображения
if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
images.append(src)
return images[:10] # Ограничиваем количество изображений
def _extract_brand(self, soup):
"""Извлекает бренд товара"""
selectors = [
'[data-test="product-brand"]',
'.product-brand',
'.brand',
'.manufacturer'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _extract_model(self, soup):
"""Извлекает модель товара"""
selectors = [
'[data-test="product-model"]',
'.product-model',
'.model'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _extract_sku(self, soup):
"""Извлекает артикул товара"""
selectors = [
'[data-test="product-sku"]',
'.product-sku',
'.sku',
'.article-number',
'.product-code'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return ""
def _generate_content_hash(self, product):
"""Генерирует хеш содержимого товара для определения изменений"""
content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
return hashlib.md5(content.encode('utf-8')).hexdigest()
def _make_request(self, url, retries=None):
"""Выполняет HTTP запрос с повторными попытками"""
if retries is None:
retries = self.config.get('parsing.max_retries', 3)
for attempt in range(retries + 1):
try:
response = self.session.get(
url,
timeout=self.config.get('parsing.timeout', 30)
)
if response.status_code == 200:
return response
elif response.status_code == 429: # Too Many Requests
wait_time = (attempt + 1) * 5
self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
time.sleep(wait_time)
else:
self.logger.warning(f"HTTP {response.status_code} for {url}")
except requests.RequestException as e:
self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")
if attempt < retries:
time.sleep((attempt + 1) * 2)
self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
return None

272
modules/storage.py Normal file
View File

@@ -0,0 +1,272 @@
# modules/storage.py
"""
Модуль для работы с хранилищем данных
"""
import sqlite3
import json
import logging
from datetime import datetime
from pathlib import Path
class StorageManager:
    """SQLite-backed storage for products, categories, translations and logs.

    Every public method works on its own short-lived connection that is
    committed on success, rolled back on error and always closed.
    """

    # Product columns that hold JSON text, mapped to the JSON literal that is
    # decoded when the stored value is NULL or empty.
    _JSON_COLUMNS = {
        'attributes': '{}',
        'attributes_ua': '{}',
        'images': '[]',
        'local_images': '[]',
    }

    # Product columns written by both save_product() and update_product(),
    # in the order produced by _content_values().
    _CONTENT_COLUMNS = (
        'title', 'title_ua', 'price', 'availability',
        'description', 'description_ua', 'attributes', 'attributes_ua',
        'category', 'brand', 'model', 'sku', 'images', 'local_images',
        'content_hash', 'is_translated', 'updated_at',
    )

    def __init__(self, config):
        """Read the database settings and make sure the schema exists.

        Args:
            config: Settings object exposing ``get(key, default)``.

        Raises:
            NotImplementedError: If ``database.type`` is not ``'sqlite'``.
        """
        self.config = config
        self.logger = logging.getLogger(__name__)

        self.db_type = config.get('database.type', 'sqlite')
        if self.db_type != 'sqlite':
            raise NotImplementedError("Пока поддерживается только SQLite")

        self.db_path = config.get('database.sqlite_path', 'data/morele_parser.db')
        # The directory holding the database file may not exist yet.
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        self._init_sqlite()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _run(self, sql, params=()):
        """Execute one statement on a fresh connection and return its rows.

        Args:
            sql: A single SQL statement with ``?`` placeholders.
            params: Values bound to the placeholders.

        Returns:
            A list of dicts, one per result row (empty for statements that
            produce no rows).
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.row_factory = sqlite3.Row
            # ``with conn`` commits or rolls back but does not close the
            # connection, hence the explicit close() below.
            with conn:
                return [dict(row) for row in conn.execute(sql, params).fetchall()]
        finally:
            conn.close()

    @classmethod
    def _decode_product(cls, product):
        """Decode the JSON text columns of a product row dict in place."""
        for column, empty_literal in cls._JSON_COLUMNS.items():
            product[column] = json.loads(product[column] or empty_literal)
        return product

    @staticmethod
    def _content_values(product):
        """Return the values for ``_CONTENT_COLUMNS`` taken from ``product``.

        ``title``, ``price``, ``availability`` and ``description`` are
        required keys; the rest fall back to empty defaults. ``updated_at``
        is set to the current local time in ISO format.
        """
        return (
            product['title'],
            product.get('title_ua', ''),
            product['price'],
            product['availability'],
            product['description'],
            product.get('description_ua', ''),
            json.dumps(product.get('attributes', {}), ensure_ascii=False),
            json.dumps(product.get('attributes_ua', {}), ensure_ascii=False),
            product.get('category', ''),
            product.get('brand', ''),
            product.get('model', ''),
            product.get('sku', ''),
            json.dumps(product.get('images', [])),
            json.dumps(product.get('local_images', [])),
            product.get('content_hash', ''),
            product.get('is_translated', False),
            datetime.now().isoformat(),
        )

    def _init_sqlite(self):
        """Create the tables and indexes if they do not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                conn.executescript("""
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        external_id TEXT UNIQUE NOT NULL,
                        url TEXT NOT NULL,
                        title TEXT NOT NULL,
                        title_ua TEXT,
                        price REAL NOT NULL,
                        availability TEXT,
                        description TEXT,
                        description_ua TEXT,
                        attributes TEXT,
                        attributes_ua TEXT,
                        category TEXT,
                        brand TEXT,
                        model TEXT,
                        sku TEXT,
                        images TEXT,
                        local_images TEXT,
                        content_hash TEXT,
                        is_translated BOOLEAN DEFAULT 0,
                        is_active BOOLEAN DEFAULT 1,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    );
                    CREATE TABLE IF NOT EXISTS categories (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        name TEXT NOT NULL,
                        url TEXT UNIQUE NOT NULL,
                        is_active BOOLEAN DEFAULT 1,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    );
                    CREATE TABLE IF NOT EXISTS translation_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        original_text TEXT UNIQUE NOT NULL,
                        translated_text TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    );
                    CREATE TABLE IF NOT EXISTS parsing_logs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        category_url TEXT,
                        products_found INTEGER,
                        products_new INTEGER,
                        products_updated INTEGER,
                        errors_count INTEGER,
                        started_at TIMESTAMP,
                        completed_at TIMESTAMP
                    );
                    CREATE INDEX IF NOT EXISTS idx_products_external_id ON products(external_id);
                    CREATE INDEX IF NOT EXISTS idx_products_url ON products(url);
                    CREATE INDEX IF NOT EXISTS idx_translation_cache_original ON translation_cache(original_text);
                """)
        finally:
            conn.close()

    # ------------------------------------------------------------------
    # Products
    # ------------------------------------------------------------------

    def save_product(self, product):
        """Insert a product, or update the row with the same external id.

        ``product['id']`` is stored as ``external_id``. An existing row is
        updated in place, so its primary key and ``created_at`` survive
        (``INSERT OR REPLACE`` would delete and re-create the row). As with
        the previous replace-based behaviour, a re-saved product is marked
        active again. Uses the UPSERT syntax of SQLite 3.24+.

        Args:
            product: Product dict; ``id``, ``url``, ``title``, ``price``,
                ``availability`` and ``description`` are required.
        """
        columns = ('external_id', 'url') + self._CONTENT_COLUMNS
        column_list = ', '.join(columns)
        placeholders = ', '.join('?' for _ in columns)
        # Column names come from class constants, never from input.
        assignments = ', '.join(f"{name} = excluded.{name}" for name in columns[1:])
        self._run(
            f"INSERT INTO products ({column_list}) VALUES ({placeholders}) "
            f"ON CONFLICT(external_id) DO UPDATE SET {assignments}, is_active = 1",
            (product['id'], product['url']) + self._content_values(product),
        )

    def get_product_by_url(self, url):
        """Return the first product stored under ``url``, or ``None``."""
        rows = self._run("SELECT * FROM products WHERE url = ? LIMIT 1", (url,))
        return self._decode_product(rows[0]) if rows else None

    def get_product_by_id(self, product_id):
        """Return the product with primary key ``product_id``, or ``None``."""
        rows = self._run("SELECT * FROM products WHERE id = ? LIMIT 1", (product_id,))
        return self._decode_product(rows[0]) if rows else None

    def update_product(self, product_id, product_data):
        """Overwrite the content columns of the product ``product_id``.

        ``url`` and ``external_id`` are left untouched; ``updated_at`` is
        set to the current time.
        """
        assignments = ', '.join(f"{name} = ?" for name in self._CONTENT_COLUMNS)
        self._run(
            f"UPDATE products SET {assignments} WHERE id = ?",
            self._content_values(product_data) + (product_id,),
        )

    def get_products_for_feed(self):
        """Return active, translated products with a positive price.

        Rows are ordered by ``updated_at``, newest first, with their JSON
        columns decoded.
        """
        rows = self._run("""
            SELECT * FROM products
            WHERE is_active = 1 AND is_translated = 1 AND price > 0
            ORDER BY updated_at DESC
        """)
        return [self._decode_product(row) for row in rows]

    # ------------------------------------------------------------------
    # Categories
    # ------------------------------------------------------------------

    def get_active_categories(self):
        """Return every category whose ``is_active`` flag is set."""
        return self._run("SELECT * FROM categories WHERE is_active = 1")

    def add_category(self, name, url):
        """Add a category, or rename and re-activate the one with this URL.

        The existing row keeps its primary key, so ids already handed out
        stay valid after a category is added again.
        """
        self._run(
            """
            INSERT INTO categories (name, url) VALUES (?, ?)
            ON CONFLICT(url) DO UPDATE SET name = excluded.name, is_active = 1
            """,
            (name, url),
        )

    def deactivate_category(self, category_id):
        """Clear the ``is_active`` flag of the category ``category_id``."""
        self._run("UPDATE categories SET is_active = 0 WHERE id = ?", (category_id,))

    # ------------------------------------------------------------------
    # Translation cache
    # ------------------------------------------------------------------

    def get_translation_from_cache(self, original_text):
        """Return the cached translation of ``original_text``, or ``None``."""
        rows = self._run(
            "SELECT translated_text FROM translation_cache WHERE original_text = ?",
            (original_text,),
        )
        return rows[0]['translated_text'] if rows else None

    def save_translation_to_cache(self, original_text, translated_text):
        """Store a translation, replacing any entry for the same text."""
        self._run(
            """
            INSERT OR REPLACE INTO translation_cache (original_text, translated_text)
            VALUES (?, ?)
            """,
            (original_text, translated_text),
        )

    # ------------------------------------------------------------------
    # Parsing logs
    # ------------------------------------------------------------------

    def log_parsing_session(self, category_url, stats):
        """Record the outcome of one parsing run.

        Args:
            category_url: URL of the category that was parsed.
            stats: Dict with optional keys ``found``, ``new``, ``updated``,
                ``errors`` (each defaulting to 0), ``started_at`` and
                ``completed_at``.
        """
        self._run(
            """
            INSERT INTO parsing_logs
            (category_url, products_found, products_new, products_updated,
             errors_count, started_at, completed_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                category_url,
                stats.get('found', 0),
                stats.get('new', 0),
                stats.get('updated', 0),
                stats.get('errors', 0),
                stats.get('started_at'),
                stats.get('completed_at'),
            ),
        )

    def get_parsing_stats(self, days=30):
        """Return parsing log rows completed within the last ``days`` days.

        Rows are ordered by ``completed_at``, newest first. The day count is
        passed as a bound parameter, so it can never alter the statement.
        """
        # NOTE(review): SQLite's 'now' is UTC and the comparison is textual;
        # confirm callers store completed_at in a matching format.
        return self._run(
            """
            SELECT * FROM parsing_logs
            WHERE completed_at > datetime('now', ?)
            ORDER BY completed_at DESC
            """,
            (f"-{days} days",),
        )

154
modules/translator.py Normal file
View File

@@ -0,0 +1,154 @@
# modules/translator.py
"""
Модуль для перевода текста с кешем
"""
import hashlib
import logging
import time
from abc import ABC, abstractmethod
class TranslationProvider(ABC):
    """Interface that every translation backend implements."""

    @abstractmethod
    def translate(self, text, source_lang, target_lang):
        """Translate ``text`` from ``source_lang`` into ``target_lang``.

        Concrete providers must override this method; the base
        implementation does nothing and returns ``None``.
        """
class GoogleTranslateProvider(TranslationProvider):
    """Provider that translates through the ``googletrans`` library."""

    def __init__(self, api_key):
        # The key is only stored; translate() below does not pass it on.
        self.api_key = api_key

    def translate(self, text, source_lang='pl', target_lang='uk'):
        """Translate ``text`` and return the translated string.

        Any failure (including a missing ``googletrans`` package) is logged
        and then re-raised to the caller.
        """
        try:
            # Imported here so the package is only needed when this
            # provider is actually used.
            from googletrans import Translator

            outcome = Translator().translate(text, src=source_lang, dest=target_lang)
            return outcome.text
        except Exception as exc:
            logging.error(f"Google Translate error: {exc}")
            raise
class DeepLProvider(TranslationProvider):
    """Provider that translates through the ``deepl`` client library."""

    def __init__(self, api_key):
        self.api_key = api_key

    def translate(self, text, source_lang='PL', target_lang='UK'):
        """Translate ``text`` and return the translated string.

        A new ``deepl.Translator`` is created from the stored API key on
        every call. Any failure (including a missing ``deepl`` package) is
        logged and then re-raised to the caller.
        """
        try:
            # Imported here so the package is only needed when this
            # provider is actually used.
            import deepl

            client = deepl.Translator(self.api_key)
            outcome = client.translate_text(
                text,
                source_lang=source_lang,
                target_lang=target_lang,
            )
            return outcome.text
        except Exception as exc:
            logging.error(f"DeepL error: {exc}")
            raise
class LibreTranslateProvider(TranslationProvider):
    """Provider that posts to a LibreTranslate server's ``/translate`` endpoint."""

    def __init__(self, url, api_key=None, timeout=30):
        """Store the connection settings.

        Args:
            url: Base URL of the LibreTranslate server.
            api_key: Optional API key, sent with each request when set.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.api_key = api_key
        self.timeout = timeout

    def translate(self, text, source_lang='pl', target_lang='uk'):
        """Translate ``text`` and return the translated string.

        Any failure (network error, timeout, HTTP error status, unexpected
        response body) is logged and then re-raised to the caller.
        """
        try:
            # Imported here so the package is only needed when this
            # provider is actually used.
            import requests

            payload = {
                'q': text,
                'source': source_lang,
                'target': target_lang,
                'format': 'text',
            }
            if self.api_key:
                payload['api_key'] = self.api_key

            # A trailing slash on the base URL would yield "//translate".
            endpoint = f"{self.url.rstrip('/')}/translate"
            # Without a timeout a stalled server would block the caller forever.
            response = requests.post(endpoint, data=payload, timeout=self.timeout)
            response.raise_for_status()
            return response.json()['translatedText']
        except Exception as e:
            logging.error(f"LibreTranslate error: {e}")
            raise
class TranslationService:
    """Translate text through the configured provider, with an optional cache."""

    def __init__(self, config, storage):
        """Create the provider and read the translation settings.

        Args:
            config: Settings object exposing ``get(key, default)``.
            storage: Object providing ``get_translation_from_cache`` and
                ``save_translation_to_cache``.

        Raises:
            ValueError: If the configured service is unsupported or lacks a
                required setting.
        """
        self.config = config
        self.storage = storage
        self.logger = logging.getLogger(__name__)

        self.provider = self._init_provider()

        self.cache_enabled = config.get('translation.cache_enabled', True)
        # NOTE(review): the language codes are read from the ``google``
        # section whichever service is selected; confirm this is intended.
        self.source_lang = config.get('translation.google.source_lang', 'pl')
        self.target_lang = config.get('translation.google.target_lang', 'uk')

    def _init_provider(self):
        """Build the provider named by the ``translation.service`` setting.

        Returns:
            A provider instance for ``google`` (the default), ``deepl`` or
            ``libretranslate``.

        Raises:
            ValueError: If the service name is unknown, the DeepL API key is
                missing, or the LibreTranslate URL is missing.
        """
        get = self.config.get
        service = get('translation.service', 'google')

        if service == 'google':
            api_key = get('translation.google.api_key')
            if not api_key:
                self.logger.warning("Google Translate API key not found, using googletrans library")
            return GoogleTranslateProvider(api_key)

        if service == 'deepl':
            api_key = get('translation.deepl.api_key')
            if not api_key:
                raise ValueError("DeepL API key is required")
            return DeepLProvider(api_key)

        if service == 'libretranslate':
            url = get('translation.libretranslate.url')
            # Fail at start-up: without a URL every later request would
            # fail and translate() would silently return untranslated text.
            if not url:
                raise ValueError("LibreTranslate URL is required")
            return LibreTranslateProvider(url, get('translation.libretranslate.api_key'))

        raise ValueError(f"Unsupported translation service: {service}")

    def translate(self, text):
        """Translate ``text``, consulting the cache first when it is enabled.

        Args:
            text: Text to translate.

        Returns:
            The translation of the stripped text. Empty, ``None`` or
            whitespace-only input is returned unchanged. If the provider
            fails, the stripped original text is returned instead.
        """
        if not text or not text.strip():
            return text

        text = text.strip()

        if self.cache_enabled:
            cached = self.storage.get_translation_from_cache(text)
            if cached:
                return cached

        try:
            translated = self.provider.translate(text, self.source_lang, self.target_lang)

            if self.cache_enabled and translated:
                self.storage.save_translation_to_cache(text, translated)

            # Short pause after each provider call to stay under API rate limits.
            time.sleep(0.1)
            return translated
        except Exception as e:
            self.logger.error(f"Translation failed for text '{text[:50]}...': {e}")
            # Fall back to the original text so the caller still gets a value.
            return text