first commit
This commit is contained in:
4
modules/__init__.py
Normal file
4
modules/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# modules/__init__.py
|
||||
"""
|
||||
Модули парсера morele.net
|
||||
"""
|
||||
362
modules/admin.py
Normal file
362
modules/admin.py
Normal file
@@ -0,0 +1,362 @@
|
||||
# modules/admin.py
|
||||
"""
|
||||
Простая веб-админка для управления парсером
|
||||
"""
|
||||
|
||||
from flask import Flask, render_template_string, request, redirect, url_for, flash, jsonify
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
class AdminPanel:
|
||||
"""Простая веб-админка"""
|
||||
|
||||
def __init__(self, config, storage):
    """Create the Flask application that backs the admin panel.

    Args:
        config: Settings object exposing ``get(key, default)``.
        storage: Storage backend queried by the route handlers.

    The session signing key is read from the ``admin.secret_key`` setting
    or, failing that, from the ``ADMIN_SECRET_KEY`` environment variable.
    When neither is provided a random per-process key is generated, so the
    application never runs with a constant that is visible in the source
    (which would allow anyone to forge session cookies).
    """
    import os
    import secrets

    self.config = config
    self.storage = storage
    self.app = Flask(__name__)
    self.app.secret_key = (
        config.get('admin.secret_key', None)
        or os.environ.get('ADMIN_SECRET_KEY')
        or secrets.token_hex(32)
    )

    self._setup_routes()
|
||||
|
||||
def _setup_routes(self):
|
||||
"""Настройка маршрутов"""
|
||||
|
||||
@self.app.route('/')
|
||||
def index():
|
||||
"""Главная страница"""
|
||||
categories = self.storage.get_active_categories()
|
||||
stats = self.storage.get_parsing_stats(7) # За неделю
|
||||
|
||||
return render_template_string(self.INDEX_TEMPLATE,
|
||||
categories=categories,
|
||||
stats=stats)
|
||||
|
||||
@self.app.route('/categories')
|
||||
def categories():
|
||||
"""Страница управления категориями"""
|
||||
categories = self.storage.get_active_categories()
|
||||
return render_template_string(self.CATEGORIES_TEMPLATE, categories=categories)
|
||||
|
||||
@self.app.route('/add_category', methods=['POST'])
|
||||
def add_category():
|
||||
"""Добавление категории"""
|
||||
name = request.form.get('name')
|
||||
url = request.form.get('url')
|
||||
|
||||
if name and url:
|
||||
try:
|
||||
self.storage.add_category(name, url)
|
||||
flash('Категория добавлена успешно', 'success')
|
||||
except Exception as e:
|
||||
flash(f'Ошибка при добавлении категории: {e}', 'error')
|
||||
else:
|
||||
flash('Заполните все поля', 'error')
|
||||
|
||||
return redirect(url_for('categories'))
|
||||
|
||||
@self.app.route('/deactivate_category/<int:category_id>')
|
||||
def deactivate_category(category_id):
|
||||
"""Деактивация категории"""
|
||||
try:
|
||||
self.storage.deactivate_category(category_id)
|
||||
flash('Категория деактивирована', 'success')
|
||||
except Exception as e:
|
||||
flash(f'Ошибка: {e}', 'error')
|
||||
|
||||
return redirect(url_for('categories'))
|
||||
|
||||
@self.app.route('/products')
|
||||
def products():
|
||||
"""Страница товаров"""
|
||||
page = int(request.args.get('page', 1))
|
||||
per_page = 50
|
||||
|
||||
# Здесь можно добавить пагинацию
|
||||
products = self.storage.get_products_for_feed()[:per_page]
|
||||
|
||||
return render_template_string(self.PRODUCTS_TEMPLATE, products=products)
|
||||
|
||||
@self.app.route('/api/stats')
|
||||
def api_stats():
|
||||
"""API для получения статистики"""
|
||||
stats = self.storage.get_parsing_stats(30)
|
||||
return jsonify(stats)
|
||||
|
||||
def run(self, host='127.0.0.1', port=5000):
    """Start the admin web server on ``host``:``port`` with debug mode off."""
    server_options = {'host': host, 'port': port, 'debug': False}
    self.app.run(**server_options)
|
||||
|
||||
# HTML шаблоны
|
||||
INDEX_TEMPLATE = '''
|
||||
<!DOCTYPE html>
|
||||
<html lang="ru">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Morele.net Parser - Админка</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
|
||||
.container { max-width: 1200px; margin: 0 auto; }
|
||||
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
.nav { margin-bottom: 20px; }
|
||||
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
|
||||
.nav a:hover { background: #0056b3; }
|
||||
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
.stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; }
|
||||
.stat-item { text-align: center; padding: 20px; background: #f8f9fa; border-radius: 8px; }
|
||||
.stat-number { font-size: 2em; font-weight: bold; color: #007bff; }
|
||||
table { width: 100%; border-collapse: collapse; }
|
||||
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
|
||||
th { background: #f8f9fa; }
|
||||
.status-active { color: #28a745; font-weight: bold; }
|
||||
.status-inactive { color: #dc3545; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>🔍 Morele.net Parser - Админка</h1>
|
||||
<div class="nav">
|
||||
<a href="{{ url_for('index') }}">Главная</a>
|
||||
<a href="{{ url_for('categories') }}">Категории</a>
|
||||
<a href="{{ url_for('products') }}">Товары</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>📊 Статистика</h2>
|
||||
<div class="stats">
|
||||
<div class="stat-item">
|
||||
<div class="stat-number">{{ categories|length }}</div>
|
||||
<div>Активных категорий</div>
|
||||
</div>
|
||||
<div class="stat-item">
|
||||
<div class="stat-number">{{ stats|length }}</div>
|
||||
<div>Сессий парсинга за неделю</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>📋 Последние сессии парсинга</h2>
|
||||
{% if stats %}
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Дата</th>
|
||||
<th>Категория</th>
|
||||
<th>Найдено</th>
|
||||
<th>Новых</th>
|
||||
<th>Обновлено</th>
|
||||
<th>Ошибок</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for stat in stats[:10] %}
|
||||
<tr>
|
||||
<td>{{ stat.completed_at }}</td>
|
||||
<td>{{ stat.category_url }}</td>
|
||||
<td>{{ stat.products_found }}</td>
|
||||
<td>{{ stat.products_new }}</td>
|
||||
<td>{{ stat.products_updated }}</td>
|
||||
<td>{{ stat.errors_count }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<p>Пока нет данных о парсинге</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
CATEGORIES_TEMPLATE = '''
|
||||
<!DOCTYPE html>
|
||||
<html lang="ru">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Категории - Morele.net Parser</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
|
||||
.container { max-width: 1200px; margin: 0 auto; }
|
||||
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
.nav { margin-bottom: 20px; }
|
||||
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
|
||||
.nav a:hover { background: #0056b3; }
|
||||
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
.form-group { margin-bottom: 15px; }
|
||||
.form-group label { display: block; margin-bottom: 5px; font-weight: bold; }
|
||||
.form-group input { width: 100%; padding: 10px; border: 1px solid #ddd; border-radius: 4px; box-sizing: border-box; }
|
||||
.btn { background: #28a745; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
|
||||
.btn:hover { background: #218838; }
|
||||
.btn-danger { background: #dc3545; }
|
||||
.btn-danger:hover { background: #c82333; }
|
||||
table { width: 100%; border-collapse: collapse; }
|
||||
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
|
||||
th { background: #f8f9fa; }
|
||||
.alert { padding: 15px; margin-bottom: 20px; border-radius: 4px; }
|
||||
.alert-success { background: #d4edda; border: 1px solid #c3e6cb; color: #155724; }
|
||||
.alert-error { background: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>📁 Управление категориями</h1>
|
||||
<div class="nav">
|
||||
<a href="{{ url_for('index') }}">Главная</a>
|
||||
<a href="{{ url_for('categories') }}">Категории</a>
|
||||
<a href="{{ url_for('products') }}">Товары</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% with messages = get_flashed_messages(with_categories=true) %}
|
||||
{% if messages %}
|
||||
{% for category, message in messages %}
|
||||
<div class="alert alert-{{ category }}">{{ message }}</div>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
{% endwith %}
|
||||
|
||||
<div class="card">
|
||||
<h2>➕ Добавить категорию</h2>
|
||||
<form method="POST" action="{{ url_for('add_category') }}">
|
||||
<div class="form-group">
|
||||
<label for="name">Название категории:</label>
|
||||
<input type="text" id="name" name="name" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="url">URL категории на morele.net:</label>
|
||||
<input type="url" id="url" name="url" required placeholder="https://www.morele.net/kategoria/...">
|
||||
</div>
|
||||
<button type="submit" class="btn">Добавить категорию</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>📋 Список категорий</h2>
|
||||
{% if categories %}
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Название</th>
|
||||
<th>URL</th>
|
||||
<th>Дата добавления</th>
|
||||
<th>Действия</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for category in categories %}
|
||||
<tr>
|
||||
<td>{{ category.id }}</td>
|
||||
<td>{{ category.name }}</td>
|
||||
<td><a href="{{ category.url }}" target="_blank">{{ category.url[:50] }}...</a></td>
|
||||
<td>{{ category.created_at }}</td>
|
||||
<td>
|
||||
<a href="{{ url_for('deactivate_category', category_id=category.id) }}"
|
||||
class="btn btn-danger"
|
||||
onclick="return confirm('Уверены, что хотите деактивировать эту категорию?')">
|
||||
Деактивировать
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<p>Пока нет добавленных категорий. Добавьте первую категорию выше.</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
PRODUCTS_TEMPLATE = '''
|
||||
<!DOCTYPE html>
|
||||
<html lang="ru">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Товары - Morele.net Parser</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }
|
||||
.container { max-width: 1400px; margin: 0 auto; }
|
||||
.header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
.nav { margin-bottom: 20px; }
|
||||
.nav a { background: #007bff; color: white; padding: 10px 15px; text-decoration: none; margin-right: 10px; border-radius: 4px; }
|
||||
.nav a:hover { background: #0056b3; }
|
||||
.card { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
||||
table { width: 100%; border-collapse: collapse; font-size: 14px; }
|
||||
th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
|
||||
th { background: #f8f9fa; position: sticky; top: 0; }
|
||||
.product-title { max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||||
.product-image { width: 50px; height: 50px; object-fit: cover; border-radius: 4px; }
|
||||
.price { font-weight: bold; color: #28a745; }
|
||||
.status-available { color: #28a745; }
|
||||
.status-unavailable { color: #dc3545; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>📦 Товары</h1>
|
||||
<div class="nav">
|
||||
<a href="{{ url_for('index') }}">Главная</a>
|
||||
<a href="{{ url_for('categories') }}">Категории</a>
|
||||
<a href="{{ url_for('products') }}">Товары</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>📋 Список товаров</h2>
|
||||
{% if products %}
|
||||
<p>Показано товаров: {{ products|length }}</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Изображение</th>
|
||||
<th>Название (UA)</th>
|
||||
<th>Цена (PLN)</th>
|
||||
<th>Наличие</th>
|
||||
<th>Категория</th>
|
||||
<th>Обновлено</th>
|
||||
<th>Ссылка</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for product in products %}
|
||||
<tr>
|
||||
<td>
|
||||
{% if product.local_images %}
|
||||
<img src="{{ product.local_images[0] }}" class="product-image" alt="Product image">
|
||||
{% else %}
|
||||
<div style="width: 50px; height: 50px; background: #f8f9fa; border-radius: 4px;"></div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="product-title" title="{{ product.title_ua }}">{{ product.title_ua }}</td>
|
||||
<td class="price">{{ "%.2f"|format(product.price) }} PLN</td>
|
||||
<td class="{% if 'наличии' in product.availability %}status-available{% else %}status-unavailable{% endif %}">
|
||||
{{ product.availability }}
|
||||
</td>
|
||||
<td>{{ product.category }}</td>
|
||||
<td>{{ product.updated_at[:16] }}</td>
|
||||
<td><a href="{{ product.url }}" target="_blank">Открыть</a></td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<p>Пока нет товаров. Запустите парсинг для получения товаров.</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
210
modules/feed_generator.py
Normal file
210
modules/feed_generator.py
Normal file
@@ -0,0 +1,210 @@
|
||||
# modules/feed_generator.py
|
||||
"""
|
||||
Модуль для генерации YML фида для Prom.ua
|
||||
"""
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.dom import minidom
|
||||
import requests
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class FeedGenerator:
|
||||
"""Генератор YML фида для Prom.ua"""
|
||||
|
||||
def __init__(self, config, storage):
|
||||
self.config = config
|
||||
self.storage = storage
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.output_path = Path(config.get('feed.output_path', 'feeds/prom_feed.yml'))
|
||||
self.shop_name = config.get('feed.shop_name', 'Ваш магазин')
|
||||
self.company = config.get('feed.company', 'Ваша компания')
|
||||
self.currency = config.get('feed.currency', 'UAH')
|
||||
self.margin_percent = config.get('feed.margin_percent', 10)
|
||||
|
||||
# Создаём директорию для фидов
|
||||
self.output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def generate_yml_feed(self):
    """Build the YML catalogue from stored products and write it to disk.

    Steps: load products, resolve the PLN->UAH rate, emit shop metadata,
    currencies and categories, then one ``<offer>`` per product. A product
    whose offer cannot be built is logged and skipped; it never aborts the
    whole feed. Returns ``None``; nothing is written when there are no
    products.
    """
    self.logger.info("Начинаем генерацию YML фида...")

    products = self.storage.get_products_for_feed()

    if not products:
        self.logger.warning("Нет товаров для генерации фида")
        return

    pln_to_uah_rate = self._get_exchange_rate()

    # Root of the document
    yml_catalog = ET.Element('yml_catalog', date=datetime.now().strftime('%Y-%m-%d %H:%M'))
    shop = ET.SubElement(yml_catalog, 'shop')

    # Shop information
    ET.SubElement(shop, 'name').text = self.shop_name
    ET.SubElement(shop, 'company').text = self.company
    ET.SubElement(shop, 'url').text = self.config.get('feed.shop_url', 'https://example.com')

    # Currencies
    currencies = ET.SubElement(shop, 'currencies')
    ET.SubElement(currencies, 'currency', id=self.currency, rate='1')

    # Categories
    categories_elem = ET.SubElement(shop, 'categories')
    categories_map = self._build_categories(products, categories_elem)

    # Offers
    offers = ET.SubElement(shop, 'offers')

    for product in products:
        try:
            offer = self._create_offer(product, pln_to_uah_rate, categories_map)
            if offer is not None:
                offers.append(offer)
        except Exception as e:
            # Look the identifier up defensively: indexing a missing key
            # here would raise inside the handler and kill the whole run.
            product_ref = (product.get('id')
                           or product.get('external_id')
                           or product.get('url'))
            self.logger.error(f"Ошибка при создании оффера для товара {product_ref}: {e}")

    self._save_xml(yml_catalog)

    self.logger.info(f"YML фид сгенерирован: {self.output_path}")
    self.logger.info(f"Товаров в фиде: {len(offers)}")
|
||||
|
||||
def _get_exchange_rate(self):
|
||||
"""Получает курс PLN к UAH"""
|
||||
rate_setting = self.config.get('feed.pln_to_uah_rate', 'auto')
|
||||
|
||||
if isinstance(rate_setting, (int, float)):
|
||||
return float(rate_setting)
|
||||
|
||||
try:
|
||||
# Пытаемся получить актуальный курс
|
||||
response = requests.get(
|
||||
'https://api.exchangerate-api.com/v4/latest/PLN',
|
||||
timeout=10
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
rate = data['rates'].get('UAH', 10.0) # Fallback курс
|
||||
|
||||
self.logger.info(f"Получен курс PLN/UAH: {rate}")
|
||||
return rate
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Ошибка получения курса валют: {e}")
|
||||
return 10.0 # Fallback курс
|
||||
|
||||
def _build_categories(self, products, categories_elem):
|
||||
"""Строит список категорий"""
|
||||
categories = set()
|
||||
|
||||
for product in products:
|
||||
if product.get('category'):
|
||||
categories.add(product['category'])
|
||||
|
||||
categories_map = {}
|
||||
category_id = 1
|
||||
|
||||
for category_name in sorted(categories):
|
||||
category_elem = ET.SubElement(categories_elem, 'category', id=str(category_id))
|
||||
category_elem.text = category_name
|
||||
categories_map[category_name] = category_id
|
||||
category_id += 1
|
||||
|
||||
return categories_map
|
||||
|
||||
def _create_offer(self, product, pln_to_uah_rate, categories_map):
|
||||
"""Создаёт элемент offer для товара"""
|
||||
# Проверяем обязательные поля
|
||||
if not product.get('title_ua') or not product.get('price'):
|
||||
return None
|
||||
|
||||
# Рассчитываем цену в UAH с наценкой
|
||||
price_pln = float(product['price'])
|
||||
price_uah = price_pln * pln_to_uah_rate
|
||||
price_uah_with_margin = price_uah * (1 + self.margin_percent / 100)
|
||||
price_uah_final = round(price_uah_with_margin, 2)
|
||||
|
||||
# Создаём элемент offer
|
||||
offer = ET.Element('offer', id=str(product['external_id']), available='true')
|
||||
|
||||
# Основные поля
|
||||
ET.SubElement(offer, 'name').text = product['title_ua']
|
||||
ET.SubElement(offer, 'price').text = str(price_uah_final)
|
||||
ET.SubElement(offer, 'currencyId').text = self.currency
|
||||
|
||||
# Категория
|
||||
if product.get('category') and product['category'] in categories_map:
|
||||
ET.SubElement(offer, 'categoryId').text = str(categories_map[product['category']])
|
||||
|
||||
# Изображения
|
||||
if product.get('local_images'):
|
||||
for img_path in product['local_images'][:10]: # Ограничиваем количество
|
||||
# Преобразуем локальный путь в URL
|
||||
img_url = self._local_path_to_url(img_path)
|
||||
if img_url:
|
||||
ET.SubElement(offer, 'picture').text = img_url
|
||||
|
||||
# Описание
|
||||
description = product.get('description_ua', '').strip()
|
||||
if description:
|
||||
description_elem = ET.SubElement(offer, 'description')
|
||||
description_elem.text = description[:3000] # Ограничиваем длину
|
||||
|
||||
# Атрибуты товара
|
||||
if product.get('brand'):
|
||||
ET.SubElement(offer, 'vendor').text = product['brand']
|
||||
|
||||
if product.get('model'):
|
||||
ET.SubElement(offer, 'model').text = product['model']
|
||||
|
||||
if product.get('sku'):
|
||||
ET.SubElement(offer, 'vendorCode').text = product['sku']
|
||||
|
||||
# Характеристики
|
||||
if product.get('attributes_ua'):
|
||||
for attr_name, attr_value in product['attributes_ua'].items():
|
||||
if attr_name and attr_value and len(str(attr_value)) < 100:
|
||||
param = ET.SubElement(offer, 'param', name=str(attr_name))
|
||||
param.text = str(attr_value)
|
||||
|
||||
# Наличие
|
||||
stock_status = product.get('availability', '').lower()
|
||||
if 'нет' in stock_status or 'недоступ' in stock_status:
|
||||
offer.set('available', 'false')
|
||||
|
||||
# URL товара
|
||||
if product.get('url'):
|
||||
ET.SubElement(offer, 'url').text = product['url']
|
||||
|
||||
return offer
|
||||
|
||||
def _local_path_to_url(self, local_path):
|
||||
"""Преобразует локальный путь в URL"""
|
||||
# Здесь нужно настроить базовый URL вашего сервера
|
||||
base_url = self.config.get('feed.images_base_url', 'https://yoursite.com/')
|
||||
|
||||
if base_url.endswith('/'):
|
||||
base_url = base_url[:-1]
|
||||
|
||||
# Убираем начальную часть пути
|
||||
relative_path = str(local_path).replace('\\', '/')
|
||||
if relative_path.startswith('./'):
|
||||
relative_path = relative_path[2:]
|
||||
|
||||
return f"{base_url}/{relative_path}"
|
||||
|
||||
def _save_xml(self, root):
|
||||
"""Сохраняет XML в файл с красивым форматированием"""
|
||||
rough_string = ET.tostring(root, encoding='unicode')
|
||||
reparsed = minidom.parseString(rough_string)
|
||||
pretty_xml = reparsed.toprettyxml(indent=" ", encoding='utf-8')
|
||||
|
||||
with open(self.output_path, 'wb') as f:
|
||||
f.write(pretty_xml)
|
||||
126
modules/image_downloader.py
Normal file
126
modules/image_downloader.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# modules/image_downloader.py
|
||||
"""
|
||||
Модуль для загрузки изображений
|
||||
"""
|
||||
|
||||
import os
|
||||
import requests
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from PIL import Image
|
||||
import mimetypes
|
||||
|
||||
|
||||
class ImageDownloader:
|
||||
"""Загрузчик изображений"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.download_path = Path(config.get('images.download_path', 'images'))
|
||||
self.max_size_mb = config.get('images.max_size_mb', 10)
|
||||
self.allowed_formats = config.get('images.allowed_formats', ['jpg', 'jpeg', 'png', 'webp'])
|
||||
self.quality = config.get('images.quality', 85)
|
||||
|
||||
# Создаём директорию для изображений
|
||||
self.download_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def download_product_images(self, image_urls, product_id):
    """Download every image of a product into its own sub-folder.

    Args:
        image_urls: Iterable of image URLs.
        product_id: Identifier used as the folder name under the download path.

    Returns:
        List of local file paths (as strings) for the images that were
        stored; URLs that fail or are rejected are skipped.
    """
    target_dir = self.download_path / str(product_id)
    target_dir.mkdir(exist_ok=True)

    stored_paths = []
    for index, image_url in enumerate(image_urls):
        try:
            saved_as = self._download_image(image_url, target_dir, f"img_{index}")
            if not saved_as:
                continue
            stored_paths.append(str(saved_as))
        except Exception as e:
            self.logger.error(f"Failed to download image {image_url}: {e}")

    return stored_paths
|
||||
|
||||
def _download_image(self, url, save_dir, filename_prefix):
    """Download a single image and return the path it was stored at.

    Args:
        url: Image URL.
        save_dir: ``pathlib.Path`` of the destination folder.
        filename_prefix: Leading part of the generated file name.

    Returns:
        ``Path`` of the stored (or previously stored) file, or ``None``
        when the image is too large, has a disallowed format, or any error
        occurs (errors are logged, never raised).

    The body is streamed into a ``.part`` file and only renamed to its final
    name once complete, so an interrupted download can never be mistaken
    for a finished one on the next run. The size limit is enforced while
    streaming as well, because ``Content-Length`` is optional.
    """
    max_bytes = self.max_size_mb * 1024 * 1024
    partial_path = None

    try:
        # The context manager releases the pooled connection on every path.
        with requests.get(url, timeout=30, stream=True) as response:
            response.raise_for_status()

            # Cheap early rejection when the server announces the size
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) > max_bytes:
                self.logger.warning(f"Image too large: {url}")
                return None

            # Work out the extension; drop parameters such as "; charset=..."
            content_type = response.headers.get('content-type', '')
            mime_type = content_type.split(';', 1)[0].strip().lower()
            extension = mimetypes.guess_extension(mime_type) if mime_type else None
            if extension in ('.jpe', '.jfif'):
                # Some mimetypes tables map image/jpeg to these aliases.
                extension = '.jpg'

            if not extension:
                # Fall back to the extension in the URL, then to .jpg
                path_ext = Path(urlparse(url).path).suffix.lower()
                if path_ext in ['.jpg', '.jpeg', '.png', '.webp']:
                    extension = path_ext
                else:
                    extension = '.jpg'

            format_name = extension[1:].lower()
            if format_name not in self.allowed_formats:
                self.logger.warning(f"Unsupported format {format_name}: {url}")
                return None

            # Name derived from the URL so repeated runs map to the same file
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            filepath = save_dir / f"{filename_prefix}_{url_hash}{extension}"

            if filepath.exists():
                return filepath

            partial_path = filepath.with_name(filepath.name + '.part')
            received = 0
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    received += len(chunk)
                    if received > max_bytes:
                        self.logger.warning(f"Image too large: {url}")
                        return None  # the partial file is removed in finally
                    f.write(chunk)

        os.replace(partial_path, filepath)
        partial_path = None

        self._optimize_image(filepath)

        self.logger.debug(f"Downloaded image: {filepath}")
        return filepath

    except Exception as e:
        self.logger.error(f"Error downloading image {url}: {e}")
        return None

    finally:
        if partial_path is not None and partial_path.exists():
            try:
                partial_path.unlink()
            except OSError:
                pass  # best effort; a stale .part file is never returned
|
||||
|
||||
def _optimize_image(self, filepath):
    """Flatten, downscale and re-save an image in place.

    * Images with transparency or a palette are composited onto a white
      background; every other non-RGB mode is converted to RGB.
    * Images larger than 1200x1200 are scaled down, keeping aspect ratio.
    * The file is written back in the format implied by its extension
      (``.png`` -> PNG, ``.webp`` -> WEBP, anything else -> JPEG), so the
      extension and the actual content always agree.

    Errors are logged and swallowed; the method returns ``None``.
    """
    format_by_suffix = {
        '.jpg': 'JPEG',
        '.jpeg': 'JPEG',
        '.png': 'PNG',
        '.webp': 'WEBP',
    }

    try:
        target_format = format_by_suffix.get(Path(filepath).suffix.lower(), 'JPEG')

        with Image.open(filepath) as img:
            if img.mode in ('RGBA', 'LA', 'P'):
                # Going through RGBA gives every one of these modes a real
                # alpha band to use as the paste mask.
                rgba = img.convert('RGBA')
                background = Image.new('RGB', rgba.size, (255, 255, 255))
                background.paste(rgba, mask=rgba.split()[-1])
                img = background
            elif img.mode != 'RGB':
                img = img.convert('RGB')

            # Limit the dimensions
            max_size = (1200, 1200)
            if img.size[0] > max_size[0] or img.size[1] > max_size[1]:
                img.thumbnail(max_size, Image.Resampling.LANCZOS)

            save_options = {'optimize': True}
            if target_format in ('JPEG', 'WEBP'):
                save_options['quality'] = self.quality

            img.save(filepath, target_format, **save_options)

    except Exception as e:
        self.logger.error(f"Error optimizing image {filepath}: {e}")
|
||||
402
modules/parser.py
Normal file
402
modules/parser.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# modules/parser.py
|
||||
"""
|
||||
Модуль для парсинга товаров с morele.net
|
||||
"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import re
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import hashlib
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
|
||||
class MoreleParser:
|
||||
"""Парсер для morele.net"""
|
||||
|
||||
def __init__(self, config):
    """Create the HTTP session used for all requests.

    Args:
        config: Settings object exposing ``get(key[, default])``; the
            ``parsing.user_agent`` value is sent as the User-Agent header.
    """
    self.config = config

    browser_like_headers = {
        'User-Agent': config.get('parsing.user_agent'),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    session = requests.Session()
    session.headers.update(browser_like_headers)
    self.session = session

    self.logger = logging.getLogger(__name__)
|
||||
|
||||
def parse_category(self, category_url):
    """Collect the products of a category by walking its pages.

    Pages are requested from 1 up to ``parsing.max_pages`` (default 100).
    Iteration stops at the first page that is empty or that contains only
    products already collected. The second condition protects against sites
    that answer out-of-range page numbers with the last page again, which
    would otherwise duplicate products until ``max_pages`` is reached.

    Returns:
        List of product dicts, without duplicates (keyed by ``id``, falling
        back to ``url``).
    """
    self.logger.info(f"Начинаем парсинг категории: {category_url}")

    products = []
    seen_keys = set()
    max_pages = self.config.get('parsing.max_pages', 100)
    delay = self.config.get('parsing.delay_between_requests', 1)

    for page in range(1, max_pages + 1):
        self.logger.debug(f"Парсинг страницы {page}")

        page_url = self._get_page_url(category_url, page)
        page_products = self._parse_category_page(page_url)

        if not page_products:
            self.logger.info(f"Страница {page} пуста, завершаем парсинг категории")
            break

        fresh = []
        for product in page_products:
            key = product.get('id') or product.get('url')
            if key is not None:
                if key in seen_keys:
                    continue
                seen_keys.add(key)
            fresh.append(product)

        if not fresh:
            self.logger.info(
                f"Страница {page} не содержит новых товаров, завершаем парсинг категории")
            break

        products.extend(fresh)

        # Pause between requests
        time.sleep(delay)

    self.logger.info(f"Найдено {len(products)} товаров в категории")
    return products
|
||||
|
||||
def _get_page_url(self, base_url, page):
|
||||
"""Формирует URL для конкретной страницы"""
|
||||
if page == 1:
|
||||
return base_url
|
||||
|
||||
# Проверяем, есть ли уже параметры в URL
|
||||
separator = '&' if '?' in base_url else '?'
|
||||
return f"{base_url}{separator}page={page}"
|
||||
|
||||
def _parse_category_page(self, page_url):
|
||||
"""Парсит товары с одной страницы категории"""
|
||||
try:
|
||||
response = self._make_request(page_url)
|
||||
if not response:
|
||||
return []
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Ищем карточки товаров
|
||||
product_cards = soup.find_all('div', class_='cat-product')
|
||||
products = []
|
||||
|
||||
# Используем многопоточность для парсинга товаров
|
||||
with ThreadPoolExecutor(max_workers=self.config.get('parsing.concurrent_requests', 5)) as executor:
|
||||
futures = []
|
||||
|
||||
for card in product_cards:
|
||||
product_url = self._extract_product_url(card)
|
||||
if product_url:
|
||||
future = executor.submit(self._parse_product_page, product_url)
|
||||
futures.append(future)
|
||||
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
product = future.result()
|
||||
if product:
|
||||
products.append(product)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Ошибка при парсинге товара: {e}")
|
||||
|
||||
return products
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Ошибка при парсинге страницы {page_url}: {e}")
|
||||
return []
|
||||
|
||||
def _extract_product_url(self, card):
|
||||
"""Извлекает URL товара из карточки"""
|
||||
try:
|
||||
link = card.find('a', href=True)
|
||||
if link:
|
||||
href = link['href']
|
||||
if not href.startswith('http'):
|
||||
href = urljoin('https://www.morele.net', href)
|
||||
return href
|
||||
except Exception as e:
|
||||
self.logger.error(f"Ошибка при извлечении URL товара: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _parse_product_page(self, product_url):
|
||||
"""Парсит детальную страницу товара"""
|
||||
try:
|
||||
response = self._make_request(product_url)
|
||||
if not response:
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Извлекаем данные товара
|
||||
product = {
|
||||
'url': product_url,
|
||||
'id': self._extract_product_id(product_url),
|
||||
'title': self._extract_title(soup),
|
||||
'price': self._extract_price(soup),
|
||||
'availability': self._extract_availability(soup),
|
||||
'description': self._extract_description(soup),
|
||||
'attributes': self._extract_attributes(soup),
|
||||
'category': self._extract_category(soup),
|
||||
'images': self._extract_images(soup),
|
||||
'brand': self._extract_brand(soup),
|
||||
'model': self._extract_model(soup),
|
||||
'sku': self._extract_sku(soup),
|
||||
'parsed_at': time.time()
|
||||
}
|
||||
|
||||
# Генерируем хеш для определения изменений
|
||||
product['content_hash'] = self._generate_content_hash(product)
|
||||
|
||||
return product
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Ошибка при парсинге товара {product_url}: {e}")
|
||||
return None
|
||||
|
||||
def _extract_product_id(self, url):
|
||||
"""Извлекает ID товара из URL"""
|
||||
# Ищем числовой ID в URL
|
||||
match = re.search(r'/(\d+)-', url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
# Если не найден, используем хеш URL
|
||||
return hashlib.md5(url.encode()).hexdigest()[:10]
|
||||
|
||||
def _extract_title(self, soup):
|
||||
"""Извлекает название товара"""
|
||||
selectors = [
|
||||
'h1.prod-name',
|
||||
'h1[data-test="product-name"]',
|
||||
'h1.product-title',
|
||||
'.product-name h1',
|
||||
'h1'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return "Без названия"
|
||||
|
||||
def _extract_price(self, soup):
|
||||
"""Извлекает цену товара"""
|
||||
selectors = [
|
||||
'.price-new',
|
||||
'.price-main',
|
||||
'[data-test="product-price"]',
|
||||
'.product-price .price',
|
||||
'.price'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
price_text = element.get_text(strip=True)
|
||||
# Извлекаем числовое значение
|
||||
price_match = re.search(r'([\d\s]+[,.]?\d*)', price_text.replace(' ', ''))
|
||||
if price_match:
|
||||
price_str = price_match.group(1).replace(' ', '').replace(',', '.')
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return 0.0
|
||||
|
||||
def _extract_availability(self, soup):
|
||||
"""Извлекает информацию о наличии"""
|
||||
selectors = [
|
||||
'.availability',
|
||||
'[data-test="product-availability"]',
|
||||
'.product-availability',
|
||||
'.stock-info'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
availability_text = element.get_text(strip=True).lower()
|
||||
|
||||
if any(word in availability_text for word in ['dostępny', 'w magazynie', 'dostępne']):
|
||||
return 'в наличии'
|
||||
elif any(word in availability_text for word in ['brak', 'niedostępny']):
|
||||
return 'нет в наличии'
|
||||
else:
|
||||
return availability_text
|
||||
|
||||
return 'неизвестно'
|
||||
|
||||
def _extract_description(self, soup):
|
||||
"""Извлекает описание товара"""
|
||||
selectors = [
|
||||
'.product-description',
|
||||
'[data-test="product-description"]',
|
||||
'.prod-description',
|
||||
'.description',
|
||||
'.product-details .description'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
# Удаляем HTML теги и лишние пробелы
|
||||
description = element.get_text(separator=' ', strip=True)
|
||||
return re.sub(r'\s+', ' ', description)
|
||||
|
||||
return ""
|
||||
|
||||
def _extract_attributes(self, soup):
|
||||
"""Извлекает характеристики товара"""
|
||||
attributes = {}
|
||||
|
||||
# Различные селекторы для характеристик
|
||||
specs_sections = soup.find_all(['div', 'section'], class_=re.compile(r'spec|param|attribute|feature'))
|
||||
|
||||
for section in specs_sections:
|
||||
# Ищем пары ключ-значение
|
||||
rows = section.find_all(['tr', 'div'], class_=re.compile(r'spec-row|param-row|attribute-row'))
|
||||
|
||||
for row in rows:
|
||||
# Пытаемся найти название и значение
|
||||
name_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'name|key|label'))
|
||||
value_elem = row.find(['td', 'div', 'span'], class_=re.compile(r'value|val'))
|
||||
|
||||
if name_elem and value_elem:
|
||||
name = name_elem.get_text(strip=True)
|
||||
value = value_elem.get_text(strip=True)
|
||||
|
||||
if name and value and len(name) < 100:
|
||||
attributes[name] = value
|
||||
|
||||
return attributes
|
||||
|
||||
def _extract_category(self, soup):
|
||||
"""Извлекает категорию товара"""
|
||||
# Ищем хлебные крошки
|
||||
breadcrumb_selectors = [
|
||||
'.breadcrumb',
|
||||
'.breadcrumbs',
|
||||
'[data-test="breadcrumb"]',
|
||||
'.navigation-path'
|
||||
]
|
||||
|
||||
for selector in breadcrumb_selectors:
|
||||
breadcrumb = soup.select_one(selector)
|
||||
if breadcrumb:
|
||||
links = breadcrumb.find_all('a')
|
||||
if len(links) > 1: # Пропускаем "Главная"
|
||||
return links[-1].get_text(strip=True)
|
||||
|
||||
return "Без категории"
|
||||
|
||||
def _extract_images(self, soup):
|
||||
"""Извлекает изображения товара"""
|
||||
images = []
|
||||
|
||||
# Селекторы для изображений
|
||||
img_selectors = [
|
||||
'.product-gallery img',
|
||||
'.product-images img',
|
||||
'[data-test="product-image"]',
|
||||
'.gallery img',
|
||||
'.product-photo img'
|
||||
]
|
||||
|
||||
for selector in img_selectors:
|
||||
imgs = soup.select(selector)
|
||||
for img in imgs:
|
||||
src = img.get('src') or img.get('data-src') or img.get('data-lazy')
|
||||
if src:
|
||||
if not src.startswith('http'):
|
||||
src = urljoin('https://www.morele.net', src)
|
||||
|
||||
# Фильтруем маленькие изображения
|
||||
if not any(size in src for size in ['icon', 'thumb', 'small']) and src not in images:
|
||||
images.append(src)
|
||||
|
||||
return images[:10] # Ограничиваем количество изображений
|
||||
|
||||
def _extract_brand(self, soup):
|
||||
"""Извлекает бренд товара"""
|
||||
selectors = [
|
||||
'[data-test="product-brand"]',
|
||||
'.product-brand',
|
||||
'.brand',
|
||||
'.manufacturer'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return ""
|
||||
|
||||
def _extract_model(self, soup):
|
||||
"""Извлекает модель товара"""
|
||||
selectors = [
|
||||
'[data-test="product-model"]',
|
||||
'.product-model',
|
||||
'.model'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return ""
|
||||
|
||||
def _extract_sku(self, soup):
|
||||
"""Извлекает артикул товара"""
|
||||
selectors = [
|
||||
'[data-test="product-sku"]',
|
||||
'.product-sku',
|
||||
'.sku',
|
||||
'.article-number',
|
||||
'.product-code'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return ""
|
||||
|
||||
def _generate_content_hash(self, product):
|
||||
"""Генерирует хеш содержимого товара для определения изменений"""
|
||||
content = f"{product['title']}{product['price']}{product['availability']}{product['description']}"
|
||||
return hashlib.md5(content.encode('utf-8')).hexdigest()
|
||||
|
||||
def _make_request(self, url, retries=None):
|
||||
"""Выполняет HTTP запрос с повторными попытками"""
|
||||
if retries is None:
|
||||
retries = self.config.get('parsing.max_retries', 3)
|
||||
|
||||
for attempt in range(retries + 1):
|
||||
try:
|
||||
response = self.session.get(
|
||||
url,
|
||||
timeout=self.config.get('parsing.timeout', 30)
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
return response
|
||||
elif response.status_code == 429: # Too Many Requests
|
||||
wait_time = (attempt + 1) * 5
|
||||
self.logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
|
||||
time.sleep(wait_time)
|
||||
else:
|
||||
self.logger.warning(f"HTTP {response.status_code} for {url}")
|
||||
|
||||
except requests.RequestException as e:
|
||||
self.logger.error(f"Request error on attempt {attempt + 1} for {url}: {e}")
|
||||
|
||||
if attempt < retries:
|
||||
time.sleep((attempt + 1) * 2)
|
||||
|
||||
self.logger.error(f"Failed to fetch {url} after {retries + 1} attempts")
|
||||
return None
|
||||
272
modules/storage.py
Normal file
272
modules/storage.py
Normal file
@@ -0,0 +1,272 @@
|
||||
# modules/storage.py
|
||||
"""
|
||||
Модуль для работы с хранилищем данных
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class StorageManager:
    """SQLite-backed storage for products, categories, the translation
    cache and parsing logs.

    ``config`` only needs a ``get(key, default)`` method. Supported keys:
    ``database.type`` (only ``'sqlite'``) and ``database.sqlite_path``.
    """

    # Columns stored as JSON text, with the literal each one decodes to
    # when the stored value is NULL or empty.
    _JSON_COLUMNS = (
        ('attributes', '{}'),
        ('attributes_ua', '{}'),
        ('images', '[]'),
        ('local_images', '[]'),
    )

    def __init__(self, config):
        """Create the database file (and its directory) if needed.

        Raises:
            NotImplementedError: for any ``database.type`` but ``'sqlite'``.
        """
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Database initialisation
        self.db_type = config.get('database.type', 'sqlite')

        if self.db_type == 'sqlite':
            self.db_path = config.get('database.sqlite_path', 'data/morele_parser.db')
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._init_sqlite()
        else:
            raise NotImplementedError("Пока поддерживается только SQLite")

    def _connect(self):
        """Return a context manager yielding a connection that gets closed.

        ``with sqlite3.connect(...)`` only commits or rolls back; it does
        not close the connection. Use as
        ``with self._connect() as conn, conn:`` so the transaction is
        committed first and the connection closed afterwards.
        """
        from contextlib import closing

        return closing(sqlite3.connect(self.db_path))

    @classmethod
    def _decode_product(cls, row):
        """Convert a ``products`` row to a dict with JSON columns decoded."""
        product = dict(row)
        for column, empty in cls._JSON_COLUMNS:
            product[column] = json.loads(product[column] or empty)
        return product

    @staticmethod
    def _content_values(product):
        """Return the column values shared by insert and update, in the
        order title .. is_translated."""
        return (
            product['title'],
            product.get('title_ua', ''),
            product['price'],
            product['availability'],
            product['description'],
            product.get('description_ua', ''),
            json.dumps(product.get('attributes', {}), ensure_ascii=False),
            json.dumps(product.get('attributes_ua', {}), ensure_ascii=False),
            product.get('category', ''),
            product.get('brand', ''),
            product.get('model', ''),
            product.get('sku', ''),
            json.dumps(product.get('images', [])),
            json.dumps(product.get('local_images', [])),
            product.get('content_hash', ''),
            product.get('is_translated', False),
        )

    def _init_sqlite(self):
        """Create all tables and indexes if they do not exist yet."""
        with self._connect() as conn, conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    external_id TEXT UNIQUE NOT NULL,
                    url TEXT NOT NULL,
                    title TEXT NOT NULL,
                    title_ua TEXT,
                    price REAL NOT NULL,
                    availability TEXT,
                    description TEXT,
                    description_ua TEXT,
                    attributes TEXT,
                    attributes_ua TEXT,
                    category TEXT,
                    brand TEXT,
                    model TEXT,
                    sku TEXT,
                    images TEXT,
                    local_images TEXT,
                    content_hash TEXT,
                    is_translated BOOLEAN DEFAULT 0,
                    is_active BOOLEAN DEFAULT 1,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );

                CREATE TABLE IF NOT EXISTS categories (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    url TEXT UNIQUE NOT NULL,
                    is_active BOOLEAN DEFAULT 1,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );

                CREATE TABLE IF NOT EXISTS translation_cache (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    original_text TEXT UNIQUE NOT NULL,
                    translated_text TEXT NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );

                CREATE TABLE IF NOT EXISTS parsing_logs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    category_url TEXT,
                    products_found INTEGER,
                    products_new INTEGER,
                    products_updated INTEGER,
                    errors_count INTEGER,
                    started_at TIMESTAMP,
                    completed_at TIMESTAMP
                );

                CREATE INDEX IF NOT EXISTS idx_products_external_id ON products(external_id);
                CREATE INDEX IF NOT EXISTS idx_products_url ON products(url);
                CREATE INDEX IF NOT EXISTS idx_translation_cache_original ON translation_cache(original_text);
            """)

    def save_product(self, product):
        """Insert a product, or update the row with the same external id.

        ``product['id']`` is stored as ``external_id``. An existing row is
        updated in place, so its primary key, ``created_at`` and
        ``is_active`` are kept (``INSERT OR REPLACE`` would delete the row
        and create a new one with a new ``id``).

        Raises:
            KeyError: if id, url, title, price, availability or description
                is missing from ``product``.
        """
        content = self._content_values(product)
        external_id = product['id']
        url = product['url']
        now = datetime.now().isoformat()

        with self._connect() as conn, conn:
            cursor = conn.execute("""
                UPDATE products SET
                    url = ?,
                    title = ?, title_ua = ?, price = ?, availability = ?,
                    description = ?, description_ua = ?, attributes = ?, attributes_ua = ?,
                    category = ?, brand = ?, model = ?, sku = ?, images = ?, local_images = ?,
                    content_hash = ?, is_translated = ?, updated_at = ?
                WHERE external_id = ?
            """, (url,) + content + (now, external_id))

            # Nothing updated means the product is new.
            if cursor.rowcount == 0:
                conn.execute("""
                    INSERT INTO products (
                        external_id, url, title, title_ua, price, availability,
                        description, description_ua, attributes, attributes_ua,
                        category, brand, model, sku, images, local_images,
                        content_hash, is_translated, updated_at
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (external_id, url) + content + (now,))

    def get_product_by_url(self, url):
        """Return the product with the given URL as a dict, or ``None``."""
        with self._connect() as conn, conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute("SELECT * FROM products WHERE url = ?", (url,)).fetchone()

        return self._decode_product(row) if row else None

    def get_product_by_id(self, product_id):
        """Return the product with the given primary key, or ``None``."""
        with self._connect() as conn, conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute("SELECT * FROM products WHERE id = ?", (product_id,)).fetchone()

        return self._decode_product(row) if row else None

    def update_product(self, product_id, product_data):
        """Overwrite the content columns of the product with primary key
        ``product_id`` and refresh ``updated_at``."""
        params = self._content_values(product_data) + (datetime.now().isoformat(), product_id)

        with self._connect() as conn, conn:
            conn.execute("""
                UPDATE products SET
                    title = ?, title_ua = ?, price = ?, availability = ?,
                    description = ?, description_ua = ?, attributes = ?, attributes_ua = ?,
                    category = ?, brand = ?, model = ?, sku = ?, images = ?, local_images = ?,
                    content_hash = ?, is_translated = ?, updated_at = ?
                WHERE id = ?
            """, params)

    def get_active_categories(self):
        """Return all categories with ``is_active = 1`` as a list of dicts."""
        with self._connect() as conn, conn:
            conn.row_factory = sqlite3.Row
            rows = conn.execute("SELECT * FROM categories WHERE is_active = 1").fetchall()

        return [dict(row) for row in rows]

    def add_category(self, name, url):
        """Add a category, or rename and re-activate the one with this URL.

        An existing row keeps its primary key and ``created_at``.
        """
        with self._connect() as conn, conn:
            cursor = conn.execute(
                "UPDATE categories SET name = ?, is_active = 1 WHERE url = ?",
                (name, url),
            )
            if cursor.rowcount == 0:
                conn.execute("INSERT INTO categories (name, url) VALUES (?, ?)", (name, url))

    def deactivate_category(self, category_id):
        """Mark the category as inactive (the row is kept)."""
        with self._connect() as conn, conn:
            conn.execute("UPDATE categories SET is_active = 0 WHERE id = ?", (category_id,))

    def get_translation_from_cache(self, original_text):
        """Return the cached translation of ``original_text`` or ``None``."""
        with self._connect() as conn, conn:
            row = conn.execute(
                "SELECT translated_text FROM translation_cache WHERE original_text = ?",
                (original_text,),
            ).fetchone()

        return row[0] if row else None

    def save_translation_to_cache(self, original_text, translated_text):
        """Store a translation, replacing any earlier one for the same text."""
        with self._connect() as conn, conn:
            conn.execute("""
                INSERT OR REPLACE INTO translation_cache (original_text, translated_text)
                VALUES (?, ?)
            """, (original_text, translated_text))

    def get_products_for_feed(self):
        """Return active, translated products with a positive price,
        most recently updated first."""
        with self._connect() as conn, conn:
            conn.row_factory = sqlite3.Row
            rows = conn.execute("""
                SELECT * FROM products
                WHERE is_active = 1 AND is_translated = 1 AND price > 0
                ORDER BY updated_at DESC
            """).fetchall()

        return [self._decode_product(row) for row in rows]

    def log_parsing_session(self, category_url, stats):
        """Record one parsing run.

        ``stats`` may contain found, new, updated, errors (default 0) and
        started_at / completed_at (default ``None``).
        """
        with self._connect() as conn, conn:
            conn.execute("""
                INSERT INTO parsing_logs
                (category_url, products_found, products_new, products_updated, errors_count, started_at, completed_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (
                category_url,
                stats.get('found', 0),
                stats.get('new', 0),
                stats.get('updated', 0),
                stats.get('errors', 0),
                stats.get('started_at'),
                stats.get('completed_at'),
            ))

    def get_parsing_stats(self, days=30):
        """Return parsing log rows completed within the last ``days`` days,
        newest first."""
        # The modifier is bound as a parameter instead of being formatted
        # into the SQL text.
        modifier = '-{} days'.format(days)

        with self._connect() as conn, conn:
            conn.row_factory = sqlite3.Row
            rows = conn.execute("""
                SELECT * FROM parsing_logs
                WHERE completed_at > datetime('now', ?)
                ORDER BY completed_at DESC
            """, (modifier,)).fetchall()

        return [dict(row) for row in rows]
|
||||
154
modules/translator.py
Normal file
154
modules/translator.py
Normal file
@@ -0,0 +1,154 @@
|
||||
# modules/translator.py
|
||||
"""
|
||||
Модуль для перевода текста с кешем
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class TranslationProvider(ABC):
    """Abstract base class for translation backends."""

    @abstractmethod
    def translate(self, text, source_lang, target_lang):
        """Translate ``text`` from ``source_lang`` to ``target_lang``.

        Concrete providers must override this method; the base
        implementation does nothing and returns ``None``.
        """
|
||||
|
||||
|
||||
class GoogleTranslateProvider(TranslationProvider):
    """Translation backend built on the ``googletrans`` library.

    The API key is stored on the instance, but ``translate`` does not pass
    it to ``googletrans``.
    """

    def __init__(self, api_key):
        self.api_key = api_key

    def translate(self, text, source_lang='pl', target_lang='uk'):
        """Return ``text`` translated from ``source_lang`` to ``target_lang``.

        Any failure (including a missing ``googletrans`` package) is logged
        and re-raised to the caller.
        """
        try:
            # Imported lazily so the dependency is only needed when used.
            from googletrans import Translator

            outcome = Translator().translate(text, src=source_lang, dest=target_lang)
            return outcome.text
        except Exception as exc:
            logging.error(f"Google Translate error: {exc}")
            raise
|
||||
|
||||
|
||||
class DeepLProvider(TranslationProvider):
    """Translation backend built on the ``deepl`` client library."""

    def __init__(self, api_key):
        self.api_key = api_key

    def translate(self, text, source_lang='PL', target_lang='UK'):
        """Return ``text`` translated from ``source_lang`` to ``target_lang``.

        A new ``deepl.Translator`` is created with the stored API key on
        every call. Any failure (including a missing ``deepl`` package) is
        logged and re-raised to the caller.
        """
        try:
            # Imported lazily so the dependency is only needed when used.
            import deepl

            client = deepl.Translator(self.api_key)
            outcome = client.translate_text(
                text,
                source_lang=source_lang,
                target_lang=target_lang,
            )
            return outcome.text
        except Exception as exc:
            logging.error(f"DeepL error: {exc}")
            raise
|
||||
|
||||
|
||||
class LibreTranslateProvider(TranslationProvider):
    """Translation backend that talks to a LibreTranslate HTTP server."""

    def __init__(self, url, api_key=None, timeout=30):
        """
        Args:
            url: Base URL of the LibreTranslate server (with or without a
                trailing slash).
            api_key: Optional API key, sent with every request when set.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.api_key = api_key
        self.timeout = timeout

    def translate(self, text, source_lang='pl', target_lang='uk'):
        """Return ``text`` translated from ``source_lang`` to ``target_lang``.

        Posts to ``<url>/translate`` and returns the ``translatedText``
        field of the JSON reply. Any failure (HTTP error status, timeout,
        malformed reply, missing ``requests`` package) is logged and
        re-raised to the caller.
        """
        try:
            import requests

            data = {
                'q': text,
                'source': source_lang,
                'target': target_lang,
                'format': 'text'
            }

            if self.api_key:
                data['api_key'] = self.api_key

            # Avoid "//translate" when the configured URL ends with "/".
            endpoint = f"{str(self.url).rstrip('/')}/translate"

            # Without a timeout an unresponsive server would block the
            # whole parsing run indefinitely.
            response = requests.post(endpoint, data=data, timeout=self.timeout)
            response.raise_for_status()

            return response.json()['translatedText']
        except Exception as e:
            logging.error(f"LibreTranslate error: {e}")
            raise
|
||||
|
||||
|
||||
class TranslationService:
    """Translates text through the configured provider, with a cache.

    ``config`` needs a ``get(key, default=None)`` method; ``storage`` needs
    ``get_translation_from_cache`` and ``save_translation_to_cache``.
    """

    def __init__(self, config, storage):
        self.config = config
        self.storage = storage
        self.logger = logging.getLogger(__name__)

        # The provider is built first; this may raise for a bad configuration.
        self.provider = self._init_provider()

        # Settings. The language codes are read from the
        # ``translation.google.*`` keys whichever provider is selected.
        self.cache_enabled = config.get('translation.cache_enabled', True)
        self.source_lang = config.get('translation.google.source_lang', 'pl')
        self.target_lang = config.get('translation.google.target_lang', 'uk')

    def _init_provider(self):
        """Build the provider named by ``translation.service``.

        Raises:
            ValueError: for an unknown service name, or when the DeepL
                service is selected without an API key.
        """
        settings = self.config
        service = settings.get('translation.service', 'google')

        if service == 'google':
            google_key = settings.get('translation.google.api_key')
            if not google_key:
                self.logger.warning("Google Translate API key not found, using googletrans library")
            return GoogleTranslateProvider(google_key)

        if service == 'deepl':
            deepl_key = settings.get('translation.deepl.api_key')
            if not deepl_key:
                raise ValueError("DeepL API key is required")
            return DeepLProvider(deepl_key)

        if service == 'libretranslate':
            return LibreTranslateProvider(
                settings.get('translation.libretranslate.url'),
                settings.get('translation.libretranslate.api_key'),
            )

        raise ValueError(f"Unsupported translation service: {service}")

    def translate(self, text):
        """Return the translation of ``text``.

        Empty or whitespace-only input is returned unchanged. Otherwise the
        stripped text is looked up in the cache (when enabled) and, on a
        miss, sent to the provider; a non-empty result is cached. If the
        provider or the cache write fails, the error is logged and the
        stripped original text is returned instead.
        """
        stripped = text.strip() if text else ''
        if not stripped:
            return text

        use_cache = self.cache_enabled

        # Cache lookup
        if use_cache:
            hit = self.storage.get_translation_from_cache(stripped)
            if hit:
                return hit

        try:
            result = self.provider.translate(stripped, self.source_lang, self.target_lang)

            if use_cache and result:
                self.storage.save_translation_to_cache(stripped, result)

            # Short pause so consecutive calls stay under API rate limits.
            time.sleep(0.1)
        except Exception as err:
            self.logger.error(f"Translation failed for text '{stripped[:50]}...': {err}")
            return stripped  # fall back to the original text on failure

        return result
|
||||
|
||||
|
||||
Reference in New Issue
Block a user