Files
mario_scraper/feed_generator.py
2025-05-27 12:22:52 +03:00

256 lines
8.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin

from config import BASE_URL
class RobotVacuumYMLGenerator:
    """Build a ``yml_catalog`` XML product feed with ElementTree.

    The document has the shape ``yml_catalog > shop > (name, categories,
    offers)``. Categories are registered with :meth:`add_category`, single
    offers with :meth:`add_offer`, and :meth:`generate_yml` filters a product
    list by price, adds the survivors as offers and writes the file.

    Offers accumulate on the instance: calling :meth:`generate_yml` twice on
    the same generator appends the products a second time.
    """

    def __init__(
        self,
        shop_name: str = "Euro Electronics",
        base_url: str = BASE_URL,
        use_original_urls: bool = False,
        categories_data: Optional[List[Dict]] = None,
    ):
        """
        Initialize YML feed generator
        :param shop_name: Name of the shop, written to ``shop/name``
        :param base_url: Base URL that local image paths are joined onto
        :param use_original_urls: If True, use original image URLs instead of local ones
        :param categories_data: Optional list of dicts with ``id`` and ``name``
            keys; used to prepend the category name to each offer's keywords
        """
        self.root = ET.Element(
            "yml_catalog", {"date": datetime.now().strftime("%Y-%m-%d %H:%M")}
        )
        self.shop = ET.SubElement(self.root, "shop")
        ET.SubElement(self.shop, "name").text = shop_name
        self.base_url = base_url
        self.use_original_urls = use_original_urls
        self.categories = ET.SubElement(self.shop, "categories")
        self.offers = ET.SubElement(self.shop, "offers")
        self.categories_data = categories_data or []

    def add_category(
        self, category_id: str, category_name: str, parent_id: Optional[str] = None
    ):
        """
        Add category to YML feed
        :param category_id: Category ID
        :param category_name: Category name
        :param parent_id: Parent category ID (optional)
        """
        # ElementTree can only serialize string attribute values, so ids that
        # arrive as ints (e.g. straight from JSON) are converted here.
        attrs = {"id": str(category_id)}
        if parent_id:
            attrs["parentId"] = str(parent_id)
        category = ET.SubElement(self.categories, "category", attrs)
        category.text = category_name

    def get_image_url(self, local_path: str) -> Optional[str]:
        """
        Convert local path to full URL, normalizing path separators
        :param local_path: Local path to image file
        :return: Full URL with forward slashes, or None when the path is empty
        """
        if not local_path:
            return None
        # Normalize Windows path separators to forward slashes
        normalized_path = local_path.replace("\\", "/")
        return urljoin(self.base_url, normalized_path)

    def process_attributes(self, attributes: List[Dict]) -> List[Dict]:
        """
        Convert attributes to param format for YML
        :param attributes: List of dicts with ``name`` and ``value`` keys;
            None is treated as an empty list
        :return: List of ``{"name": ..., "value": ...}`` dicts; list values
            are flattened to a single ``" | "``-separated string
        """
        params = []
        for attr in attributes or []:
            value = attr["value"]
            # Handle single or multiple values
            if isinstance(value, list):
                value = " | ".join(str(item) for item in value)
            params.append({"name": attr["name"], "value": value})
        return params

    def clean_product_name(self, name: str) -> str:
        """
        Clean a product name by dropping everything after the last word that
        contains an ASCII Latin letter. Words before that point are kept as-is,
        and runs of whitespace collapse to single spaces. A name without any
        Latin letter is returned unchanged (apart from whitespace collapsing).
        :param name: Original product name
        :return: Cleaned name; empty string for an empty or None name
        """
        if not name:
            return ""
        words = name.split()
        last_latin_index = -1
        for index, word in enumerate(words):
            # Compare characters directly instead of ord(c.lower()):
            # lowercasing some characters (e.g. "İ") yields two code points
            # and ord() then raises TypeError.
            if any("a" <= char <= "z" or "A" <= char <= "Z" for char in word):
                last_latin_index = index
        if last_latin_index == -1:
            return " ".join(words)
        return " ".join(words[: last_latin_index + 1])

    def _category_name_for(self, local_category_id) -> str:
        """Return the name from ``categories_data`` matching the id, or ''."""
        wanted = str(local_category_id)
        for category in self.categories_data:
            if str(category["id"]) == wanted:
                return category["name"]
        return ""

    def _build_description_html(self, sections: List[Dict]) -> str:
        """Render description sections (title, text, optional image) as one HTML string."""
        parts = ["<div>"]
        for section in sections:
            title = section["title"]
            parts.append(f"<h3>{title}</h3>")
            parts.append(f"<p>{section['text']}</p>")
            # A section without an image is valid; treat it as an empty dict.
            image = section.get("image") or {}
            if self.use_original_urls:
                img_url = image.get("url")
            else:
                img_url = self.get_image_url(image.get("local_path"))
            if img_url:
                parts.append(f'<img src="{img_url}" alt="{title}"/>')
        parts.append("</div>")
        return "".join(parts)

    def add_offer(self, product: Dict):
        """
        Add a robot vacuum cleaner offer to the YML feed
        :param product: Product dictionary from JSON. ``plu``, ``name``,
            ``prices.mainPrice``, ``local_category_id``, ``portal_category_id``
            and ``url`` are required; ``in_stock``, ``keywords``,
            ``description``, ``images`` and ``attributes`` are optional
        :raises KeyError: if a required key is missing; in that case nothing
            is added to the feed
        """
        availability = "true" if product.get("in_stock", False) else "false"
        # Build the offer detached and attach it only once it is complete, so
        # a product with a missing required key never leaves a half-built
        # <offer> in the tree.
        offer = ET.Element(
            "offer",
            {
                "id": str(product["plu"]),
                "available": availability,
                "in_stock": availability,
            },
        )
        # Clean product name before adding to feed
        ET.SubElement(offer, "name").text = self.clean_product_name(product["name"])
        # Add vendorCode using plu
        ET.SubElement(offer, "vendorCode").text = str(product["plu"])
        ET.SubElement(offer, "price").text = str(product["prices"]["mainPrice"])
        ET.SubElement(offer, "currencyId").text = "PLN"
        # Local (shop-side) category
        ET.SubElement(offer, "categoryId").text = str(product["local_category_id"])
        # Portal category id is mandatory
        ET.SubElement(offer, "portal_category_id").text = str(
            product["portal_category_id"]
        )

        # Keywords: category name first, then the product's own keywords.
        # Only non-empty parts are joined, so a product whose category is
        # unknown does not get a leading ", ".
        keyword_parts = [
            str(part)
            for part in (
                self._category_name_for(product.get("local_category_id")),
                product.get("keywords", ""),
            )
            if part
        ]
        ET.SubElement(offer, "keywords").text = ", ".join(keyword_parts)

        # Description with images
        if "description" in product:
            ET.SubElement(offer, "description").text = self._build_description_html(
                product["description"] or []
            )

        # Product images: only the first 10 entries are considered, and
        # entries without a usable URL are skipped.
        for img in (product.get("images") or [])[:10]:
            if self.use_original_urls:
                img_url = img.get("url")
            else:
                img_url = self.get_image_url(img.get("local_path"))
            if not img_url:
                continue
            ET.SubElement(offer, "picture").text = img_url

        # Attributes as params
        for param in self.process_attributes(product.get("attributes") or []):
            param_elem = ET.SubElement(offer, "param", {"name": str(param["name"])})
            param_elem.text = str(param["value"])

        # URL
        ET.SubElement(offer, "url").text = product["url"]

        self.offers.append(offer)

    @staticmethod
    def _main_price(product: Dict) -> Optional[float]:
        """Return ``prices.mainPrice`` as a float, or None if absent or non-numeric."""
        prices = product.get("prices") or {}
        raw_price = prices.get("mainPrice") if isinstance(prices, dict) else None
        try:
            return float(raw_price)
        except (TypeError, ValueError):
            return None

    def generate_yml(
        self, products: List[Dict], output_yml: str, min_price: float = 300
    ) -> bool:
        """
        Generate YML feed from products data
        :param products: List of product dictionaries
        :param output_yml: Path to output YML file
        :param min_price: Only products whose ``prices.mainPrice`` is strictly
            greater than this value are included (default 300, in the feed
            currency); products without a numeric price are skipped
        :return: True if successful, False otherwise (the error is printed)
        """
        try:
            # Ensure a category exists
            if len(self.categories) == 0:
                raise ValueError("No categories added to the YML feed.")
            for product in products:
                price = self._main_price(product)
                if price is not None and price > min_price:
                    self.add_offer(product)
            tree = ET.ElementTree(self.root)
            tree.write(output_yml, encoding="UTF-8", xml_declaration=True)
            print(f"YML feed generated: {output_yml}")
            return True
        except Exception as e:
            # Top-level boundary: callers rely on the boolean result rather
            # than on exceptions, so report the error and signal failure.
            print(f"Error generating YML feed: {str(e)}")
            return False
def main():
    """
    Example usage with command line arguments.

    Usage: ``python feed_generator.py input.json [output.yml]``

    Loads the product list from ``input.json``, registers one category per
    distinct ``local_category_id`` found in the products, and writes the feed.
    When the output path is omitted, the input path with a ``.yml`` suffix is
    used. Exits with status 1 on bad arguments, unreadable input or a failed
    generation.
    """
    import sys
    from pathlib import Path

    if len(sys.argv) < 2:
        print("Usage: python feed_generator.py input.json [output.yml]")
        sys.exit(1)

    input_json = sys.argv[1]
    if len(sys.argv) > 2:
        output_yml = sys.argv[2]
    else:
        output_yml = str(Path(input_json).with_suffix(".yml"))

    # generate_yml expects the parsed products, not the path to the file.
    try:
        with open(input_json, encoding="utf-8") as source:
            products = json.load(source)
    except (OSError, json.JSONDecodeError) as exc:
        print(f"Cannot read products from {input_json}: {exc}")
        sys.exit(1)

    # NOTE(review): assumes the JSON document is a top-level array of product
    # objects, matching generate_yml's List[Dict] parameter - confirm against
    # the scraper's output format.
    if not isinstance(products, list):
        print(f"Expected a JSON array of products in {input_json}")
        sys.exit(1)

    generator = RobotVacuumYMLGenerator()

    # generate_yml refuses to run without at least one category. The product
    # data only carries category ids here, so the id doubles as the category
    # label. TODO: supply real category names where they are available.
    seen_category_ids = set()
    for product in products:
        if not isinstance(product, dict):
            continue
        category_id = product.get("local_category_id")
        if category_id is None or str(category_id) in seen_category_ids:
            continue
        seen_category_ids.add(str(category_id))
        generator.add_category(str(category_id), str(category_id))

    if not generator.generate_yml(products, output_yml):
        sys.exit(1)


if __name__ == "__main__":
    main()