test_product_enrich_partial_mode.py 25.1 KB
Edit Raw Blame History

from __future__ import annotations

import importlib.util
import io
import json
import logging
import sys
import types
from pathlib import Path
from unittest import mock


def _load_product_enrich_module():
    if "dotenv" not in sys.modules:
        fake_dotenv = types.ModuleType("dotenv")
        fake_dotenv.load_dotenv = lambda *args, **kwargs: None
        sys.modules["dotenv"] = fake_dotenv

    if "redis" not in sys.modules:
        fake_redis = types.ModuleType("redis")

        class _FakeRedisClient:
            def __init__(self, *args, **kwargs):
                pass

            def ping(self):
                return True

        fake_redis.Redis = _FakeRedisClient
        sys.modules["redis"] = fake_redis

    repo_root = Path(__file__).resolve().parents[1]
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))

    module_path = repo_root / "indexer" / "product_enrich.py"
    spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path)
    module = importlib.util.module_from_spec(spec)
    assert spec and spec.loader
    spec.loader.exec_module(module)
    return module


product_enrich = _load_product_enrich_module()


def _attach_stream(logger_obj: logging.Logger):
    stream = io.StringIO()
    handler = logging.StreamHandler(stream)
    handler.setFormatter(logging.Formatter("%(message)s"))
    logger_obj.addHandler(handler)
    return stream, handler


def test_create_prompt_splits_shared_context_and_localized_tail():
    products = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "linen shirt"},
    ]

    shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(products, target_lang="zh")
    shared_en, user_en, prefix_en = product_enrich.create_prompt(products, target_lang="en")

    assert shared_zh == shared_en
    assert "Analyze each input product text" in shared_zh
    assert "1. dress" in shared_zh
    assert "2. linen shirt" in shared_zh
    assert "Product list" not in user_zh
    assert "Product list" not in user_en
    assert "specified language" in user_zh
    assert "Language: Chinese" in user_zh
    assert "Language: English" in user_en
    assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |")
    assert prefix_en.startswith("| No. | Product title | Category path |")


def test_create_prompt_supports_taxonomy_analysis_kind():
    products = [{"id": "1", "title": "linen dress"}]

    shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(
        products,
        target_lang="zh",
        analysis_kind="taxonomy",
    )
    shared_fr, user_fr, prefix_fr = product_enrich.create_prompt(
        products,
        target_lang="fr",
        analysis_kind="taxonomy",
    )

    assert "apparel attribute taxonomy" in shared_zh
    assert "1. linen dress" in shared_zh
    assert "Language: Chinese" in user_zh
    assert "Language: French" in user_fr
    assert prefix_zh.startswith("| 序号 | 品类 | 目标性别 |")
    assert prefix_fr.startswith("| No. | Product Type | Target Gender |")


def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests():
    payloads = []
    response_bodies = [
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | "
                            "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | "
                            "修身显瘦 | 法式收腰连衣裙 |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165},
        },
        {
            "choices": [
                {
                    "message": {
                        "content": (
                            "| 1 | Dress | Women>Dress | French,Waisted | Young women | "
                            "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | "
                            "Slim fit | French waisted dress |\n"
                        )
                    }
                }
            ],
            "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161},
        },
    ]

    class _FakeResponse:
        def __init__(self, body):
            self.body = body

        def raise_for_status(self):
            return None

        def json(self):
            return self.body

    class _FakeSession:
        trust_env = True

        def post(self, url, headers=None, json=None, timeout=None, proxies=None):
            del url, headers, timeout, proxies
            payloads.append(json)
            return _FakeResponse(response_bodies[len(payloads) - 1])

        def close(self):
            return None

    product_enrich.reset_logged_shared_context_keys()
    main_stream, main_handler = _attach_stream(product_enrich.logger)
    verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger)

    try:
        with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
            product_enrich.requests,
            "Session",
            lambda: _FakeSession(),
        ):
            zh_shared, zh_user, zh_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="zh",
            )
            en_shared, en_user, en_prefix = product_enrich.create_prompt(
                [{"id": "1", "title": "dress"}],
                target_lang="en",
            )

            zh_markdown, zh_raw = product_enrich.call_llm(
                zh_shared,
                zh_user,
                zh_prefix,
                target_lang="zh",
            )
            en_markdown, en_raw = product_enrich.call_llm(
                en_shared,
                en_user,
                en_prefix,
                target_lang="en",
            )
    finally:
        product_enrich.logger.removeHandler(main_handler)
        product_enrich.verbose_logger.removeHandler(verbose_handler)

    assert zh_shared == en_shared
    assert len(payloads) == 2
    assert len(payloads[0]["messages"]) == 3
    assert payloads[0]["messages"][1]["role"] == "user"
    assert "1. dress" in payloads[0]["messages"][1]["content"]
    assert "Language: Chinese" in payloads[0]["messages"][1]["content"]
    assert "Language: English" in payloads[1]["messages"][1]["content"]
    assert payloads[0]["messages"][-1]["partial"] is True
    assert payloads[1]["messages"][-1]["partial"] is True

    main_log = main_stream.getvalue()
    verbose_log = verbose_stream.getvalue()

    assert main_log.count("LLM Shared Context") == 1
    assert main_log.count("LLM Request Variant") == 2
    assert "Localized Requirement" in main_log
    assert "Shared Context" in main_log

    assert verbose_log.count("LLM Request [model=") == 2
    assert verbose_log.count("LLM Response [model=") == 2
    assert '"partial": true' in verbose_log
    assert "Combined User Prompt" in verbose_log
    assert "French waisted dress" in verbose_log
    assert "法式收腰连衣裙" in verbose_log

    assert zh_markdown.startswith(zh_prefix)
    assert en_markdown.startswith(en_prefix)
    assert json.loads(zh_raw)["usage"]["total_tokens"] == 165
    assert json.loads(en_raw)["usage"]["total_tokens"] == 161


def test_process_batch_reads_result_and_validates_expected_fields():
    merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 锚文本 |
|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 法式收腰连衣裙 |
"""

    with mock.patch.object(
        product_enrich,
        "call_llm",
        return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})),
    ):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
        )

    assert len(results) == 1
    row = results[0]
    assert row["id"] == "sku-1"
    assert row["lang"] == "zh"
    assert row["title_input"] == "dress"
    assert row["title"] == "法式连衣裙"
    assert row["category_path"] == "女装>连衣裙"
    assert row["tags"] == "法式,收腰"
    assert row["target_audience"] == "年轻女性"
    assert row["usage_scene"] == "通勤,约会"
    assert row["season"] == "春季,夏季"
    assert row["key_attributes"] == "中长款"
    assert row["material"] == "聚酯纤维"
    assert row["features"] == "透气"
    assert row["anchor_text"] == "法式收腰连衣裙"


def test_process_batch_reads_taxonomy_result_with_schema_specific_fields():
    merged_markdown = """| 序号 | 品类 | 目标性别 | 年龄段 | 适用季节 | 版型 | 廓形 | 领型 | 袖长类型 | 袖型 | 肩带设计 | 腰型 | 裤型 | 裙型 | 长度类型 | 闭合方式 | 设计细节 | 面料 | 成分 | 面料特性 | 服装特征 | 功能 | 主颜色 | 色系 | 印花 / 图案 | 适用场景 | 风格 |
|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|
| 1 | 连衣裙 | 女 | 成人 | 春季,夏季 | 修身 | A字 | V领 | 无袖 | | 细肩带 | 高腰 | | A字裙 | 中长款 | 拉链 | 褶皱 | 梭织 | 聚酯纤维,氨纶 | 轻薄,透气 | 有内衬 | 易打理 | 酒红色 | 红色 | 纯色 | 约会,度假 | 浪漫 |
"""

    with mock.patch.object(
        product_enrich,
        "call_llm",
        return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})),
    ):
        results = product_enrich.process_batch(
            [{"id": "sku-1", "title": "dress"}],
            batch_num=1,
            target_lang="zh",
            analysis_kind="taxonomy",
        )

    assert len(results) == 1
    row = results[0]
    assert row["id"] == "sku-1"
    assert row["lang"] == "zh"
    assert row["title_input"] == "dress"
    assert row["product_type"] == "连衣裙"
    assert row["target_gender"] == "女"
    assert row["age_group"] == "成人"
    assert row["sleeve_length_type"] == "无袖"
    assert row["material_composition"] == "聚酯纤维,氨纶"
    assert row["occasion_end_use"] == "约会,度假"
    assert row["style_aesthetic"] == "浪漫"


def test_analyze_products_uses_product_level_cache_across_batch_requests():
    cache_store = {}
    process_calls = []

    def _cache_key(product, target_lang):
        return (
            target_lang,
            product.get("title", ""),
            product.get("brief", ""),
            product.get("description", ""),
            product.get("image_url", ""),
        )

    def fake_get_cached_analysis_result(
        product,
        target_lang,
        analysis_kind="content",
        category_taxonomy_profile=None,
    ):
        assert analysis_kind == "content"
        assert category_taxonomy_profile is None
        return cache_store.get(_cache_key(product, target_lang))

    def fake_set_cached_analysis_result(
        product,
        target_lang,
        result,
        analysis_kind="content",
        category_taxonomy_profile=None,
    ):
        assert analysis_kind == "content"
        assert category_taxonomy_profile is None
        cache_store[_cache_key(product, target_lang)] = result

    def fake_process_batch(
        batch_data,
        batch_num,
        target_lang="zh",
        analysis_kind="content",
        category_taxonomy_profile=None,
    ):
        assert analysis_kind == "content"
        assert category_taxonomy_profile is None
        process_calls.append(
            {
                "batch_num": batch_num,
                "target_lang": target_lang,
                "titles": [item["title"] for item in batch_data],
            }
        )
        return [
            {
                "id": item["id"],
                "lang": target_lang,
                "title_input": item["title"],
                "title": f"normalized:{item['title']}",
                "category_path": "cat",
                "tags": "tags",
                "target_audience": "audience",
                "usage_scene": "scene",
                "season": "season",
                "key_attributes": "attrs",
                "material": "material",
                "features": "features",
                "anchor_text": f"anchor:{item['title']}",
            }
            for item in batch_data
        ]

    products = [
        {"id": "1", "title": "dress"},
        {"id": "2", "title": "shirt"},
    ]

    with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
        product_enrich,
        "_get_cached_analysis_result",
        side_effect=fake_get_cached_analysis_result,
    ), mock.patch.object(
        product_enrich,
        "_set_cached_analysis_result",
        side_effect=fake_set_cached_analysis_result,
    ), mock.patch.object(
        product_enrich,
        "process_batch",
        side_effect=fake_process_batch,
    ):
        first = product_enrich.analyze_products(
            [products[0]],
            target_lang="zh",
            tenant_id="170",
        )
        second = product_enrich.analyze_products(
            products,
            target_lang="zh",
            tenant_id="999",
        )
        third = product_enrich.analyze_products(
            products,
            target_lang="zh",
            tenant_id="170",
        )

    assert [row["title_input"] for row in first] == ["dress"]
    assert [row["title_input"] for row in second] == ["dress", "shirt"]
    assert [row["title_input"] for row in third] == ["dress", "shirt"]

    assert process_calls == [
        {"batch_num": 1, "target_lang": "zh", "titles": ["dress"]},
        {"batch_num": 1, "target_lang": "zh", "titles": ["shirt"]},
    ]
    assert second[0]["anchor_text"] == "anchor:dress"
    assert second[1]["anchor_text"] == "anchor:shirt"
    assert third[0]["anchor_text"] == "anchor:dress"
    assert third[1]["anchor_text"] == "anchor:shirt"


def test_analyze_products_reuses_cached_content_with_current_product_identity():
    cached_result = {
        "id": "1165",
        "lang": "zh",
        "title_input": "old-title",
        "title": "法式连衣裙",
        "category_path": "女装>连衣裙",
        "enriched_tags": "法式,收腰",
        "target_audience": "年轻女性",
        "usage_scene": "通勤,约会",
        "season": "春季,夏季",
        "key_attributes": "中长款",
        "material": "聚酯纤维",
        "features": "透气",
        "anchor_text": "法式收腰连衣裙",
    }
    products = [{"id": "69960", "title": "dress"}]

    with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object(
        product_enrich,
        "_get_cached_analysis_result",
        wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result(
            cached_result,
            product=product,
            target_lang=target_lang,
            schema=product_enrich._get_analysis_schema("content"),
        ),
    ), mock.patch.object(
        product_enrich,
        "process_batch",
        side_effect=AssertionError("process_batch should not be called on cache hit"),
    ):
        result = product_enrich.analyze_products(
            products,
            target_lang="zh",
            tenant_id="170",
        )

    assert result == [
        {
            "id": "69960",
            "lang": "zh",
            "title_input": "dress",
            "title": "法式连衣裙",
            "category_path": "女装>连衣裙",
            "tags": "法式,收腰",
            "target_audience": "年轻女性",
            "usage_scene": "通勤,约会",
            "season": "春季,夏季",
            "key_attributes": "中长款",
            "material": "聚酯纤维",
            "features": "透气",
            "anchor_text": "法式收腰连衣裙",
        }
    ]


def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output():
    def fake_analyze_products(
        products,
        target_lang="zh",
        batch_size=None,
        tenant_id=None,
        analysis_kind="content",
        category_taxonomy_profile=None,
    ):
        if analysis_kind == "taxonomy":
            assert category_taxonomy_profile == "apparel"
            return [
                {
                    "id": products[0]["id"],
                    "lang": target_lang,
                    "title_input": products[0]["title"],
                    "product_type": f"{target_lang}-dress",
                    "target_gender": f"{target_lang}-women",
                    "age_group": "",
                    "season": f"{target_lang}-summer",
                    "fit": "",
                    "silhouette": "",
                    "neckline": "",
                    "sleeve_length_type": "",
                    "sleeve_style": "",
                    "strap_type": "",
                    "rise_waistline": "",
                    "leg_shape": "",
                    "skirt_shape": "",
                    "length_type": "",
                    "closure_type": "",
                    "design_details": "",
                    "fabric": "",
                    "material_composition": "",
                    "fabric_properties": "",
                    "clothing_features": "",
                    "functional_benefits": "",
                    "color": "",
                    "color_family": "",
                    "print_pattern": "",
                    "occasion_end_use": "",
                    "style_aesthetic": "",
                }
            ]
        return [
            {
                "id": products[0]["id"],
                "lang": target_lang,
                "title_input": products[0]["title"],
                "title": products[0]["title"],
                "category_path": "玩具>滑行玩具",
                "tags": f"{target_lang}-tag1,{target_lang}-tag2",
                "target_audience": f"{target_lang}-audience",
                "usage_scene": "",
                "season": "",
                "key_attributes": "",
                "material": "",
                "features": "",
                "anchor_text": f"{target_lang}-anchor",
            }
        ]

    with mock.patch.object(
        product_enrich,
        "analyze_products",
        side_effect=fake_analyze_products,
    ):
        result = product_enrich.build_index_content_fields(
            items=[{"spu_id": "69960", "title": "dress"}],
            tenant_id="170",
        )

    assert result == [
        {
            "id": "69960",
            "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]},
            "enriched_tags": {"zh": ["zh-tag1", "zh-tag2"], "en": ["en-tag1", "en-tag2"]},
            "enriched_attributes": [
                {
                    "name": "enriched_tags",
                    "value": {
                        "zh": ["zh-tag1", "zh-tag2"],
                        "en": ["en-tag1", "en-tag2"],
                    },
                },
                {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}},
            ],
            "enriched_taxonomy_attributes": [
                {
                    "name": "Product Type",
                    "value": {"zh": ["zh-dress"], "en": ["en-dress"]},
                },
                {
                    "name": "Target Gender",
                    "value": {"zh": ["zh-women"], "en": ["en-women"]},
                },
                {
                    "name": "Season",
                    "value": {"zh": ["zh-summer"], "en": ["en-summer"]},
                },
            ],
        }
    ]
def test_build_index_content_fields_non_apparel_taxonomy_returns_en_only():
    seen_calls = []

    def fake_analyze_products(
        products,
        target_lang="zh",
        batch_size=None,
        tenant_id=None,
        analysis_kind="content",
        category_taxonomy_profile=None,
    ):
        seen_calls.append((analysis_kind, target_lang, category_taxonomy_profile, tuple(p["id"] for p in products)))
        if analysis_kind == "taxonomy":
            assert category_taxonomy_profile == "toys"
            assert target_lang == "en"
            return [
                {
                    "id": products[0]["id"],
                    "lang": "en",
                    "title_input": products[0]["title"],
                    "product_type": "doll set",
                    "age_group": "kids",
                    "character_theme": "",
                    "material": "",
                    "power_source": "",
                    "interactive_features": "",
                    "educational_play_value": "",
                    "piece_count_size": "",
                    "color": "",
                    "use_scenario": "",
                }
            ]

        return [
            {
                "id": product["id"],
                "lang": target_lang,
                "title_input": product["title"],
                "title": product["title"],
                "category_path": "",
                "tags": f"{target_lang}-tag",
                "target_audience": "",
                "usage_scene": "",
                "season": "",
                "key_attributes": "",
                "material": "",
                "features": "",
                "anchor_text": f"{target_lang}-anchor",
            }
            for product in products
        ]

    with mock.patch.object(product_enrich, "analyze_products", side_effect=fake_analyze_products):
        result = product_enrich.build_index_content_fields(
            items=[{"spu_id": "2", "title": "toy"}],
            tenant_id="170",
            category_taxonomy_profile="toys",
        )

    assert result == [
        {
            "id": "2",
            "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]},
            "enriched_tags": {"zh": ["zh-tag"], "en": ["en-tag"]},
            "enriched_attributes": [
                {
                    "name": "enriched_tags",
                    "value": {
                        "zh": ["zh-tag"],
                        "en": ["en-tag"],
                    },
                }
            ],
            "enriched_taxonomy_attributes": [
                {"name": "Product Type", "value": {"en": ["doll set"]}},
                {"name": "Age Group", "value": {"en": ["kids"]}},
            ],
        }
    ]
    assert ("taxonomy", "zh", "toys", ("2",)) not in seen_calls
    assert ("taxonomy", "en", "toys", ("2",)) in seen_calls


def test_anchor_cache_key_depends_on_product_input_not_identifiers():
    product_a = {
        "id": "1",
        "spu_id": "1001",
        "title": "dress",
        "brief": "soft cotton",
        "description": "summer dress",
        "image_url": "https://img/a.jpg",
    }
    product_b = {
        "id": "2",
        "spu_id": "9999",
        "title": "dress",
        "brief": "soft cotton",
        "description": "summer dress",
        "image_url": "https://img/a.jpg",
    }
    product_c = {
        "id": "1",
        "spu_id": "1001",
        "title": "dress",
        "brief": "soft cotton updated",
        "description": "summer dress",
        "image_url": "https://img/a.jpg",
    }

    key_a = product_enrich._make_anchor_cache_key(product_a, "zh")
    key_b = product_enrich._make_anchor_cache_key(product_b, "zh")
    key_c = product_enrich._make_anchor_cache_key(product_c, "zh")

    assert key_a == key_b
    assert key_a != key_c


def test_analysis_cache_key_isolated_by_analysis_kind():
    product = {
        "id": "1",
        "title": "dress",
        "brief": "soft cotton",
        "description": "summer dress",
    }

    content_key = product_enrich._make_analysis_cache_key(product, "zh", "content")
    taxonomy_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")

    assert content_key != taxonomy_key


def test_analysis_cache_key_changes_when_prompt_contract_changes():
    product = {
        "id": "1",
        "title": "dress",
        "brief": "soft cotton",
        "description": "summer dress",
    }

    original_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")

    with mock.patch.object(
        product_enrich,
        "USER_INSTRUCTION_TEMPLATE",
        "Please return JSON only. Language: {language}",
    ):
        changed_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy")

    assert original_key != changed_key


def test_build_prompt_input_text_appends_brief_and_description_for_short_title():
    product = {
        "title": "T恤",
        "brief": "夏季透气纯棉短袖，舒适亲肤",
        "description": "100%棉，圆领版型，适合日常通勤与休闲穿搭。",
    }

    text = product_enrich._build_prompt_input_text(product)

    assert text.startswith("T恤")
    assert "夏季透气纯棉短袖" in text
    assert "100%棉" in text


def test_build_prompt_input_text_truncates_non_cjk_by_words():
    product = {
        "title": "dress",
        "brief": " ".join(f"brief{i}" for i in range(50)),
        "description": " ".join(f"desc{i}" for i in range(50)),
    }

    text = product_enrich._build_prompt_input_text(product)

    assert len(text.split()) <= product_enrich.PROMPT_INPUT_MAX_WORDS