From a32754685e174673aa2918611e14a7eea7427887 Mon Sep 17 00:00:00 2001 From: tangwang Date: Sat, 11 Apr 2026 22:03:00 +0800 Subject: [PATCH] 已把本仓库里的 `/indexer/enrich-content` 本地实现清理掉了,并把 indexer 主链路里对这套实现的隐式依赖一起摘掉。 --- .env | 2 +- AGENTS.md | 17 +++++++++++++++++ README.md | 2 +- api/routes/indexer.py | 165 ++++----------------------------------------------------------------------------------------------------------------------------------------------------------------- config/config.yaml | 6 ------ config/env_config.py | 2 -- config/loader.py | 9 --------- config/schema.py | 10 ---------- docs/工作总结-微服务性能优化与架构.md | 20 ++++++++------------ docs/搜索API对接指南-00-总览与快速开始.md | 5 +++-- docs/搜索API对接指南-05-索引接口(Indexer).md | 179 ++++++++++++----------------------------------------------------------------------------------------------------------------------------------------------------------------------- docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md | 12 +++--------- docs/缓存与Redis使用说明.md | 85 +++++++++++-------------------------------------------------------------------------- indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md | 418 +++++++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- indexer/README.md | 18 +++++------------- indexer/document_transformer.py | 114 ------------------------------------------------------------------------------------------------------------------ indexer/incremental_service.py | 10 ---------- indexer/product_enrich.py | 1421 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- indexer/product_enrich_prompts.py | 849 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- indexer/product_enrich模块说明.md | 173 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- indexer/spu_transformer.py | 12 ------------ scripts/debug/trace_indexer_calls.sh | 2 +- scripts/redis/redis_cache_health_check.py | 38 ++++++-------------------------------- scripts/redis/redis_cache_prefix_stats.py | 2 +- tests/ci/test_service_api_contracts.py | 162 ++---------------------------------------------------------------------------------------------------------------------------------------------------------------- tests/test_llm_enrichment_batch_fill.py | 72 ------------------------------------------------------------------------ tests/test_process_products_batching.py | 104 -------------------------------------------------------------------------------------------------------- tests/test_product_enrich_partial_mode.py | 736 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28 files changed, 82 insertions(+), 4563 deletions(-) create mode 100644 AGENTS.md delete mode 100644 indexer/product_enrich.py delete mode 100644 indexer/product_enrich_prompts.py delete mode 100644 indexer/product_enrich模块说明.md delete mode 100644 tests/test_llm_enrichment_batch_fill.py delete mode 100644 tests/test_process_products_batching.py delete mode 100644 tests/test_product_enrich_partial_mode.py diff --git a/.env b/.env index 4a69515..8aa3cdd 100644 --- a/.env +++ b/.env @@ -4,7 +4,7 @@ ES_HOST=http://localhost:9200 ES_USERNAME=saas ES_PASSWORD=4hOaLaf41y2VuI8y -ES_AUTH="${ES_USERNAME}:${ES_PASSWORD}" +ES_AUTH="saas:4hOaLaf41y2VuI8y" # Redis Configuration (Optional) - AI 生产 10.200.16.14:6479 REDIS_HOST=10.200.16.14 diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..f8d9369 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,17 @@ +# FacetAwareMatching 协作记忆 + +## 开发原则 + +默认遵循以下错误处理原则: + +- 对于代码缺陷、逻辑疏漏、配置或资源缺失、违反统一约定等由自身原因导致的错误,应尽早暴露、快速失败,不做回退或容错处理,以保持代码精简、清晰、统一。 +- 对于线上超时、第三方接口异常等不可预见的外部错误,应提供必要的兜底、回退、重试或其他容错措施,以保证系统稳定性和业务连续性。 +- 进行功能迭代或重构时,默认直接面向最终方案和最优设计实现,不主动为历史实现、旧数据、过渡状态或遗留调用方式做兼容;优先推动代码回到统一约定和一致模型,避免长期并存的双轨逻辑、分支特判和临时过渡层。 + +## 落地要求 + +- 不要用静默吞错、默认值掩盖、隐式降级等方式隐藏内部问题。 +- 发现内部前置条件不满足时,应优先抛错、失败并暴露上下文。 +- 设计容错逻辑时,应明确区分“内部错误”和“外部错误”,避免把内部问题包装成可忽略事件。 +- 新设计一旦确定,应优先整体替换旧约定,而不是通过兼容旧行为来维持表面稳定。 +- 除非有明确、必要的外部兼容性约束,否则不要为内部历史包袱保留额外分支。 diff --git a/README.md b/README.md index 391e135..dbf796d 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ source activate.sh | `搜索API对接指南-01-搜索接口.md` | `POST /search/` 请求与响应 | | `搜索API对接指南-02-搜索建议与即时搜索.md` | 建议 / 即时搜索 | | `搜索API对接指南-03-获取文档.md` | `GET /search/{doc_id}` | -| `搜索API对接指南-05-索引接口(Indexer).md` | 索引与 `build-docs` / `enrich-content` 等 | +| `搜索API对接指南-05-索引接口(Indexer).md` | 索引与 `build-docs` 等(`enrich-content` 已迁出) | | `搜索API对接指南-06-管理接口(Admin).md` | `/admin/*` | | `搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md` | 6005/6006/6007/6008 等直连说明 | | `搜索API对接指南-08-数据模型与字段速查.md` | 字段与数据模型 | diff --git a/api/routes/indexer.py b/api/routes/indexer.py index 1abe603..c8f4bc3 100644 --- a/api/routes/indexer.py +++ b/api/routes/indexer.py @@ -7,7 +7,7 @@ import asyncio import re from fastapi import APIRouter, HTTPException -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Dict, List, Optional from pydantic import BaseModel, Field import logging from sqlalchemy import text @@ -19,11 +19,6 @@ logger = logging.getLogger(__name__) router = APIRouter(prefix="/indexer", tags=["indexer"]) -SUPPORTED_CATEGORY_TAXONOMY_PROFILES = ( - "apparel, 3c, bags, pet_supplies, electronics, outdoor, " - "home_appliances, home_living, wigs, beauty, accessories, toys, shoes, sports, others" -) - class ReindexRequest(BaseModel): """全量重建索引请求""" @@ -64,6 +59,7 @@ class BuildDocsRequest(BaseModel): 该接口是 Java 等外部索引程序正式使用的“doc 生成接口”: - 上游负责:全量 / 增量调度 + 从 MySQL 查询出各表数据 - 本模块负责:根据配置和算法,将原始行数据转换为与 mappings/search_products.json 一致的 ES 文档 + - 注意:已迁出的 `/indexer/enrich-content` 内容理解能力不再由本接口内置生成 """ tenant_id: str = Field(..., description="租户 ID,用于加载租户配置、语言策略等") items: List[BuildDocItem] = Field(..., description="需要构建 doc 的 SPU 列表(含其 SKUs 和 Options)") @@ -82,55 +78,6 @@ class BuildDocsFromDbRequest(BaseModel): spu_ids: List[str] = Field(..., description="需要构建 doc 的 SPU ID 列表") -class EnrichContentItem(BaseModel): - """单条待生成内容理解字段的商品。""" - spu_id: str = Field(..., description="SPU ID") - title: str = Field(..., description="商品标题,用于 LLM 分析生成 qanchors / enriched_tags 等") - image_url: Optional[str] = Field(None, description="商品主图 URL(预留给多模态/内容理解扩展)") - brief: Optional[str] = Field(None, description="商品简介/短描述") - description: Optional[str] = Field(None, description="商品详情/长描述") - - -class EnrichContentRequest(BaseModel): - """ - 内容理解字段生成请求:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 - 供外部 indexer 在自行组织 doc 时调用,与翻译、向量化等微服务并列。 - """ - tenant_id: str = Field(..., description="租户 ID,用于请求路由与结果归属,不参与缓存键") - items: List[EnrichContentItem] = Field(..., description="待分析的 SPU 列表(spu_id + title,可附带 brief/description/image_url)") - enrichment_scopes: Optional[List[Literal["generic", "category_taxonomy"]]] = Field( - default=None, - description=( - "要执行的增强范围。" - "`generic` 返回 qanchors/enriched_tags/enriched_attributes;" - "`category_taxonomy` 返回 enriched_taxonomy_attributes。" - "默认两者都执行。" - ), - ) - category_taxonomy_profile: str = Field( - "apparel", - description=( - "品类 taxonomy profile。默认 `apparel`。" - f"当前支持:{SUPPORTED_CATEGORY_TAXONOMY_PROFILES}。" - "其中除 `apparel` 外,其余 profile 的 taxonomy 输出仅返回 `en`。" - ), - ) - analysis_kinds: Optional[List[Literal["content", "taxonomy"]]] = Field( - default=None, - description="Deprecated alias of enrichment_scopes. `content` -> `generic`, `taxonomy` -> `category_taxonomy`.", - ) - - def resolved_enrichment_scopes(self) -> List[str]: - if self.enrichment_scopes: - return list(self.enrichment_scopes) - if self.analysis_kinds: - mapped = [] - for item in self.analysis_kinds: - mapped.append("generic" if item == "content" else "category_taxonomy") - return mapped - return ["generic", "category_taxonomy"] - - @router.post("/reindex") async def reindex_all(request: ReindexRequest): """ @@ -239,8 +186,9 @@ async def build_docs(request: BuildDocsRequest): 使用场景: - 上游(例如 Java 索引程序)已经从 MySQL 查询出了 SPU / SKU / Option 等原始行数据 - - 希望复用本项目的全部“索引富化”能力(多语言、翻译、向量、规格聚合等) + - 希望复用本项目当前保留的“索引构建”能力(多语言、翻译、向量、规格聚合等) - 只需要拿到与 `mappings/search_products.json` 一致的 doc 列表,由上游自行写入 ES + - 如需 `qanchors` / `enriched_attributes` / `enriched_taxonomy_attributes`,请由外部内容理解服务生成后再自行合并 """ try: if not request.items: @@ -260,7 +208,6 @@ async def build_docs(request: BuildDocsRequest): import pandas as pd docs: List[Dict[str, Any]] = [] - doc_spu_rows: List[pd.Series] = [] failed: List[Dict[str, Any]] = [] for item in request.items: @@ -276,7 +223,6 @@ async def build_docs(request: BuildDocsRequest): spu_row=spu_row, skus=skus_df, options=options_df, - fill_llm_attributes=False, ) if doc is None: @@ -316,7 +262,6 @@ async def build_docs(request: BuildDocsRequest): doc["title_embedding"] = emb0.tolist() docs.append(doc) - doc_spu_rows.append(spu_row) except Exception as e: failed.append( { @@ -325,13 +270,6 @@ async def build_docs(request: BuildDocsRequest): } ) - # 批量填充 LLM 字段(尽量攒批,每次最多 20 条;失败仅 warning,不影响 build-docs 主功能) - try: - if docs and doc_spu_rows: - transformer.fill_llm_attributes_batch(docs, doc_spu_rows) - except Exception as e: - logger.warning("Batch LLM fill failed in build-docs (tenant_id=%s): %s", request.tenant_id, e) - return { "tenant_id": request.tenant_id, "docs": docs, @@ -476,101 +414,6 @@ async def build_docs_from_db(request: BuildDocsFromDbRequest): raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") -def _run_enrich_content( - tenant_id: str, - items: List[Dict[str, str]], - enrichment_scopes: Optional[List[str]] = None, - category_taxonomy_profile: str = "apparel", -) -> List[Dict[str, Any]]: - """ - 同步执行内容理解,返回与 ES mapping 对齐的字段结构。 - 语言策略由 product_enrich 内部统一决定,路由层不参与。 - """ - from indexer.product_enrich import build_index_content_fields - - results = build_index_content_fields( - items=items, - tenant_id=tenant_id, - enrichment_scopes=enrichment_scopes, - category_taxonomy_profile=category_taxonomy_profile, - ) - return [ - { - "spu_id": item["id"], - "qanchors": item["qanchors"], - "enriched_attributes": item["enriched_attributes"], - "enriched_tags": item["enriched_tags"], - "enriched_taxonomy_attributes": item["enriched_taxonomy_attributes"], - **({"error": item["error"]} if item.get("error") else {}), - } - for item in results - ] - - -@router.post("/enrich-content") -async def enrich_content(request: EnrichContentRequest): - """ - 内容理解字段生成接口:根据商品标题批量生成通用增强字段与品类 taxonomy 字段。 - - 使用场景: - - 外部 indexer 采用「微服务组合」方式自己组织 doc 时,可调用本接口获取 LLM 生成的 - 锚文本与语义属性,再与翻译、向量化结果合并写入 ES。 - - 与 /indexer/build-docs 解耦,避免 build-docs 因 LLM 耗时过长而阻塞;调用方可 - 先拿不含 qanchors/enriched_tags/taxonomy attributes 的 doc,再异步或离线补齐本接口结果后更新 ES。 - - 实现逻辑与 indexer.product_enrich.build_index_content_fields 一致,支持多语言与 Redis 缓存。 - """ - try: - if not request.items: - raise HTTPException(status_code=400, detail="items cannot be empty") - if len(request.items) > 50: - raise HTTPException( - status_code=400, - detail="Maximum 50 items per request for enrich-content (LLM batch limit)", - ) - - items_payload = [ - { - "spu_id": it.spu_id, - "title": it.title or "", - "brief": it.brief or "", - "description": it.description or "", - "image_url": it.image_url or "", - } - for it in request.items - ] - loop = asyncio.get_event_loop() - enrichment_scopes = request.resolved_enrichment_scopes() - result = await loop.run_in_executor( - None, - lambda: _run_enrich_content( - tenant_id=request.tenant_id, - items=items_payload, - enrichment_scopes=enrichment_scopes, - category_taxonomy_profile=request.category_taxonomy_profile, - ), - ) - return { - "tenant_id": request.tenant_id, - "enrichment_scopes": enrichment_scopes, - "category_taxonomy_profile": request.category_taxonomy_profile, - "results": result, - "total": len(result), - } - except HTTPException: - raise - except RuntimeError as e: - if "DASHSCOPE_API_KEY" in str(e) or "cannot call LLM" in str(e).lower(): - raise HTTPException( - status_code=503, - detail="Content understanding service unavailable: DASHSCOPE_API_KEY not set", - ) - raise HTTPException(status_code=500, detail=str(e)) - except Exception as e: - logger.error(f"Error in enrich-content for tenant_id={request.tenant_id}: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") - - @router.post("/documents") async def get_documents(request: GetDocumentsRequest): """ diff --git a/config/config.yaml b/config/config.yaml index 2ef3790..3b807b2 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -38,8 +38,6 @@ infrastructure: retry_on_timeout: false cache_expire_days: 720 embedding_cache_prefix: embedding - anchor_cache_prefix: product_anchors - anchor_cache_expire_days: 30 database: host: null port: 3306 @@ -60,10 +58,6 @@ indexes: [] assets: query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict -# Product content understanding (LLM enrich-content) configuration -product_enrich: - max_workers: 40 - # 离线 / Web 相关性评估(scripts/evaluation、eval-web) # CLI 未显式传参时使用此处默认值;search_base_url 未配置时自动为 http://127.0.0.1:{runtime.api_port} search_evaluation: diff --git a/config/env_config.py b/config/env_config.py index 6264e4b..2485c33 100644 --- a/config/env_config.py +++ b/config/env_config.py @@ -46,8 +46,6 @@ def _redis_dict() -> Dict[str, Any]: "retry_on_timeout": cfg.retry_on_timeout, "cache_expire_days": cfg.cache_expire_days, "embedding_cache_prefix": cfg.embedding_cache_prefix, - "anchor_cache_prefix": cfg.anchor_cache_prefix, - "anchor_cache_expire_days": cfg.anchor_cache_expire_days, } diff --git a/config/loader.py b/config/loader.py index 5306f8c..0d512cd 100644 --- a/config/loader.py +++ b/config/loader.py @@ -38,7 +38,6 @@ from config.schema import ( IndexConfig, InfrastructureConfig, QueryConfig, - ProductEnrichConfig, RedisSettings, RerankConfig, RerankFusionConfig, @@ -260,10 +259,6 @@ class AppConfigLoader: runtime_config = self._build_runtime_config() infrastructure_config = self._build_infrastructure_config(runtime_config.environment) - product_enrich_raw = raw.get("product_enrich") if isinstance(raw.get("product_enrich"), dict) else {} - product_enrich_config = ProductEnrichConfig( - max_workers=int(product_enrich_raw.get("max_workers", 40)), - ) search_evaluation_config = self._build_search_evaluation_config(raw, runtime_config) metadata = ConfigMetadata( @@ -275,7 +270,6 @@ class AppConfigLoader: app_config = AppConfig( runtime=runtime_config, infrastructure=infrastructure_config, - product_enrich=product_enrich_config, search=search_config, services=services_config, tenants=tenants_config, @@ -288,7 +282,6 @@ class AppConfigLoader: return AppConfig( runtime=app_config.runtime, infrastructure=app_config.infrastructure, - product_enrich=app_config.product_enrich, search=app_config.search, services=app_config.services, tenants=app_config.tenants, @@ -838,8 +831,6 @@ class AppConfigLoader: retry_on_timeout=os.getenv("REDIS_RETRY_ON_TIMEOUT", "false").strip().lower() == "true", cache_expire_days=int(os.getenv("REDIS_CACHE_EXPIRE_DAYS", 360 * 2)), embedding_cache_prefix=os.getenv("REDIS_EMBEDDING_CACHE_PREFIX", "embedding"), - anchor_cache_prefix=os.getenv("REDIS_ANCHOR_CACHE_PREFIX", "product_anchors"), - anchor_cache_expire_days=int(os.getenv("REDIS_ANCHOR_CACHE_EXPIRE_DAYS", 30)), ), database=DatabaseSettings( host=os.getenv("DB_HOST"), diff --git a/config/schema.py b/config/schema.py index cbd4328..4347406 100644 --- a/config/schema.py +++ b/config/schema.py @@ -323,8 +323,6 @@ class RedisSettings: retry_on_timeout: bool = False cache_expire_days: int = 720 embedding_cache_prefix: str = "embedding" - anchor_cache_prefix: str = "product_anchors" - anchor_cache_expire_days: int = 30 @dataclass(frozen=True) @@ -351,13 +349,6 @@ class InfrastructureConfig: @dataclass(frozen=True) -class ProductEnrichConfig: - """Configuration for LLM-based product content understanding (enrich-content).""" - - max_workers: int = 40 - - -@dataclass(frozen=True) class RuntimeConfig: environment: str = "prod" index_namespace: str = "" @@ -430,7 +421,6 @@ class AppConfig: runtime: RuntimeConfig infrastructure: InfrastructureConfig - product_enrich: ProductEnrichConfig search: SearchConfig services: ServicesConfig tenants: TenantCatalogConfig diff --git a/docs/工作总结-微服务性能优化与架构.md b/docs/工作总结-微服务性能优化与架构.md index 115e6ee..a5958fb 100644 --- a/docs/工作总结-微服务性能优化与架构.md +++ b/docs/工作总结-微服务性能优化与架构.md @@ -93,19 +93,15 @@ instruction: "Given a shopping query, rank product titles by relevance" --- -### 5. 内容理解字段(支撑 Suggest) +### 5. 内容理解字段(已迁出) -**能力**:支持根据商品标题批量生成 **qanchors**(锚文本)、**enriched_attributes**、**tags**,供索引与 suggest 使用。 +`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` 这些字段模型仍保留在索引结构里,`suggestion/builder.py` 等消费侧也仍可继续使用 ES 中已有的 `qanchors`。但字段生成服务与其本地实现已经迁移到独立项目,本仓库不再提供 `/indexer/enrich-content`,也不再在 indexer 构建链路内自动补齐这些字段。 -**具体内容**: -- **接口**:`POST /indexer/enrich-content`(FacetAwareMatching 服务端口 **6001**)。请求体为 `items` 数组,每项含 `spu_id`、`title`(必填)及可选多语言标题等;单次请求最多 **50 条**,建议批量调用。响应 `results` 与 `items` 一一对应,每项含 `spu_id`、`qanchors`(按语言键,如 `qanchors.zh`、`qanchors.en`,逗号分隔短语)、`enriched_attributes`、`tags`。 -- **索引侧**:微服务组合方式下,调用方先拿不含 qanchors/tags 的 doc,再调用本接口补齐后写入 ES 的 `qanchors.{lang}` 等字段;索引 transformer(`indexer/document_transformer.py`、`indexer/product_enrich.py`)内也可在构建 doc 时调用内容理解逻辑,写入 `qanchors.{lang}`。 -- **Suggest 侧**:`suggestion/builder.py` 从 ES 商品索引读取 `_source: ["id", "spu_id", "title", "qanchors"]`,对 `qanchors.{lang}` 用 `_split_qanchors` 拆成词条,以 `source="qanchor"` 加入候选,排序时 `qanchor` 权重大于纯 title(`add_product("qanchor", ...)`);suggest 配置中 `sources: ["query_log", "qanchor"]` 表示候选来源包含 qanchor。 -- **实现与依赖**:内容理解内部使用大模型(需 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存(如 `product_anchors`);逻辑与 `indexer/product_enrich` 一致。 - -**状态**:内容理解字段已接入索引与 suggest 链路;依赖内容理解(qanchors/tags)的**全量数据尚未全部完成一轮**,后续需持续跑满并校验效果。 +当前边界: -详见:`indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md`、`docs/搜索API对接指南-05-索引接口(Indexer).md`(`enrich-content` 等)、`api/routes/indexer.py`(enrich-content 路由)。 +- 本仓库负责基础 doc 构建、多语言字段、向量、规格聚合等索引能力。 +- 独立内容理解服务负责生成 `qanchors` / `enriched_*`。 +- 上游索引程序负责把两侧结果合并后写入 ES。 --- @@ -145,7 +141,7 @@ instruction: "Given a shopping query, rank product titles by relevance" - **增量示例**:`./scripts/build_suggestions.sh 162 --mode incremental --overlap-minutes 30`(按 watermark 增量更新);脚本内部调用 `main.py build-suggestions --tenant-id ...`。 - 构建逻辑在 `suggestion/builder.py` 的 `SuggestionIndexBuilder`:从 ES 商品索引(含 `title`、`qanchors`)与查询日志等拉取数据,写入 versioned 建议索引并切换 alias。 - **尚未完成的“增量机制”**:指**自动/事件驱动的增量**(如商品变更或日志写入时自动刷新建议索引);当前 incremental 模式为“按 watermark 再跑一次构建”,仍为脚本主动触发,非持续增量流水线。 -- **依赖**:suggest 候选依赖商品侧 **内容理解字段**(qanchors/tags);`sources: ["query_log", "qanchor"]` 表示候选来自查询日志与 qanchor;当前内容理解未全量跑完一轮,suggest 数据会随全量重建逐步完善。 +- **依赖**:suggest 候选依赖商品侧 **内容理解字段**(qanchors/tags);`sources: ["query_log", "qanchor"]` 表示候选来自查询日志与 qanchor。字段生成职责已迁移到独立内容理解服务。 详见:`suggestion/builder.py`、`suggestion/ARCHITECTURE_V2.md`、`main.py`(build-suggestions 子命令)。 @@ -241,7 +237,7 @@ cd /data/saas-search | **Embedding** | TEI 替代 SentenceTransformers/vLLM 作为文本向量后端,兼顾性能与工程化(Docker、配置化、T4 调优);图片向量由 clip-as-service 承担。 | | **Reranker** | vLLM + Qwen3-Reranker-0.6B,针对 T4 做 float16、prefix caching、CUDA 图、按长度分批及 batch/长度参数搜索;高并发场景可选用 DashScope 云后端。 | | **翻译** | 因 qwen-mt 限速(RPM≈60),迁移至可配置的 qwen-flash 等方案,支撑在线索引与 query;需金伟侧对索引做流量控制。 | -| **内容理解** | 提供 qanchors/tags 等字段生成接口,支撑 suggest 与检索增强;全量一轮尚未完全跑满。 | +| **内容理解** | 字段模型仍可被检索与 suggest 消费,但生成服务已迁移到独立项目;本仓库不再内置该实现。 | | **架构** | Provider 动态选择翻译;service_ctl 统一监控与拉起;suggest 目前全量脚本触发,增量待做。 | | **性能基线** | 向量化扩展性良好;reranker 为整链瓶颈(386 docs 约 0.6 rps);search 约 8 rps;suggest 约 200+ rps。 | diff --git a/docs/搜索API对接指南-00-总览与快速开始.md b/docs/搜索API对接指南-00-总览与快速开始.md index 1aecfe3..48f2669 100644 --- a/docs/搜索API对接指南-00-总览与快速开始.md +++ b/docs/搜索API对接指南-00-总览与快速开始.md @@ -90,7 +90,6 @@ curl -X POST "http://43.166.252.75:6002/search/" \ | 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | | 构建ES文档(正式对接) | POST | `/indexer/build-docs` | 基于上游提供的 MySQL 行数据构建 ES doc,不写入 ES,供 Java 等调用后自行写入 | | 构建ES文档(测试用) | POST | `/indexer/build-docs-from-db` | 仅在测试/调试时使用,根据 `tenant_id + spu_ids` 内部查库并构建 ES doc | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、tags,供微服务组合方式使用(独立服务端口 6001) | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | | 健康检查 | GET | `/admin/health` | 服务健康检查 | | 获取配置 | GET | `/admin/config` | 获取租户配置 | @@ -104,6 +103,8 @@ curl -X POST "http://43.166.252.75:6002/search/" \ | 向量服务(图片) | 6008 | `POST /embed/image` | 图片向量化 | | 翻译服务 | 6006 | `POST /translate` | 文本翻译(支持 qwen-mt / llm / deepl / 本地模型) | | 重排服务 | 6007 | `POST /rerank` | 检索结果重排 | -| 内容理解(独立服务) | 6001 | `POST /indexer/enrich-content` | 根据商品标题生成 qanchors、tags 等,供 indexer 微服务组合方式使用 | +--- + +> 注:`/indexer/enrich-content` 已迁移到独立项目,不再由本仓库的 Indexer 服务提供;本仓库保留 `build-docs` / `build-docs-from-db` 等索引构建接口。 --- diff --git a/docs/搜索API对接指南-05-索引接口(Indexer).md b/docs/搜索API对接指南-05-索引接口(Indexer).md index 4856716..e478035 100644 --- a/docs/搜索API对接指南-05-索引接口(Indexer).md +++ b/docs/搜索API对接指南-05-索引接口(Indexer).md @@ -13,7 +13,6 @@ | 查询文档 | POST | `/indexer/documents` | 按 SPU ID 列表查询 ES 文档,不写入 ES | | 构建 ES 文档(正式) | POST | `/indexer/build-docs` | 由上游提供 MySQL 行数据,返回 ES-ready 文档,不写 ES | | 构建 ES 文档(测试) | POST | `/indexer/build-docs-from-db` | 由本服务查库并构建文档,仅测试/调试用 | -| 内容理解字段生成 | POST | `/indexer/enrich-content` | 根据商品标题批量生成 qanchors、enriched_attributes、tags(供微服务组合方式使用;独立服务端口 6001) | | 索引健康检查 | GET | `/indexer/health` | 检查索引服务与数据库连接状态 | #### 5.0 支撑外部 indexer 的三种方式 @@ -22,8 +21,8 @@ | 方式 | 说明 | 适用场景 | |------|------|----------| -| **1)doc 填充接口** | 调用 `POST /indexer/build-docs` 或 `POST /indexer/build-docs-from-db`,由本服务基于 MySQL 行数据构建完整 ES 文档(含多语言、向量、规格等),**不写入 ES**,由调用方自行写入。 | 希望一站式拿到 ES-ready doc,由己方控制写 ES 的时机与索引名。 | -| **2)微服务组合** | 单独调用**翻译**、**向量化**、**内容理解字段生成**等接口,由 indexer 程序自己组装 doc 并写入 ES。翻译与向量化为独立微服务(见第 7 节);内容理解为 FacetAwareMatching 独立服务接口 `POST /indexer/enrich-content`(端口 6001)。 | 需要灵活编排、或希望将 LLM/向量等耗时步骤与主链路解耦(如异步补齐 qanchors/tags)。 | +| **1)doc 填充接口** | 调用 `POST /indexer/build-docs` 或 `POST /indexer/build-docs-from-db`,由本服务基于 MySQL 行数据构建 ES 文档(含多语言、向量、规格等),**不写入 ES**,由调用方自行写入。 | 希望一站式拿到 ES-ready doc,由己方控制写 ES 的时机与索引名。 | +| **2)微服务组合** | 单独调用**翻译**、**向量化**、**外部内容理解服务**等能力,由 indexer 程序自己组装 doc 并写入 ES。翻译与向量化见第 7 节;内容理解字段生成已迁移到独立项目,不再由本仓库维护。 | 需要灵活编排、或希望将 LLM/向量等耗时步骤与主链路解耦(如异步补齐 qanchors/tags)。 | | **3)本服务直接写 ES** | 调用全量索引 `POST /indexer/reindex`、增量索引 `POST /indexer/index`(指定 SPU ID 列表),由本服务从 MySQL 拉数并直接写入 ES。 | 自建运维、联调或不需要由 Java 写 ES 的场景。 | - **方式 1** 与 **方式 2** 下,ES 的写入方均为外部 indexer(或 Java),职责清晰。 @@ -645,174 +644,20 @@ curl -X POST "http://127.0.0.1:6004/indexer/build-docs-from-db" \ 返回结构与 `/indexer/build-docs` 相同,可直接用于对比 ES 实际文档或调试字段映射问题。 -### 5.8 内容理解字段生成接口 - -- **端点**: `POST /indexer/enrich-content` -- **服务**: FacetAwareMatching 独立服务(默认端口 **6001**;由 `/data/FacetAwareMatching/scripts/service_ctl.sh` 管理) -- **描述**: 根据商品内容信息批量生成 **qanchors**(锚文本)、**enriched_attributes**(通用语义属性)、**enriched_tags**(细分标签)、**enriched_taxonomy_attributes**(taxonomy 结构化属性),供外部 indexer 在「微服务组合」方式下自行拼装 doc 时使用。请求以 `items[]` 传入商品内容字段(必填/可选见下表)。接口只暴露商品内容输入,语言选择、分析维度与最终字段结构统一由 FacetAwareMatching 的 `product_enrich` 内部决定;当前返回结果与 `search_products` mapping 保持一致。单次请求在线程池中执行,避免阻塞其他接口。 - -当前支持的 `category_taxonomy_profile`: -- `apparel` -- `3c` -- `bags` -- `pet_supplies` -- `electronics` -- `outdoor` -- `home_appliances` -- `home_living` -- `wigs` -- `beauty` -- `accessories` -- `toys` -- `shoes` -- `sports` -- `others` - -说明: -- 所有 profile 的 `enriched_taxonomy_attributes.value` 都统一返回 `zh` + `en`。 -- 外部调用 `/indexer/enrich-content` 时,以请求中的 `category_taxonomy_profile` 为准。 -- 若 indexer 内部仍接入内容理解能力,taxonomy profile 请在调用侧显式传入(建议仍以租户行业配置为准)。 +### 5.8 内容理解字段生成能力(已迁出) -#### 请求参数 - -```json -{ - "tenant_id": "170", - "enrichment_scopes": ["generic", "category_taxonomy"], - "category_taxonomy_profile": "apparel", - "items": [ - { - "spu_id": "223167", - "title": "纯棉短袖T恤 夏季男装", - "brief": "夏季透气纯棉短袖,舒适亲肤", - "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", - "image_url": "https://example.com/images/223167.jpg" - }, - { - "spu_id": "223168", - "title": "12PCS Dolls with Bottles", - "image_url": "https://example.com/images/223168.jpg" - } - ] -} -``` +`/indexer/enrich-content` 已迁移到独立项目,本仓库当前的 Indexer 服务(默认端口 `6004`)**不再暴露该接口**,也**不再在** `/indexer/build-docs`、`/indexer/build-docs-from-db`、`/indexer/reindex`、`/indexer/index` 的构建链路里内置生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`。 -| 参数 | 类型 | 必填 | 默认值 | 说明 | -|------|------|------|--------|------| -| `tenant_id` | string | Y | - | 租户 ID。目前仅用于记录日志,不产生实际作用| -| `enrichment_scopes` | array[string] | N | `["generic", "category_taxonomy"]` | 选择要执行的增强范围。`generic` 生成 `qanchors`/`enriched_tags`/`enriched_attributes`,`category_taxonomy` 生成 `enriched_taxonomy_attributes` | -| `category_taxonomy_profile` | string | N | `apparel` | 品类 taxonomy profile。支持:`apparel`、`3c`、`bags`、`pet_supplies`、`electronics`、`outdoor`、`home_appliances`、`home_living`、`wigs`、`beauty`、`accessories`、`toys`、`shoes`、`sports`、`others` | -| `items` | array | Y | - | 待分析列表;**单次最多 50 条** | - -`items[]` 字段说明: - -| 字段 | 类型 | 必填 | 说明 | -|------|------|------|------| -| `spu_id` | string | Y | SPU ID,用于回填结果;目前仅用于记录日志,不产生实际作用| -| `title` | string | Y | 商品标题 | -| `image_url` | string | N | 商品主图 URL;当前仅透传,暂未参与 prompt 与缓存键,后续可用于图像/多模态内容理解 | -| `brief` | string | N | 商品简介/短描述;当前会参与 prompt 与缓存键 | -| `description` | string | N | 商品详情/长描述;当前会参与 prompt 与缓存键 | - -缓存说明: - -- 内容缓存按 **增强范围 + taxonomy profile** 拆分;`generic` 与 `category_taxonomy:apparel` 等使用不同缓存命名空间,互不污染、可独立演进。 -- 缓存键由 `analysis_kind + target_lang + prompt/schema 版本指纹 + prompt 输入文本 hash` 构成;对 category taxonomy 来说,profile 会进入 schema 标识与版本指纹。 -- 当前真正参与 prompt 输入的字段是:`title`、`brief`、`description`;这些字段任一变化,都会落到新的缓存 key。 -- `prompt/schema 版本指纹` 会综合 system prompt、shared instruction、localized table headers、result fields、user instruction template 等信息生成;因此只要提示词或输出契约变化,旧缓存会自然失效。 -- `tenant_id`、`spu_id` 只用于请求归属与结果回填,不参与缓存键。 -- 因此,输入内容与 prompt 契约都不变时可跨请求直接命中缓存;任一一侧变化,都会自然落到新的缓存 key。 +当前建议的对接方式: -语言说明: +1. 调用本仓库的 `POST /indexer/build-docs` 或 `POST /indexer/build-docs-from-db` 生成基础 ES 文档。 +2. 调用独立内容理解服务生成 `qanchors` / `enriched_*` 字段。 +3. 由上游索引程序自行合并字段后写入 ES。 -- 接口不接受语言控制参数。 -- 返回哪些语言、返回哪些语义维度,统一由 `indexer.product_enrich` 内部逻辑决定。 -- 当前为了与 `search_products` mapping 对齐,通用增强字段与 taxonomy 字段都统一只返回核心索引语言 `zh`、`en`。 +补充说明: -批量请求建议: -- **全量**:强烈建议 尽可能 **20 个 SPU/doc** 攒成一个批次后再请求一次。 -- **增量**:可按时效要求设置时间窗口(例如 **5 分钟**),在窗口内尽可能攒到 **20 个**;达到 20 或窗口到期就发送一次请求。 -- 允许超过20,服务内部会拆分成小批次逐个处理。也允许小于20,但是将造成费用和耗时的成本上升,特别是每次请求一个doc的情况。 - -#### 响应格式 - -```json -{ - "tenant_id": "170", - "enrichment_scopes": ["generic", "category_taxonomy"], - "category_taxonomy_profile": "apparel", - "total": 2, - "results": [ - { - "spu_id": "223167", - "qanchors": { - "zh": ["短袖T恤", "纯棉", "男装", "夏季"], - "en": ["cotton t-shirt", "short sleeve", "men", "summer"] - }, - "enriched_tags": { - "zh": ["纯棉", "短袖", "男装"], - "en": ["cotton", "short sleeve", "men"] - }, - "enriched_attributes": [ - { "name": "enriched_tags", "value": { "zh": "纯棉" } }, - { "name": "usage_scene", "value": { "zh": "日常" } }, - { "name": "enriched_tags", "value": { "en": "cotton" } } - ], - "enriched_taxonomy_attributes": [ - { "name": "Product Type", "value": { "zh": ["T恤"], "en": ["t-shirt"] } }, - { "name": "Target Gender", "value": { "zh": ["男"], "en": ["men"] } }, - { "name": "Season", "value": { "zh": ["夏季"], "en": ["summer"] } } - ] - }, - { - "spu_id": "223168", - "qanchors": { - "en": ["dolls", "toys", "12pcs"] - }, - "enriched_tags": { - "en": ["dolls", "toys"] - }, - "enriched_attributes": [], - "enriched_taxonomy_attributes": [] - } - ] -} -``` - -| 字段 | 类型 | 说明 | -|------|------|------| -| `enrichment_scopes` | array | 实际执行的增强范围列表 | -| `category_taxonomy_profile` | string | 实际使用的品类 taxonomy profile | -| `results` | array | 与请求 `items` 一一对应,每项含 `spu_id`、`qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` | -| `results[].qanchors` | object | 与 ES `qanchors` 字段同结构,按语言键返回短语数组 | -| `results[].enriched_tags` | object | 与 ES `enriched_tags` 字段同结构,按语言键返回标签数组 | -| `results[].enriched_attributes` | array | 与 ES `enriched_attributes` nested 字段同结构,每项为 `{ "name", "value": { "zh"?: "...", "en"?: "..." } }` | -| `results[].enriched_taxonomy_attributes` | array | 与 ES `enriched_taxonomy_attributes` nested 字段同结构。每项通常为 `{ "name", "value": { "zh"?: [...], "en"?: [...] } }` | -| `results[].error` | string | 若该条处理失败(如 LLM 异常),会在此字段返回错误信息 | - -**错误响应**: -- `400`: `items` 为空或超过 50 条 -- `503`: 未配置 `DASHSCOPE_API_KEY`,内容理解服务不可用 - -#### 请求示例 - -```bash -curl -X POST "http://localhost:6001/indexer/enrich-content" \ - -H "Content-Type: application/json" \ - -d '{ - "tenant_id": "163", - "enrichment_scopes": ["generic", "category_taxonomy"], - "category_taxonomy_profile": "apparel", - "items": [ - { - "spu_id": "223167", - "title": "纯棉短袖T恤 夏季男装夏季男装", - "brief": "夏季透气纯棉短袖,舒适亲肤", - "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", - "image_url": "https://example.com/images/223167.jpg" - } - ] - }' -``` +- `search_products` mapping 仍保留上述字段,便于独立内容理解服务继续产出并写入。 +- `suggestion` 等消费侧仍可读取 ES 中已有的 `qanchors` 字段;迁移的是“生成实现”,不是字段模型本身。 +- 本文档不再维护独立内容理解服务的请求/响应细节,请以对应独立项目的文档为准。 --- diff --git a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md index 30ee77c..6311cf5 100644 --- a/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md +++ b/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md @@ -1,6 +1,6 @@ # 搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation) -本篇覆盖向量服务(Embedding)、重排服务(Reranker)、翻译服务(Translation)以及 Indexer 服务内的内容理解字段生成(原文第 7 章)。 +本篇覆盖向量服务(Embedding)、重排服务(Reranker)与翻译服务(Translation)。原先收录的 `/indexer/enrich-content` 内容理解接口已迁移到独立项目,不再由本仓库维护。 ## 7. 微服务接口(向量、重排、翻译) @@ -438,14 +438,8 @@ curl "http://localhost:6006/health" } ``` -### 7.4 内容理解字段生成(Indexer 服务内) +### 7.4 内容理解字段生成(已迁出) -内容理解字段生成接口部署在 **Indexer 服务**(默认端口 6004)内,与「翻译、向量化」等独立端口微服务并列,供采用**微服务组合**方式的 indexer 调用。 - -- **Base URL**: Indexer 服务地址,如 `http://localhost:6004` -- **路径**: `POST /indexer/enrich-content` -- **说明**: 根据商品标题批量生成 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes`,用于拼装 ES 文档。支持通过 `enrichment_scopes` 选择执行 `generic` / `category_taxonomy`,并通过 `category_taxonomy_profile` 选择对应大类的 taxonomy prompt/profile;默认执行 `generic + category_taxonomy(apparel)`。当前支持的 taxonomy profile 包括 `apparel`、`3c`、`bags`、`pet_supplies`、`electronics`、`outdoor`、`home_appliances`、`home_living`、`wigs`、`beauty`、`accessories`、`toys`、`shoes`、`sports`、`others`。所有 profile 的 taxonomy 输出都统一返回 `zh` + `en`,`category_taxonomy_profile` 只决定字段集合。内部使用大模型(需配置 `DASHSCOPE_API_KEY`),支持多语言与 Redis 缓存;单次最多 50 条,建议批量调用以提升效率。 - -请求/响应格式、示例及错误码见 [-05-索引接口(Indexer)](./搜索API对接指南-05-索引接口(Indexer).md#58-内容理解字段生成接口)。 +`/indexer/enrich-content` 已迁移到独立项目,不再属于本仓库的微服务接口集合。当前仓库中的 Indexer 服务(`6004`)不再提供该接口;如需 `qanchors` / `enriched_*` 字段,请接入对应独立服务,并与本仓库的 `build-docs` 输出在上游侧自行合并。 --- diff --git a/docs/缓存与Redis使用说明.md b/docs/缓存与Redis使用说明.md index 32baa10..f673712 100644 --- a/docs/缓存与Redis使用说明.md +++ b/docs/缓存与Redis使用说明.md @@ -4,7 +4,6 @@ - **文本向量缓存**(embedding 缓存) - **翻译结果缓存**(Qwen-MT 等机器翻译) -- **商品内容理解缓存**(锚文本 / 语义属性 / 标签) 底层连接配置统一来自 `config/env_config.py` 的 `REDIS_CONFIG`: @@ -21,8 +20,6 @@ |------------|----------|----------------|----------|------| | 向量缓存(text/image embedding) | 文本:`{EMBEDDING_CACHE_PREFIX}:embed:norm{0|1}:{text}`;图片:`{EMBEDDING_CACHE_PREFIX}:image:embed:norm{0|1}:{url_or_path}` | **BF16 bytes**(每维 2 字节大端存储),读取后恢复为 `np.float32` | TTL=`REDIS_CONFIG["cache_expire_days"]` 天;访问时滑动过期 | 见 `embeddings/text_encoder.py`、`embeddings/image_encoder.py`、`embeddings/server.py`;前缀由 `REDIS_CONFIG["embedding_cache_prefix"]` 控制 | | 翻译结果缓存(translator service) | `trans:{model}:{target_lang}:{source_text[:4]}{sha256(source_text)}` | 机翻后的单条字符串 | TTL=`services.translation.cache.ttl_seconds` 秒;可配置滑动过期 | 见 `translation/service.py` + `config/config.yaml` | -| 商品内容理解缓存(anchors / 语义属性 / tags) | `{ANCHOR_CACHE_PREFIX}:{tenant_or_global}:{target_lang}:{md5(title)}` | `json.dumps(dict)`,包含 id/title/category/tags/anchor_text 等 | TTL=`ANCHOR_CACHE_EXPIRE_DAYS` 天 | 见 `indexer/product_enrich.py` | - 下面按模块详细说明。 --- @@ -186,69 +183,9 @@ services: --- -## 4. 商品内容理解缓存(indexer/product_enrich.py) - -- **代码位置**:`indexer/product_enrich.py` -- **用途**:在生成商品锚文本(qanchors)、语义属性、标签等内容理解结果时复用缓存,避免对同一标题重复调用大模型。 - -### 4.1 Key 设计 - -- 配置项: - - `ANCHOR_CACHE_PREFIX = REDIS_CONFIG.get("anchor_cache_prefix", "product_anchors")` - - `ANCHOR_CACHE_EXPIRE_DAYS = int(REDIS_CONFIG.get("anchor_cache_expire_days", 30))` -- Key 构造函数:`_make_analysis_cache_key(product, target_lang, analysis_kind)` -- 模板: - -```text -{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{prompt_contract_hash}:{target_lang}:{prompt_input_prefix}{md5(prompt_input)} -``` - -- 字段说明: - - `ANCHOR_CACHE_PREFIX`:默认 `"product_anchors"`,可通过 `.env` 中的 `REDIS_ANCHOR_CACHE_PREFIX`(若存在)间接配置到 `REDIS_CONFIG`; - - `analysis_kind`:分析族,目前至少包括 `content` 与 `taxonomy`,两者缓存隔离; - - `prompt_contract_hash`:基于 system prompt、shared instruction、localized headers、result fields、user instruction template、schema cache version 等生成的短 hash;只要提示词或输出契约变化,缓存会自动失效; - - `target_lang`:内容理解输出语言,例如 `zh`; - - `prompt_input_prefix + md5(prompt_input)`:对真正送入 prompt 的商品文本做前缀 + MD5;当前 prompt 输入来自 `title`、`brief`、`description` 的规范化拼接结果。 - -设计原则: - -- 只让**实际影响 LLM 输出**的输入参与 key; -- 不让 `tenant_id`、`spu_id` 这类“结果归属信息”污染缓存; -- prompt 或 schema 变更时,不依赖人工清理 Redis,也能自然切换到新 key。 - -### 4.2 Value 与类型 - -- 类型:`json.dumps(dict, ensure_ascii=False)`。 -- 典型结构(简化): - -```json -{ - "id": "123", - "lang": "zh", - "title_input": "原始标题", - "title": "归一化后的商品标题", - "category_path": "...", - "tags": "...", - "target_audience": "...", - "usage_scene": "...", - "anchor_text": "..., ..." -} -``` - -- 读取时通过 `json.loads(raw)` 还原为 `Dict[str, Any]`。 -- `content` 与 `taxonomy` 的 value 结构会随各自 schema 不同而不同,但都会先通过统一的 normalize 逻辑再写缓存。 - -### 4.3 过期策略 - -- TTL:`ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600` 秒(默认 30 天); -- 写入:`redis.setex(key, ttl, json.dumps(result, ensure_ascii=False))`; -- 读取:仅做 `redis.get(key)`,**不做滑动过期**。 - -### 4.4 调用流程中的位置 +## 4. 商品内容理解缓存(已迁出) -- 单条调用(索引阶段常见)时,`analyze_products()` 会先尝试命中缓存: - - 若命中,直接返回缓存结果; - - 若 miss,调用 LLM,解析结果后再写入缓存。 +本仓库原先存在一套用于 `qanchors` / `enriched_*` 生成的 Redis 缓存实现,但对应内容理解服务已经迁移到独立项目,当前仓库代码中不再读写这类缓存,也不再把它作为运行时能力的一部分维护。 --- @@ -258,24 +195,24 @@ services: ### 5.1 redis_cache_health_check.py(缓存健康巡检) -**功能**:按**业务缓存类型**(embedding / translation / anchors)做健康巡检,不扫全库。 +**功能**:按**业务缓存类型**(embedding / translation)做健康巡检,不扫全库。 - 对每类缓存:SCAN 匹配对应 key 前缀,统计**匹配 key 数量**(受 `--max-scan` 上限约束); - **TTL 分布**:对采样 key 统计 `no-expire-or-expired` / `0-1h` / `1h-1d` / `1d-30d` / `>30d`; - **近期活跃 key**:从采样中选出 `OBJECT IDLETIME <= 600s` 的 key,用于判断是否有新写入; -- **样本 key 与 value 预览**:对 embedding 显示 ndarray 信息,对 translation 显示译文片段,对 anchors 显示 JSON 摘要。 +- **样本 key 与 value 预览**:对 embedding 显示 ndarray 信息,对 translation 显示译文片段。 -**适用场景**:日常查看三类缓存是否在增长、TTL 是否合理、是否有近期写入;与「缓存总览表」中的 key 设计一一对应。 +**适用场景**:日常查看两类缓存是否在增长、TTL 是否合理、是否有近期写入;与「缓存总览表」中的 key 设计一一对应。 **用法示例**: ```bash -# 默认:检查 embedding / translation / anchors 三类 +# 默认:检查 embedding / translation 两类 python scripts/redis/redis_cache_health_check.py -# 只检查某一类或两类 +# 只检查某一类 python scripts/redis/redis_cache_health_check.py --type embedding -python scripts/redis/redis_cache_health_check.py --type translation anchors +python scripts/redis/redis_cache_health_check.py --type translation # 按自定义 pattern 检查(不按业务类型) python scripts/redis/redis_cache_health_check.py --pattern "mycache:*" @@ -288,7 +225,7 @@ python scripts/redis/redis_cache_health_check.py --sample-size 100 --max-scan 50 | 参数 | 说明 | 默认 | |------|------|------| -| `--type` | 缓存类型:`embedding` / `translation` / `anchors`,可多选 | 三类都检查 | +| `--type` | 缓存类型:`embedding` / `translation`,可多选 | 两类都检查 | | `--pattern` | 自定义 key pattern(如 `mycache:*`),指定后忽略 `--type` | - | | `--db` | Redis 数据库编号 | 0 | | `--sample-size` | 每类采样的 key 数量 | 50 | @@ -319,7 +256,7 @@ python scripts/redis/redis_cache_prefix_stats.py --all-db python scripts/redis/redis_cache_prefix_stats.py --db 1 # 只统计指定前缀(可多个) -python scripts/redis/redis_cache_prefix_stats.py --prefix trans embedding product_anchors +python scripts/redis/redis_cache_prefix_stats.py --prefix trans embedding # 全 DB + 指定前缀 python scripts/redis/redis_cache_prefix_stats.py --all-db --prefix trans embedding @@ -369,7 +306,7 @@ python scripts/redis/redis_memory_heavy_keys.py --top 100 | 需求 | 推荐脚本 | |------|----------| -| 看三类业务缓存(embedding/translation/anchors)的数量、TTL、近期写入、样本 value | `redis_cache_health_check.py` | +| 看两类业务缓存(embedding/translation)的数量、TTL、近期写入、样本 value | `redis_cache_health_check.py` | | 看全库或某前缀的 key 条数与内存占比 | `redis_cache_prefix_stats.py` | | 找占用内存最多的大 key、分析内存差异 | `redis_memory_heavy_keys.py` | diff --git a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md index ca9c0c0..fdad080 100644 --- a/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md +++ b/indexer/ANCHORS_AND_SEMANTIC_ATTRIBUTES.md @@ -1,415 +1,11 @@ -## qanchors 与 enriched_attributes 设计与索引逻辑说明 +# qanchors 与 enriched_* 字段说明 -本文档详细说明: +本文档原先记录本仓库内的内容理解实现细节。自 2026-04 起,这部分生成能力已经迁移到独立项目,本仓库不再维护 `/indexer/enrich-content` 路由,也不再在 indexer 构建链路内自动补齐这些字段。 -- **锚文本字段 `qanchors.{lang}` 的作用与来源** -- **语义属性字段 `enriched_attributes` 的结构、用途与写入流程** -- **多语言支持策略(zh / en / de / ru / fr)** -- **索引阶段与 LLM 调用的集成方式** +当前状态: -本设计已默认开启,无需额外开关;在上游 LLM 不可用时会自动降级为“无锚点/语义属性”,不影响主索引流程。 - ---- - -### 1. 字段设计概览 - -#### 1.1 `qanchors.{lang}`:面向查询的锚文本 - -- **Mapping 位置**:`mappings/search_products.json` 中的 `qanchors` 对象。 -- **结构**(与 `title.{lang}` 一致): - -```140:182:/home/tw/saas-search/mappings/search_products.json -"qanchors": { - "type": "object", - "properties": { - "zh": { "type": "text", "analyzer": "index_ik", "search_analyzer": "query_ik" }, - "en": { "type": "text", "analyzer": "english" }, - "de": { "type": "text", "analyzer": "german" }, - "ru": { "type": "text", "analyzer": "russian" }, - "fr": { "type": "text", "analyzer": "french" }, - ... - } -} -``` - -- **语义**: - 用于承载“更接近用户自然搜索行为”的词/短语(query-style anchors),包括: - - 品类 + 细分类别表达; - - 使用场景(通勤、约会、度假、office outfit 等); - - 适用人群(年轻女性、plus size、teen boys 等); - - 材质 / 关键属性 / 功能特点等。 - -- **使用场景**: - - 主搜索:作为额外的全文字段参与 BM25 召回与打分(可在 `search/query_config.py` 中给一定权重); - - Suggestion:`suggestion/builder.py` 会从 `qanchors.{lang}` 中拆分词条作为候选(`source="qanchor"`,权重大于 `title`)。 - -#### 1.2 `enriched_attributes`:面向过滤/分面的通用语义属性 - -- **Mapping 位置**:`mappings/search_products.json`,追加的 nested 字段。 -- **结构**: - -```1392:1410:/home/tw/saas-search/mappings/search_products.json -"enriched_attributes": { - "type": "nested", - "properties": { - "lang": { "type": "keyword" }, // 语言:zh / en / de / ru / fr - "name": { "type": "keyword" }, // 维度名:usage_scene / target_audience / material / ... - "value": { "type": "keyword" } // 维度值:通勤 / office / Baumwolle ... - } -} -``` - -- **语义**: - - 将 LLM 输出的各维度信息统一规约到 `name/value/lang` 三元组; - - 维度名稳定、值内容可变,便于后续扩展新的语义维度而不需要修改 mapping。 - -- **当前支持的维度名**(在 `document_transformer.py` 中固定列表): - - `tags`:细分标签/风格标签; - - `target_audience`:适用人群; - - `usage_scene`:使用场景; - - `season`:适用季节; - - `key_attributes`:关键属性; - - `material`:材质说明; - - `features`:功能特点。 - -- **使用场景**: - - 按语义维度过滤: - - 例:只要“适用人群=年轻女性”的商品; - - 例:`usage_scene` 包含 “office” 或 “通勤”。 - - 按语义维度分面 / 展示筛选项: - - 例:展示当前结果中所有 `usage_scene` 的分布,供前端勾选; - - 例:展示所有 `material` 值 + 命中文档数。 - ---- - -### 2. LLM 分析服务:`indexer/product_annotator.py` - -#### 2.1 入口函数:`analyze_products` - -- **文件**:`indexer/product_annotator.py` -- **函数签名**: - -```365:392:/home/tw/saas-search/indexer/product_annotator.py -def analyze_products( - products: List[Dict[str, str]], - target_lang: str = "zh", - batch_size: Optional[int] = None, -) -> List[Dict[str, Any]]: - """ - 库调用入口:根据输入+语言,返回锚文本及各维度信息。 - - Args: - products: [{"id": "...", "title": "..."}] - target_lang: 输出语言,需在 SUPPORTED_LANGS 内 - batch_size: 批大小,默认使用全局 BATCH_SIZE - """ - ... -``` - -- **支持的输出语言**(在同文件中定义): - -```54:62:/home/tw/saas-search/indexer/product_annotator.py -LANG_LABELS: Dict[str, str] = { - "zh": "中文", - "en": "英文", - "de": "德文", - "ru": "俄文", - "fr": "法文", -} -SUPPORTED_LANGS = set(LANG_LABELS.keys()) -``` - -- **返回结构**(每个商品一条记录): - -```python -{ - "id": "", - "lang": "", - "title_input": "<原始输入标题>", - "title": "<目标语言的标题>", - "category_path": "", - "tags": "<逗号分隔的细分标签>", - "target_audience": "<逗号分隔的适用人群>", - "usage_scene": "<逗号分隔的使用场景>", - "season": "<逗号分隔的适用季节>", - "key_attributes": "<逗号分隔的关键属性>", - "material": "<逗号分隔的材质说明>", - "features": "<逗号分隔的功能特点>", - "anchor_text": "<逗号分隔的锚文本短语>", - # 若发生错误,还会附带: - # "error": "<异常信息>" -} -``` - -> 注意:表格中的多值字段(标签/场景/人群/材质等)约定为**使用逗号分隔**,后续索引端会统一按正则 `[,;|/\\n\\t]+` 再拆分为短语。 - -#### 2.2 Prompt 设计与语言控制 - -- Prompt 中会明确要求“**所有输出内容使用目标语言**”,并给出中英文示例: - -```65:81:/home/tw/saas-search/indexer/product_annotator.py -def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str: - """创建LLM提示词(根据目标语言输出)""" - lang_label = LANG_LABELS.get(target_lang, "对应语言") - prompt = f"""请对输入的每条商品标题,分析并提取以下信息,所有输出内容请使用{lang_label}: - -1. 商品标题:将输入商品名称翻译为{lang_label} -2. 品类路径:从大类到细分品类,用">"分隔(例如:服装>女装>裤子>工装裤) -3. 细分标签:商品的风格、特点、功能等(例如:碎花,收腰,法式) -4. 适用人群:性别/年龄段等(例如:年轻女性) -5. 使用场景 -6. 适用季节 -7. 关键属性 -8. 材质说明 -9. 功能特点 -10. 商品卖点:分析和提取一句话核心卖点,用于推荐理由 -11. 锚文本:生成一组能够代表该商品、并可能被用户用于搜索的词语或短语。这些词语应覆盖用户需求的各个维度,如品类、细分标签、功能特性、需求场景等等。 -""" -``` - -- 返回格式固定为 Markdown 表格,首行头为: - -```89:91:/home/tw/saas-search/indexer/product_annotator.py -| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 | -|----|----|----|----|----|----|----|----|----|----|----|----| -``` - -`parse_markdown_table` 会按表格列顺序解析成字段。 - ---- - -### 3. 索引阶段集成:`SPUDocumentTransformer._fill_llm_attributes` - -#### 3.1 调用时机 - -在 `SPUDocumentTransformer.transform_spu_to_doc(...)` 的末尾,在所有基础字段(多语言文本、类目、SKU/规格、价格、库存等)填充完成后,会调用: - -```96:101:/home/tw/saas-search/indexer/document_transformer.py - # 文本字段处理(翻译等) - self._fill_text_fields(doc, spu_row, primary_lang) - - # 标题向量化 - if self.enable_title_embedding and self.encoder: - self._fill_title_embedding(doc) - ... - # 时间字段 - ... - - # 基于 LLM 的锚文本与语义属性(默认开启,失败时仅记录日志) - self._fill_llm_attributes(doc, spu_row) -``` - -也就是说,**每个 SPU 文档默认会尝试补充 qanchors 与 enriched_attributes**。 - -#### 3.2 语言选择策略 - -在 `_fill_llm_attributes` 内部: - -```148:164:/home/tw/saas-search/indexer/document_transformer.py - try: - index_langs = self.tenant_config.get("index_languages") or ["en", "zh"] - except Exception: - index_langs = ["en", "zh"] - - # 只在支持的语言集合内调用 - llm_langs = [lang for lang in index_langs if lang in SUPPORTED_LANGS] - if not llm_langs: - return -``` - -- `tenant_config.index_languages` 决定该租户希望在索引中支持哪些语言; -- 实际调用 LLM 的语言集合 = `index_languages ∩ SUPPORTED_LANGS`; -- 当前 SUPPORTED_LANGS:`{"zh", "en", "de", "ru", "fr"}`。 - -这保证了: - -- 如果租户只索引 `zh`,就只跑中文; -- 如果租户同时索引 `en` + `de`,就为这两种语言各跑一次 LLM; -- 如果 `index_languages` 里包含暂不支持的语言(例如 `es`),会被自动忽略。 - -#### 3.3 调用 LLM 并写入字段 - -核心逻辑(简化描述): - -```164:210:/home/tw/saas-search/indexer/document_transformer.py - spu_id = str(spu_row.get("id") or "").strip() - title = str(spu_row.get("title") or "").strip() - if not spu_id or not title: - return - - semantic_list = doc.get("enriched_attributes") or [] - qanchors_obj = doc.get("qanchors") or {} - - dim_keys = [ - "tags", - "target_audience", - "usage_scene", - "season", - "key_attributes", - "material", - "features", - ] - - for lang in llm_langs: - try: - rows = analyze_products( - products=[{"id": spu_id, "title": title}], - target_lang=lang, - batch_size=1, - ) - except Exception as e: - logger.warning("LLM attribute fill failed for SPU %s, lang=%s: %s", spu_id, lang, e) - continue - - if not rows: - continue - row = rows[0] or {} - - # qanchors.{lang} - anchor_text = str(row.get("anchor_text") or "").strip() - if anchor_text: - qanchors_obj[lang] = anchor_text - - # 语义属性 - for name in dim_keys: - raw = row.get(name) - if not raw: - continue - parts = re.split(r"[,;|/\n\t]+", str(raw)) - for part in parts: - value = part.strip() - if not value: - continue - semantic_list.append( - { - "lang": lang, - "name": name, - "value": value, - } - ) - - if qanchors_obj: - doc["qanchors"] = qanchors_obj - if semantic_list: - doc["enriched_attributes"] = semantic_list -``` - -要点: - -- 每种语言**单独调用一次** `analyze_products`,传入同一 SPU 的原始标题; -- 将返回的 `anchor_text` 直接写入 `qanchors.{lang}`,其内部仍是逗号分隔短语,后续 suggestion builder 会再拆分; -- 对各维度字段(tags/usage_scene/...)用统一正则进行“松散拆词”,过滤空串后,以 `(lang,name,value)` 三元组追加到 nested 数组; -- 如果某个维度在该语言下为空,则跳过,不写入任何条目。 - -#### 3.4 容错 & 降级策略 - -- 如果: - - 没有 `title`; - - 或者 `tenant_config.index_languages` 与 `SUPPORTED_LANGS` 没有交集; - - 或 `DASHSCOPE_API_KEY` 未配置 / LLM 请求报错; -- 则 `_fill_llm_attributes` 会在日志中输出 `warning`,**不会抛异常**,索引流程继续,只是该 SPU 在这一轮不会得到 `qanchors` / `enriched_attributes`。 - -这保证了整个索引服务在 LLM 不可用时表现为一个普通的“传统索引”,而不会中断。 - ---- - -### 4. 查询与 Suggestion 中的使用建议 - -#### 4.1 主搜索(Search API) - -在 `search/query_config.py` 或构建 ES 查询时,可以: - -- 将 `qanchors.{lang}` 作为额外的 `should` 字段参与匹配,并给一个略高的权重,例如: - -```json -{ - "multi_match": { - "query": "", - "fields": [ - "title.zh^3.0", - "brief.zh^1.5", - "description.zh^1.0", - "vendor.zh^1.5", - "category_path.zh^1.5", - "category_name_text.zh^1.5", - "tags^1.0", - "qanchors.zh^2.0" // 建议新增 - ] - } -} -``` - -- 当用户做维度过滤时(例如“只看通勤场景 + 夏季 + 棉质”),可以在 filter 中增加 nested 查询: - -```json -{ - "nested": { - "path": "enriched_attributes", - "query": { - "bool": { - "must": [ - { "term": { "enriched_attributes.lang": "zh" } }, - { "term": { "enriched_attributes.name": "usage_scene" } }, - { "term": { "enriched_attributes.value": "通勤" } } - ] - } - } - } -} -``` - -多个维度可以通过多个 nested 子句组合(AND/OR 逻辑与 `specifications` 的设计类似)。 - -#### 4.2 Suggestion(联想词) - -现有 `suggestion/builder.py` 已经支持从 `qanchors.{lang}` 中提取候选: - -```249:287:/home/tw/saas-search/suggestion/builder.py - # Step 1: product title/qanchors - hits = self._scan_products(tenant_id, batch_size=batch_size) - ... - title_obj = src.get("title") or {} - qanchor_obj = src.get("qanchors") or {} - ... - for lang in index_languages: - ... - q_raw = None - if isinstance(qanchor_obj, dict): - q_raw = qanchor_obj.get(lang) - for q_text in self._split_qanchors(q_raw): - text_norm = self._normalize_text(q_text) - if self._looks_noise(text_norm): - continue - key = (lang, text_norm) - c = key_to_candidate.get(key) - if c is None: - c = SuggestionCandidate(text=q_text, text_norm=text_norm, lang=lang) - key_to_candidate[key] = c - c.add_product("qanchor", spu_id=spu_id, score=product_score + 0.6) -``` - -- `_split_qanchors` 使用与索引端一致的分隔符集合,确保: - - 无论 LLM 用逗号、分号还是换行分隔,只要符合约定,都能被拆成单独候选词; -- `add_product("qanchor", ...)` 会: - - 将来源标记为 `qanchor`; - - 在排序打分时,`qanchor` 命中会比纯 `title` 更有权重。 - ---- - -### 5. 总结与扩展方向 - -1. **功能定位**: - - `qanchors.{lang}`:更好地贴近用户真实查询词,用于召回与 suggestion; - - `enriched_attributes`:以结构化形式承载 LLM 抽取的语义维度,用于 filter / facet。 -2. **多语言对齐**: - - 完全复用租户级 `index_languages` 配置; - - 对每种语言单独生成锚文本与语义属性,不互相混用。 -3. **默认开启 / 自动降级**: - - 索引流程始终可用; - - 当 LLM/配置异常时,只是“缺少增强特征”,不影响基础搜索能力。 -4. **未来扩展**: - - 可以在 `dim_keys` 中新增维度名(如 `style`, `benefit` 等),只要在 prompt 与解析逻辑中增加对应列即可; - - 可以为 `enriched_attributes` 增加额外字段(如 `confidence`、`source`),用于更精细的控制(当前 mapping 为简单版)。 - -如需在查询层面增加基于 `enriched_attributes` 的统一 DSL(类似 `specifications` 的过滤/分面规则),推荐在 `docs/搜索API对接指南-01-搜索接口.md` 或 `docs/搜索API对接指南-08-数据模型与字段速查.md` 中新增一节,并在 `search/es_query_builder.py` 里封装构造逻辑,避免前端直接拼 nested 查询。 +- `search_products` mapping 仍保留 `qanchors`、`enriched_attributes`、`enriched_tags`、`enriched_taxonomy_attributes` 字段,便于外部服务继续产出并写入。 +- `suggestion/builder.py` 等消费侧仍会读取 ES 中已有的 `qanchors`。 +- `/indexer/build-docs`、`/indexer/build-docs-from-db`、`/indexer/reindex`、`/indexer/index` 只负责基础文档构建,不再调用本地 LLM 富化。 +如需这些字段,请在独立内容理解服务中生成,并由上游索引程序自行合并到最终 ES 文档。 diff --git a/indexer/README.md b/indexer/README.md index 3a1ccce..c9e0e65 100644 --- a/indexer/README.md +++ b/indexer/README.md @@ -67,7 +67,7 @@ - ES 文档结构 `ProductIndexDocument` 的字段细节(title/brief/description/vendor/category_xxx/tags/specifications/skus/embedding 等)。 - 翻译、向量等具体算法逻辑。 -- qanchors/keywords 等新特征的计算。 +- `qanchors` 等外部内容理解字段的生成。 **新职责边界**: Java 只负责“**选出要索引的 SPU + 从 MySQL 拉取原始数据 + 调用 Python 服务**(或交给 Python 做完整索引)”。 @@ -81,7 +81,7 @@ Java 只负责“**选出要索引的 SPU + 从 MySQL 拉取原始数据 + 调 - 输入:**MySQL 基础数据**(`shoplazza_product_spu/sku/option/category/image` 等)。 - 输出:**符合 `mappings/search_products.json` 的 doc 列表**,包括: - 多语言文本字段:`title.*`, `brief.*`, `description.*`, `vendor.*`, `category_path.*`, `category_name_text.*`; - - 算法特征:`title_embedding`, `image_embedding`, `qanchors.*`, `keywords.*`(未来扩展); + - 算法特征:`title_embedding`, `image_embedding`; - 结构化字段:`tags`, `specifications`, `skus`, `min_price`, `max_price`, `compare_at_price`, `total_inventory`, `sales` 等。 - 附加: - 翻译调用 & **Redis 缓存**(继承 Java 的 key 组织和 TTL 策略); @@ -370,14 +370,7 @@ if spu.tags: ### 7.2 qanchors / keywords 扩展 -- 当前 Java 中 `qanchors` 字段结构已存在,但未赋值; -- 设计建议: - - 在 Python 侧基于: - - 标题 / brief / description / tags / 类目等,做**查询锚点**抽取; - - 按与 `title/keywords` 类似的多语言结构写入 `qanchors.{lang}`; - - 翻译策略可选: - - 在生成锚点后再调用翻译; - - 或使用原始文本的翻译结果组合。 +该能力已迁移到独立内容理解服务。本仓库仍保留字段模型与消费侧能力,但不再负责在 indexer 内部生成 `qanchors` / `enriched_*`。 --- @@ -436,8 +429,6 @@ if spu.tags: "spu_id": "1", "tenant_id": "123", "title": { "en": "...", "zh": "...", ... }, - "qanchors": { ... }, - "keywords": { ... }, "brief": { ... }, "description": { ... }, "vendor": { ... }, @@ -496,7 +487,7 @@ if spu.tags: - **保留现有 Java 调度 & 数据同步能力**,不破坏已有全量/增量任务和 MQ 削峰; - **把 ES 文档结构、多语言逻辑、翻译与向量等算法能力全部收拢到 Python 索引富化模块**,实现“单一 owner”; - **完全继承 Java 现有的翻译缓存策略**(Redis key & TTL & 维度),保证行为与性能的一致性; -- **为未来字段扩展(qanchors、更多 tags/特征)预留清晰路径**:仅需在 Python 侧新增逻辑和 mapping,不再拉 Java 入伙。 +- **为未来字段扩展(包括外部内容理解字段接入)预留清晰路径**:字段模型可继续保留,但生成职责可独立演进。 --- @@ -514,6 +505,7 @@ if spu.tags: - **构建文档(正式使用)**:`POST /indexer/build-docs` - 入参:`tenant_id + items[ { spu, skus, options } ]` - 输出:`docs` 数组,每个元素是完整 ES doc,不查库、不写 ES。 + - 注意:当前不再内置生成 `qanchors` / `enriched_*`;如需这些字段,请由独立内容理解服务生成后自行合并。 - **构建文档(测试用,内部查库)**:`POST /indexer/build-docs-from-db` - 入参:`{"tenant_id": "...", "spu_ids": ["..."]}` diff --git a/indexer/document_transformer.py b/indexer/document_transformer.py index 72dcdf3..9874da6 100644 --- a/indexer/document_transformer.py +++ b/indexer/document_transformer.py @@ -12,7 +12,6 @@ import pandas as pd import numpy as np import logging from typing import Dict, Any, Optional, List -from indexer.product_enrich import build_index_content_fields logger = logging.getLogger(__name__) @@ -113,7 +112,6 @@ class SPUDocumentTransformer: spu_row: pd.Series, skus: pd.DataFrame, options: pd.DataFrame, - fill_llm_attributes: bool = True, ) -> Optional[Dict[str, Any]]: """ 将单个SPU行和其SKUs转换为ES文档。 @@ -228,85 +226,8 @@ class SPUDocumentTransformer: else: doc['update_time'] = str(update_time) - # 基于 LLM 的锚文本与语义属性(默认开启,失败时仅记录日志) - # 注意:批处理场景(build-docs / bulk / incremental)应优先在外层攒批, - # 再调用 fill_llm_attributes_batch(),避免逐条调用 LLM。 - if fill_llm_attributes: - self._fill_llm_attributes(doc, spu_row) - return doc - def fill_llm_attributes_batch(self, docs: List[Dict[str, Any]], spu_rows: List[pd.Series]) -> None: - """ - 批量调用 LLM,为一批 doc 填充: - - qanchors.{lang} - - enriched_tags.{lang} - - enriched_attributes[].value.{lang} - - enriched_taxonomy_attributes[].value.{lang} - - 设计目标: - - 尽可能攒批调用 LLM; - - 单次 LLM 调用最多 20 条(由 analyze_products 内部强制 cap 并自动拆批)。 - """ - if not docs or not spu_rows or len(docs) != len(spu_rows): - return - - id_to_idx: Dict[str, int] = {} - items: List[Dict[str, str]] = [] - for i, row in enumerate(spu_rows): - raw_id = row.get("id") - spu_id = "" if raw_id is None else str(raw_id).strip() - title = str(row.get("title") or "").strip() - if not spu_id or not title: - continue - id_to_idx[spu_id] = i - items.append( - { - "id": spu_id, - "title": title, - "brief": str(row.get("brief") or "").strip(), - "description": str(row.get("description") or "").strip(), - "image_url": str(row.get("image_src") or "").strip(), - } - ) - if not items: - return - - tenant_id = str(docs[0].get("tenant_id") or "").strip() or None - try: - # TODO: 从数据库读取该 tenant 的真实行业,并据此替换当前默认的 apparel profile。 - results = build_index_content_fields( - items=items, - tenant_id=tenant_id, - category_taxonomy_profile="apparel", - ) - except Exception as e: - logger.warning("LLM batch attribute fill failed: %s", e) - return - - for result in results: - spu_id = str(result.get("id") or "").strip() - if not spu_id: - continue - idx = id_to_idx.get(spu_id) - if idx is None: - continue - self._apply_content_enrichment(docs[idx], result) - - def _apply_content_enrichment(self, doc: Dict[str, Any], enrichment: Dict[str, Any]) -> None: - """将 product_enrich 产出的 ES-ready 内容字段写入 doc。""" - try: - if enrichment.get("qanchors"): - doc["qanchors"] = enrichment["qanchors"] - if enrichment.get("enriched_tags"): - doc["enriched_tags"] = enrichment["enriched_tags"] - if enrichment.get("enriched_attributes"): - doc["enriched_attributes"] = enrichment["enriched_attributes"] - if enrichment.get("enriched_taxonomy_attributes"): - doc["enriched_taxonomy_attributes"] = enrichment["enriched_taxonomy_attributes"] - except Exception as e: - logger.warning("Failed to apply enrichment to doc (spu_id=%s): %s", doc.get("spu_id"), e) - def _fill_text_fields( self, doc: Dict[str, Any], @@ -660,41 +581,6 @@ class SPUDocumentTransformer: else: doc['option3_values'] = [] - def _fill_llm_attributes(self, doc: Dict[str, Any], spu_row: pd.Series) -> None: - """ - 调用 indexer.product_enrich 的高层内容理解入口,为当前 SPU 填充: - - qanchors.{lang} - - enriched_tags.{lang} - - enriched_attributes[].value.{lang} - """ - spu_id = str(spu_row.get("id") or "").strip() - title = str(spu_row.get("title") or "").strip() - if not spu_id or not title: - return - - tenant_id = doc.get("tenant_id") - try: - # TODO: 从数据库读取该 tenant 的真实行业,并据此替换当前默认的 apparel profile。 - results = build_index_content_fields( - items=[ - { - "id": spu_id, - "title": title, - "brief": str(spu_row.get("brief") or "").strip(), - "description": str(spu_row.get("description") or "").strip(), - "image_url": str(spu_row.get("image_src") or "").strip(), - } - ], - tenant_id=str(tenant_id), - category_taxonomy_profile="apparel", - ) - except Exception as e: - logger.warning("LLM attribute fill failed for SPU %s: %s", spu_id, e) - return - - if results: - self._apply_content_enrichment(doc, results[0]) - def _transform_sku_row(self, sku_row: pd.Series, option_name_map: Dict[int, str] = None) -> Optional[Dict[str, Any]]: """ 将SKU行转换为SKU对象。 diff --git a/indexer/incremental_service.py b/indexer/incremental_service.py index cfa38ef..649e194 100644 --- a/indexer/incremental_service.py +++ b/indexer/incremental_service.py @@ -584,7 +584,6 @@ class IncrementalIndexerService: transformer, encoder, enable_embedding = self._get_transformer_bundle(tenant_id) # 按输入顺序处理 active SPUs - doc_spu_rows: List[pd.Series] = [] for spu_id in spu_ids: try: spu_id_int = int(spu_id) @@ -603,7 +602,6 @@ class IncrementalIndexerService: spu_row=spu_row, skus=skus_for_spu, options=opts_for_spu, - fill_llm_attributes=False, ) if doc is None: error_msg = "SPU transform returned None" @@ -612,14 +610,6 @@ class IncrementalIndexerService: continue documents.append((spu_id, doc)) - doc_spu_rows.append(spu_row) - - # 批量填充 LLM 字段(尽量攒批,每次最多 20 条;失败仅 warning,不影响主流程) - try: - if documents and doc_spu_rows: - transformer.fill_llm_attributes_batch([d for _, d in documents], doc_spu_rows) - except Exception as e: - logger.warning("[IncrementalIndexing] Batch LLM fill failed: %s", e) # 批量生成 embedding(保持翻译逻辑不变;embedding 走缓存) if enable_embedding and encoder and documents: diff --git a/indexer/product_enrich.py b/indexer/product_enrich.py deleted file mode 100644 index 7dca838..0000000 --- a/indexer/product_enrich.py +++ /dev/null @@ -1,1421 +0,0 @@ -#!/usr/bin/env python3 -""" -商品内容理解与属性补充模块(product_enrich) - -提供基于 LLM 的商品锚文本 / 语义属性 / 标签等分析能力, -供 indexer 与 API 在内存中调用(不再负责 CSV 读写)。 -""" - -import os -import json -import logging -import re -import time -import hashlib -import uuid -import threading -from dataclasses import dataclass, field -from collections import OrderedDict -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -from typing import List, Dict, Tuple, Any, Optional, FrozenSet - -import redis -import requests -from pathlib import Path - -from config.loader import get_app_config -from config.tenant_config_loader import SOURCE_LANG_CODE_MAP -from indexer.product_enrich_prompts import ( - SYSTEM_MESSAGE, - USER_INSTRUCTION_TEMPLATE, - LANGUAGE_MARKDOWN_TABLE_HEADERS, - SHARED_ANALYSIS_INSTRUCTION, - CATEGORY_TAXONOMY_PROFILES, -) - -# 配置 -BATCH_SIZE = 20 -# enrich-content LLM 批次并发 worker 上限(线程池;仅对 uncached batch 并发) -_APP_CONFIG = get_app_config() -CONTENT_UNDERSTANDING_MAX_WORKERS = int(_APP_CONFIG.product_enrich.max_workers) -# 华北2(北京):https://dashscope.aliyuncs.com/compatible-mode/v1 -# 新加坡:https://dashscope-intl.aliyuncs.com/compatible-mode/v1 -# 美国(弗吉尼亚):https://dashscope-us.aliyuncs.com/compatible-mode/v1 -API_BASE_URL = "https://dashscope-us.aliyuncs.com/compatible-mode/v1" -MODEL_NAME = "qwen-flash" -API_KEY = os.environ.get("DASHSCOPE_API_KEY") -MAX_RETRIES = 3 -RETRY_DELAY = 5 # 秒 -REQUEST_TIMEOUT = 180 # 秒 -LOGGED_SHARED_CONTEXT_CACHE_SIZE = 256 -PROMPT_INPUT_MIN_ZH_CHARS = 20 -PROMPT_INPUT_MAX_ZH_CHARS = 100 -PROMPT_INPUT_MIN_WORDS = 16 -PROMPT_INPUT_MAX_WORDS = 80 - -# 日志路径 -OUTPUT_DIR = Path("output_logs") -LOG_DIR = OUTPUT_DIR / "logs" - -# 设置独立日志(不影响全局 indexer.log) -LOG_DIR.mkdir(parents=True, exist_ok=True) -timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -log_file = LOG_DIR / f"product_enrich_{timestamp}.log" -verbose_log_file = LOG_DIR / "product_enrich_verbose.log" -_logged_shared_context_keys: "OrderedDict[str, None]" = OrderedDict() -_logged_shared_context_lock = threading.Lock() - -_content_understanding_executor: Optional[ThreadPoolExecutor] = None -_content_understanding_executor_lock = threading.Lock() - - -def _get_content_understanding_executor() -> ThreadPoolExecutor: - """ - 使用模块级单例线程池,避免同一进程内多次请求叠加创建线程池导致并发失控。 - """ - global _content_understanding_executor - with _content_understanding_executor_lock: - if _content_understanding_executor is None: - _content_understanding_executor = ThreadPoolExecutor( - max_workers=CONTENT_UNDERSTANDING_MAX_WORKERS, - thread_name_prefix="product-enrich-llm", - ) - return _content_understanding_executor - -# 主日志 logger:执行流程、批次信息等 -logger = logging.getLogger("product_enrich") -logger.setLevel(logging.INFO) - -if not logger.handlers: - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - - file_handler = logging.FileHandler(log_file, encoding="utf-8") - file_handler.setFormatter(formatter) - - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(formatter) - - logger.addHandler(file_handler) - logger.addHandler(stream_handler) - - # 避免日志向根 logger 传播,防止写入 logs/indexer.log 等其他文件 - logger.propagate = False - -# 详尽日志 logger:专门记录 LLM 请求与响应 -verbose_logger = logging.getLogger("product_enrich_verbose") -verbose_logger.setLevel(logging.INFO) - -if not verbose_logger.handlers: - verbose_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - verbose_file_handler = logging.FileHandler(verbose_log_file, encoding="utf-8") - verbose_file_handler.setFormatter(verbose_formatter) - verbose_logger.addHandler(verbose_file_handler) - verbose_logger.propagate = False - -logger.info("Verbose LLM logs are written to: %s", verbose_log_file) - - -# Redis 缓存(用于 anchors / 语义属性) -_REDIS_CONFIG = _APP_CONFIG.infrastructure.redis -ANCHOR_CACHE_PREFIX = _REDIS_CONFIG.anchor_cache_prefix -ANCHOR_CACHE_EXPIRE_DAYS = int(_REDIS_CONFIG.anchor_cache_expire_days) -_anchor_redis: Optional[redis.Redis] = None - -try: - _anchor_redis = redis.Redis( - host=_REDIS_CONFIG.host, - port=_REDIS_CONFIG.port, - password=_REDIS_CONFIG.password, - decode_responses=True, - socket_timeout=_REDIS_CONFIG.socket_timeout, - socket_connect_timeout=_REDIS_CONFIG.socket_connect_timeout, - retry_on_timeout=_REDIS_CONFIG.retry_on_timeout, - health_check_interval=10, - ) - _anchor_redis.ping() - logger.info("Redis cache initialized for product anchors and semantic attributes") -except Exception as e: - logger.warning(f"Failed to initialize Redis for anchors cache: {e}") - _anchor_redis = None - -_missing_prompt_langs = sorted(set(SOURCE_LANG_CODE_MAP) - set(LANGUAGE_MARKDOWN_TABLE_HEADERS)) -if _missing_prompt_langs: - raise RuntimeError( - f"Missing product_enrich prompt config for languages: {_missing_prompt_langs}" - ) - - -# 多值字段分隔 -_MULTI_VALUE_FIELD_SPLIT_RE = re.compile(r"[,、,;|/\n\t]+") -# 表格单元格中视为「无内容」的占位 -_MARKDOWN_EMPTY_CELL_LITERALS: Tuple[str, ...] = ("-","–", "—", "none", "null", "n/a", "无") -_MARKDOWN_EMPTY_CELL_TOKENS_CF: FrozenSet[str] = frozenset( - lit.casefold() for lit in _MARKDOWN_EMPTY_CELL_LITERALS -) - -def _normalize_markdown_table_cell(raw: Optional[str]) -> str: - """strip;将占位符统一视为空字符串。""" - s = str(raw or "").strip() - if not s: - return "" - if s.casefold() in _MARKDOWN_EMPTY_CELL_TOKENS_CF: - return "" - return s -_CORE_INDEX_LANGUAGES = ("zh", "en") -_DEFAULT_ENRICHMENT_SCOPES = ("generic", "category_taxonomy") -_DEFAULT_CATEGORY_TAXONOMY_PROFILE = "apparel" -_CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP = ( - ("tags", "enriched_tags"), - ("target_audience", "target_audience"), - ("usage_scene", "usage_scene"), - ("season", "season"), - ("key_attributes", "key_attributes"), - ("material", "material"), - ("features", "features"), -) -_CONTENT_ANALYSIS_RESULT_FIELDS = ( - "title", - "category_path", - "tags", - "target_audience", - "usage_scene", - "season", - "key_attributes", - "material", - "features", - "anchor_text", -) -_CONTENT_ANALYSIS_MEANINGFUL_FIELDS = ( - "tags", - "target_audience", - "usage_scene", - "season", - "key_attributes", - "material", - "features", - "anchor_text", -) -_CONTENT_ANALYSIS_FIELD_ALIASES = { - "tags": ("tags", "enriched_tags"), -} -_CONTENT_ANALYSIS_QUALITY_FIELDS = ("title", "category_path", "anchor_text") - - -@dataclass(frozen=True) -class AnalysisSchema: - name: str - shared_instruction: str - markdown_table_headers: Dict[str, List[str]] - result_fields: Tuple[str, ...] - meaningful_fields: Tuple[str, ...] - cache_version: str = "v1" - field_aliases: Dict[str, Tuple[str, ...]] = field(default_factory=dict) - quality_fields: Tuple[str, ...] = () - - def get_headers(self, target_lang: str) -> Optional[List[str]]: - return self.markdown_table_headers.get(target_lang) - - -_ANALYSIS_SCHEMAS: Dict[str, AnalysisSchema] = { - "content": AnalysisSchema( - name="content", - shared_instruction=SHARED_ANALYSIS_INSTRUCTION, - markdown_table_headers=LANGUAGE_MARKDOWN_TABLE_HEADERS, - result_fields=_CONTENT_ANALYSIS_RESULT_FIELDS, - meaningful_fields=_CONTENT_ANALYSIS_MEANINGFUL_FIELDS, - cache_version="v2", - field_aliases=_CONTENT_ANALYSIS_FIELD_ALIASES, - quality_fields=_CONTENT_ANALYSIS_QUALITY_FIELDS, - ), -} - -def _build_taxonomy_profile_schema(profile: str, config: Dict[str, Any]) -> AnalysisSchema: - return AnalysisSchema( - name=f"taxonomy:{profile}", - shared_instruction=config["shared_instruction"], - markdown_table_headers=config["markdown_table_headers"], - result_fields=tuple(field["key"] for field in config["fields"]), - meaningful_fields=tuple(field["key"] for field in config["fields"]), - cache_version="v1", - ) - - -_CATEGORY_TAXONOMY_PROFILE_SCHEMAS: Dict[str, AnalysisSchema] = { - profile: _build_taxonomy_profile_schema(profile, config) - for profile, config in CATEGORY_TAXONOMY_PROFILES.items() -} - -_CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS: Dict[str, Tuple[Tuple[str, str], ...]] = { - profile: tuple((field["key"], field["label"]) for field in config["fields"]) - for profile, config in CATEGORY_TAXONOMY_PROFILES.items() -} - - -def get_supported_category_taxonomy_profiles() -> Tuple[str, ...]: - return tuple(_CATEGORY_TAXONOMY_PROFILE_SCHEMAS.keys()) - - -def _normalize_category_taxonomy_profile(category_taxonomy_profile: Optional[str] = None) -> str: - profile = str(category_taxonomy_profile or _DEFAULT_CATEGORY_TAXONOMY_PROFILE).strip() - if profile not in _CATEGORY_TAXONOMY_PROFILE_SCHEMAS: - supported = ", ".join(get_supported_category_taxonomy_profiles()) - raise ValueError( - f"Unsupported category_taxonomy_profile: {profile}. Supported profiles: {supported}" - ) - return profile - - -def _get_analysis_schema( - analysis_kind: str, - *, - category_taxonomy_profile: Optional[str] = None, -) -> AnalysisSchema: - if analysis_kind == "content": - return _ANALYSIS_SCHEMAS["content"] - if analysis_kind == "taxonomy": - profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) - return _CATEGORY_TAXONOMY_PROFILE_SCHEMAS[profile] - raise ValueError(f"Unsupported analysis_kind: {analysis_kind}") - - -def _get_taxonomy_attribute_field_map( - category_taxonomy_profile: Optional[str] = None, -) -> Tuple[Tuple[str, str], ...]: - profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) - return _CATEGORY_TAXONOMY_PROFILE_ATTRIBUTE_FIELD_MAPS[profile] - - -def _normalize_enrichment_scopes( - enrichment_scopes: Optional[List[str]] = None, -) -> Tuple[str, ...]: - requested = _DEFAULT_ENRICHMENT_SCOPES if not enrichment_scopes else tuple(enrichment_scopes) - normalized: List[str] = [] - seen = set() - for enrichment_scope in requested: - scope = str(enrichment_scope).strip() - if scope not in {"generic", "category_taxonomy"}: - raise ValueError(f"Unsupported enrichment_scope: {scope}") - if scope in seen: - continue - seen.add(scope) - normalized.append(scope) - return tuple(normalized) - - -def split_multi_value_field(text: Optional[str]) -> List[str]: - """将 LLM/业务中的多值字符串拆成短语列表(strip 后去空)。""" - if text is None: - return [] - s = str(text).strip() - if not s: - return [] - return [p.strip() for p in _MULTI_VALUE_FIELD_SPLIT_RE.split(s) if p.strip()] - - -def _append_lang_phrase_map(target: Dict[str, List[str]], lang: str, raw_value: Any) -> None: - parts = split_multi_value_field(raw_value) - if not parts: - return - existing = target.get(lang) or [] - merged = list(dict.fromkeys([str(x).strip() for x in existing if str(x).strip()] + parts)) - if merged: - target[lang] = merged - - -def _get_or_create_named_value_entry( - target: List[Dict[str, Any]], - name: str, - *, - default_value: Optional[Dict[str, Any]] = None, -) -> Dict[str, Any]: - for item in target: - if item.get("name") == name: - value = item.get("value") - if isinstance(value, dict): - return item - break - - entry = {"name": name, "value": default_value or {}} - target.append(entry) - return entry - - -def _append_named_lang_phrase_map( - target: List[Dict[str, Any]], - name: str, - lang: str, - raw_value: Any, -) -> None: - entry = _get_or_create_named_value_entry(target, name=name, default_value={}) - _append_lang_phrase_map(entry["value"], lang=lang, raw_value=raw_value) - - -def _get_product_id(product: Dict[str, Any]) -> str: - return str(product.get("id") or product.get("spu_id") or "").strip() - - -def _get_analysis_field_aliases(field_name: str, schema: AnalysisSchema) -> Tuple[str, ...]: - return schema.field_aliases.get(field_name, (field_name,)) - - -def _get_analysis_field_value(row: Dict[str, Any], field_name: str, schema: AnalysisSchema) -> Any: - for alias in _get_analysis_field_aliases(field_name, schema): - if alias in row: - return row.get(alias) - return None - - -def _has_meaningful_value(value: Any) -> bool: - if value is None: - return False - if isinstance(value, str): - return bool(value.strip()) - if isinstance(value, dict): - return any(_has_meaningful_value(v) for v in value.values()) - if isinstance(value, list): - return any(_has_meaningful_value(v) for v in value) - return bool(value) - - -def _make_empty_analysis_result( - product: Dict[str, Any], - target_lang: str, - schema: AnalysisSchema, - error: Optional[str] = None, -) -> Dict[str, Any]: - result = { - "id": _get_product_id(product), - "lang": target_lang, - "title_input": str(product.get("title") or "").strip(), - } - for field in schema.result_fields: - result[field] = "" - if error: - result["error"] = error - return result - - -def _normalize_analysis_result( - result: Dict[str, Any], - product: Dict[str, Any], - target_lang: str, - schema: AnalysisSchema, -) -> Dict[str, Any]: - normalized = _make_empty_analysis_result(product, target_lang, schema) - if not isinstance(result, dict): - return normalized - - normalized["lang"] = str(result.get("lang") or target_lang).strip() or target_lang - normalized["title_input"] = str( - product.get("title") or result.get("title_input") or "" - ).strip() - - for field in schema.result_fields: - normalized[field] = str(_get_analysis_field_value(result, field, schema) or "").strip() - - if result.get("error"): - normalized["error"] = str(result.get("error")) - return normalized - - -def _has_meaningful_analysis_content(result: Dict[str, Any], schema: AnalysisSchema) -> bool: - return any(_has_meaningful_value(result.get(field)) for field in schema.meaningful_fields) - - -def _append_analysis_attributes( - target: List[Dict[str, Any]], - row: Dict[str, Any], - lang: str, - schema: AnalysisSchema, - field_map: Tuple[Tuple[str, str], ...], -) -> None: - for source_name, output_name in field_map: - raw = _get_analysis_field_value(row, source_name, schema) - if not raw: - continue - _append_named_lang_phrase_map( - target, - name=output_name, - lang=lang, - raw_value=raw, - ) - - -def _apply_index_content_row(result: Dict[str, Any], row: Dict[str, Any], lang: str) -> None: - if not row or row.get("error"): - return - - content_schema = _get_analysis_schema("content") - anchor_text = str(_get_analysis_field_value(row, "anchor_text", content_schema) or "").strip() - if anchor_text: - _append_lang_phrase_map(result["qanchors"], lang=lang, raw_value=anchor_text) - - for source_name, output_name in _CONTENT_ANALYSIS_ATTRIBUTE_FIELD_MAP: - raw = _get_analysis_field_value(row, source_name, content_schema) - if not raw: - continue - _append_named_lang_phrase_map( - result["enriched_attributes"], - name=output_name, - lang=lang, - raw_value=raw, - ) - if output_name == "enriched_tags": - _append_lang_phrase_map(result["enriched_tags"], lang=lang, raw_value=raw) - - -def _apply_index_taxonomy_row( - result: Dict[str, Any], - row: Dict[str, Any], - lang: str, - *, - category_taxonomy_profile: Optional[str] = None, -) -> None: - if not row or row.get("error"): - return - - _append_analysis_attributes( - result["enriched_taxonomy_attributes"], - row=row, - lang=lang, - schema=_get_analysis_schema( - "taxonomy", - category_taxonomy_profile=category_taxonomy_profile, - ), - field_map=_get_taxonomy_attribute_field_map(category_taxonomy_profile), - ) - - -def _normalize_index_content_item(item: Dict[str, Any]) -> Dict[str, str]: - item_id = _get_product_id(item) - return { - "id": item_id, - "title": str(item.get("title") or "").strip(), - "brief": str(item.get("brief") or "").strip(), - "description": str(item.get("description") or "").strip(), - "image_url": str(item.get("image_url") or "").strip(), - } - - -def build_index_content_fields( - items: List[Dict[str, Any]], - tenant_id: Optional[str] = None, - enrichment_scopes: Optional[List[str]] = None, - category_taxonomy_profile: Optional[str] = None, -) -> List[Dict[str, Any]]: - """ - 高层入口:生成与 ES mapping 对齐的内容理解字段。 - - 输入项需包含: - - `id` 或 `spu_id` - - `title` - - 可选 `brief` / `description` / `image_url` - - 可选 `enrichment_scopes`,默认同时执行 `generic` 与 `category_taxonomy` - - 可选 `category_taxonomy_profile`,默认 `apparel` - - 返回项结构: - - `id` - - `qanchors` - - `enriched_tags` - - `enriched_attributes` - - `enriched_taxonomy_attributes` - - 可选 `error` - - 其中: - - `qanchors.{lang}` 为短语数组 - - `enriched_tags.{lang}` 为标签数组 - """ - requested_enrichment_scopes = _normalize_enrichment_scopes(enrichment_scopes) - normalized_taxonomy_profile = _normalize_category_taxonomy_profile(category_taxonomy_profile) - normalized_items = [_normalize_index_content_item(item) for item in items] - if not normalized_items: - return [] - - results_by_id: Dict[str, Dict[str, Any]] = { - item["id"]: { - "id": item["id"], - "qanchors": {}, - "enriched_tags": {}, - "enriched_attributes": [], - "enriched_taxonomy_attributes": [], - } - for item in normalized_items - } - - for lang in _CORE_INDEX_LANGUAGES: - if "generic" in requested_enrichment_scopes: - try: - rows = analyze_products( - products=normalized_items, - target_lang=lang, - batch_size=BATCH_SIZE, - tenant_id=tenant_id, - analysis_kind="content", - category_taxonomy_profile=normalized_taxonomy_profile, - ) - except Exception as e: - logger.warning("build_index_content_fields content enrichment failed for lang=%s: %s", lang, e) - for item in normalized_items: - results_by_id[item["id"]].setdefault("error", str(e)) - continue - - for row in rows or []: - item_id = str(row.get("id") or "").strip() - if not item_id or item_id not in results_by_id: - continue - if row.get("error"): - results_by_id[item_id].setdefault("error", row["error"]) - continue - _apply_index_content_row(results_by_id[item_id], row=row, lang=lang) - - if "category_taxonomy" in requested_enrichment_scopes: - for lang in _CORE_INDEX_LANGUAGES: - try: - taxonomy_rows = analyze_products( - products=normalized_items, - target_lang=lang, - batch_size=BATCH_SIZE, - tenant_id=tenant_id, - analysis_kind="taxonomy", - category_taxonomy_profile=normalized_taxonomy_profile, - ) - except Exception as e: - logger.warning( - "build_index_content_fields taxonomy enrichment failed for profile=%s lang=%s: %s", - normalized_taxonomy_profile, - lang, - e, - ) - for item in normalized_items: - results_by_id[item["id"]].setdefault("error", str(e)) - continue - - for row in taxonomy_rows or []: - item_id = str(row.get("id") or "").strip() - if not item_id or item_id not in results_by_id: - continue - if row.get("error"): - results_by_id[item_id].setdefault("error", row["error"]) - continue - _apply_index_taxonomy_row( - results_by_id[item_id], - row=row, - lang=lang, - category_taxonomy_profile=normalized_taxonomy_profile, - ) - - return [results_by_id[item["id"]] for item in normalized_items] - - -def _normalize_space(text: str) -> str: - return re.sub(r"\s+", " ", (text or "").strip()) - - -def _contains_cjk(text: str) -> bool: - return bool(re.search(r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]", text or "")) - - -def _truncate_by_chars(text: str, max_chars: int) -> str: - return text[:max_chars].strip() - - -def _truncate_by_words(text: str, max_words: int) -> str: - words = re.findall(r"\S+", text or "") - return " ".join(words[:max_words]).strip() - - -def _detect_prompt_input_lang(text: str) -> str: - # 简化处理:包含 CJK 时按中文类文本处理,否则统一按空格分词类语言处理。 - return "zh" if _contains_cjk(text) else "en" - - -def _build_prompt_input_text(product: Dict[str, Any]) -> str: - """ - 生成真正送入 prompt 的商品文本。 - - 规则: - - 默认使用 title - - 若文本过短,则依次补 brief / description - - 若文本过长,则按语言粗粒度截断 - """ - fields = [ - _normalize_space(str(product.get("title") or "")), - _normalize_space(str(product.get("brief") or "")), - _normalize_space(str(product.get("description") or "")), - ] - parts: List[str] = [] - - def join_parts() -> str: - return " | ".join(part for part in parts if part).strip() - - for field in fields: - if not field: - continue - if field not in parts: - parts.append(field) - candidate = join_parts() - if _detect_prompt_input_lang(candidate) == "zh": - if len(candidate) >= PROMPT_INPUT_MIN_ZH_CHARS: - return _truncate_by_chars(candidate, PROMPT_INPUT_MAX_ZH_CHARS) - else: - if len(re.findall(r"\S+", candidate)) >= PROMPT_INPUT_MIN_WORDS: - return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS) - - candidate = join_parts() - if not candidate: - return "" - if _detect_prompt_input_lang(candidate) == "zh": - return _truncate_by_chars(candidate, PROMPT_INPUT_MAX_ZH_CHARS) - return _truncate_by_words(candidate, PROMPT_INPUT_MAX_WORDS) - - -def _make_analysis_cache_key( - product: Dict[str, Any], - target_lang: str, - analysis_kind: str, - category_taxonomy_profile: Optional[str] = None, -) -> str: - """构造缓存 key,仅由分析类型、prompt 实际输入文本内容与目标语言决定。""" - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - prompt_input = _build_prompt_input_text(product) - h = hashlib.md5(prompt_input.encode("utf-8")).hexdigest() - prompt_contract = { - "schema_name": schema.name, - "cache_version": schema.cache_version, - "system_message": SYSTEM_MESSAGE, - "user_instruction_template": USER_INSTRUCTION_TEMPLATE, - "shared_instruction": schema.shared_instruction, - "assistant_headers": schema.get_headers(target_lang), - "result_fields": schema.result_fields, - "meaningful_fields": schema.meaningful_fields, - "field_aliases": schema.field_aliases, - } - prompt_contract_hash = hashlib.md5( - json.dumps(prompt_contract, ensure_ascii=False, sort_keys=True).encode("utf-8") - ).hexdigest()[:12] - return ( - f"{ANCHOR_CACHE_PREFIX}:{analysis_kind}:{prompt_contract_hash}:" - f"{target_lang}:{prompt_input[:4]}{h}" - ) - - -def _make_anchor_cache_key( - product: Dict[str, Any], - target_lang: str, -) -> str: - return _make_analysis_cache_key(product, target_lang, analysis_kind="content") - - -def _get_cached_analysis_result( - product: Dict[str, Any], - target_lang: str, - analysis_kind: str, - category_taxonomy_profile: Optional[str] = None, -) -> Optional[Dict[str, Any]]: - if not _anchor_redis: - return None - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - try: - key = _make_analysis_cache_key( - product, - target_lang, - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - raw = _anchor_redis.get(key) - if not raw: - return None - result = _normalize_analysis_result( - json.loads(raw), - product=product, - target_lang=target_lang, - schema=schema, - ) - if not _has_meaningful_analysis_content(result, schema): - return None - return result - except Exception as e: - logger.warning("Failed to get %s analysis cache: %s", analysis_kind, e) - return None - - -def _get_cached_anchor_result( - product: Dict[str, Any], - target_lang: str, -) -> Optional[Dict[str, Any]]: - return _get_cached_analysis_result(product, target_lang, analysis_kind="content") - - -def _set_cached_analysis_result( - product: Dict[str, Any], - target_lang: str, - result: Dict[str, Any], - analysis_kind: str, - category_taxonomy_profile: Optional[str] = None, -) -> None: - if not _anchor_redis: - return - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - try: - normalized = _normalize_analysis_result( - result, - product=product, - target_lang=target_lang, - schema=schema, - ) - if not _has_meaningful_analysis_content(normalized, schema): - return - key = _make_analysis_cache_key( - product, - target_lang, - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - ttl = ANCHOR_CACHE_EXPIRE_DAYS * 24 * 3600 - _anchor_redis.setex(key, ttl, json.dumps(normalized, ensure_ascii=False)) - except Exception as e: - logger.warning("Failed to set %s analysis cache: %s", analysis_kind, e) - - -def _set_cached_anchor_result( - product: Dict[str, Any], - target_lang: str, - result: Dict[str, Any], -) -> None: - _set_cached_analysis_result(product, target_lang, result, analysis_kind="content") - - -def _build_assistant_prefix(headers: List[str]) -> str: - header_line = "| " + " | ".join(headers) + " |" - separator_line = "|" + "----|" * len(headers) - return f"{header_line}\n{separator_line}\n" - - -def _build_shared_context(products: List[Dict[str, str]], schema: AnalysisSchema) -> str: - shared_context = schema.shared_instruction - for idx, product in enumerate(products, 1): - prompt_input = _build_prompt_input_text(product) - shared_context += f"{idx}. {prompt_input}\n" - return shared_context - - -def _hash_text(text: str) -> str: - return hashlib.md5((text or "").encode("utf-8")).hexdigest()[:12] - - -def _mark_shared_context_logged_once(shared_context_key: str) -> bool: - with _logged_shared_context_lock: - if shared_context_key in _logged_shared_context_keys: - _logged_shared_context_keys.move_to_end(shared_context_key) - return False - - _logged_shared_context_keys[shared_context_key] = None - if len(_logged_shared_context_keys) > LOGGED_SHARED_CONTEXT_CACHE_SIZE: - _logged_shared_context_keys.popitem(last=False) - return True - - -def reset_logged_shared_context_keys() -> None: - """测试辅助:清理已记录的共享 prompt key。""" - with _logged_shared_context_lock: - _logged_shared_context_keys.clear() - - -def create_prompt( - products: List[Dict[str, str]], - target_lang: str = "zh", - analysis_kind: str = "content", - category_taxonomy_profile: Optional[str] = None, -) -> Tuple[Optional[str], Optional[str], Optional[str]]: - """根据目标语言创建共享上下文、本地化输出要求和 Partial Mode assistant 前缀。""" - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - markdown_table_headers = schema.get_headers(target_lang) - if not markdown_table_headers: - logger.warning( - "Unsupported target_lang for markdown table headers: kind=%s lang=%s", - analysis_kind, - target_lang, - ) - return None, None, None - shared_context = _build_shared_context(products, schema) - language_label = SOURCE_LANG_CODE_MAP.get(target_lang, target_lang) - user_prompt = USER_INSTRUCTION_TEMPLATE.format(language=language_label).strip() - assistant_prefix = _build_assistant_prefix(markdown_table_headers) - return shared_context, user_prompt, assistant_prefix - - -def _merge_partial_response(assistant_prefix: str, generated_content: str) -> str: - """将 Partial Mode 的 assistant 前缀与补全文本拼成完整 markdown。""" - generated = (generated_content or "").lstrip() - prefix_lines = [line.strip() for line in assistant_prefix.strip().splitlines()] - generated_lines = generated.splitlines() - - if generated_lines: - first_line = generated_lines[0].strip() - if prefix_lines and first_line == prefix_lines[0]: - generated_lines = generated_lines[1:] - if generated_lines and len(prefix_lines) > 1 and generated_lines[0].strip() == prefix_lines[1]: - generated_lines = generated_lines[1:] - elif len(prefix_lines) > 1 and first_line == prefix_lines[1]: - generated_lines = generated_lines[1:] - - suffix = "\n".join(generated_lines).lstrip("\n") - if suffix: - return f"{assistant_prefix}{suffix}" - return assistant_prefix - - -def call_llm( - shared_context: str, - user_prompt: str, - assistant_prefix: str, - target_lang: str = "zh", - analysis_kind: str = "content", -) -> Tuple[str, str]: - """调用大模型 API(带重试机制),使用 Partial Mode 强制 markdown 表格前缀。""" - headers = { - "Authorization": f"Bearer {API_KEY}", - "Content-Type": "application/json", - } - shared_context_key = _hash_text(shared_context) - localized_tail_key = _hash_text(f"{target_lang}\n{user_prompt}\n{assistant_prefix}") - combined_user_prompt = f"{shared_context.rstrip()}\n\n{user_prompt.strip()}" - - payload = { - "model": MODEL_NAME, - "messages": [ - { - "role": "system", - "content": SYSTEM_MESSAGE, - }, - { - "role": "user", - "content": combined_user_prompt, - }, - { - "role": "assistant", - "content": assistant_prefix, - "partial": True, - }, - ], - "temperature": 0.3, - "top_p": 0.8, - } - - request_data = { - "headers": {k: v for k, v in headers.items() if k != "Authorization"}, - "payload": payload, - } - - if _mark_shared_context_logged_once(shared_context_key): - logger.info(f"\n{'=' * 80}") - logger.info( - "LLM Shared Context [model=%s, kind=%s, shared_key=%s, chars=%s] (logged once per process key)", - MODEL_NAME, - analysis_kind, - shared_context_key, - len(shared_context), - ) - logger.info("\nSystem Message:\n%s", SYSTEM_MESSAGE) - logger.info("\nShared Context:\n%s", shared_context) - - verbose_logger.info(f"\n{'=' * 80}") - verbose_logger.info( - "LLM Request [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:", - MODEL_NAME, - analysis_kind, - target_lang, - shared_context_key, - localized_tail_key, - ) - verbose_logger.info(json.dumps(request_data, ensure_ascii=False, indent=2)) - verbose_logger.info(f"\nCombined User Prompt:\n{combined_user_prompt}") - verbose_logger.info(f"\nShared Context:\n{shared_context}") - verbose_logger.info(f"\nLocalized Requirement:\n{user_prompt}") - verbose_logger.info(f"\nAssistant Prefix:\n{assistant_prefix}") - - logger.info( - "\nLLM Request Variant [kind=%s, lang=%s, shared_key=%s, tail_key=%s, prompt_chars=%s, prefix_chars=%s]", - analysis_kind, - target_lang, - shared_context_key, - localized_tail_key, - len(user_prompt), - len(assistant_prefix), - ) - logger.info("\nLocalized Requirement:\n%s", user_prompt) - logger.info("\nAssistant Prefix:\n%s", assistant_prefix) - - # 创建session,禁用代理 - session = requests.Session() - session.trust_env = False # 忽略系统代理设置 - - try: - # 重试机制 - for attempt in range(MAX_RETRIES): - try: - response = session.post( - f"{API_BASE_URL}/chat/completions", - headers=headers, - json=payload, - timeout=REQUEST_TIMEOUT, - proxies={"http": None, "https": None}, # 明确禁用代理 - ) - - response.raise_for_status() - result = response.json() - usage = result.get("usage") or {} - - verbose_logger.info( - "\nLLM Response [model=%s, kind=%s, lang=%s, shared_key=%s, tail_key=%s]:", - MODEL_NAME, - analysis_kind, - target_lang, - shared_context_key, - localized_tail_key, - ) - verbose_logger.info(json.dumps(result, ensure_ascii=False, indent=2)) - - generated_content = result["choices"][0]["message"]["content"] - full_markdown = _merge_partial_response(assistant_prefix, generated_content) - - logger.info( - "\nLLM Response Summary [kind=%s, lang=%s, shared_key=%s, tail_key=%s, generated_chars=%s, completion_tokens=%s, prompt_tokens=%s, total_tokens=%s]", - analysis_kind, - target_lang, - shared_context_key, - localized_tail_key, - len(generated_content or ""), - usage.get("completion_tokens"), - usage.get("prompt_tokens"), - usage.get("total_tokens"), - ) - logger.info("\nGenerated Content:\n%s", generated_content) - logger.info("\nMerged Markdown:\n%s", full_markdown) - - verbose_logger.info(f"\nGenerated Content:\n{generated_content}") - verbose_logger.info(f"\nMerged Markdown:\n{full_markdown}") - - return full_markdown, json.dumps(result, ensure_ascii=False) - - except requests.exceptions.ProxyError as e: - logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Proxy error - {str(e)}") - if attempt < MAX_RETRIES - 1: - logger.info(f"Retrying in {RETRY_DELAY} seconds...") - time.sleep(RETRY_DELAY) - else: - raise - - except requests.exceptions.RequestException as e: - logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES}: Request error - {str(e)}") - if attempt < MAX_RETRIES - 1: - logger.info(f"Retrying in {RETRY_DELAY} seconds...") - time.sleep(RETRY_DELAY) - else: - raise - - except Exception as e: - logger.error(f"Unexpected error on attempt {attempt + 1}/{MAX_RETRIES}: {str(e)}") - if attempt < MAX_RETRIES - 1: - logger.info(f"Retrying in {RETRY_DELAY} seconds...") - time.sleep(RETRY_DELAY) - else: - raise - - finally: - session.close() - - -def parse_markdown_table( - markdown_content: str, - analysis_kind: str = "content", - category_taxonomy_profile: Optional[str] = None, -) -> List[Dict[str, str]]: - """解析markdown表格内容""" - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - lines = markdown_content.strip().split("\n") - data = [] - data_started = False - - for line in lines: - line = line.strip() - if not line: - continue - - # 表格行处理 - if line.startswith("|"): - # 分隔行(---- 或 :---: 等;允许空格,如 "| ---- | ---- |") - sep_chars = line.replace("|", "").strip().replace(" ", "") - if sep_chars and set(sep_chars) <= {"-", ":"}: - data_started = True - continue - - # 首个表头行:无论语言如何,统一跳过 - if not data_started: - # 等待下一行数据行 - continue - - # 解析数据行 - parts = [p.strip() for p in line.split("|")] - if parts and parts[0] == "": - parts = parts[1:] - if parts and parts[-1] == "": - parts = parts[:-1] - - if len(parts) >= 2: - row = {"seq_no": parts[0]} - for field_index, field_name in enumerate(schema.result_fields, start=1): - cell = parts[field_index] if len(parts) > field_index else "" - row[field_name] = _normalize_markdown_table_cell(cell) - data.append(row) - - return data - - -def _log_parsed_result_quality( - batch_data: List[Dict[str, str]], - parsed_results: List[Dict[str, str]], - target_lang: str, - batch_num: int, - analysis_kind: str, - category_taxonomy_profile: Optional[str] = None, -) -> None: - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - expected = len(batch_data) - actual = len(parsed_results) - if actual != expected: - logger.warning( - "Parsed row count mismatch for kind=%s batch=%s lang=%s: expected=%s actual=%s", - analysis_kind, - batch_num, - target_lang, - expected, - actual, - ) - - if not schema.quality_fields: - logger.info( - "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s", - analysis_kind, - batch_num, - target_lang, - actual, - expected, - ) - return - - missing_summary = ", ".join( - f"missing_{field}=" - f"{sum(1 for item in parsed_results if not str(item.get(field) or '').strip())}" - for field in schema.quality_fields - ) - logger.info( - "Parsed Quality Summary [kind=%s, batch=%s, lang=%s]: rows=%s/%s, %s", - analysis_kind, - batch_num, - target_lang, - actual, - expected, - missing_summary, - ) - - -def process_batch( - batch_data: List[Dict[str, str]], - batch_num: int, - target_lang: str = "zh", - analysis_kind: str = "content", - category_taxonomy_profile: Optional[str] = None, -) -> List[Dict[str, Any]]: - """处理一个批次的数据""" - schema = _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - logger.info(f"\n{'#' * 80}") - logger.info( - "Processing Batch %s (%s items, kind=%s)", - batch_num, - len(batch_data), - analysis_kind, - ) - - # 创建提示词 - shared_context, user_prompt, assistant_prefix = create_prompt( - batch_data, - target_lang=target_lang, - analysis_kind=analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - - # 如果提示词创建失败(例如不支持的 target_lang),本次批次整体失败,不再继续调用 LLM - if shared_context is None or user_prompt is None or assistant_prefix is None: - logger.error( - "Failed to create prompt for batch %s, kind=%s, target_lang=%s; " - "marking entire batch as failed without calling LLM", - batch_num, - analysis_kind, - target_lang, - ) - return [ - _make_empty_analysis_result( - item, - target_lang, - schema, - error=f"prompt_creation_failed: unsupported target_lang={target_lang}", - ) - for item in batch_data - ] - - # 调用LLM - try: - raw_response, full_response_json = call_llm( - shared_context, - user_prompt, - assistant_prefix, - target_lang=target_lang, - analysis_kind=analysis_kind, - ) - - # 解析结果 - parsed_results = parse_markdown_table( - raw_response, - analysis_kind=analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - _log_parsed_result_quality( - batch_data, - parsed_results, - target_lang, - batch_num, - analysis_kind, - category_taxonomy_profile, - ) - - logger.info(f"\nParsed Results ({len(parsed_results)} items):") - logger.info(json.dumps(parsed_results, ensure_ascii=False, indent=2)) - - # 映射回原始ID - results_with_ids = [] - for i, parsed_item in enumerate(parsed_results): - if i < len(batch_data): - source_product = batch_data[i] - result = _normalize_analysis_result( - parsed_item, - product=source_product, - target_lang=target_lang, - schema=schema, - ) - results_with_ids.append(result) - logger.info( - "Mapped: kind=%s seq=%s -> original_id=%s", - analysis_kind, - parsed_item.get("seq_no"), - source_product.get("id"), - ) - - # 保存批次 JSON 日志到独立文件 - batch_log = { - "batch_num": batch_num, - "analysis_kind": analysis_kind, - "timestamp": datetime.now().isoformat(), - "input_products": batch_data, - "raw_response": raw_response, - "full_response_json": full_response_json, - "parsed_results": parsed_results, - "final_results": results_with_ids, - } - - # 并发写 batch json 日志时,保证文件名唯一避免覆盖 - batch_call_id = uuid.uuid4().hex[:12] - batch_log_file = ( - LOG_DIR - / f"batch_{analysis_kind}_{batch_num:04d}_{timestamp}_{batch_call_id}.json" - ) - with open(batch_log_file, "w", encoding="utf-8") as f: - json.dump(batch_log, f, ensure_ascii=False, indent=2) - - logger.info(f"Batch log saved to: {batch_log_file}") - - return results_with_ids - - except Exception as e: - logger.error(f"Error processing batch {batch_num}: {str(e)}", exc_info=True) - # 返回空结果,保持ID映射 - return [ - _make_empty_analysis_result(item, target_lang, schema, error=str(e)) - for item in batch_data - ] - - -def analyze_products( - products: List[Dict[str, str]], - target_lang: str = "zh", - batch_size: Optional[int] = None, - tenant_id: Optional[str] = None, - analysis_kind: str = "content", - category_taxonomy_profile: Optional[str] = None, -) -> List[Dict[str, Any]]: - """ - 库调用入口:根据输入+语言,返回锚文本及各维度信息。 - - Args: - products: [{"id": "...", "title": "..."}] - target_lang: 输出语言 - batch_size: 批大小,默认使用全局 BATCH_SIZE - """ - if not API_KEY: - raise RuntimeError("DASHSCOPE_API_KEY is not set, cannot call LLM") - - if not products: - return [] - - _get_analysis_schema( - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - results_by_index: List[Optional[Dict[str, Any]]] = [None] * len(products) - uncached_items: List[Tuple[int, Dict[str, str]]] = [] - - for idx, product in enumerate(products): - title = str(product.get("title") or "").strip() - if not title: - uncached_items.append((idx, product)) - continue - - cached = _get_cached_analysis_result( - product, - target_lang, - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - if cached: - logger.info( - f"[analyze_products] Cache hit for title='{title[:50]}...', " - f"kind={analysis_kind}, lang={target_lang}" - ) - results_by_index[idx] = cached - continue - - uncached_items.append((idx, product)) - - if not uncached_items: - return [item for item in results_by_index if item is not None] - - # call_llm 一次处理上限固定为 BATCH_SIZE(默认 20): - # - 尽可能攒批处理; - # - 即便调用方传入更大的 batch_size,也会自动按上限拆批。 - req_bs = BATCH_SIZE if batch_size is None else int(batch_size) - bs = max(1, min(req_bs, BATCH_SIZE)) - total_batches = (len(uncached_items) + bs - 1) // bs - - batch_jobs: List[Tuple[int, List[Tuple[int, Dict[str, str]]], List[Dict[str, str]]]] = [] - for i in range(0, len(uncached_items), bs): - batch_num = i // bs + 1 - batch_slice = uncached_items[i : i + bs] - batch = [item for _, item in batch_slice] - batch_jobs.append((batch_num, batch_slice, batch)) - - # 只有一个批次时走串行,减少线程池创建开销与日志/日志文件的不可控交织 - if total_batches <= 1 or CONTENT_UNDERSTANDING_MAX_WORKERS <= 1: - for batch_num, batch_slice, batch in batch_jobs: - logger.info( - f"[analyze_products] Processing batch {batch_num}/{total_batches}, " - f"size={len(batch)}, kind={analysis_kind}, target_lang={target_lang}" - ) - batch_results = process_batch( - batch, - batch_num=batch_num, - target_lang=target_lang, - analysis_kind=analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - - for (original_idx, product), item in zip(batch_slice, batch_results): - results_by_index[original_idx] = item - title_input = str(item.get("title_input") or "").strip() - if not title_input: - continue - if item.get("error"): - # 不缓存错误结果,避免放大临时故障 - continue - try: - _set_cached_analysis_result( - product, - target_lang, - item, - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - except Exception: - # 已在内部记录 warning - pass - else: - max_workers = min(CONTENT_UNDERSTANDING_MAX_WORKERS, len(batch_jobs)) - logger.info( - "[analyze_products] Using ThreadPoolExecutor for uncached batches: " - "max_workers=%s, total_batches=%s, bs=%s, kind=%s, target_lang=%s", - max_workers, - total_batches, - bs, - analysis_kind, - target_lang, - ) - - # 只把“LLM 调用 + markdown 解析”放到线程里;Redis get/set 保持在主线程,避免并发写入带来额外风险。 - # 注意:线程池是模块级单例,因此这里的 max_workers 主要用于日志语义(实际并发受单例池上限约束)。 - executor = _get_content_understanding_executor() - future_by_batch_num: Dict[int, Any] = {} - for batch_num, _batch_slice, batch in batch_jobs: - future_by_batch_num[batch_num] = executor.submit( - process_batch, - batch, - batch_num=batch_num, - target_lang=target_lang, - analysis_kind=analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - - # 按 batch_num 回填,确保输出稳定(results_by_index 是按原始 input index 映射的) - for batch_num, batch_slice, _batch in batch_jobs: - batch_results = future_by_batch_num[batch_num].result() - for (original_idx, product), item in zip(batch_slice, batch_results): - results_by_index[original_idx] = item - title_input = str(item.get("title_input") or "").strip() - if not title_input: - continue - if item.get("error"): - # 不缓存错误结果,避免放大临时故障 - continue - try: - _set_cached_analysis_result( - product, - target_lang, - item, - analysis_kind, - category_taxonomy_profile=category_taxonomy_profile, - ) - except Exception: - # 已在内部记录 warning - pass - - return [item for item in results_by_index if item is not None] diff --git a/indexer/product_enrich_prompts.py b/indexer/product_enrich_prompts.py deleted file mode 100644 index 6c2d05a..0000000 --- a/indexer/product_enrich_prompts.py +++ /dev/null @@ -1,849 +0,0 @@ -#!/usr/bin/env python3 - -from typing import Any, Dict, Tuple - -SYSTEM_MESSAGE = ( - "You are an e-commerce product annotator. " - "Continue the provided assistant Markdown table prefix. " - "Do not repeat or modify the prefix, and do not add explanations outside the table." -) - -SHARED_ANALYSIS_INSTRUCTION = """Analyze each input product text and fill these columns: - -1. Product title: a natural, localized product name based on the input text -2. Category path: a concise category hierarchy from broad to specific, separated by ">" -3. Fine-grained tags: concise tags for style, features, design details, function, or standout selling points -4. Target audience: gender, age group, body type, or suitable users when clearly implied -5. Usage scene: likely occasions, settings, or use cases -6. Applicable season: relevant season(s) based on the product text -7. Key attributes: core product attributes and specifications. Depending on the item type, this may include fit, silhouette, length, sleeve type, neckline, waistline, closure, pattern, design details, structure, or other relevant attribute dimensions -8. Material description: material, fabric, texture, or construction description -9. Functional features: practical or performance-related functions such as stretch, breathability, warmth, support, storage, protection, or ease of wear -10. Anchor text: a search-oriented keyword string covering product type, category intent, attributes, design cues, usage scenarios, and strong shopping phrases - -Rules: -- Keep the input order and row count exactly the same. -- Infer only from the provided input product text; if uncertain, prefer concise and broadly correct ecommerce wording. -- Keep category paths concise and use ">" as the separator. -- For columns with multiple values, the localized output requirement will define the delimiter. - -Input product list: -""" - -USER_INSTRUCTION_TEMPLATE = """Please strictly return a Markdown table following the given columns in the specified language. For any column containing multiple values, separate them with commas. Do not add any other explanation. -Language: {language}""" - -def _taxonomy_field( - key: str, - label: str, - description: str, - zh_label: str | None = None, -) -> Dict[str, str]: - return { - "key": key, - "label": label, - "description": description, - "zh_label": zh_label or label, - } - - -def _build_taxonomy_shared_instruction(profile_label: str, fields: Tuple[Dict[str, str], ...]) -> str: - lines = [ - f"Analyze each input product text and fill the columns below using a {profile_label} attribute taxonomy.", - "", - "Output columns:", - ] - for idx, field in enumerate(fields, start=1): - lines.append(f"{idx}. {field['label']}: {field['description']}") - lines.extend( - [ - "", - "Rules:", - "- Keep the same row order and row count as input.", - "- Leave blank if not applicable, unmentioned, or unsupported.", - "- Use concise, standardized ecommerce wording.", - "- If multiple values, separate with commas.", - "", - "Input product list:", - ] - ) - return "\n".join(lines) - - -def _make_taxonomy_profile( - profile_label: str, - fields: Tuple[Dict[str, str], ...], -) -> Dict[str, Any]: - headers = { - "en": ["No.", *[field["label"] for field in fields]], - "zh": ["序号", *[field["zh_label"] for field in fields]], - } - return { - "profile_label": profile_label, - "fields": fields, - "shared_instruction": _build_taxonomy_shared_instruction(profile_label, fields), - "markdown_table_headers": headers, - } - - -APPAREL_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise ecommerce apparel category label, not a full marketing title", "品类"), - _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied", "目标性别"), - _taxonomy_field("age_group", "Age Group", "only if clearly implied, e.g. adults, kids, teens, toddlers, babies", "年龄段"), - _taxonomy_field("season", "Season", "season(s) or all-season suitability only if supported", "适用季节"), - _taxonomy_field("fit", "Fit", "body closeness, e.g. slim, regular, relaxed, oversized, fitted", "版型"), - _taxonomy_field("silhouette", "Silhouette", "overall garment shape, e.g. straight, A-line, boxy, tapered, bodycon, wide-leg", "廓形"), - _taxonomy_field("neckline", "Neckline", "neckline type when applicable, e.g. crew neck, V-neck, hooded, collared, square neck", "领型"), - _taxonomy_field("sleeve_length_type", "Sleeve Length Type", "sleeve length only, e.g. sleeveless, short sleeve, long sleeve, three-quarter sleeve", "袖长类型"), - _taxonomy_field("sleeve_style", "Sleeve Style", "sleeve design only, e.g. puff sleeve, raglan sleeve, batwing sleeve, bell sleeve", "袖型"), - _taxonomy_field("strap_type", "Strap Type", "strap design when applicable, e.g. spaghetti strap, wide strap, halter strap, adjustable strap", "肩带设计"), - _taxonomy_field("rise_waistline", "Rise / Waistline", "waist placement when applicable, e.g. high rise, mid rise, low rise, empire waist", "腰型"), - _taxonomy_field("leg_shape", "Leg Shape", "for bottoms only, e.g. straight leg, wide leg, flare leg, tapered leg, skinny leg", "裤型"), - _taxonomy_field("skirt_shape", "Skirt Shape", "for skirts only, e.g. A-line, pleated, pencil, mermaid", "裙型"), - _taxonomy_field("length_type", "Length Type", "design length only, not size, e.g. cropped, regular, longline, mini, midi, maxi, ankle length, full length", "长度类型"), - _taxonomy_field("closure_type", "Closure Type", "fastening method when applicable, e.g. zipper, button, drawstring, elastic waist, hook-and-loop", "闭合方式"), - _taxonomy_field("design_details", "Design Details", "construction or visual details, e.g. ruched, ruffled, pleated, cut-out, layered, distressed, split hem", "设计细节"), - _taxonomy_field("fabric", "Fabric", "fabric type only, e.g. denim, knit, chiffon, jersey, fleece, cotton twill", "面料"), - _taxonomy_field("material_composition", "Material Composition", "fiber content or blend only if stated, e.g. cotton, polyester, spandex, linen blend, 95% cotton 5% elastane", "成分"), - _taxonomy_field("fabric_properties", "Fabric Properties", "inherent fabric traits, e.g. stretch, breathable, lightweight, soft-touch, water-resistant", "面料特性"), - _taxonomy_field("clothing_features", "Clothing Features", "product features, e.g. lined, reversible, hooded, packable, padded, pocketed", "服装特征"), - _taxonomy_field("functional_benefits", "Functional Benefits", "wearer benefits, e.g. moisture-wicking, thermal insulation, UV protection, easy care, supportive compression", "功能"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("color_family", "Color Family", "normalized broad retail color group, e.g. black, white, blue, green, red, pink, beige, brown, gray", "色系"), - _taxonomy_field("print_pattern", "Print / Pattern", "surface pattern when applicable, e.g. solid, striped, plaid, floral, graphic, animal print", "印花 / 图案"), - _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use occasion only if supported, e.g. office, casual wear, streetwear, lounge, workout, outdoor", "适用场景"), - _taxonomy_field("style_aesthetic", "Style Aesthetic", "overall style only if supported, e.g. minimalist, streetwear, athleisure, smart casual, romantic, playful", "风格"), -) - -THREE_C_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise 3C accessory or peripheral category label", "品类"), - _taxonomy_field("compatible_device", "Compatible Device / Model", "supported device family, series, model, or form factor when clearly stated", "适配设备 / 型号"), - _taxonomy_field("connectivity", "Connectivity", "connection method such as wired, wireless, Bluetooth, Wi-Fi, NFC, or 2.4G", "连接方式"), - _taxonomy_field("interface_port_type", "Interface / Port Type", "relevant connector or port, e.g. USB-C, Lightning, HDMI, AUX, RJ45", "接口 / 端口类型"), - _taxonomy_field("power_charging", "Power Source / Charging", "charging or power mode, e.g. battery powered, fast charging, rechargeable, plug-in", "供电 / 充电方式"), - _taxonomy_field("key_features", "Key Features", "primary hardware features such as noise cancelling, foldable, magnetic, backlit, waterproof", "关键特征"), - _taxonomy_field("material_finish", "Material / Finish", "main material or exterior finish when supported", "材质 / 表面处理"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("pack_size", "Pack Size", "unit count or bundle size when stated", "包装规格"), - _taxonomy_field("use_case", "Use Case", "intended usage such as travel, office, gaming, car, charging, streaming", "使用场景"), -) - -BAGS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise bag category such as backpack, tote bag, crossbody bag, luggage, or wallet", "品类"), - _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied", "目标性别"), - _taxonomy_field("carry_style", "Carry Style", "how the bag is worn or carried, e.g. handheld, shoulder, crossbody, backpack", "携带方式"), - _taxonomy_field("size_capacity", "Size / Capacity", "size tier or capacity when supported, e.g. mini, large capacity, 20L", "尺寸 / 容量"), - _taxonomy_field("material", "Material", "main bag material such as leather, nylon, canvas, PU, straw", "材质"), - _taxonomy_field("closure_type", "Closure Type", "bag closure such as zipper, flap, buckle, drawstring, magnetic snap", "闭合方式"), - _taxonomy_field("structure_compartments", "Structure / Compartments", "organizational structure such as multi-pocket, laptop sleeve, card slots, expandable", "结构 / 分层"), - _taxonomy_field("strap_handle_type", "Strap / Handle Type", "strap or handle design such as chain strap, top handle, adjustable strap", "肩带 / 提手类型"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as commute, travel, evening, school, casual", "适用场景"), -) - -PET_SUPPLIES_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise pet supplies category label", "品类"), - _taxonomy_field("pet_type", "Pet Type", "target pet such as dog, cat, bird, fish, hamster", "宠物类型"), - _taxonomy_field("breed_size", "Breed Size", "pet size or breed size when stated, e.g. small breed, large dogs", "体型 / 品种大小"), - _taxonomy_field("life_stage", "Life Stage", "pet age stage when supported, e.g. puppy, kitten, adult, senior", "成长阶段"), - _taxonomy_field("material_ingredients", "Material / Ingredients", "main material or ingredient composition when supported", "材质 / 成分"), - _taxonomy_field("flavor_scent", "Flavor / Scent", "flavor or scent when applicable", "口味 / 气味"), - _taxonomy_field("key_features", "Key Features", "primary attributes such as interactive, leak-proof, orthopedic, washable, elevated", "关键特征"), - _taxonomy_field("functional_benefits", "Functional Benefits", "benefits such as dental care, calming, digestion support, joint support", "功能"), - _taxonomy_field("size_capacity", "Size / Capacity", "size, count, or net content when stated", "尺寸 / 容量"), - _taxonomy_field("use_scenario", "Use Scenario", "usage such as feeding, training, grooming, travel, indoor play", "使用场景"), -) - -ELECTRONICS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise electronics device or component category label", "品类"), - _taxonomy_field("device_category", "Device Category / Compatibility", "supported platform, component class, or compatible device family when stated", "设备类别 / 兼容性"), - _taxonomy_field("power_voltage", "Power / Voltage", "power, voltage, wattage, or battery spec when supported", "功率 / 电压"), - _taxonomy_field("connectivity", "Connectivity", "connection method such as wired, Bluetooth, Wi-Fi, RF, or smart app control", "连接方式"), - _taxonomy_field("interface_port_type", "Interface / Port Type", "relevant port or interface such as USB-C, AC plug type, HDMI, SATA", "接口 / 端口类型"), - _taxonomy_field("capacity_storage", "Capacity / Storage", "capacity or storage spec such as 256GB, 2TB, 5000mAh", "容量 / 存储"), - _taxonomy_field("key_features", "Key Features", "main product features such as touch control, HD display, noise reduction, smart control", "关键特征"), - _taxonomy_field("material_finish", "Material / Finish", "main housing material or finish when supported", "材质 / 表面处理"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("use_case", "Use Case", "intended use such as home entertainment, office, charging, security, repair", "使用场景"), -) - -OUTDOOR_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise outdoor gear category label", "品类"), - _taxonomy_field("activity_type", "Activity Type", "primary outdoor activity such as camping, hiking, fishing, climbing, travel", "活动类型"), - _taxonomy_field("season_weather", "Season / Weather", "season or weather suitability when supported", "适用季节 / 天气"), - _taxonomy_field("material", "Material", "main material such as aluminum, ripstop nylon, stainless steel, EVA", "材质"), - _taxonomy_field("capacity_size", "Capacity / Size", "size, length, or capacity when stated", "容量 / 尺寸"), - _taxonomy_field("protection_resistance", "Protection / Resistance", "resistance or protection such as waterproof, UV resistant, windproof", "防护 / 耐受性"), - _taxonomy_field("key_features", "Key Features", "primary gear attributes such as foldable, lightweight, insulated, non-slip", "关键特征"), - _taxonomy_field("portability_packability", "Portability / Packability", "carry or storage trait such as collapsible, compact, ultralight, packable", "便携 / 收纳性"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("use_scenario", "Use Scenario", "likely use setting such as campsite, trail, survival kit, beach, picnic", "使用场景"), -) - -HOME_APPLIANCES_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise home appliance category label", "品类"), - _taxonomy_field("appliance_category", "Appliance Category", "functional class such as kitchen appliance, cleaning appliance, personal care appliance", "家电类别"), - _taxonomy_field("power_voltage", "Power / Voltage", "wattage, voltage, plug type, or power supply when supported", "功率 / 电压"), - _taxonomy_field("capacity_coverage", "Capacity / Coverage", "capacity or coverage metric such as 1.5L, 20L, 40sqm", "容量 / 覆盖范围"), - _taxonomy_field("control_method", "Control Method", "operation method such as touch, knob, remote, app control", "控制方式"), - _taxonomy_field("installation_type", "Installation Type", "setup style such as countertop, handheld, portable, wall-mounted, built-in", "安装方式"), - _taxonomy_field("key_features", "Key Features", "main product features such as timer, steam, HEPA filter, self-cleaning", "关键特征"), - _taxonomy_field("material_finish", "Material / Finish", "main material or exterior finish when supported", "材质 / 表面处理"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("use_scenario", "Use Scenario", "intended use such as cooking, cleaning, grooming, cooling, air treatment", "使用场景"), -) - -HOME_LIVING_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise home and living category label", "品类"), - _taxonomy_field("room_placement", "Room / Placement", "intended room or placement such as bedroom, kitchen, bathroom, desktop", "适用空间 / 摆放位置"), - _taxonomy_field("material", "Material", "main material such as wood, ceramic, cotton, glass, metal", "材质"), - _taxonomy_field("style", "Style", "home style such as modern, farmhouse, minimalist, boho, Nordic", "风格"), - _taxonomy_field("size_dimensions", "Size / Dimensions", "size or dimensions when stated", "尺寸 / 规格"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("pattern_finish", "Pattern / Finish", "surface pattern or finish such as solid, marble, matte, ribbed", "图案 / 表面处理"), - _taxonomy_field("key_features", "Key Features", "main product features such as stackable, washable, blackout, space-saving", "关键特征"), - _taxonomy_field("assembly_installation", "Assembly / Installation", "assembly or installation trait when supported", "组装 / 安装"), - _taxonomy_field("use_scenario", "Use Scenario", "intended use such as storage, dining, decor, sleep, organization", "使用场景"), -) - -WIGS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise wig or hairpiece category label", "品类"), - _taxonomy_field("hair_material", "Hair Material", "hair material such as human hair, synthetic fiber, heat-resistant fiber", "发丝材质"), - _taxonomy_field("hair_texture", "Hair Texture", "texture or curl pattern such as straight, body wave, curly, kinky", "发质纹理"), - _taxonomy_field("hair_length", "Hair Length", "hair length when stated", "发长"), - _taxonomy_field("hair_color", "Hair Color", "specific hair color or blend when available", "发色"), - _taxonomy_field("cap_construction", "Cap Construction", "cap type such as full lace, lace front, glueless, U part", "帽网结构"), - _taxonomy_field("lace_area_part_type", "Lace Area / Part Type", "lace size or part style such as 13x4 lace, middle part, T part", "蕾丝面积 / 分缝类型"), - _taxonomy_field("density_volume", "Density / Volume", "hair density or fullness when supported", "密度 / 发量"), - _taxonomy_field("style_bang_type", "Style / Bang Type", "style cue such as bob, pixie, layered, with bangs", "款式 / 刘海类型"), - _taxonomy_field("occasion_end_use", "Occasion / End Use", "intended use such as daily wear, cosplay, protective style, party", "适用场景"), -) - -BEAUTY_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise beauty or cosmetics category label", "品类"), - _taxonomy_field("target_area", "Target Area", "target area such as face, lips, eyes, nails, hair, body", "适用部位"), - _taxonomy_field("skin_hair_type", "Skin Type / Hair Type", "suitable skin or hair type when supported", "肤质 / 发质"), - _taxonomy_field("finish_effect", "Finish / Effect", "cosmetic finish or effect such as matte, dewy, volumizing, brightening", "妆效 / 效果"), - _taxonomy_field("key_ingredients", "Key Ingredients", "notable ingredients when stated", "关键成分"), - _taxonomy_field("shade_color", "Shade / Color", "specific shade or color when available", "色号 / 颜色"), - _taxonomy_field("scent", "Scent", "fragrance or scent only when supported", "香味"), - _taxonomy_field("formulation", "Formulation", "product form such as cream, serum, powder, gel, stick", "剂型 / 形态"), - _taxonomy_field("functional_benefits", "Functional Benefits", "benefits such as hydration, anti-aging, long-wear, repair, sun protection", "功能"), - _taxonomy_field("use_scenario", "Use Scenario", "intended use such as daily routine, salon, travel, evening makeup", "使用场景"), -) - -ACCESSORIES_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise accessory category label such as necklace, watch, belt, hat, or sunglasses", "品类"), - _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied", "目标性别"), - _taxonomy_field("material", "Material", "main material such as alloy, leather, stainless steel, acetate, fabric", "材质"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("pattern_finish", "Pattern / Finish", "surface treatment or style finish such as polished, textured, braided, rhinestone", "图案 / 表面处理"), - _taxonomy_field("closure_fastening", "Closure / Fastening", "fastening method when applicable", "闭合 / 固定方式"), - _taxonomy_field("size_fit", "Size / Fit", "size or fit information such as adjustable, one size, 42mm", "尺寸 / 适配"), - _taxonomy_field("style", "Style", "style cue such as minimalist, vintage, statement, sporty", "风格"), - _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as daily wear, formal, party, travel, sun protection", "适用场景"), - _taxonomy_field("set_pack_size", "Set / Pack Size", "set count or pack size when stated", "套装 / 规格"), -) - -TOYS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise toy category label", "品类"), - _taxonomy_field("age_group", "Age Group", "intended age group when clearly implied", "年龄段"), - _taxonomy_field("character_theme", "Character / Theme", "licensed character, theme, or play theme when supported", "角色 / 主题"), - _taxonomy_field("material", "Material", "main toy material such as plush, plastic, wood, silicone", "材质"), - _taxonomy_field("power_source", "Power Source", "battery, rechargeable, wind-up, or non-powered when supported", "供电方式"), - _taxonomy_field("interactive_features", "Interactive Features", "interactive functions such as sound, lights, remote control, motion", "互动功能"), - _taxonomy_field("educational_play_value", "Educational / Play Value", "play value such as STEM, pretend play, sensory, puzzle solving", "教育 / 可玩性"), - _taxonomy_field("piece_count_size", "Piece Count / Size", "piece count or size when stated", "件数 / 尺寸"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("use_scenario", "Use Scenario", "intended use such as indoor play, bath time, party favor, outdoor play", "使用场景"), -) - -SHOES_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise footwear category label", "品类"), - _taxonomy_field("target_gender", "Target Gender", "intended gender only if clearly implied", "目标性别"), - _taxonomy_field("age_group", "Age Group", "only if clearly implied", "年龄段"), - _taxonomy_field("closure_type", "Closure Type", "fastening method such as lace-up, slip-on, buckle, hook-and-loop", "闭合方式"), - _taxonomy_field("toe_shape", "Toe Shape", "toe shape when applicable, e.g. round toe, pointed toe, open toe", "鞋头形状"), - _taxonomy_field("heel_sole_type", "Heel Height / Sole Type", "heel or sole profile such as flat, block heel, wedge, platform, thick sole", "跟高 / 鞋底类型"), - _taxonomy_field("upper_material", "Upper Material", "main upper material such as leather, knit, canvas, mesh", "鞋面材质"), - _taxonomy_field("lining_insole_material", "Lining / Insole Material", "lining or insole material when supported", "里料 / 鞋垫材质"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("occasion_end_use", "Occasion / End Use", "likely use such as running, casual, office, hiking, formal", "适用场景"), -) - -SPORTS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise sports product category label", "品类"), - _taxonomy_field("sport_activity", "Sport / Activity", "primary sport or activity such as fitness, yoga, basketball, cycling, swimming", "运动 / 活动"), - _taxonomy_field("skill_level", "Skill Level", "target user level when supported, e.g. beginner, training, professional", "适用水平"), - _taxonomy_field("material", "Material", "main material such as EVA, carbon fiber, neoprene, latex", "材质"), - _taxonomy_field("size_capacity", "Size / Capacity", "size, weight, resistance level, or capacity when stated", "尺寸 / 容量"), - _taxonomy_field("protection_support", "Protection / Support", "support or protection function such as ankle support, shock absorption, impact protection", "防护 / 支撑"), - _taxonomy_field("key_features", "Key Features", "main features such as anti-slip, adjustable, foldable, quick-dry", "关键特征"), - _taxonomy_field("power_source", "Power Source", "battery, electric, or non-powered when applicable", "供电方式"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("use_scenario", "Use Scenario", "intended use such as gym, home workout, field training, competition", "使用场景"), -) - -OTHERS_TAXONOMY_FIELDS = ( - _taxonomy_field("product_type", "Product Type", "concise product category label, not a full marketing title", "品类"), - _taxonomy_field("product_category", "Product Category", "broader retail grouping when the specific product type is narrow", "商品类别"), - _taxonomy_field("target_user", "Target User", "intended user, audience, or recipient when clearly implied", "适用人群"), - _taxonomy_field("material_ingredients", "Material / Ingredients", "main material or ingredients when supported", "材质 / 成分"), - _taxonomy_field("key_features", "Key Features", "primary product attributes or standout features", "关键特征"), - _taxonomy_field("functional_benefits", "Functional Benefits", "practical benefits or performance advantages when supported", "功能"), - _taxonomy_field("size_capacity", "Size / Capacity", "size, count, weight, or capacity when stated", "尺寸 / 容量"), - _taxonomy_field("color", "Color", "specific color name when available", "主颜色"), - _taxonomy_field("style_theme", "Style / Theme", "overall style, design theme, or visual direction when supported", "风格 / 主题"), - _taxonomy_field("use_scenario", "Use Scenario", "likely use occasion or application setting when supported", "使用场景"), -) - -CATEGORY_TAXONOMY_PROFILES: Dict[str, Dict[str, Any]] = { - "apparel": _make_taxonomy_profile( - "apparel", - APPAREL_TAXONOMY_FIELDS, - ), - "3c": _make_taxonomy_profile( - "3C", - THREE_C_TAXONOMY_FIELDS, - ), - "bags": _make_taxonomy_profile( - "bags", - BAGS_TAXONOMY_FIELDS, - ), - "pet_supplies": _make_taxonomy_profile( - "pet supplies", - PET_SUPPLIES_TAXONOMY_FIELDS, - ), - "electronics": _make_taxonomy_profile( - "electronics", - ELECTRONICS_TAXONOMY_FIELDS, - ), - "outdoor": _make_taxonomy_profile( - "outdoor products", - OUTDOOR_TAXONOMY_FIELDS, - ), - "home_appliances": _make_taxonomy_profile( - "home appliances", - HOME_APPLIANCES_TAXONOMY_FIELDS, - ), - "home_living": _make_taxonomy_profile( - "home and living", - HOME_LIVING_TAXONOMY_FIELDS, - ), - "wigs": _make_taxonomy_profile( - "wigs", - WIGS_TAXONOMY_FIELDS, - ), - "beauty": _make_taxonomy_profile( - "beauty and cosmetics", - BEAUTY_TAXONOMY_FIELDS, - ), - "accessories": _make_taxonomy_profile( - "accessories", - ACCESSORIES_TAXONOMY_FIELDS, - ), - "toys": _make_taxonomy_profile( - "toys", - TOYS_TAXONOMY_FIELDS, - ), - "shoes": _make_taxonomy_profile( - "shoes", - SHOES_TAXONOMY_FIELDS, - ), - "sports": _make_taxonomy_profile( - "sports products", - SPORTS_TAXONOMY_FIELDS, - ), - "others": _make_taxonomy_profile( - "general merchandise", - OTHERS_TAXONOMY_FIELDS, - ), -} - -TAXONOMY_SHARED_ANALYSIS_INSTRUCTION = CATEGORY_TAXONOMY_PROFILES["apparel"]["shared_instruction"] -TAXONOMY_MARKDOWN_TABLE_HEADERS_EN = CATEGORY_TAXONOMY_PROFILES["apparel"]["markdown_table_headers"]["en"] -TAXONOMY_LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = CATEGORY_TAXONOMY_PROFILES["apparel"]["markdown_table_headers"] - -LANGUAGE_MARKDOWN_TABLE_HEADERS: Dict[str, Dict[str, Any]] = { - "en": [ - "No.", - "Product title", - "Category path", - "Fine-grained tags", - "Target audience", - "Usage scene", - "Season", - "Key attributes", - "Material", - "Features", - "Anchor text" - ], - "zh": [ - "序号", - "商品标题", - "品类路径", - "细分标签", - "适用人群", - "使用场景", - "适用季节", - "关键属性", - "材质说明", - "功能特点", - "锚文本" - ], - "zh_tw": [ - "序號", - "商品標題", - "品類路徑", - "細分標籤", - "適用人群", - "使用場景", - "適用季節", - "關鍵屬性", - "材質說明", - "功能特點", - "錨文本" - ], - "ru": [ - "№", - "Название товара", - "Путь категории", - "Детализированные теги", - "Целевая аудитория", - "Сценарий использования", - "Сезон", - "Ключевые атрибуты", - "Материал", - "Особенности", - "Анкорный текст" - ], - "ja": [ - "番号", - "商品タイトル", - "カテゴリパス", - "詳細タグ", - "対象ユーザー", - "利用シーン", - "季節", - "主要属性", - "素材", - "機能特徴", - "アンカーテキスト" - ], - "ko": [ - "번호", - "상품 제목", - "카테고리 경로", - "세부 태그", - "대상 고객", - "사용 장면", - "계절", - "핵심 속성", - "소재", - "기능 특징", - "앵커 텍스트" - ], - "es": [ - "N.º", - "Titulo del producto", - "Ruta de categoria", - "Etiquetas detalladas", - "Publico objetivo", - "Escenario de uso", - "Temporada", - "Atributos clave", - "Material", - "Caracteristicas", - "Texto ancla" - ], - "fr": [ - "N°", - "Titre du produit", - "Chemin de categorie", - "Etiquettes detaillees", - "Public cible", - "Scenario d'utilisation", - "Saison", - "Attributs cles", - "Matiere", - "Caracteristiques", - "Texte d'ancrage" - ], - "pt": [ - "Nº", - "Titulo do produto", - "Caminho da categoria", - "Tags detalhadas", - "Publico-alvo", - "Cenario de uso", - "Estacao", - "Atributos principais", - "Material", - "Caracteristicas", - "Texto ancora" - ], - "de": [ - "Nr.", - "Produkttitel", - "Kategoriepfad", - "Detaillierte Tags", - "Zielgruppe", - "Nutzungsszenario", - "Saison", - "Wichtige Attribute", - "Material", - "Funktionen", - "Ankertext" - ], - "it": [ - "N.", - "Titolo del prodotto", - "Percorso categoria", - "Tag dettagliati", - "Pubblico target", - "Scenario d'uso", - "Stagione", - "Attributi chiave", - "Materiale", - "Caratteristiche", - "Testo ancora" - ], - "th": [ - "ลำดับ", - "ชื่อสินค้า", - "เส้นทางหมวดหมู่", - "แท็กย่อย", - "กลุ่มเป้าหมาย", - "สถานการณ์การใช้งาน", - "ฤดูกาล", - "คุณสมบัติสำคัญ", - "วัสดุ", - "คุณสมบัติการใช้งาน", - "แองเคอร์เท็กซ์" - ], - "vi": [ - "STT", - "Tieu de san pham", - "Duong dan danh muc", - "The chi tiet", - "Doi tuong phu hop", - "Boi canh su dung", - "Mua phu hop", - "Thuoc tinh chinh", - "Chat lieu", - "Tinh nang", - "Van ban neo" - ], - "id": [ - "No.", - "Judul produk", - "Jalur kategori", - "Tag terperinci", - "Target pengguna", - "Skenario penggunaan", - "Musim", - "Atribut utama", - "Bahan", - "Fitur", - "Teks jangkar" - ], - "ms": [ - "No.", - "Tajuk produk", - "Laluan kategori", - "Tag terperinci", - "Sasaran pengguna", - "Senario penggunaan", - "Musim", - "Atribut utama", - "Bahan", - "Ciri-ciri", - "Teks sauh" - ], - "ar": [ - "الرقم", - "عنوان المنتج", - "مسار الفئة", - "الوسوم التفصيلية", - "الفئة المستهدفة", - "سيناريو الاستخدام", - "الموسم", - "السمات الرئيسية", - "المادة", - "الميزات", - "نص الربط" - ], - "hi": [ - "क्रमांक", - "उत्पाद शीर्षक", - "श्रेणी पथ", - "विस्तृत टैग", - "लक्षित उपभोक्ता", - "उपयोग परिदृश्य", - "मौसम", - "मुख्य गुण", - "सामग्री", - "विशेषताएं", - "एंकर टेक्स्ट" - ], - "he": [ - "מס׳", - "כותרת המוצר", - "נתיב קטגוריה", - "תגיות מפורטות", - "קהל יעד", - "תרחיש שימוש", - "עונה", - "מאפיינים מרכזיים", - "חומר", - "תכונות", - "טקסט עוגן" - ], - "my": [ - "အမှတ်စဉ်", - "ကုန်ပစ္စည်းခေါင်းစဉ်", - "အမျိုးအစားလမ်းကြောင်း", - "အသေးစိတ်တဂ်များ", - "ပစ်မှတ်အသုံးပြုသူ", - "အသုံးပြုမှုအခြေအနေ", - "ရာသီ", - "အဓိကဂုဏ်သတ္တိများ", - "ပစ္စည်း", - "လုပ်ဆောင်ချက်များ", - "အန်ကာစာသား" - ], - "ta": [ - "எண்", - "தயாரிப்பு தலைப்பு", - "வகை பாதை", - "விரிவான குறிச்சொற்கள்", - "இலக்கு பயனர்கள்", - "பயன்பாட்டு நிலை", - "பருவம்", - "முக்கிய பண்புகள்", - "பொருள்", - "அம்சங்கள்", - "ஆங்கர் உரை" - ], - "ur": [ - "نمبر", - "پروڈکٹ عنوان", - "زمرہ راستہ", - "تفصیلی ٹیگز", - "ہدف صارفین", - "استعمال کا منظر", - "موسم", - "کلیدی خصوصیات", - "مواد", - "فیچرز", - "اینکر ٹیکسٹ" - ], - "bn": [ - "ক্রম", - "পণ্যের শিরোনাম", - "শ্রেণি পথ", - "বিস্তারিত ট্যাগ", - "লক্ষ্য ব্যবহারকারী", - "ব্যবহারের দৃশ্য", - "মৌসুম", - "মূল বৈশিষ্ট্য", - "উপাদান", - "ফিচার", - "অ্যাঙ্কর টেক্সট" - ], - "pl": [ - "Nr", - "Tytul produktu", - "Sciezka kategorii", - "Szczegolowe tagi", - "Grupa docelowa", - "Scenariusz uzycia", - "Sezon", - "Kluczowe atrybuty", - "Material", - "Cechy", - "Tekst kotwicy" - ], - "nl": [ - "Nr.", - "Producttitel", - "Categoriepad", - "Gedetailleerde tags", - "Doelgroep", - "Gebruikscontext", - "Seizoen", - "Belangrijke kenmerken", - "Materiaal", - "Functies", - "Ankertekst" - ], - "ro": [ - "Nr.", - "Titlul produsului", - "Calea categoriei", - "Etichete detaliate", - "Public tinta", - "Scenariu de utilizare", - "Sezon", - "Atribute cheie", - "Material", - "Caracteristici", - "Text ancora" - ], - "tr": [ - "No.", - "Urun basligi", - "Kategori yolu", - "Ayrintili etiketler", - "Hedef kitle", - "Kullanim senaryosu", - "Sezon", - "Temel ozellikler", - "Malzeme", - "Ozellikler", - "Capa metni" - ], - "km": [ - "ល.រ", - "ចំណងជើងផលិតផល", - "ផ្លូវប្រភេទ", - "ស្លាកលម្អិត", - "ក្រុមអ្នកប្រើគោលដៅ", - "សេណារីយ៉ូប្រើប្រាស់", - "រដូវកាល", - "លក្ខណៈសម្បត្តិសំខាន់", - "សម្ភារៈ", - "មុខងារ", - "អត្ថបទអង់ក័រ" - ], - "lo": [ - "ລຳດັບ", - "ຊື່ສິນຄ້າ", - "ເສັ້ນທາງໝວດໝູ່", - "ແທັກລະອຽດ", - "ກຸ່ມເປົ້າໝາຍ", - "ສະຖານະການໃຊ້ງານ", - "ລະດູການ", - "ຄຸນລັກສະນະສຳຄັນ", - "ວັດສະດຸ", - "ຄຸນສົມບັດ", - "ຂໍ້ຄວາມອັງເຄີ" - ], - "yue": [ - "序號", - "商品標題", - "品類路徑", - "細分類標籤", - "適用人群", - "使用場景", - "適用季節", - "關鍵屬性", - "材質說明", - "功能特點", - "錨文本" - ], - "cs": [ - "C.", - "Nazev produktu", - "Cesta kategorie", - "Podrobne stitky", - "Cilova skupina", - "Scenar pouziti", - "Sezona", - "Klicove atributy", - "Material", - "Vlastnosti", - "Kotvici text" - ], - "el": [ - "Α/Α", - "Τίτλος προϊόντος", - "Διαδρομή κατηγορίας", - "Αναλυτικές ετικέτες", - "Κοινό-στόχος", - "Σενάριο χρήσης", - "Εποχή", - "Βασικά χαρακτηριστικά", - "Υλικό", - "Λειτουργίες", - "Κείμενο άγκυρας" - ], - "sv": [ - "Nr", - "Produkttitel", - "Kategorisokvag", - "Detaljerade taggar", - "Malgrupp", - "Anvandningsscenario", - "Sasong", - "Viktiga attribut", - "Material", - "Funktioner", - "Ankartext" - ], - "hu": [ - "Sorszam", - "Termekcim", - "Kategoriavonal", - "Reszletes cimkek", - "Celcsoport", - "Hasznalati helyzet", - "Evszak", - "Fo jellemzok", - "Anyag", - "Funkciok", - "Horgonyszoveg" - ], - "da": [ - "Nr.", - "Produkttitel", - "Kategoristi", - "Detaljerede tags", - "Malgruppe", - "Brugsscenarie", - "Saeson", - "Nogleattributter", - "Materiale", - "Funktioner", - "Ankertekst" - ], - "fi": [ - "Nro", - "Tuotteen nimi", - "Kategoriapolku", - "Yksityiskohtaiset tunnisteet", - "Kohdeyleiso", - "Kayttotilanne", - "Kausi", - "Keskeiset ominaisuudet", - "Materiaali", - "Ominaisuudet", - "Ankkuriteksti" - ], - "uk": [ - "№", - "Назва товару", - "Шлях категорії", - "Детальні теги", - "Цільова аудиторія", - "Сценарій використання", - "Сезон", - "Ключові атрибути", - "Матеріал", - "Особливості", - "Анкорний текст" - ], - "bg": [ - "№", - "Заглавие на продукта", - "Път на категорията", - "Подробни тагове", - "Целева аудитория", - "Сценарий на употреба", - "Сезон", - "Ключови атрибути", - "Материал", - "Характеристики", - "Анкор текст" - ] -} diff --git a/indexer/product_enrich模块说明.md b/indexer/product_enrich模块说明.md deleted file mode 100644 index 1428746..0000000 --- a/indexer/product_enrich模块说明.md +++ /dev/null @@ -1,173 +0,0 @@ -# 内容富化模块说明 - -本文说明商品内容富化模块的职责、入口、输出结构,以及当前 taxonomy profile 的设计约束。 - -## 1. 模块目标 - -内容富化模块负责基于商品文本调用 LLM,生成以下索引字段: - -- `qanchors` -- `enriched_tags` -- `enriched_attributes` -- `enriched_taxonomy_attributes` - -模块追求的设计原则: - -- 单一职责:只负责内容理解与结构化输出,不负责 CSV 读写 -- 输出对齐 ES mapping:返回结构可直接写入 `search_products` -- 配置化扩展:taxonomy profile 通过数据配置扩展,而不是散落条件分支 -- 代码精简:只面向正常使用方式,避免为了不合理调用堆叠补丁逻辑 - -## 2. 主要文件 - -- [product_enrich.py](/data/saas-search/indexer/product_enrich.py) - 运行时主逻辑,负责批处理、缓存、prompt 组装、LLM 调用、markdown 解析、输出整理 -- [product_enrich_prompts.py](/data/saas-search/indexer/product_enrich_prompts.py) - prompt 模板与 taxonomy profile 配置 -- [document_transformer.py](/data/saas-search/indexer/document_transformer.py) - 在内部索引构建链路中调用内容富化模块,把结果回填到 ES doc -- [taxonomy.md](/data/saas-search/indexer/taxonomy.md) - taxonomy 设计说明与字段清单 - -## 3. 对外入口 - -### 3.1 Python 入口 - -核心入口: - -```python -build_index_content_fields( - items, - tenant_id=None, - enrichment_scopes=None, - category_taxonomy_profile=None, -) -``` - -输入最小要求: - -- `id` 或 `spu_id` -- `title` - -可选输入: - -- `brief` -- `description` -- `image_url` - -关键参数: - -- `enrichment_scopes` - 可选 `generic`、`category_taxonomy` -- `category_taxonomy_profile` - taxonomy profile;默认 `apparel` - -### 3.2 HTTP 入口 - -API 路由: - -- `POST /indexer/enrich-content` - -对应文档: - -- [搜索API对接指南-05-索引接口(Indexer)](/data/saas-search/docs/搜索API对接指南-05-索引接口(Indexer).md) -- [搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation)](/data/saas-search/docs/搜索API对接指南-07-微服务接口(Embedding-Reranker-Translation).md) - -## 4. 输出结构 - -返回结果与 ES mapping 对齐: - -```json -{ - "id": "223167", - "qanchors": { - "zh": ["短袖T恤", "纯棉"], - "en": ["t-shirt", "cotton"] - }, - "enriched_tags": { - "zh": ["短袖", "纯棉"], - "en": ["short sleeve", "cotton"] - }, - "enriched_attributes": [ - { - "name": "enriched_tags", - "value": { - "zh": ["短袖", "纯棉"], - "en": ["short sleeve", "cotton"] - } - } - ], - "enriched_taxonomy_attributes": [ - { - "name": "Product Type", - "value": { - "zh": ["T恤"], - "en": ["t-shirt"] - } - } - ] -} -``` - -说明: - -- `generic` 部分固定输出核心索引语言 `zh`、`en` -- `taxonomy` 部分同样统一输出 `zh`、`en` - -## 5. Taxonomy profile - -当前支持: - -- `apparel` -- `3c` -- `bags` -- `pet_supplies` -- `electronics` -- `outdoor` -- `home_appliances` -- `home_living` -- `wigs` -- `beauty` -- `accessories` -- `toys` -- `shoes` -- `sports` -- `others` - -统一约束: - -- 所有 profile 都返回 `zh` + `en` -- profile 只决定 taxonomy 字段集合,不再决定输出语言 -- 所有 profile 都配置中英文字段名,prompt/header 结构保持一致 - -## 6. 内部索引链路的当前约束 - -在内部 ES 文档构建链路里,`document_transformer` 当前调用内容富化时,taxonomy profile 暂时固定传: - -```python -category_taxonomy_profile="apparel" -``` - -这是一种显式、可控、代码更干净的临时策略。 - -当前代码里已保留 TODO: - -- 后续从数据库读取租户真实所属行业 -- 再用该行业替换固定的 `apparel` - -当前不做“根据商品类目文本自动猜 profile”的隐式逻辑,避免增加冗余代码与不必要的不确定性。 - -## 7. 缓存与批处理 - -缓存键由以下信息共同决定: - -- `analysis_kind` -- `target_lang` -- prompt/schema 版本指纹 -- prompt 实际输入文本 - -批处理规则: - -- 单次 LLM 调用最多 20 条 -- 上层允许传更大批次,模块内部自动拆批 -- uncached batch 可并发执行 diff --git a/indexer/spu_transformer.py b/indexer/spu_transformer.py index f4900dc..f6b7e95 100644 --- a/indexer/spu_transformer.py +++ b/indexer/spu_transformer.py @@ -220,7 +220,6 @@ class SPUTransformer: logger.info(f"Grouped options into {len(option_groups)} SPU groups") documents: List[Dict[str, Any]] = [] - doc_spu_rows: List[pd.Series] = [] skipped_count = 0 error_count = 0 @@ -244,11 +243,9 @@ class SPUTransformer: spu_row=spu_row, skus=skus, options=options, - fill_llm_attributes=False, ) if doc: documents.append(doc) - doc_spu_rows.append(spu_row) else: skipped_count += 1 logger.warning(f"SPU {spu_id} transformation returned None, skipped") @@ -256,13 +253,6 @@ class SPUTransformer: error_count += 1 logger.error(f"Error transforming SPU {spu_id}: {e}", exc_info=True) - # 批量填充 LLM 字段(尽量攒批,每次最多 20 条;失败仅 warning,不影响主流程) - try: - if documents and doc_spu_rows: - self.document_transformer.fill_llm_attributes_batch(documents, doc_spu_rows) - except Exception as e: - logger.warning("Batch LLM fill failed in transform_batch: %s", e) - logger.info(f"Transformation complete:") logger.info(f" - Total SPUs: {len(spu_df)}") logger.info(f" - Successfully transformed: {len(documents)}") @@ -270,5 +260,3 @@ class SPUTransformer: logger.info(f" - Errors: {error_count}") return documents - - diff --git a/scripts/debug/trace_indexer_calls.sh b/scripts/debug/trace_indexer_calls.sh index d22b9ea..203c748 100755 --- a/scripts/debug/trace_indexer_calls.sh +++ b/scripts/debug/trace_indexer_calls.sh @@ -66,7 +66,7 @@ echo "" echo " - Indexer 内部会调用:" echo " - Text Embedding 服务 (${EMBEDDING_TEXT_PORT}): POST /embed/text" echo " - Image Embedding 服务 (${EMBEDDING_IMAGE_PORT}): POST /embed/image" -echo " - Qwen API: dashscope.aliyuncs.com (翻译、LLM 分析)" +echo " - Translation 服务 / 翻译后端(按当前配置)" echo " - MySQL: 商品数据" echo " - Elasticsearch: 写入索引" echo "" diff --git a/scripts/redis/redis_cache_health_check.py b/scripts/redis/redis_cache_health_check.py index 8379e13..3a52198 100644 --- a/scripts/redis/redis_cache_health_check.py +++ b/scripts/redis/redis_cache_health_check.py @@ -2,7 +2,7 @@ """ 缓存状态巡检脚本 -按「缓存类型」维度(embedding / translation / anchors)查看: +按「缓存类型」维度(embedding / translation)查看: - 估算 key 数量 - TTL 分布(采样) - 近期活跃 key(按 IDLETIME 近似) @@ -10,12 +10,12 @@ 使用示例: - # 默认:检查已知三类缓存,使用 env_config 中的 Redis 配置 + # 默认:检查已知两类缓存,使用 env_config 中的 Redis 配置 python scripts/redis/redis_cache_health_check.py # 只看某一类缓存 python scripts/redis/redis_cache_health_check.py --type embedding - python scripts/redis/redis_cache_health_check.py --type translation anchors + python scripts/redis/redis_cache_health_check.py --type translation # 自定义前缀(pattern),不限定缓存类型 python scripts/redis/redis_cache_health_check.py --pattern "mycache:*" @@ -27,7 +27,6 @@ from __future__ import annotations import argparse -import json import sys from collections import defaultdict from dataclasses import dataclass @@ -54,7 +53,7 @@ class CacheTypeConfig: def _load_known_cache_types() -> Dict[str, CacheTypeConfig]: - """根据当前配置装配三种已知缓存类型及其前缀 pattern。""" + """根据当前配置装配仓库内仍在使用的缓存类型及其前缀 pattern。""" cache_types: Dict[str, CacheTypeConfig] = {} # embedding 缓存:prefix 来自 REDIS_CONFIG['embedding_cache_prefix'](默认 embedding) @@ -72,14 +71,6 @@ def _load_known_cache_types() -> Dict[str, CacheTypeConfig]: description="翻译结果缓存(translation/service.py)", ) - # anchors 缓存:prefix 来自 REDIS_CONFIG['anchor_cache_prefix'](若存在),否则 product_anchors - anchor_prefix = REDIS_CONFIG.get("anchor_cache_prefix", "product_anchors") - cache_types["anchors"] = CacheTypeConfig( - name="anchors", - pattern=f"{anchor_prefix}:*", - description="商品内容理解缓存(indexer/product_enrich.py,anchors/语义属性/tags)", - ) - return cache_types @@ -162,23 +153,6 @@ def decode_value_preview( except Exception: return f"" - # anchors: JSON dict - if cache_type == "anchors": - try: - text = raw_value.decode("utf-8", errors="replace") - obj = json.loads(text) - if isinstance(obj, dict): - brief = { - k: obj.get(k) - for k in ["id", "lang", "title_input", "title", "category_path", "anchor_text"] - if k in obj - } - return "json " + json.dumps(brief, ensure_ascii=False)[:200] - # 其他情况简单截断 - return "json " + text[:200] - except Exception: - return raw_value.decode("utf-8", errors="replace")[:200] - # translation: 纯字符串 if cache_type == "translation": try: @@ -308,8 +282,8 @@ def main() -> None: "--type", dest="types", nargs="+", - choices=["embedding", "translation", "anchors"], - help="指定要检查的缓存类型(默认:三种全部)", + choices=["embedding", "translation"], + help="指定要检查的缓存类型(默认:两种全部)", ) parser.add_argument( "--pattern", diff --git a/scripts/redis/redis_cache_prefix_stats.py b/scripts/redis/redis_cache_prefix_stats.py index a295dce..fa60f6f 100755 --- a/scripts/redis/redis_cache_prefix_stats.py +++ b/scripts/redis/redis_cache_prefix_stats.py @@ -15,7 +15,7 @@ python scripts/redis/redis_cache_prefix_stats.py --all-db 统计指定数据库: python scripts/redis/redis_cache_prefix_stats.py --db 1 -只统计以下三种前缀: +只统计若干常见前缀: python scripts/redis/redis_cache_prefix_stats.py --prefix trans embedding product 统计所有数据库的指定前缀: diff --git a/tests/ci/test_service_api_contracts.py b/tests/ci/test_service_api_contracts.py index cfa0fc7..4f387b1 100644 --- a/tests/ci/test_service_api_contracts.py +++ b/tests/ci/test_service_api_contracts.py @@ -342,162 +342,15 @@ def test_indexer_build_docs_from_db_contract(indexer_client: TestClient): assert data["docs"][0]["spu_id"] == "1001" -def test_indexer_enrich_content_contract(indexer_client: TestClient, monkeypatch): - import indexer.product_enrich as process_products - - def _fake_build_index_content_fields( - items: List[Dict[str, str]], - tenant_id: str | None = None, - enrichment_scopes: List[str] | None = None, - category_taxonomy_profile: str = "apparel", - ): - assert tenant_id == "162" - assert enrichment_scopes == ["generic", "category_taxonomy"] - assert category_taxonomy_profile == "apparel" - return [ - { - "id": p["spu_id"], - "qanchors": { - "zh": [f"zh-anchor-{p['spu_id']}"], - "en": [f"en-anchor-{p['spu_id']}"], - }, - "enriched_tags": {"zh": ["tag1", "tag2"], "en": ["tag1", "tag2"]}, - "enriched_attributes": [ - {"name": "enriched_tags", "value": {"zh": ["tag1"], "en": ["tag1"]}}, - ], - "enriched_taxonomy_attributes": [ - {"name": "Product Type", "value": {"zh": ["T恤"], "en": ["t-shirt"]}}, - ], - } - for p in items - ] - - monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) - - response = indexer_client.post( - "/indexer/enrich-content", - json={ - "tenant_id": "162", - "enrichment_scopes": ["generic", "category_taxonomy"], - "category_taxonomy_profile": "apparel", - "items": [ - {"spu_id": "1001", "title": "T-shirt"}, - {"spu_id": "1002", "title": "Toy"}, - ], - }, - ) - assert response.status_code == 200 - data = response.json() - assert data["tenant_id"] == "162" - assert data["enrichment_scopes"] == ["generic", "category_taxonomy"] - assert data["category_taxonomy_profile"] == "apparel" - assert data["total"] == 2 - assert len(data["results"]) == 2 - assert data["results"][0]["spu_id"] == "1001" - assert data["results"][0]["qanchors"]["zh"] == ["zh-anchor-1001"] - assert data["results"][0]["qanchors"]["en"] == ["en-anchor-1001"] - assert data["results"][0]["enriched_tags"]["zh"] == ["tag1", "tag2"] - assert data["results"][0]["enriched_tags"]["en"] == ["tag1", "tag2"] - assert data["results"][0]["enriched_attributes"][0] == { - "name": "enriched_tags", - "value": {"zh": ["tag1"], "en": ["tag1"]}, - } - assert data["results"][0]["enriched_taxonomy_attributes"][0] == { - "name": "Product Type", - "value": {"zh": ["T恤"], "en": ["t-shirt"]}, - } - - -def test_indexer_enrich_content_contract_accepts_deprecated_analysis_kinds(indexer_client: TestClient, monkeypatch): - import indexer.product_enrich as process_products - - seen: Dict[str, Any] = {} - - def _fake_build_index_content_fields( - items: List[Dict[str, str]], - tenant_id: str | None = None, - enrichment_scopes: List[str] | None = None, - category_taxonomy_profile: str = "apparel", - ): - seen["tenant_id"] = tenant_id - seen["enrichment_scopes"] = enrichment_scopes - seen["category_taxonomy_profile"] = category_taxonomy_profile - return [ - { - "id": items[0]["spu_id"], - "qanchors": {}, - "enriched_tags": {}, - "enriched_attributes": [], - "enriched_taxonomy_attributes": [], - } - ] - - monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) - +def test_indexer_enrich_content_route_removed(indexer_client: TestClient): response = indexer_client.post( "/indexer/enrich-content", json={ "tenant_id": "162", - "analysis_kinds": ["taxonomy"], "items": [{"spu_id": "1001", "title": "T-shirt"}], }, ) - - assert response.status_code == 200 - data = response.json() - assert seen == { - "tenant_id": "162", - "enrichment_scopes": ["category_taxonomy"], - "category_taxonomy_profile": "apparel", - } - assert data["enrichment_scopes"] == ["category_taxonomy"] - assert data["category_taxonomy_profile"] == "apparel" - - -def test_indexer_enrich_content_contract_supports_non_apparel_taxonomy_profiles(indexer_client: TestClient, monkeypatch): - import indexer.product_enrich as process_products - - def _fake_build_index_content_fields( - items: List[Dict[str, str]], - tenant_id: str | None = None, - enrichment_scopes: List[str] | None = None, - category_taxonomy_profile: str = "apparel", - ): - assert tenant_id == "162" - assert enrichment_scopes == ["category_taxonomy"] - assert category_taxonomy_profile == "toys" - return [ - { - "id": items[0]["spu_id"], - "qanchors": {}, - "enriched_tags": {}, - "enriched_attributes": [], - "enriched_taxonomy_attributes": [ - {"name": "Product Type", "value": {"en": ["doll set"]}}, - {"name": "Age Group", "value": {"en": ["kids"]}}, - ], - } - ] - - monkeypatch.setattr(process_products, "build_index_content_fields", _fake_build_index_content_fields) - - response = indexer_client.post( - "/indexer/enrich-content", - json={ - "tenant_id": "162", - "enrichment_scopes": ["category_taxonomy"], - "category_taxonomy_profile": "toys", - "items": [{"spu_id": "1001", "title": "Toy"}], - }, - ) - - assert response.status_code == 200 - data = response.json() - assert data["category_taxonomy_profile"] == "toys" - assert data["results"][0]["enriched_taxonomy_attributes"] == [ - {"name": "Product Type", "value": {"en": ["doll set"]}}, - {"name": "Age Group", "value": {"en": ["kids"]}}, - ] + assert response.status_code == 404 def test_indexer_documents_contract(indexer_client: TestClient): @@ -614,17 +467,6 @@ def test_indexer_build_docs_from_db_validation_max_spu_ids(indexer_client: TestC assert response.status_code == 400 -def test_indexer_enrich_content_validation_max_items(indexer_client: TestClient): - response = indexer_client.post( - "/indexer/enrich-content", - json={ - "tenant_id": "162", - "items": [{"spu_id": str(i), "title": "x"} for i in range(51)], - }, - ) - assert response.status_code == 400 - - def test_indexer_documents_validation_max_spu_ids(indexer_client: TestClient): """POST /indexer/documents: 400 when spu_ids > 100.""" response = indexer_client.post( diff --git a/tests/test_llm_enrichment_batch_fill.py b/tests/test_llm_enrichment_batch_fill.py deleted file mode 100644 index 5d882c0..0000000 --- a/tests/test_llm_enrichment_batch_fill.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from typing import Any, Dict, List - -import pandas as pd - -from indexer.document_transformer import SPUDocumentTransformer - - -def test_fill_llm_attributes_batch_uses_product_enrich_helper(monkeypatch): - seen_calls: List[Dict[str, Any]] = [] - - def _fake_build_index_content_fields(items, tenant_id=None, category_taxonomy_profile=None): - seen_calls.append( - { - "n": len(items), - "tenant_id": tenant_id, - "category_taxonomy_profile": category_taxonomy_profile, - } - ) - return [ - { - "id": item["id"], - "qanchors": { - "zh": [f"zh-anchor-{item['id']}"], - "en": [f"en-anchor-{item['id']}"], - }, - "enriched_tags": {"zh": ["t1", "t2"], "en": ["t1", "t2"]}, - "enriched_attributes": [ - {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}}, - ], - "enriched_taxonomy_attributes": [ - {"name": "Product Type", "value": {"zh": ["连衣裙"], "en": ["dress"]}}, - ], - } - for item in items - ] - - import indexer.document_transformer as doc_tr - - monkeypatch.setattr(doc_tr, "build_index_content_fields", _fake_build_index_content_fields) - - transformer = SPUDocumentTransformer( - category_id_to_name={}, - searchable_option_dimensions=[], - tenant_config={"index_languages": ["zh", "en"], "primary_language": "zh"}, - translator=None, - encoder=None, - enable_title_embedding=False, - image_encoder=None, - enable_image_embedding=False, - ) - - docs: List[Dict[str, Any]] = [] - rows: List[pd.Series] = [] - for i in range(45): - docs.append({"tenant_id": "162", "spu_id": str(i)}) - rows.append(pd.Series({"id": i, "title": f"title-{i}"})) - - transformer.fill_llm_attributes_batch(docs, rows) - - assert seen_calls == [{"n": 45, "tenant_id": "162", "category_taxonomy_profile": "apparel"}] - - assert docs[0]["qanchors"]["zh"] == ["zh-anchor-0"] - assert docs[0]["qanchors"]["en"] == ["en-anchor-0"] - assert docs[0]["enriched_tags"]["zh"] == ["t1", "t2"] - assert docs[0]["enriched_tags"]["en"] == ["t1", "t2"] - assert {"name": "tags", "value": {"zh": ["t1"], "en": ["t1"]}} in docs[0]["enriched_attributes"] - assert { - "name": "Product Type", - "value": {"zh": ["连衣裙"], "en": ["dress"]}, - } in docs[0]["enriched_taxonomy_attributes"] diff --git a/tests/test_process_products_batching.py b/tests/test_process_products_batching.py deleted file mode 100644 index e72a0e0..0000000 --- a/tests/test_process_products_batching.py +++ /dev/null @@ -1,104 +0,0 @@ -from __future__ import annotations - -from typing import Any, Dict, List - -import indexer.product_enrich as process_products - - -def _mk_products(n: int) -> List[Dict[str, str]]: - return [{"id": str(i), "title": f"title-{i}"} for i in range(n)] - - -def test_analyze_products_caps_batch_size_to_20(monkeypatch): - monkeypatch.setattr(process_products, "API_KEY", "fake-key") - seen_batch_sizes: List[int] = [] - - def _fake_process_batch( - batch_data: List[Dict[str, str]], - batch_num: int, - target_lang: str = "zh", - analysis_kind: str = "content", - category_taxonomy_profile=None, - ): - assert analysis_kind == "content" - assert category_taxonomy_profile is None - seen_batch_sizes.append(len(batch_data)) - return [ - { - "id": item["id"], - "lang": target_lang, - "title_input": item["title"], - "title": "", - "category_path": "", - "tags": "", - "target_audience": "", - "usage_scene": "", - "season": "", - "key_attributes": "", - "material": "", - "features": "", - "anchor_text": "", - } - for item in batch_data - ] - - monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) - monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None) - - out = process_products.analyze_products( - products=_mk_products(45), - target_lang="zh", - batch_size=200, - tenant_id="162", - ) - - assert len(out) == 45 - # 并发执行时 batch 调用顺序可能变化,因此校验“批大小集合”而不是严格顺序 - assert sorted(seen_batch_sizes) == [5, 20, 20] - - -def test_analyze_products_uses_min_batch_size_1(monkeypatch): - monkeypatch.setattr(process_products, "API_KEY", "fake-key") - seen_batch_sizes: List[int] = [] - - def _fake_process_batch( - batch_data: List[Dict[str, str]], - batch_num: int, - target_lang: str = "zh", - analysis_kind: str = "content", - category_taxonomy_profile=None, - ): - assert analysis_kind == "content" - assert category_taxonomy_profile is None - seen_batch_sizes.append(len(batch_data)) - return [ - { - "id": item["id"], - "lang": target_lang, - "title_input": item["title"], - "title": "", - "category_path": "", - "tags": "", - "target_audience": "", - "usage_scene": "", - "season": "", - "key_attributes": "", - "material": "", - "features": "", - "anchor_text": "", - } - for item in batch_data - ] - - monkeypatch.setattr(process_products, "process_batch", _fake_process_batch) - monkeypatch.setattr(process_products, "_set_cached_analysis_result", lambda *args, **kwargs: None) - - out = process_products.analyze_products( - products=_mk_products(3), - target_lang="zh", - batch_size=0, - tenant_id="162", - ) - - assert len(out) == 3 - assert seen_batch_sizes == [1, 1, 1] diff --git a/tests/test_product_enrich_partial_mode.py b/tests/test_product_enrich_partial_mode.py deleted file mode 100644 index 156145b..0000000 --- a/tests/test_product_enrich_partial_mode.py +++ /dev/null @@ -1,736 +0,0 @@ -from __future__ import annotations - -import importlib.util -import io -import json -import logging -import sys -import types -from pathlib import Path -from unittest import mock - - -def _load_product_enrich_module(): - if "dotenv" not in sys.modules: - fake_dotenv = types.ModuleType("dotenv") - fake_dotenv.load_dotenv = lambda *args, **kwargs: None - sys.modules["dotenv"] = fake_dotenv - - if "redis" not in sys.modules: - fake_redis = types.ModuleType("redis") - - class _FakeRedisClient: - def __init__(self, *args, **kwargs): - pass - - def ping(self): - return True - - fake_redis.Redis = _FakeRedisClient - sys.modules["redis"] = fake_redis - - repo_root = Path(__file__).resolve().parents[1] - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - - module_path = repo_root / "indexer" / "product_enrich.py" - spec = importlib.util.spec_from_file_location("product_enrich_under_test", module_path) - module = importlib.util.module_from_spec(spec) - assert spec and spec.loader - spec.loader.exec_module(module) - return module - - -product_enrich = _load_product_enrich_module() - - -def _attach_stream(logger_obj: logging.Logger): - stream = io.StringIO() - handler = logging.StreamHandler(stream) - handler.setFormatter(logging.Formatter("%(message)s")) - logger_obj.addHandler(handler) - return stream, handler - - -def test_create_prompt_splits_shared_context_and_localized_tail(): - products = [ - {"id": "1", "title": "dress"}, - {"id": "2", "title": "linen shirt"}, - ] - - shared_zh, user_zh, prefix_zh = product_enrich.create_prompt(products, target_lang="zh") - shared_en, user_en, prefix_en = product_enrich.create_prompt(products, target_lang="en") - - assert shared_zh == shared_en - assert "Analyze each input product text" in shared_zh - assert "1. dress" in shared_zh - assert "2. linen shirt" in shared_zh - assert "Product list" not in user_zh - assert "Product list" not in user_en - assert "specified language" in user_zh - assert "Language: Chinese" in user_zh - assert "Language: English" in user_en - assert prefix_zh.startswith("| 序号 | 商品标题 | 品类路径 |") - assert prefix_en.startswith("| No. | Product title | Category path |") - - -def test_create_prompt_supports_taxonomy_analysis_kind(): - products = [{"id": "1", "title": "linen dress"}] - - shared_zh, user_zh, prefix_zh = product_enrich.create_prompt( - products, - target_lang="zh", - analysis_kind="taxonomy", - ) - shared_fr, user_fr, prefix_fr = product_enrich.create_prompt( - products, - target_lang="fr", - analysis_kind="taxonomy", - ) - - assert "apparel attribute taxonomy" in shared_zh - assert "1. linen dress" in shared_zh - assert "Language: Chinese" in user_zh - assert "Language: French" in user_fr - assert prefix_zh.startswith("| 序号 | 品类 | 目标性别 |") - assert prefix_fr.startswith("| No. | Product Type | Target Gender |") - - -def test_call_llm_logs_shared_context_once_and_verbose_contains_full_requests(): - payloads = [] - response_bodies = [ - { - "choices": [ - { - "message": { - "content": ( - "| 1 | 连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | " - "通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | " - "修身显瘦 | 法式收腰连衣裙 |\n" - ) - } - } - ], - "usage": {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165}, - }, - { - "choices": [ - { - "message": { - "content": ( - "| 1 | Dress | Women>Dress | French,Waisted | Young women | " - "Commute,Date | Spring,Summer | Midi | Polyester | Breathable | " - "Slim fit | French waisted dress |\n" - ) - } - } - ], - "usage": {"prompt_tokens": 118, "completion_tokens": 43, "total_tokens": 161}, - }, - ] - - class _FakeResponse: - def __init__(self, body): - self.body = body - - def raise_for_status(self): - return None - - def json(self): - return self.body - - class _FakeSession: - trust_env = True - - def post(self, url, headers=None, json=None, timeout=None, proxies=None): - del url, headers, timeout, proxies - payloads.append(json) - return _FakeResponse(response_bodies[len(payloads) - 1]) - - def close(self): - return None - - product_enrich.reset_logged_shared_context_keys() - main_stream, main_handler = _attach_stream(product_enrich.logger) - verbose_stream, verbose_handler = _attach_stream(product_enrich.verbose_logger) - - try: - with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( - product_enrich.requests, - "Session", - lambda: _FakeSession(), - ): - zh_shared, zh_user, zh_prefix = product_enrich.create_prompt( - [{"id": "1", "title": "dress"}], - target_lang="zh", - ) - en_shared, en_user, en_prefix = product_enrich.create_prompt( - [{"id": "1", "title": "dress"}], - target_lang="en", - ) - - zh_markdown, zh_raw = product_enrich.call_llm( - zh_shared, - zh_user, - zh_prefix, - target_lang="zh", - ) - en_markdown, en_raw = product_enrich.call_llm( - en_shared, - en_user, - en_prefix, - target_lang="en", - ) - finally: - product_enrich.logger.removeHandler(main_handler) - product_enrich.verbose_logger.removeHandler(verbose_handler) - - assert zh_shared == en_shared - assert len(payloads) == 2 - assert len(payloads[0]["messages"]) == 3 - assert payloads[0]["messages"][1]["role"] == "user" - assert "1. dress" in payloads[0]["messages"][1]["content"] - assert "Language: Chinese" in payloads[0]["messages"][1]["content"] - assert "Language: English" in payloads[1]["messages"][1]["content"] - assert payloads[0]["messages"][-1]["partial"] is True - assert payloads[1]["messages"][-1]["partial"] is True - - main_log = main_stream.getvalue() - verbose_log = verbose_stream.getvalue() - - assert main_log.count("LLM Shared Context") == 1 - assert main_log.count("LLM Request Variant") == 2 - assert "Localized Requirement" in main_log - assert "Shared Context" in main_log - - assert verbose_log.count("LLM Request [model=") == 2 - assert verbose_log.count("LLM Response [model=") == 2 - assert '"partial": true' in verbose_log - assert "Combined User Prompt" in verbose_log - assert "French waisted dress" in verbose_log - assert "法式收腰连衣裙" in verbose_log - - assert zh_markdown.startswith(zh_prefix) - assert en_markdown.startswith(en_prefix) - assert json.loads(zh_raw)["usage"]["total_tokens"] == 165 - assert json.loads(en_raw)["usage"]["total_tokens"] == 161 - - -def test_process_batch_reads_result_and_validates_expected_fields(): - merged_markdown = """| 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 锚文本 | -|----|----|----|----|----|----|----|----|----|----|----| -| 1 | 法式连衣裙 | 女装>连衣裙 | 法式,收腰 | 年轻女性 | 通勤,约会 | 春季,夏季 | 中长款 | 聚酯纤维 | 透气 | 法式收腰连衣裙 | -""" - - with mock.patch.object( - product_enrich, - "call_llm", - return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})), - ): - results = product_enrich.process_batch( - [{"id": "sku-1", "title": "dress"}], - batch_num=1, - target_lang="zh", - ) - - assert len(results) == 1 - row = results[0] - assert row["id"] == "sku-1" - assert row["lang"] == "zh" - assert row["title_input"] == "dress" - assert row["title"] == "法式连衣裙" - assert row["category_path"] == "女装>连衣裙" - assert row["tags"] == "法式,收腰" - assert row["target_audience"] == "年轻女性" - assert row["usage_scene"] == "通勤,约会" - assert row["season"] == "春季,夏季" - assert row["key_attributes"] == "中长款" - assert row["material"] == "聚酯纤维" - assert row["features"] == "透气" - assert row["anchor_text"] == "法式收腰连衣裙" - - -def test_process_batch_reads_taxonomy_result_with_schema_specific_fields(): - merged_markdown = """| 序号 | 品类 | 目标性别 | 年龄段 | 适用季节 | 版型 | 廓形 | 领型 | 袖长类型 | 袖型 | 肩带设计 | 腰型 | 裤型 | 裙型 | 长度类型 | 闭合方式 | 设计细节 | 面料 | 成分 | 面料特性 | 服装特征 | 功能 | 主颜色 | 色系 | 印花 / 图案 | 适用场景 | 风格 | -|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----| -| 1 | 连衣裙 | 女 | 成人 | 春季,夏季 | 修身 | A字 | V领 | 无袖 | | 细肩带 | 高腰 | | A字裙 | 中长款 | 拉链 | 褶皱 | 梭织 | 聚酯纤维,氨纶 | 轻薄,透气 | 有内衬 | 易打理 | 酒红色 | 红色 | 纯色 | 约会,度假 | 浪漫 | -""" - - with mock.patch.object( - product_enrich, - "call_llm", - return_value=(merged_markdown, json.dumps({"choices": [{"message": {"content": "stub"}}]})), - ): - results = product_enrich.process_batch( - [{"id": "sku-1", "title": "dress"}], - batch_num=1, - target_lang="zh", - analysis_kind="taxonomy", - ) - - assert len(results) == 1 - row = results[0] - assert row["id"] == "sku-1" - assert row["lang"] == "zh" - assert row["title_input"] == "dress" - assert row["product_type"] == "连衣裙" - assert row["target_gender"] == "女" - assert row["age_group"] == "成人" - assert row["sleeve_length_type"] == "无袖" - assert row["material_composition"] == "聚酯纤维,氨纶" - assert row["occasion_end_use"] == "约会,度假" - assert row["style_aesthetic"] == "浪漫" - - -def test_analyze_products_uses_product_level_cache_across_batch_requests(): - cache_store = {} - process_calls = [] - - def _cache_key(product, target_lang): - return ( - target_lang, - product.get("title", ""), - product.get("brief", ""), - product.get("description", ""), - product.get("image_url", ""), - ) - - def fake_get_cached_analysis_result( - product, - target_lang, - analysis_kind="content", - category_taxonomy_profile=None, - ): - assert analysis_kind == "content" - assert category_taxonomy_profile is None - return cache_store.get(_cache_key(product, target_lang)) - - def fake_set_cached_analysis_result( - product, - target_lang, - result, - analysis_kind="content", - category_taxonomy_profile=None, - ): - assert analysis_kind == "content" - assert category_taxonomy_profile is None - cache_store[_cache_key(product, target_lang)] = result - - def fake_process_batch( - batch_data, - batch_num, - target_lang="zh", - analysis_kind="content", - category_taxonomy_profile=None, - ): - assert analysis_kind == "content" - assert category_taxonomy_profile is None - process_calls.append( - { - "batch_num": batch_num, - "target_lang": target_lang, - "titles": [item["title"] for item in batch_data], - } - ) - return [ - { - "id": item["id"], - "lang": target_lang, - "title_input": item["title"], - "title": f"normalized:{item['title']}", - "category_path": "cat", - "tags": "tags", - "target_audience": "audience", - "usage_scene": "scene", - "season": "season", - "key_attributes": "attrs", - "material": "material", - "features": "features", - "anchor_text": f"anchor:{item['title']}", - } - for item in batch_data - ] - - products = [ - {"id": "1", "title": "dress"}, - {"id": "2", "title": "shirt"}, - ] - - with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( - product_enrich, - "_get_cached_analysis_result", - side_effect=fake_get_cached_analysis_result, - ), mock.patch.object( - product_enrich, - "_set_cached_analysis_result", - side_effect=fake_set_cached_analysis_result, - ), mock.patch.object( - product_enrich, - "process_batch", - side_effect=fake_process_batch, - ): - first = product_enrich.analyze_products( - [products[0]], - target_lang="zh", - tenant_id="170", - ) - second = product_enrich.analyze_products( - products, - target_lang="zh", - tenant_id="999", - ) - third = product_enrich.analyze_products( - products, - target_lang="zh", - tenant_id="170", - ) - - assert [row["title_input"] for row in first] == ["dress"] - assert [row["title_input"] for row in second] == ["dress", "shirt"] - assert [row["title_input"] for row in third] == ["dress", "shirt"] - - assert process_calls == [ - {"batch_num": 1, "target_lang": "zh", "titles": ["dress"]}, - {"batch_num": 1, "target_lang": "zh", "titles": ["shirt"]}, - ] - assert second[0]["anchor_text"] == "anchor:dress" - assert second[1]["anchor_text"] == "anchor:shirt" - assert third[0]["anchor_text"] == "anchor:dress" - assert third[1]["anchor_text"] == "anchor:shirt" - - -def test_analyze_products_reuses_cached_content_with_current_product_identity(): - cached_result = { - "id": "1165", - "lang": "zh", - "title_input": "old-title", - "title": "法式连衣裙", - "category_path": "女装>连衣裙", - "enriched_tags": "法式,收腰", - "target_audience": "年轻女性", - "usage_scene": "通勤,约会", - "season": "春季,夏季", - "key_attributes": "中长款", - "material": "聚酯纤维", - "features": "透气", - "anchor_text": "法式收腰连衣裙", - } - products = [{"id": "69960", "title": "dress"}] - - with mock.patch.object(product_enrich, "API_KEY", "fake-key"), mock.patch.object( - product_enrich, - "_get_cached_analysis_result", - wraps=lambda product, target_lang, analysis_kind="content", category_taxonomy_profile=None: product_enrich._normalize_analysis_result( - cached_result, - product=product, - target_lang=target_lang, - schema=product_enrich._get_analysis_schema("content"), - ), - ), mock.patch.object( - product_enrich, - "process_batch", - side_effect=AssertionError("process_batch should not be called on cache hit"), - ): - result = product_enrich.analyze_products( - products, - target_lang="zh", - tenant_id="170", - ) - - assert result == [ - { - "id": "69960", - "lang": "zh", - "title_input": "dress", - "title": "法式连衣裙", - "category_path": "女装>连衣裙", - "tags": "法式,收腰", - "target_audience": "年轻女性", - "usage_scene": "通勤,约会", - "season": "春季,夏季", - "key_attributes": "中长款", - "material": "聚酯纤维", - "features": "透气", - "anchor_text": "法式收腰连衣裙", - } - ] - - -def test_build_index_content_fields_maps_internal_tags_to_enriched_tags_output(): - def fake_analyze_products( - products, - target_lang="zh", - batch_size=None, - tenant_id=None, - analysis_kind="content", - category_taxonomy_profile=None, - ): - if analysis_kind == "taxonomy": - assert category_taxonomy_profile == "apparel" - return [ - { - "id": products[0]["id"], - "lang": target_lang, - "title_input": products[0]["title"], - "product_type": f"{target_lang}-dress", - "target_gender": f"{target_lang}-women", - "age_group": "", - "season": f"{target_lang}-summer", - "fit": "", - "silhouette": "", - "neckline": "", - "sleeve_length_type": "", - "sleeve_style": "", - "strap_type": "", - "rise_waistline": "", - "leg_shape": "", - "skirt_shape": "", - "length_type": "", - "closure_type": "", - "design_details": "", - "fabric": "", - "material_composition": "", - "fabric_properties": "", - "clothing_features": "", - "functional_benefits": "", - "color": "", - "color_family": "", - "print_pattern": "", - "occasion_end_use": "", - "style_aesthetic": "", - } - ] - return [ - { - "id": products[0]["id"], - "lang": target_lang, - "title_input": products[0]["title"], - "title": products[0]["title"], - "category_path": "玩具>滑行玩具", - "tags": f"{target_lang}-tag1,{target_lang}-tag2", - "target_audience": f"{target_lang}-audience", - "usage_scene": "", - "season": "", - "key_attributes": "", - "material": "", - "features": "", - "anchor_text": f"{target_lang}-anchor", - } - ] - - with mock.patch.object( - product_enrich, - "analyze_products", - side_effect=fake_analyze_products, - ): - result = product_enrich.build_index_content_fields( - items=[{"spu_id": "69960", "title": "dress"}], - tenant_id="170", - ) - - assert result == [ - { - "id": "69960", - "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]}, - "enriched_tags": {"zh": ["zh-tag1", "zh-tag2"], "en": ["en-tag1", "en-tag2"]}, - "enriched_attributes": [ - { - "name": "enriched_tags", - "value": { - "zh": ["zh-tag1", "zh-tag2"], - "en": ["en-tag1", "en-tag2"], - }, - }, - {"name": "target_audience", "value": {"zh": ["zh-audience"], "en": ["en-audience"]}}, - ], - "enriched_taxonomy_attributes": [ - { - "name": "Product Type", - "value": {"zh": ["zh-dress"], "en": ["en-dress"]}, - }, - { - "name": "Target Gender", - "value": {"zh": ["zh-women"], "en": ["en-women"]}, - }, - { - "name": "Season", - "value": {"zh": ["zh-summer"], "en": ["en-summer"]}, - }, - ], - } - ] -def test_build_index_content_fields_non_apparel_taxonomy_returns_en_only(): - seen_calls = [] - - def fake_analyze_products( - products, - target_lang="zh", - batch_size=None, - tenant_id=None, - analysis_kind="content", - category_taxonomy_profile=None, - ): - seen_calls.append((analysis_kind, target_lang, category_taxonomy_profile, tuple(p["id"] for p in products))) - if analysis_kind == "taxonomy": - assert category_taxonomy_profile == "toys" - assert target_lang == "en" - return [ - { - "id": products[0]["id"], - "lang": "en", - "title_input": products[0]["title"], - "product_type": "doll set", - "age_group": "kids", - "character_theme": "", - "material": "", - "power_source": "", - "interactive_features": "", - "educational_play_value": "", - "piece_count_size": "", - "color": "", - "use_scenario": "", - } - ] - - return [ - { - "id": product["id"], - "lang": target_lang, - "title_input": product["title"], - "title": product["title"], - "category_path": "", - "tags": f"{target_lang}-tag", - "target_audience": "", - "usage_scene": "", - "season": "", - "key_attributes": "", - "material": "", - "features": "", - "anchor_text": f"{target_lang}-anchor", - } - for product in products - ] - - with mock.patch.object(product_enrich, "analyze_products", side_effect=fake_analyze_products): - result = product_enrich.build_index_content_fields( - items=[{"spu_id": "2", "title": "toy"}], - tenant_id="170", - category_taxonomy_profile="toys", - ) - - assert result == [ - { - "id": "2", - "qanchors": {"zh": ["zh-anchor"], "en": ["en-anchor"]}, - "enriched_tags": {"zh": ["zh-tag"], "en": ["en-tag"]}, - "enriched_attributes": [ - { - "name": "enriched_tags", - "value": { - "zh": ["zh-tag"], - "en": ["en-tag"], - }, - } - ], - "enriched_taxonomy_attributes": [ - {"name": "Product Type", "value": {"en": ["doll set"]}}, - {"name": "Age Group", "value": {"en": ["kids"]}}, - ], - } - ] - assert ("taxonomy", "zh", "toys", ("2",)) not in seen_calls - assert ("taxonomy", "en", "toys", ("2",)) in seen_calls - - -def test_anchor_cache_key_depends_on_product_input_not_identifiers(): - product_a = { - "id": "1", - "spu_id": "1001", - "title": "dress", - "brief": "soft cotton", - "description": "summer dress", - "image_url": "https://img/a.jpg", - } - product_b = { - "id": "2", - "spu_id": "9999", - "title": "dress", - "brief": "soft cotton", - "description": "summer dress", - "image_url": "https://img/a.jpg", - } - product_c = { - "id": "1", - "spu_id": "1001", - "title": "dress", - "brief": "soft cotton updated", - "description": "summer dress", - "image_url": "https://img/a.jpg", - } - - key_a = product_enrich._make_anchor_cache_key(product_a, "zh") - key_b = product_enrich._make_anchor_cache_key(product_b, "zh") - key_c = product_enrich._make_anchor_cache_key(product_c, "zh") - - assert key_a == key_b - assert key_a != key_c - - -def test_analysis_cache_key_isolated_by_analysis_kind(): - product = { - "id": "1", - "title": "dress", - "brief": "soft cotton", - "description": "summer dress", - } - - content_key = product_enrich._make_analysis_cache_key(product, "zh", "content") - taxonomy_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy") - - assert content_key != taxonomy_key - - -def test_analysis_cache_key_changes_when_prompt_contract_changes(): - product = { - "id": "1", - "title": "dress", - "brief": "soft cotton", - "description": "summer dress", - } - - original_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy") - - with mock.patch.object( - product_enrich, - "USER_INSTRUCTION_TEMPLATE", - "Please return JSON only. Language: {language}", - ): - changed_key = product_enrich._make_analysis_cache_key(product, "zh", "taxonomy") - - assert original_key != changed_key - - -def test_build_prompt_input_text_appends_brief_and_description_for_short_title(): - product = { - "title": "T恤", - "brief": "夏季透气纯棉短袖,舒适亲肤", - "description": "100%棉,圆领版型,适合日常通勤与休闲穿搭。", - } - - text = product_enrich._build_prompt_input_text(product) - - assert text.startswith("T恤") - assert "夏季透气纯棉短袖" in text - assert "100%棉" in text - - -def test_build_prompt_input_text_truncates_non_cjk_by_words(): - product = { - "title": "dress", - "brief": " ".join(f"brief{i}" for i in range(50)), - "description": " ".join(f"desc{i}" for i in range(50)), - } - - text = product_enrich._build_prompt_input_text(product) - - assert len(text.split()) <= product_enrich.PROMPT_INPUT_MAX_WORDS -- libgit2 0.21.2