diff --git a/config/config.yaml b/config/config.yaml index b2c1b0a..3014f1a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,3 +1,11 @@ +# Unified Configuration for Multi-Tenant Search Engine +# 统一配置文件,所有租户共用一套配置 +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 +# +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 + +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义) runtime: environment: prod index_namespace: '' @@ -13,6 +21,8 @@ runtime: translator_port: 6006 reranker_host: 0.0.0.0 reranker_port: 6007 + +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) infrastructure: elasticsearch: host: http://localhost:9200 @@ -39,16 +49,30 @@ infrastructure: secrets: dashscope_api_key: null deepl_auth_key: null + +# Elasticsearch Index es_index_name: search_products + +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) indexes: [] + +# Config assets assets: query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict + +# Product content understanding (LLM enrich-content) configuration product_enrich: max_workers: 40 + +# ES Index Settings (基础设置) es_settings: number_of_shards: 1 number_of_replicas: 0 refresh_interval: 30s + +# 字段权重配置(用于搜索时的字段boost) +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 field_boosts: title: 3.0 qanchors: 2.5 @@ -61,21 +85,39 @@ field_boosts: option1_values: 1.5 option2_values: 1.5 option3_values: 1.5 + +# Query Configuration(查询配置) query_config: + # 支持的语言 supported_languages: - zh - en default_language: en + + # 功能开关(翻译开关由tenant_config控制) enable_text_embedding: true enable_query_rewrite: true - zh_to_en_model: nllb-200-distilled-600m - en_to_zh_model: nllb-200-distilled-600m + + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en" + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh" 
default_translation_model: nllb-200-distilled-600m + # zh_to_en_model: deepl + # en_to_zh_model: deepl + # default_translation_model: deepl + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) zh_to_en_model__source_not_in_index: nllb-200-distilled-600m en_to_zh_model__source_not_in_index: nllb-200-distilled-600m default_translation_model__source_not_in_index: nllb-200-distilled-600m - translation_embedding_wait_budget_ms_source_in_index: 200 - translation_embedding_wait_budget_ms_source_not_in_index: 300 + # zh_to_en_model__source_not_in_index: deepl + # en_to_zh_model__source_not_in_index: deepl + # default_translation_model__source_not_in_index: deepl + + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80 + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 style_intent: enabled: true selected_sku_boost: 1.2 @@ -102,6 +144,10 @@ query_config: product_title_exclusion: enabled: true dictionary_path: config/dictionaries/product_title_exclusion.tsv + + # 动态多语言检索字段配置 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 
形式; + # shared_fields 为无语言后缀字段。 search_fields: multilingual_fields: - title @@ -111,11 +157,14 @@ query_config: - brief - description - vendor + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values shared_fields: null core_multilingual_fields: - title - qanchors - category_name_text + + # 统一文本召回策略(主查询 + 翻译查询) text_query_strategy: base_minimum_should_match: 60% translation_minimum_should_match: 60% @@ -130,8 +179,14 @@ query_config: title: 5.0 qanchors: 4.0 phrase_match_boost: 3.0 + + # Embedding字段名称 text_embedding_field: title_embedding image_embedding_field: image_embedding.vector + + # 返回字段配置(_source includes) + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致 source_fields: - spu_id - handle @@ -163,18 +218,26 @@ query_config: - option3_values - specifications - skus + + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) knn_text_boost: 4 knn_image_boost: 4 + + # knn_text_num_candidates = knn_text_k * 3.5 (160 -> 560) knn_text_k: 160 knn_text_num_candidates: 560 knn_text_k_long: 400 knn_text_num_candidates_long: 1200 knn_image_k: 400 knn_image_num_candidates: 1200 + +# Function Score配置(ES层打分规则) function_score: score_mode: sum boost_mode: multiply functions: [] + +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型) coarse_rank: enabled: true input_window: 700 @@ -182,12 +245,16 @@ coarse_rank: fusion: text_bias: 0.1 text_exponent: 0.35 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) + # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣 text_translation_weight: 1.0 knn_text_weight: 1.0 knn_image_weight: 1.0 knn_tie_breaker: 0.1 knn_bias: 0.6 knn_exponent: 0.0 + +# 精排配置(轻量 reranker) fine_rank: enabled: false input_window: 160 @@ -196,6 +263,8 @@ fine_rank: rerank_query_template: '{query}' rerank_doc_template: '{title}' service_profile: fine + +# 重排配置(provider/URL 在 services.rerank) rerank: enabled: true rerank_window: 160 @@ -205,6 +274,11 @@ rerank: rerank_query_template: '{query}'
rerank_doc_template: '{title}' service_profile: default + + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) + # 其中 knn_score 先做一层 dis_max: + # max(knn_text_weight * text_knn, knn_image_weight * image_knn) + # + knn_tie_breaker * 另一侧较弱信号 fusion: rerank_bias: 1.0e-05 rerank_exponent: 1.15 @@ -212,22 +286,29 @@ rerank: fine_exponent: 1.0 text_bias: 0.1 text_exponent: 0.25 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) text_translation_weight: 0.8 knn_text_weight: 1.0 knn_image_weight: 1.0 knn_tie_breaker: 0.1 knn_bias: 0.6 knn_exponent: 0.0 + +# 可扩展服务/provider 注册表(单一配置源) services: translation: service_url: http://127.0.0.1:6006 + # default_model: nllb-200-distilled-600m default_model: nllb-200-distilled-600m default_scene: general timeout_sec: 10.0 cache: ttl_seconds: 62208000 sliding_expiration: true + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups). enable_model_quality_tier_cache: true + # Higher tier = better quality. Multiple models may share one tier (同级). + # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers). 
model_quality_tiers: deepl: 30 qwen-mt: 30 @@ -321,12 +402,13 @@ services: num_beams: 1 use_cache: true embedding: - provider: http + provider: http # http providers: http: text_base_url: http://127.0.0.1:6005 image_base_url: http://127.0.0.1:6008 - backend: tei + # 服务内文本后端(embedding 进程启动时读取) + backend: tei # tei | local_st backends: tei: base_url: http://127.0.0.1:8080 @@ -337,7 +419,10 @@ services: device: cuda batch_size: 32 normalize_embeddings: true - image_backend: clip_as_service + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name) + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中 + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。 + image_backend: clip_as_service # clip_as_service | local_cnclip image_backends: clip_as_service: server: grpc://127.0.0.1:51000 @@ -364,6 +449,7 @@ services: max_docs: 1000 normalize: true default_instance: default + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。 instances: default: host: 0.0.0.0 @@ -405,11 +491,29 @@ services: enforce_eager: false infer_batch_size: 100 sort_by_doc_length: true - instruction_format: standard + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) + instruction_format: standard # compact standard + # instruction: "Given a query, score the product for relevance" + # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 + # instruction: "rank products by given query, category match first" + # instruction: "Rank products by query relevance, prioritizing category match" + # instruction: "Rank products by query relevance, prioritizing category and style match" + # instruction: "Rank by query relevance, prioritize category & style" + # instruction: "Relevance ranking: category & style match first" + # instruction: "Score product relevance by query with category & style match prioritized" + # instruction: "Rank products by query with 
category & style match prioritized" + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query" instruction: rank products by given query + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。 qwen3_vllm_score: model_name: Qwen/Qwen3-Reranker-0.6B + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false use_original_qwen3_hf_overrides: true + # vllm_runner: "auto" + # vllm_convert: "auto" + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 + # hf_overrides: {} engine: vllm max_model_len: 172 tensor_parallel_size: 1 @@ -419,15 +523,23 @@ services: enforce_eager: false infer_batch_size: 80 sort_by_doc_length: true - instruction_format: standard + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 + instruction_format: standard # compact standard + # instruction: "Rank products by query with category & style match prioritized" + # instruction: "Given a shopping query, rank products by relevance" instruction: Rank products by query with category & style match prioritized qwen3_transformers: model_name: Qwen/Qwen3-Reranker-0.6B instruction: rank products by given query + # instruction: "Score the product’s relevance to the given query" max_length: 8192 batch_size: 64 use_fp16: true + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2 attn_implementation: sdpa + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. + # For 1 query + many short docs (for example 400 product titles), this usually reduces + # repeated prefix work and padding waste compared with pairwise batching. 
qwen3_transformers_packed: model_name: Qwen/Qwen3-Reranker-0.6B instruction: Rank products by query with category & style match prioritized @@ -436,6 +548,8 @@ services: max_docs_per_pack: 0 use_fp16: true sort_by_doc_length: true + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. + # If your torch/transformers stack validates it, you can benchmark "sdpa". attn_implementation: eager qwen3_gguf: repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF @@ -443,6 +557,7 @@ services: cache_dir: ./model_cache local_dir: ./models/reranker/qwen3-reranker-4b-gguf instruction: Rank products by query with category & style match prioritized + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快 n_ctx: 512 n_batch: 512 n_ubatch: 512 @@ -465,6 +580,8 @@ services: cache_dir: ./model_cache local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf instruction: Rank products by query with category & style match prioritized + # 0.6B GGUF / online rerank baseline: + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。 n_ctx: 256 n_batch: 256 n_ubatch: 256 @@ -484,22 +601,34 @@ services: verbose: false dashscope_rerank: model_name: qwen3-rerank + # 按地域选择 endpoint: + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks api_key_env: RERANK_DASHSCOPE_API_KEY_CN timeout_sec: 10.0 - top_n_cap: 0 - batchsize: 64 + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限 + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断) instruct: Given a shopping query, rank product titles by relevance max_retries: 2 retry_backoff_sec: 0.2 + +# SPU配置(已启用,使用嵌套skus) spu_config: enabled: true spu_field: spu_id inner_hits_size: 10 + # 配置哪些option维度参与检索(进索引、以及在线搜索) + # 格式为list,选择option1/option2/option3中的一个或多个 searchable_option_dimensions: - option1 - option2 - option3 + +# 
租户配置(Tenant Configuration) +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集 tenant_config: default: primary_language: en diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md new file mode 100644 index 0000000..83adc0d --- /dev/null +++ b/scripts/evaluation/README.md @@ -0,0 +1,330 @@ +# Search Evaluation Framework + +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation. + +It is designed around one core rule: + +- Annotation should be built offline first. +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set. +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete. + +## Goals + +The framework supports four related tasks: + +1. Build an annotation set for a fixed query set. +2. Evaluate a live search result list against that annotation set. +3. Run batch evaluation and store historical reports with config snapshots. +4. Tune fusion parameters reproducibly. + +## Files + +- `eval_framework.py` + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation. +- `build_annotation_set.py` + Thin CLI entrypoint into `eval_framework.py`. +- `serve_eval_web.py` + Thin web entrypoint into `eval_framework.py`. +- `tune_fusion.py` + Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports. +- `fusion_experiments_shortlist.json` + A compact experiment set for practical tuning. +- `fusion_experiments_round1.json` + A broader first-round experiment set. +- `queries/queries.txt` + The canonical evaluation query set. +- `README_Requirement.md` + Requirement reference document. 
+- `quick_start_eval.sh` + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`). + +## Quick start (from repo root) + +Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key for LLM when labeling, and for batch refresh a working backend. + +```bash +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/ +./scripts/evaluation/quick_start_eval.sh batch + +# 2) Evaluation UI on http://127.0.0.1:6010/ +./scripts/evaluation/quick_start_eval.sh serve +``` + +Equivalent explicit commands: + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ + --tenant-id "${TENANT_ID:-163}" \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --labeler-mode simple \ + --force-refresh-labels + +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ + --tenant-id "${TENANT_ID:-163}" \ + --queries-file scripts/evaluation/queries/queries.txt \ + --host 127.0.0.1 \ + --port 6010 +``` + +**Batch behavior:** There is no “skip queries already processed”. Each run walks the full queries file. With `--force-refresh-labels`, for **every** query the runner issues a live search and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window. + +## Storage Layout + +All generated artifacts are under: + +- `/data/saas-search/artifacts/search_evaluation` + +Important subpaths: + +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3` + Main cache and annotation store. +- `/data/saas-search/artifacts/search_evaluation/query_builds` + Per-query pooled annotation-set build artifacts. +- `/data/saas-search/artifacts/search_evaluation/batch_reports` + Batch evaluation JSON, Markdown reports, and config snapshots. 
+- `/data/saas-search/artifacts/search_evaluation/audits` + Audit summaries for label quality checks. +- `/data/saas-search/artifacts/search_evaluation/tuning_runs` + Fusion experiment summaries and per-experiment config snapshots. + +## SQLite Schema Summary + +The main tables in `search_eval.sqlite3` are: + +- `corpus_docs` + Cached product corpus for the tenant. +- `rerank_scores` + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`. +- `relevance_labels` + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`. +- `query_profiles` + Structured query-intent profiles extracted before labeling. +- `build_runs` + Per-query pooled-build records. +- `batch_runs` + Batch evaluation history. + +## Label Semantics + +Three labels are used throughout: + +- `Exact` + Fully matches the intended product type and all explicit required attributes. +- `Partial` + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested. +- `Irrelevant` + Product type mismatches, or explicit required attributes conflict. + +The framework always uses: + +- LLM-based batched relevance classification +- caching and retry logic for robust offline labeling + +There are now two labeler modes: + +- `simple` + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt. +- `complex` + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default. + +## Offline-First Workflow + +### 1. Refresh labels for the evaluation query set + +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. 
+ +Example: + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --labeler-mode simple \ + --force-refresh-labels +``` + +This command does two things: + +- runs **every** query in the file against the live backend (no skip list) +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM + +After this step, single-query evaluation can run in cached mode without calling the LLM again. + +### 2. Optional pooled build + +The framework also supports a heavier pooled build that combines: + +- top search results +- top full-corpus reranker results + +Example: + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --search-depth 1000 \ + --rerank-depth 10000 \ + --annotate-search-top-k 100 \ + --annotate-rerank-top-k 120 \ + --language en +``` + +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window. + +## Why Single-Query Evaluation Was Slow + +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means: + +- perform live search +- detect recalled but unlabeled products +- call the LLM to label them + +That is not the intended steady-state evaluation path. + +The UI/API is now configured to prefer cached evaluation: + +- default single-query evaluation uses `auto_annotate=false` +- unlabeled recalled results are treated as `Irrelevant` +- the response includes tips explaining that coverage gap + +If you want stable, fast evaluation: + +1. prebuild labels offline +2. 
use cached single-query evaluation + +## Web UI + +Start the evaluation UI: + +```bash +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --host 127.0.0.1 \ + --port 6010 +``` + +The UI provides: + +- query list loaded from `queries.txt` +- single-query evaluation +- batch evaluation +- history of batch reports +- top recalled results +- missed `Exact` and `Partial` products that were not recalled +- tips about unlabeled hits treated as `Irrelevant` + +### Single-query response behavior + +For a single query: + +1. live search returns recalled `spu_id` values +2. the framework looks up cached labels by `(tenant_id, query, spu_id)` +3. unlabeled recalled items are counted as `Irrelevant` +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial` + +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer. + +## CLI Commands + +### Build pooled annotation artifacts + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ... +``` + +### Run batch evaluation + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --labeler-mode simple +``` + +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first. + +### Audit annotation quality + +```bash +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --labeler-mode simple +``` + +This checks cached labels against current guardrails and reports suspicious cases. Note that the guardrail checks only run in `complex` labeler mode: with `--labeler-mode simple` the audit still reports metrics and the label distribution, but the `suspicious` list is always empty. Use `--labeler-mode complex` when you want guardrail-based findings.
+ +## Batch Reports + +Each batch run stores: + +- aggregate metrics +- per-query metrics +- label distribution +- timestamp +- config snapshot from `/admin/config` + +Reports are written as: + +- Markdown for easy reading +- JSON for downstream processing + +## Fusion Tuning + +The tuning runner applies experiment configs sequentially and records the outcome. + +Example: + +```bash +./.venv/bin/python scripts/evaluation/tune_fusion.py \ + --tenant-id 163 \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \ + --score-metric MAP_3 \ + --apply-best +``` + +What it does: + +1. writes an experiment config into `config/config.yaml` +2. restarts backend +3. runs batch evaluation +4. stores the per-experiment result +5. optionally applies the best experiment at the end + +## Current Practical Recommendation + +For day-to-day evaluation: + +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels` +2. run the web UI or normal batch evaluation in cached mode +3. only force-refresh labels again when: + - the query set changes + - the product corpus changes materially + - the labeling logic changes + +## Caveats + +- The current label cache is query-specific, not a full all-products all-queries matrix. +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached. +- The backend restart path in this environment can be briefly unstable immediately after startup; a short wait after restart is sometimes necessary for scripting. +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters. + +## Related Requirement Docs + +- `README_Requirement.md` +- `README_Requirement_zh.md` + +These documents describe the original problem statement. 
This `README.md` describes the implemented framework and the current recommended workflow. diff --git a/scripts/evaluation/eval_framework.py b/scripts/evaluation/eval_framework.py index 5b52530..af0be96 100644 --- a/scripts/evaluation/eval_framework.py +++ b/scripts/evaluation/eval_framework.py @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant" VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" -JUDGE_PROMPT_VERSION = "v2_structured_20260331" +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" +DEFAULT_LABELER_MODE = "simple" def utc_now_iso() -> str: @@ -625,6 +627,57 @@ class DashScopeLabelClient: content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() return content, safe_json_dumps(data) + def classify_batch_simple( + self, + query: str, + docs: Sequence[Dict[str, Any]], + ) -> Tuple[List[str], str]: + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] + prompt = ( + "You are an e-commerce search result relevance evaluation assistant. " + "Based on the user query and each product's information, output the relevance level for each product.\n\n" + "## Relevance Level Criteria\n" + "Exact — Fully matches the user's search intent.\n" + "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" + "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" + "Additional judging guidance:\n" + "- If the query clearly names a product type, product type matching has the highest priority. 
" + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " + "bra vs top, backpack vs bag are not interchangeable.\n" + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" + "- Do not guess missing attributes.\n" + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" + "- Be conservative with Exact.\n\n" + f"Query: {query}\n\n" + "Products:\n" + + "\n".join(numbered_docs) + + "\n\n## Output Format\n" + f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. " + "They must correspond sequentially to the products above. Do not output any other information.\n" + ) + content, raw_response = self._chat(prompt) + labels = [] + for line in str(content or "").splitlines(): + label = line.strip() + if label in VALID_LABELS: + labels.append(label) + if len(labels) != len(docs): + payload = _extract_json_blob(content) + if isinstance(payload, dict) and isinstance(payload.get("labels"), list): + labels = [] + for item in payload["labels"][: len(docs)]: + if isinstance(item, dict): + label = str(item.get("label") or "").strip() + else: + label = str(item).strip() + if label in VALID_LABELS: + labels.append(label) + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): + raise ValueError(f"unexpected simple label output: {content!r}") + return labels, raw_response + def extract_query_profile( self, query: str, @@ -665,7 +718,7 @@ class DashScopeLabelClient: payload.setdefault("notes", []) return payload, raw_response - def classify_batch( + def classify_batch_complex( self, query: str, query_profile: Dict[str, Any], @@ -763,10 +816,12 @@ class SearchEvaluationFramework: tenant_id: str, artifact_root: Path = 
DEFAULT_ARTIFACT_ROOT, search_base_url: str = "http://localhost:6002", + labeler_mode: str = DEFAULT_LABELER_MODE, ): init_service(get_app_config().infrastructure.elasticsearch.host) self.tenant_id = str(tenant_id) self.artifact_root = ensure_dir(artifact_root) + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") self.search_client = SearchServiceClient(search_base_url, self.tenant_id) app_cfg = get_app_config() @@ -783,17 +838,24 @@ class SearchEvaluationFramework: base_url=str(llm_cfg["base_url"]), api_key=str(api_key), ) - self.query_parser = get_query_parser() + self.query_parser = None + + def _get_query_parser(self): + if self.query_parser is None: + self.query_parser = get_query_parser() + return self.query_parser def build_query_parser_hints(self, query: str) -> Dict[str, Any]: - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"]) + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) payload = parsed.to_dict() payload["text_for_rerank"] = parsed.text_for_rerank() return payload def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: + if self.labeler_mode != "complex": + raise RuntimeError("query profiles are only used in complex labeler mode") if not force_refresh: - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION) + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) if cached is not None: return cached parser_hints = self.build_query_parser_hints(query) @@ -802,7 +864,7 @@ class SearchEvaluationFramework: self.store.upsert_query_profile( self.tenant_id, query, - JUDGE_PROMPT_VERSION, + JUDGE_PROMPT_VERSION_COMPLEX, self.label_client.model, profile, raw_response, @@ -955,9 +1017,24 @@ class SearchEvaluationFramework: *, top_k: int = 100, language: str = "en", - 
auto_annotate: bool = True, + auto_annotate: bool = False, ) -> Dict[str, Any]: live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) + if self.labeler_mode != "complex": + labels = [ + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + for item in live["results"] + ] + return { + "query": query, + "tenant_id": self.tenant_id, + "top_k": top_k, + "metrics": live["metrics"], + "distribution": label_distribution(labels), + "query_profile": None, + "suspicious": [], + "results": live["results"], + } query_profile = self.get_query_profile(query, force_refresh=False) suspicious: List[Dict[str, Any]] = [] @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework: docs: Sequence[Dict[str, Any]], force_refresh: bool = False, ) -> Dict[str, str]: - query_profile = self.get_query_profile(query, force_refresh=force_refresh) labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query) missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels] if not missing_docs: @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework: for start in range(0, len(missing_docs), self.label_client.batch_size): batch = missing_docs[start : start + self.label_client.batch_size] - batch_pairs = self._classify_with_retry(query, query_profile, batch) + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh) for sub_labels, raw_response, sub_batch in batch_pairs: - to_store = { - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc) - for doc, label in zip(sub_batch, sub_labels) - } + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} self.store.upsert_labels( self.tenant_id, query, @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework: def _classify_with_retry( self, query: str, - query_profile: Dict[str, Any], docs: Sequence[Dict[str, Any]], + *, + force_refresh: bool = False, ) -> 
List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: if not docs: return [] try: - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs) + if self.labeler_mode == "complex": + query_profile = self.get_query_profile(query, force_refresh=force_refresh) + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) + labels = [ + self._apply_rule_based_label_guardrails(label, query_profile, doc) + for doc, label in zip(docs, labels) + ] + else: + labels, raw_response = self.label_client.classify_batch_simple(query, docs) return [(labels, raw_response, docs)] except Exception: if len(docs) == 1: raise mid = len(docs) // 2 - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:]) + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) def build_query_annotation_set( self, @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework: for item in full_rerank[:annotate_rerank_top_k]: pool_docs[str(item["spu_id"])] = item["doc"] - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels) labels = self.annotate_missing_labels( query=query, docs=list(pool_docs.values()), @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework: "annotate_rerank_top_k": annotate_rerank_top_k, "pool_size": len(pool_docs), }, - "query_profile": query_profile, + "labeler_mode": self.labeler_mode, + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, "metrics_top100": metrics, "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework: self, query: str, top_k: int = 100, - auto_annotate: bool = True, + auto_annotate: bool = False, language: str = "en", force_refresh_labels: bool = False, ) -> 
Dict[str, Any]: @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework: if auto_annotate: self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) labels = self.store.get_labels(self.tenant_id, query) + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]} labeled = [] + unlabeled_hits = 0 for rank, doc in enumerate(results[:top_k], start=1): spu_id = str(doc.get("spu_id")) + label = labels.get(spu_id) + if label not in VALID_LABELS: + unlabeled_hits += 1 labeled.append( { "rank": rank, "spu_id": spu_id, "title": build_display_title(doc), "image_url": doc.get("image_url"), - "label": labels.get(spu_id), + "label": label, "option_values": list(compact_option_values(doc.get("skus") or [])), "product": compact_product_payload(doc), } @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in labeled ] + label_stats = self.store.get_query_label_stats(self.tenant_id, query) + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query) + relevant_missing_ids = [ + spu_id + for spu_id, label in labels.items() + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids + ] + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) + missing_relevant = [] + for spu_id in relevant_missing_ids: + doc = missing_docs_map.get(spu_id) + if not doc: + continue + missing_relevant.append( + { + "spu_id": spu_id, + "label": labels[spu_id], + "rerank_score": rerank_scores.get(spu_id), + "title": build_display_title(doc), + "image_url": doc.get("image_url"), + "option_values": list(compact_option_values(doc.get("skus") or [])), + "product": compact_product_payload(doc), + } + ) + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} + missing_relevant.sort( + key=lambda item: ( + label_order.get(str(item.get("label")), 9), + 
-(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")), + str(item.get("title") or ""), + ) + ) + tips: List[str] = [] + if auto_annotate: + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.") + else: + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.") + if label_stats["total"] == 0: + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.") + if unlabeled_hits: + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") + if not missing_relevant: + tips.append("No cached Exact/Partial products were missed by this recall set.") return { "query": query, "tenant_id": self.tenant_id, "top_k": top_k, "metrics": compute_query_metrics(metric_labels), "results": labeled, + "missing_relevant": missing_relevant, + "label_stats": { + **label_stats, + "unlabeled_hits_treated_irrelevant": unlabeled_hits, + "recalled_hits": len(labeled), + "missing_relevant_count": len(missing_relevant), + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), + }, + "tips": tips, "total": int(search_payload.get("total") or 0), } @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: class SearchEvalRequest(BaseModel): query: str top_k: int = Field(default=100, ge=1, le=500) - auto_annotate: bool = True + auto_annotate: bool = False language: str = "en" class BatchEvalRequest(BaseModel): queries: Optional[List[str]] = None top_k: int = Field(default=100, ge=1, le=500) - auto_annotate: bool = True + auto_annotate: bool = False language: str = "en" force_refresh_labels: bool = False @@ -1494,6 +1634,8 @@ WEB_APP_HTML = 
""" .options { color: var(--muted); line-height: 1.5; font-size: 14px; } .section { margin-bottom: 28px; } .history { font-size: 13px; line-height: 1.5; } + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } + .tip { margin-bottom: 6px; color: var(--muted); }
@@ -1524,6 +1666,14 @@ WEB_APP_HTML = """