4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
1
|
# Unified Configuration for Multi-Tenant Search Engine
|
33839b37
tangwang
属性值参与搜索:
|
2
3
|
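# Unified configuration file: all tenants share this single configuration.
# Note: the index structure (mappings) is defined in mappings/search_products.json, not here.
# This file holds the index name and settings, search/query behavior, the
# service/provider registry, and per-tenant language settings.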
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
4
5
6
7
|
# Elasticsearch Index
es_index_name: "search_products"
|
33839b37
tangwang
属性值参与搜索:
|
8
|
# ES Index Settings (基础设置)
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
9
10
11
12
13
|
# Elasticsearch index-level settings (shard count, replica count, refresh interval).
es_settings:
number_of_shards: 1
# NOTE(review): 0 replicas means no redundant shard copies - presumably a single-node setup; confirm before production use.
number_of_replicas: 0
# NOTE(review): per Elasticsearch semantics, newly indexed documents may take up to ~30s to become searchable - confirm this suits incremental indexing.
refresh_interval: "30s"
|
33839b37
tangwang
属性值参与搜索:
|
14
|
# 字段权重配置(用于搜索时的字段boost)
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
15
16
|
# 统一按“字段基名”配置;查询时按 search_langs 动态拼接 .{lang}。
# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
|
33839b37
tangwang
属性值参与搜索:
|
17
|
field_boosts:
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
18
19
20
|
title: 3.0
brief: 1.5
description: 1.0
|
a8261ece
tangwang
检索效果优化
|
21
|
qanchors: 1.5
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
22
23
24
|
vendor: 1.5
category_path: 1.5
category_name_text: 1.5
|
33839b37
tangwang
属性值参与搜索:
|
25
26
27
28
29
|
tags: 1.0
option1_values: 0.5
option2_values: 0.5
option3_values: 0.5
|
33839b37
tangwang
属性值参与搜索:
|
30
|
# Query Configuration(查询配置)
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
31
|
query_config:
|
33839b37
tangwang
属性值参与搜索:
|
32
|
# 支持的语言
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
33
34
35
|
supported_languages:
- "zh"
- "en"
|
2739b281
tangwang
多语言索引调整
|
36
|
default_language: "en"
|
33839b37
tangwang
属性值参与搜索:
|
37
|
|
345d960b
tangwang
1. 删除全局 enable_tr...
|
38
|
# 功能开关(翻译开关由tenant_config控制)
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
39
40
|
enable_text_embedding: true
enable_query_rewrite: true
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
41
|
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
# Dynamic multilingual search-field configuration.
# Entries in multilingual_fields are expanded to title.{lang}/brief.{lang}/... ;
# entries in shared_fields carry no language suffix.
# NOTE(review): the role of core_multilingual_fields is not described in this file - confirm against the query builder.
# NOTE(review): field_boosts also defines qanchors, which appears in none of the lists below - confirm whether that is intentional.
search_fields:
multilingual_fields:
- "title"
- "brief"
- "description"
- "vendor"
- "category_path"
- "category_name_text"
shared_fields:
- "tags"
- "option1_values"
- "option2_values"
- "option3_values"
core_multilingual_fields:
- "title"
- "brief"
- "vendor"
- "category_name_text"
|
c90f80ed
tangwang
相关性优化
|
64
|
# 统一文本召回策略(主查询 + 翻译查询 + 原始查询兜底)
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
65
66
67
68
69
70
71
|
text_query_strategy:
base_minimum_should_match: "75%"
translation_minimum_should_match: "75%"
translation_boost: 0.4
translation_boost_when_source_missing: 1.0
source_boost_when_missing: 0.6
original_query_fallback_boost_when_translation_missing: 0.2
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
72
|
tie_breaker_base_query: 0.9
|
bd96cead
tangwang
1. 动态多语言字段与统一策略配置
|
73
|
|
33839b37
tangwang
属性值参与搜索:
|
74
75
76
|
# Embedding field names.
text_embedding_field: "title_embedding"
# null: no image embedding field is configured.
image_embedding_field: null
|
325eec03
tangwang
1. 日志、配置基础设施,使用优化
|
77
|
|
33839b37
tangwang
属性值参与搜索:
|
78
79
80
|
# Returned-fields configuration (_source includes).
# null returns all fields, [] returns no fields, and a list returns only the listed fields.
source_fields: null
|
70dab99f
tangwang
add logs
|
81
82
83
|
# KNN boost configuration (boost value applied to vector recall).
knn_boost: 0.25 # Lower boost for embedding recall
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
84
|
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
85
86
87
88
|
# Function score configuration (scoring rules applied at the ES layer).
function_score:
score_mode: "sum"
boost_mode: "multiply"
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
89
90
|
functions: []
|
42e3aea6
tangwang
tidy
|
91
|
# 重排配置(provider/URL 在 services.rerank)
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
92
|
rerank:
|
5f7d7f09
tangwang
性能测试报告.md
|
93
|
enabled: true
|
c51d254f
tangwang
性能测试
|
94
|
rerank_window: 384
|
42e3aea6
tangwang
tidy
|
95
|
timeout_sec: 15.0
|
506c39b7
tangwang
feat(search): 统一重...
|
96
97
|
weight_es: 0.4
weight_ai: 0.6
|
ff32d894
tangwang
rerank
|
98
99
|
rerank_query_template: "{query}"
rerank_doc_template: "{title}"
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
100
|
|
42e3aea6
tangwang
tidy
|
101
102
103
|
# 可扩展服务/provider 注册表(单一配置源)
services:
translation:
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
104
105
106
|
service_url: "http://127.0.0.1:6006"
default_model: "llm"
default_scene: "general"
|
42e3aea6
tangwang
tidy
|
107
|
timeout_sec: 10.0
|
d4cadc13
tangwang
翻译重构
|
108
|
cache:
|
d4cadc13
tangwang
翻译重构
|
109
110
|
# 62208000 seconds = 720 days (720 * 86400).
ttl_seconds: 62208000
sliding_expiration: true
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
111
|
capabilities:
|
d4cadc13
tangwang
翻译重构
|
112
|
qwen-mt:
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
113
|
enabled: true
|
0fd2f875
tangwang
translate
|
114
|
backend: "qwen_mt"
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
115
|
model: "qwen-mt-flash"
|
0fd2f875
tangwang
translate
|
116
|
base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
|
42e3aea6
tangwang
tidy
|
117
|
timeout_sec: 10.0
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
118
|
use_cache: true
|
a0a173ae
tangwang
last
|
119
|
llm:
|
5e4dc8e4
tangwang
翻译架构按“一个翻译服务 +
|
120
|
enabled: true
|
0fd2f875
tangwang
translate
|
121
|
backend: "llm"
|
a0a173ae
tangwang
last
|
122
|
model: "qwen-flash"
|
0fd2f875
tangwang
translate
|
123
|
base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
|
a0a173ae
tangwang
last
|
124
|
timeout_sec: 30.0
|
cd4ce66d
tangwang
trans logs
|
125
|
use_cache: true
|
d4cadc13
tangwang
翻译重构
|
126
|
deepl:
|
cd4ce66d
tangwang
trans logs
|
127
|
enabled: true
|
0fd2f875
tangwang
translate
|
128
129
|
backend: "deepl"
api_url: "https://api.deepl.com/v2/translate"
|
d4cadc13
tangwang
翻译重构
|
130
|
timeout_sec: 10.0
|
d4cadc13
tangwang
翻译重构
|
131
|
glossary_id: ""
|
cd4ce66d
tangwang
trans logs
|
132
|
use_cache: true
|
0fd2f875
tangwang
translate
|
133
134
135
136
137
|
nllb-200-distilled-600m:
enabled: true
backend: "local_nllb"
model_id: "facebook/nllb-200-distilled-600M"
model_dir: "./models/translation/facebook/nllb-200-distilled-600M"
|
ea293660
tangwang
CTranslate2
|
138
139
140
141
|
ct2_model_dir: "./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16"
ct2_compute_type: "float16"
ct2_conversion_quantization: "float16"
ct2_auto_convert: true
|
46ce858d
tangwang
在NLLB模型的 /data/sa...
|
142
|
ct2_inter_threads: 4
|
ea293660
tangwang
CTranslate2
|
143
|
ct2_intra_threads: 0
|
46ce858d
tangwang
在NLLB模型的 /data/sa...
|
144
|
ct2_max_queued_batches: 32
|
ea293660
tangwang
CTranslate2
|
145
|
ct2_batch_type: "examples"
|
46ce858d
tangwang
在NLLB模型的 /data/sa...
|
146
147
148
|
ct2_decoding_length_mode: "source"
ct2_decoding_length_extra: 8
ct2_decoding_length_min: 32
|
0fd2f875
tangwang
translate
|
149
150
|
device: "cuda"
torch_dtype: "float16"
|
3eff49b7
tangwang
trans nllb-200-di...
|
151
|
batch_size: 16
|
0fd2f875
tangwang
translate
|
152
|
max_input_length: 256
|
3eff49b7
tangwang
trans nllb-200-di...
|
153
|
max_new_tokens: 64
|
0fd2f875
tangwang
translate
|
154
|
num_beams: 1
|
cd4ce66d
tangwang
trans logs
|
155
|
use_cache: true
|
0fd2f875
tangwang
translate
|
156
157
158
159
160
|
opus-mt-zh-en:
enabled: true
backend: "local_marian"
model_id: "Helsinki-NLP/opus-mt-zh-en"
model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en"
|
ea293660
tangwang
CTranslate2
|
161
162
163
164
165
166
167
168
|
ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16"
ct2_compute_type: "float16"
ct2_conversion_quantization: "float16"
ct2_auto_convert: true
ct2_inter_threads: 1
ct2_intra_threads: 0
ct2_max_queued_batches: 0
ct2_batch_type: "examples"
|
0fd2f875
tangwang
translate
|
169
170
171
172
173
174
|
device: "cuda"
torch_dtype: "float16"
batch_size: 16
max_input_length: 256
max_new_tokens: 256
num_beams: 1
|
cd4ce66d
tangwang
trans logs
|
175
|
use_cache: true
|
0fd2f875
tangwang
translate
|
176
177
178
179
180
|
opus-mt-en-zh:
enabled: true
backend: "local_marian"
model_id: "Helsinki-NLP/opus-mt-en-zh"
model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh"
|
ea293660
tangwang
CTranslate2
|
181
182
183
184
185
186
187
188
|
ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16"
ct2_compute_type: "float16"
ct2_conversion_quantization: "float16"
ct2_auto_convert: true
ct2_inter_threads: 1
ct2_intra_threads: 0
ct2_max_queued_batches: 0
ct2_batch_type: "examples"
|
0fd2f875
tangwang
translate
|
189
190
191
192
193
194
|
device: "cuda"
torch_dtype: "float16"
batch_size: 16
max_input_length: 256
max_new_tokens: 256
num_beams: 1
|
cd4ce66d
tangwang
trans logs
|
195
|
use_cache: true
|
42e3aea6
tangwang
tidy
|
196
|
embedding:
|
950a640e
tangwang
embeddings
|
197
|
provider: "http" # presumably selects an entry under providers; "http" is the only one defined here
|
42e3aea6
tangwang
tidy
|
198
199
200
201
|
base_url: "http://127.0.0.1:6005"
providers:
http:
base_url: "http://127.0.0.1:6005"
|
07cf5a93
tangwang
START_EMBEDDING=...
|
202
|
# 服务内文本后端(embedding 进程启动时读取)
|
efd435cf
tangwang
tei性能调优:
|
203
|
backend: "tei" # tei | local_st
|
07cf5a93
tangwang
START_EMBEDDING=...
|
204
205
206
|
backends:
tei:
base_url: "http://127.0.0.1:8080"
|
efd435cf
tangwang
tei性能调优:
|
207
|
timeout_sec: 20
|
07cf5a93
tangwang
START_EMBEDDING=...
|
208
209
210
211
212
213
|
model_id: "Qwen/Qwen3-Embedding-0.6B"
local_st:
model_id: "Qwen/Qwen3-Embedding-0.6B"
device: "cuda"
batch_size: 32
normalize_embeddings: true
|
42e3aea6
tangwang
tidy
|
214
|
rerank:
|
701ae503
tangwang
docs
|
215
|
provider: "http"
|
42e3aea6
tangwang
tidy
|
216
217
218
219
|
base_url: "http://127.0.0.1:6007"
providers:
http:
base_url: "http://127.0.0.1:6007"
|
701ae503
tangwang
docs
|
220
221
|
# NOTE(review): repeats the host/port already given by base_url and providers.http.base_url in this section, with a /rerank path - confirm which key the client actually reads.
service_url: "http://127.0.0.1:6007/rerank"
# In-service backend (read when the reranker process starts).
|
22ae00c7
tangwang
product_annotator
|
222
|
backend: "qwen3_vllm" # bge | qwen3_vllm | qwen3_transformers | dashscope_rerank
|
701ae503
tangwang
docs
|
223
224
225
226
227
228
229
230
231
232
233
234
|
backends:
bge:
model_name: "BAAI/bge-reranker-v2-m3"
device: null
use_fp16: true
batch_size: 64
max_length: 512
cache_dir: "./model_cache"
enable_warmup: true
qwen3_vllm:
model_name: "Qwen/Qwen3-Reranker-0.6B"
engine: "vllm"
|
07cf5a93
tangwang
START_EMBEDDING=...
|
235
|
max_model_len: 256
|
701ae503
tangwang
docs
|
236
|
tensor_parallel_size: 1
|
07cf5a93
tangwang
START_EMBEDDING=...
|
237
238
|
gpu_memory_utilization: 0.36
dtype: "float16"
|
bc089b43
tangwang
refactor(reranker...
|
239
240
|
enable_prefix_caching: true
enforce_eager: false
|
9f5994b4
tangwang
reranker
|
241
242
243
|
infer_batch_size: 64
sort_by_doc_length: true
length_sort_mode: "char" # char | token
|
a99e62ba
tangwang
记录各阶段耗时
|
244
|
instruction: "Given a shopping query, rank product titles by relevance"
|
d31c7f65
tangwang
补充云服务reranker
|
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
qwen3_transformers:
model_name: "Qwen/Qwen3-Reranker-0.6B"
instruction: "Given a shopping query, rank product titles by relevance"
max_length: 8192
batch_size: 64
use_fp16: true
attn_implementation: "flash_attention_2"
dashscope_rerank:
model_name: "qwen3-rerank"
# Choose the endpoint by region:
# China: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
# Singapore: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
# US: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
|
0d3e73ba
tangwang
rerank mini batch
|
259
260
|
# Name of the environment variable that holds the API key (per the key name); the value is not stored in this file.
# NOTE(review): the _CN suffix presumably pairs with the China endpoint configured for this backend - keep them in sync.
api_key_env: "RERANK_DASHSCOPE_API_KEY_CN"
timeout_sec: 10.0
|
d31c7f65
tangwang
补充云服务reranker
|
261
|
top_n_cap: 0 # 0: top_n equals the number of documents in the current request; >0: upper bound on top_n
|
0d3e73ba
tangwang
rerank mini batch
|
262
|
# NOTE(review): spelled batchsize here, while the other backends in this file use batch_size - confirm the key the consumer expects before renaming.
batchsize: 64 # 0 disables; >0 enables concurrent mini-batch scheduling (top_n/top_n_cap still apply; results are truncated globally after batching)
|
d31c7f65
tangwang
补充云服务reranker
|
263
264
265
|
# NOTE(review): this backend uses the key instruct, whereas qwen3_vllm and qwen3_transformers use instruction - confirm this matches what the consumer reads.
instruct: "Given a shopping query, rank product titles by relevance"
max_retries: 2
retry_backoff_sec: 0.2
|
42e3aea6
tangwang
tidy
|
266
|
|
cadc77b6
tangwang
索引字段名、变量名、API数据结构...
|
267
|
# SPU配置(已启用,使用嵌套skus)
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
268
269
|
spu_config:
enabled: true
|
cadc77b6
tangwang
索引字段名、变量名、API数据结构...
|
270
|
spu_field: "spu_id"
|
4d824a77
tangwang
所有租户共用一套统一配置.tena...
|
271
|
inner_hits_size: 10
|
33839b37
tangwang
属性值参与搜索:
|
272
273
274
|
# Selects which option dimensions take part in retrieval (both at index time and in online search).
# The value is a list containing one or more of option1/option2/option3.
searchable_option_dimensions: ['option1', 'option2', 'option3']
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
275
276
|
# 租户配置(Tenant Configuration)
|
038e4e2f
tangwang
refactor(i18n): t...
|
277
|
# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
|
6f7840cf
tangwang
refactor: rename ...
|
278
|
# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
279
|
tenant_config:
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
280
|
default:
|
2739b281
tangwang
多语言索引调整
|
281
|
primary_language: "en"
|
038e4e2f
tangwang
refactor(i18n): t...
|
282
|
index_languages: ["en", "zh"]
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
283
284
285
|
tenants:
"1":
primary_language: "zh"
|
038e4e2f
tangwang
refactor(i18n): t...
|
286
|
index_languages: ["zh", "en"]
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
287
288
|
"2":
primary_language: "en"
|
038e4e2f
tangwang
refactor(i18n): t...
|
289
|
index_languages: ["en", "zh"]
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
290
291
|
"3":
primary_language: "zh"
|
038e4e2f
tangwang
refactor(i18n): t...
|
292
|
index_languages: ["zh", "en"]
|
0064e946
tangwang
feat: 增量索引服务、租户配置...
|
293
294
|
"162":
primary_language: "zh"
|
038e4e2f
tangwang
refactor(i18n): t...
|
295
|
index_languages: ["zh", "en"]
|
cff5e86f
tangwang
reindex
|
296
297
|
"170":
primary_language: "en"
|
038e4e2f
tangwang
refactor(i18n): t...
|
298
|
index_languages: ["en", "zh"]
|