Commit b926f67839c02dad9d5f1e1a2905b03ead7bffb4
1 parent
2a76641e
多语言查询
Showing
10 changed files
with
1048 additions
and
15 deletions
Show diff stats
HighLevelDesign.md
| ... | ... | @@ -112,10 +112,9 @@ if response.status_code == 200: |
| 112 | 112 | 支持多种匹配方式,如AND、OR、RANK、ANDNOT以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 |
| 113 | 113 | |
| 114 | 114 | default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 |
| 115 | -多语言搜索: | |
| 116 | -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 | |
| 117 | 115 | |
| 118 | 116 | 暂时具体实现为 bm25()+0.2*text_embedding_relevance(也就是knn检索表达式的打分) |
| 117 | +bm25() 包括多语言的打分:内部需要根据配置将查询翻译为多种语言(可配置多种目标语言,默认为中文、英文,并为每种语言设置对应的检索域),然后分别到对应语言的字段搜索:中文查询到配置的中文title字段搜索,英文查询到对应的英文title字段搜索。 | |
| 119 | 118 | bm25打分(base_query): |
| 120 | 119 | "multi_match": { |
| 121 | 120 | "query": search_query, | ... | ... |
| ... | ... | @@ -0,0 +1,219 @@ |
| 1 | +# 多语言标题索引功能说明 | |
| 2 | + | |
| 3 | +## 功能概述 | |
| 4 | + | |
| 5 | +本功能实现了多语言标题索引的支持,允许不同语言的标题字段使用对应的分词器,同时对外提供统一的搜索接口。 | |
| 6 | + | |
| 7 | +## 主要特性 | |
| 8 | + | |
| 9 | +1. **多语言字段分离索引**:不同语言的标题字段(中文、英文、俄文等)使用对应的分词器 | |
| 10 | +2. **统一的搜索接口**:对外仍然使用 `default` 域搜索,内部自动处理多语言路由 | |
| 11 | +3. **智能查询路由**:根据查询语言和配置,将查询路由到对应的语言字段 | |
| 12 | + | |
| 13 | +## 配置说明 | |
| 14 | + | |
| 15 | +### 字段配置 | |
| 16 | + | |
| 17 | +在 `customer1_config.yaml` 中,需要为不同语言的标题字段配置对应的分词器: | |
| 18 | + | |
| 19 | +```yaml | |
| 20 | +fields: | |
| 21 | + # 中文标题 - 使用中文电商分词器 | |
| 22 | + - name: "name" | |
| 23 | + type: "TEXT" | |
| 24 | + analyzer: "chinese_ecommerce" | |
| 25 | + boost: 2.0 | |
| 26 | + | |
| 27 | + # 英文标题 - 使用英文分词器 | |
| 28 | + - name: "enSpuName" | |
| 29 | + type: "TEXT" | |
| 30 | + analyzer: "english" | |
| 31 | + boost: 2.0 | |
| 32 | + | |
| 33 | + # 俄文标题 - 使用俄文分词器 | |
| 34 | + - name: "ruSkuName" | |
| 35 | + type: "TEXT" | |
| 36 | + analyzer: "russian" | |
| 37 | + boost: 2.0 | |
| 38 | +``` | |
| 39 | + | |
| 40 | +### 索引域配置 | |
| 41 | + | |
| 42 | +在索引配置中添加 `language_field_mapping` 来指定每种语言对应的字段: | |
| 43 | + | |
| 44 | +```yaml | |
| 45 | +indexes: | |
| 46 | + - name: "default" | |
| 47 | + label: "默认索引" | |
| 48 | + fields: | |
| 49 | + - "name" | |
| 50 | + - "enSpuName" | |
| 51 | + - "ruSkuName" | |
| 52 | + - "categoryName" | |
| 53 | + - "brandName" | |
| 54 | + analyzer: "chinese_ecommerce" | |
| 55 | + boost: 1.0 | |
| 56 | + language_field_mapping: | |
| 57 | + zh: | |
| 58 | + - "name" | |
| 59 | + - "categoryName" | |
| 60 | + - "brandName" | |
| 61 | + en: | |
| 62 | + - "enSpuName" | |
| 63 | + ru: | |
| 64 | + - "ruSkuName" | |
| 65 | + | |
| 66 | + - name: "title" | |
| 67 | + label: "标题索引" | |
| 68 | + fields: | |
| 69 | + - "name" | |
| 70 | + - "enSpuName" | |
| 71 | + - "ruSkuName" | |
| 72 | + analyzer: "chinese_ecommerce" | |
| 73 | + boost: 2.0 | |
| 74 | + language_field_mapping: | |
| 75 | + zh: | |
| 76 | + - "name" | |
| 77 | + en: | |
| 78 | + - "enSpuName" | |
| 79 | + ru: | |
| 80 | + - "ruSkuName" | |
| 81 | +``` | |
| 82 | + | |
| 83 | +### 查询配置 | |
| 84 | + | |
| 85 | +在 `query_config` 中配置支持的语言: | |
| 86 | + | |
| 87 | +```yaml | |
| 88 | +query_config: | |
| 89 | + supported_languages: | |
| 90 | + - "zh" | |
| 91 | + - "en" | |
| 92 | + - "ru" | |
| 93 | + default_language: "zh" | |
| 94 | + enable_translation: true | |
| 95 | + enable_text_embedding: true | |
| 96 | +``` | |
| 97 | + | |
| 98 | +## 工作原理 | |
| 99 | + | |
| 100 | +### 1. 查询解析阶段 | |
| 101 | + | |
| 102 | +当用户输入查询时: | |
| 103 | +1. **语言检测**:自动检测查询语言(中文、英文、俄文等) | |
| 104 | +2. **翻译生成**:如果启用了翻译,将查询翻译到其他支持的语言 | |
| 105 | +3. **域提取**:如果查询包含域前缀(如 `title:查询`),提取域信息 | |
| 106 | + | |
| 107 | +### 2. 查询构建阶段 | |
| 108 | + | |
| 109 | +对于有 `language_field_mapping` 的域: | |
| 110 | + | |
| 111 | +1. **检测语言查询**:使用检测到的语言和原始查询,搜索对应语言的字段 | |
| 112 | + - 例如:中文查询 "芭比娃娃" → 搜索 `name` 字段(中文分词器) | |
| 113 | + | |
| 114 | +2. **翻译语言查询**:使用翻译后的查询,搜索对应语言的字段 | |
| 115 | + - 例如:中文查询翻译为英文 "Barbie doll" → 搜索 `enSpuName` 字段(英文分词器) | |
| 116 | + | |
| 117 | +3. **查询组合**:将多个语言查询组合为 `should` 子句,提高召回率 | |
| 118 | + - 检测语言的查询权重更高(boost * 1.5) | |
| 119 | + - 翻译语言的查询使用正常权重(boost * 1.0) | |
| 120 | + | |
| 121 | +### 3. 字段级别分析器 | |
| 122 | + | |
| 123 | +Elasticsearch 会自动为每个字段使用其配置的分析器: | |
| 124 | +- `name` 字段使用 `chinese_ecommerce` 分词器 | |
| 125 | +- `enSpuName` 字段使用 `english` 分词器 | |
| 126 | +- `ruSkuName` 字段使用 `russian` 分词器 | |
| 127 | + | |
| 128 | +## 使用示例 | |
| 129 | + | |
| 130 | +### 示例 1: 默认域搜索(中文查询) | |
| 131 | + | |
| 132 | +``` | |
| 133 | +查询: "芭比娃娃" | |
| 134 | +域: default | |
| 135 | +检测语言: zh | |
| 136 | +``` | |
| 137 | + | |
| 138 | +**生成的查询**: | |
| 139 | +- 中文查询 "芭比娃娃" → 搜索 `name`, `categoryName`, `brandName` 字段(boost * 1.5) | |
| 140 | +- 英文翻译 "Barbie doll" → 搜索 `enSpuName` 字段(boost * 1.0) | |
| 141 | +- 俄文翻译 "Кукла Барби" → 搜索 `ruSkuName` 字段(boost * 1.0) | |
| 142 | + | |
| 143 | +### 示例 2: 标题域搜索(英文查询) | |
| 144 | + | |
| 145 | +``` | |
| 146 | +查询: "title:Barbie doll" | |
| 147 | +域: title | |
| 148 | +检测语言: en | |
| 149 | +``` | |
| 150 | + | |
| 151 | +**生成的查询**: | |
| 152 | +- 英文查询 "Barbie doll" → 搜索 `enSpuName` 字段(title 域 boost 为 2.0,检测语言再乘以 1.5,即 2.0 * 1.5) | |
| 153 | +- 中文翻译 "芭比娃娃" → 搜索 `name` 字段(title 域 boost 为 2.0,即 2.0 * 1.0) | |
| 154 | +- 俄文翻译 "Кукла Барби" → 搜索 `ruSkuName` 字段(title 域 boost 为 2.0,即 2.0 * 1.0) | |
| 155 | + | |
| 156 | +### 示例 3: 无语言映射的域 | |
| 157 | + | |
| 158 | +``` | |
| 159 | +查询: "category:玩具" | |
| 160 | +域: category | |
| 161 | +``` | |
| 162 | + | |
| 163 | +**生成的查询**: | |
| 164 | +- 使用所有配置的字段进行搜索(`categoryName`) | |
| 165 | +- 不进行多语言路由 | |
| 166 | + | |
| 167 | +## 配置验证 | |
| 168 | + | |
| 169 | +系统会自动验证配置: | |
| 170 | +1. 检查 `language_field_mapping` 中引用的字段是否存在 | |
| 171 | +2. 验证字段类型是否为 `TEXT` | |
| 172 | +3. 警告字段分析器与语言不匹配的情况 | |
| 173 | + | |
| 174 | +## API 使用 | |
| 175 | + | |
| 176 | +### 搜索接口 | |
| 177 | + | |
| 178 | +```http | |
| 179 | +POST /search/ | |
| 180 | +{ | |
| 181 | + "query": "芭比娃娃", | |
| 182 | + "size": 10, | |
| 183 | + "enable_translation": true, | |
| 184 | + "enable_embedding": true | |
| 185 | +} | |
| 186 | +``` | |
| 187 | + | |
| 188 | +### 域搜索 | |
| 189 | + | |
| 190 | +```http | |
| 191 | +POST /search/ | |
| 192 | +{ | |
| 193 | + "query": "title:芭比娃娃", | |
| 194 | + "size": 10, | |
| 195 | + "enable_translation": true | |
| 196 | +} | |
| 197 | +``` | |
| 198 | + | |
| 199 | +## 注意事项 | |
| 200 | + | |
| 201 | +1. **翻译服务**:需要配置 DeepL API 密钥才能使用翻译功能 | |
| 202 | +2. **字段配置**:确保每个语言字段都配置了正确的分词器 | |
| 203 | +3. **性能考虑**:多语言查询会产生多个子查询,可能略微影响性能 | |
| 204 | +4. **语言检测**:语言检测的准确性会影响查询路由的效果 | |
| 205 | + | |
| 206 | +## 技术实现 | |
| 207 | + | |
| 208 | +- **MultiLanguageQueryBuilder**: 多语言查询构建器 | |
| 209 | +- **QueryParser**: 查询解析器,支持语言检测和翻译 | |
| 210 | +- **ConfigLoader**: 配置加载器,支持 `language_field_mapping` 配置 | |
| 211 | +- **MappingGenerator**: 映射生成器,确保字段使用正确的分析器 | |
| 212 | + | |
| 213 | +## 未来改进 | |
| 214 | + | |
| 215 | +1. 支持更多语言 | |
| 216 | +2. 优化翻译缓存机制 | |
| 217 | +3. 支持自定义语言检测模型 | |
| 218 | +4. 添加查询性能监控 | |
| 219 | + | ... | ... |
config/config_loader.py
| ... | ... | @@ -27,6 +27,9 @@ class IndexConfig: |
| 27 | 27 | boost: float = 1.0 |
| 28 | 28 | example: Optional[str] = None |
| 29 | 29 | |
| 30 | + # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]} | |
| 31 | + language_field_mapping: Optional[Dict[str, List[str]]] = None | |
| 32 | + | |
| 30 | 33 | |
| 31 | 34 | @dataclass |
| 32 | 35 | class RankingConfig: |
| ... | ... | @@ -66,8 +69,6 @@ class CustomerConfig: |
| 66 | 69 | |
| 67 | 70 | # Database settings |
| 68 | 71 | mysql_config: Dict[str, Any] |
| 69 | - main_table: str = "shoplazza_product_sku" | |
| 70 | - extension_table: Optional[str] = None | |
| 71 | 72 | |
| 72 | 73 | # Field definitions |
| 73 | 74 | fields: List[FieldConfig] |
| ... | ... | @@ -86,6 +87,10 @@ class CustomerConfig: |
| 86 | 87 | |
| 87 | 88 | # ES index settings |
| 88 | 89 | es_index_name: str |
| 90 | + | |
| 91 | + # Optional fields with defaults | |
| 92 | + main_table: str = "shoplazza_product_sku" | |
| 93 | + extension_table: Optional[str] = None | |
| 89 | 94 | es_settings: Dict[str, Any] = field(default_factory=dict) |
| 90 | 95 | |
| 91 | 96 | |
| ... | ... | @@ -228,13 +233,17 @@ class ConfigLoader: |
| 228 | 233 | if analyzer_str not in ANALYZER_MAP: |
| 229 | 234 | raise ConfigurationError(f"Unknown analyzer: {analyzer_str}") |
| 230 | 235 | |
| 236 | + # Parse language field mapping if present | |
| 237 | + language_field_mapping = index_data.get("language_field_mapping") | |
| 238 | + | |
| 231 | 239 | return IndexConfig( |
| 232 | 240 | name=index_data["name"], |
| 233 | 241 | label=index_data.get("label", index_data["name"]), |
| 234 | 242 | fields=index_data["fields"], |
| 235 | 243 | analyzer=ANALYZER_MAP[analyzer_str], |
| 236 | 244 | boost=index_data.get("boost", 1.0), |
| 237 | - example=index_data.get("example") | |
| 245 | + example=index_data.get("example"), | |
| 246 | + language_field_mapping=language_field_mapping | |
| 238 | 247 | ) |
| 239 | 248 | |
| 240 | 249 | def validate_config(self, config: CustomerConfig) -> List[str]: |
| ... | ... | @@ -251,10 +260,57 @@ class ConfigLoader: |
| 251 | 260 | |
| 252 | 261 | # Validate field references in indexes |
| 253 | 262 | field_names = {field.name for field in config.fields} |
| 263 | + field_map = {field.name: field for field in config.fields} | |
| 264 | + | |
| 254 | 265 | for index in config.indexes: |
| 266 | + # Validate fields in index.fields | |
| 255 | 267 | for field_name in index.fields: |
| 256 | 268 | if field_name not in field_names: |
| 257 | 269 | errors.append(f"Index '{index.name}' references unknown field '{field_name}'") |
| 270 | + | |
| 271 | + # Validate language_field_mapping if present | |
| 272 | + if index.language_field_mapping: | |
| 273 | + for lang, field_list in index.language_field_mapping.items(): | |
| 274 | + if not isinstance(field_list, list): | |
| 275 | + errors.append(f"Index '{index.name}': language_field_mapping['{lang}'] must be a list") | |
| 276 | + continue | |
| 277 | + | |
| 278 | + for field_name in field_list: | |
| 279 | + # Check if field exists | |
| 280 | + if field_name not in field_names: | |
| 281 | + errors.append( | |
| 282 | + f"Index '{index.name}': language_field_mapping['{lang}'] " | |
| 283 | + f"references unknown field '{field_name}'" | |
| 284 | + ) | |
| 285 | + else: | |
| 286 | + # Check if field is TEXT type (multi-language fields should be text fields) | |
| 287 | + field = field_map[field_name] | |
| 288 | + if field.field_type != FieldType.TEXT: | |
| 289 | + errors.append( | |
| 290 | + f"Index '{index.name}': language_field_mapping['{lang}'] " | |
| 291 | + f"field '{field_name}' must be of type TEXT, got {field.field_type.value}" | |
| 292 | + ) | |
| 293 | + | |
| 294 | + # Verify analyzer is appropriate for the language | |
| 295 | + # This is a soft check - we just warn if analyzer doesn't match language | |
| 296 | + if field.analyzer: | |
| 297 | + analyzer_name = field.analyzer.value.lower() | |
| 298 | + expected_analyzers = { | |
| 299 | + 'zh': ['chinese', 'index_ansj', 'query_ansj'], | |
| 300 | + 'en': ['english'], | |
| 301 | + 'ru': ['russian'], | |
| 302 | + 'ar': ['arabic'], | |
| 303 | + 'es': ['spanish'], | |
| 304 | + 'ja': ['japanese'] | |
| 305 | + } | |
| 306 | + if lang in expected_analyzers: | |
| 307 | + expected = expected_analyzers[lang] | |
| 308 | + if not any(exp in analyzer_name for exp in expected): | |
| 309 | + # Warning only, not an error | |
| 310 | + print( | |
| 311 | + f"Warning: Index '{index.name}': field '{field_name}' for language '{lang}' " | |
| 312 | + f"uses analyzer '{analyzer_name}', which may not be optimal for '{lang}'" | |
| 313 | + ) | |
| 258 | 314 | |
| 259 | 315 | # Validate SPU config |
| 260 | 316 | if config.spu_config.enabled: |
| ... | ... | @@ -360,11 +416,16 @@ class ConfigLoader: |
| 360 | 416 | |
| 361 | 417 | def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: |
| 362 | 418 | """Convert IndexConfig to dictionary.""" |
| 363 | - return { | |
| 419 | + result = { | |
| 364 | 420 | "name": index.name, |
| 365 | 421 | "label": index.label, |
| 366 | 422 | "fields": index.fields, |
| 367 | 423 | "analyzer": index.analyzer.value, |
| 368 | 424 | "boost": index.boost, |
| 369 | 425 | "example": index.example |
| 370 | - } | |
| 371 | 426 | \ No newline at end of file |
| 427 | + } | |
| 428 | + | |
| 429 | + if index.language_field_mapping: | |
| 430 | + result["language_field_mapping"] = index.language_field_mapping | |
| 431 | + | |
| 432 | + return result | |
| 372 | 433 | \ No newline at end of file | ... | ... |
config/schema/customer1_config.yaml
| ... | ... | @@ -177,6 +177,15 @@ indexes: |
| 177 | 177 | analyzer: "chinese_ecommerce" |
| 178 | 178 | boost: 1.0 |
| 179 | 179 | example: 'query=default:"消防套"' |
| 180 | + language_field_mapping: | |
| 181 | + zh: | |
| 182 | + - "name" | |
| 183 | + - "categoryName" | |
| 184 | + - "brandName" | |
| 185 | + en: | |
| 186 | + - "enSpuName" | |
| 187 | + ru: | |
| 188 | + - "ruSkuName" | |
| 180 | 189 | |
| 181 | 190 | - name: "title" |
| 182 | 191 | label: "标题索引" |
| ... | ... | @@ -187,6 +196,13 @@ indexes: |
| 187 | 196 | analyzer: "chinese_ecommerce" |
| 188 | 197 | boost: 2.0 |
| 189 | 198 | example: 'query=title:"芭比娃娃"' |
| 199 | + language_field_mapping: | |
| 200 | + zh: | |
| 201 | + - "name" | |
| 202 | + en: | |
| 203 | + - "enSpuName" | |
| 204 | + ru: | |
| 205 | + - "ruSkuName" | |
| 190 | 206 | |
| 191 | 207 | - name: "category" |
| 192 | 208 | label: "类目索引" | ... | ... |
query/query_parser.py
| ... | ... | @@ -138,9 +138,29 @@ class QueryParser: |
| 138 | 138 | # Stage 4: Translation |
| 139 | 139 | translations = {} |
| 140 | 140 | if self.query_config.enable_translation: |
| 141 | + # Determine target languages for translation | |
| 142 | + # If domain has language_field_mapping, only translate to languages in the mapping | |
| 143 | + # Otherwise, use all supported languages | |
| 144 | + target_langs_for_translation = self.query_config.supported_languages | |
| 145 | + | |
| 146 | + # Check if domain has language_field_mapping | |
| 147 | + domain_config = next( | |
| 148 | + (idx for idx in self.config.indexes if idx.name == domain), | |
| 149 | + None | |
| 150 | + ) | |
| 151 | + if domain_config and domain_config.language_field_mapping: | |
| 152 | + # Only translate to languages that exist in the mapping | |
| 153 | + available_languages = set(domain_config.language_field_mapping.keys()) | |
| 154 | + target_langs_for_translation = [ | |
| 155 | + lang for lang in self.query_config.supported_languages | |
| 156 | + if lang in available_languages | |
| 157 | + ] | |
| 158 | + print(f"[QueryParser] Domain '{domain}' has language_field_mapping, " | |
| 159 | + f"will translate to: {target_langs_for_translation}") | |
| 160 | + | |
| 141 | 161 | target_langs = self.translator.get_translation_needs( |
| 142 | 162 | detected_lang, |
| 143 | - self.query_config.supported_languages | |
| 163 | + target_langs_for_translation | |
| 144 | 164 | ) |
| 145 | 165 | |
| 146 | 166 | if target_langs: | ... | ... |
| ... | ... | @@ -0,0 +1,305 @@ |
| 1 | +""" | |
| 2 | +Multi-language query builder for handling domain-specific searches. | |
| 3 | + | |
| 4 | +This module extends the ESQueryBuilder to support multi-language field mappings, | |
| 5 | +allowing queries to be routed to appropriate language-specific fields while | |
| 6 | +maintaining a unified external interface. | |
| 7 | +""" | |
| 8 | + | |
| 9 | +from typing import Dict, Any, List, Optional | |
| 10 | +import numpy as np | |
| 11 | + | |
| 12 | +from config import CustomerConfig, IndexConfig | |
| 13 | +from query import ParsedQuery | |
| 14 | +from .es_query_builder import ESQueryBuilder | |
| 15 | + | |
| 16 | + | |
| 17 | +class MultiLanguageQueryBuilder(ESQueryBuilder): | |
| 18 | + """ | |
| 19 | + Enhanced query builder with multi-language support. | |
| 20 | + | |
| 21 | + Handles routing queries to appropriate language-specific fields based on: | |
| 22 | + 1. Detected query language | |
| 23 | + 2. Available translations | |
| 24 | + 3. Domain configuration (language_field_mapping) | |
| 25 | + """ | |
| 26 | + | |
| 27 | + def __init__( | |
| 28 | + self, | |
| 29 | + config: CustomerConfig, | |
| 30 | + index_name: str, | |
| 31 | + text_embedding_field: Optional[str] = None, | |
| 32 | + image_embedding_field: Optional[str] = None | |
| 33 | + ): | |
| 34 | + """ | |
| 35 | + Initialize multi-language query builder. | |
| 36 | + | |
| 37 | + Args: | |
| 38 | + config: Customer configuration | |
| 39 | + index_name: ES index name | |
| 40 | + text_embedding_field: Field name for text embeddings | |
| 41 | + image_embedding_field: Field name for image embeddings | |
| 42 | + """ | |
| 43 | + self.config = config | |
| 44 | + | |
| 45 | + # For default domain, use all fields as fallback | |
| 46 | + default_fields = self._get_domain_fields("default") | |
| 47 | + | |
| 48 | + super().__init__( | |
| 49 | + index_name=index_name, | |
| 50 | + match_fields=default_fields, | |
| 51 | + text_embedding_field=text_embedding_field, | |
| 52 | + image_embedding_field=image_embedding_field | |
| 53 | + ) | |
| 54 | + | |
| 55 | + # Build domain configurations | |
| 56 | + self.domain_configs = self._build_domain_configs() | |
| 57 | + | |
| 58 | + def _build_domain_configs(self) -> Dict[str, IndexConfig]: | |
| 59 | + """Build mapping of domain name to IndexConfig.""" | |
| 60 | + return {index.name: index for index in self.config.indexes} | |
| 61 | + | |
| 62 | + def _get_domain_fields(self, domain_name: str) -> List[str]: | |
| 63 | + """Get fields for a specific domain with boost notation.""" | |
| 64 | + for index in self.config.indexes: | |
| 65 | + if index.name == domain_name: | |
| 66 | + result = [] | |
| 67 | + for field_name in index.fields: | |
| 68 | + field = self._get_field_by_name(field_name) | |
| 69 | + if field and field.boost != 1.0: | |
| 70 | + result.append(f"{field_name}^{field.boost}") | |
| 71 | + else: | |
| 72 | + result.append(field_name) | |
| 73 | + return result | |
| 74 | + return [] | |
| 75 | + | |
| 76 | + def _get_field_by_name(self, field_name: str): | |
| 77 | + """Get field configuration by name.""" | |
| 78 | + for field in self.config.fields: | |
| 79 | + if field.name == field_name: | |
| 80 | + return field | |
| 81 | + return None | |
| 82 | + | |
| 83 | + def build_multilang_query( | |
| 84 | + self, | |
| 85 | + parsed_query: ParsedQuery, | |
| 86 | + query_vector: Optional[np.ndarray] = None, | |
| 87 | + filters: Optional[Dict[str, Any]] = None, | |
| 88 | + size: int = 10, | |
| 89 | + from_: int = 0, | |
| 90 | + enable_knn: bool = True, | |
| 91 | + knn_k: int = 50, | |
| 92 | + knn_num_candidates: int = 200, | |
| 93 | + min_score: Optional[float] = None | |
| 94 | + ) -> Dict[str, Any]: | |
| 95 | + """ | |
| 96 | + Build ES query with multi-language support. | |
| 97 | + | |
| 98 | + Args: | |
| 99 | + parsed_query: Parsed query with language info and translations | |
| 100 | + query_vector: Query embedding for KNN search | |
| 101 | + filters: Additional filters | |
| 102 | + size: Number of results | |
| 103 | + from_: Offset for pagination | |
| 104 | + enable_knn: Whether to use KNN search | |
| 105 | + knn_k: K value for KNN | |
| 106 | + knn_num_candidates: Number of candidates for KNN | |
| 107 | + min_score: Minimum score threshold | |
| 108 | + | |
| 109 | + Returns: | |
| 110 | + ES query DSL dictionary | |
| 111 | + """ | |
| 112 | + domain = parsed_query.domain | |
| 113 | + domain_config = self.domain_configs.get(domain) | |
| 114 | + | |
| 115 | + if not domain_config: | |
| 116 | + # Fallback to default domain | |
| 117 | + domain = "default" | |
| 118 | + domain_config = self.domain_configs.get("default") | |
| 119 | + | |
| 120 | + if not domain_config: | |
| 121 | + # Use original behavior | |
| 122 | + return super().build_query( | |
| 123 | + query_text=parsed_query.rewritten_query, | |
| 124 | + query_vector=query_vector, | |
| 125 | + filters=filters, | |
| 126 | + size=size, | |
| 127 | + from_=from_, | |
| 128 | + enable_knn=enable_knn, | |
| 129 | + knn_k=knn_k, | |
| 130 | + knn_num_candidates=knn_num_candidates, | |
| 131 | + min_score=min_score | |
| 132 | + ) | |
| 133 | + | |
| 134 | + print(f"[MultiLangQueryBuilder] Building query for domain: {domain}") | |
| 135 | + print(f"[MultiLangQueryBuilder] Detected language: {parsed_query.detected_language}") | |
| 136 | + print(f"[MultiLangQueryBuilder] Available translations: {list(parsed_query.translations.keys())}") | |
| 137 | + | |
| 138 | + # Build query clause with multi-language support | |
| 139 | + query_clause = self._build_multilang_text_query(parsed_query, domain_config) | |
| 140 | + | |
| 141 | + es_query = { | |
| 142 | + "size": size, | |
| 143 | + "from": from_ | |
| 144 | + } | |
| 145 | + | |
| 146 | + # Add filters if provided | |
| 147 | + if filters: | |
| 148 | + es_query["query"] = { | |
| 149 | + "bool": { | |
| 150 | + "must": [query_clause], | |
| 151 | + "filter": self._build_filters(filters) | |
| 152 | + } | |
| 153 | + } | |
| 154 | + else: | |
| 155 | + es_query["query"] = query_clause | |
| 156 | + | |
| 157 | + # Add KNN search if enabled and vector provided | |
| 158 | + if enable_knn and query_vector is not None and self.text_embedding_field: | |
| 159 | + knn_clause = { | |
| 160 | + "field": self.text_embedding_field, | |
| 161 | + "query_vector": query_vector.tolist(), | |
| 162 | + "k": knn_k, | |
| 163 | + "num_candidates": knn_num_candidates | |
| 164 | + } | |
| 165 | + es_query["knn"] = knn_clause | |
| 166 | + | |
| 167 | + # Add minimum score filter | |
| 168 | + if min_score is not None: | |
| 169 | + es_query["min_score"] = min_score | |
| 170 | + | |
| 171 | + return es_query | |
| 172 | + | |
| 173 | + def _build_multilang_text_query( | |
| 174 | + self, | |
| 175 | + parsed_query: ParsedQuery, | |
| 176 | + domain_config: IndexConfig | |
| 177 | + ) -> Dict[str, Any]: | |
| 178 | + """ | |
| 179 | + Build text query with multi-language field routing. | |
| 180 | + | |
| 181 | + Args: | |
| 182 | + parsed_query: Parsed query with language info | |
| 183 | + domain_config: Domain configuration | |
| 184 | + | |
| 185 | + Returns: | |
| 186 | + ES query clause | |
| 187 | + """ | |
| 188 | + if not domain_config.language_field_mapping: | |
| 189 | + # No multi-language mapping, use all fields with default analyzer | |
| 190 | + fields_with_boost = [] | |
| 191 | + for field_name in domain_config.fields: | |
| 192 | + field = self._get_field_by_name(field_name) | |
| 193 | + if field and field.boost != 1.0: | |
| 194 | + fields_with_boost.append(f"{field_name}^{field.boost}") | |
| 195 | + else: | |
| 196 | + fields_with_boost.append(field_name) | |
| 197 | + | |
| 198 | + return { | |
| 199 | + "multi_match": { | |
| 200 | + "query": parsed_query.rewritten_query, | |
| 201 | + "fields": fields_with_boost, | |
| 202 | + "minimum_should_match": "67%", | |
| 203 | + "tie_breaker": 0.9, | |
| 204 | + "boost": domain_config.boost, | |
| 205 | + "_name": f"{domain_config.name}_query" | |
| 206 | + } | |
| 207 | + } | |
| 208 | + | |
| 209 | + # Multi-language mapping exists - build targeted queries | |
| 210 | + should_clauses = [] | |
| 211 | + available_languages = set(domain_config.language_field_mapping.keys()) | |
| 212 | + | |
| 213 | + # 1. Query in detected language (if it exists in mapping) | |
| 214 | + detected_lang = parsed_query.detected_language | |
| 215 | + if detected_lang in available_languages: | |
| 216 | + target_fields = domain_config.language_field_mapping[detected_lang] | |
| 217 | + fields_with_boost = self._apply_field_boosts(target_fields) | |
| 218 | + | |
| 219 | + should_clauses.append({ | |
| 220 | + "multi_match": { | |
| 221 | + "query": parsed_query.rewritten_query, | |
| 222 | + "fields": fields_with_boost, | |
| 223 | + "minimum_should_match": "67%", | |
| 224 | + "tie_breaker": 0.9, | |
| 225 | + "boost": domain_config.boost * 1.5, # Higher boost for detected language | |
| 226 | + "_name": f"{domain_config.name}_{detected_lang}_query" | |
| 227 | + } | |
| 228 | + }) | |
| 229 | + print(f"[MultiLangQueryBuilder] Added query for detected language '{detected_lang}' on fields: {target_fields}") | |
| 230 | + | |
| 231 | + # 2. Query in translated languages (only for languages in mapping) | |
| 232 | + for lang, translation in parsed_query.translations.items(): | |
| 233 | + # Only use translations for languages that exist in the mapping | |
| 234 | + if lang in available_languages and translation and translation.strip(): | |
| 235 | + target_fields = domain_config.language_field_mapping[lang] | |
| 236 | + fields_with_boost = self._apply_field_boosts(target_fields) | |
| 237 | + | |
| 238 | + should_clauses.append({ | |
| 239 | + "multi_match": { | |
| 240 | + "query": translation, | |
| 241 | + "fields": fields_with_boost, | |
| 242 | + "minimum_should_match": "67%", | |
| 243 | + "tie_breaker": 0.9, | |
| 244 | + "boost": domain_config.boost, | |
| 245 | + "_name": f"{domain_config.name}_{lang}_translated_query" | |
| 246 | + } | |
| 247 | + }) | |
| 248 | + print(f"[MultiLangQueryBuilder] Added translated query for language '{lang}' on fields: {target_fields}") | |
| 249 | + | |
| 250 | + # 3. Fallback: query all fields in mapping if no language-specific query was built | |
| 251 | + if not should_clauses: | |
| 252 | + print(f"[MultiLangQueryBuilder] No language mapping matched, using all fields from mapping") | |
| 253 | + # Use all fields from all languages in the mapping | |
| 254 | + all_mapped_fields = [] | |
| 255 | + for lang_fields in domain_config.language_field_mapping.values(): | |
| 256 | + all_mapped_fields.extend(lang_fields) | |
| 257 | + # Remove duplicates while preserving order | |
| 258 | + unique_fields = list(dict.fromkeys(all_mapped_fields)) | |
| 259 | + fields_with_boost = self._apply_field_boosts(unique_fields) | |
| 260 | + | |
| 261 | + should_clauses.append({ | |
| 262 | + "multi_match": { | |
| 263 | + "query": parsed_query.rewritten_query, | |
| 264 | + "fields": fields_with_boost, | |
| 265 | + "minimum_should_match": "67%", | |
| 266 | + "tie_breaker": 0.9, | |
| 267 | + "boost": domain_config.boost * 0.8, # Lower boost for fallback | |
| 268 | + "_name": f"{domain_config.name}_fallback_query" | |
| 269 | + } | |
| 270 | + }) | |
| 271 | + | |
| 272 | + if len(should_clauses) == 1: | |
| 273 | + return should_clauses[0] | |
| 274 | + else: | |
| 275 | + return { | |
| 276 | + "bool": { | |
| 277 | + "should": should_clauses, | |
| 278 | + "minimum_should_match": 1 | |
| 279 | + } | |
| 280 | + } | |
| 281 | + | |
| 282 | + def _apply_field_boosts(self, field_names: List[str]) -> List[str]: | |
| 283 | + """Apply boost values to field names.""" | |
| 284 | + result = [] | |
| 285 | + for field_name in field_names: | |
| 286 | + field = self._get_field_by_name(field_name) | |
| 287 | + if field and field.boost != 1.0: | |
| 288 | + result.append(f"{field_name}^{field.boost}") | |
| 289 | + else: | |
| 290 | + result.append(field_name) | |
| 291 | + return result | |
| 292 | + | |
| 293 | + def get_domain_summary(self) -> Dict[str, Any]: | |
| 294 | + """Get summary of all configured domains.""" | |
| 295 | + summary = {} | |
| 296 | + for domain_name, domain_config in self.domain_configs.items(): | |
| 297 | + summary[domain_name] = { | |
| 298 | + "label": domain_config.label, | |
| 299 | + "fields": domain_config.fields, | |
| 300 | + "analyzer": domain_config.analyzer.value, | |
| 301 | + "boost": domain_config.boost, | |
| 302 | + "has_multilang_mapping": domain_config.language_field_mapping is not None, | |
| 303 | + "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else [] | |
| 304 | + } | |
| 305 | + return summary | |
| 0 | 306 | \ No newline at end of file | ... | ... |
search/searcher.py
| ... | ... | @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery |
| 13 | 13 | from indexer import MappingGenerator |
| 14 | 14 | from .boolean_parser import BooleanParser, QueryNode |
| 15 | 15 | from .es_query_builder import ESQueryBuilder |
| 16 | +from .multilang_query_builder import MultiLanguageQueryBuilder | |
| 16 | 17 | from .ranking_engine import RankingEngine |
| 17 | 18 | |
| 18 | 19 | |
| ... | ... | @@ -86,10 +87,10 @@ class Searcher: |
| 86 | 87 | self.text_embedding_field = mapping_gen.get_text_embedding_field() |
| 87 | 88 | self.image_embedding_field = mapping_gen.get_image_embedding_field() |
| 88 | 89 | |
| 89 | - # Query builder | |
| 90 | - self.query_builder = ESQueryBuilder( | |
| 90 | + # Query builder - use multi-language version | |
| 91 | + self.query_builder = MultiLanguageQueryBuilder( | |
| 92 | + config=config, | |
| 91 | 93 | index_name=config.es_index_name, |
| 92 | - match_fields=self.match_fields, | |
| 93 | 94 | text_embedding_field=self.text_embedding_field, |
| 94 | 95 | image_embedding_field=self.image_embedding_field |
| 95 | 96 | ) |
| ... | ... | @@ -144,11 +145,10 @@ class Searcher: |
| 144 | 145 | query_text = parsed_query.rewritten_query |
| 145 | 146 | print(f"[Searcher] Parsed boolean expression: {query_node}") |
| 146 | 147 | |
| 147 | - # Step 3: Build ES query | |
| 148 | - es_query = self.query_builder.build_query( | |
| 149 | - query_text=query_text, | |
| 148 | + # Step 3: Build ES query using multi-language builder | |
| 149 | + es_query = self.query_builder.build_multilang_query( | |
| 150 | + parsed_query=parsed_query, | |
| 150 | 151 | query_vector=parsed_query.query_vector if enable_embedding else None, |
| 151 | - query_node=query_node, | |
| 152 | 152 | filters=filters, |
| 153 | 153 | size=size, |
| 154 | 154 | from_=from_, |
| ... | ... | @@ -325,6 +325,15 @@ class Searcher: |
| 325 | 325 | query_info={'image_url': image_url, 'search_type': 'image_similarity'} |
| 326 | 326 | ) |
| 327 | 327 | |
| 328 | + def get_domain_summary(self) -> Dict[str, Any]: | |
| 329 | + """ | |
| 330 | + Get summary of all configured domains. | |
| 331 | + | |
| 332 | + Returns: | |
| 333 | + Dictionary with domain information | |
| 334 | + """ | |
| 335 | + return self.query_builder.get_domain_summary() | |
| 336 | + | |
| 328 | 337 | def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: |
| 329 | 338 | """ |
| 330 | 339 | Get single document by ID. | ... | ... |
| ... | ... | @@ -0,0 +1,155 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +""" | |
| 3 | +Test script to validate multi-language configuration. | |
| 4 | + | |
| 5 | +This script validates that: | |
| 6 | +1. language_field_mapping is correctly loaded from config | |
| 7 | +2. All referenced fields exist and have correct analyzers | |
| 8 | +3. Multi-language query builder works correctly | |
| 9 | +""" | |
| 10 | + | |
| 11 | +import sys | |
| 12 | +import json | |
| 13 | +from config import ConfigLoader | |
| 14 | + | |
| 15 | +def test_config_loading(): | |
| 16 | + """Test that configuration loads correctly with language_field_mapping.""" | |
| 17 | + print("=" * 60) | |
| 18 | + print("Testing Configuration Loading") | |
| 19 | + print("=" * 60) | |
| 20 | + | |
| 21 | + try: | |
| 22 | + loader = ConfigLoader() | |
| 23 | + config = loader.load_customer_config('customer1') | |
| 24 | + | |
| 25 | + print(f"\n✓ Configuration loaded successfully") | |
| 26 | + print(f" Customer: {config.customer_name}") | |
| 27 | + print(f" ES Index: {config.es_index_name}") | |
| 28 | + | |
| 29 | + # Validate configuration | |
| 30 | + errors = loader.validate_config(config) | |
| 31 | + if errors: | |
| 32 | + print(f"\n✗ Configuration validation failed:") | |
| 33 | + for error in errors: | |
| 34 | + print(f" - {error}") | |
| 35 | + return False | |
| 36 | + else: | |
| 37 | + print(f"\n✓ Configuration validation passed") | |
| 38 | + | |
| 39 | + # Check indexes with language_field_mapping | |
| 40 | + print(f"\nIndexes with multi-language support:") | |
| 41 | + for index in config.indexes: | |
| 42 | + if index.language_field_mapping: | |
| 43 | + print(f"\n {index.name} ({index.label}):") | |
| 44 | + print(f" Fields: {index.fields}") | |
| 45 | + print(f" Language mapping:") | |
| 46 | + for lang, fields in index.language_field_mapping.items(): | |
| 47 | + print(f" {lang}: {fields}") | |
| 48 | + else: | |
| 49 | + print(f"\n {index.name} ({index.label}): No language mapping") | |
| 50 | + | |
| 51 | + return True | |
| 52 | + | |
| 53 | + except Exception as e: | |
| 54 | + print(f"\n✗ Error loading configuration: {e}") | |
| 55 | + import traceback | |
| 56 | + traceback.print_exc() | |
| 57 | + return False | |
| 58 | + | |
| 59 | + | |
| 60 | +def test_multilang_query_builder(): | |
| 61 | + """Test that MultiLanguageQueryBuilder works correctly.""" | |
| 62 | + print("\n" + "=" * 60) | |
| 63 | + print("Testing Multi-Language Query Builder") | |
| 64 | + print("=" * 60) | |
| 65 | + | |
| 66 | + try: | |
| 67 | + from config import ConfigLoader | |
| 68 | + from query import QueryParser | |
| 69 | + from search.multilang_query_builder import MultiLanguageQueryBuilder | |
| 70 | + from indexer import MappingGenerator | |
| 71 | + | |
| 72 | + loader = ConfigLoader() | |
| 73 | + config = loader.load_customer_config('customer1') | |
| 74 | + | |
| 75 | + # Initialize query builder | |
| 76 | + mapping_gen = MappingGenerator(config) | |
| 77 | + text_embedding_field = mapping_gen.get_text_embedding_field() | |
| 78 | + image_embedding_field = mapping_gen.get_image_embedding_field() | |
| 79 | + | |
| 80 | + query_builder = MultiLanguageQueryBuilder( | |
| 81 | + config=config, | |
| 82 | + index_name=config.es_index_name, | |
| 83 | + text_embedding_field=text_embedding_field, | |
| 84 | + image_embedding_field=image_embedding_field | |
| 85 | + ) | |
| 86 | + | |
| 87 | + print(f"\n✓ MultiLanguageQueryBuilder initialized") | |
| 88 | + | |
| 89 | + # Get domain summary | |
| 90 | + summary = query_builder.get_domain_summary() | |
| 91 | + print(f"\nDomain Summary:") | |
| 92 | + for domain, info in summary.items(): | |
| 93 | + print(f" {domain}:") | |
| 94 | + print(f" Label: {info['label']}") | |
| 95 | + print(f" Has multilang mapping: {info['has_multilang_mapping']}") | |
| 96 | + if info['has_multilang_mapping']: | |
| 97 | + print(f" Supported languages: {info['supported_languages']}") | |
| 98 | + | |
| 99 | + # Test query parsing | |
| 100 | + query_parser = QueryParser(config) | |
| 101 | + test_queries = [ | |
| 102 | + "芭比娃娃", | |
| 103 | + "title:芭比娃娃", | |
| 104 | + "default:玩具" | |
| 105 | + ] | |
| 106 | + | |
| 107 | + print(f"\nTesting query parsing:") | |
| 108 | + for query in test_queries: | |
| 109 | + print(f"\n Query: '{query}'") | |
| 110 | + parsed = query_parser.parse(query, generate_vector=False) | |
| 111 | + print(f" Domain: {parsed.domain}") | |
| 112 | + print(f" Detected language: {parsed.detected_language}") | |
| 113 | + print(f" Translations: {list(parsed.translations.keys())}") | |
| 114 | + | |
| 115 | + # Build query | |
| 116 | + es_query = query_builder.build_multilang_query( | |
| 117 | + parsed_query=parsed, | |
| 118 | + query_vector=None, | |
| 119 | + filters=None, | |
| 120 | + size=10, | |
| 121 | + enable_knn=False | |
| 122 | + ) | |
| 123 | + print(f" ES Query keys: {list(es_query.keys())}") | |
| 124 | + | |
| 125 | + return True | |
| 126 | + | |
| 127 | + except Exception as e: | |
| 128 | + print(f"\n✗ Error testing query builder: {e}") | |
| 129 | + import traceback | |
| 130 | + traceback.print_exc() | |
| 131 | + return False | |
| 132 | + | |
| 133 | + | |
| 134 | +if __name__ == "__main__": | |
| 135 | + print("Multi-Language Configuration Test") | |
| 136 | + print("=" * 60) | |
| 137 | + | |
| 138 | + success = True | |
| 139 | + | |
| 140 | + # Test 1: Configuration loading | |
| 141 | + if not test_config_loading(): | |
| 142 | + success = False | |
| 143 | + | |
| 144 | + # Test 2: Multi-language query builder | |
| 145 | + if not test_multilang_query_builder(): | |
| 146 | + success = False | |
| 147 | + | |
| 148 | + print("\n" + "=" * 60) | |
| 149 | + if success: | |
| 150 | + print("✓ All tests passed!") | |
| 151 | + sys.exit(0) | |
| 152 | + else: | |
| 153 | + print("✗ Some tests failed") | |
| 154 | + sys.exit(1) | |
| 155 | + | ... | ... |
| ... | ... | @@ -0,0 +1,53 @@ |
| 1 | + | |
| 2 | + | |
| 3 | +对后端搜索技术 做通用化。 | |
| 4 | + | |
| 5 | +通用化的本质 是 对于各种业务数据、各种检索需求,都可以 用少量定制+配置化 来实现效果。 | |
| 6 | + | |
| 7 | +## 1. 原始数据层的约定。 | |
| 8 | +### 店匠主表 | |
| 9 | +shoplazza_product_sku | |
| 10 | +shoplazza_product_spu | |
| 11 | +所有租户共用这个主表 | |
| 12 | + | |
| 13 | +### 每个租户的辅表 | |
| 14 | +各个租户,有自己的扩展表。 入索引的时候,商品主表 shoplazza_product_sku 的 id + shopid,拼接租户自己单独的扩展表(比如可以放一些自己的属性体系、各种语言的商品名、品牌名、标签、分类等) | |
| 15 | + | |
| 16 | +但是,各个租户,可能有不一样的业务数据,比如不同租户有不同的属性的体系、不同语言的商品标题(一般至少有中英文两种满足跨境的搜索需求),有不同的权重(提权)字段、业务过滤和聚合字段。 | |
| 17 | +能够统一的 只能是 sku表 按照一套配置规范、做一个配置文件,按照配置文件建设ES mapping结构以及做数据的入库。 | |
| 18 | + | |
| 19 | +1. 应用结构配置 : 定义了ES的输入数据有哪些字段、关联mysql的哪些字段. | |
| 20 | + 请帮我补充具体实现的一些配置 | |
| 21 | + | |
| 22 | + | |
| 23 | +2. 索引结构配置 : 定义了ES的字段,每个字段的索引mapping配置,支持各个域的查询,包括默认的域的查询。索引配置预定义好了一堆分析方式 | |
| 24 | + 请帮我补充具体实现的一些配置 | |
| 25 | + | |
| 26 | +## 测试数据灌入 | |
| 27 | + | |
| 28 | +灌入数据、mysql到ES的自动同步,不在本项目的范围内,但是,该项目 为了提供测试数据,需要 构造一个实例 customer1. | |
| 29 | +我们为他构造一套应用配置和索引配置。 | |
| 30 | +暂时是随机抽了我们自己的1w数据,建设辅助表,然后写一个程序,将数据分别灌入主表和辅表。 | |
| 31 | + | |
| 32 | +请帮我补充具体,当前测试数据灌入的具体的配置和方式,比如辅助表的内容 对应的应用结构配置 索引配置 等等。 | |
| 33 | + | |
| 34 | +## queryParser | |
| 35 | + | |
| 36 | +1. 查询改写。 配置词典的key是query,value是改写后的查询表达式。比如品牌词 改写为在brand|query OR name|query,类别词、标签词等都可以放进去。纠错、规范化、查询改写等 都可以通过这个词典来配置。 | |
| 37 | +2. 翻译。配置需要得到的几种目标语言。 在customer1测试案例中,我们配置 zh en两种语言。先对query做语言检测,如果query是中文那么要翻译为en,如果是en那么要翻译为zh,如果两者都不是那么zh en都需要翻译。 | |
| 38 | +3. 如果配置打开了text_embedding查询,并且query 包含了default域的查询,那么要把default域的查询词转向量,后面searcher会用这个向量参与查询。 | |
| 39 | + | |
| 40 | +也帮我补充一些具体实现情况 | |
| 41 | + | |
| 42 | +## searcher | |
| 43 | + | |
| 44 | +支持多种检索表达式: | |
| 45 | +支持多种匹配方式,如AND、OR、RANK、ANDNOT以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 | |
| 46 | + | |
| 47 | +## default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 | |
| 48 | + | |
| 49 | +暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) | |
| 50 | +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 | |
| 51 | + | |
| 52 | +也帮我补充一些具体实现情况 | |
| 53 | + | ... | ... |
| ... | ... | @@ -0,0 +1,196 @@ |
| 1 | + | |
| 2 | + | |
| 3 | + | |
| 4 | + | |
| 5 | +这次修改没改完。 | |
| 6 | + | |
| 7 | + | |
| 8 | +diff --git a/HighLevelDesign.md b/HighLevelDesign.md | |
| 9 | +index 397a9f7..3e728c9 100644 | |
| 10 | +--- a/HighLevelDesign.md | |
| 11 | ++++ b/HighLevelDesign.md | |
| 12 | +@@ -112,10 +112,9 @@ if response.status_code == 200: | |
| 13 | + 支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 | |
| 14 | + | |
| 15 | + default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 | |
| 16 | +-多语言搜索: | |
| 17 | +-对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 | |
| 18 | + | |
| 19 | + 暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) | |
| 20 | ++bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 | |
| 21 | + bm25打分(base_query): | |
| 22 | + "multi_match": { | |
| 23 | + "query": search_query, | |
| 24 | +diff --git a/config/config_loader.py b/config/config_loader.py | |
| 25 | +index 8df15b3..f3fcaa3 100644 | |
| 26 | +--- a/config/config_loader.py | |
| 27 | ++++ b/config/config_loader.py | |
| 28 | +@@ -27,6 +27,9 @@ class IndexConfig: | |
| 29 | + boost: float = 1.0 | |
| 30 | + example: Optional[str] = None | |
| 31 | + | |
| 32 | ++ # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]} | |
| 33 | ++ language_field_mapping: Optional[Dict[str, List[str]]] = None | |
| 34 | ++ | |
| 35 | + | |
| 36 | + @dataclass | |
| 37 | + class RankingConfig: | |
| 38 | +@@ -66,8 +69,6 @@ class CustomerConfig: | |
| 39 | + | |
| 40 | + # Database settings | |
| 41 | + mysql_config: Dict[str, Any] | |
| 42 | +- main_table: str = "shoplazza_product_sku" | |
| 43 | +- extension_table: Optional[str] = None | |
| 44 | + | |
| 45 | + # Field definitions | |
| 46 | + fields: List[FieldConfig] | |
| 47 | +@@ -86,6 +87,10 @@ class CustomerConfig: | |
| 48 | + | |
| 49 | + # ES index settings | |
| 50 | + es_index_name: str | |
| 51 | ++ | |
| 52 | ++ # Optional fields with defaults | |
| 53 | ++ main_table: str = "shoplazza_product_sku" | |
| 54 | ++ extension_table: Optional[str] = None | |
| 55 | + es_settings: Dict[str, Any] = field(default_factory=dict) | |
| 56 | + | |
| 57 | + | |
| 58 | +@@ -228,13 +233,17 @@ class ConfigLoader: | |
| 59 | + if analyzer_str not in ANALYZER_MAP: | |
| 60 | + raise ConfigurationError(f"Unknown analyzer: {analyzer_str}") | |
| 61 | + | |
| 62 | ++ # Parse language field mapping if present | |
| 63 | ++ language_field_mapping = index_data.get("language_field_mapping") | |
| 64 | ++ | |
| 65 | + return IndexConfig( | |
| 66 | + name=index_data["name"], | |
| 67 | + label=index_data.get("label", index_data["name"]), | |
| 68 | + fields=index_data["fields"], | |
| 69 | + analyzer=ANALYZER_MAP[analyzer_str], | |
| 70 | + boost=index_data.get("boost", 1.0), | |
| 71 | +- example=index_data.get("example") | |
| 72 | ++ example=index_data.get("example"), | |
| 73 | ++ language_field_mapping=language_field_mapping | |
| 74 | + ) | |
| 75 | + | |
| 76 | + def validate_config(self, config: CustomerConfig) -> List[str]: | |
| 77 | +@@ -360,11 +369,16 @@ class ConfigLoader: | |
| 78 | + | |
| 79 | + def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: | |
| 80 | + """Convert IndexConfig to dictionary.""" | |
| 81 | +- return { | |
| 82 | ++ result = { | |
| 83 | + "name": index.name, | |
| 84 | + "label": index.label, | |
| 85 | + "fields": index.fields, | |
| 86 | + "analyzer": index.analyzer.value, | |
| 87 | + "boost": index.boost, | |
| 88 | + "example": index.example | |
| 89 | +- } | |
| 90 | +\ No newline at end of file | |
| 91 | ++ } | |
| 92 | ++ | |
| 93 | ++ if index.language_field_mapping: | |
| 94 | ++ result["language_field_mapping"] = index.language_field_mapping | |
| 95 | ++ | |
| 96 | ++ return result | |
| 97 | +\ No newline at end of file | |
| 98 | +diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml | |
| 99 | +index bfe2e53..84e9ba1 100644 | |
| 100 | +--- a/config/schema/customer1_config.yaml | |
| 101 | ++++ b/config/schema/customer1_config.yaml | |
| 102 | +@@ -177,6 +177,15 @@ indexes: | |
| 103 | + analyzer: "chinese_ecommerce" | |
| 104 | + boost: 1.0 | |
| 105 | + example: 'query=default:"消防套"' | |
| 106 | ++ language_field_mapping: | |
| 107 | ++ zh: | |
| 108 | ++ - "name" | |
| 109 | ++ - "categoryName" | |
| 110 | ++ - "brandName" | |
| 111 | ++ en: | |
| 112 | ++ - "enSpuName" | |
| 113 | ++ ru: | |
| 114 | ++ - "ruSkuName" | |
| 115 | + | |
| 116 | + - name: "title" | |
| 117 | + label: "标题索引" | |
| 118 | +@@ -187,6 +196,13 @@ indexes: | |
| 119 | + analyzer: "chinese_ecommerce" | |
| 120 | + boost: 2.0 | |
| 121 | + example: 'query=title:"芭比娃娃"' | |
| 122 | ++ language_field_mapping: | |
| 123 | ++ zh: | |
| 124 | ++ - "name" | |
| 125 | ++ en: | |
| 126 | ++ - "enSpuName" | |
| 127 | ++ ru: | |
| 128 | ++ - "ruSkuName" | |
| 129 | + | |
| 130 | + - name: "category" | |
| 131 | + label: "类目索引" | |
| 132 | +diff --git a/search/searcher.py b/search/searcher.py | |
| 133 | +index a7088ec..0a798ed 100644 | |
| 134 | +--- a/search/searcher.py | |
| 135 | ++++ b/search/searcher.py | |
| 136 | +@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery | |
| 137 | + from indexer import MappingGenerator | |
| 138 | + from .boolean_parser import BooleanParser, QueryNode | |
| 139 | + from .es_query_builder import ESQueryBuilder | |
| 140 | ++from .multilang_query_builder import MultiLanguageQueryBuilder | |
| 141 | + from .ranking_engine import RankingEngine | |
| 142 | + | |
| 143 | + | |
| 144 | +@@ -86,10 +87,10 @@ class Searcher: | |
| 145 | + self.text_embedding_field = mapping_gen.get_text_embedding_field() | |
| 146 | + self.image_embedding_field = mapping_gen.get_image_embedding_field() | |
| 147 | + | |
| 148 | +- # Query builder | |
| 149 | +- self.query_builder = ESQueryBuilder( | |
| 150 | ++ # Query builder - use multi-language version | |
| 151 | ++ self.query_builder = MultiLanguageQueryBuilder( | |
| 152 | ++ config=config, | |
| 153 | + index_name=config.es_index_name, | |
| 154 | +- match_fields=self.match_fields, | |
| 155 | + text_embedding_field=self.text_embedding_field, | |
| 156 | + image_embedding_field=self.image_embedding_field | |
| 157 | + ) | |
| 158 | +@@ -144,11 +145,10 @@ class Searcher: | |
| 159 | + query_text = parsed_query.rewritten_query | |
| 160 | + print(f"[Searcher] Parsed boolean expression: {query_node}") | |
| 161 | + | |
| 162 | +- # Step 3: Build ES query | |
| 163 | +- es_query = self.query_builder.build_query( | |
| 164 | +- query_text=query_text, | |
| 165 | ++ # Step 3: Build ES query using multi-language builder | |
| 166 | ++ es_query = self.query_builder.build_multilang_query( | |
| 167 | ++ parsed_query=parsed_query, | |
| 168 | + query_vector=parsed_query.query_vector if enable_embedding else None, | |
| 169 | +- query_node=query_node, | |
| 170 | + filters=filters, | |
| 171 | + size=size, | |
| 172 | + from_=from_, | |
| 173 | +@@ -325,6 +325,15 @@ class Searcher: | |
| 174 | + query_info={'image_url': image_url, 'search_type': 'image_similarity'} | |
| 175 | + ) | |
| 176 | + | |
| 177 | ++ def get_domain_summary(self) -> Dict[str, Any]: | |
| 178 | ++ """ | |
| 179 | ++ Get summary of all configured domains. | |
| 180 | ++ | |
| 181 | ++ Returns: | |
| 182 | ++ Dictionary with domain information | |
| 183 | ++ """ | |
| 184 | ++ return self.query_builder.get_domain_summary() | |
| 185 | ++ | |
| 186 | + def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: | |
| 187 | + """ | |
| 188 | + Get single document by ID. | |
| 189 | + | |
| 190 | + | |
| 191 | + | |
| 192 | + | |
| 193 | + | |
| 194 | + | |
| 195 | + | |
| 196 | + | ... | ... |