diff --git a/HighLevelDesign.md b/HighLevelDesign.md index 397a9f7..3e728c9 100644 --- a/HighLevelDesign.md +++ b/HighLevelDesign.md @@ -112,10 +112,9 @@ if response.status_code == 200: 支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 -多语言搜索: -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 bm25打分(base_query): "multi_match": { "query": search_query, diff --git a/MULTILANG_FEATURE.md b/MULTILANG_FEATURE.md new file mode 100644 index 0000000..5571710 --- /dev/null +++ b/MULTILANG_FEATURE.md @@ -0,0 +1,219 @@ +# 多语言标题索引功能说明 + +## 功能概述 + +本功能实现了多语言标题索引的支持,允许不同语言的标题字段使用对应的分词器,同时对外提供统一的搜索接口。 + +## 主要特性 + +1. **多语言字段分离索引**:不同语言的标题字段(中文、英文、俄文等)使用对应的分词器 +2. **统一的搜索接口**:对外仍然使用 `default` 域搜索,内部自动处理多语言路由 +3. 
**智能查询路由**:根据查询语言和配置,将查询路由到对应的语言字段 + +## 配置说明 + +### 字段配置 + +在 `customer1_config.yaml` 中,需要为不同语言的标题字段配置对应的分词器: + +```yaml +fields: + # 中文标题 - 使用中文电商分词器 + - name: "name" + type: "TEXT" + analyzer: "chinese_ecommerce" + boost: 2.0 + + # 英文标题 - 使用英文分词器 + - name: "enSpuName" + type: "TEXT" + analyzer: "english" + boost: 2.0 + + # 俄文标题 - 使用俄文分词器 + - name: "ruSkuName" + type: "TEXT" + analyzer: "russian" + boost: 2.0 +``` + +### 索引域配置 + +在索引配置中添加 `language_field_mapping` 来指定每种语言对应的字段: + +```yaml +indexes: + - name: "default" + label: "默认索引" + fields: + - "name" + - "enSpuName" + - "ruSkuName" + - "categoryName" + - "brandName" + analyzer: "chinese_ecommerce" + boost: 1.0 + language_field_mapping: + zh: + - "name" + - "categoryName" + - "brandName" + en: + - "enSpuName" + ru: + - "ruSkuName" + + - name: "title" + label: "标题索引" + fields: + - "name" + - "enSpuName" + - "ruSkuName" + analyzer: "chinese_ecommerce" + boost: 2.0 + language_field_mapping: + zh: + - "name" + en: + - "enSpuName" + ru: + - "ruSkuName" +``` + +### 查询配置 + +在 `query_config` 中配置支持的语言: + +```yaml +query_config: + supported_languages: + - "zh" + - "en" + - "ru" + default_language: "zh" + enable_translation: true + enable_text_embedding: true +``` + +## 工作原理 + +### 1. 查询解析阶段 + +当用户输入查询时: +1. **语言检测**:自动检测查询语言(中文、英文、俄文等) +2. **翻译生成**:如果启用了翻译,将查询翻译到其他支持的语言 +3. **域提取**:如果查询包含域前缀(如 `title:查询`),提取域信息 + +### 2. 查询构建阶段 + +对于有 `language_field_mapping` 的域: + +1. **检测语言查询**:使用检测到的语言和原始查询,搜索对应语言的字段 + - 例如:中文查询 "芭比娃娃" → 搜索 `name` 字段(中文分词器) + +2. **翻译语言查询**:使用翻译后的查询,搜索对应语言的字段 + - 例如:中文查询翻译为英文 "Barbie doll" → 搜索 `enSpuName` 字段(英文分词器) + +3. **查询组合**:将多个语言查询组合为 `should` 子句,提高召回率 + - 检测语言的查询权重更高(boost * 1.5) + - 翻译语言的查询使用正常权重(boost * 1.0) + +### 3. 
字段级别分析器 + +Elasticsearch 会自动为每个字段使用其配置的分析器: +- `name` 字段使用 `chinese_ecommerce` 分词器 +- `enSpuName` 字段使用 `english` 分词器 +- `ruSkuName` 字段使用 `russian` 分词器 + +## 使用示例 + +### 示例 1: 默认域搜索(中文查询) + +``` +查询: "芭比娃娃" +域: default +检测语言: zh +``` + +**生成的查询**: +- 中文查询 "芭比娃娃" → 搜索 `name`, `categoryName`, `brandName` 字段(boost * 1.5) +- 英文翻译 "Barbie doll" → 搜索 `enSpuName` 字段(boost * 1.0) +- 俄文翻译 "Кукла Барби" → 搜索 `ruSkuName` 字段(boost * 1.0) + +### 示例 2: 标题域搜索(英文查询) + +``` +查询: "title:Barbie doll" +域: title +检测语言: en +``` + +**生成的查询**: +- 英文查询 "Barbie doll" → 搜索 `enSpuName` 字段(boost * 2.0 * 1.5) +- 中文翻译 "芭比娃娃" → 搜索 `name` 字段(boost * 2.0) +- 俄文翻译 "Кукла Барби" → 搜索 `ruSkuName` 字段(boost * 2.0) + +### 示例 3: 无语言映射的域 + +``` +查询: "category:玩具" +域: category +``` + +**生成的查询**: +- 使用所有配置的字段进行搜索(`categoryName`) +- 不进行多语言路由 + +## 配置验证 + +系统会自动验证配置: +1. 检查 `language_field_mapping` 中引用的字段是否存在 +2. 验证字段类型是否为 `TEXT` +3. 警告字段分析器与语言不匹配的情况 + +## API 使用 + +### 搜索接口 + +```python +POST /search/ +{ + "query": "芭比娃娃", + "size": 10, + "enable_translation": true, + "enable_embedding": true +} +``` + +### 域搜索 + +```python +POST /search/ +{ + "query": "title:芭比娃娃", + "size": 10, + "enable_translation": true +} +``` + +## 注意事项 + +1. **翻译服务**:需要配置 DeepL API 密钥才能使用翻译功能 +2. **字段配置**:确保每个语言字段都配置了正确的分词器 +3. **性能考虑**:多语言查询会产生多个子查询,可能略微影响性能 +4. **语言检测**:语言检测的准确性会影响查询路由的效果 + +## 技术实现 + +- **MultiLanguageQueryBuilder**: 多语言查询构建器 +- **QueryParser**: 查询解析器,支持语言检测和翻译 +- **ConfigLoader**: 配置加载器,支持 `language_field_mapping` 配置 +- **MappingGenerator**: 映射生成器,确保字段使用正确的分析器 + +## 未来改进 + +1. 支持更多语言 +2. 优化翻译缓存机制 +3. 支持自定义语言检测模型 +4. 
添加查询性能监控 + diff --git a/config/config_loader.py b/config/config_loader.py index 8df15b3..c510377 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -27,6 +27,9 @@ class IndexConfig: boost: float = 1.0 example: Optional[str] = None + # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]} + language_field_mapping: Optional[Dict[str, List[str]]] = None + @dataclass class RankingConfig: @@ -66,8 +69,6 @@ class CustomerConfig: # Database settings mysql_config: Dict[str, Any] - main_table: str = "shoplazza_product_sku" - extension_table: Optional[str] = None # Field definitions fields: List[FieldConfig] @@ -86,6 +87,10 @@ class CustomerConfig: # ES index settings es_index_name: str + + # Optional fields with defaults + main_table: str = "shoplazza_product_sku" + extension_table: Optional[str] = None es_settings: Dict[str, Any] = field(default_factory=dict) @@ -228,13 +233,17 @@ class ConfigLoader: if analyzer_str not in ANALYZER_MAP: raise ConfigurationError(f"Unknown analyzer: {analyzer_str}") + # Parse language field mapping if present + language_field_mapping = index_data.get("language_field_mapping") + return IndexConfig( name=index_data["name"], label=index_data.get("label", index_data["name"]), fields=index_data["fields"], analyzer=ANALYZER_MAP[analyzer_str], boost=index_data.get("boost", 1.0), - example=index_data.get("example") + example=index_data.get("example"), + language_field_mapping=language_field_mapping ) def validate_config(self, config: CustomerConfig) -> List[str]: @@ -251,10 +260,57 @@ class ConfigLoader: # Validate field references in indexes field_names = {field.name for field in config.fields} + field_map = {field.name: field for field in config.fields} + for index in config.indexes: + # Validate fields in index.fields for field_name in index.fields: if field_name not in field_names: errors.append(f"Index '{index.name}' references unknown field '{field_name}'") + + # Validate 
language_field_mapping if present + if index.language_field_mapping: + for lang, field_list in index.language_field_mapping.items(): + if not isinstance(field_list, list): + errors.append(f"Index '{index.name}': language_field_mapping['{lang}'] must be a list") + continue + + for field_name in field_list: + # Check if field exists + if field_name not in field_names: + errors.append( + f"Index '{index.name}': language_field_mapping['{lang}'] " + f"references unknown field '{field_name}'" + ) + else: + # Check if field is TEXT type (multi-language fields should be text fields) + field = field_map[field_name] + if field.field_type != FieldType.TEXT: + errors.append( + f"Index '{index.name}': language_field_mapping['{lang}'] " + f"field '{field_name}' must be of type TEXT, got {field.field_type.value}" + ) + + # Verify analyzer is appropriate for the language + # This is a soft check - we just warn if analyzer doesn't match language + if field.analyzer: + analyzer_name = field.analyzer.value.lower() + expected_analyzers = { + 'zh': ['chinese', 'index_ansj', 'query_ansj'], + 'en': ['english'], + 'ru': ['russian'], + 'ar': ['arabic'], + 'es': ['spanish'], + 'ja': ['japanese'] + } + if lang in expected_analyzers: + expected = expected_analyzers[lang] + if not any(exp in analyzer_name for exp in expected): + # Warning only, not an error + print( + f"Warning: Index '{index.name}': field '{field_name}' for language '{lang}' " + f"uses analyzer '{analyzer_name}', which may not be optimal for '{lang}'" + ) # Validate SPU config if config.spu_config.enabled: @@ -360,11 +416,16 @@ class ConfigLoader: def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: """Convert IndexConfig to dictionary.""" - return { + result = { "name": index.name, "label": index.label, "fields": index.fields, "analyzer": index.analyzer.value, "boost": index.boost, "example": index.example - } \ No newline at end of file + } + + if index.language_field_mapping: + result["language_field_mapping"] = 
index.language_field_mapping + + return result \ No newline at end of file diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml index bfe2e53..84e9ba1 100644 --- a/config/schema/customer1_config.yaml +++ b/config/schema/customer1_config.yaml @@ -177,6 +177,15 @@ indexes: analyzer: "chinese_ecommerce" boost: 1.0 example: 'query=default:"消防套"' + language_field_mapping: + zh: + - "name" + - "categoryName" + - "brandName" + en: + - "enSpuName" + ru: + - "ruSkuName" - name: "title" label: "标题索引" @@ -187,6 +196,13 @@ indexes: analyzer: "chinese_ecommerce" boost: 2.0 example: 'query=title:"芭比娃娃"' + language_field_mapping: + zh: + - "name" + en: + - "enSpuName" + ru: + - "ruSkuName" - name: "category" label: "类目索引" diff --git a/query/query_parser.py b/query/query_parser.py index 56e4bde..2679ce6 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -138,9 +138,29 @@ class QueryParser: # Stage 4: Translation translations = {} if self.query_config.enable_translation: + # Determine target languages for translation + # If domain has language_field_mapping, only translate to languages in the mapping + # Otherwise, use all supported languages + target_langs_for_translation = self.query_config.supported_languages + + # Check if domain has language_field_mapping + domain_config = next( + (idx for idx in self.config.indexes if idx.name == domain), + None + ) + if domain_config and domain_config.language_field_mapping: + # Only translate to languages that exist in the mapping + available_languages = set(domain_config.language_field_mapping.keys()) + target_langs_for_translation = [ + lang for lang in self.query_config.supported_languages + if lang in available_languages + ] + print(f"[QueryParser] Domain '{domain}' has language_field_mapping, " + f"will translate to: {target_langs_for_translation}") + target_langs = self.translator.get_translation_needs( detected_lang, - self.query_config.supported_languages + 
target_langs_for_translation ) if target_langs: diff --git a/search/multilang_query_builder.py b/search/multilang_query_builder.py new file mode 100644 index 0000000..7c2387f --- /dev/null +++ b/search/multilang_query_builder.py @@ -0,0 +1,305 @@ +""" +Multi-language query builder for handling domain-specific searches. + +This module extends the ESQueryBuilder to support multi-language field mappings, +allowing queries to be routed to appropriate language-specific fields while +maintaining a unified external interface. +""" + +from typing import Dict, Any, List, Optional +import numpy as np + +from config import CustomerConfig, IndexConfig +from query import ParsedQuery +from .es_query_builder import ESQueryBuilder + + +class MultiLanguageQueryBuilder(ESQueryBuilder): + """ + Enhanced query builder with multi-language support. + + Handles routing queries to appropriate language-specific fields based on: + 1. Detected query language + 2. Available translations + 3. Domain configuration (language_field_mapping) + """ + + def __init__( + self, + config: CustomerConfig, + index_name: str, + text_embedding_field: Optional[str] = None, + image_embedding_field: Optional[str] = None + ): + """ + Initialize multi-language query builder. 
+ + Args: + config: Customer configuration + index_name: ES index name + text_embedding_field: Field name for text embeddings + image_embedding_field: Field name for image embeddings + """ + self.config = config + + # For default domain, use all fields as fallback + default_fields = self._get_domain_fields("default") + + super().__init__( + index_name=index_name, + match_fields=default_fields, + text_embedding_field=text_embedding_field, + image_embedding_field=image_embedding_field + ) + + # Build domain configurations + self.domain_configs = self._build_domain_configs() + + def _build_domain_configs(self) -> Dict[str, IndexConfig]: + """Build mapping of domain name to IndexConfig.""" + return {index.name: index for index in self.config.indexes} + + def _get_domain_fields(self, domain_name: str) -> List[str]: + """Get fields for a specific domain with boost notation.""" + for index in self.config.indexes: + if index.name == domain_name: + result = [] + for field_name in index.fields: + field = self._get_field_by_name(field_name) + if field and field.boost != 1.0: + result.append(f"{field_name}^{field.boost}") + else: + result.append(field_name) + return result + return [] + + def _get_field_by_name(self, field_name: str): + """Get field configuration by name.""" + for field in self.config.fields: + if field.name == field_name: + return field + return None + + def build_multilang_query( + self, + parsed_query: ParsedQuery, + query_vector: Optional[np.ndarray] = None, + filters: Optional[Dict[str, Any]] = None, + size: int = 10, + from_: int = 0, + enable_knn: bool = True, + knn_k: int = 50, + knn_num_candidates: int = 200, + min_score: Optional[float] = None + ) -> Dict[str, Any]: + """ + Build ES query with multi-language support. 
+ + Args: + parsed_query: Parsed query with language info and translations + query_vector: Query embedding for KNN search + filters: Additional filters + size: Number of results + from_: Offset for pagination + enable_knn: Whether to use KNN search + knn_k: K value for KNN + knn_num_candidates: Number of candidates for KNN + min_score: Minimum score threshold + + Returns: + ES query DSL dictionary + """ + domain = parsed_query.domain + domain_config = self.domain_configs.get(domain) + + if not domain_config: + # Fallback to default domain + domain = "default" + domain_config = self.domain_configs.get("default") + + if not domain_config: + # Use original behavior + return super().build_query( + query_text=parsed_query.rewritten_query, + query_vector=query_vector, + filters=filters, + size=size, + from_=from_, + enable_knn=enable_knn, + knn_k=knn_k, + knn_num_candidates=knn_num_candidates, + min_score=min_score + ) + + print(f"[MultiLangQueryBuilder] Building query for domain: {domain}") + print(f"[MultiLangQueryBuilder] Detected language: {parsed_query.detected_language}") + print(f"[MultiLangQueryBuilder] Available translations: {list(parsed_query.translations.keys())}") + + # Build query clause with multi-language support + query_clause = self._build_multilang_text_query(parsed_query, domain_config) + + es_query = { + "size": size, + "from": from_ + } + + # Add filters if provided + if filters: + es_query["query"] = { + "bool": { + "must": [query_clause], + "filter": self._build_filters(filters) + } + } + else: + es_query["query"] = query_clause + + # Add KNN search if enabled and vector provided + if enable_knn and query_vector is not None and self.text_embedding_field: + knn_clause = { + "field": self.text_embedding_field, + "query_vector": query_vector.tolist(), + "k": knn_k, + "num_candidates": knn_num_candidates + } + es_query["knn"] = knn_clause + + # Add minimum score filter + if min_score is not None: + es_query["min_score"] = min_score + + return es_query 
+ + def _build_multilang_text_query( + self, + parsed_query: ParsedQuery, + domain_config: IndexConfig + ) -> Dict[str, Any]: + """ + Build text query with multi-language field routing. + + Args: + parsed_query: Parsed query with language info + domain_config: Domain configuration + + Returns: + ES query clause + """ + if not domain_config.language_field_mapping: + # No multi-language mapping, use all fields with default analyzer + fields_with_boost = [] + for field_name in domain_config.fields: + field = self._get_field_by_name(field_name) + if field and field.boost != 1.0: + fields_with_boost.append(f"{field_name}^{field.boost}") + else: + fields_with_boost.append(field_name) + + return { + "multi_match": { + "query": parsed_query.rewritten_query, + "fields": fields_with_boost, + "minimum_should_match": "67%", + "tie_breaker": 0.9, + "boost": domain_config.boost, + "_name": f"{domain_config.name}_query" + } + } + + # Multi-language mapping exists - build targeted queries + should_clauses = [] + available_languages = set(domain_config.language_field_mapping.keys()) + + # 1. Query in detected language (if it exists in mapping) + detected_lang = parsed_query.detected_language + if detected_lang in available_languages: + target_fields = domain_config.language_field_mapping[detected_lang] + fields_with_boost = self._apply_field_boosts(target_fields) + + should_clauses.append({ + "multi_match": { + "query": parsed_query.rewritten_query, + "fields": fields_with_boost, + "minimum_should_match": "67%", + "tie_breaker": 0.9, + "boost": domain_config.boost * 1.5, # Higher boost for detected language + "_name": f"{domain_config.name}_{detected_lang}_query" + } + }) + print(f"[MultiLangQueryBuilder] Added query for detected language '{detected_lang}' on fields: {target_fields}") + + # 2. 
Query in translated languages (only for languages in mapping) + for lang, translation in parsed_query.translations.items(): + # Only use translations for languages that exist in the mapping + if lang in available_languages and translation and translation.strip(): + target_fields = domain_config.language_field_mapping[lang] + fields_with_boost = self._apply_field_boosts(target_fields) + + should_clauses.append({ + "multi_match": { + "query": translation, + "fields": fields_with_boost, + "minimum_should_match": "67%", + "tie_breaker": 0.9, + "boost": domain_config.boost, + "_name": f"{domain_config.name}_{lang}_translated_query" + } + }) + print(f"[MultiLangQueryBuilder] Added translated query for language '{lang}' on fields: {target_fields}") + + # 3. Fallback: query all fields in mapping if no language-specific query was built + if not should_clauses: + print(f"[MultiLangQueryBuilder] No language mapping matched, using all fields from mapping") + # Use all fields from all languages in the mapping + all_mapped_fields = [] + for lang_fields in domain_config.language_field_mapping.values(): + all_mapped_fields.extend(lang_fields) + # Remove duplicates while preserving order + unique_fields = list(dict.fromkeys(all_mapped_fields)) + fields_with_boost = self._apply_field_boosts(unique_fields) + + should_clauses.append({ + "multi_match": { + "query": parsed_query.rewritten_query, + "fields": fields_with_boost, + "minimum_should_match": "67%", + "tie_breaker": 0.9, + "boost": domain_config.boost * 0.8, # Lower boost for fallback + "_name": f"{domain_config.name}_fallback_query" + } + }) + + if len(should_clauses) == 1: + return should_clauses[0] + else: + return { + "bool": { + "should": should_clauses, + "minimum_should_match": 1 + } + } + + def _apply_field_boosts(self, field_names: List[str]) -> List[str]: + """Apply boost values to field names.""" + result = [] + for field_name in field_names: + field = self._get_field_by_name(field_name) + if field and field.boost != 
1.0: + result.append(f"{field_name}^{field.boost}") + else: + result.append(field_name) + return result + + def get_domain_summary(self) -> Dict[str, Any]: + """Get summary of all configured domains.""" + summary = {} + for domain_name, domain_config in self.domain_configs.items(): + summary[domain_name] = { + "label": domain_config.label, + "fields": domain_config.fields, + "analyzer": domain_config.analyzer.value, + "boost": domain_config.boost, + "has_multilang_mapping": domain_config.language_field_mapping is not None, + "supported_languages": list(domain_config.language_field_mapping.keys()) if domain_config.language_field_mapping else [] + } + return summary \ No newline at end of file diff --git a/search/searcher.py b/search/searcher.py index a7088ec..0a798ed 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery from indexer import MappingGenerator from .boolean_parser import BooleanParser, QueryNode from .es_query_builder import ESQueryBuilder +from .multilang_query_builder import MultiLanguageQueryBuilder from .ranking_engine import RankingEngine @@ -86,10 +87,10 @@ class Searcher: self.text_embedding_field = mapping_gen.get_text_embedding_field() self.image_embedding_field = mapping_gen.get_image_embedding_field() - # Query builder - self.query_builder = ESQueryBuilder( + # Query builder - use multi-language version + self.query_builder = MultiLanguageQueryBuilder( + config=config, index_name=config.es_index_name, - match_fields=self.match_fields, text_embedding_field=self.text_embedding_field, image_embedding_field=self.image_embedding_field ) @@ -144,11 +145,10 @@ class Searcher: query_text = parsed_query.rewritten_query print(f"[Searcher] Parsed boolean expression: {query_node}") - # Step 3: Build ES query - es_query = self.query_builder.build_query( - query_text=query_text, + # Step 3: Build ES query using multi-language builder + es_query = self.query_builder.build_multilang_query( + 
parsed_query=parsed_query, query_vector=parsed_query.query_vector if enable_embedding else None, - query_node=query_node, filters=filters, size=size, from_=from_, @@ -325,6 +325,15 @@ class Searcher: query_info={'image_url': image_url, 'search_type': 'image_similarity'} ) + def get_domain_summary(self) -> Dict[str, Any]: + """ + Get summary of all configured domains. + + Returns: + Dictionary with domain information + """ + return self.query_builder.get_domain_summary() + def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: """ Get single document by ID. diff --git a/test_multilang_config.py b/test_multilang_config.py new file mode 100644 index 0000000..8452ddb --- /dev/null +++ b/test_multilang_config.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +""" +Test script to validate multi-language configuration. + +This script validates that: +1. language_field_mapping is correctly loaded from config +2. All referenced fields exist and have correct analyzers +3. Multi-language query builder works correctly +""" + +import sys +import json +from config import ConfigLoader + +def test_config_loading(): + """Test that configuration loads correctly with language_field_mapping.""" + print("=" * 60) + print("Testing Configuration Loading") + print("=" * 60) + + try: + loader = ConfigLoader() + config = loader.load_customer_config('customer1') + + print(f"\n✓ Configuration loaded successfully") + print(f" Customer: {config.customer_name}") + print(f" ES Index: {config.es_index_name}") + + # Validate configuration + errors = loader.validate_config(config) + if errors: + print(f"\n✗ Configuration validation failed:") + for error in errors: + print(f" - {error}") + return False + else: + print(f"\n✓ Configuration validation passed") + + # Check indexes with language_field_mapping + print(f"\nIndexes with multi-language support:") + for index in config.indexes: + if index.language_field_mapping: + print(f"\n {index.name} ({index.label}):") + print(f" Fields: 
{index.fields}") + print(f" Language mapping:") + for lang, fields in index.language_field_mapping.items(): + print(f" {lang}: {fields}") + else: + print(f"\n {index.name} ({index.label}): No language mapping") + + return True + + except Exception as e: + print(f"\n✗ Error loading configuration: {e}") + import traceback + traceback.print_exc() + return False + + +def test_multilang_query_builder(): + """Test that MultiLanguageQueryBuilder works correctly.""" + print("\n" + "=" * 60) + print("Testing Multi-Language Query Builder") + print("=" * 60) + + try: + from config import ConfigLoader + from query import QueryParser + from search.multilang_query_builder import MultiLanguageQueryBuilder + from indexer import MappingGenerator + + loader = ConfigLoader() + config = loader.load_customer_config('customer1') + + # Initialize query builder + mapping_gen = MappingGenerator(config) + text_embedding_field = mapping_gen.get_text_embedding_field() + image_embedding_field = mapping_gen.get_image_embedding_field() + + query_builder = MultiLanguageQueryBuilder( + config=config, + index_name=config.es_index_name, + text_embedding_field=text_embedding_field, + image_embedding_field=image_embedding_field + ) + + print(f"\n✓ MultiLanguageQueryBuilder initialized") + + # Get domain summary + summary = query_builder.get_domain_summary() + print(f"\nDomain Summary:") + for domain, info in summary.items(): + print(f" {domain}:") + print(f" Label: {info['label']}") + print(f" Has multilang mapping: {info['has_multilang_mapping']}") + if info['has_multilang_mapping']: + print(f" Supported languages: {info['supported_languages']}") + + # Test query parsing + query_parser = QueryParser(config) + test_queries = [ + "芭比娃娃", + "title:芭比娃娃", + "default:玩具" + ] + + print(f"\nTesting query parsing:") + for query in test_queries: + print(f"\n Query: '{query}'") + parsed = query_parser.parse(query, generate_vector=False) + print(f" Domain: {parsed.domain}") + print(f" Detected language: 
{parsed.detected_language}") + print(f" Translations: {list(parsed.translations.keys())}") + + # Build query + es_query = query_builder.build_multilang_query( + parsed_query=parsed, + query_vector=None, + filters=None, + size=10, + enable_knn=False + ) + print(f" ES Query keys: {list(es_query.keys())}") + + return True + + except Exception as e: + print(f"\n✗ Error testing query builder: {e}") + import traceback + traceback.print_exc() + return False + + +if __name__ == "__main__": + print("Multi-Language Configuration Test") + print("=" * 60) + + success = True + + # Test 1: Configuration loading + if not test_config_loading(): + success = False + + # Test 2: Multi-language query builder + if not test_multilang_query_builder(): + success = False + + print("\n" + "=" * 60) + if success: + print("✓ All tests passed!") + sys.exit(0) + else: + print("✗ Some tests failed") + sys.exit(1) + diff --git a/当前开发进度.md b/当前开发进度.md new file mode 100644 index 0000000..0d3fcbd --- /dev/null +++ b/当前开发进度.md @@ -0,0 +1,53 @@ + + +对后端搜索技术 做通用化。 + +通用化的本质 是 对于各种业务数据、各种检索需求,都可以 用少量定制+配置化 来实现效果。 + +## 1. 原始数据层的约定。 +### 店匠主表 +shoplazza_product_sku +shoplazza_product_spu +所有租户共用这个主表 + +### 每个租户的辅表 +各个租户,有自己的扩展表。 入索引的时候,商品主表 shoplazza_product_sku 的 id + shopid,拼接租户自己单独的扩展表(比如可以放一些自己的属性体系、各种语言的商品名、品牌名、标签、分类等) + +但是,各个租户,可能有不一样的业务数据,比如不同租户有不同的属性的体系、不同语言的商品标题(一般至少有中英文两种满足跨境的搜索需求),有不同的权重(提权)字段、业务过滤和聚合字段。 +能够统一的 只能是 sku表 按照一套配置规范、做一个配置文件,按照配置文件建设ES mapping结构以及做数据的入库。 + +1. 应用结构配置 : 定义了ES的输入数据有哪些字段、关联mysql的哪些字段. + 请帮我补充具体实现的一些配置 + + +2. 索引结构配置 : 定义了ES的字段,每个字段的索引mapping配置,支持各个域的查询,包括默认的域的查询。索引配置预定义好了一堆分析方式 + 请帮我补充具体实现的一些配置 + +## 测试数据灌入 + +灌入数据、mysql到ES的自动同步,不在本项目的范围内,但是,该项目 为了提供测试数据,需要 构造一个实例 customer1. +我们为他构造一套应用配置和索引配置。 +暂时是随机抽了我们自己的1w数据,建设辅助表,然后写一个程序,将数据分别灌入主表和辅表。 + +请帮我补充具体,当前测试数据灌入的具体的配置和方式,比如辅助表的内容 对应的应用结构配置 索引配置 等等。 + +## queryParser + +1. 查询改写。 配置词典的key是query,value是改写后的查询表达式,比如品牌词 改写为 brand|query OR name|query,类别词、标签词等都可以放进去。纠错、规范化、查询改写等 都可以通过这个词典来配置。 +2. 
翻译。配置需要得到的几种目标语言。 在customer1测试案例中,我们配置 zh en两种语言。先对query做语言检测,如果query是中文那么要翻译一下en,如果是en那么要翻译zh,如果两者都不是那么zh en都需要翻译。 +3. 如果配置打开了text_embedding查询,并且query 包含了default域的查询,那么要把default域的查询词转向量,后面searcher会用这个向量参与查询。 + +也帮我补充一些具体实现情况 + +## searcher + +支持多种检索表达式: +支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 + +## default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 + +暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 + +也帮我补充一些具体实现情况 + diff --git a/支持多语言查询.md b/支持多语言查询.md new file mode 100644 index 0000000..fe93dea --- /dev/null +++ b/支持多语言查询.md @@ -0,0 +1,196 @@ + + + + +这次修改没改完。 + + +diff --git a/HighLevelDesign.md b/HighLevelDesign.md +index 397a9f7..3e728c9 100644 +--- a/HighLevelDesign.md ++++ b/HighLevelDesign.md +@@ -112,10 +112,9 @@ if response.status_code == 200: + 支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。 + + default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。 +-多语言搜索: +-对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 + + 暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分) ++bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。 + bm25打分(base_query): + "multi_match": { + "query": search_query, +diff --git a/config/config_loader.py b/config/config_loader.py +index 8df15b3..f3fcaa3 100644 +--- a/config/config_loader.py ++++ b/config/config_loader.py +@@ -27,6 +27,9 @@ class IndexConfig: + boost: float = 1.0 + example: Optional[str] = None + ++ # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]} ++ language_field_mapping: Optional[Dict[str, List[str]]] = None ++ + + @dataclass + class RankingConfig: +@@ -66,8 +69,6 @@ class CustomerConfig: + + # Database settings + mysql_config: Dict[str, Any] +- main_table: str = "shoplazza_product_sku" +- extension_table: 
Optional[str] = None + + # Field definitions + fields: List[FieldConfig] +@@ -86,6 +87,10 @@ class CustomerConfig: + + # ES index settings + es_index_name: str ++ ++ # Optional fields with defaults ++ main_table: str = "shoplazza_product_sku" ++ extension_table: Optional[str] = None + es_settings: Dict[str, Any] = field(default_factory=dict) + + +@@ -228,13 +233,17 @@ class ConfigLoader: + if analyzer_str not in ANALYZER_MAP: + raise ConfigurationError(f"Unknown analyzer: {analyzer_str}") + ++ # Parse language field mapping if present ++ language_field_mapping = index_data.get("language_field_mapping") ++ + return IndexConfig( + name=index_data["name"], + label=index_data.get("label", index_data["name"]), + fields=index_data["fields"], + analyzer=ANALYZER_MAP[analyzer_str], + boost=index_data.get("boost", 1.0), +- example=index_data.get("example") ++ example=index_data.get("example"), ++ language_field_mapping=language_field_mapping + ) + + def validate_config(self, config: CustomerConfig) -> List[str]: +@@ -360,11 +369,16 @@ class ConfigLoader: + + def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]: + """Convert IndexConfig to dictionary.""" +- return { ++ result = { + "name": index.name, + "label": index.label, + "fields": index.fields, + "analyzer": index.analyzer.value, + "boost": index.boost, + "example": index.example +- } +\ No newline at end of file ++ } ++ ++ if index.language_field_mapping: ++ result["language_field_mapping"] = index.language_field_mapping ++ ++ return result +\ No newline at end of file +diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml +index bfe2e53..84e9ba1 100644 +--- a/config/schema/customer1_config.yaml ++++ b/config/schema/customer1_config.yaml +@@ -177,6 +177,15 @@ indexes: + analyzer: "chinese_ecommerce" + boost: 1.0 + example: 'query=default:"消防套"' ++ language_field_mapping: ++ zh: ++ - "name" ++ - "categoryName" ++ - "brandName" ++ en: ++ - "enSpuName" ++ ru: ++ - "ruSkuName" + 
+ - name: "title" + label: "标题索引" +@@ -187,6 +196,13 @@ indexes: + analyzer: "chinese_ecommerce" + boost: 2.0 + example: 'query=title:"芭比娃娃"' ++ language_field_mapping: ++ zh: ++ - "name" ++ en: ++ - "enSpuName" ++ ru: ++ - "ruSkuName" + + - name: "category" + label: "类目索引" +diff --git a/search/searcher.py b/search/searcher.py +index a7088ec..0a798ed 100644 +--- a/search/searcher.py ++++ b/search/searcher.py +@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery + from indexer import MappingGenerator + from .boolean_parser import BooleanParser, QueryNode + from .es_query_builder import ESQueryBuilder ++from .multilang_query_builder import MultiLanguageQueryBuilder + from .ranking_engine import RankingEngine + + +@@ -86,10 +87,10 @@ class Searcher: + self.text_embedding_field = mapping_gen.get_text_embedding_field() + self.image_embedding_field = mapping_gen.get_image_embedding_field() + +- # Query builder +- self.query_builder = ESQueryBuilder( ++ # Query builder - use multi-language version ++ self.query_builder = MultiLanguageQueryBuilder( ++ config=config, + index_name=config.es_index_name, +- match_fields=self.match_fields, + text_embedding_field=self.text_embedding_field, + image_embedding_field=self.image_embedding_field + ) +@@ -144,11 +145,10 @@ class Searcher: + query_text = parsed_query.rewritten_query + print(f"[Searcher] Parsed boolean expression: {query_node}") + +- # Step 3: Build ES query +- es_query = self.query_builder.build_query( +- query_text=query_text, ++ # Step 3: Build ES query using multi-language builder ++ es_query = self.query_builder.build_multilang_query( ++ parsed_query=parsed_query, + query_vector=parsed_query.query_vector if enable_embedding else None, +- query_node=query_node, + filters=filters, + size=size, + from_=from_, +@@ -325,6 +325,15 @@ class Searcher: + query_info={'image_url': image_url, 'search_type': 'image_similarity'} + ) + ++ def get_domain_summary(self) -> Dict[str, Any]: ++ """ ++ Get summary of all 
configured domains. ++ ++ Returns: ++ Dictionary with domain information ++ """ ++ return self.query_builder.get_domain_summary() ++ + def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: + """ + Get single document by ID. + + + + + + + + -- libgit2 0.21.2