Blame view

支持多语言查询.md 6.62 KB
b926f678   tangwang   多语言查询
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
  
  
  
  
  注意:本次修改尚未完成,下面的 diff 是未完成的中间状态。
  
  
  diff --git a/HighLevelDesign.md b/HighLevelDesign.md
  index 397a9f7..3e728c9 100644
  --- a/HighLevelDesign.md
  +++ b/HighLevelDesign.md
  @@ -112,10 +112,9 @@ if response.status_code == 200:
   支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。
   
   default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。
  -多语言搜索:
  -对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
   
   暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分)
  +bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
bb52dba6   tangwang   API接口设计优化:
21
  ```text
b926f678   tangwang   多语言查询
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
   bm25打分(base_query):
   "multi_match": {
       "query": search_query,
  diff --git a/config/config_loader.py b/config/config_loader.py
  index 8df15b3..f3fcaa3 100644
  --- a/config/config_loader.py
  +++ b/config/config_loader.py
  @@ -27,6 +27,9 @@ class IndexConfig:
       boost: float = 1.0
       example: Optional[str] = None
   
  +    # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
  +    language_field_mapping: Optional[Dict[str, List[str]]] = None
  +
   
   @dataclass
   class RankingConfig:
ae5a294d   tangwang   命名修改、代码清理
39
  @@ -66,8 +69,6 @@ class TenantConfig:
b926f678   tangwang   多语言查询
40
41
42
43
44
45
46
47
   
       # Database settings
       mysql_config: Dict[str, Any]
  -    main_table: str = "shoplazza_product_sku"
  -    extension_table: Optional[str] = None
   
       # Field definitions
       fields: List[FieldConfig]
ae5a294d   tangwang   命名修改、代码清理
48
  @@ -86,6 +87,10 @@ class TenantConfig:
b926f678   tangwang   多语言查询
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
   
       # ES index settings
       es_index_name: str
  +
  +    # Optional fields with defaults
  +    main_table: str = "shoplazza_product_sku"
  +    extension_table: Optional[str] = None
       es_settings: Dict[str, Any] = field(default_factory=dict)
   
   
  @@ -228,13 +233,17 @@ class ConfigLoader:
           if analyzer_str not in ANALYZER_MAP:
               raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")
   
  +        # Parse language field mapping if present
  +        language_field_mapping = index_data.get("language_field_mapping")
  +
           return IndexConfig(
               name=index_data["name"],
               label=index_data.get("label", index_data["name"]),
               fields=index_data["fields"],
               analyzer=ANALYZER_MAP[analyzer_str],
               boost=index_data.get("boost", 1.0),
  -            example=index_data.get("example")
  +            example=index_data.get("example"),
  +            language_field_mapping=language_field_mapping
           )
   
ae5a294d   tangwang   命名修改、代码清理
77
       def validate_config(self, config: TenantConfig) -> List[str]:
b926f678   tangwang   多语言查询
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  @@ -360,11 +369,16 @@ class ConfigLoader:
   
       def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
           """Convert IndexConfig to dictionary."""
  -        return {
  +        result = {
               "name": index.name,
               "label": index.label,
               "fields": index.fields,
               "analyzer": index.analyzer.value,
               "boost": index.boost,
               "example": index.example
  -        }
  \ No newline at end of file
  +        }
  +
  +        if index.language_field_mapping:
  +            result["language_field_mapping"] = index.language_field_mapping
  +
  +        return result
  \ No newline at end of file
ae5a294d   tangwang   命名修改、代码清理
99
  diff --git a/config/schema/tenant1_config.yaml b/config/schema/tenant1_config.yaml
b926f678   tangwang   多语言查询
100
  index bfe2e53..84e9ba1 100644
ae5a294d   tangwang   命名修改、代码清理
101
102
  --- a/config/schema/tenant1_config.yaml
  +++ b/config/schema/tenant1_config.yaml
b926f678   tangwang   多语言查询
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
  @@ -177,6 +177,15 @@ indexes:
       analyzer: "chinese_ecommerce"
       boost: 1.0
       example: 'query=default:"消防套"'
  +    language_field_mapping:
  +      zh:
  +        - "name"
  +        - "categoryName"
  +        - "brandName"
  +      en:
  +        - "enSpuName"
  +      ru:
  +        - "ruSkuName"
   
     - name: "title"
       label: "标题索引"
  @@ -187,6 +196,13 @@ indexes:
       analyzer: "chinese_ecommerce"
       boost: 2.0
       example: 'query=title:"芭比娃娃"'
  +    language_field_mapping:
  +      zh:
  +        - "name"
  +      en:
  +        - "enSpuName"
  +      ru:
  +        - "ruSkuName"
   
     - name: "category"
       label: "类目索引"
  diff --git a/search/searcher.py b/search/searcher.py
  index a7088ec..0a798ed 100644
  --- a/search/searcher.py
  +++ b/search/searcher.py
  @@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery
   from indexer import MappingGenerator
   from .boolean_parser import BooleanParser, QueryNode
   from .es_query_builder import ESQueryBuilder
  +from .multilang_query_builder import MultiLanguageQueryBuilder
   from .ranking_engine import RankingEngine
   
   
  @@ -86,10 +87,10 @@ class Searcher:
           self.text_embedding_field = mapping_gen.get_text_embedding_field()
           self.image_embedding_field = mapping_gen.get_image_embedding_field()
   
  -        # Query builder
  -        self.query_builder = ESQueryBuilder(
  +        # Query builder - use multi-language version
  +        self.query_builder = MultiLanguageQueryBuilder(
  +            config=config,
               index_name=config.es_index_name,
  -            match_fields=self.match_fields,
               text_embedding_field=self.text_embedding_field,
               image_embedding_field=self.image_embedding_field
           )
  @@ -144,11 +145,10 @@ class Searcher:
               query_text = parsed_query.rewritten_query
               print(f"[Searcher] Parsed boolean expression: {query_node}")
   
  -        # Step 3: Build ES query
  -        es_query = self.query_builder.build_query(
  -            query_text=query_text,
  +        # Step 3: Build ES query using multi-language builder
  +        es_query = self.query_builder.build_multilang_query(
  +            parsed_query=parsed_query,
               query_vector=parsed_query.query_vector if enable_embedding else None,
  -            query_node=query_node,
               filters=filters,
               size=size,
               from_=from_,
  @@ -325,6 +325,15 @@ class Searcher:
               query_info={'image_url': image_url, 'search_type': 'image_similarity'}
           )
   
  +    def get_domain_summary(self) -> Dict[str, Any]:
  +        """
  +        Get summary of all configured domains.
  +
  +        Returns:
  +            Dictionary with domain information
  +        """
  +        return self.query_builder.get_domain_summary()
  +
       def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
           """
           Get single document by ID.
bb52dba6   tangwang   API接口设计优化:
190
  ```
b926f678   tangwang   多语言查询