b926f678
tangwang
多语言查询
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
注意:这次修改尚未完成,以下 diff 仅为阶段性改动。
diff --git a/HighLevelDesign.md b/HighLevelDesign.md
index 397a9f7..3e728c9 100644
--- a/HighLevelDesign.md
+++ b/HighLevelDesign.md
@@ -112,10 +112,9 @@ if response.status_code == 200:
支持多种匹配方式,如AND、OR、RANK、NOTAND以及(),优先级从高到低为(),ANDNOT,AND,OR,RANK。
default域的相关性,是代码里面单独计算,是特定的深度定制优化的,暂时不做配置化。
-多语言搜索:
-对外提供的服务 用default域搜索就行,但是内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
暂时具体实现为 bm25()+0.2*text_embedding_relevence(也就是knn检索表达式的打分)
+bm25() 包括多语言的打分:内部需要通过配置翻译为多种语言(配置几种目标语言 默认中文、英文,并且设置对应的检索域),然后分别到对应的字段搜索,中文字段到配置的中文title搜索,英文到对应的英文title搜索。
|
bb52dba6
tangwang
API接口设计优化:
|
21
|
```text
|
b926f678
tangwang
多语言查询
|
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
bm25打分(base_query):
"multi_match": {
"query": search_query,
diff --git a/config/config_loader.py b/config/config_loader.py
index 8df15b3..f3fcaa3 100644
--- a/config/config_loader.py
+++ b/config/config_loader.py
@@ -27,6 +27,9 @@ class IndexConfig:
boost: float = 1.0
example: Optional[str] = None
+ # Multi-language field mapping: {"zh": ["name"], "en": ["enSpuName"], "ru": ["ruSkuName"]}
+ language_field_mapping: Optional[Dict[str, List[str]]] = None
+
@dataclass
class RankingConfig:
|
ae5a294d
tangwang
命名修改、代码清理
|
39
|
@@ -66,8 +69,6 @@ class TenantConfig:
|
b926f678
tangwang
多语言查询
|
40
41
42
43
44
45
46
47
|
# Database settings
mysql_config: Dict[str, Any]
- main_table: str = "shoplazza_product_sku"
- extension_table: Optional[str] = None
# Field definitions
fields: List[FieldConfig]
|
ae5a294d
tangwang
命名修改、代码清理
|
48
|
@@ -86,6 +87,10 @@ class TenantConfig:
|
b926f678
tangwang
多语言查询
|
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
# ES index settings
es_index_name: str
+
+ # Optional fields with defaults
+ main_table: str = "shoplazza_product_sku"
+ extension_table: Optional[str] = None
es_settings: Dict[str, Any] = field(default_factory=dict)
@@ -228,13 +233,17 @@ class ConfigLoader:
if analyzer_str not in ANALYZER_MAP:
raise ConfigurationError(f"Unknown analyzer: {analyzer_str}")
+ # Parse language field mapping if present
+ language_field_mapping = index_data.get("language_field_mapping")
+
return IndexConfig(
name=index_data["name"],
label=index_data.get("label", index_data["name"]),
fields=index_data["fields"],
analyzer=ANALYZER_MAP[analyzer_str],
boost=index_data.get("boost", 1.0),
- example=index_data.get("example")
+ example=index_data.get("example"),
+ language_field_mapping=language_field_mapping
)
|
ae5a294d
tangwang
命名修改、代码清理
|
77
|
def validate_config(self, config: TenantConfig) -> List[str]:
|
b926f678
tangwang
多语言查询
|
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
@@ -360,11 +369,16 @@ class ConfigLoader:
def _index_to_dict(self, index: IndexConfig) -> Dict[str, Any]:
"""Convert IndexConfig to dictionary."""
- return {
+ result = {
"name": index.name,
"label": index.label,
"fields": index.fields,
"analyzer": index.analyzer.value,
"boost": index.boost,
"example": index.example
- }
\ No newline at end of file
+ }
+
+ if index.language_field_mapping:
+ result["language_field_mapping"] = index.language_field_mapping
+
+ return result
\ No newline at end of file
|
ae5a294d
tangwang
命名修改、代码清理
|
99
|
diff --git a/config/schema/tenant1_config.yaml b/config/schema/tenant1_config.yaml
|
b926f678
tangwang
多语言查询
|
100
|
index bfe2e53..84e9ba1 100644
|
ae5a294d
tangwang
命名修改、代码清理
|
101
102
|
--- a/config/schema/tenant1_config.yaml
+++ b/config/schema/tenant1_config.yaml
|
b926f678
tangwang
多语言查询
|
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
|
@@ -177,6 +177,15 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 1.0
example: 'query=default:"消防套"'
+ language_field_mapping:
+ zh:
+ - "name"
+ - "categoryName"
+ - "brandName"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "title"
label: "标题索引"
@@ -187,6 +196,13 @@ indexes:
analyzer: "chinese_ecommerce"
boost: 2.0
example: 'query=title:"芭比娃娃"'
+ language_field_mapping:
+ zh:
+ - "name"
+ en:
+ - "enSpuName"
+ ru:
+ - "ruSkuName"
- name: "category"
label: "类目索引"
diff --git a/search/searcher.py b/search/searcher.py
index a7088ec..0a798ed 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -13,6 +13,7 @@ from query import QueryParser, ParsedQuery
from indexer import MappingGenerator
from .boolean_parser import BooleanParser, QueryNode
from .es_query_builder import ESQueryBuilder
+from .multilang_query_builder import MultiLanguageQueryBuilder
from .ranking_engine import RankingEngine
@@ -86,10 +87,10 @@ class Searcher:
self.text_embedding_field = mapping_gen.get_text_embedding_field()
self.image_embedding_field = mapping_gen.get_image_embedding_field()
- # Query builder
- self.query_builder = ESQueryBuilder(
+ # Query builder - use multi-language version
+ self.query_builder = MultiLanguageQueryBuilder(
+ config=config,
index_name=config.es_index_name,
- match_fields=self.match_fields,
text_embedding_field=self.text_embedding_field,
image_embedding_field=self.image_embedding_field
)
@@ -144,11 +145,10 @@ class Searcher:
query_text = parsed_query.rewritten_query
print(f"[Searcher] Parsed boolean expression: {query_node}")
- # Step 3: Build ES query
- es_query = self.query_builder.build_query(
- query_text=query_text,
+ # Step 3: Build ES query using multi-language builder
+ es_query = self.query_builder.build_multilang_query(
+ parsed_query=parsed_query,
query_vector=parsed_query.query_vector if enable_embedding else None,
- query_node=query_node,
filters=filters,
size=size,
from_=from_,
@@ -325,6 +325,15 @@ class Searcher:
query_info={'image_url': image_url, 'search_type': 'image_similarity'}
)
+ def get_domain_summary(self) -> Dict[str, Any]:
+ """
+ Get summary of all configured domains.
+
+ Returns:
+ Dictionary with domain information
+ """
+ return self.query_builder.get_domain_summary()
+
def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
"""
Get single document by ID.
|
bb52dba6
tangwang
API接口设计优化:
|
190
|
```
|
b926f678
tangwang
多语言查询
|
|
|