Commit 70dab99fcf4c5e65ff36ecd2de434c7f8c0f6346
1 parent
92d5eb07
add logs
Showing 9 changed files with 462 additions and 61 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -118,10 +118,13 @@ query_config: |
| 118 | 118 | # 返回字段配置(_source includes) |
| 119 | 119 | # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 |
| 120 | 120 | source_fields: null |
| 121 | + | |
| 122 | + # KNN boost配置(向量召回的boost值) | |
| 123 | + knn_boost: 0.25 # Lower boost for embedding recall | |
| 121 | 124 | |
| 122 | 125 | # Ranking Configuration(排序配置) |
| 123 | 126 | ranking: |
| 124 | - expression: "bm25() + 0.2*text_embedding_relevance()" | |
| 127 | + expression: "bm25() + 0.25*text_embedding_relevance()" | |
| 125 | 128 | description: "BM25 text relevance combined with semantic embedding similarity" |
| 126 | 129 | |
| 127 | 130 | # Function Score配置(ES层打分规则) | ... | ... |
config/config_loader.py
| ... | ... | @@ -56,6 +56,9 @@ class QueryConfig: |
| 56 | 56 | |
| 57 | 57 | # Source fields configuration |
| 58 | 58 | source_fields: Optional[List[str]] = None |
| 59 | + | |
| 60 | + # KNN boost configuration | |
| 61 | + knn_boost: float = 0.25 # Boost value for KNN (embedding recall) | |
| 59 | 62 | |
| 60 | 63 | |
| 61 | 64 | @dataclass |
| ... | ... | @@ -241,7 +244,8 @@ class ConfigLoader: |
| 241 | 244 | image_embedding_field=query_config_data.get("image_embedding_field"), |
| 242 | 245 | embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), |
| 243 | 246 | embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), |
| 244 | - source_fields=query_config_data.get("source_fields") | |
| 247 | + source_fields=query_config_data.get("source_fields"), | |
| 248 | + knn_boost=query_config_data.get("knn_boost", 0.25) | |
| 245 | 249 | ) |
| 246 | 250 | |
| 247 | 251 | # Parse ranking config | ... | ... |
| ... | ... | @@ -0,0 +1,361 @@ |
| 1 | +# 亚马逊格式到店匠格式转换 - 核心工作内容分析 | |
| 2 | + | |
| 3 | +## 一、概述 | |
| 4 | + | |
| 5 | +本项目实现了从**亚马逊格式Excel数据**到**店匠(Shoplazza)商品导入模板**的格式转换,主要处理商品的多款式(变体)结构和属性字段映射。 | |
| 6 | + | |
| 7 | +**核心脚本**:`scripts/amazon_xlsx_to_shoplazza_xlsx.py` | |
| 8 | + | |
| 9 | +--- | |
| 10 | + | |
| 11 | +## 二、父子款式处理(M/P/S 结构转换) | |
| 12 | + | |
| 13 | +### 2.1 输入格式(亚马逊) | |
| 14 | + | |
| 15 | +- **ASIN**:变体ID(SKU级别) | |
| 16 | +- **父ASIN**:父商品ID(SPU级别) | |
| 17 | +- 一个父ASIN可以包含多个ASIN(多个变体) | |
| 18 | + | |
| 19 | +### 2.2 输出格式(店匠) | |
| 20 | + | |
| 21 | +店匠模板定义了三种商品属性类型: | |
| 22 | + | |
| 23 | +1. **S(单一款式)**:只有一个变体的商品 | |
| 24 | + - 输出:**1行** | |
| 25 | + - 包含所有商品信息(标题、价格、库存等) | |
| 26 | + | |
| 27 | +2. **M(主商品)+ P(子款式)**:包含多个变体的商品 | |
| 28 | + - 输出:**1行M + N行P** | |
| 29 | + - **关键约束**:同一商品的P行必须紧跟在M行后面(模板导入强约束) | |
| 30 | + | |
| 31 | +### 2.3 转换策略 | |
| 32 | + | |
| 33 | +```python | |
| 34 | +# 核心逻辑(简化版) | |
| 35 | +for 父ASIN in 所有父ASIN: | |
| 36 | + variants = 获取该父ASIN下的所有ASIN | |
| 37 | + | |
| 38 | + if len(variants) == 1: | |
| 39 | + 生成 S 行(单一款式) | |
| 40 | + else: | |
| 41 | + 生成 M 行(主商品)+ 多个 P 行(子款式) | |
| 42 | +``` | |
| 43 | + | |
| 44 | +### 2.4 关键处理点 | |
| 45 | + | |
| 46 | +#### 1. 父ASIN排序 | |
| 47 | +- 确保父ASIN对应的变体在列表最前面 | |
| 48 | +- 如果找不到父ASIN对应的变体,根据配置决定是否丢弃整个SPU | |
| 49 | + | |
| 50 | +#### 2. 标题一致性检查 | |
| 51 | +- 同一SPU下的所有变体标题必须一致 | |
| 52 | +- 如果发现不一致: | |
| 53 | + - 选项1:丢弃标题不一致的SKU(默认) | |
| 54 | + - 选项2:修正为统一的主商品标题 | |
| 55 | + | |
| 56 | +#### 3. M行与P行的字段分工 | |
| 57 | + | |
| 58 | +**M行(主商品)填写**: | |
| 59 | +- ✅ 商品标题、描述、SEO信息 | |
| 60 | +- ✅ 专辑、标签、供应商信息 | |
| 61 | +- ✅ 商品主图 | |
| 62 | +- ✅ 款式维度名(款式1/2/3的key) | |
| 63 | +- ❌ 不填:价格、库存、重量等SKU级字段(保持为空更安全) | |
| 64 | + | |
| 65 | +**P行(子款式)填写**: | |
| 66 | +- ✅ 商品标题(与M行一致) | |
| 67 | +- ✅ 款式维度值(款式1/2/3的value) | |
| 68 | +- ✅ 价格、商品SKU(ASIN)、库存 | |
| 69 | +- ✅ 重量、尺寸 | |
| 70 | +- ✅ 子款式图(可选) | |
| 71 | +- ❌ 不填:描述、SEO、专辑等SPU级字段(保持为空) | |
| 72 | + | |
| 73 | +--- | |
| 74 | + | |
| 75 | +## 三、属性字段处理(款式维度解析) | |
| 76 | + | |
| 77 | +### 3.1 问题背景 | |
| 78 | + | |
| 79 | +亚马逊格式中,变体的"颜色/尺码"等信息**并不拆成多个列**,而是集中在 `SKU` 字符串里: | |
| 80 | + | |
| 81 | +``` | |
| 82 | +示例1: "Size: One Size | Color: Black" | |
| 83 | +示例2: "Color: Red | Style: 2-Pack" | |
| 84 | +``` | |
| 85 | + | |
| 86 | +而店匠模板需要: | |
| 87 | +- **M行**:`款式1/款式2/款式3` 填写**维度名**(如 Size、Color、Material) | |
| 88 | +- **P行**:`款式1/款式2/款式3` 填写**维度值**(如 One Size、Black、Cotton) | |
| 89 | + | |
| 90 | +### 3.2 SKU解析逻辑 | |
| 91 | + | |
| 92 | +```python | |
| 93 | +def parse_sku_options(sku_text): | |
| 94 | + """ | |
| 95 | + 解析 SKU 列,提取 key:value 对 | |
| 96 | + 输入: "Size: One Size | Color: Black" | |
| 97 | + 输出: {"Size": "One Size", "Color": "Black"} | |
| 98 | + """ | |
| 99 | + # 1. 按 | 分割 | |
| 100 | + parts = sku_text.split("|") | |
| 101 | + | |
| 102 | + # 2. 按 : 拆成 key/value,写入 result(简化示例:此处省略了 result = {} 的初始化) | |
| 103 | + for part in parts: | |
| 104 | + if ":" in part: | |
| 105 | + key, value = part.split(":", 1) | |
| 106 | + result[key.strip()] = value.strip() | |
| 107 | + | |
| 108 | + return result | |
| 109 | +``` | |
| 110 | + | |
| 111 | +### 3.3 维度选择策略(最多3个维度) | |
| 112 | + | |
| 113 | +店匠模板只提供 `款式1~3` 三个维度,因此需要从多个变体中**智能选择最多3个维度**: | |
| 114 | + | |
| 115 | +#### 优先级规则 | |
| 116 | + | |
| 117 | +1. **按预设优先级排序**: | |
| 118 | + ```python | |
| 119 | + PREFERRED_OPTION_KEYS = [ | |
| 120 | + "Size", "Color", "Style", "Pattern", "Material", | |
| 121 | + "Flavor", "Scent", "Pack", "Pack of", ... | |
| 122 | + ] | |
| 123 | + ``` | |
| 124 | + | |
| 125 | +2. **按出现频次排序**:统计每个key在所有变体中的出现次数 | |
| 126 | + | |
| 127 | +3. **综合排序**: | |
| 128 | + ```python | |
| 129 | + def key_sort(k): | |
| 130 | + return ( | |
| 131 | + 预设优先级(越小越优先), | |
| 132 | + -出现频次(越大越优先), | |
| 133 | + 字母顺序(作为最后排序依据) | |
| 134 | + ) | |
| 135 | + ``` | |
| 136 | + | |
| 137 | +#### 退化处理 | |
| 138 | + | |
| 139 | +如果解析不到任何 key/value,则退化为单维度: | |
| 140 | +- M行:`款式1 = "Variant"` | |
| 141 | +- P行:`款式1 = ASIN`(使用ASIN作为维度值) | |
| 142 | + | |
| 143 | +### 3.4 维度映射示例 | |
| 144 | + | |
| 145 | +**输入数据**(3个变体): | |
| 146 | +``` | |
| 147 | +变体1: SKU = "Size: S | Color: Red" | |
| 148 | +变体2: SKU = "Size: M | Color: Red" | |
| 149 | +变体3: SKU = "Size: S | Color: Blue" | |
| 150 | +``` | |
| 151 | + | |
| 152 | +**解析结果**: | |
| 153 | +- 维度统计:Size出现3次,Color出现3次 | |
| 154 | +- 选择维度:Size(优先级1)、Color(优先级2) | |
| 155 | +- 上限为3个维度,此处只解析出2个,因此全部选用 | |
| 156 | + | |
| 157 | +**输出格式**: | |
| 158 | +``` | |
| 159 | +M行: | |
| 160 | + 款式1 = "Size" | |
| 161 | + 款式2 = "Color" | |
| 162 | + 款式3 = "" | |
| 163 | + | |
| 164 | +P行1: | |
| 165 | + 款式1 = "S" | |
| 166 | + 款式2 = "Red" | |
| 167 | + 款式3 = "" | |
| 168 | + | |
| 169 | +P行2: | |
| 170 | + 款式1 = "M" | |
| 171 | + 款式2 = "Red" | |
| 172 | + 款式3 = "" | |
| 173 | + | |
| 174 | +P行3: | |
| 175 | + 款式1 = "S" | |
| 176 | + 款式2 = "Blue" | |
| 177 | + 款式3 = "" | |
| 178 | +``` | |
| 179 | + | |
| 180 | +--- | |
| 181 | + | |
| 182 | +## 四、字段映射总览 | |
| 183 | + | |
| 184 | +### 4.1 核心字段映射 | |
| 185 | + | |
| 186 | +| 店匠字段 | 亚马逊字段 | 处理逻辑 | | |
| 187 | +|---------|-----------|---------| | |
| 188 | +| **商品spu** | `父ASIN` | 无父ASIN则用ASIN | | |
| 189 | +| **商品SKU** | `ASIN` | 直接映射 | | |
| 190 | +| **商品标题*** | `商品标题` | 截断至255字符 | | |
| 191 | +| **商品图片*** | `商品主图` | URL直接映射 | | |
| 192 | +| **商品售价*** | `prime价格($)` 或 `价格($)` | 优先prime价格 | | |
| 193 | +| **创建时间** | `上架时间` | 日期格式转换(补齐时分秒) | | |
| 194 | +| **商品描述** | `商品标题` + `详细参数` | HTML拼接 | | |
| 195 | +| **专辑名称** | `大类目` | 无则取`类目路径`第一段 | | |
| 196 | +| **标签** | `品牌,大类目,小类目` | 逗号拼接 | | |
| 197 | +| **商品重量/重量单位** | `商品重量(单位换算)` | 解析数值和单位(g/kg/lb/oz) | | |
| 198 | +| **尺寸信息** | `商品尺寸` | 解析前三段数字,拼成 `L,W,H`(英寸) | | |
| 199 | + | |
| 200 | +### 4.2 特殊字段处理 | |
| 201 | + | |
| 202 | +#### 1. 价格处理 | |
| 203 | +```python | |
| 204 | +price = prime价格($) or 价格($) or 9.99 # 默认值9.99 | |
| 205 | +``` | |
| 206 | + | |
| 207 | +#### 2. 库存处理 | |
| 208 | +- 亚马逊数据源通常**没有库存** | |
| 209 | +- 脚本默认给每个变体固定库存:**100** | |
| 210 | + | |
| 211 | +#### 3. 日期格式转换 | |
| 212 | +```python | |
| 213 | +输入: "2018-05-09" 或 datetime对象 | |
| 214 | +输出: "2018-05-09 00:00:00" # 补齐时分秒 | |
| 215 | +``` | |
| 216 | + | |
| 217 | +#### 4. 重量解析 | |
| 218 | +```python | |
| 219 | +输入: "68.04 g" 或 "0.15 pounds" | |
| 220 | +输出: (68.04, "g") 或 (0.15, "lb") | |
| 221 | +``` | |
| 222 | + | |
| 223 | +#### 5. 尺寸解析 | |
| 224 | +```python | |
| 225 | +输入: "7.9 x 7.9 x 2 inches" | |
| 226 | +输出: "7.9,7.9,2" # L,W,H格式 | |
| 227 | +``` | |
| 228 | + | |
| 229 | +#### 6. SEO URL Handle生成 | |
| 230 | +```python | |
| 231 | +输入: "Legendary Whitetails Men's Buck Camp Flannel Shirt" | |
| 232 | +输出: "products/legendary-whitetails-mens-buck-camp-flannel-shirt" | |
| 233 | +# 规则:转小写、去特殊字符、空格转横线,并加上 products/ 前缀 | |
| 234 | +``` | |
| 235 | + | |
| 236 | +--- | |
| 237 | + | |
| 238 | +## 五、重要工作内容总结 | |
| 239 | + | |
| 240 | +### 5.1 数据结构转换 | |
| 241 | + | |
| 242 | +1. **父子关系识别**:从扁平化的ASIN列表识别出SPU-SKU层级关系 | |
| 243 | +2. **分组策略**:按父ASIN分组,决定生成S还是M+P结构 | |
| 244 | +3. **行序保证**:确保同一SPU的M+P行连续,不被打断 | |
| 245 | + | |
| 246 | +### 5.2 属性字段解析 | |
| 247 | + | |
| 248 | +1. **SKU字符串解析**:从非结构化字符串中提取key:value对 | |
| 249 | +2. **维度智能选择**:从多个可能的维度中选择最重要的3个 | |
| 250 | +3. **优先级算法**:综合考虑预设优先级、出现频次、字母顺序 | |
| 251 | + | |
| 252 | +### 5.3 字段映射与转换 | |
| 253 | + | |
| 254 | +1. **格式转换**:日期、价格、重量、尺寸等格式标准化 | |
| 255 | +2. **默认值填充**:库存、价格等缺失字段的默认值策略 | |
| 256 | +3. **数据清洗**:标题截断、HTML转义、URL生成等 | |
| 257 | + | |
| 258 | +### 5.4 数据质量控制 | |
| 259 | + | |
| 260 | +1. **标题一致性检查**:确保同一SPU下所有变体标题一致 | |
| 261 | +2. **父ASIN验证**:检查父ASIN是否存在于变体列表中 | |
| 262 | +3. **错误处理**:提供配置选项决定是修正还是丢弃异常数据 | |
| 263 | + | |
| 264 | +### 5.5 性能优化 | |
| 265 | + | |
| 266 | +1. **批量处理**:支持多文件批量转换 | |
| 267 | +2. **文件拆分**:按最大行数自动拆分输出文件(保证同一SPU不拆分) | |
| 268 | +3. **快速读写**:使用`iter_rows(values_only=True)`和write_only模式提升性能 | |
| 269 | + | |
| 270 | +--- | |
| 271 | + | |
| 272 | +## 六、关键技术难点 | |
| 273 | + | |
| 274 | +### 6.1 维度选择算法 | |
| 275 | + | |
| 276 | +**挑战**:从多个变体的SKU字符串中,智能选择最重要的3个维度 | |
| 277 | + | |
| 278 | +**解决方案**: | |
| 279 | +- 统计所有变体中每个key的出现频次 | |
| 280 | +- 结合预设优先级列表(Size > Color > Style > ...) | |
| 281 | +- 综合排序选择前3个 | |
| 282 | + | |
| 283 | +### 6.2 数据一致性保证 | |
| 284 | + | |
| 285 | +**挑战**:确保同一SPU下的所有变体数据一致 | |
| 286 | + | |
| 287 | +**解决方案**: | |
| 288 | +- 标题一致性检查和修正 | |
| 289 | +- 父ASIN排序保证 | |
| 290 | +- M行和P行的字段分工明确 | |
| 291 | + | |
| 292 | +### 6.3 文件拆分策略 | |
| 293 | + | |
| 294 | +**挑战**:按最大行数拆分文件,但不能拆分同一SPU | |
| 295 | + | |
| 296 | +**解决方案**: | |
| 297 | +- 先按SPU分组生成所有行 | |
| 298 | +- 拆分时以SPU为单位,不拆分单个SPU的行 | |
| 299 | + | |
| 300 | +--- | |
| 301 | + | |
| 302 | +## 七、扩展点 | |
| 303 | + | |
| 304 | +### 7.1 可配置项 | |
| 305 | + | |
| 306 | +- 库存默认值(当前100) | |
| 307 | +- 价格默认值(当前9.99) | |
| 308 | +- 上架/收税/物流策略(当前硬编码Y/N) | |
| 309 | + | |
| 310 | +### 7.2 可增强功能 | |
| 311 | + | |
| 312 | +- **更强的多款式解析**:如果SKU字段不规范,可从`详细参数`中提取Color/Size | |
| 313 | +- **图片策略优化**:P行可改为使用M行合并的多图(逗号拼接) | |
| 314 | +- **元字段支持**:支持店匠的元字段导入 | |
| 315 | + | |
| 316 | +--- | |
| 317 | + | |
| 318 | +## 八、使用示例 | |
| 319 | + | |
| 320 | +### 8.1 小批量验证 | |
| 321 | + | |
| 322 | +```bash | |
| 323 | +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | |
| 324 | + --input-dir data/mai_jia_jing_ling/products_data \ | |
| 325 | + --template docs/商品导入模板.xlsx \ | |
| 326 | + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ | |
| 327 | + --max-files 1 --max-products 50 | |
| 328 | +``` | |
| 329 | + | |
| 330 | +### 8.2 全量转换 | |
| 331 | + | |
| 332 | +```bash | |
| 333 | +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | |
| 334 | + --input-dir data/mai_jia_jing_ling/products_data \ | |
| 335 | + --template docs/商品导入模板.xlsx \ | |
| 336 | + --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx | |
| 337 | +``` | |
| 338 | + | |
| 339 | +### 8.3 自动拆分文件 | |
| 340 | + | |
| 341 | +```bash | |
| 342 | +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ | |
| 343 | + --input-dir data/mai_jia_jing_ling/products_data \ | |
| 344 | + --template docs/商品导入模板.xlsx \ | |
| 345 | + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \ | |
| 346 | + --max-rows-per-output 40000 | |
| 347 | +``` | |
| 348 | + | |
| 349 | +--- | |
| 350 | + | |
| 351 | +## 九、总结 | |
| 352 | + | |
| 353 | +亚马逊格式到店匠格式的转换,核心工作包括: | |
| 354 | + | |
| 355 | +1. **父子款式结构转换**:从ASIN/父ASIN关系转换为M/P/S结构 | |
| 356 | +2. **属性字段解析**:从SKU字符串中提取并智能选择款式维度 | |
| 357 | +3. **字段映射与转换**:40+个字段的格式转换和默认值处理 | |
| 358 | +4. **数据质量控制**:一致性检查、错误处理、数据清洗 | |
| 359 | +5. **性能优化**:批量处理、文件拆分、快速读写 | |
| 360 | + | |
| 361 | +这是一个典型的**数据格式转换ETL任务**,涉及数据结构重组、字符串解析、智能算法选择等多个技术领域。 | ... | ... |
docs/常用查询 - ES.md
| ... | ... | @@ -6,7 +6,8 @@ |
| 6 | 6 | |
| 7 | 7 | ### 1. 根据 tenant_id / spu_id 查询 |
| 8 | 8 | curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 9 | - "size": 1, | |
| 9 | + "size": 100, | |
| 10 | + "_source": ["title_zh", "title_en"], | |
| 10 | 11 | "query": { |
| 11 | 12 | "bool": { |
| 12 | 13 | "filter": [ |
| ... | ... | @@ -18,7 +19,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ |
| 18 | 19 | |
| 19 | 20 | curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ |
| 20 | 21 | "size": 1, |
| 21 | - "_source": ["_id", "*"], | |
| 22 | + "_source": ["title_zh", "title_en"], | |
| 22 | 23 | "query": { |
| 23 | 24 | "bool": { |
| 24 | 25 | "must": [ | ... | ... |
frontend/index.html
| ... | ... | @@ -74,7 +74,7 @@ |
| 74 | 74 | </div> |
| 75 | 75 | <div class="tenant-input-wrapper"> |
| 76 | 76 | <label for="tenantInput">tenant ID:</label> |
| 77 | - <input type="text" id="tenantInput" placeholder="请输入租户ID" value="162"> | |
| 77 | + <input type="text" id="tenantInput" placeholder="请输入租户ID" value="170"> | |
| 78 | 78 | </div> |
| 79 | 79 | <div class="tenant-input-wrapper"> |
| 80 | 80 | <label for="skuFilterDimension">sku_filter_dimension:</label> | ... | ... |
query/query_parser.py
| ... | ... | @@ -175,7 +175,7 @@ class QueryParser: |
| 175 | 175 | logger = context.logger if context else None |
| 176 | 176 | if logger: |
| 177 | 177 | logger.info( |
| 178 | - f"开始查询解析 | 原查询: '{query}' | 生成向量: {generate_vector}", | |
| 178 | + f"Starting query parsing | Original query: '{query}' | Generate vector: {generate_vector}", | |
| 179 | 179 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 180 | 180 | ) |
| 181 | 181 | |
| ... | ... | @@ -193,13 +193,13 @@ class QueryParser: |
| 193 | 193 | |
| 194 | 194 | # Stage 1: Normalize |
| 195 | 195 | normalized = self.normalizer.normalize(query) |
| 196 | - log_debug(f"标准化完成 | '{query}' -> '{normalized}'") | |
| 196 | + log_debug(f"Normalization completed | '{query}' -> '{normalized}'") | |
| 197 | 197 | if context: |
| 198 | 198 | context.store_intermediate_result('normalized_query', normalized) |
| 199 | 199 | |
| 200 | 200 | # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike") |
| 201 | 201 | domain, query_text = self.normalizer.extract_domain_query(normalized) |
| 202 | - log_debug(f"域提取 | 域: '{domain}', 查询: '{query_text}'") | |
| 202 | + log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'") | |
| 203 | 203 | if context: |
| 204 | 204 | context.store_intermediate_result('extracted_domain', domain) |
| 205 | 205 | context.store_intermediate_result('domain_query', query_text) |
| ... | ... | @@ -209,18 +209,18 @@ class QueryParser: |
| 209 | 209 | if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists |
| 210 | 210 | rewritten = self.rewriter.rewrite(query_text) |
| 211 | 211 | if rewritten != query_text: |
| 212 | - log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") | |
| 212 | + log_info(f"Query rewritten | '{query_text}' -> '{rewritten}'") | |
| 213 | 213 | query_text = rewritten |
| 214 | 214 | if context: |
| 215 | 215 | context.store_intermediate_result('rewritten_query', rewritten) |
| 216 | - context.add_warning(f"查询被重写: {query_text}") | |
| 216 | + context.add_warning(f"Query was rewritten: {query_text}") | |
| 217 | 217 | |
| 218 | 218 | # Stage 3: Language detection |
| 219 | 219 | detected_lang = self.language_detector.detect(query_text) |
| 220 | 220 | # Use default language if detection failed (None or "unknown") |
| 221 | 221 | if not detected_lang or detected_lang == "unknown": |
| 222 | 222 | detected_lang = self.config.query_config.default_language |
| 223 | - log_info(f"语言检测 | 检测到语言: {detected_lang}") | |
| 223 | + log_info(f"Language detection | Detected language: {detected_lang}") | |
| 224 | 224 | if context: |
| 225 | 225 | context.store_intermediate_result('detected_language', detected_lang) |
| 226 | 226 | |
| ... | ... | @@ -286,9 +286,9 @@ class QueryParser: |
| 286 | 286 | translations = {k: v for k, v in translations.items() if v is not None} |
| 287 | 287 | |
| 288 | 288 | if translations: |
| 289 | - log_info(f"翻译完成(缓存命中) | 结果: {translations}") | |
| 289 | + log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") | |
| 290 | 290 | if translation_futures: |
| 291 | - log_debug(f"翻译进行中,等待结果... | 语言: {list(translation_futures.keys())}") | |
| 291 | + log_debug(f"Translation in progress, waiting for results... | Query text: '{query_text}' | Languages: {list(translation_futures.keys())}") | |
| 292 | 292 | |
| 293 | 293 | if context: |
| 294 | 294 | context.store_intermediate_result('translations', translations) |
| ... | ... | @@ -297,7 +297,7 @@ class QueryParser: |
| 297 | 297 | context.store_intermediate_result(f'translation_{lang}', translation) |
| 298 | 298 | |
| 299 | 299 | except Exception as e: |
| 300 | - error_msg = f"翻译失败 | 错误: {str(e)}" | |
| 300 | + error_msg = f"Translation failed | Error: {str(e)}" | |
| 301 | 301 | log_info(error_msg) |
| 302 | 302 | if context: |
| 303 | 303 | context.add_warning(error_msg) |
| ... | ... | @@ -307,8 +307,8 @@ class QueryParser: |
| 307 | 307 | token_count = self._get_token_count(query_text) |
| 308 | 308 | is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) |
| 309 | 309 | |
| 310 | - log_debug(f"查询分析 | 关键词: {keywords} | token数: {token_count} | " | |
| 311 | - f"短查询: {is_short_query} | 长查询: {is_long_query}") | |
| 310 | + log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " | |
| 311 | + f"Short query: {is_short_query} | Long query: {is_long_query}") | |
| 312 | 312 | if context: |
| 313 | 313 | context.store_intermediate_result('keywords', keywords) |
| 314 | 314 | context.store_intermediate_result('token_count', token_count) |
| ... | ... | @@ -328,7 +328,7 @@ class QueryParser: |
| 328 | 328 | encoding_executor = None |
| 329 | 329 | if should_generate_embedding: |
| 330 | 330 | try: |
| 331 | - log_debug("开始生成查询向量(异步)") | |
| 331 | + log_debug("Starting query vector generation (async)") | |
| 332 | 332 | # Submit encoding task to thread pool for async execution |
| 333 | 333 | encoding_executor = ThreadPoolExecutor(max_workers=1) |
| 334 | 334 | def _encode_query_vector() -> Optional[np.ndarray]: |
| ... | ... | @@ -341,7 +341,7 @@ class QueryParser: |
| 341 | 341 | _encode_query_vector |
| 342 | 342 | ) |
| 343 | 343 | except Exception as e: |
| 344 | - error_msg = f"查询向量生成任务提交失败 | 错误: {str(e)}" | |
| 344 | + error_msg = f"Query vector generation task submission failed | Error: {str(e)}" | |
| 345 | 345 | log_info(error_msg) |
| 346 | 346 | if context: |
| 347 | 347 | context.add_warning(error_msg) |
| ... | ... | @@ -350,7 +350,7 @@ class QueryParser: |
| 350 | 350 | |
| 351 | 351 | # Wait for all async tasks to complete (translation and embedding) |
| 352 | 352 | if translation_futures or embedding_future: |
| 353 | - log_debug("等待异步任务完成...") | |
| 353 | + log_debug("Waiting for async tasks to complete...") | |
| 354 | 354 | |
| 355 | 355 | # Collect all futures with their identifiers |
| 356 | 356 | all_futures = [] |
| ... | ... | @@ -371,22 +371,22 @@ class QueryParser: |
| 371 | 371 | if task_type == 'translation': |
| 372 | 372 | if result: |
| 373 | 373 | translations[lang] = result |
| 374 | - log_info(f"翻译完成 | {lang}: {result}") | |
| 374 | + log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'") | |
| 375 | 375 | if context: |
| 376 | 376 | context.store_intermediate_result(f'translation_{lang}', result) |
| 377 | 377 | elif task_type == 'embedding': |
| 378 | 378 | query_vector = result |
| 379 | 379 | if query_vector is not None: |
| 380 | - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") | |
| 380 | + log_debug(f"Query vector generation completed | Shape: {query_vector.shape}") | |
| 381 | 381 | if context: |
| 382 | 382 | context.store_intermediate_result('query_vector_shape', query_vector.shape) |
| 383 | 383 | else: |
| 384 | - log_info("查询向量生成完成但结果为空(None),将按无向量处理") | |
| 384 | + log_info("Query vector generation completed but result is None, will process without vector") | |
| 385 | 385 | except Exception as e: |
| 386 | 386 | if task_type == 'translation': |
| 387 | - error_msg = f"翻译失败 | 语言: {lang} | 错误: {str(e)}" | |
| 387 | + error_msg = f"Translation failed | Language: {lang} | Error: {str(e)}" | |
| 388 | 388 | else: |
| 389 | - error_msg = f"查询向量生成失败 | 错误: {str(e)}" | |
| 389 | + error_msg = f"Query vector generation failed | Error: {str(e)}" | |
| 390 | 390 | log_info(error_msg) |
| 391 | 391 | if context: |
| 392 | 392 | context.add_warning(error_msg) |
| ... | ... | @@ -416,15 +416,15 @@ class QueryParser: |
| 416 | 416 | |
| 417 | 417 | if context and hasattr(context, 'logger'): |
| 418 | 418 | context.logger.info( |
| 419 | - f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | " | |
| 420 | - f"语言: {detected_lang} | 域: {domain} | " | |
| 421 | - f"翻译数量: {len(translations)} | 向量: {'是' if query_vector is not None else '否'}", | |
| 419 | + f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " | |
| 420 | + f"Language: {detected_lang} | Domain: {domain} | " | |
| 421 | + f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", | |
| 422 | 422 | extra={'reqid': context.reqid, 'uid': context.uid} |
| 423 | 423 | ) |
| 424 | 424 | else: |
| 425 | 425 | logger.info( |
| 426 | - f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | " | |
| 427 | - f"语言: {detected_lang} | 域: {domain}" | |
| 426 | + f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " | |
| 427 | + f"Language: {detected_lang} | Domain: {domain}" | |
| 428 | 428 | ) |
| 429 | 429 | |
| 430 | 430 | return result | ... | ... |
query/translator.py
| ... | ... | @@ -140,7 +140,10 @@ class Translator: |
| 140 | 140 | return text |
| 141 | 141 | |
| 142 | 142 | if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): |
| 143 | - logger.info(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'") | |
| 143 | + logger.info( | |
| 144 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 145 | + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" | |
| 146 | + ) | |
| 144 | 147 | return text |
| 145 | 148 | |
| 146 | 149 | # Use provided context or default context |
| ... | ... | @@ -158,33 +161,39 @@ class Translator: |
| 158 | 161 | cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) |
| 159 | 162 | if cached: |
| 160 | 163 | logger.info( |
| 161 | - f"[Translator] Cache hit: source={source_lang or 'auto'} " | |
| 162 | - f"target={target_lang} | text='{text[:80]}...' -> '{cached[:80]}...'" | |
| 164 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 165 | + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" | |
| 163 | 166 | ) |
| 164 | 167 | return cached |
| 165 | 168 | |
| 166 | 169 | # If no API key, return mock translation (for testing) |
| 167 | 170 | if not self.api_key: |
| 168 | - logger.debug(f"[Translator] No API key, returning original text (mock mode)") | |
| 171 | + logger.info( | |
| 172 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 173 | + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" | |
| 174 | + ) | |
| 169 | 175 | return text |
| 170 | 176 | |
| 171 | 177 | # Translate using DeepL (Pro endpoint only, no free fallback) |
| 172 | 178 | logger.info( |
| 173 | - f"[Translator] Translating text: target={target_lang}, " | |
| 174 | - f"source={source_lang or 'auto'}, context={translation_context}, " | |
| 175 | - f"prompt={'yes' if prompt else 'no'} | text='{text[:80]}...'" | |
| 179 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 180 | + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " | |
| 181 | + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" | |
| 176 | 182 | ) |
| 177 | 183 | result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) |
| 178 | 184 | |
| 179 | 185 | # If still failed, return original text with warning |
| 180 | 186 | if result is None: |
| 181 | - logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text") | |
| 187 | + logger.warning( | |
| 188 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 189 | + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" | |
| 190 | + ) | |
| 182 | 191 | result = text |
| 183 | - | |
| 184 | - logger.info( | |
| 185 | - f"[Translator] Translation completed: source={source_lang or 'auto'} " | |
| 186 | - f"target={target_lang} | original='{text[:80]}...' -> '{result[:80]}...'" | |
| 187 | - ) | |
| 192 | + else: | |
| 193 | + logger.info( | |
| 194 | + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " | |
| 195 | + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" | |
| 196 | + ) | |
| 188 | 197 | |
| 189 | 198 | # Cache result |
| 190 | 199 | if result and self.use_cache and self.redis_client: |
| ... | ... | @@ -265,16 +274,29 @@ class Translator: |
| 265 | 274 | translated_text = self._extract_term_from_translation( |
| 266 | 275 | translated_text, text, target_code |
| 267 | 276 | ) |
| 277 | + logger.debug( | |
| 278 | + f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " | |
| 279 | + f"Translation result: '{translated_text}'" | |
| 280 | + ) | |
| 268 | 281 | return translated_text |
| 269 | 282 | else: |
| 270 | - logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}") | |
| 283 | + logger.error( | |
| 284 | + f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " | |
| 285 | + f"Status code: {response.status_code} | Error message: {response.text}" | |
| 286 | + ) | |
| 271 | 287 | return None |
| 272 | 288 | |
| 273 | 289 | except requests.Timeout: |
| 274 | - logger.warning(f"[Translator] Translation request timed out") | |
| 290 | + logger.warning( | |
| 291 | + f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " | |
| 292 | + f"Timeout: {self.timeout}s" | |
| 293 | + ) | |
| 275 | 294 | return None |
| 276 | 295 | except Exception as e: |
| 277 | - logger.error(f"[Translator] Translation failed: {e}", exc_info=True) | |
| 296 | + logger.error( | |
| 297 | + f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " | |
| 298 | + f"Error: {e}", exc_info=True | |
| 299 | + ) | |
| 278 | 300 | return None |
| 279 | 301 | |
| 280 | 302 | # NOTE: _translate_deepl_free is intentionally not implemented. |
| ... | ... | @@ -443,15 +465,18 @@ class Translator: |
| 443 | 465 | if value: |
| 444 | 466 | # Sliding expiration: reset expiration time on access |
| 445 | 467 | self.redis_client.expire(cache_key, self.expire_time) |
| 446 | - logger.info( | |
| 447 | - f"[Translator] Redis cache hit: key={cache_key}, " | |
| 448 | - f"target={target_lang}, value='{value[:80]}...'" | |
| 468 | + logger.debug( | |
| 469 | + f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " | |
| 470 | + f"Cache key: {cache_key} | Translation result: '{value}'" | |
| 449 | 471 | ) |
| 450 | 472 | return value |
| 451 | - logger.debug(f"[Translator] Redis cache miss: key={cache_key}, target={target_lang}") | |
| 473 | + logger.debug( | |
| 474 | + f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " | |
| 475 | + f"Cache key: {cache_key}" | |
| 476 | + ) | |
| 452 | 477 | return None |
| 453 | 478 | except Exception as e: |
| 454 | - logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}") | |
| 479 | + logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") | |
| 455 | 480 | return None |
| 456 | 481 | |
| 457 | 482 | def _set_cached_translation_redis( |
| ... | ... | @@ -470,12 +495,15 @@ class Translator: |
| 470 | 495 | try: |
| 471 | 496 | cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" |
| 472 | 497 | self.redis_client.setex(cache_key, self.expire_time, translation) |
| 473 | - logger.info( | |
| 474 | - f"[Translator] Cached translation: key={cache_key}, " | |
| 475 | - f"target={target_lang}, value='{translation}...'" | |
| 498 | + logger.debug( | |
| 499 | + f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " | |
| 500 | + f"Cache key: {cache_key} | Translation result: '{translation}'" | |
| 476 | 501 | ) |
| 477 | 502 | except Exception as e: |
| 478 | - logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}") | |
| 503 | + logger.error( | |
| 504 | + f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " | |
| 505 | + f"Error: {e}" | |
| 506 | + ) | |
| 479 | 507 | |
| 480 | 508 | def _translate_async( |
| 481 | 509 | self, | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -26,7 +26,8 @@ class ESQueryBuilder: |
| 26 | 26 | source_fields: Optional[List[str]] = None, |
| 27 | 27 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 28 | 28 | enable_multilang_search: bool = True, |
| 29 | - default_language: str = "zh" | |
| 29 | + default_language: str = "zh", | |
| 30 | + knn_boost: float = 0.25 | |
| 30 | 31 | ): |
| 31 | 32 | """ |
| 32 | 33 | Initialize query builder. |
| ... | ... | @@ -40,6 +41,7 @@ class ESQueryBuilder: |
| 40 | 41 | function_score_config: Function score configuration |
| 41 | 42 | enable_multilang_search: Enable multi-language search using translations |
| 42 | 43 | default_language: Default language to use when detection fails or returns "unknown" |
| 44 | + knn_boost: Boost value for KNN (embedding recall) | |
| 43 | 45 | """ |
| 44 | 46 | self.index_name = index_name |
| 45 | 47 | self.match_fields = match_fields |
| ... | ... | @@ -49,6 +51,7 @@ class ESQueryBuilder: |
| 49 | 51 | self.function_score_config = function_score_config |
| 50 | 52 | self.enable_multilang_search = enable_multilang_search |
| 51 | 53 | self.default_language = default_language |
| 54 | + self.knn_boost = knn_boost | |
| 52 | 55 | |
| 53 | 56 | def _split_filters_for_faceting( |
| 54 | 57 | self, |
| ... | ... | @@ -221,7 +224,7 @@ class ESQueryBuilder: |
| 221 | 224 | "query_vector": query_vector.tolist(), |
| 222 | 225 | "k": knn_k, |
| 223 | 226 | "num_candidates": knn_num_candidates, |
| 224 | - "boost": 0.2 # Lower boost for embedding recall | |
| 227 | + "boost": self.knn_boost # Lower boost for embedding recall | |
| 225 | 228 | } |
| 226 | 229 | es_query["knn"] = knn_clause |
| 227 | 230 | |
| ... | ... | @@ -458,7 +461,7 @@ class ESQueryBuilder: |
| 458 | 461 | "_name": "base_query", |
| 459 | 462 | "fields": match_fields, |
| 460 | 463 | "minimum_should_match": "75%", |
| 461 | - "operator": "AND", | |
| 464 | + # "operator": "AND", | |
| 462 | 465 | "query": query_text, |
| 463 | 466 | "tie_breaker": tie_breaker_base_query |
| 464 | 467 | } |
| ... | ... | @@ -472,7 +475,7 @@ class ESQueryBuilder: |
| 472 | 475 | "multi_match": { |
| 473 | 476 | "query": translations['zh'], |
| 474 | 477 | "fields": zh_fields, |
| 475 | - "operator": "AND", | |
| 478 | + # "operator": "AND", | |
| 476 | 479 | "minimum_should_match": "75%", |
| 477 | 480 | "tie_breaker": tie_breaker_base_query, |
| 478 | 481 | "boost": 0.4, |
| ... | ... | @@ -486,7 +489,7 @@ class ESQueryBuilder: |
| 486 | 489 | "multi_match": { |
| 487 | 490 | "query": translations['en'], |
| 488 | 491 | "fields": en_fields, |
| 489 | - "operator": "AND", | |
| 492 | + # "operator": "AND", | |
| 490 | 493 | "minimum_should_match": "75%", |
| 491 | 494 | "tie_breaker": tie_breaker_base_query, |
| 492 | 495 | "boost": 0.4, |
| ... | ... | @@ -532,7 +535,7 @@ class ESQueryBuilder: |
| 532 | 535 | "multi_match": { |
| 533 | 536 | "query": keywords, |
| 534 | 537 | "fields": core_fields, |
| 535 | - "operator": "AND", | |
| 538 | + # "operator": "AND", | |
| 536 | 539 | "tie_breaker": tie_breaker_keywords, |
| 537 | 540 | "boost": 0.1, |
| 538 | 541 | "_name": "keywords_query" | ... | ... |
search/searcher.py
| ... | ... | @@ -115,7 +115,8 @@ class Searcher: |
| 115 | 115 | source_fields=self.source_fields, |
| 116 | 116 | function_score_config=self.config.function_score, |
| 117 | 117 | enable_multilang_search=self.config.query_config.enable_multilang_search, |
| 118 | - default_language=self.config.query_config.default_language | |
| 118 | + default_language=self.config.query_config.default_language, | |
| 119 | + knn_boost=self.config.query_config.knn_boost | |
| 119 | 120 | ) |
| 120 | 121 | |
| 121 | 122 | def search( | ... | ... |