diff --git a/config/config.yaml b/config/config.yaml index a907a17..7296ff5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -118,10 +118,13 @@ query_config: # 返回字段配置(_source includes) # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 source_fields: null + + # KNN boost配置(向量召回的boost值) + knn_boost: 0.25 # Lower boost for embedding recall # Ranking Configuration(排序配置) ranking: - expression: "bm25() + 0.2*text_embedding_relevance()" + expression: "bm25() + 0.25*text_embedding_relevance()" description: "BM25 text relevance combined with semantic embedding similarity" # Function Score配置(ES层打分规则) diff --git a/config/config_loader.py b/config/config_loader.py index 6de1b05..d0be6ed 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -56,6 +56,9 @@ class QueryConfig: # Source fields configuration source_fields: Optional[List[str]] = None + + # KNN boost configuration + knn_boost: float = 0.25 # Boost value for KNN (embedding recall) @dataclass @@ -241,7 +244,8 @@ class ConfigLoader: image_embedding_field=query_config_data.get("image_embedding_field"), embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4), embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3), - source_fields=query_config_data.get("source_fields") + source_fields=query_config_data.get("source_fields"), + knn_boost=query_config_data.get("knn_boost", 0.25) ) # Parse ranking config diff --git a/docs/亚马逊到店匠格式转换分析.md b/docs/亚马逊到店匠格式转换分析.md new file mode 100644 index 0000000..740ac01 --- /dev/null +++ b/docs/亚马逊到店匠格式转换分析.md @@ -0,0 +1,361 @@ +# 亚马逊格式到店匠格式转换 - 核心工作内容分析 + +## 一、概述 + +本项目实现了从**亚马逊格式Excel数据**到**店匠(Shoplazza)商品导入模板**的格式转换,主要处理商品的多款式(变体)结构和属性字段映射。 + +**核心脚本**:`scripts/amazon_xlsx_to_shoplazza_xlsx.py` + +--- + +## 二、父子款式处理(M/P/S 结构转换) + +### 2.1 输入格式(亚马逊) + +- **ASIN**:变体ID(SKU级别) +- **父ASIN**:父商品ID(SPU级别) +- 一个父ASIN可以包含多个ASIN(多个变体) + +### 2.2 输出格式(店匠) + +店匠模板定义了三种商品属性类型: + +1. 
**S(单一款式)**:只有一个变体的商品 + - 输出:**1行** + - 包含所有商品信息(标题、价格、库存等) + +2. **M(主商品)+ P(子款式)**:包含多个变体的商品 + - 输出:**1行M + N行P** + - **关键约束**:同一商品的P行必须紧跟在M行后面(模板导入强约束) + +### 2.3 转换策略 + +```python +# 核心逻辑(简化版) +for 父ASIN in 所有父ASIN: + variants = 获取该父ASIN下的所有ASIN + + if len(variants) == 1: + 生成 S 行(单一款式) + else: + 生成 M 行(主商品)+ 多个 P 行(子款式) +``` + +### 2.4 关键处理点 + +#### 1. 父ASIN排序 +- 确保父ASIN对应的变体在列表最前面 +- 如果找不到父ASIN对应的变体,根据配置决定是否丢弃整个SPU + +#### 2. 标题一致性检查 +- 同一SPU下的所有变体标题必须一致 +- 如果发现不一致: + - 选项1:丢弃标题不一致的SKU(默认) + - 选项2:修正为统一的主商品标题 + +#### 3. M行与P行的字段分工 + +**M行(主商品)填写**: +- ✅ 商品标题、描述、SEO信息 +- ✅ 专辑、标签、供应商信息 +- ✅ 商品主图 +- ✅ 款式维度名(款式1/2/3的key) +- ❌ 不填:价格、库存、重量等SKU级字段(保持为空更安全) + +**P行(子款式)填写**: +- ✅ 商品标题(与M行一致) +- ✅ 款式维度值(款式1/2/3的value) +- ✅ 价格、商品SKU(ASIN)、库存 +- ✅ 重量、尺寸 +- ✅ 子款式图(可选) +- ❌ 不填:描述、SEO、专辑等SPU级字段(保持为空) + +--- + +## 三、属性字段处理(款式维度解析) + +### 3.1 问题背景 + +亚马逊格式中,变体的"颜色/尺码"等信息**并不拆成多个列**,而是集中在 `SKU` 字符串里: + +``` +示例1: "Size: One Size | Color: Black" +示例2: "Color: Red | Style: 2-Pack" +``` + +而店匠模板需要: +- **M行**:`款式1/款式2/款式3` 填写**维度名**(如 Size、Color、Material) +- **P行**:`款式1/款式2/款式3` 填写**维度值**(如 One Size、Black、Cotton) + +### 3.2 SKU解析逻辑 + +```python +def parse_sku_options(sku_text): + """ + 解析 SKU 列,提取 key:value 对 + 输入: "Size: One Size | Color: Black" + 输出: {"Size": "One Size", "Color": "Black"} + """ + # 1. 按 | 分割 + parts = sku_text.split("|") + + # 2. 按 : 拆成 key/value + for part in parts: + if ":" in part: + key, value = part.split(":", 1) + result[key.strip()] = value.strip() + + return result +``` + +### 3.3 维度选择策略(最多3个维度) + +店匠模板只提供 `款式1~3` 三个维度,因此需要从多个变体中**智能选择最多3个维度**: + +#### 优先级规则 + +1. **按预设优先级排序**: + ```python + PREFERRED_OPTION_KEYS = [ + "Size", "Color", "Style", "Pattern", "Material", + "Flavor", "Scent", "Pack", "Pack of", ... + ] + ``` + +2. **按出现频次排序**:统计每个key在所有变体中的出现次数 + +3. 
**综合排序**: + ```python + def key_sort(k): + return ( + 预设优先级(越小越优先), + -出现频次(越大越优先), + 字母顺序(作为最后排序依据) + ) + ``` + +#### 退化处理 + +如果解析不到任何 key/value,则退化为单维度: +- M行:`款式1 = "Variant"` +- P行:`款式1 = ASIN`(使用ASIN作为维度值) + +### 3.4 维度映射示例 + +**输入数据**(3个变体): +``` +变体1: SKU = "Size: S | Color: Red" +变体2: SKU = "Size: M | Color: Red" +变体3: SKU = "Size: S | Color: Blue" +``` + +**解析结果**: +- 维度统计:Size出现3次,Color出现3次 +- 选择维度:Size(优先级1)、Color(优先级2) +- 店匠最多支持3个维度,本例只解析出2个维度,因此两个维度全部保留(款式3留空) + +**输出格式**: +``` +M行: + 款式1 = "Size" + 款式2 = "Color" + 款式3 = "" + +P行1: + 款式1 = "S" + 款式2 = "Red" + 款式3 = "" + +P行2: + 款式1 = "M" + 款式2 = "Red" + 款式3 = "" + +P行3: + 款式1 = "S" + 款式2 = "Blue" + 款式3 = "" +``` + +--- + +## 四、字段映射总览 + +### 4.1 核心字段映射 + +| 店匠字段 | 亚马逊字段 | 处理逻辑 | +|---------|-----------|---------| +| **商品spu** | `父ASIN` | 无父ASIN则用ASIN | +| **商品SKU** | `ASIN` | 直接映射 | +| **商品标题*** | `商品标题` | 截断至255字符 | +| **商品图片*** | `商品主图` | URL直接映射 | +| **商品售价*** | `prime价格($)` 或 `价格($)` | 优先prime价格 | +| **创建时间** | `上架时间` | 日期格式转换(补齐时分秒) | +| **商品描述** | `商品标题` + `详细参数` | HTML拼接 | +| **专辑名称** | `大类目` | 无则取`类目路径`第一段 | +| **标签** | `品牌,大类目,小类目` | 逗号拼接 | +| **商品重量/重量单位** | `商品重量(单位换算)` | 解析数值和单位(g/kg/lb/oz) | +| **尺寸信息** | `商品尺寸` | 解析前三段数字,拼成 `L,W,H`(英寸) | + +### 4.2 特殊字段处理 + +#### 1. 价格处理 +```python +price = prime价格($) or 价格($) or 9.99 # 默认值9.99 +``` + +#### 2. 库存处理 +- 亚马逊数据源通常**没有库存** +- 脚本默认给每个变体固定库存:**100** + +#### 3. 日期格式转换 +```python +输入: "2018-05-09" 或 datetime对象 +输出: "2018-05-09 00:00:00" # 补齐时分秒 +``` + +#### 4. 重量解析 +```python +输入: "68.04 g" 或 "0.15 pounds" +输出: (68.04, "g") 或 (0.15, "lb") +``` + +#### 5. 尺寸解析 +```python +输入: "7.9 x 7.9 x 2 inches" +输出: "7.9,7.9,2" # L,W,H格式 +``` + +#### 6. SEO URL Handle生成 +```python +输入: "Legendary Whitetails Men's Buck Camp Flannel Shirt" +输出: "products/legendary-whitetails-mens-buck-camp-flannel-shirt" +# 规则:转小写、去特殊字符、空格转横线 +``` + +--- + +## 五、重要工作内容总结 + +### 5.1 数据结构转换 + +1. **父子关系识别**:从扁平化的ASIN列表识别出SPU-SKU层级关系 +2. **分组策略**:按父ASIN分组,决定生成S还是M+P结构 +3. 
**行序保证**:确保同一SPU的M+P行连续,不被打断 + +### 5.2 属性字段解析 + +1. **SKU字符串解析**:从非结构化字符串中提取key:value对 +2. **维度智能选择**:从多个可能的维度中选择最重要的3个 +3. **优先级算法**:综合考虑预设优先级、出现频次、字母顺序 + +### 5.3 字段映射与转换 + +1. **格式转换**:日期、价格、重量、尺寸等格式标准化 +2. **默认值填充**:库存、价格等缺失字段的默认值策略 +3. **数据清洗**:标题截断、HTML转义、URL生成等 + +### 5.4 数据质量控制 + +1. **标题一致性检查**:确保同一SPU下所有变体标题一致 +2. **父ASIN验证**:检查父ASIN是否存在于变体列表中 +3. **错误处理**:提供配置选项决定是修正还是丢弃异常数据 + +### 5.5 性能优化 + +1. **批量处理**:支持多文件批量转换 +2. **文件拆分**:按最大行数自动拆分输出文件(保证同一SPU不拆分) +3. **快速读写**:使用`iter_rows(values_only=True)`和write_only模式提升性能 + +--- + +## 六、关键技术难点 + +### 6.1 维度选择算法 + +**挑战**:从多个变体的SKU字符串中,智能选择最重要的3个维度 + +**解决方案**: +- 统计所有变体中每个key的出现频次 +- 结合预设优先级列表(Size > Color > Style > ...) +- 综合排序选择前3个 + +### 6.2 数据一致性保证 + +**挑战**:确保同一SPU下的所有变体数据一致 + +**解决方案**: +- 标题一致性检查和修正 +- 父ASIN排序保证 +- M行和P行的字段分工明确 + +### 6.3 文件拆分策略 + +**挑战**:按最大行数拆分文件,但不能拆分同一SPU + +**解决方案**: +- 先按SPU分组生成所有行 +- 拆分时以SPU为单位,不拆分单个SPU的行 + +--- + +## 七、扩展点 + +### 7.1 可配置项 + +- 库存默认值(当前100) +- 价格默认值(当前9.99) +- 上架/收税/物流策略(当前硬编码Y/N) + +### 7.2 可增强功能 + +- **更强的多款式解析**:如果SKU字段不规范,可从`详细参数`中提取Color/Size +- **图片策略优化**:P行可改为使用M行合并的多图(逗号拼接) +- **元字段支持**:支持店匠的元字段导入 + +--- + +## 八、使用示例 + +### 8.1 小批量验证 + +```bash +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ + --input-dir data/mai_jia_jing_ling/products_data \ + --template docs/商品导入模板.xlsx \ + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \ + --max-files 1 --max-products 50 +``` + +### 8.2 全量转换 + +```bash +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ + --input-dir data/mai_jia_jing_ling/products_data \ + --template docs/商品导入模板.xlsx \ + --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx +``` + +### 8.3 自动拆分文件 + +```bash +python scripts/amazon_xlsx_to_shoplazza_xlsx.py \ + --input-dir data/mai_jia_jing_ling/products_data \ + --template docs/商品导入模板.xlsx \ + --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \ + --max-rows-per-output 40000 +``` + +--- + +## 九、总结 + +亚马逊格式到店匠格式的转换,核心工作包括: + +1. 
**父子款式结构转换**:从ASIN/父ASIN关系转换为M/P/S结构 +2. **属性字段解析**:从SKU字符串中提取并智能选择款式维度 +3. **字段映射与转换**:40+个字段的格式转换和默认值处理 +4. **数据质量控制**:一致性检查、错误处理、数据清洗 +5. **性能优化**:批量处理、文件拆分、快速读写 + +这是一个典型的**数据格式转换ETL任务**,涉及数据结构重组、字符串解析、智能算法选择等多个技术领域。 diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index f8c7065..04a9644 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -6,7 +6,8 @@ ### 1. 根据 tenant_id / spu_id 查询 curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ - "size": 1, + "size": 100, + "_source": ["title_zh", "title_en"], "query": { "bool": { "filter": [ @@ -18,7 +19,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 1, - "_source": ["_id", "*"], + "_source": ["title_zh", "title_en"], "query": { "bool": { "must": [ diff --git a/frontend/index.html b/frontend/index.html index 2b96b20..b7e9077 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -74,7 +74,7 @@
- +
diff --git a/query/query_parser.py b/query/query_parser.py index c71c4db..f294efd 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -175,7 +175,7 @@ class QueryParser: logger = context.logger if context else None if logger: logger.info( - f"开始查询解析 | 原查询: '{query}' | 生成向量: {generate_vector}", + f"Starting query parsing | Original query: '{query}' | Generate vector: {generate_vector}", extra={'reqid': context.reqid, 'uid': context.uid} ) @@ -193,13 +193,13 @@ class QueryParser: # Stage 1: Normalize normalized = self.normalizer.normalize(query) - log_debug(f"标准化完成 | '{query}' -> '{normalized}'") + log_debug(f"Normalization completed | '{query}' -> '{normalized}'") if context: context.store_intermediate_result('normalized_query', normalized) # Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike") domain, query_text = self.normalizer.extract_domain_query(normalized) - log_debug(f"域提取 | 域: '{domain}', 查询: '{query_text}'") + log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'") if context: context.store_intermediate_result('extracted_domain', domain) context.store_intermediate_result('domain_query', query_text) @@ -209,18 +209,18 @@ class QueryParser: if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists rewritten = self.rewriter.rewrite(query_text) if rewritten != query_text: - log_info(f"查询重写 | '{query_text}' -> '{rewritten}'") + log_info(f"Query rewritten | '{query_text}' -> '{rewritten}'") query_text = rewritten if context: context.store_intermediate_result('rewritten_query', rewritten) - context.add_warning(f"查询被重写: {query_text}") + context.add_warning(f"Query was rewritten: {query_text}") # Stage 3: Language detection detected_lang = self.language_detector.detect(query_text) # Use default language if detection failed (None or "unknown") if not detected_lang or detected_lang == "unknown": detected_lang = self.config.query_config.default_language - log_info(f"语言检测 | 检测到语言: 
{detected_lang}") + log_info(f"Language detection | Detected language: {detected_lang}") if context: context.store_intermediate_result('detected_language', detected_lang) @@ -286,9 +286,9 @@ class QueryParser: translations = {k: v for k, v in translations.items() if v is not None} if translations: - log_info(f"翻译完成(缓存命中) | 结果: {translations}") + log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}") if translation_futures: - log_debug(f"翻译进行中,等待结果... | 语言: {list(translation_futures.keys())}") + log_debug(f"Translation in progress, waiting for results... | Query text: '{query_text}' | Languages: {list(translation_futures.keys())}") if context: context.store_intermediate_result('translations', translations) @@ -297,7 +297,7 @@ class QueryParser: context.store_intermediate_result(f'translation_{lang}', translation) except Exception as e: - error_msg = f"翻译失败 | 错误: {str(e)}" + error_msg = f"Translation failed | Error: {str(e)}" log_info(error_msg) if context: context.add_warning(error_msg) @@ -307,8 +307,8 @@ class QueryParser: token_count = self._get_token_count(query_text) is_short_query, is_long_query = self._analyze_query_type(query_text, token_count) - log_debug(f"查询分析 | 关键词: {keywords} | token数: {token_count} | " - f"短查询: {is_short_query} | 长查询: {is_long_query}") + log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | " + f"Short query: {is_short_query} | Long query: {is_long_query}") if context: context.store_intermediate_result('keywords', keywords) context.store_intermediate_result('token_count', token_count) @@ -328,7 +328,7 @@ class QueryParser: encoding_executor = None if should_generate_embedding: try: - log_debug("开始生成查询向量(异步)") + log_debug("Starting query vector generation (async)") # Submit encoding task to thread pool for async execution encoding_executor = ThreadPoolExecutor(max_workers=1) def _encode_query_vector() -> Optional[np.ndarray]: @@ -341,7 +341,7 @@ class QueryParser: 
_encode_query_vector ) except Exception as e: - error_msg = f"查询向量生成任务提交失败 | 错误: {str(e)}" + error_msg = f"Query vector generation task submission failed | Error: {str(e)}" log_info(error_msg) if context: context.add_warning(error_msg) @@ -350,7 +350,7 @@ class QueryParser: # Wait for all async tasks to complete (translation and embedding) if translation_futures or embedding_future: - log_debug("等待异步任务完成...") + log_debug("Waiting for async tasks to complete...") # Collect all futures with their identifiers all_futures = [] @@ -371,22 +371,22 @@ class QueryParser: if task_type == 'translation': if result: translations[lang] = result - log_info(f"翻译完成 | {lang}: {result}") + log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'") if context: context.store_intermediate_result(f'translation_{lang}', result) elif task_type == 'embedding': query_vector = result if query_vector is not None: - log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}") + log_debug(f"Query vector generation completed | Shape: {query_vector.shape}") if context: context.store_intermediate_result('query_vector_shape', query_vector.shape) else: - log_info("查询向量生成完成但结果为空(None),将按无向量处理") + log_info("Query vector generation completed but result is None, will process without vector") except Exception as e: if task_type == 'translation': - error_msg = f"翻译失败 | 语言: {lang} | 错误: {str(e)}" + error_msg = f"Translation failed | Language: {lang} | Error: {str(e)}" else: - error_msg = f"查询向量生成失败 | 错误: {str(e)}" + error_msg = f"Query vector generation failed | Error: {str(e)}" log_info(error_msg) if context: context.add_warning(error_msg) @@ -416,15 +416,15 @@ class QueryParser: if context and hasattr(context, 'logger'): context.logger.info( - f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | " - f"语言: {detected_lang} | 域: {domain} | " - f"翻译数量: {len(translations)} | 向量: {'是' if query_vector is not None else '否'}", + f"Query parsing 
completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " + f"Language: {detected_lang} | Domain: {domain} | " + f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}", extra={'reqid': context.reqid, 'uid': context.uid} ) else: logger.info( - f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | " - f"语言: {detected_lang} | 域: {domain}" + f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | " + f"Language: {detected_lang} | Domain: {domain}" ) return result diff --git a/query/translator.py b/query/translator.py index 16f3c99..7db2555 100644 --- a/query/translator.py +++ b/query/translator.py @@ -140,7 +140,10 @@ class Translator: return text if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)): - logger.info(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'") + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)" + ) return text # Use provided context or default context @@ -158,33 +161,39 @@ class Translator: cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt) if cached: logger.info( - f"[Translator] Cache hit: source={source_lang or 'auto'} " - f"target={target_lang} | text='{text[:80]}...' 
-> '{cached[:80]}...'" + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit" ) return cached # If no API key, return mock translation (for testing) if not self.api_key: - logger.debug(f"[Translator] No API key, returning original text (mock mode)") + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)" + ) return text # Translate using DeepL (Pro endpoint only, no free fallback) logger.info( - f"[Translator] Translating text: target={target_lang}, " - f"source={source_lang or 'auto'}, context={translation_context}, " - f"prompt={'yes' if prompt else 'no'} | text='{text[:80]}...'" + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Context: {translation_context} | " + f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation" ) result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt) # If still failed, return original text with warning if result is None: - logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text") + logger.warning( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original" + ) result = text - - logger.info( - f"[Translator] Translation completed: source={source_lang or 'auto'} " - f"target={target_lang} | original='{text[:80]}...' 
-> '{result[:80]}...'" - ) + else: + logger.info( + f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | " + f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful" + ) # Cache result if result and self.use_cache and self.redis_client: @@ -265,16 +274,29 @@ class Translator: translated_text = self._extract_term_from_translation( translated_text, text, target_code ) + logger.debug( + f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | " + f"Translation result: '{translated_text}'" + ) return translated_text else: - logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}") + logger.error( + f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | " + f"Status code: {response.status_code} | Error message: {response.text}" + ) return None except requests.Timeout: - logger.warning(f"[Translator] Translation request timed out") + logger.warning( + f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | " + f"Timeout: {self.timeout}s" + ) return None except Exception as e: - logger.error(f"[Translator] Translation failed: {e}", exc_info=True) + logger.error( + f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | " + f"Error: {e}", exc_info=True + ) return None # NOTE: _translate_deepl_free is intentionally not implemented. 
@@ -443,15 +465,18 @@ class Translator: if value: # Sliding expiration: reset expiration time on access self.redis_client.expire(cache_key, self.expire_time) - logger.info( - f"[Translator] Redis cache hit: key={cache_key}, " - f"target={target_lang}, value='{value[:80]}...'" + logger.debug( + f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | " + f"Cache key: {cache_key} | Translation result: '{value}'" ) return value - logger.debug(f"[Translator] Redis cache miss: key={cache_key}, target={target_lang}") + logger.debug( + f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | " + f"Cache key: {cache_key}" + ) return None except Exception as e: - logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}") + logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}") return None def _set_cached_translation_redis( @@ -470,12 +495,15 @@ class Translator: try: cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}" self.redis_client.setex(cache_key, self.expire_time, translation) - logger.info( - f"[Translator] Cached translation: key={cache_key}, " - f"target={target_lang}, value='{translation}...'" + logger.debug( + f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | " + f"Cache key: {cache_key} | Translation result: '{translation}'" ) except Exception as e: - logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}") + logger.error( + f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | " + f"Error: {e}" + ) def _translate_async( self, diff --git a/search/es_query_builder.py b/search/es_query_builder.py index 1405416..dbe44fa 100644 --- a/search/es_query_builder.py +++ b/search/es_query_builder.py @@ -26,7 +26,8 @@ class 
ESQueryBuilder: source_fields: Optional[List[str]] = None, function_score_config: Optional[FunctionScoreConfig] = None, enable_multilang_search: bool = True, - default_language: str = "zh" + default_language: str = "zh", + knn_boost: float = 0.25 ): """ Initialize query builder. @@ -40,6 +41,7 @@ class ESQueryBuilder: function_score_config: Function score configuration enable_multilang_search: Enable multi-language search using translations default_language: Default language to use when detection fails or returns "unknown" + knn_boost: Boost value for KNN (embedding recall) """ self.index_name = index_name self.match_fields = match_fields @@ -49,6 +51,7 @@ class ESQueryBuilder: self.function_score_config = function_score_config self.enable_multilang_search = enable_multilang_search self.default_language = default_language + self.knn_boost = knn_boost def _split_filters_for_faceting( self, @@ -221,7 +224,7 @@ class ESQueryBuilder: "query_vector": query_vector.tolist(), "k": knn_k, "num_candidates": knn_num_candidates, - "boost": 0.2 # Lower boost for embedding recall + "boost": self.knn_boost # Lower boost for embedding recall } es_query["knn"] = knn_clause @@ -458,7 +461,7 @@ class ESQueryBuilder: "_name": "base_query", "fields": match_fields, "minimum_should_match": "75%", - "operator": "AND", + # "operator": "AND", "query": query_text, "tie_breaker": tie_breaker_base_query } @@ -472,7 +475,7 @@ class ESQueryBuilder: "multi_match": { "query": translations['zh'], "fields": zh_fields, - "operator": "AND", + # "operator": "AND", "minimum_should_match": "75%", "tie_breaker": tie_breaker_base_query, "boost": 0.4, @@ -486,7 +489,7 @@ class ESQueryBuilder: "multi_match": { "query": translations['en'], "fields": en_fields, - "operator": "AND", + # "operator": "AND", "minimum_should_match": "75%", "tie_breaker": tie_breaker_base_query, "boost": 0.4, @@ -532,7 +535,7 @@ class ESQueryBuilder: "multi_match": { "query": keywords, "fields": core_fields, - "operator": "AND", + # 
"operator": "AND", "tie_breaker": tie_breaker_keywords, "boost": 0.1, "_name": "keywords_query" diff --git a/search/searcher.py b/search/searcher.py index ae8e83e..bd746c1 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -115,7 +115,8 @@ class Searcher: source_fields=self.source_fields, function_score_config=self.config.function_score, enable_multilang_search=self.config.query_config.enable_multilang_search, - default_language=self.config.query_config.default_language + default_language=self.config.query_config.default_language, + knn_boost=self.config.query_config.knn_boost ) def search( -- libgit2 0.21.2