diff --git a/config/config.yaml b/config/config.yaml
index a907a17..7296ff5 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -118,10 +118,13 @@ query_config:
# 返回字段配置(_source includes)
# null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
source_fields: null
+
+ # KNN boost配置(向量召回的boost值)
+ knn_boost: 0.25 # Lower boost for embedding recall
# Ranking Configuration(排序配置)
ranking:
- expression: "bm25() + 0.2*text_embedding_relevance()"
+ expression: "bm25() + 0.25*text_embedding_relevance()"
description: "BM25 text relevance combined with semantic embedding similarity"
# Function Score配置(ES层打分规则)
diff --git a/config/config_loader.py b/config/config_loader.py
index 6de1b05..d0be6ed 100644
--- a/config/config_loader.py
+++ b/config/config_loader.py
@@ -56,6 +56,9 @@ class QueryConfig:
# Source fields configuration
source_fields: Optional[List[str]] = None
+
+ # KNN boost configuration
+ knn_boost: float = 0.25 # Boost value for KNN (embedding recall)
@dataclass
@@ -241,7 +244,8 @@ class ConfigLoader:
image_embedding_field=query_config_data.get("image_embedding_field"),
embedding_disable_chinese_char_limit=embedding_thresholds.get("chinese_char_limit", 4),
embedding_disable_english_word_limit=embedding_thresholds.get("english_word_limit", 3),
- source_fields=query_config_data.get("source_fields")
+ source_fields=query_config_data.get("source_fields"),
+ knn_boost=query_config_data.get("knn_boost", 0.25)
)
# Parse ranking config
diff --git a/docs/亚马逊到店匠格式转换分析.md b/docs/亚马逊到店匠格式转换分析.md
new file mode 100644
index 0000000..740ac01
--- /dev/null
+++ b/docs/亚马逊到店匠格式转换分析.md
@@ -0,0 +1,361 @@
+# 亚马逊格式到店匠格式转换 - 核心工作内容分析
+
+## 一、概述
+
+本项目实现了从**亚马逊格式Excel数据**到**店匠(Shoplazza)商品导入模板**的格式转换,主要处理商品的多款式(变体)结构和属性字段映射。
+
+**核心脚本**:`scripts/amazon_xlsx_to_shoplazza_xlsx.py`
+
+---
+
+## 二、父子款式处理(M/P/S 结构转换)
+
+### 2.1 输入格式(亚马逊)
+
+- **ASIN**:变体ID(SKU级别)
+- **父ASIN**:父商品ID(SPU级别)
+- 一个父ASIN可以包含多个ASIN(多个变体)
+
+### 2.2 输出格式(店匠)
+
+店匠模板定义了三种商品属性类型:
+
+1. **S(单一款式)**:只有一个变体的商品
+ - 输出:**1行**
+ - 包含所有商品信息(标题、价格、库存等)
+
+2. **M(主商品)+ P(子款式)**:包含多个变体的商品
+ - 输出:**1行M + N行P**
+ - **关键约束**:同一商品的P行必须紧跟在M行后面(模板导入强约束)
+
+### 2.3 转换策略
+
+```python
+# 核心逻辑(简化版)
+for 父ASIN in 所有父ASIN:
+ variants = 获取该父ASIN下的所有ASIN
+
+ if len(variants) == 1:
+ 生成 S 行(单一款式)
+ else:
+ 生成 M 行(主商品)+ 多个 P 行(子款式)
+```
+
+### 2.4 关键处理点
+
+#### 1. 父ASIN排序
+- 确保父ASIN对应的变体在列表最前面
+- 如果找不到父ASIN对应的变体,根据配置决定是否丢弃整个SPU
+
+#### 2. 标题一致性检查
+- 同一SPU下的所有变体标题必须一致
+- 如果发现不一致:
+ - 选项1:丢弃标题不一致的SKU(默认)
+ - 选项2:修正为统一的主商品标题
+
+#### 3. M行与P行的字段分工
+
+**M行(主商品)填写**:
+- ✅ 商品标题、描述、SEO信息
+- ✅ 专辑、标签、供应商信息
+- ✅ 商品主图
+- ✅ 款式维度名(款式1/2/3的key)
+- ❌ 不填:价格、库存、重量等SKU级字段(保持为空更安全)
+
+**P行(子款式)填写**:
+- ✅ 商品标题(与M行一致)
+- ✅ 款式维度值(款式1/2/3的value)
+- ✅ 价格、商品SKU(ASIN)、库存
+- ✅ 重量、尺寸
+- ✅ 子款式图(可选)
+- ❌ 不填:描述、SEO、专辑等SPU级字段(保持为空)
+
+---
+
+## 三、属性字段处理(款式维度解析)
+
+### 3.1 问题背景
+
+亚马逊格式中,变体的"颜色/尺码"等信息**并不拆成多个列**,而是集中在 `SKU` 字符串里:
+
+```
+示例1: "Size: One Size | Color: Black"
+示例2: "Color: Red | Style: 2-Pack"
+```
+
+而店匠模板需要:
+- **M行**:`款式1/款式2/款式3` 填写**维度名**(如 Size、Color、Material)
+- **P行**:`款式1/款式2/款式3` 填写**维度值**(如 One Size、Black、Cotton)
+
+### 3.2 SKU解析逻辑
+
+```python
+def parse_sku_options(sku_text):
+ """
+ 解析 SKU 列,提取 key:value 对
+ 输入: "Size: One Size | Color: Black"
+ 输出: {"Size": "One Size", "Color": "Black"}
+ """
+ # 1. 按 | 分割
+ parts = sku_text.split("|")
+
+    result = {}  # accumulate parsed key/value pairs
+ for part in parts:
+ if ":" in part:
+ key, value = part.split(":", 1)
+ result[key.strip()] = value.strip()
+
+ return result
+```
+
+### 3.3 维度选择策略(最多3个维度)
+
+店匠模板只提供 `款式1~3` 三个维度,因此需要从多个变体中**智能选择最多3个维度**:
+
+#### 优先级规则
+
+1. **按预设优先级排序**:
+ ```python
+ PREFERRED_OPTION_KEYS = [
+ "Size", "Color", "Style", "Pattern", "Material",
+ "Flavor", "Scent", "Pack", "Pack of", ...
+ ]
+ ```
+
+2. **按出现频次排序**:统计每个key在所有变体中的出现次数
+
+3. **综合排序**:
+ ```python
+ def key_sort(k):
+ return (
+ 预设优先级(越小越优先),
+ -出现频次(越大越优先),
+ 字母顺序(作为最后排序依据)
+ )
+ ```
+
+#### 退化处理
+
+如果解析不到任何 key/value,则退化为单维度:
+- M行:`款式1 = "Variant"`
+- P行:`款式1 = ASIN`(使用ASIN作为维度值)
+
+### 3.4 维度映射示例
+
+**输入数据**(3个变体):
+```
+变体1: SKU = "Size: S | Color: Red"
+变体2: SKU = "Size: M | Color: Red"
+变体3: SKU = "Size: S | Color: Blue"
+```
+
+**解析结果**:
+- 维度统计:Size出现3次,Color出现3次
+- 选择维度:Size(优先级1)、Color(优先级2)
+- 最多3个,所以选择前2个
+
+**输出格式**:
+```
+M行:
+ 款式1 = "Size"
+ 款式2 = "Color"
+ 款式3 = ""
+
+P行1:
+ 款式1 = "S"
+ 款式2 = "Red"
+ 款式3 = ""
+
+P行2:
+ 款式1 = "M"
+ 款式2 = "Red"
+ 款式3 = ""
+
+P行3:
+ 款式1 = "S"
+ 款式2 = "Blue"
+ 款式3 = ""
+```
+
+---
+
+## 四、字段映射总览
+
+### 4.1 核心字段映射
+
+| 店匠字段 | 亚马逊字段 | 处理逻辑 |
+|---------|-----------|---------|
+| **商品spu** | `父ASIN` | 无父ASIN则用ASIN |
+| **商品SKU** | `ASIN` | 直接映射 |
+| **商品标题*** | `商品标题` | 截断至255字符 |
+| **商品图片*** | `商品主图` | URL直接映射 |
+| **商品售价*** | `prime价格($)` 或 `价格($)` | 优先prime价格 |
+| **创建时间** | `上架时间` | 日期格式转换(补齐时分秒) |
+| **商品描述** | `商品标题` + `详细参数` | HTML拼接 |
+| **专辑名称** | `大类目` | 无则取`类目路径`第一段 |
+| **标签** | `品牌,大类目,小类目` | 逗号拼接 |
+| **商品重量/重量单位** | `商品重量(单位换算)` | 解析数值和单位(g/kg/lb/oz) |
+| **尺寸信息** | `商品尺寸` | 解析前三段数字,拼成 `L,W,H`(英寸) |
+
+### 4.2 特殊字段处理
+
+#### 1. 价格处理
+```python
+price = prime价格($) or 价格($) or 9.99 # 默认值9.99
+```
+
+#### 2. 库存处理
+- 亚马逊数据源通常**没有库存**
+- 脚本默认给每个变体固定库存:**100**
+
+#### 3. 日期格式转换
+```python
+输入: "2018-05-09" 或 datetime对象
+输出: "2018-05-09 00:00:00" # 补齐时分秒
+```
+
+#### 4. 重量解析
+```python
+输入: "68.04 g" 或 "0.15 pounds"
+输出: (68.04, "g") 或 (0.15, "lb")
+```
+
+#### 5. 尺寸解析
+```python
+输入: "7.9 x 7.9 x 2 inches"
+输出: "7.9,7.9,2" # L,W,H格式
+```
+
+#### 6. SEO URL Handle生成
+```python
+输入: "Legendary Whitetails Men's Buck Camp Flannel Shirt"
+输出: "products/legendary-whitetails-mens-buck-camp-flannel-shirt"
+# 规则:转小写、去特殊字符、空格转横线
+```
+
+---
+
+## 五、重要工作内容总结
+
+### 5.1 数据结构转换
+
+1. **父子关系识别**:从扁平化的ASIN列表识别出SPU-SKU层级关系
+2. **分组策略**:按父ASIN分组,决定生成S还是M+P结构
+3. **行序保证**:确保同一SPU的M+P行连续,不被打断
+
+### 5.2 属性字段解析
+
+1. **SKU字符串解析**:从非结构化字符串中提取key:value对
+2. **维度智能选择**:从多个可能的维度中选择最重要的3个
+3. **优先级算法**:综合考虑预设优先级、出现频次、字母顺序
+
+### 5.3 字段映射与转换
+
+1. **格式转换**:日期、价格、重量、尺寸等格式标准化
+2. **默认值填充**:库存、价格等缺失字段的默认值策略
+3. **数据清洗**:标题截断、HTML转义、URL生成等
+
+### 5.4 数据质量控制
+
+1. **标题一致性检查**:确保同一SPU下所有变体标题一致
+2. **父ASIN验证**:检查父ASIN是否存在于变体列表中
+3. **错误处理**:提供配置选项决定是修正还是丢弃异常数据
+
+### 5.5 性能优化
+
+1. **批量处理**:支持多文件批量转换
+2. **文件拆分**:按最大行数自动拆分输出文件(保证同一SPU不拆分)
+3. **快速读写**:使用`iter_rows(values_only=True)`和write_only模式提升性能
+
+---
+
+## 六、关键技术难点
+
+### 6.1 维度选择算法
+
+**挑战**:从多个变体的SKU字符串中,智能选择最重要的3个维度
+
+**解决方案**:
+- 统计所有变体中每个key的出现频次
+- 结合预设优先级列表(Size > Color > Style > ...)
+- 综合排序选择前3个
+
+### 6.2 数据一致性保证
+
+**挑战**:确保同一SPU下的所有变体数据一致
+
+**解决方案**:
+- 标题一致性检查和修正
+- 父ASIN排序保证
+- M行和P行的字段分工明确
+
+### 6.3 文件拆分策略
+
+**挑战**:按最大行数拆分文件,但不能拆分同一SPU
+
+**解决方案**:
+- 先按SPU分组生成所有行
+- 拆分时以SPU为单位,不拆分单个SPU的行
+
+---
+
+## 七、扩展点
+
+### 7.1 可配置项
+
+- 库存默认值(当前100)
+- 价格默认值(当前9.99)
+- 上架/收税/物流策略(当前硬编码Y/N)
+
+### 7.2 可增强功能
+
+- **更强的多款式解析**:如果SKU字段不规范,可从`详细参数`中提取Color/Size
+- **图片策略优化**:P行可改为使用M行合并的多图(逗号拼接)
+- **元字段支持**:支持店匠的元字段导入
+
+---
+
+## 八、使用示例
+
+### 8.1 小批量验证
+
+```bash
+python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
+ --input-dir data/mai_jia_jing_ling/products_data \
+ --template docs/商品导入模板.xlsx \
+ --output data/mai_jia_jing_ling/amazon_shoplazza_import_SAMPLE.xlsx \
+ --max-files 1 --max-products 50
+```
+
+### 8.2 全量转换
+
+```bash
+python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
+ --input-dir data/mai_jia_jing_ling/products_data \
+ --template docs/商品导入模板.xlsx \
+ --output data/mai_jia_jing_ling/amazon_shoplazza_import_ALL.xlsx
+```
+
+### 8.3 自动拆分文件
+
+```bash
+python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
+ --input-dir data/mai_jia_jing_ling/products_data \
+ --template docs/商品导入模板.xlsx \
+ --output data/mai_jia_jing_ling/amazon_shoplazza_import_SPLIT.xlsx \
+ --max-rows-per-output 40000
+```
+
+---
+
+## 九、总结
+
+亚马逊格式到店匠格式的转换,核心工作包括:
+
+1. **父子款式结构转换**:从ASIN/父ASIN关系转换为M/P/S结构
+2. **属性字段解析**:从SKU字符串中提取并智能选择款式维度
+3. **字段映射与转换**:40+个字段的格式转换和默认值处理
+4. **数据质量控制**:一致性检查、错误处理、数据清洗
+5. **性能优化**:批量处理、文件拆分、快速读写
+
+这是一个典型的**数据格式转换ETL任务**,涉及数据结构重组、字符串解析、智能算法选择等多个技术领域。
diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md
index f8c7065..04a9644 100644
--- a/docs/常用查询 - ES.md
+++ b/docs/常用查询 - ES.md
@@ -6,7 +6,8 @@
### 1. 根据 tenant_id / spu_id 查询
curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
- "size": 1,
+ "size": 100,
+ "_source": ["title_zh", "title_en"],
"query": {
"bool": {
"filter": [
@@ -18,7 +19,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/
curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
"size": 1,
- "_source": ["_id", "*"],
+ "_source": ["title_zh", "title_en"],
"query": {
"bool": {
"must": [
diff --git a/frontend/index.html b/frontend/index.html
index 2b96b20..b7e9077 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -74,7 +74,7 @@
-
+
diff --git a/query/query_parser.py b/query/query_parser.py
index c71c4db..f294efd 100644
--- a/query/query_parser.py
+++ b/query/query_parser.py
@@ -175,7 +175,7 @@ class QueryParser:
logger = context.logger if context else None
if logger:
logger.info(
- f"开始查询解析 | 原查询: '{query}' | 生成向量: {generate_vector}",
+ f"Starting query parsing | Original query: '{query}' | Generate vector: {generate_vector}",
extra={'reqid': context.reqid, 'uid': context.uid}
)
@@ -193,13 +193,13 @@ class QueryParser:
# Stage 1: Normalize
normalized = self.normalizer.normalize(query)
- log_debug(f"标准化完成 | '{query}' -> '{normalized}'")
+ log_debug(f"Normalization completed | '{query}' -> '{normalized}'")
if context:
context.store_intermediate_result('normalized_query', normalized)
# Extract domain if present (e.g., "brand:Nike" -> domain="brand", query="Nike")
domain, query_text = self.normalizer.extract_domain_query(normalized)
- log_debug(f"域提取 | 域: '{domain}', 查询: '{query_text}'")
+ log_debug(f"Domain extraction | Domain: '{domain}', Query: '{query_text}'")
if context:
context.store_intermediate_result('extracted_domain', domain)
context.store_intermediate_result('domain_query', query_text)
@@ -209,18 +209,18 @@ class QueryParser:
if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
rewritten = self.rewriter.rewrite(query_text)
if rewritten != query_text:
- log_info(f"查询重写 | '{query_text}' -> '{rewritten}'")
+ log_info(f"Query rewritten | '{query_text}' -> '{rewritten}'")
query_text = rewritten
if context:
context.store_intermediate_result('rewritten_query', rewritten)
- context.add_warning(f"查询被重写: {query_text}")
+ context.add_warning(f"Query was rewritten: {query_text}")
# Stage 3: Language detection
detected_lang = self.language_detector.detect(query_text)
# Use default language if detection failed (None or "unknown")
if not detected_lang or detected_lang == "unknown":
detected_lang = self.config.query_config.default_language
- log_info(f"语言检测 | 检测到语言: {detected_lang}")
+ log_info(f"Language detection | Detected language: {detected_lang}")
if context:
context.store_intermediate_result('detected_language', detected_lang)
@@ -286,9 +286,9 @@ class QueryParser:
translations = {k: v for k, v in translations.items() if v is not None}
if translations:
- log_info(f"翻译完成(缓存命中) | 结果: {translations}")
+ log_info(f"Translation completed (cache hit) | Query text: '{query_text}' | Results: {translations}")
if translation_futures:
- log_debug(f"翻译进行中,等待结果... | 语言: {list(translation_futures.keys())}")
+ log_debug(f"Translation in progress, waiting for results... | Query text: '{query_text}' | Languages: {list(translation_futures.keys())}")
if context:
context.store_intermediate_result('translations', translations)
@@ -297,7 +297,7 @@ class QueryParser:
context.store_intermediate_result(f'translation_{lang}', translation)
except Exception as e:
- error_msg = f"翻译失败 | 错误: {str(e)}"
+ error_msg = f"Translation failed | Error: {str(e)}"
log_info(error_msg)
if context:
context.add_warning(error_msg)
@@ -307,8 +307,8 @@ class QueryParser:
token_count = self._get_token_count(query_text)
is_short_query, is_long_query = self._analyze_query_type(query_text, token_count)
- log_debug(f"查询分析 | 关键词: {keywords} | token数: {token_count} | "
- f"短查询: {is_short_query} | 长查询: {is_long_query}")
+ log_debug(f"Query analysis | Keywords: {keywords} | Token count: {token_count} | "
+ f"Short query: {is_short_query} | Long query: {is_long_query}")
if context:
context.store_intermediate_result('keywords', keywords)
context.store_intermediate_result('token_count', token_count)
@@ -328,7 +328,7 @@ class QueryParser:
encoding_executor = None
if should_generate_embedding:
try:
- log_debug("开始生成查询向量(异步)")
+ log_debug("Starting query vector generation (async)")
# Submit encoding task to thread pool for async execution
encoding_executor = ThreadPoolExecutor(max_workers=1)
def _encode_query_vector() -> Optional[np.ndarray]:
@@ -341,7 +341,7 @@ class QueryParser:
_encode_query_vector
)
except Exception as e:
- error_msg = f"查询向量生成任务提交失败 | 错误: {str(e)}"
+ error_msg = f"Query vector generation task submission failed | Error: {str(e)}"
log_info(error_msg)
if context:
context.add_warning(error_msg)
@@ -350,7 +350,7 @@ class QueryParser:
# Wait for all async tasks to complete (translation and embedding)
if translation_futures or embedding_future:
- log_debug("等待异步任务完成...")
+ log_debug("Waiting for async tasks to complete...")
# Collect all futures with their identifiers
all_futures = []
@@ -371,22 +371,22 @@ class QueryParser:
if task_type == 'translation':
if result:
translations[lang] = result
- log_info(f"翻译完成 | {lang}: {result}")
+ log_info(f"Translation completed | Query text: '{query_text}' | Target language: {lang} | Translation result: '{result}'")
if context:
context.store_intermediate_result(f'translation_{lang}', result)
elif task_type == 'embedding':
query_vector = result
if query_vector is not None:
- log_debug(f"查询向量生成完成 | 形状: {query_vector.shape}")
+ log_debug(f"Query vector generation completed | Shape: {query_vector.shape}")
if context:
context.store_intermediate_result('query_vector_shape', query_vector.shape)
else:
- log_info("查询向量生成完成但结果为空(None),将按无向量处理")
+ log_info("Query vector generation completed but result is None, will process without vector")
except Exception as e:
if task_type == 'translation':
- error_msg = f"翻译失败 | 语言: {lang} | 错误: {str(e)}"
+ error_msg = f"Translation failed | Language: {lang} | Error: {str(e)}"
else:
- error_msg = f"查询向量生成失败 | 错误: {str(e)}"
+ error_msg = f"Query vector generation failed | Error: {str(e)}"
log_info(error_msg)
if context:
context.add_warning(error_msg)
@@ -416,15 +416,15 @@ class QueryParser:
if context and hasattr(context, 'logger'):
context.logger.info(
- f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | "
- f"语言: {detected_lang} | 域: {domain} | "
- f"翻译数量: {len(translations)} | 向量: {'是' if query_vector is not None else '否'}",
+ f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
+ f"Language: {detected_lang} | Domain: {domain} | "
+ f"Translation count: {len(translations)} | Vector: {'yes' if query_vector is not None else 'no'}",
extra={'reqid': context.reqid, 'uid': context.uid}
)
else:
logger.info(
- f"查询解析完成 | 原查询: '{query}' | 最终查询: '{rewritten or query_text}' | "
- f"语言: {detected_lang} | 域: {domain}"
+ f"Query parsing completed | Original query: '{query}' | Final query: '{rewritten or query_text}' | "
+ f"Language: {detected_lang} | Domain: {domain}"
)
return result
diff --git a/query/translator.py b/query/translator.py
index 16f3c99..7db2555 100644
--- a/query/translator.py
+++ b/query/translator.py
@@ -140,7 +140,10 @@ class Translator:
return text
if target_lang == 'zh' and (self._contains_chinese(text) or self._is_pure_number(text)):
- logger.info(f"[Translator] Text contains Chinese or is pure number, skipping translation: '{text[:50]}...'")
+ logger.info(
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Result: Skip translation (contains Chinese or pure number)"
+ )
return text
# Use provided context or default context
@@ -158,33 +161,39 @@ class Translator:
cached = self._get_cached_translation_redis(text, target_lang, source_lang, translation_context, prompt)
if cached:
logger.info(
- f"[Translator] Cache hit: source={source_lang or 'auto'} "
- f"target={target_lang} | text='{text[:80]}...' -> '{cached[:80]}...'"
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Result: '{cached}' | Source: Cache hit"
)
return cached
# If no API key, return mock translation (for testing)
if not self.api_key:
- logger.debug(f"[Translator] No API key, returning original text (mock mode)")
+ logger.info(
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Result: '{text}' | Source: Mock mode (no API key)"
+ )
return text
# Translate using DeepL (Pro endpoint only, no free fallback)
logger.info(
- f"[Translator] Translating text: target={target_lang}, "
- f"source={source_lang or 'auto'}, context={translation_context}, "
- f"prompt={'yes' if prompt else 'no'} | text='{text[:80]}...'"
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Context: {translation_context} | "
+ f"Prompt: {'yes' if prompt else 'no'} | Status: Starting translation"
)
result = self._translate_deepl(text, target_lang, source_lang, translation_context, prompt)
# If still failed, return original text with warning
if result is None:
- logger.warning(f"[Translator] Translation failed for '{text[:50]}...', returning original text")
+ logger.warning(
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Result: '{text}' | Status: Translation failed, returning original"
+ )
result = text
-
- logger.info(
- f"[Translator] Translation completed: source={source_lang or 'auto'} "
- f"target={target_lang} | original='{text[:80]}...' -> '{result[:80]}...'"
- )
+ else:
+ logger.info(
+ f"[Translator] Translation request | Original text: '{text}' | Target language: {target_lang} | "
+ f"Source language: {source_lang or 'auto'} | Result: '{result}' | Status: Translation successful"
+ )
# Cache result
if result and self.use_cache and self.redis_client:
@@ -265,16 +274,29 @@ class Translator:
translated_text = self._extract_term_from_translation(
translated_text, text, target_code
)
+ logger.debug(
+ f"[Translator] DeepL API response success | Original text: '{text}' | Target language: {target_code} | "
+ f"Translation result: '{translated_text}'"
+ )
return translated_text
else:
- logger.error(f"[Translator] DeepL API error: {response.status_code} - {response.text}")
+ logger.error(
+ f"[Translator] DeepL API error | Original text: '{text}' | Target language: {target_code} | "
+ f"Status code: {response.status_code} | Error message: {response.text}"
+ )
return None
except requests.Timeout:
- logger.warning(f"[Translator] Translation request timed out")
+ logger.warning(
+ f"[Translator] DeepL API request timeout | Original text: '{text}' | Target language: {target_code} | "
+ f"Timeout: {self.timeout}s"
+ )
return None
except Exception as e:
- logger.error(f"[Translator] Translation failed: {e}", exc_info=True)
+ logger.error(
+ f"[Translator] DeepL API request exception | Original text: '{text}' | Target language: {target_code} | "
+ f"Error: {e}", exc_info=True
+ )
return None
# NOTE: _translate_deepl_free is intentionally not implemented.
@@ -443,15 +465,18 @@ class Translator:
if value:
# Sliding expiration: reset expiration time on access
self.redis_client.expire(cache_key, self.expire_time)
- logger.info(
- f"[Translator] Redis cache hit: key={cache_key}, "
- f"target={target_lang}, value='{value[:80]}...'"
+ logger.debug(
+ f"[Translator] Redis cache hit | Original text: '{text}' | Target language: {target_lang} | "
+ f"Cache key: {cache_key} | Translation result: '{value}'"
)
return value
- logger.debug(f"[Translator] Redis cache miss: key={cache_key}, target={target_lang}")
+ logger.debug(
+ f"[Translator] Redis cache miss | Original text: '{text}' | Target language: {target_lang} | "
+ f"Cache key: {cache_key}"
+ )
return None
except Exception as e:
- logger.error(f"[Translator] Redis error during get translation cache: '{text}' {target_lang}: {e}")
+ logger.error(f"[Translator] Redis error during get translation cache | Original text: '{text}' | Target language: {target_lang} | Error: {e}")
return None
def _set_cached_translation_redis(
@@ -470,12 +495,15 @@ class Translator:
try:
cache_key = f"{self.cache_prefix}:{target_lang.upper()}:{text}"
self.redis_client.setex(cache_key, self.expire_time, translation)
- logger.info(
- f"[Translator] Cached translation: key={cache_key}, "
- f"target={target_lang}, value='{translation}...'"
+ logger.debug(
+ f"[Translator] Redis cache write | Original text: '{text}' | Target language: {target_lang} | "
+ f"Cache key: {cache_key} | Translation result: '{translation}'"
)
except Exception as e:
- logger.error(f"[Translator] Redis error during set translation cache: '{text}' {target_lang}: {e}")
+ logger.error(
+ f"[Translator] Redis cache write failed | Original text: '{text}' | Target language: {target_lang} | "
+ f"Error: {e}"
+ )
def _translate_async(
self,
diff --git a/search/es_query_builder.py b/search/es_query_builder.py
index 1405416..dbe44fa 100644
--- a/search/es_query_builder.py
+++ b/search/es_query_builder.py
@@ -26,7 +26,8 @@ class ESQueryBuilder:
source_fields: Optional[List[str]] = None,
function_score_config: Optional[FunctionScoreConfig] = None,
enable_multilang_search: bool = True,
- default_language: str = "zh"
+ default_language: str = "zh",
+ knn_boost: float = 0.25
):
"""
Initialize query builder.
@@ -40,6 +41,7 @@ class ESQueryBuilder:
function_score_config: Function score configuration
enable_multilang_search: Enable multi-language search using translations
default_language: Default language to use when detection fails or returns "unknown"
+ knn_boost: Boost value for KNN (embedding recall)
"""
self.index_name = index_name
self.match_fields = match_fields
@@ -49,6 +51,7 @@ class ESQueryBuilder:
self.function_score_config = function_score_config
self.enable_multilang_search = enable_multilang_search
self.default_language = default_language
+ self.knn_boost = knn_boost
def _split_filters_for_faceting(
self,
@@ -221,7 +224,7 @@ class ESQueryBuilder:
"query_vector": query_vector.tolist(),
"k": knn_k,
"num_candidates": knn_num_candidates,
- "boost": 0.2 # Lower boost for embedding recall
+ "boost": self.knn_boost # Lower boost for embedding recall
}
es_query["knn"] = knn_clause
@@ -458,7 +461,7 @@ class ESQueryBuilder:
"_name": "base_query",
"fields": match_fields,
"minimum_should_match": "75%",
- "operator": "AND",
+ # "operator": "AND",
"query": query_text,
"tie_breaker": tie_breaker_base_query
}
@@ -472,7 +475,7 @@ class ESQueryBuilder:
"multi_match": {
"query": translations['zh'],
"fields": zh_fields,
- "operator": "AND",
+ # "operator": "AND",
"minimum_should_match": "75%",
"tie_breaker": tie_breaker_base_query,
"boost": 0.4,
@@ -486,7 +489,7 @@ class ESQueryBuilder:
"multi_match": {
"query": translations['en'],
"fields": en_fields,
- "operator": "AND",
+ # "operator": "AND",
"minimum_should_match": "75%",
"tie_breaker": tie_breaker_base_query,
"boost": 0.4,
@@ -532,7 +535,7 @@ class ESQueryBuilder:
"multi_match": {
"query": keywords,
"fields": core_fields,
- "operator": "AND",
+ # "operator": "AND",
"tie_breaker": tie_breaker_keywords,
"boost": 0.1,
"_name": "keywords_query"
diff --git a/search/searcher.py b/search/searcher.py
index ae8e83e..bd746c1 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -115,7 +115,8 @@ class Searcher:
source_fields=self.source_fields,
function_score_config=self.config.function_score,
enable_multilang_search=self.config.query_config.enable_multilang_search,
- default_language=self.config.query_config.default_language
+ default_language=self.config.query_config.default_language,
+ knn_boost=self.config.query_config.knn_boost
)
def search(
--
libgit2 0.21.2