Compare View

switch
from
...
to
 
Commits (3)
@@ -281,8 +281,8 @@ class AppConfigLoader: @@ -281,8 +281,8 @@ class AppConfigLoader:
281 ["title", "brief", "vendor", "category_name_text"], 281 ["title", "brief", "vendor", "category_name_text"],
282 ) 282 )
283 ), 283 ),
284 - base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "75%")),  
285 - translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "75%")), 284 + base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
  285 + translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 translation_boost=float(text_strategy.get("translation_boost", 0.4)), 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287 translation_boost_when_source_missing=float( 287 translation_boost_when_source_missing=float(
288 text_strategy.get("translation_boost_when_source_missing", 1.0) 288 text_strategy.get("translation_boost_when_source_missing", 1.0)
@@ -51,8 +51,8 @@ class QueryConfig: @@ -51,8 +51,8 @@ class QueryConfig:
51 core_multilingual_fields: List[str] = field( 51 core_multilingual_fields: List[str] = field(
52 default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] 52 default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
53 ) 53 )
54 - base_minimum_should_match: str = "75%"  
55 - translation_minimum_should_match: str = "75%" 54 + base_minimum_should_match: str = "70%"
  55 + translation_minimum_should_match: str = "70%"
56 translation_boost: float = 0.4 56 translation_boost: float = 0.4
57 translation_boost_when_source_missing: float = 1.0 57 translation_boost_when_source_missing: float = 1.0
58 source_boost_when_missing: float = 0.6 58 source_boost_when_missing: float = 0.6
@@ -236,14 +236,19 @@ config/environments/<env>.yaml @@ -236,14 +236,19 @@ config/environments/<env>.yaml
236 236
237 237
238 238
  239 +筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑
239 240
240 241
  242 +引入图片的相关性:
  243 +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度?
  244 +1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。
  245 +2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,向量细分到 SKU 维度,可能价值不大,性价比偏低
241 246
242 247
243 248
244 249
245 -  
246 - 250 +无结果重查
  251 +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长)
247 252
248 253
249 254
docs/常用查询 - ES.md
@@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search @@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search
654 } 654 }
655 } 655 }
656 } 656 }
  657 +
  658 +
  659 +检查某个字段是否存在
  660 +curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \
  661 + 'http://localhost:9200/search_products_tenant_163/_count' \
  662 + -H 'Content-Type: application/json' \
  663 + -d '{
  664 + "query": {
  665 + "bool": {
  666 + "filter": [
  667 + { "exists": { "field": "title_embedding" } }
  668 + ]
  669 + }
  670 + }
  671 + }'
657 \ No newline at end of file 672 \ No newline at end of file
embeddings/server.py
@@ -14,7 +14,6 @@ import time @@ -14,7 +14,6 @@ import time
14 import uuid 14 import uuid
15 from collections import deque 15 from collections import deque
16 from dataclasses import dataclass 16 from dataclasses import dataclass
17 -from logging.handlers import TimedRotatingFileHandler  
18 from typing import Any, Dict, List, Optional 17 from typing import Any, Dict, List, Optional
19 18
20 import numpy as np 19 import numpy as np
@@ -44,9 +43,7 @@ def configure_embedding_logging() -> None: @@ -44,9 +43,7 @@ def configure_embedding_logging() -> None:
44 return 43 return
45 44
46 log_dir = pathlib.Path("logs") 45 log_dir = pathlib.Path("logs")
47 - verbose_dir = log_dir / "verbose"  
48 log_dir.mkdir(exist_ok=True) 46 log_dir.mkdir(exist_ok=True)
49 - verbose_dir.mkdir(parents=True, exist_ok=True)  
50 47
51 log_level = os.getenv("LOG_LEVEL", "INFO").upper() 48 log_level = os.getenv("LOG_LEVEL", "INFO").upper()
52 numeric_level = getattr(logging, log_level, logging.INFO) 49 numeric_level = getattr(logging, log_level, logging.INFO)
@@ -56,47 +53,18 @@ def configure_embedding_logging() -> None: @@ -56,47 +53,18 @@ def configure_embedding_logging() -> None:
56 request_filter = _DefaultRequestIdFilter() 53 request_filter = _DefaultRequestIdFilter()
57 54
58 root_logger.setLevel(numeric_level) 55 root_logger.setLevel(numeric_level)
59 -  
60 - file_handler = TimedRotatingFileHandler(  
61 - filename=log_dir / "embedding_api.log",  
62 - when="midnight",  
63 - interval=1,  
64 - backupCount=30,  
65 - encoding="utf-8",  
66 - )  
67 - file_handler.setLevel(numeric_level)  
68 - file_handler.setFormatter(formatter)  
69 - file_handler.addFilter(request_filter)  
70 - root_logger.addHandler(file_handler)  
71 -  
72 - error_handler = TimedRotatingFileHandler(  
73 - filename=log_dir / "embedding_api_error.log",  
74 - when="midnight",  
75 - interval=1,  
76 - backupCount=30,  
77 - encoding="utf-8",  
78 - )  
79 - error_handler.setLevel(logging.ERROR)  
80 - error_handler.setFormatter(formatter)  
81 - error_handler.addFilter(request_filter)  
82 - root_logger.addHandler(error_handler) 56 + root_logger.handlers.clear()
  57 + stream_handler = logging.StreamHandler()
  58 + stream_handler.setLevel(numeric_level)
  59 + stream_handler.setFormatter(formatter)
  60 + stream_handler.addFilter(request_filter)
  61 + root_logger.addHandler(stream_handler)
83 62
84 verbose_logger = logging.getLogger("embedding.verbose") 63 verbose_logger = logging.getLogger("embedding.verbose")
85 verbose_logger.setLevel(numeric_level) 64 verbose_logger.setLevel(numeric_level)
86 verbose_logger.handlers.clear() 65 verbose_logger.handlers.clear()
87 - verbose_logger.propagate = False  
88 -  
89 - verbose_handler = TimedRotatingFileHandler(  
90 - filename=verbose_dir / "embedding_verbose.log",  
91 - when="midnight",  
92 - interval=1,  
93 - backupCount=30,  
94 - encoding="utf-8",  
95 - )  
96 - verbose_handler.setLevel(numeric_level)  
97 - verbose_handler.setFormatter(formatter)  
98 - verbose_handler.addFilter(request_filter)  
99 - verbose_logger.addHandler(verbose_handler) 66 + # Consolidate verbose logs into the main embedding log stream.
  67 + verbose_logger.propagate = True
100 68
101 root_logger._embedding_logging_configured = True # type: ignore[attr-defined] 69 root_logger._embedding_logging_configured = True # type: ignore[attr-defined]
102 70
frontend/static/css/style.css
@@ -379,7 +379,7 @@ body { @@ -379,7 +379,7 @@ body {
379 margin-top: 8px; 379 margin-top: 8px;
380 } 380 }
381 381
382 -.product-debug-inline-es-btn { 382 +.product-debug-inline-result-btn {
383 font-family: inherit; 383 font-family: inherit;
384 font-size: 12px; 384 font-size: 12px;
385 padding: 4px 10px; 385 padding: 4px 10px;
@@ -390,27 +390,22 @@ body { @@ -390,27 +390,22 @@ body {
390 cursor: pointer; 390 cursor: pointer;
391 } 391 }
392 392
393 -.product-debug-inline-es-btn:hover { 393 +.product-debug-inline-result-btn:hover {
394 background: #f0f0f0; 394 background: #f0f0f0;
395 border-color: #bbb; 395 border-color: #bbb;
396 } 396 }
397 397
398 -.product-debug--es-expanded { 398 +.product-debug--result-expanded {
399 max-height: min(70vh, 720px); 399 max-height: min(70vh, 720px);
400 } 400 }
401 401
402 -.product-es-doc-panel { 402 +.product-result-doc-panel {
403 margin-top: 10px; 403 margin-top: 10px;
404 padding-top: 8px; 404 padding-top: 8px;
405 border-top: 1px dashed #e8e8e8; 405 border-top: 1px dashed #e8e8e8;
406 } 406 }
407 407
408 -.product-es-doc-panel-status {  
409 - font-size: 12px;  
410 - color: #888;  
411 -}  
412 -  
413 -.product-es-doc-pre { 408 +.product-result-doc-pre {
414 margin: 6px 0 0; 409 margin: 6px 0 0;
415 padding: 10px; 410 padding: 10px;
416 background: #f5f5f5; 411 background: #f5f5f5;
frontend/static/js/app.js
@@ -68,25 +68,25 @@ function initializeApp() { @@ -68,25 +68,25 @@ function initializeApp() {
68 // 初始化租户下拉框和分面面板 68 // 初始化租户下拉框和分面面板
69 console.log('Initializing app...'); 69 console.log('Initializing app...');
70 initTenantSelect(); 70 initTenantSelect();
71 - setupProductGridEsDocToggle(); 71 + setupProductGridResultDocToggle();
72 const searchInput = document.getElementById('searchInput'); 72 const searchInput = document.getElementById('searchInput');
73 if (searchInput) { 73 if (searchInput) {
74 searchInput.focus(); 74 searchInput.focus();
75 } 75 }
76 } 76 }
77 77
78 -/** Delegated handler: toggle inline ES raw response under each result card (survives innerHTML refresh on re-search). */  
79 -function setupProductGridEsDocToggle() { 78 +/** Delegated handler: toggle inline current result JSON under each result card (survives innerHTML refresh on re-search). */
  79 +function setupProductGridResultDocToggle() {
80 const grid = document.getElementById('productGrid'); 80 const grid = document.getElementById('productGrid');
81 - if (!grid || grid.dataset.esDocToggleBound === '1') { 81 + if (!grid || grid.dataset.resultDocToggleBound === '1') {
82 return; 82 return;
83 } 83 }
84 - grid.dataset.esDocToggleBound = '1';  
85 - grid.addEventListener('click', onProductGridEsDocToggleClick); 84 + grid.dataset.resultDocToggleBound = '1';
  85 + grid.addEventListener('click', onProductGridResultDocToggleClick);
86 } 86 }
87 87
88 -async function onProductGridEsDocToggleClick(event) {  
89 - const btn = event.target.closest('[data-action="toggle-es-inline-doc"]'); 88 +function onProductGridResultDocToggleClick(event) {
  89 + const btn = event.target.closest('[data-action="toggle-result-inline-doc"]');
90 if (!btn) { 90 if (!btn) {
91 return; 91 return;
92 } 92 }
@@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) { @@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) {
95 if (!debugRoot) { 95 if (!debugRoot) {
96 return; 96 return;
97 } 97 }
98 - const panel = debugRoot.querySelector('.product-es-doc-panel');  
99 - const pre = debugRoot.querySelector('.product-es-doc-pre');  
100 - const statusEl = debugRoot.querySelector('.product-es-doc-panel-status');  
101 - if (!panel || !pre || !statusEl) { 98 + const panel = debugRoot.querySelector('.product-result-doc-panel');
  99 + const pre = debugRoot.querySelector('.product-result-doc-pre');
  100 + if (!panel || !pre) {
102 return; 101 return;
103 } 102 }
104 103
105 - const spuId = btn.getAttribute('data-spu-id') || '';  
106 - const tenantId = getTenantId();  
107 - const url = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`;  
108 -  
109 - if (debugRoot.dataset.esInlineOpen === '1') { 104 + if (debugRoot.dataset.resultInlineOpen === '1') {
110 panel.setAttribute('hidden', ''); 105 panel.setAttribute('hidden', '');
111 - debugRoot.classList.remove('product-debug--es-expanded');  
112 - debugRoot.dataset.esInlineOpen = '0';  
113 - btn.textContent = '在结果中显示 ES 文档'; 106 + debugRoot.classList.remove('product-debug--result-expanded');
  107 + debugRoot.dataset.resultInlineOpen = '0';
  108 + btn.textContent = '在结果中显示当前结果数据';
114 return; 109 return;
115 } 110 }
116 111
117 panel.removeAttribute('hidden'); 112 panel.removeAttribute('hidden');
118 - debugRoot.classList.add('product-debug--es-expanded');  
119 - debugRoot.dataset.esInlineOpen = '1';  
120 - btn.textContent = '隐藏 ES 文档';  
121 -  
122 - if (pre.textContent.length > 0) {  
123 - panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });  
124 - return;  
125 - }  
126 -  
127 - statusEl.style.display = '';  
128 - statusEl.textContent = '加载中…';  
129 - pre.style.display = 'none';  
130 -  
131 - try {  
132 - const response = await fetch(url);  
133 - if (!response.ok) {  
134 - const errText = await response.text();  
135 - throw new Error(`HTTP ${response.status}: ${errText.slice(0, 200)}`);  
136 - }  
137 - const data = await response.json();  
138 - pre.textContent = customStringify(data);  
139 - statusEl.style.display = 'none';  
140 - pre.style.display = 'block';  
141 - } catch (err) {  
142 - console.error('ES doc fetch failed', err);  
143 - statusEl.textContent = `加载失败: ${err.message || err}`;  
144 - pre.style.display = 'none'; 113 + debugRoot.classList.add('product-debug--result-expanded');
  114 + debugRoot.dataset.resultInlineOpen = '1';
  115 + btn.textContent = '隐藏当前结果数据';
  116 + if (pre.textContent.length === 0) {
  117 + pre.textContent = btn.getAttribute('data-result-json') || '{}';
145 } 118 }
146 -  
147 panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); 119 panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
148 } 120 }
149 121
@@ -213,7 +185,7 @@ function initTenantSelect() { @@ -213,7 +185,7 @@ function initTenantSelect() {
213 }); 185 });
214 // 设置默认值(仅当输入框为空时) 186 // 设置默认值(仅当输入框为空时)
215 if (!tenantSelect.value.trim()) { 187 if (!tenantSelect.value.trim()) {
216 - tenantSelect.value = availableTenants.includes('170') ? '170' : availableTenants[0]; 188 + tenantSelect.value = availableTenants.includes('0') ? '0' : availableTenants[0];
217 } 189 }
218 } 190 }
219 191
@@ -462,6 +434,7 @@ function displayResults(data) { @@ -462,6 +434,7 @@ function displayResults(data) {
462 }); 434 });
463 } 435 }
464 436
  437 + const resultJson = customStringify(result);
465 const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; 438 const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`;
466 439
467 debugHtml = ` 440 debugHtml = `
@@ -475,18 +448,17 @@ function displayResults(data) { @@ -475,18 +448,17 @@ function displayResults(data) {
475 <div class="product-debug-line">Fused score: ${fusedScore}</div> 448 <div class="product-debug-line">Fused score: ${fusedScore}</div>
476 ${titleLines} 449 ${titleLines}
477 <div class="product-debug-actions"> 450 <div class="product-debug-actions">
478 - <button type="button" class="product-debug-inline-es-btn"  
479 - data-action="toggle-es-inline-doc"  
480 - data-spu-id="${escapeAttr(String(spuId || ''))}">  
481 - 在结果中显示 ES 文档 451 + <button type="button" class="product-debug-inline-result-btn"
  452 + data-action="toggle-result-inline-doc"
  453 + data-result-json="${escapeAttr(resultJson)}">
  454 + 在结果中显示当前结果数据
482 </button> 455 </button>
483 <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer"> 456 <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer">
484 查看 ES 原始文档 457 查看 ES 原始文档
485 </a> 458 </a>
486 </div> 459 </div>
487 - <div class="product-es-doc-panel" hidden>  
488 - <div class="product-es-doc-panel-status"></div>  
489 - <pre class="product-es-doc-pre"></pre> 460 + <div class="product-result-doc-panel" hidden>
  461 + <pre class="product-result-doc-pre"></pre>
490 </div> 462 </div>
491 </div> 463 </div>
492 `; 464 `;
query/language_detector.py
1 """ 1 """
2 Language detection utility. 2 Language detection utility.
3 3
4 -Detects language of short e-commerce queries with script checks + lightweight  
5 -Latin-language scoring (de/fr/es/it/pt/nl/en). 4 +Script-first rules for CJK and other non-Latin scripts, then Lingua
  5 +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
6 """ 6 """
7 7
8 -from typing import Dict, List 8 +from __future__ import annotations
  9 +
  10 +from typing import Dict, Optional
9 import re 11 import re
10 12
  13 +from lingua import Language, LanguageDetectorBuilder
  14 +
  15 +_LINGUA_TO_CODE: Dict[Language, str] = {
  16 + Language.CHINESE: "zh",
  17 + Language.ENGLISH: "en",
  18 + Language.JAPANESE: "ja",
  19 + Language.KOREAN: "ko",
  20 + Language.GERMAN: "de",
  21 + Language.FRENCH: "fr",
  22 + Language.SPANISH: "es",
  23 + Language.ITALIAN: "it",
  24 + Language.PORTUGUESE: "pt",
  25 + Language.DUTCH: "nl",
  26 + Language.RUSSIAN: "ru",
  27 + Language.ARABIC: "ar",
  28 + Language.HINDI: "hi",
  29 + Language.HEBREW: "he",
  30 + Language.THAI: "th",
  31 +}
  32 +
  33 +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())
  34 +
  35 +_lingua_detector: Optional[object] = None
  36 +
  37 +
  38 +def _get_lingua_detector():
  39 + global _lingua_detector
  40 + if _lingua_detector is None:
  41 + _lingua_detector = LanguageDetectorBuilder.from_languages(
  42 + *_LINGUA_LANGUAGES
  43 + ).build()
  44 + return _lingua_detector
  45 +
11 46
12 class LanguageDetector: 47 class LanguageDetector:
13 - """Rule-based language detector for common e-commerce query languages.""" 48 + """Language detector: script hints + Lingua for Latin-family queries."""
14 49
15 def __init__(self): 50 def __init__(self):
16 self._re_zh = re.compile(r"[\u4e00-\u9fff]") 51 self._re_zh = re.compile(r"[\u4e00-\u9fff]")
@@ -21,47 +56,6 @@ class LanguageDetector: @@ -21,47 +56,6 @@ class LanguageDetector:
21 self._re_hi = re.compile(r"[\u0900-\u097f]") 56 self._re_hi = re.compile(r"[\u0900-\u097f]")
22 self._re_he = re.compile(r"[\u0590-\u05ff]") 57 self._re_he = re.compile(r"[\u0590-\u05ff]")
23 self._re_th = re.compile(r"[\u0e00-\u0e7f]") 58 self._re_th = re.compile(r"[\u0e00-\u0e7f]")
24 - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")  
25 -  
26 - # Stopwords + e-commerce terms for Latin-family disambiguation.  
27 - self._latin_lexicons: Dict[str, set] = {  
28 - "en": {  
29 - "the", "and", "for", "with", "new", "women", "men", "kids",  
30 - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",  
31 - },  
32 - "de": {  
33 - "der", "die", "das", "und", "mit", "für", "damen", "herren",  
34 - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",  
35 - },  
36 - "fr": {  
37 - "le", "la", "les", "et", "avec", "pour", "femme", "homme",  
38 - "enfant", "chaussures", "robe", "chemise", "veste", "sac",  
39 - },  
40 - "es": {  
41 - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",  
42 - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",  
43 - },  
44 - "it": {  
45 - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",  
46 - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",  
47 - },  
48 - "pt": {  
49 - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",  
50 - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",  
51 - },  
52 - "nl": {  
53 - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",  
54 - "schoenen", "jurk", "overhemd", "jas", "tas",  
55 - },  
56 - }  
57 - self._diacritic_weights: Dict[str, Dict[str, int]] = {  
58 - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},  
59 - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},  
60 - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},  
61 - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},  
62 - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},  
63 - "nl": {"ij": 2},  
64 - }  
65 59
66 def detect(self, text: str) -> str: 60 def detect(self, text: str) -> str:
67 """ 61 """
@@ -71,9 +65,9 @@ class LanguageDetector: @@ -71,9 +65,9 @@ class LanguageDetector:
71 """ 65 """
72 if not text or not text.strip(): 66 if not text or not text.strip():
73 return "unknown" 67 return "unknown"
74 - q = text.strip().lower() 68 + q = text.strip()
75 69
76 - # Script-first detection for non-Latin languages. 70 + # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
77 if self._re_ja_kana.search(q): 71 if self._re_ja_kana.search(q):
78 return "ja" 72 return "ja"
79 if self._re_ko.search(q): 73 if self._re_ko.search(q):
@@ -91,48 +85,11 @@ class LanguageDetector: @@ -91,48 +85,11 @@ class LanguageDetector:
91 if self._re_th.search(q): 85 if self._re_th.search(q):
92 return "th" 86 return "th"
93 87
94 - # Latin-family scoring.  
95 - tokens = self._re_latin_word.findall(q)  
96 - if not tokens: 88 + detected = _get_lingua_detector().detect_language_of(q)
  89 + if detected is None:
97 return "unknown" 90 return "unknown"
98 -  
99 - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}  
100 - scores["en"] = scores.get("en", 0.0)  
101 - token_set = set(tokens)  
102 -  
103 - # Lexicon matches  
104 - for lang, lex in self._latin_lexicons.items():  
105 - overlap = len(token_set & lex)  
106 - if overlap:  
107 - scores[lang] += overlap * 2.0  
108 -  
109 - # Diacritics / orthographic hints  
110 - for lang, hints in self._diacritic_weights.items():  
111 - for marker, weight in hints.items():  
112 - if marker in q:  
113 - scores[lang] += weight  
114 -  
115 - # Light suffix hints for common product words  
116 - for t in tokens:  
117 - if t.endswith("ung") or t.endswith("chen"):  
118 - scores["de"] += 0.6  
119 - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):  
120 - scores["es"] += 0.6  
121 - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):  
122 - scores["it"] += 0.6  
123 - if t.endswith("ção") or t.endswith("mente"):  
124 - scores["pt"] += 0.6  
125 - if t.endswith("ment") or t.endswith("eau"):  
126 - scores["fr"] += 0.5  
127 -  
128 - # Fallback preference: English for pure Latin short tokens.  
129 - scores["en"] += 0.2  
130 -  
131 - best_lang = max(scores.items(), key=lambda x: x[1])[0]  
132 - best_score = scores[best_lang]  
133 - if best_score <= 0:  
134 - return "en"  
135 - return best_lang 91 + code = _LINGUA_TO_CODE.get(detected)
  92 + return code if code is not None else "unknown"
136 93
137 def is_chinese(self, text: str) -> bool: 94 def is_chinese(self, text: str) -> bool:
138 return self.detect(text) == "zh" 95 return self.detect(text) == "zh"
@@ -42,3 +42,6 @@ click>=8.1.0 @@ -42,3 +42,6 @@ click>=8.1.0
42 pytest>=7.4.0 42 pytest>=7.4.0
43 pytest-asyncio>=0.21.0 43 pytest-asyncio>=0.21.0
44 httpx>=0.24.0 44 httpx>=0.24.0
  45 +
  46 +# language detector
  47 +lingua-language-detector
45 \ No newline at end of file 48 \ No newline at end of file
scripts/start_embedding_service.sh
@@ -138,7 +138,11 @@ fi @@ -138,7 +138,11 @@ fi
138 if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then 138 if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then
139 echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}" 139 echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}"
140 fi 140 fi
141 -echo "Logs: logs/embedding_api.log, logs/embedding_api_error.log, logs/verbose/embedding_verbose.log" 141 +if [[ "${SERVICE_KIND}" == "image" ]]; then
  142 + echo "Logs: logs/embedding-image.log"
  143 +else
  144 + echo "Logs: logs/embedding.log"
  145 +fi
142 echo 146 echo
143 echo "Tips:" 147 echo "Tips:"
144 echo " - Use a single worker (GPU models cannot be safely duplicated across workers)." 148 echo " - Use a single worker (GPU models cannot be safely duplicated across workers)."
@@ -153,12 +157,16 @@ echo @@ -153,12 +157,16 @@ echo
153 157
154 UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}" 158 UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}"
155 UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}" 159 UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}"
  160 +UVICORN_LOG_CONFIG="${EMBEDDING_UVICORN_LOG_CONFIG:-${PROJECT_ROOT}/config/uvicorn_embedding_logging.json}"
156 UVICORN_ARGS=( 161 UVICORN_ARGS=(
157 --host "${EMBEDDING_SERVICE_HOST}" 162 --host "${EMBEDDING_SERVICE_HOST}"
158 --port "${EMBEDDING_SERVICE_PORT}" 163 --port "${EMBEDDING_SERVICE_PORT}"
159 --workers 1 164 --workers 1
160 --log-level "${UVICORN_LOG_LEVEL}" 165 --log-level "${UVICORN_LOG_LEVEL}"
161 ) 166 )
  167 +if [[ -f "${UVICORN_LOG_CONFIG}" ]]; then
  168 + UVICORN_ARGS+=(--log-config "${UVICORN_LOG_CONFIG}")
  169 +fi
162 if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then 170 if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then
163 UVICORN_ARGS+=(--no-access-log) 171 UVICORN_ARGS+=(--no-access-log)
164 fi 172 fi
search/es_query_builder.py
@@ -33,8 +33,8 @@ class ESQueryBuilder: @@ -33,8 +33,8 @@ class ESQueryBuilder:
33 function_score_config: Optional[FunctionScoreConfig] = None, 33 function_score_config: Optional[FunctionScoreConfig] = None,
34 default_language: str = "en", 34 default_language: str = "en",
35 knn_boost: float = 0.25, 35 knn_boost: float = 0.25,
36 - base_minimum_should_match: str = "75%",  
37 - translation_minimum_should_match: str = "75%", 36 + base_minimum_should_match: str = "70%",
  37 + translation_minimum_should_match: str = "70%",
38 translation_boost: float = 0.4, 38 translation_boost: float = 0.4,
39 translation_boost_when_source_missing: float = 1.0, 39 translation_boost_when_source_missing: float = 1.0,
40 source_boost_when_missing: float = 0.6, 40 source_boost_when_missing: float = 0.6,
@@ -261,16 +261,13 @@ class ESQueryBuilder: @@ -261,16 +261,13 @@ class ESQueryBuilder:
261 if parsed_query: 261 if parsed_query:
262 query_tokens = getattr(parsed_query, 'query_tokens', None) or [] 262 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
263 token_count = len(query_tokens) 263 token_count = len(query_tokens)
264 - if token_count <= 2:  
265 - knn_k, knn_num_candidates = 30, 100  
266 - knn_boost = self.knn_boost * 0.6 # Lower weight for short queries  
267 - elif token_count >= 5:  
268 - knn_k, knn_num_candidates = 80, 300 264 + if token_count >= 5:
  265 + knn_k, knn_num_candidates = 160, 500
269 knn_boost = self.knn_boost * 1.4 # Higher weight for long queries 266 knn_boost = self.knn_boost * 1.4 # Higher weight for long queries
270 else: 267 else:
271 - knn_k, knn_num_candidates = 50, 200 268 + knn_k, knn_num_candidates = 120, 400
272 else: 269 else:
273 - knn_k, knn_num_candidates = 50, 200 270 + knn_k, knn_num_candidates = 120, 400
274 knn_clause = { 271 knn_clause = {
275 "field": self.text_embedding_field, 272 "field": self.text_embedding_field,
276 "query_vector": query_vector.tolist(), 273 "query_vector": query_vector.tolist(),