Compare View
Commits (3)
Showing
11 changed files
Show diff stats
config/loader.py
| @@ -281,8 +281,8 @@ class AppConfigLoader: | @@ -281,8 +281,8 @@ class AppConfigLoader: | ||
| 281 | ["title", "brief", "vendor", "category_name_text"], | 281 | ["title", "brief", "vendor", "category_name_text"], |
| 282 | ) | 282 | ) |
| 283 | ), | 283 | ), |
| 284 | - base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "75%")), | ||
| 285 | - translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "75%")), | 284 | + base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), |
| 285 | + translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), | ||
| 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | translation_boost_when_source_missing=float( | 287 | translation_boost_when_source_missing=float( |
| 288 | text_strategy.get("translation_boost_when_source_missing", 1.0) | 288 | text_strategy.get("translation_boost_when_source_missing", 1.0) |
config/schema.py
| @@ -51,8 +51,8 @@ class QueryConfig: | @@ -51,8 +51,8 @@ class QueryConfig: | ||
| 51 | core_multilingual_fields: List[str] = field( | 51 | core_multilingual_fields: List[str] = field( |
| 52 | default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] | 52 | default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] |
| 53 | ) | 53 | ) |
| 54 | - base_minimum_should_match: str = "75%" | ||
| 55 | - translation_minimum_should_match: str = "75%" | 54 | + base_minimum_should_match: str = "70%" |
| 55 | + translation_minimum_should_match: str = "70%" | ||
| 56 | translation_boost: float = 0.4 | 56 | translation_boost: float = 0.4 |
| 57 | translation_boost_when_source_missing: float = 1.0 | 57 | translation_boost_when_source_missing: float = 1.0 |
| 58 | source_boost_when_missing: float = 0.6 | 58 | source_boost_when_missing: float = 0.6 |
docs/TODO.txt
| @@ -236,14 +236,19 @@ config/environments/<env>.yaml | @@ -236,14 +236,19 @@ config/environments/<env>.yaml | ||
| 236 | 236 | ||
| 237 | 237 | ||
| 238 | 238 | ||
| 239 | +筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑 | ||
| 239 | 240 | ||
| 240 | 241 | ||
| 242 | +引入图片的相关性: | ||
| 243 | +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度? | ||
| 244 | +1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。 | ||
| 245 | +2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,向量细分到 SKU 维度,可能价值不大,性价比偏低 | ||
| 241 | 246 | ||
| 242 | 247 | ||
| 243 | 248 | ||
| 244 | 249 | ||
| 245 | - | ||
| 246 | - | 250 | +无结果重查 |
| 251 | +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长) | ||
| 247 | 252 | ||
| 248 | 253 | ||
| 249 | 254 |
docs/常用查询 - ES.md
| @@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search | @@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search | ||
| 654 | } | 654 | } |
| 655 | } | 655 | } |
| 656 | } | 656 | } |
| 657 | + | ||
| 658 | + | ||
| 659 | +检查某个字段是否存在 | ||
| 660 | +curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \ | ||
| 661 | + 'http://localhost:9200/search_products_tenant_163/_count' \ | ||
| 662 | + -H 'Content-Type: application/json' \ | ||
| 663 | + -d '{ | ||
| 664 | + "query": { | ||
| 665 | + "bool": { | ||
| 666 | + "filter": [ | ||
| 667 | + { "exists": { "field": "title_embedding" } } | ||
| 668 | + ] | ||
| 669 | + } | ||
| 670 | + } | ||
| 671 | + }' | ||
| 657 | \ No newline at end of file | 672 | \ No newline at end of file |
embeddings/server.py
| @@ -14,7 +14,6 @@ import time | @@ -14,7 +14,6 @@ import time | ||
| 14 | import uuid | 14 | import uuid |
| 15 | from collections import deque | 15 | from collections import deque |
| 16 | from dataclasses import dataclass | 16 | from dataclasses import dataclass |
| 17 | -from logging.handlers import TimedRotatingFileHandler | ||
| 18 | from typing import Any, Dict, List, Optional | 17 | from typing import Any, Dict, List, Optional |
| 19 | 18 | ||
| 20 | import numpy as np | 19 | import numpy as np |
| @@ -44,9 +43,7 @@ def configure_embedding_logging() -> None: | @@ -44,9 +43,7 @@ def configure_embedding_logging() -> None: | ||
| 44 | return | 43 | return |
| 45 | 44 | ||
| 46 | log_dir = pathlib.Path("logs") | 45 | log_dir = pathlib.Path("logs") |
| 47 | - verbose_dir = log_dir / "verbose" | ||
| 48 | log_dir.mkdir(exist_ok=True) | 46 | log_dir.mkdir(exist_ok=True) |
| 49 | - verbose_dir.mkdir(parents=True, exist_ok=True) | ||
| 50 | 47 | ||
| 51 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() | 48 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() |
| 52 | numeric_level = getattr(logging, log_level, logging.INFO) | 49 | numeric_level = getattr(logging, log_level, logging.INFO) |
| @@ -56,47 +53,18 @@ def configure_embedding_logging() -> None: | @@ -56,47 +53,18 @@ def configure_embedding_logging() -> None: | ||
| 56 | request_filter = _DefaultRequestIdFilter() | 53 | request_filter = _DefaultRequestIdFilter() |
| 57 | 54 | ||
| 58 | root_logger.setLevel(numeric_level) | 55 | root_logger.setLevel(numeric_level) |
| 59 | - | ||
| 60 | - file_handler = TimedRotatingFileHandler( | ||
| 61 | - filename=log_dir / "embedding_api.log", | ||
| 62 | - when="midnight", | ||
| 63 | - interval=1, | ||
| 64 | - backupCount=30, | ||
| 65 | - encoding="utf-8", | ||
| 66 | - ) | ||
| 67 | - file_handler.setLevel(numeric_level) | ||
| 68 | - file_handler.setFormatter(formatter) | ||
| 69 | - file_handler.addFilter(request_filter) | ||
| 70 | - root_logger.addHandler(file_handler) | ||
| 71 | - | ||
| 72 | - error_handler = TimedRotatingFileHandler( | ||
| 73 | - filename=log_dir / "embedding_api_error.log", | ||
| 74 | - when="midnight", | ||
| 75 | - interval=1, | ||
| 76 | - backupCount=30, | ||
| 77 | - encoding="utf-8", | ||
| 78 | - ) | ||
| 79 | - error_handler.setLevel(logging.ERROR) | ||
| 80 | - error_handler.setFormatter(formatter) | ||
| 81 | - error_handler.addFilter(request_filter) | ||
| 82 | - root_logger.addHandler(error_handler) | 56 | + root_logger.handlers.clear() |
| 57 | + stream_handler = logging.StreamHandler() | ||
| 58 | + stream_handler.setLevel(numeric_level) | ||
| 59 | + stream_handler.setFormatter(formatter) | ||
| 60 | + stream_handler.addFilter(request_filter) | ||
| 61 | + root_logger.addHandler(stream_handler) | ||
| 83 | 62 | ||
| 84 | verbose_logger = logging.getLogger("embedding.verbose") | 63 | verbose_logger = logging.getLogger("embedding.verbose") |
| 85 | verbose_logger.setLevel(numeric_level) | 64 | verbose_logger.setLevel(numeric_level) |
| 86 | verbose_logger.handlers.clear() | 65 | verbose_logger.handlers.clear() |
| 87 | - verbose_logger.propagate = False | ||
| 88 | - | ||
| 89 | - verbose_handler = TimedRotatingFileHandler( | ||
| 90 | - filename=verbose_dir / "embedding_verbose.log", | ||
| 91 | - when="midnight", | ||
| 92 | - interval=1, | ||
| 93 | - backupCount=30, | ||
| 94 | - encoding="utf-8", | ||
| 95 | - ) | ||
| 96 | - verbose_handler.setLevel(numeric_level) | ||
| 97 | - verbose_handler.setFormatter(formatter) | ||
| 98 | - verbose_handler.addFilter(request_filter) | ||
| 99 | - verbose_logger.addHandler(verbose_handler) | 66 | + # Consolidate verbose logs into the main embedding log stream. |
| 67 | + verbose_logger.propagate = True | ||
| 100 | 68 | ||
| 101 | root_logger._embedding_logging_configured = True # type: ignore[attr-defined] | 69 | root_logger._embedding_logging_configured = True # type: ignore[attr-defined] |
| 102 | 70 |
frontend/static/css/style.css
| @@ -379,7 +379,7 @@ body { | @@ -379,7 +379,7 @@ body { | ||
| 379 | margin-top: 8px; | 379 | margin-top: 8px; |
| 380 | } | 380 | } |
| 381 | 381 | ||
| 382 | -.product-debug-inline-es-btn { | 382 | +.product-debug-inline-result-btn { |
| 383 | font-family: inherit; | 383 | font-family: inherit; |
| 384 | font-size: 12px; | 384 | font-size: 12px; |
| 385 | padding: 4px 10px; | 385 | padding: 4px 10px; |
| @@ -390,27 +390,22 @@ body { | @@ -390,27 +390,22 @@ body { | ||
| 390 | cursor: pointer; | 390 | cursor: pointer; |
| 391 | } | 391 | } |
| 392 | 392 | ||
| 393 | -.product-debug-inline-es-btn:hover { | 393 | +.product-debug-inline-result-btn:hover { |
| 394 | background: #f0f0f0; | 394 | background: #f0f0f0; |
| 395 | border-color: #bbb; | 395 | border-color: #bbb; |
| 396 | } | 396 | } |
| 397 | 397 | ||
| 398 | -.product-debug--es-expanded { | 398 | +.product-debug--result-expanded { |
| 399 | max-height: min(70vh, 720px); | 399 | max-height: min(70vh, 720px); |
| 400 | } | 400 | } |
| 401 | 401 | ||
| 402 | -.product-es-doc-panel { | 402 | +.product-result-doc-panel { |
| 403 | margin-top: 10px; | 403 | margin-top: 10px; |
| 404 | padding-top: 8px; | 404 | padding-top: 8px; |
| 405 | border-top: 1px dashed #e8e8e8; | 405 | border-top: 1px dashed #e8e8e8; |
| 406 | } | 406 | } |
| 407 | 407 | ||
| 408 | -.product-es-doc-panel-status { | ||
| 409 | - font-size: 12px; | ||
| 410 | - color: #888; | ||
| 411 | -} | ||
| 412 | - | ||
| 413 | -.product-es-doc-pre { | 408 | +.product-result-doc-pre { |
| 414 | margin: 6px 0 0; | 409 | margin: 6px 0 0; |
| 415 | padding: 10px; | 410 | padding: 10px; |
| 416 | background: #f5f5f5; | 411 | background: #f5f5f5; |
frontend/static/js/app.js
| @@ -68,25 +68,25 @@ function initializeApp() { | @@ -68,25 +68,25 @@ function initializeApp() { | ||
| 68 | // 初始化租户下拉框和分面面板 | 68 | // 初始化租户下拉框和分面面板 |
| 69 | console.log('Initializing app...'); | 69 | console.log('Initializing app...'); |
| 70 | initTenantSelect(); | 70 | initTenantSelect(); |
| 71 | - setupProductGridEsDocToggle(); | 71 | + setupProductGridResultDocToggle(); |
| 72 | const searchInput = document.getElementById('searchInput'); | 72 | const searchInput = document.getElementById('searchInput'); |
| 73 | if (searchInput) { | 73 | if (searchInput) { |
| 74 | searchInput.focus(); | 74 | searchInput.focus(); |
| 75 | } | 75 | } |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | -/** Delegated handler: toggle inline ES raw response under each result card (survives innerHTML refresh on re-search). */ | ||
| 79 | -function setupProductGridEsDocToggle() { | 78 | +/** Delegated handler: toggle inline current result JSON under each result card (survives innerHTML refresh on re-search). */ |
| 79 | +function setupProductGridResultDocToggle() { | ||
| 80 | const grid = document.getElementById('productGrid'); | 80 | const grid = document.getElementById('productGrid'); |
| 81 | - if (!grid || grid.dataset.esDocToggleBound === '1') { | 81 | + if (!grid || grid.dataset.resultDocToggleBound === '1') { |
| 82 | return; | 82 | return; |
| 83 | } | 83 | } |
| 84 | - grid.dataset.esDocToggleBound = '1'; | ||
| 85 | - grid.addEventListener('click', onProductGridEsDocToggleClick); | 84 | + grid.dataset.resultDocToggleBound = '1'; |
| 85 | + grid.addEventListener('click', onProductGridResultDocToggleClick); | ||
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | -async function onProductGridEsDocToggleClick(event) { | ||
| 89 | - const btn = event.target.closest('[data-action="toggle-es-inline-doc"]'); | 88 | +function onProductGridResultDocToggleClick(event) { |
| 89 | + const btn = event.target.closest('[data-action="toggle-result-inline-doc"]'); | ||
| 90 | if (!btn) { | 90 | if (!btn) { |
| 91 | return; | 91 | return; |
| 92 | } | 92 | } |
| @@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) { | @@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) { | ||
| 95 | if (!debugRoot) { | 95 | if (!debugRoot) { |
| 96 | return; | 96 | return; |
| 97 | } | 97 | } |
| 98 | - const panel = debugRoot.querySelector('.product-es-doc-panel'); | ||
| 99 | - const pre = debugRoot.querySelector('.product-es-doc-pre'); | ||
| 100 | - const statusEl = debugRoot.querySelector('.product-es-doc-panel-status'); | ||
| 101 | - if (!panel || !pre || !statusEl) { | 98 | + const panel = debugRoot.querySelector('.product-result-doc-panel'); |
| 99 | + const pre = debugRoot.querySelector('.product-result-doc-pre'); | ||
| 100 | + if (!panel || !pre) { | ||
| 102 | return; | 101 | return; |
| 103 | } | 102 | } |
| 104 | 103 | ||
| 105 | - const spuId = btn.getAttribute('data-spu-id') || ''; | ||
| 106 | - const tenantId = getTenantId(); | ||
| 107 | - const url = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; | ||
| 108 | - | ||
| 109 | - if (debugRoot.dataset.esInlineOpen === '1') { | 104 | + if (debugRoot.dataset.resultInlineOpen === '1') { |
| 110 | panel.setAttribute('hidden', ''); | 105 | panel.setAttribute('hidden', ''); |
| 111 | - debugRoot.classList.remove('product-debug--es-expanded'); | ||
| 112 | - debugRoot.dataset.esInlineOpen = '0'; | ||
| 113 | - btn.textContent = '在结果中显示 ES 文档'; | 106 | + debugRoot.classList.remove('product-debug--result-expanded'); |
| 107 | + debugRoot.dataset.resultInlineOpen = '0'; | ||
| 108 | + btn.textContent = '在结果中显示当前结果数据'; | ||
| 114 | return; | 109 | return; |
| 115 | } | 110 | } |
| 116 | 111 | ||
| 117 | panel.removeAttribute('hidden'); | 112 | panel.removeAttribute('hidden'); |
| 118 | - debugRoot.classList.add('product-debug--es-expanded'); | ||
| 119 | - debugRoot.dataset.esInlineOpen = '1'; | ||
| 120 | - btn.textContent = '隐藏 ES 文档'; | ||
| 121 | - | ||
| 122 | - if (pre.textContent.length > 0) { | ||
| 123 | - panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); | ||
| 124 | - return; | ||
| 125 | - } | ||
| 126 | - | ||
| 127 | - statusEl.style.display = ''; | ||
| 128 | - statusEl.textContent = '加载中…'; | ||
| 129 | - pre.style.display = 'none'; | ||
| 130 | - | ||
| 131 | - try { | ||
| 132 | - const response = await fetch(url); | ||
| 133 | - if (!response.ok) { | ||
| 134 | - const errText = await response.text(); | ||
| 135 | - throw new Error(`HTTP ${response.status}: ${errText.slice(0, 200)}`); | ||
| 136 | - } | ||
| 137 | - const data = await response.json(); | ||
| 138 | - pre.textContent = customStringify(data); | ||
| 139 | - statusEl.style.display = 'none'; | ||
| 140 | - pre.style.display = 'block'; | ||
| 141 | - } catch (err) { | ||
| 142 | - console.error('ES doc fetch failed', err); | ||
| 143 | - statusEl.textContent = `加载失败: ${err.message || err}`; | ||
| 144 | - pre.style.display = 'none'; | 113 | + debugRoot.classList.add('product-debug--result-expanded'); |
| 114 | + debugRoot.dataset.resultInlineOpen = '1'; | ||
| 115 | + btn.textContent = '隐藏当前结果数据'; | ||
| 116 | + if (pre.textContent.length === 0) { | ||
| 117 | + pre.textContent = btn.getAttribute('data-result-json') || '{}'; | ||
| 145 | } | 118 | } |
| 146 | - | ||
| 147 | panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); | 119 | panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); |
| 148 | } | 120 | } |
| 149 | 121 | ||
| @@ -213,7 +185,7 @@ function initTenantSelect() { | @@ -213,7 +185,7 @@ function initTenantSelect() { | ||
| 213 | }); | 185 | }); |
| 214 | // 设置默认值(仅当输入框为空时) | 186 | // 设置默认值(仅当输入框为空时) |
| 215 | if (!tenantSelect.value.trim()) { | 187 | if (!tenantSelect.value.trim()) { |
| 216 | - tenantSelect.value = availableTenants.includes('170') ? '170' : availableTenants[0]; | 188 | + tenantSelect.value = availableTenants.includes('0') ? '0' : availableTenants[0]; |
| 217 | } | 189 | } |
| 218 | } | 190 | } |
| 219 | 191 | ||
| @@ -462,6 +434,7 @@ function displayResults(data) { | @@ -462,6 +434,7 @@ function displayResults(data) { | ||
| 462 | }); | 434 | }); |
| 463 | } | 435 | } |
| 464 | 436 | ||
| 437 | + const resultJson = customStringify(result); | ||
| 465 | const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; | 438 | const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; |
| 466 | 439 | ||
| 467 | debugHtml = ` | 440 | debugHtml = ` |
| @@ -475,18 +448,17 @@ function displayResults(data) { | @@ -475,18 +448,17 @@ function displayResults(data) { | ||
| 475 | <div class="product-debug-line">Fused score: ${fusedScore}</div> | 448 | <div class="product-debug-line">Fused score: ${fusedScore}</div> |
| 476 | ${titleLines} | 449 | ${titleLines} |
| 477 | <div class="product-debug-actions"> | 450 | <div class="product-debug-actions"> |
| 478 | - <button type="button" class="product-debug-inline-es-btn" | ||
| 479 | - data-action="toggle-es-inline-doc" | ||
| 480 | - data-spu-id="${escapeAttr(String(spuId || ''))}"> | ||
| 481 | - 在结果中显示 ES 文档 | 451 | + <button type="button" class="product-debug-inline-result-btn" |
| 452 | + data-action="toggle-result-inline-doc" | ||
| 453 | + data-result-json="${escapeAttr(resultJson)}"> | ||
| 454 | + 在结果中显示当前结果数据 | ||
| 482 | </button> | 455 | </button> |
| 483 | <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer"> | 456 | <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer"> |
| 484 | 查看 ES 原始文档 | 457 | 查看 ES 原始文档 |
| 485 | </a> | 458 | </a> |
| 486 | </div> | 459 | </div> |
| 487 | - <div class="product-es-doc-panel" hidden> | ||
| 488 | - <div class="product-es-doc-panel-status"></div> | ||
| 489 | - <pre class="product-es-doc-pre"></pre> | 460 | + <div class="product-result-doc-panel" hidden> |
| 461 | + <pre class="product-result-doc-pre"></pre> | ||
| 490 | </div> | 462 | </div> |
| 491 | </div> | 463 | </div> |
| 492 | `; | 464 | `; |
query/language_detector.py
| 1 | """ | 1 | """ |
| 2 | Language detection utility. | 2 | Language detection utility. |
| 3 | 3 | ||
| 4 | -Detects language of short e-commerce queries with script checks + lightweight | ||
| 5 | -Latin-language scoring (de/fr/es/it/pt/nl/en). | 4 | +Script-first rules for CJK and other non-Latin scripts, then Lingua |
| 5 | +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation. | ||
| 6 | """ | 6 | """ |
| 7 | 7 | ||
| 8 | -from typing import Dict, List | 8 | +from __future__ import annotations |
| 9 | + | ||
| 10 | +from typing import Dict, Optional | ||
| 9 | import re | 11 | import re |
| 10 | 12 | ||
| 13 | +from lingua import Language, LanguageDetectorBuilder | ||
| 14 | + | ||
| 15 | +_LINGUA_TO_CODE: Dict[Language, str] = { | ||
| 16 | + Language.CHINESE: "zh", | ||
| 17 | + Language.ENGLISH: "en", | ||
| 18 | + Language.JAPANESE: "ja", | ||
| 19 | + Language.KOREAN: "ko", | ||
| 20 | + Language.GERMAN: "de", | ||
| 21 | + Language.FRENCH: "fr", | ||
| 22 | + Language.SPANISH: "es", | ||
| 23 | + Language.ITALIAN: "it", | ||
| 24 | + Language.PORTUGUESE: "pt", | ||
| 25 | + Language.DUTCH: "nl", | ||
| 26 | + Language.RUSSIAN: "ru", | ||
| 27 | + Language.ARABIC: "ar", | ||
| 28 | + Language.HINDI: "hi", | ||
| 29 | + Language.HEBREW: "he", | ||
| 30 | + Language.THAI: "th", | ||
| 31 | +} | ||
| 32 | + | ||
| 33 | +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) | ||
| 34 | + | ||
| 35 | +_lingua_detector: Optional[object] = None | ||
| 36 | + | ||
| 37 | + | ||
| 38 | +def _get_lingua_detector(): | ||
| 39 | + global _lingua_detector | ||
| 40 | + if _lingua_detector is None: | ||
| 41 | + _lingua_detector = LanguageDetectorBuilder.from_languages( | ||
| 42 | + *_LINGUA_LANGUAGES | ||
| 43 | + ).build() | ||
| 44 | + return _lingua_detector | ||
| 45 | + | ||
| 11 | 46 | ||
| 12 | class LanguageDetector: | 47 | class LanguageDetector: |
| 13 | - """Rule-based language detector for common e-commerce query languages.""" | 48 | + """Language detector: script hints + Lingua for Latin-family queries.""" |
| 14 | 49 | ||
| 15 | def __init__(self): | 50 | def __init__(self): |
| 16 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") | 51 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") |
| @@ -21,47 +56,6 @@ class LanguageDetector: | @@ -21,47 +56,6 @@ class LanguageDetector: | ||
| 21 | self._re_hi = re.compile(r"[\u0900-\u097f]") | 56 | self._re_hi = re.compile(r"[\u0900-\u097f]") |
| 22 | self._re_he = re.compile(r"[\u0590-\u05ff]") | 57 | self._re_he = re.compile(r"[\u0590-\u05ff]") |
| 23 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") | 58 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") |
| 24 | - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") | ||
| 25 | - | ||
| 26 | - # Stopwords + e-commerce terms for Latin-family disambiguation. | ||
| 27 | - self._latin_lexicons: Dict[str, set] = { | ||
| 28 | - "en": { | ||
| 29 | - "the", "and", "for", "with", "new", "women", "men", "kids", | ||
| 30 | - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", | ||
| 31 | - }, | ||
| 32 | - "de": { | ||
| 33 | - "der", "die", "das", "und", "mit", "für", "damen", "herren", | ||
| 34 | - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", | ||
| 35 | - }, | ||
| 36 | - "fr": { | ||
| 37 | - "le", "la", "les", "et", "avec", "pour", "femme", "homme", | ||
| 38 | - "enfant", "chaussures", "robe", "chemise", "veste", "sac", | ||
| 39 | - }, | ||
| 40 | - "es": { | ||
| 41 | - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", | ||
| 42 | - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", | ||
| 43 | - }, | ||
| 44 | - "it": { | ||
| 45 | - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", | ||
| 46 | - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", | ||
| 47 | - }, | ||
| 48 | - "pt": { | ||
| 49 | - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", | ||
| 50 | - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", | ||
| 51 | - }, | ||
| 52 | - "nl": { | ||
| 53 | - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", | ||
| 54 | - "schoenen", "jurk", "overhemd", "jas", "tas", | ||
| 55 | - }, | ||
| 56 | - } | ||
| 57 | - self._diacritic_weights: Dict[str, Dict[str, int]] = { | ||
| 58 | - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, | ||
| 59 | - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, | ||
| 60 | - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, | ||
| 61 | - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, | ||
| 62 | - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, | ||
| 63 | - "nl": {"ij": 2}, | ||
| 64 | - } | ||
| 65 | 59 | ||
| 66 | def detect(self, text: str) -> str: | 60 | def detect(self, text: str) -> str: |
| 67 | """ | 61 | """ |
| @@ -71,9 +65,9 @@ class LanguageDetector: | @@ -71,9 +65,9 @@ class LanguageDetector: | ||
| 71 | """ | 65 | """ |
| 72 | if not text or not text.strip(): | 66 | if not text or not text.strip(): |
| 73 | return "unknown" | 67 | return "unknown" |
| 74 | - q = text.strip().lower() | 68 | + q = text.strip() |
| 75 | 69 | ||
| 76 | - # Script-first detection for non-Latin languages. | 70 | + # Script-first: unambiguous blocks before Latin/Romance Lingua pass. |
| 77 | if self._re_ja_kana.search(q): | 71 | if self._re_ja_kana.search(q): |
| 78 | return "ja" | 72 | return "ja" |
| 79 | if self._re_ko.search(q): | 73 | if self._re_ko.search(q): |
| @@ -91,48 +85,11 @@ class LanguageDetector: | @@ -91,48 +85,11 @@ class LanguageDetector: | ||
| 91 | if self._re_th.search(q): | 85 | if self._re_th.search(q): |
| 92 | return "th" | 86 | return "th" |
| 93 | 87 | ||
| 94 | - # Latin-family scoring. | ||
| 95 | - tokens = self._re_latin_word.findall(q) | ||
| 96 | - if not tokens: | 88 | + detected = _get_lingua_detector().detect_language_of(q) |
| 89 | + if detected is None: | ||
| 97 | return "unknown" | 90 | return "unknown" |
| 98 | - | ||
| 99 | - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} | ||
| 100 | - scores["en"] = scores.get("en", 0.0) | ||
| 101 | - token_set = set(tokens) | ||
| 102 | - | ||
| 103 | - # Lexicon matches | ||
| 104 | - for lang, lex in self._latin_lexicons.items(): | ||
| 105 | - overlap = len(token_set & lex) | ||
| 106 | - if overlap: | ||
| 107 | - scores[lang] += overlap * 2.0 | ||
| 108 | - | ||
| 109 | - # Diacritics / orthographic hints | ||
| 110 | - for lang, hints in self._diacritic_weights.items(): | ||
| 111 | - for marker, weight in hints.items(): | ||
| 112 | - if marker in q: | ||
| 113 | - scores[lang] += weight | ||
| 114 | - | ||
| 115 | - # Light suffix hints for common product words | ||
| 116 | - for t in tokens: | ||
| 117 | - if t.endswith("ung") or t.endswith("chen"): | ||
| 118 | - scores["de"] += 0.6 | ||
| 119 | - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): | ||
| 120 | - scores["es"] += 0.6 | ||
| 121 | - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): | ||
| 122 | - scores["it"] += 0.6 | ||
| 123 | - if t.endswith("ção") or t.endswith("mente"): | ||
| 124 | - scores["pt"] += 0.6 | ||
| 125 | - if t.endswith("ment") or t.endswith("eau"): | ||
| 126 | - scores["fr"] += 0.5 | ||
| 127 | - | ||
| 128 | - # Fallback preference: English for pure Latin short tokens. | ||
| 129 | - scores["en"] += 0.2 | ||
| 130 | - | ||
| 131 | - best_lang = max(scores.items(), key=lambda x: x[1])[0] | ||
| 132 | - best_score = scores[best_lang] | ||
| 133 | - if best_score <= 0: | ||
| 134 | - return "en" | ||
| 135 | - return best_lang | 91 | + code = _LINGUA_TO_CODE.get(detected) |
| 92 | + return code if code is not None else "unknown" | ||
| 136 | 93 | ||
| 137 | def is_chinese(self, text: str) -> bool: | 94 | def is_chinese(self, text: str) -> bool: |
| 138 | return self.detect(text) == "zh" | 95 | return self.detect(text) == "zh" |
requirements.txt
| @@ -42,3 +42,6 @@ click>=8.1.0 | @@ -42,3 +42,6 @@ click>=8.1.0 | ||
| 42 | pytest>=7.4.0 | 42 | pytest>=7.4.0 |
| 43 | pytest-asyncio>=0.21.0 | 43 | pytest-asyncio>=0.21.0 |
| 44 | httpx>=0.24.0 | 44 | httpx>=0.24.0 |
| 45 | + | ||
| 46 | +# language detector | ||
| 47 | +lingua-language-detector | ||
| 45 | \ No newline at end of file | 48 | \ No newline at end of file |
scripts/start_embedding_service.sh
| @@ -138,7 +138,11 @@ fi | @@ -138,7 +138,11 @@ fi | ||
| 138 | if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then | 138 | if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then |
| 139 | echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}" | 139 | echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}" |
| 140 | fi | 140 | fi |
| 141 | -echo "Logs: logs/embedding_api.log, logs/embedding_api_error.log, logs/verbose/embedding_verbose.log" | 141 | +if [[ "${SERVICE_KIND}" == "image" ]]; then |
| 142 | + echo "Logs: logs/embedding-image.log" | ||
| 143 | +else | ||
| 144 | + echo "Logs: logs/embedding.log" | ||
| 145 | +fi | ||
| 142 | echo | 146 | echo |
| 143 | echo "Tips:" | 147 | echo "Tips:" |
| 144 | echo " - Use a single worker (GPU models cannot be safely duplicated across workers)." | 148 | echo " - Use a single worker (GPU models cannot be safely duplicated across workers)." |
| @@ -153,12 +157,16 @@ echo | @@ -153,12 +157,16 @@ echo | ||
| 153 | 157 | ||
| 154 | UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}" | 158 | UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}" |
| 155 | UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}" | 159 | UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}" |
| 160 | +UVICORN_LOG_CONFIG="${EMBEDDING_UVICORN_LOG_CONFIG:-${PROJECT_ROOT}/config/uvicorn_embedding_logging.json}" | ||
| 156 | UVICORN_ARGS=( | 161 | UVICORN_ARGS=( |
| 157 | --host "${EMBEDDING_SERVICE_HOST}" | 162 | --host "${EMBEDDING_SERVICE_HOST}" |
| 158 | --port "${EMBEDDING_SERVICE_PORT}" | 163 | --port "${EMBEDDING_SERVICE_PORT}" |
| 159 | --workers 1 | 164 | --workers 1 |
| 160 | --log-level "${UVICORN_LOG_LEVEL}" | 165 | --log-level "${UVICORN_LOG_LEVEL}" |
| 161 | ) | 166 | ) |
| 167 | +if [[ -f "${UVICORN_LOG_CONFIG}" ]]; then | ||
| 168 | + UVICORN_ARGS+=(--log-config "${UVICORN_LOG_CONFIG}") | ||
| 169 | +fi | ||
| 162 | if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then | 170 | if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then |
| 163 | UVICORN_ARGS+=(--no-access-log) | 171 | UVICORN_ARGS+=(--no-access-log) |
| 164 | fi | 172 | fi |
search/es_query_builder.py
| @@ -33,8 +33,8 @@ class ESQueryBuilder: | @@ -33,8 +33,8 @@ class ESQueryBuilder: | ||
| 33 | function_score_config: Optional[FunctionScoreConfig] = None, | 33 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 34 | default_language: str = "en", | 34 | default_language: str = "en", |
| 35 | knn_boost: float = 0.25, | 35 | knn_boost: float = 0.25, |
| 36 | - base_minimum_should_match: str = "75%", | ||
| 37 | - translation_minimum_should_match: str = "75%", | 36 | + base_minimum_should_match: str = "70%", |
| 37 | + translation_minimum_should_match: str = "70%", | ||
| 38 | translation_boost: float = 0.4, | 38 | translation_boost: float = 0.4, |
| 39 | translation_boost_when_source_missing: float = 1.0, | 39 | translation_boost_when_source_missing: float = 1.0, |
| 40 | source_boost_when_missing: float = 0.6, | 40 | source_boost_when_missing: float = 0.6, |
| @@ -261,16 +261,13 @@ class ESQueryBuilder: | @@ -261,16 +261,13 @@ class ESQueryBuilder: | ||
| 261 | if parsed_query: | 261 | if parsed_query: |
| 262 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] | 262 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 263 | token_count = len(query_tokens) | 263 | token_count = len(query_tokens) |
| 264 | - if token_count <= 2: | ||
| 265 | - knn_k, knn_num_candidates = 30, 100 | ||
| 266 | - knn_boost = self.knn_boost * 0.6 # Lower weight for short queries | ||
| 267 | - elif token_count >= 5: | ||
| 268 | - knn_k, knn_num_candidates = 80, 300 | 264 | + if token_count >= 5: |
| 265 | + knn_k, knn_num_candidates = 160, 500 | ||
| 269 | knn_boost = self.knn_boost * 1.4 # Higher weight for long queries | 266 | knn_boost = self.knn_boost * 1.4 # Higher weight for long queries |
| 270 | else: | 267 | else: |
| 271 | - knn_k, knn_num_candidates = 50, 200 | 268 | + knn_k, knn_num_candidates = 120, 400 |
| 272 | else: | 269 | else: |
| 273 | - knn_k, knn_num_candidates = 50, 200 | 270 | + knn_k, knn_num_candidates = 120, 400 |
| 274 | knn_clause = { | 271 | knn_clause = { |
| 275 | "field": self.text_embedding_field, | 272 | "field": self.text_embedding_field, |
| 276 | "query_vector": query_vector.tolist(), | 273 | "query_vector": query_vector.tolist(), |