Compare View
Commits (3)
Showing
11 changed files
Show diff stats
config/loader.py
| ... | ... | @@ -281,8 +281,8 @@ class AppConfigLoader: |
| 281 | 281 | ["title", "brief", "vendor", "category_name_text"], |
| 282 | 282 | ) |
| 283 | 283 | ), |
| 284 | - base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "75%")), | |
| 285 | - translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "75%")), | |
| 284 | + base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")), | |
| 285 | + translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")), | |
| 286 | 286 | translation_boost=float(text_strategy.get("translation_boost", 0.4)), |
| 287 | 287 | translation_boost_when_source_missing=float( |
| 288 | 288 | text_strategy.get("translation_boost_when_source_missing", 1.0) | ... | ... |
config/schema.py
| ... | ... | @@ -51,8 +51,8 @@ class QueryConfig: |
| 51 | 51 | core_multilingual_fields: List[str] = field( |
| 52 | 52 | default_factory=lambda: ["title", "brief", "vendor", "category_name_text"] |
| 53 | 53 | ) |
| 54 | - base_minimum_should_match: str = "75%" | |
| 55 | - translation_minimum_should_match: str = "75%" | |
| 54 | + base_minimum_should_match: str = "70%" | |
| 55 | + translation_minimum_should_match: str = "70%" | |
| 56 | 56 | translation_boost: float = 0.4 |
| 57 | 57 | translation_boost_when_source_missing: float = 1.0 |
| 58 | 58 | source_boost_when_missing: float = 0.6 | ... | ... |
docs/TODO.txt
| ... | ... | @@ -236,14 +236,19 @@ config/environments/<env>.yaml |
| 236 | 236 | |
| 237 | 237 | |
| 238 | 238 | |
| 239 | +筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑 | |
| 239 | 240 | |
| 240 | 241 | |
| 242 | +引入图片的相关性: | |
| 243 | +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度? | |
| 244 | +1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。 | |
| 245 | +2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,向量细分到 SKU 维度,可能价值不大,性价比偏低 | |
| 241 | 246 | |
| 242 | 247 | |
| 243 | 248 | |
| 244 | 249 | |
| 245 | - | |
| 246 | - | |
| 250 | +无结果重查 | |
| 251 | +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长) | |
| 247 | 252 | |
| 248 | 253 | |
| 249 | 254 | ... | ... |
docs/常用查询 - ES.md
| ... | ... | @@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search |
| 654 | 654 | } |
| 655 | 655 | } |
| 656 | 656 | } |
| 657 | + | |
| 658 | + | |
| 659 | +检查某个字段是否存在 | |
| 660 | +curl -u '<username>:<password>' -X POST \ | |
| 661 | + 'http://localhost:9200/search_products_tenant_163/_count' \ | |
| 662 | + -H 'Content-Type: application/json' \ | |
| 663 | + -d '{ | |
| 664 | + "query": { | |
| 665 | + "bool": { | |
| 666 | + "filter": [ | |
| 667 | + { "exists": { "field": "title_embedding" } } | |
| 668 | + ] | |
| 669 | + } | |
| 670 | + } | |
| 671 | + }' | |
| 657 | 672 | \ No newline at end of file | ... | ... |
embeddings/server.py
| ... | ... | @@ -14,7 +14,6 @@ import time |
| 14 | 14 | import uuid |
| 15 | 15 | from collections import deque |
| 16 | 16 | from dataclasses import dataclass |
| 17 | -from logging.handlers import TimedRotatingFileHandler | |
| 18 | 17 | from typing import Any, Dict, List, Optional |
| 19 | 18 | |
| 20 | 19 | import numpy as np |
| ... | ... | @@ -44,9 +43,7 @@ def configure_embedding_logging() -> None: |
| 44 | 43 | return |
| 45 | 44 | |
| 46 | 45 | log_dir = pathlib.Path("logs") |
| 47 | - verbose_dir = log_dir / "verbose" | |
| 48 | 46 | log_dir.mkdir(exist_ok=True) |
| 49 | - verbose_dir.mkdir(parents=True, exist_ok=True) | |
| 50 | 47 | |
| 51 | 48 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() |
| 52 | 49 | numeric_level = getattr(logging, log_level, logging.INFO) |
| ... | ... | @@ -56,47 +53,18 @@ def configure_embedding_logging() -> None: |
| 56 | 53 | request_filter = _DefaultRequestIdFilter() |
| 57 | 54 | |
| 58 | 55 | root_logger.setLevel(numeric_level) |
| 59 | - | |
| 60 | - file_handler = TimedRotatingFileHandler( | |
| 61 | - filename=log_dir / "embedding_api.log", | |
| 62 | - when="midnight", | |
| 63 | - interval=1, | |
| 64 | - backupCount=30, | |
| 65 | - encoding="utf-8", | |
| 66 | - ) | |
| 67 | - file_handler.setLevel(numeric_level) | |
| 68 | - file_handler.setFormatter(formatter) | |
| 69 | - file_handler.addFilter(request_filter) | |
| 70 | - root_logger.addHandler(file_handler) | |
| 71 | - | |
| 72 | - error_handler = TimedRotatingFileHandler( | |
| 73 | - filename=log_dir / "embedding_api_error.log", | |
| 74 | - when="midnight", | |
| 75 | - interval=1, | |
| 76 | - backupCount=30, | |
| 77 | - encoding="utf-8", | |
| 78 | - ) | |
| 79 | - error_handler.setLevel(logging.ERROR) | |
| 80 | - error_handler.setFormatter(formatter) | |
| 81 | - error_handler.addFilter(request_filter) | |
| 82 | - root_logger.addHandler(error_handler) | |
| 56 | + root_logger.handlers.clear() | |
| 57 | + stream_handler = logging.StreamHandler() | |
| 58 | + stream_handler.setLevel(numeric_level) | |
| 59 | + stream_handler.setFormatter(formatter) | |
| 60 | + stream_handler.addFilter(request_filter) | |
| 61 | + root_logger.addHandler(stream_handler) | |
| 83 | 62 | |
| 84 | 63 | verbose_logger = logging.getLogger("embedding.verbose") |
| 85 | 64 | verbose_logger.setLevel(numeric_level) |
| 86 | 65 | verbose_logger.handlers.clear() |
| 87 | - verbose_logger.propagate = False | |
| 88 | - | |
| 89 | - verbose_handler = TimedRotatingFileHandler( | |
| 90 | - filename=verbose_dir / "embedding_verbose.log", | |
| 91 | - when="midnight", | |
| 92 | - interval=1, | |
| 93 | - backupCount=30, | |
| 94 | - encoding="utf-8", | |
| 95 | - ) | |
| 96 | - verbose_handler.setLevel(numeric_level) | |
| 97 | - verbose_handler.setFormatter(formatter) | |
| 98 | - verbose_handler.addFilter(request_filter) | |
| 99 | - verbose_logger.addHandler(verbose_handler) | |
| 66 | + # Consolidate verbose logs into the main embedding log stream. | |
| 67 | + verbose_logger.propagate = True | |
| 100 | 68 | |
| 101 | 69 | root_logger._embedding_logging_configured = True # type: ignore[attr-defined] |
| 102 | 70 | ... | ... |
frontend/static/css/style.css
| ... | ... | @@ -379,7 +379,7 @@ body { |
| 379 | 379 | margin-top: 8px; |
| 380 | 380 | } |
| 381 | 381 | |
| 382 | -.product-debug-inline-es-btn { | |
| 382 | +.product-debug-inline-result-btn { | |
| 383 | 383 | font-family: inherit; |
| 384 | 384 | font-size: 12px; |
| 385 | 385 | padding: 4px 10px; |
| ... | ... | @@ -390,27 +390,22 @@ body { |
| 390 | 390 | cursor: pointer; |
| 391 | 391 | } |
| 392 | 392 | |
| 393 | -.product-debug-inline-es-btn:hover { | |
| 393 | +.product-debug-inline-result-btn:hover { | |
| 394 | 394 | background: #f0f0f0; |
| 395 | 395 | border-color: #bbb; |
| 396 | 396 | } |
| 397 | 397 | |
| 398 | -.product-debug--es-expanded { | |
| 398 | +.product-debug--result-expanded { | |
| 399 | 399 | max-height: min(70vh, 720px); |
| 400 | 400 | } |
| 401 | 401 | |
| 402 | -.product-es-doc-panel { | |
| 402 | +.product-result-doc-panel { | |
| 403 | 403 | margin-top: 10px; |
| 404 | 404 | padding-top: 8px; |
| 405 | 405 | border-top: 1px dashed #e8e8e8; |
| 406 | 406 | } |
| 407 | 407 | |
| 408 | -.product-es-doc-panel-status { | |
| 409 | - font-size: 12px; | |
| 410 | - color: #888; | |
| 411 | -} | |
| 412 | - | |
| 413 | -.product-es-doc-pre { | |
| 408 | +.product-result-doc-pre { | |
| 414 | 409 | margin: 6px 0 0; |
| 415 | 410 | padding: 10px; |
| 416 | 411 | background: #f5f5f5; | ... | ... |
frontend/static/js/app.js
| ... | ... | @@ -68,25 +68,25 @@ function initializeApp() { |
| 68 | 68 | // 初始化租户下拉框和分面面板 |
| 69 | 69 | console.log('Initializing app...'); |
| 70 | 70 | initTenantSelect(); |
| 71 | - setupProductGridEsDocToggle(); | |
| 71 | + setupProductGridResultDocToggle(); | |
| 72 | 72 | const searchInput = document.getElementById('searchInput'); |
| 73 | 73 | if (searchInput) { |
| 74 | 74 | searchInput.focus(); |
| 75 | 75 | } |
| 76 | 76 | } |
| 77 | 77 | |
| 78 | -/** Delegated handler: toggle inline ES raw response under each result card (survives innerHTML refresh on re-search). */ | |
| 79 | -function setupProductGridEsDocToggle() { | |
| 78 | +/** Delegated handler: toggle inline current result JSON under each result card (survives innerHTML refresh on re-search). */ | |
| 79 | +function setupProductGridResultDocToggle() { | |
| 80 | 80 | const grid = document.getElementById('productGrid'); |
| 81 | - if (!grid || grid.dataset.esDocToggleBound === '1') { | |
| 81 | + if (!grid || grid.dataset.resultDocToggleBound === '1') { | |
| 82 | 82 | return; |
| 83 | 83 | } |
| 84 | - grid.dataset.esDocToggleBound = '1'; | |
| 85 | - grid.addEventListener('click', onProductGridEsDocToggleClick); | |
| 84 | + grid.dataset.resultDocToggleBound = '1'; | |
| 85 | + grid.addEventListener('click', onProductGridResultDocToggleClick); | |
| 86 | 86 | } |
| 87 | 87 | |
| 88 | -async function onProductGridEsDocToggleClick(event) { | |
| 89 | - const btn = event.target.closest('[data-action="toggle-es-inline-doc"]'); | |
| 88 | +function onProductGridResultDocToggleClick(event) { | |
| 89 | + const btn = event.target.closest('[data-action="toggle-result-inline-doc"]'); | |
| 90 | 90 | if (!btn) { |
| 91 | 91 | return; |
| 92 | 92 | } |
| ... | ... | @@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) { |
| 95 | 95 | if (!debugRoot) { |
| 96 | 96 | return; |
| 97 | 97 | } |
| 98 | - const panel = debugRoot.querySelector('.product-es-doc-panel'); | |
| 99 | - const pre = debugRoot.querySelector('.product-es-doc-pre'); | |
| 100 | - const statusEl = debugRoot.querySelector('.product-es-doc-panel-status'); | |
| 101 | - if (!panel || !pre || !statusEl) { | |
| 98 | + const panel = debugRoot.querySelector('.product-result-doc-panel'); | |
| 99 | + const pre = debugRoot.querySelector('.product-result-doc-pre'); | |
| 100 | + if (!panel || !pre) { | |
| 102 | 101 | return; |
| 103 | 102 | } |
| 104 | 103 | |
| 105 | - const spuId = btn.getAttribute('data-spu-id') || ''; | |
| 106 | - const tenantId = getTenantId(); | |
| 107 | - const url = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; | |
| 108 | - | |
| 109 | - if (debugRoot.dataset.esInlineOpen === '1') { | |
| 104 | + if (debugRoot.dataset.resultInlineOpen === '1') { | |
| 110 | 105 | panel.setAttribute('hidden', ''); |
| 111 | - debugRoot.classList.remove('product-debug--es-expanded'); | |
| 112 | - debugRoot.dataset.esInlineOpen = '0'; | |
| 113 | - btn.textContent = '在结果中显示 ES 文档'; | |
| 106 | + debugRoot.classList.remove('product-debug--result-expanded'); | |
| 107 | + debugRoot.dataset.resultInlineOpen = '0'; | |
| 108 | + btn.textContent = '在结果中显示当前结果数据'; | |
| 114 | 109 | return; |
| 115 | 110 | } |
| 116 | 111 | |
| 117 | 112 | panel.removeAttribute('hidden'); |
| 118 | - debugRoot.classList.add('product-debug--es-expanded'); | |
| 119 | - debugRoot.dataset.esInlineOpen = '1'; | |
| 120 | - btn.textContent = '隐藏 ES 文档'; | |
| 121 | - | |
| 122 | - if (pre.textContent.length > 0) { | |
| 123 | - panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); | |
| 124 | - return; | |
| 125 | - } | |
| 126 | - | |
| 127 | - statusEl.style.display = ''; | |
| 128 | - statusEl.textContent = '加载中…'; | |
| 129 | - pre.style.display = 'none'; | |
| 130 | - | |
| 131 | - try { | |
| 132 | - const response = await fetch(url); | |
| 133 | - if (!response.ok) { | |
| 134 | - const errText = await response.text(); | |
| 135 | - throw new Error(`HTTP ${response.status}: ${errText.slice(0, 200)}`); | |
| 136 | - } | |
| 137 | - const data = await response.json(); | |
| 138 | - pre.textContent = customStringify(data); | |
| 139 | - statusEl.style.display = 'none'; | |
| 140 | - pre.style.display = 'block'; | |
| 141 | - } catch (err) { | |
| 142 | - console.error('ES doc fetch failed', err); | |
| 143 | - statusEl.textContent = `加载失败: ${err.message || err}`; | |
| 144 | - pre.style.display = 'none'; | |
| 113 | + debugRoot.classList.add('product-debug--result-expanded'); | |
| 114 | + debugRoot.dataset.resultInlineOpen = '1'; | |
| 115 | + btn.textContent = '隐藏当前结果数据'; | |
| 116 | + if (pre.textContent.length === 0) { | |
| 117 | + pre.textContent = btn.getAttribute('data-result-json') || '{}'; | |
| 145 | 118 | } |
| 146 | - | |
| 147 | 119 | panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); |
| 148 | 120 | } |
| 149 | 121 | |
| ... | ... | @@ -213,7 +185,7 @@ function initTenantSelect() { |
| 213 | 185 | }); |
| 214 | 186 | // 设置默认值(仅当输入框为空时) |
| 215 | 187 | if (!tenantSelect.value.trim()) { |
| 216 | - tenantSelect.value = availableTenants.includes('170') ? '170' : availableTenants[0]; | |
| 188 | + tenantSelect.value = availableTenants.includes('0') ? '0' : availableTenants[0]; | |
| 217 | 189 | } |
| 218 | 190 | } |
| 219 | 191 | |
| ... | ... | @@ -462,6 +434,7 @@ function displayResults(data) { |
| 462 | 434 | }); |
| 463 | 435 | } |
| 464 | 436 | |
| 437 | + const resultJson = customStringify(result); | |
| 465 | 438 | const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`; |
| 466 | 439 | |
| 467 | 440 | debugHtml = ` |
| ... | ... | @@ -475,18 +448,17 @@ function displayResults(data) { |
| 475 | 448 | <div class="product-debug-line">Fused score: ${fusedScore}</div> |
| 476 | 449 | ${titleLines} |
| 477 | 450 | <div class="product-debug-actions"> |
| 478 | - <button type="button" class="product-debug-inline-es-btn" | |
| 479 | - data-action="toggle-es-inline-doc" | |
| 480 | - data-spu-id="${escapeAttr(String(spuId || ''))}"> | |
| 481 | - 在结果中显示 ES 文档 | |
| 451 | + <button type="button" class="product-debug-inline-result-btn" | |
| 452 | + data-action="toggle-result-inline-doc" | |
| 453 | + data-result-json="${escapeAttr(resultJson)}"> | |
| 454 | + 在结果中显示当前结果数据 | |
| 482 | 455 | </button> |
| 483 | 456 | <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer"> |
| 484 | 457 | 查看 ES 原始文档 |
| 485 | 458 | </a> |
| 486 | 459 | </div> |
| 487 | - <div class="product-es-doc-panel" hidden> | |
| 488 | - <div class="product-es-doc-panel-status"></div> | |
| 489 | - <pre class="product-es-doc-pre"></pre> | |
| 460 | + <div class="product-result-doc-panel" hidden> | |
| 461 | + <pre class="product-result-doc-pre"></pre> | |
| 490 | 462 | </div> |
| 491 | 463 | </div> |
| 492 | 464 | `; | ... | ... |
query/language_detector.py
| 1 | 1 | """ |
| 2 | 2 | Language detection utility. |
| 3 | 3 | |
| 4 | -Detects language of short e-commerce queries with script checks + lightweight | |
| 5 | -Latin-language scoring (de/fr/es/it/pt/nl/en). | |
| 4 | +Script-first rules for CJK and other non-Latin scripts, then Lingua | |
| 5 | +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation. | |
| 6 | 6 | """ |
| 7 | 7 | |
| 8 | -from typing import Dict, List | |
| 8 | +from __future__ import annotations | |
| 9 | + | |
| 10 | +from typing import Dict, Optional | |
| 9 | 11 | import re |
| 10 | 12 | |
| 13 | +from lingua import Language, LanguageDetectorBuilder | |
| 14 | + | |
| 15 | +_LINGUA_TO_CODE: Dict[Language, str] = { | |
| 16 | + Language.CHINESE: "zh", | |
| 17 | + Language.ENGLISH: "en", | |
| 18 | + Language.JAPANESE: "ja", | |
| 19 | + Language.KOREAN: "ko", | |
| 20 | + Language.GERMAN: "de", | |
| 21 | + Language.FRENCH: "fr", | |
| 22 | + Language.SPANISH: "es", | |
| 23 | + Language.ITALIAN: "it", | |
| 24 | + Language.PORTUGUESE: "pt", | |
| 25 | + Language.DUTCH: "nl", | |
| 26 | + Language.RUSSIAN: "ru", | |
| 27 | + Language.ARABIC: "ar", | |
| 28 | + Language.HINDI: "hi", | |
| 29 | + Language.HEBREW: "he", | |
| 30 | + Language.THAI: "th", | |
| 31 | +} | |
| 32 | + | |
| 33 | +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys()) | |
| 34 | + | |
| 35 | +_lingua_detector: Optional[object] = None | |
| 36 | + | |
| 37 | + | |
| 38 | +def _get_lingua_detector(): | |
| 39 | + global _lingua_detector | |
| 40 | + if _lingua_detector is None: | |
| 41 | + _lingua_detector = LanguageDetectorBuilder.from_languages( | |
| 42 | + *_LINGUA_LANGUAGES | |
| 43 | + ).build() | |
| 44 | + return _lingua_detector | |
| 45 | + | |
| 11 | 46 | |
| 12 | 47 | class LanguageDetector: |
| 13 | - """Rule-based language detector for common e-commerce query languages.""" | |
| 48 | + """Language detector: script hints + Lingua for Latin-family queries.""" | |
| 14 | 49 | |
| 15 | 50 | def __init__(self): |
| 16 | 51 | self._re_zh = re.compile(r"[\u4e00-\u9fff]") |
| ... | ... | @@ -21,47 +56,6 @@ class LanguageDetector: |
| 21 | 56 | self._re_hi = re.compile(r"[\u0900-\u097f]") |
| 22 | 57 | self._re_he = re.compile(r"[\u0590-\u05ff]") |
| 23 | 58 | self._re_th = re.compile(r"[\u0e00-\u0e7f]") |
| 24 | - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+") | |
| 25 | - | |
| 26 | - # Stopwords + e-commerce terms for Latin-family disambiguation. | |
| 27 | - self._latin_lexicons: Dict[str, set] = { | |
| 28 | - "en": { | |
| 29 | - "the", "and", "for", "with", "new", "women", "men", "kids", | |
| 30 | - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless", | |
| 31 | - }, | |
| 32 | - "de": { | |
| 33 | - "der", "die", "das", "und", "mit", "für", "damen", "herren", | |
| 34 | - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche", | |
| 35 | - }, | |
| 36 | - "fr": { | |
| 37 | - "le", "la", "les", "et", "avec", "pour", "femme", "homme", | |
| 38 | - "enfant", "chaussures", "robe", "chemise", "veste", "sac", | |
| 39 | - }, | |
| 40 | - "es": { | |
| 41 | - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre", | |
| 42 | - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso", | |
| 43 | - }, | |
| 44 | - "it": { | |
| 45 | - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo", | |
| 46 | - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa", | |
| 47 | - }, | |
| 48 | - "pt": { | |
| 49 | - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem", | |
| 50 | - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa", | |
| 51 | - }, | |
| 52 | - "nl": { | |
| 53 | - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen", | |
| 54 | - "schoenen", "jurk", "overhemd", "jas", "tas", | |
| 55 | - }, | |
| 56 | - } | |
| 57 | - self._diacritic_weights: Dict[str, Dict[str, int]] = { | |
| 58 | - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4}, | |
| 59 | - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2}, | |
| 60 | - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2}, | |
| 61 | - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2}, | |
| 62 | - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2}, | |
| 63 | - "nl": {"ij": 2}, | |
| 64 | - } | |
| 65 | 59 | |
| 66 | 60 | def detect(self, text: str) -> str: |
| 67 | 61 | """ |
| ... | ... | @@ -71,9 +65,9 @@ class LanguageDetector: |
| 71 | 65 | """ |
| 72 | 66 | if not text or not text.strip(): |
| 73 | 67 | return "unknown" |
| 74 | - q = text.strip().lower() | |
| 68 | + q = text.strip() | |
| 75 | 69 | |
| 76 | - # Script-first detection for non-Latin languages. | |
| 70 | + # Script-first: unambiguous blocks before Latin/Romance Lingua pass. | |
| 77 | 71 | if self._re_ja_kana.search(q): |
| 78 | 72 | return "ja" |
| 79 | 73 | if self._re_ko.search(q): |
| ... | ... | @@ -91,48 +85,11 @@ class LanguageDetector: |
| 91 | 85 | if self._re_th.search(q): |
| 92 | 86 | return "th" |
| 93 | 87 | |
| 94 | - # Latin-family scoring. | |
| 95 | - tokens = self._re_latin_word.findall(q) | |
| 96 | - if not tokens: | |
| 88 | + detected = _get_lingua_detector().detect_language_of(q) | |
| 89 | + if detected is None: | |
| 97 | 90 | return "unknown" |
| 98 | - | |
| 99 | - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()} | |
| 100 | - scores["en"] = scores.get("en", 0.0) | |
| 101 | - token_set = set(tokens) | |
| 102 | - | |
| 103 | - # Lexicon matches | |
| 104 | - for lang, lex in self._latin_lexicons.items(): | |
| 105 | - overlap = len(token_set & lex) | |
| 106 | - if overlap: | |
| 107 | - scores[lang] += overlap * 2.0 | |
| 108 | - | |
| 109 | - # Diacritics / orthographic hints | |
| 110 | - for lang, hints in self._diacritic_weights.items(): | |
| 111 | - for marker, weight in hints.items(): | |
| 112 | - if marker in q: | |
| 113 | - scores[lang] += weight | |
| 114 | - | |
| 115 | - # Light suffix hints for common product words | |
| 116 | - for t in tokens: | |
| 117 | - if t.endswith("ung") or t.endswith("chen"): | |
| 118 | - scores["de"] += 0.6 | |
| 119 | - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"): | |
| 120 | - scores["es"] += 0.6 | |
| 121 | - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"): | |
| 122 | - scores["it"] += 0.6 | |
| 123 | - if t.endswith("ção") or t.endswith("mente"): | |
| 124 | - scores["pt"] += 0.6 | |
| 125 | - if t.endswith("ment") or t.endswith("eau"): | |
| 126 | - scores["fr"] += 0.5 | |
| 127 | - | |
| 128 | - # Fallback preference: English for pure Latin short tokens. | |
| 129 | - scores["en"] += 0.2 | |
| 130 | - | |
| 131 | - best_lang = max(scores.items(), key=lambda x: x[1])[0] | |
| 132 | - best_score = scores[best_lang] | |
| 133 | - if best_score <= 0: | |
| 134 | - return "en" | |
| 135 | - return best_lang | |
| 91 | + code = _LINGUA_TO_CODE.get(detected) | |
| 92 | + return code if code is not None else "unknown" | |
| 136 | 93 | |
| 137 | 94 | def is_chinese(self, text: str) -> bool: |
| 138 | 95 | return self.detect(text) == "zh" | ... | ... |
requirements.txt
scripts/start_embedding_service.sh
| ... | ... | @@ -138,7 +138,11 @@ fi |
| 138 | 138 | if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then |
| 139 | 139 | echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}" |
| 140 | 140 | fi |
| 141 | -echo "Logs: logs/embedding_api.log, logs/embedding_api_error.log, logs/verbose/embedding_verbose.log" | |
| 141 | +if [[ "${SERVICE_KIND}" == "image" ]]; then | |
| 142 | + echo "Logs: logs/embedding-image.log" | |
| 143 | +else | |
| 144 | + echo "Logs: logs/embedding.log" | |
| 145 | +fi | |
| 142 | 146 | echo |
| 143 | 147 | echo "Tips:" |
| 144 | 148 | echo " - Use a single worker (GPU models cannot be safely duplicated across workers)." |
| ... | ... | @@ -153,12 +157,16 @@ echo |
| 153 | 157 | |
| 154 | 158 | UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}" |
| 155 | 159 | UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}" |
| 160 | +UVICORN_LOG_CONFIG="${EMBEDDING_UVICORN_LOG_CONFIG:-${PROJECT_ROOT}/config/uvicorn_embedding_logging.json}" | |
| 156 | 161 | UVICORN_ARGS=( |
| 157 | 162 | --host "${EMBEDDING_SERVICE_HOST}" |
| 158 | 163 | --port "${EMBEDDING_SERVICE_PORT}" |
| 159 | 164 | --workers 1 |
| 160 | 165 | --log-level "${UVICORN_LOG_LEVEL}" |
| 161 | 166 | ) |
| 167 | +if [[ -f "${UVICORN_LOG_CONFIG}" ]]; then | |
| 168 | + UVICORN_ARGS+=(--log-config "${UVICORN_LOG_CONFIG}") | |
| 169 | +fi | |
| 162 | 170 | if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then |
| 163 | 171 | UVICORN_ARGS+=(--no-access-log) |
| 164 | 172 | fi | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -33,8 +33,8 @@ class ESQueryBuilder: |
| 33 | 33 | function_score_config: Optional[FunctionScoreConfig] = None, |
| 34 | 34 | default_language: str = "en", |
| 35 | 35 | knn_boost: float = 0.25, |
| 36 | - base_minimum_should_match: str = "75%", | |
| 37 | - translation_minimum_should_match: str = "75%", | |
| 36 | + base_minimum_should_match: str = "70%", | |
| 37 | + translation_minimum_should_match: str = "70%", | |
| 38 | 38 | translation_boost: float = 0.4, |
| 39 | 39 | translation_boost_when_source_missing: float = 1.0, |
| 40 | 40 | source_boost_when_missing: float = 0.6, |
| ... | ... | @@ -261,16 +261,13 @@ class ESQueryBuilder: |
| 261 | 261 | if parsed_query: |
| 262 | 262 | query_tokens = getattr(parsed_query, 'query_tokens', None) or [] |
| 263 | 263 | token_count = len(query_tokens) |
| 264 | - if token_count <= 2: | |
| 265 | - knn_k, knn_num_candidates = 30, 100 | |
| 266 | - knn_boost = self.knn_boost * 0.6 # Lower weight for short queries | |
| 267 | - elif token_count >= 5: | |
| 268 | - knn_k, knn_num_candidates = 80, 300 | |
| 264 | + if token_count >= 5: | |
| 265 | + knn_k, knn_num_candidates = 160, 500 | |
| 269 | 266 | knn_boost = self.knn_boost * 1.4 # Higher weight for long queries |
| 270 | 267 | else: |
| 271 | - knn_k, knn_num_candidates = 50, 200 | |
| 268 | + knn_k, knn_num_candidates = 120, 400 | |
| 272 | 269 | else: |
| 273 | - knn_k, knn_num_candidates = 50, 200 | |
| 270 | + knn_k, knn_num_candidates = 120, 400 | |
| 274 | 271 | knn_clause = { |
| 275 | 272 | "field": self.text_embedding_field, |
| 276 | 273 | "query_vector": query_vector.tolist(), | ... | ... |