Compare View

switch
from
...
to
 
Commits (3)
config/loader.py
... ... @@ -281,8 +281,8 @@ class AppConfigLoader:
281 281 ["title", "brief", "vendor", "category_name_text"],
282 282 )
283 283 ),
284   - base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "75%")),
285   - translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "75%")),
  284 + base_minimum_should_match=str(text_strategy.get("base_minimum_should_match", "70%")),
  285 + translation_minimum_should_match=str(text_strategy.get("translation_minimum_should_match", "70%")),
286 286 translation_boost=float(text_strategy.get("translation_boost", 0.4)),
287 287 translation_boost_when_source_missing=float(
288 288 text_strategy.get("translation_boost_when_source_missing", 1.0)
... ...
config/schema.py
... ... @@ -51,8 +51,8 @@ class QueryConfig:
51 51 core_multilingual_fields: List[str] = field(
52 52 default_factory=lambda: ["title", "brief", "vendor", "category_name_text"]
53 53 )
54   - base_minimum_should_match: str = "75%"
55   - translation_minimum_should_match: str = "75%"
  54 + base_minimum_should_match: str = "70%"
  55 + translation_minimum_should_match: str = "70%"
56 56 translation_boost: float = 0.4
57 57 translation_boost_when_source_missing: float = 1.0
58 58 source_boost_when_missing: float = 0.6
... ...
docs/TODO.txt
... ... @@ -236,14 +236,19 @@ config/environments/<env>.yaml
236 236  
237 237  
238 238  
  239 +筛选SKU: 先只筛选第一个维度,但考虑到用户搜索词可能带了尺码,所以第二、三个维度也要考虑
239 240  
240 241  
  242 +引入图片的相关性:
  243 +图片的向量最好做SKU维度,用 SPU 维度还是 SKU 维度?
  244 +1. SKU维度(主款式,option1维度),如果用户搜索“蓝色 T恤”,这种图片相关性会比较有价值。
  245 +2. 我不考虑颜色的差异,其余的款式一般是大小之类的。这些图片,向量细分到 SKU 维度,可能价值不大,性价比偏低
241 246  
242 247  
243 248  
244 249  
245   -
246   -
  250 +无结果重查
  251 +稀有语言,翻译可能超时(因为zh-en互译之外的翻译耗时更长)
247 252  
248 253  
249 254  
... ...
docs/常用查询 - ES.md
... ... @@ -654,3 +654,18 @@ GET /search_products_tenant_170/_search
654 654 }
655 655 }
656 656 }
  657 +
  658 +
  659 +检查某个字段是否存在
  660 +curl -u 'saas:4hOaLaf41y2VuI8y' -X POST \
  661 + 'http://localhost:9200/search_products_tenant_163/_count' \
  662 + -H 'Content-Type: application/json' \
  663 + -d '{
  664 + "query": {
  665 + "bool": {
  666 + "filter": [
  667 + { "exists": { "field": "title_embedding" } }
  668 + ]
  669 + }
  670 + }
  671 + }'
657 672 \ No newline at end of file
... ...
embeddings/server.py
... ... @@ -14,7 +14,6 @@ import time
14 14 import uuid
15 15 from collections import deque
16 16 from dataclasses import dataclass
17   -from logging.handlers import TimedRotatingFileHandler
18 17 from typing import Any, Dict, List, Optional
19 18  
20 19 import numpy as np
... ... @@ -44,9 +43,7 @@ def configure_embedding_logging() -> None:
44 43 return
45 44  
46 45 log_dir = pathlib.Path("logs")
47   - verbose_dir = log_dir / "verbose"
48 46 log_dir.mkdir(exist_ok=True)
49   - verbose_dir.mkdir(parents=True, exist_ok=True)
50 47  
51 48 log_level = os.getenv("LOG_LEVEL", "INFO").upper()
52 49 numeric_level = getattr(logging, log_level, logging.INFO)
... ... @@ -56,47 +53,18 @@ def configure_embedding_logging() -> None:
56 53 request_filter = _DefaultRequestIdFilter()
57 54  
58 55 root_logger.setLevel(numeric_level)
59   -
60   - file_handler = TimedRotatingFileHandler(
61   - filename=log_dir / "embedding_api.log",
62   - when="midnight",
63   - interval=1,
64   - backupCount=30,
65   - encoding="utf-8",
66   - )
67   - file_handler.setLevel(numeric_level)
68   - file_handler.setFormatter(formatter)
69   - file_handler.addFilter(request_filter)
70   - root_logger.addHandler(file_handler)
71   -
72   - error_handler = TimedRotatingFileHandler(
73   - filename=log_dir / "embedding_api_error.log",
74   - when="midnight",
75   - interval=1,
76   - backupCount=30,
77   - encoding="utf-8",
78   - )
79   - error_handler.setLevel(logging.ERROR)
80   - error_handler.setFormatter(formatter)
81   - error_handler.addFilter(request_filter)
82   - root_logger.addHandler(error_handler)
  56 + root_logger.handlers.clear()
  57 + stream_handler = logging.StreamHandler()
  58 + stream_handler.setLevel(numeric_level)
  59 + stream_handler.setFormatter(formatter)
  60 + stream_handler.addFilter(request_filter)
  61 + root_logger.addHandler(stream_handler)
83 62  
84 63 verbose_logger = logging.getLogger("embedding.verbose")
85 64 verbose_logger.setLevel(numeric_level)
86 65 verbose_logger.handlers.clear()
87   - verbose_logger.propagate = False
88   -
89   - verbose_handler = TimedRotatingFileHandler(
90   - filename=verbose_dir / "embedding_verbose.log",
91   - when="midnight",
92   - interval=1,
93   - backupCount=30,
94   - encoding="utf-8",
95   - )
96   - verbose_handler.setLevel(numeric_level)
97   - verbose_handler.setFormatter(formatter)
98   - verbose_handler.addFilter(request_filter)
99   - verbose_logger.addHandler(verbose_handler)
  66 + # Consolidate verbose logs into the main embedding log stream.
  67 + verbose_logger.propagate = True
100 68  
101 69 root_logger._embedding_logging_configured = True # type: ignore[attr-defined]
102 70  
... ...
frontend/static/css/style.css
... ... @@ -379,7 +379,7 @@ body {
379 379 margin-top: 8px;
380 380 }
381 381  
382   -.product-debug-inline-es-btn {
  382 +.product-debug-inline-result-btn {
383 383 font-family: inherit;
384 384 font-size: 12px;
385 385 padding: 4px 10px;
... ... @@ -390,27 +390,22 @@ body {
390 390 cursor: pointer;
391 391 }
392 392  
393   -.product-debug-inline-es-btn:hover {
  393 +.product-debug-inline-result-btn:hover {
394 394 background: #f0f0f0;
395 395 border-color: #bbb;
396 396 }
397 397  
398   -.product-debug--es-expanded {
  398 +.product-debug--result-expanded {
399 399 max-height: min(70vh, 720px);
400 400 }
401 401  
402   -.product-es-doc-panel {
  402 +.product-result-doc-panel {
403 403 margin-top: 10px;
404 404 padding-top: 8px;
405 405 border-top: 1px dashed #e8e8e8;
406 406 }
407 407  
408   -.product-es-doc-panel-status {
409   - font-size: 12px;
410   - color: #888;
411   -}
412   -
413   -.product-es-doc-pre {
  408 +.product-result-doc-pre {
414 409 margin: 6px 0 0;
415 410 padding: 10px;
416 411 background: #f5f5f5;
... ...
frontend/static/js/app.js
... ... @@ -68,25 +68,25 @@ function initializeApp() {
68 68 // 初始化租户下拉框和分面面板
69 69 console.log('Initializing app...');
70 70 initTenantSelect();
71   - setupProductGridEsDocToggle();
  71 + setupProductGridResultDocToggle();
72 72 const searchInput = document.getElementById('searchInput');
73 73 if (searchInput) {
74 74 searchInput.focus();
75 75 }
76 76 }
77 77  
78   -/** Delegated handler: toggle inline ES raw response under each result card (survives innerHTML refresh on re-search). */
79   -function setupProductGridEsDocToggle() {
  78 +/** Delegated handler: toggle inline current result JSON under each result card (survives innerHTML refresh on re-search). */
  79 +function setupProductGridResultDocToggle() {
80 80 const grid = document.getElementById('productGrid');
81   - if (!grid || grid.dataset.esDocToggleBound === '1') {
  81 + if (!grid || grid.dataset.resultDocToggleBound === '1') {
82 82 return;
83 83 }
84   - grid.dataset.esDocToggleBound = '1';
85   - grid.addEventListener('click', onProductGridEsDocToggleClick);
  84 + grid.dataset.resultDocToggleBound = '1';
  85 + grid.addEventListener('click', onProductGridResultDocToggleClick);
86 86 }
87 87  
88   -async function onProductGridEsDocToggleClick(event) {
89   - const btn = event.target.closest('[data-action="toggle-es-inline-doc"]');
  88 +function onProductGridResultDocToggleClick(event) {
  89 + const btn = event.target.closest('[data-action="toggle-result-inline-doc"]');
90 90 if (!btn) {
91 91 return;
92 92 }
... ... @@ -95,55 +95,27 @@ async function onProductGridEsDocToggleClick(event) {
95 95 if (!debugRoot) {
96 96 return;
97 97 }
98   - const panel = debugRoot.querySelector('.product-es-doc-panel');
99   - const pre = debugRoot.querySelector('.product-es-doc-pre');
100   - const statusEl = debugRoot.querySelector('.product-es-doc-panel-status');
101   - if (!panel || !pre || !statusEl) {
  98 + const panel = debugRoot.querySelector('.product-result-doc-panel');
  99 + const pre = debugRoot.querySelector('.product-result-doc-pre');
  100 + if (!panel || !pre) {
102 101 return;
103 102 }
104 103  
105   - const spuId = btn.getAttribute('data-spu-id') || '';
106   - const tenantId = getTenantId();
107   - const url = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`;
108   -
109   - if (debugRoot.dataset.esInlineOpen === '1') {
  104 + if (debugRoot.dataset.resultInlineOpen === '1') {
110 105 panel.setAttribute('hidden', '');
111   - debugRoot.classList.remove('product-debug--es-expanded');
112   - debugRoot.dataset.esInlineOpen = '0';
113   - btn.textContent = '在结果中显示 ES 文档';
  106 + debugRoot.classList.remove('product-debug--result-expanded');
  107 + debugRoot.dataset.resultInlineOpen = '0';
  108 + btn.textContent = '在结果中显示当前结果数据';
114 109 return;
115 110 }
116 111  
117 112 panel.removeAttribute('hidden');
118   - debugRoot.classList.add('product-debug--es-expanded');
119   - debugRoot.dataset.esInlineOpen = '1';
120   - btn.textContent = '隐藏 ES 文档';
121   -
122   - if (pre.textContent.length > 0) {
123   - panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
124   - return;
125   - }
126   -
127   - statusEl.style.display = '';
128   - statusEl.textContent = '加载中…';
129   - pre.style.display = 'none';
130   -
131   - try {
132   - const response = await fetch(url);
133   - if (!response.ok) {
134   - const errText = await response.text();
135   - throw new Error(`HTTP ${response.status}: ${errText.slice(0, 200)}`);
136   - }
137   - const data = await response.json();
138   - pre.textContent = customStringify(data);
139   - statusEl.style.display = 'none';
140   - pre.style.display = 'block';
141   - } catch (err) {
142   - console.error('ES doc fetch failed', err);
143   - statusEl.textContent = `加载失败: ${err.message || err}`;
144   - pre.style.display = 'none';
  113 + debugRoot.classList.add('product-debug--result-expanded');
  114 + debugRoot.dataset.resultInlineOpen = '1';
  115 + btn.textContent = '隐藏当前结果数据';
  116 + if (pre.textContent.length === 0) {
  117 + pre.textContent = btn.getAttribute('data-result-json') || '{}';
145 118 }
146   -
147 119 panel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
148 120 }
149 121  
... ... @@ -213,7 +185,7 @@ function initTenantSelect() {
213 185 });
214 186 // 设置默认值(仅当输入框为空时)
215 187 if (!tenantSelect.value.trim()) {
216   - tenantSelect.value = availableTenants.includes('170') ? '170' : availableTenants[0];
  188 + tenantSelect.value = availableTenants.includes('0') ? '0' : availableTenants[0];
217 189 }
218 190 }
219 191  
... ... @@ -462,6 +434,7 @@ function displayResults(data) {
462 434 });
463 435 }
464 436  
  437 + const resultJson = customStringify(result);
465 438 const rawUrl = `${API_BASE_URL}/search/es-doc/${encodeURIComponent(spuId)}?tenant_id=${encodeURIComponent(tenantId)}`;
466 439  
467 440 debugHtml = `
... ... @@ -475,18 +448,17 @@ function displayResults(data) {
475 448 <div class="product-debug-line">Fused score: ${fusedScore}</div>
476 449 ${titleLines}
477 450 <div class="product-debug-actions">
478   - <button type="button" class="product-debug-inline-es-btn"
479   - data-action="toggle-es-inline-doc"
480   - data-spu-id="${escapeAttr(String(spuId || ''))}">
481   - 在结果中显示 ES 文档
  451 + <button type="button" class="product-debug-inline-result-btn"
  452 + data-action="toggle-result-inline-doc"
  453 + data-result-json="${escapeAttr(resultJson)}">
  454 + 在结果中显示当前结果数据
482 455 </button>
483 456 <a class="product-debug-link" href="${rawUrl}" target="_blank" rel="noopener noreferrer">
484 457 查看 ES 原始文档
485 458 </a>
486 459 </div>
487   - <div class="product-es-doc-panel" hidden>
488   - <div class="product-es-doc-panel-status"></div>
489   - <pre class="product-es-doc-pre"></pre>
  460 + <div class="product-result-doc-panel" hidden>
  461 + <pre class="product-result-doc-pre"></pre>
490 462 </div>
491 463 </div>
492 464 `;
... ...
query/language_detector.py
1 1 """
2 2 Language detection utility.
3 3  
4   -Detects language of short e-commerce queries with script checks + lightweight
5   -Latin-language scoring (de/fr/es/it/pt/nl/en).
  4 +Script-first rules for CJK and other non-Latin scripts, then Lingua
  5 +(lingua-language-detector) for Latin text and Romance/Germanic disambiguation.
6 6 """
7 7  
8   -from typing import Dict, List
  8 +from __future__ import annotations
  9 +
  10 +from typing import Dict, Optional
9 11 import re
10 12  
  13 +from lingua import Language, LanguageDetectorBuilder
  14 +
  15 +_LINGUA_TO_CODE: Dict[Language, str] = {
  16 + Language.CHINESE: "zh",
  17 + Language.ENGLISH: "en",
  18 + Language.JAPANESE: "ja",
  19 + Language.KOREAN: "ko",
  20 + Language.GERMAN: "de",
  21 + Language.FRENCH: "fr",
  22 + Language.SPANISH: "es",
  23 + Language.ITALIAN: "it",
  24 + Language.PORTUGUESE: "pt",
  25 + Language.DUTCH: "nl",
  26 + Language.RUSSIAN: "ru",
  27 + Language.ARABIC: "ar",
  28 + Language.HINDI: "hi",
  29 + Language.HEBREW: "he",
  30 + Language.THAI: "th",
  31 +}
  32 +
  33 +_LINGUA_LANGUAGES = tuple(_LINGUA_TO_CODE.keys())
  34 +
  35 +_lingua_detector: Optional[object] = None
  36 +
  37 +
  38 +def _get_lingua_detector():
  39 + global _lingua_detector
  40 + if _lingua_detector is None:
  41 + _lingua_detector = LanguageDetectorBuilder.from_languages(
  42 + *_LINGUA_LANGUAGES
  43 + ).build()
  44 + return _lingua_detector
  45 +
11 46  
12 47 class LanguageDetector:
13   - """Rule-based language detector for common e-commerce query languages."""
  48 + """Language detector: script hints + Lingua for Latin-family queries."""
14 49  
15 50 def __init__(self):
16 51 self._re_zh = re.compile(r"[\u4e00-\u9fff]")
... ... @@ -21,47 +56,6 @@ class LanguageDetector:
21 56 self._re_hi = re.compile(r"[\u0900-\u097f]")
22 57 self._re_he = re.compile(r"[\u0590-\u05ff]")
23 58 self._re_th = re.compile(r"[\u0e00-\u0e7f]")
24   - self._re_latin_word = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+")
25   -
26   - # Stopwords + e-commerce terms for Latin-family disambiguation.
27   - self._latin_lexicons: Dict[str, set] = {
28   - "en": {
29   - "the", "and", "for", "with", "new", "women", "men", "kids",
30   - "shoe", "shoes", "dress", "shirt", "jacket", "bag", "wireless",
31   - },
32   - "de": {
33   - "der", "die", "das", "und", "mit", "für", "damen", "herren",
34   - "kinder", "schuhe", "kleid", "hemd", "jacke", "tasche",
35   - },
36   - "fr": {
37   - "le", "la", "les", "et", "avec", "pour", "femme", "homme",
38   - "enfant", "chaussures", "robe", "chemise", "veste", "sac",
39   - },
40   - "es": {
41   - "el", "la", "los", "las", "y", "con", "para", "mujer", "hombre",
42   - "niño", "niña", "zapatos", "vestido", "camisa", "chaqueta", "bolso",
43   - },
44   - "it": {
45   - "il", "lo", "la", "gli", "le", "e", "con", "per", "donna", "uomo",
46   - "bambino", "scarpe", "abito", "camicia", "giacca", "borsa",
47   - },
48   - "pt": {
49   - "o", "a", "os", "as", "e", "com", "para", "mulher", "homem",
50   - "criança", "sapatos", "vestido", "camisa", "jaqueta", "bolsa",
51   - },
52   - "nl": {
53   - "de", "het", "en", "met", "voor", "dames", "heren", "kinderen",
54   - "schoenen", "jurk", "overhemd", "jas", "tas",
55   - },
56   - }
57   - self._diacritic_weights: Dict[str, Dict[str, int]] = {
58   - "de": {"ä": 3, "ö": 3, "ü": 3, "ß": 4},
59   - "fr": {"é": 2, "è": 2, "ê": 2, "à": 2, "ç": 2, "ù": 2, "ô": 2},
60   - "es": {"ñ": 3, "á": 2, "é": 2, "í": 2, "ó": 2, "ú": 2},
61   - "it": {"à": 2, "è": 2, "é": 2, "ì": 2, "ò": 2, "ù": 2},
62   - "pt": {"ã": 3, "õ": 3, "ç": 2, "á": 2, "â": 2, "ê": 2, "ô": 2},
63   - "nl": {"ij": 2},
64   - }
65 59  
66 60 def detect(self, text: str) -> str:
67 61 """
... ... @@ -71,9 +65,9 @@ class LanguageDetector:
71 65 """
72 66 if not text or not text.strip():
73 67 return "unknown"
74   - q = text.strip().lower()
  68 + q = text.strip()
75 69  
76   - # Script-first detection for non-Latin languages.
  70 + # Script-first: unambiguous blocks before Latin/Romance Lingua pass.
77 71 if self._re_ja_kana.search(q):
78 72 return "ja"
79 73 if self._re_ko.search(q):
... ... @@ -91,48 +85,11 @@ class LanguageDetector:
91 85 if self._re_th.search(q):
92 86 return "th"
93 87  
94   - # Latin-family scoring.
95   - tokens = self._re_latin_word.findall(q)
96   - if not tokens:
  88 + detected = _get_lingua_detector().detect_language_of(q)
  89 + if detected is None:
97 90 return "unknown"
98   -
99   - scores: Dict[str, float] = {k: 0.0 for k in self._latin_lexicons.keys()}
100   - scores["en"] = scores.get("en", 0.0)
101   - token_set = set(tokens)
102   -
103   - # Lexicon matches
104   - for lang, lex in self._latin_lexicons.items():
105   - overlap = len(token_set & lex)
106   - if overlap:
107   - scores[lang] += overlap * 2.0
108   -
109   - # Diacritics / orthographic hints
110   - for lang, hints in self._diacritic_weights.items():
111   - for marker, weight in hints.items():
112   - if marker in q:
113   - scores[lang] += weight
114   -
115   - # Light suffix hints for common product words
116   - for t in tokens:
117   - if t.endswith("ung") or t.endswith("chen"):
118   - scores["de"] += 0.6
119   - if t.endswith("ción") or t.endswith("ado") or t.endswith("ada"):
120   - scores["es"] += 0.6
121   - if t.endswith("zione") or t.endswith("etto") or t.endswith("ella"):
122   - scores["it"] += 0.6
123   - if t.endswith("ção") or t.endswith("mente"):
124   - scores["pt"] += 0.6
125   - if t.endswith("ment") or t.endswith("eau"):
126   - scores["fr"] += 0.5
127   -
128   - # Fallback preference: English for pure Latin short tokens.
129   - scores["en"] += 0.2
130   -
131   - best_lang = max(scores.items(), key=lambda x: x[1])[0]
132   - best_score = scores[best_lang]
133   - if best_score <= 0:
134   - return "en"
135   - return best_lang
  91 + code = _LINGUA_TO_CODE.get(detected)
  92 + return code if code is not None else "unknown"
136 93  
137 94 def is_chinese(self, text: str) -> bool:
138 95 return self.detect(text) == "zh"
... ...
requirements.txt
... ... @@ -42,3 +42,6 @@ click>=8.1.0
42 42 pytest>=7.4.0
43 43 pytest-asyncio>=0.21.0
44 44 httpx>=0.24.0
  45 +
  46 +# language detector
  47 +lingua-language-detector
45 48 \ No newline at end of file
... ...
scripts/start_embedding_service.sh
... ... @@ -138,7 +138,11 @@ fi
138 138 if [[ "${IMAGE_MODEL_ENABLED}" == "1" ]]; then
139 139 echo "Image max inflight: ${IMAGE_MAX_INFLIGHT:-1}"
140 140 fi
141   -echo "Logs: logs/embedding_api.log, logs/embedding_api_error.log, logs/verbose/embedding_verbose.log"
  141 +if [[ "${SERVICE_KIND}" == "image" ]]; then
  142 + echo "Logs: logs/embedding-image.log"
  143 +else
  144 + echo "Logs: logs/embedding.log"
  145 +fi
142 146 echo
143 147 echo "Tips:"
144 148 echo " - Use a single worker (GPU models cannot be safely duplicated across workers)."
... ... @@ -153,12 +157,16 @@ echo
153 157  
154 158 UVICORN_LOG_LEVEL="${EMBEDDING_UVICORN_LOG_LEVEL:-info}"
155 159 UVICORN_ACCESS_LOG="${EMBEDDING_UVICORN_ACCESS_LOG:-true}"
  160 +UVICORN_LOG_CONFIG="${EMBEDDING_UVICORN_LOG_CONFIG:-${PROJECT_ROOT}/config/uvicorn_embedding_logging.json}"
156 161 UVICORN_ARGS=(
157 162 --host "${EMBEDDING_SERVICE_HOST}"
158 163 --port "${EMBEDDING_SERVICE_PORT}"
159 164 --workers 1
160 165 --log-level "${UVICORN_LOG_LEVEL}"
161 166 )
  167 +if [[ -f "${UVICORN_LOG_CONFIG}" ]]; then
  168 + UVICORN_ARGS+=(--log-config "${UVICORN_LOG_CONFIG}")
  169 +fi
162 170 if [[ "${UVICORN_ACCESS_LOG}" == "0" || "${UVICORN_ACCESS_LOG}" == "false" || "${UVICORN_ACCESS_LOG}" == "no" ]]; then
163 171 UVICORN_ARGS+=(--no-access-log)
164 172 fi
... ...
search/es_query_builder.py
... ... @@ -33,8 +33,8 @@ class ESQueryBuilder:
33 33 function_score_config: Optional[FunctionScoreConfig] = None,
34 34 default_language: str = "en",
35 35 knn_boost: float = 0.25,
36   - base_minimum_should_match: str = "75%",
37   - translation_minimum_should_match: str = "75%",
  36 + base_minimum_should_match: str = "70%",
  37 + translation_minimum_should_match: str = "70%",
38 38 translation_boost: float = 0.4,
39 39 translation_boost_when_source_missing: float = 1.0,
40 40 source_boost_when_missing: float = 0.6,
... ... @@ -261,16 +261,13 @@ class ESQueryBuilder:
261 261 if parsed_query:
262 262 query_tokens = getattr(parsed_query, 'query_tokens', None) or []
263 263 token_count = len(query_tokens)
264   - if token_count <= 2:
265   - knn_k, knn_num_candidates = 30, 100
266   - knn_boost = self.knn_boost * 0.6 # Lower weight for short queries
267   - elif token_count >= 5:
268   - knn_k, knn_num_candidates = 80, 300
  264 + if token_count >= 5:
  265 + knn_k, knn_num_candidates = 160, 500
269 266 knn_boost = self.knn_boost * 1.4 # Higher weight for long queries
270 267 else:
271   - knn_k, knn_num_candidates = 50, 200
  268 + knn_k, knn_num_candidates = 120, 400
272 269 else:
273   - knn_k, knn_num_candidates = 50, 200
  270 + knn_k, knn_num_candidates = 120, 400
274 271 knn_clause = {
275 272 "field": self.text_embedding_field,
276 273 "query_vector": query_vector.tolist(),
... ...