diff --git a/context/request_context.py b/context/request_context.py
index 3533cb3..13e3e5f 100644
--- a/context/request_context.py
+++ b/context/request_context.py
@@ -41,6 +41,7 @@ class QueryAnalysisResult:
rewritten_query: Optional[str] = None
detected_language: Optional[str] = None
translations: Dict[str, str] = field(default_factory=dict)
+ keywords_queries: Dict[str, str] = field(default_factory=dict)
query_vector: Optional[List[float]] = None
boolean_ast: Optional[str] = None
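For reference, a minimal sketch of the extended dataclass (unrelated fields trimmed). The `{"base": ...}` shape is inferred from the frontend's `keywordsQueries.base` lookup later in this patch; any per-language keys are an assumption.

```python
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class QueryAnalysisResult:
    rewritten_query: Optional[str] = None
    detected_language: Optional[str] = None
    translations: Dict[str, str] = field(default_factory=dict)
    # New field: keyword queries keyed by variant, e.g. {"base": "red dress"}.
    # default_factory is required here: dataclasses reject a bare mutable
    # `= {}` default, which would otherwise be shared across instances.
    keywords_queries: Dict[str, str] = field(default_factory=dict)
    query_vector: Optional[List[float]] = None
    boolean_ast: Optional[str] = None
```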
diff --git a/frontend/static/js/app.js b/frontend/static/js/app.js
index 7ead241..435780b 100644
--- a/frontend/static/js/app.js
+++ b/frontend/static/js/app.js
@@ -1062,6 +1062,7 @@ function buildGlobalFunnelHtml(data, debugInfo) {
const fineInfo = rankingFunnel.fine_rank || debugInfo.fine_rank || {};
const rerankInfo = rankingFunnel.rerank || debugInfo.rerank || {};
const translations = queryAnalysis.translations || {};
+ const keywordsQueries = queryAnalysis.keywords_queries || {};
const summaryHtml = `
@@ -1072,11 +1073,13 @@ function buildGlobalFunnelHtml(data, debugInfo) {
{ label: 'detected_language', value: queryAnalysis.detected_language || 'N/A' },
{ label: 'index_languages', value: (queryAnalysis.index_languages || []).join(', ') || 'N/A' },
{ label: 'query_tokens', value: (queryAnalysis.query_tokens || []).join(', ') || 'N/A' },
+ { label: 'base_keywords', value: keywordsQueries.base || 'N/A' },
{ label: 'translation_enabled', value: featureFlags.translation_enabled ? 'enabled' : 'disabled' },
{ label: 'embedding_enabled', value: featureFlags.embedding_enabled ? 'enabled' : 'disabled' },
{ label: 'style_intent_active', value: featureFlags.style_intent_active ? 'yes' : 'no' },
])}
${Object.keys(translations).length ? renderJsonDetails('Translations', translations, true) : ''}
+ ${Object.keys(keywordsQueries).length ? renderJsonDetails('Keywords Queries', keywordsQueries, true) : ''}
${formatIntentDetectionHtml(queryAnalysis.intent_detection ?? queryAnalysis.style_intent_profile)}
`;
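The funnel panel reads these fields from the per-request debug payload. A sketch of the shape it expects, with illustrative values only; the key names mirror the dict assembled in the search/searcher.py hunk at the end of this patch.

```python
# Illustrative payload backing buildGlobalFunnelHtml's queryAnalysis argument;
# keys follow the searcher.py hunk below, values are made up.
query_analysis = {
    "detected_language": "zh",
    "index_languages": ["zh", "en"],
    "query_tokens": ["红色", "连衣裙"],
    "translations": {"en": "red dress"},
    "keywords_queries": {"base": "红色 连衣裙", "en": "red dress"},
    "has_vector": True,
    "has_image_vector": False,
}
```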
diff --git a/query/query_parser.py b/query/query_parser.py
index 28279c7..3a36e37 100644
--- a/query/query_parser.py
+++ b/query/query_parser.py
@@ -359,16 +359,15 @@ class QueryParser:
else:
active_logger.debug(msg)
+ before_wait_t0 = time.perf_counter()
+
# Stage 1: Normalize
- normalize_t0 = time.perf_counter()
normalized = self.normalizer.normalize(query)
- normalize_ms = (time.perf_counter() - normalize_t0) * 1000.0
log_debug(f"Normalization completed | '{query}' -> '{normalized}'")
if context:
context.store_intermediate_result('query_normalized', normalized)
# Stage 2: Query rewriting
- rewrite_t0 = time.perf_counter()
query_text = normalized
rewritten = normalized
if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists
@@ -379,12 +378,10 @@ class QueryParser:
if context:
context.store_intermediate_result('rewritten_query', rewritten)
context.add_warning(f"Query was rewritten: {query_text}")
- rewrite_ms = (time.perf_counter() - rewrite_t0) * 1000.0
normalized_targets = self._normalize_language_codes(target_languages)
# Stage 3: Language detection
- language_detect_t0 = time.perf_counter()
detected_lang = self._detect_query_language(
query_text,
target_languages=normalized_targets,
@@ -392,7 +389,6 @@ class QueryParser:
# Use default language if detection failed (None or "unknown")
if not detected_lang or detected_lang == "unknown":
detected_lang = self.config.query_config.default_language
- language_detect_ms = (time.perf_counter() - language_detect_t0) * 1000.0
log_info(f"Language detection | Detected language: {detected_lang}")
if context:
context.store_intermediate_result('detected_language', detected_lang)
@@ -433,9 +429,7 @@ class QueryParser:
thread_name_prefix="query-enrichment",
)
- async_submit_ms = 0.0
try:
- async_submit_t0 = time.perf_counter()
if async_executor is not None:
for lang in translation_targets:
model_name = self._pick_query_translation_model(
@@ -503,7 +497,6 @@ class QueryParser:
future = async_executor.submit(_encode_image_query_vector)
future_to_task[future] = ("image_embedding", None)
future_submit_at[future] = time.perf_counter()
- async_submit_ms = (time.perf_counter() - async_submit_t0) * 1000.0
except Exception as e:
error_msg = f"Async query enrichment submission failed | Error: {str(e)}"
log_info(error_msg)
@@ -516,14 +509,8 @@ class QueryParser:
future_submit_at.clear()
# Stage 4: Query analysis (tokenization) now overlaps with async enrichment work.
- query_analysis_t0 = time.perf_counter()
- query_tokenizer_t0 = time.perf_counter()
query_tokenizer_result = text_analysis_cache.get_tokenizer_result(query_text)
- query_tokenizer_ms = (time.perf_counter() - query_tokenizer_t0) * 1000.0
- query_token_extract_t0 = time.perf_counter()
query_tokens = self._extract_tokens(query_tokenizer_result)
- query_token_extract_ms = (time.perf_counter() - query_token_extract_t0) * 1000.0
- query_analysis_ms = (time.perf_counter() - query_analysis_t0) * 1000.0
log_debug(f"Query analysis | Query tokens: {query_tokens}")
if context:
@@ -541,6 +528,7 @@ class QueryParser:
keywords_base_ms = (time.perf_counter() - keywords_base_t0) * 1000.0
except Exception as e:
log_info(f"Base keyword extraction failed | Error: {e}")
+ before_wait_ms = (time.perf_counter() - before_wait_t0) * 1000.0
# Wait for translation + embedding concurrently; shared budget depends on whether
# the detected language belongs to caller-provided target_languages.
@@ -569,7 +557,6 @@ class QueryParser:
async_wait_t0 = time.perf_counter()
done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec)
async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0
- async_collect_t0 = time.perf_counter()
for future in done:
task_type, lang = future_to_task[future]
t0 = future_submit_at.pop(future, None)
@@ -630,7 +617,6 @@ class QueryParser:
log_info(timeout_msg)
if context:
context.add_warning(timeout_msg)
- async_collect_ms = (time.perf_counter() - async_collect_t0) * 1000.0
if async_executor:
async_executor.shutdown(wait=False)
@@ -639,7 +625,6 @@ class QueryParser:
context.store_intermediate_result("translations", translations)
else:
async_wait_ms = 0.0
- async_collect_ms = 0.0
tail_sync_t0 = time.perf_counter()
keywords_queries: Dict[str, str] = {}
@@ -655,6 +640,9 @@ class QueryParser:
base_keywords_query=keywords_base_query,
)
keyword_tail_ms = (time.perf_counter() - keywords_t0) * 1000.0
+ if context:
+ context.store_intermediate_result("keywords_queries", keywords_queries)
+ log_info(f"Keyword extraction completed | keywords_queries={keywords_queries}")
except Exception as e:
log_info(f"Keyword extraction failed | Error: {e}")
@@ -671,39 +659,15 @@ class QueryParser:
keywords_queries=keywords_queries,
_text_analysis_cache=text_analysis_cache,
)
- style_intent_t0 = time.perf_counter()
style_intent_profile = self.style_intent_detector.detect(base_result)
- style_intent_ms = (time.perf_counter() - style_intent_t0) * 1000.0
- product_title_exclusion_t0 = time.perf_counter()
product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result)
- product_title_exclusion_ms = (
- (time.perf_counter() - product_title_exclusion_t0) * 1000.0
- )
tail_sync_ms = (time.perf_counter() - tail_sync_t0) * 1000.0
- before_wait_ms = (
- normalize_ms
- + rewrite_ms
- + language_detect_ms
- + async_submit_ms
- + query_analysis_ms
- + keywords_base_ms
- )
log_info(
"Query parse stage timings | "
- f"normalize_ms={normalize_ms:.1f} | "
- f"rewrite_ms={rewrite_ms:.1f} | "
- f"language_detect_ms={language_detect_ms:.1f} | "
- f"query_tokenizer_ms={query_tokenizer_ms:.1f} | "
- f"query_token_extract_ms={query_token_extract_ms:.1f} | "
- f"query_analysis_ms={query_analysis_ms:.1f} | "
- f"async_submit_ms={async_submit_ms:.1f} | "
f"before_wait_ms={before_wait_ms:.1f} | "
f"async_wait_ms={async_wait_ms:.1f} | "
- f"async_collect_ms={async_collect_ms:.1f} | "
f"base_keywords_ms={keywords_base_ms:.1f} | "
f"keyword_tail_ms={keyword_tail_ms:.1f} | "
- f"style_intent_ms={style_intent_ms:.1f} | "
- f"product_title_exclusion_ms={product_title_exclusion_ms:.1f} | "
f"tail_sync_ms={tail_sync_ms:.1f}"
)
if context:
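The timing change above is easiest to see in isolation: instead of summing six per-stage timers into before_wait_ms (which silently drops any time spent between stages), the patch brackets the whole pre-wait span with a single perf_counter pair. A runnable sketch with hypothetical stand-ins for the real stages:

```python
import time
from concurrent.futures import ThreadPoolExecutor, wait

def translate(q: str) -> str:          # stand-in for an enrichment task
    time.sleep(0.05)
    return q.upper()

def embed(q: str) -> list[float]:      # stand-in for query-vector encoding
    time.sleep(0.05)
    return [0.0] * 4

def parse(query: str, budget_sec: float = 0.5) -> None:
    # One bracket covers normalize/rewrite/detect/submit/tokenize/keywords.
    before_wait_t0 = time.perf_counter()
    executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="query-enrichment")
    futures = [executor.submit(translate, query), executor.submit(embed, query)]
    tokens = query.split()             # synchronous work overlaps the futures
    before_wait_ms = (time.perf_counter() - before_wait_t0) * 1000.0

    # The shared budget bounds how long we block on translation + embedding.
    async_wait_t0 = time.perf_counter()
    done, not_done = wait(futures, timeout=budget_sec)
    async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0
    executor.shutdown(wait=False)
    print(f"before_wait_ms={before_wait_ms:.1f} | async_wait_ms={async_wait_ms:.1f} | "
          f"tokens={tokens} | timed_out={len(not_done)}")

parse("red dress")
```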
diff --git a/query/tokenization.py b/query/tokenization.py
index e33a31d..dadd4dc 100644
--- a/query/tokenization.py
+++ b/query/tokenization.py
@@ -89,11 +89,18 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str]
return phrases
-def _build_coarse_tokens(text: str, fine_tokens: Sequence[str]) -> List[str]:
- coarse_tokens = _dedupe_preserve_order(simple_tokenize_query(text))
- if contains_han_text(text) and fine_tokens:
- return list(_dedupe_preserve_order(fine_tokens))
- return coarse_tokens
+def _build_coarse_tokens(
+ text: str,
+ *,
+ language_hint: Optional[str],
+ tokenizer_tokens: Sequence[str],
+) -> List[str]:
+ normalized_language = normalize_query_text(language_hint)
+ if normalized_language == "zh" or (contains_han_text(text) and tokenizer_tokens):
+ # Chinese coarse tokenization should follow the model tokenizer rather than a
+ # regex that collapses the whole sentence into one CJK span.
+ return list(_dedupe_preserve_order(tokenizer_tokens))
+ return _dedupe_preserve_order(simple_tokenize_query(text))
@dataclass(frozen=True)
@@ -159,7 +166,11 @@ class QueryTextAnalysisCache:
normalized_text = normalize_query_text(normalized_input)
fine_raw = extract_token_strings(self.get_tokenizer_result(normalized_input))
fine_tokens = _dedupe_preserve_order(fine_raw)
- coarse_tokens = _build_coarse_tokens(normalized_input, fine_tokens)
+ coarse_tokens = _build_coarse_tokens(
+ normalized_input,
+ language_hint=self.get_language_hint(normalized_input),
+ tokenizer_tokens=fine_tokens,
+ )
bundle = TokenizedText(
text=normalized_input,
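The comment in _build_coarse_tokens is worth demonstrating. A word-character regex (assumed here as a stand-in for simple_tokenize_query, whose implementation is not part of this patch) keeps an unspaced Chinese sentence as a single span, so the coarse tokens carry no signal; the model tokenizer's tokens do:

```python
import re

def simple_tokenize_sketch(text: str) -> list[str]:
    # Assumed stand-in: \w matches Han characters in Python 3, so an
    # unspaced Chinese query survives as one undivided token.
    return re.findall(r"\w+", text)

print(simple_tokenize_sketch("red dress"))   # ['red', 'dress']
print(simple_tokenize_sketch("红色连衣裙"))    # ['红色连衣裙'] -- one CJK span
# With language_hint == "zh", the patched _build_coarse_tokens returns the
# model tokenizer's output instead, e.g. ['红色', '连衣裙'].
```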
diff --git a/search/searcher.py b/search/searcher.py
index 37f4ffe..e1a1c72 100644
--- a/search/searcher.py
+++ b/search/searcher.py
@@ -446,6 +446,7 @@ class Searcher:
rewritten_query=parsed_query.rewritten_query,
detected_language=parsed_query.detected_language,
translations=parsed_query.translations,
+ keywords_queries=parsed_query.keywords_queries,
query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None,
)
context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query)
@@ -454,6 +455,7 @@ class Searcher:
f"查询解析完成 | 原查询: '{parsed_query.original_query}' | "
f"重写后: '{parsed_query.rewritten_query}' | "
f"语言: {parsed_query.detected_language} | "
+ f"关键词: {parsed_query.keywords_queries} | "
f"文本向量: {'是' if parsed_query.query_vector is not None else '否'} | "
f"图片向量: {'是' if getattr(parsed_query, 'image_query_vector', None) is not None else '否'}",
extra={'reqid': context.reqid, 'uid': context.uid}
@@ -1172,6 +1174,7 @@ class Searcher:
"detected_language": context.query_analysis.detected_language,
"index_languages": index_langs,
"translations": context.query_analysis.translations,
+ "keywords_queries": context.query_analysis.keywords_queries,
"has_vector": context.query_analysis.query_vector is not None,
"has_image_vector": getattr(parsed_query, "image_query_vector", None) is not None,
"query_tokens": getattr(parsed_query, "query_tokens", []),