diff --git a/context/request_context.py b/context/request_context.py index 3533cb3..13e3e5f 100644 --- a/context/request_context.py +++ b/context/request_context.py @@ -41,6 +41,7 @@ class QueryAnalysisResult: rewritten_query: Optional[str] = None detected_language: Optional[str] = None translations: Dict[str, str] = field(default_factory=dict) + keywords_queries: Dict[str, str] = field(default_factory=dict) query_vector: Optional[List[float]] = None boolean_ast: Optional[str] = None diff --git a/frontend/static/js/app.js b/frontend/static/js/app.js index 7ead241..435780b 100644 --- a/frontend/static/js/app.js +++ b/frontend/static/js/app.js @@ -1062,6 +1062,7 @@ function buildGlobalFunnelHtml(data, debugInfo) { const fineInfo = rankingFunnel.fine_rank || debugInfo.fine_rank || {}; const rerankInfo = rankingFunnel.rerank || debugInfo.rerank || {}; const translations = queryAnalysis.translations || {}; + const keywordsQueries = queryAnalysis.keywords_queries || {}; const summaryHtml = `
@@ -1072,11 +1073,13 @@ function buildGlobalFunnelHtml(data, debugInfo) { { label: 'detected_language', value: queryAnalysis.detected_language || 'N/A' }, { label: 'index_languages', value: (queryAnalysis.index_languages || []).join(', ') || 'N/A' }, { label: 'query_tokens', value: (queryAnalysis.query_tokens || []).join(', ') || 'N/A' }, + { label: 'base_keywords', value: keywordsQueries.base || 'N/A' }, { label: 'translation_enabled', value: featureFlags.translation_enabled ? 'enabled' : 'disabled' }, { label: 'embedding_enabled', value: featureFlags.embedding_enabled ? 'enabled' : 'disabled' }, { label: 'style_intent_active', value: featureFlags.style_intent_active ? 'yes' : 'no' }, ])} ${Object.keys(translations).length ? renderJsonDetails('Translations', translations, true) : ''} + ${Object.keys(keywordsQueries).length ? renderJsonDetails('Keywords Queries', keywordsQueries, true) : ''} ${formatIntentDetectionHtml(queryAnalysis.intent_detection ?? queryAnalysis.style_intent_profile)}
`; diff --git a/query/query_parser.py b/query/query_parser.py index 28279c7..3a36e37 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -359,16 +359,15 @@ class QueryParser: else: active_logger.debug(msg) + before_wait_t0 = time.perf_counter() + # Stage 1: Normalize - normalize_t0 = time.perf_counter() normalized = self.normalizer.normalize(query) - normalize_ms = (time.perf_counter() - normalize_t0) * 1000.0 log_debug(f"Normalization completed | '{query}' -> '{normalized}'") if context: context.store_intermediate_result('query_normalized', normalized) # Stage 2: Query rewriting - rewrite_t0 = time.perf_counter() query_text = normalized rewritten = normalized if self.config.query_config.rewrite_dictionary: # Enable rewrite if dictionary exists @@ -379,12 +378,10 @@ class QueryParser: if context: context.store_intermediate_result('rewritten_query', rewritten) context.add_warning(f"Query was rewritten: {query_text}") - rewrite_ms = (time.perf_counter() - rewrite_t0) * 1000.0 normalized_targets = self._normalize_language_codes(target_languages) # Stage 3: Language detection - language_detect_t0 = time.perf_counter() detected_lang = self._detect_query_language( query_text, target_languages=normalized_targets, @@ -392,7 +389,6 @@ class QueryParser: # Use default language if detection failed (None or "unknown") if not detected_lang or detected_lang == "unknown": detected_lang = self.config.query_config.default_language - language_detect_ms = (time.perf_counter() - language_detect_t0) * 1000.0 log_info(f"Language detection | Detected language: {detected_lang}") if context: context.store_intermediate_result('detected_language', detected_lang) @@ -433,9 +429,7 @@ class QueryParser: thread_name_prefix="query-enrichment", ) - async_submit_ms = 0.0 try: - async_submit_t0 = time.perf_counter() if async_executor is not None: for lang in translation_targets: model_name = self._pick_query_translation_model( @@ -503,7 +497,6 @@ class QueryParser: future = async_executor.submit(_encode_image_query_vector) future_to_task[future] = ("image_embedding", None) future_submit_at[future] = time.perf_counter() - async_submit_ms = (time.perf_counter() - async_submit_t0) * 1000.0 except Exception as e: error_msg = f"Async query enrichment submission failed | Error: {str(e)}" log_info(error_msg) @@ -516,14 +509,8 @@ class QueryParser: future_submit_at.clear() # Stage 4: Query analysis (tokenization) now overlaps with async enrichment work. - query_analysis_t0 = time.perf_counter() - query_tokenizer_t0 = time.perf_counter() query_tokenizer_result = text_analysis_cache.get_tokenizer_result(query_text) - query_tokenizer_ms = (time.perf_counter() - query_tokenizer_t0) * 1000.0 - query_token_extract_t0 = time.perf_counter() query_tokens = self._extract_tokens(query_tokenizer_result) - query_token_extract_ms = (time.perf_counter() - query_token_extract_t0) * 1000.0 - query_analysis_ms = (time.perf_counter() - query_analysis_t0) * 1000.0 log_debug(f"Query analysis | Query tokens: {query_tokens}") if context: @@ -541,6 +528,7 @@ class QueryParser: keywords_base_ms = (time.perf_counter() - keywords_base_t0) * 1000.0 except Exception as e: log_info(f"Base keyword extraction failed | Error: {e}") + before_wait_ms = (time.perf_counter() - before_wait_t0) * 1000.0 # Wait for translation + embedding concurrently; shared budget depends on whether # the detected language belongs to caller-provided target_languages. @@ -569,7 +557,6 @@ class QueryParser: async_wait_t0 = time.perf_counter() done, not_done = wait(list(future_to_task.keys()), timeout=budget_sec) async_wait_ms = (time.perf_counter() - async_wait_t0) * 1000.0 - async_collect_t0 = time.perf_counter() for future in done: task_type, lang = future_to_task[future] t0 = future_submit_at.pop(future, None) @@ -630,7 +617,6 @@ class QueryParser: log_info(timeout_msg) if context: context.add_warning(timeout_msg) - async_collect_ms = (time.perf_counter() - async_collect_t0) * 1000.0 if async_executor: async_executor.shutdown(wait=False) @@ -639,7 +625,6 @@ class QueryParser: context.store_intermediate_result("translations", translations) else: async_wait_ms = 0.0 - async_collect_ms = 0.0 tail_sync_t0 = time.perf_counter() keywords_queries: Dict[str, str] = {} @@ -655,6 +640,9 @@ class QueryParser: base_keywords_query=keywords_base_query, ) keyword_tail_ms = (time.perf_counter() - keywords_t0) * 1000.0 + if context: + context.store_intermediate_result("keywords_queries", keywords_queries) + log_info(f"Keyword extraction completed | keywords_queries={keywords_queries}") except Exception as e: log_info(f"Keyword extraction failed | Error: {e}") @@ -671,39 +659,15 @@ class QueryParser: keywords_queries=keywords_queries, _text_analysis_cache=text_analysis_cache, ) - style_intent_t0 = time.perf_counter() style_intent_profile = self.style_intent_detector.detect(base_result) - style_intent_ms = (time.perf_counter() - style_intent_t0) * 1000.0 - product_title_exclusion_t0 = time.perf_counter() product_title_exclusion_profile = self.product_title_exclusion_detector.detect(base_result) - product_title_exclusion_ms = ( - (time.perf_counter() - product_title_exclusion_t0) * 1000.0 - ) tail_sync_ms = (time.perf_counter() - tail_sync_t0) * 1000.0 - before_wait_ms = ( - normalize_ms - + rewrite_ms - + language_detect_ms - + async_submit_ms - + query_analysis_ms - + keywords_base_ms - ) log_info( "Query parse stage timings | " - f"normalize_ms={normalize_ms:.1f} | " - f"rewrite_ms={rewrite_ms:.1f} | " - f"language_detect_ms={language_detect_ms:.1f} | " - f"query_tokenizer_ms={query_tokenizer_ms:.1f} | " - f"query_token_extract_ms={query_token_extract_ms:.1f} | " - f"query_analysis_ms={query_analysis_ms:.1f} | " - f"async_submit_ms={async_submit_ms:.1f} | " f"before_wait_ms={before_wait_ms:.1f} | " f"async_wait_ms={async_wait_ms:.1f} | " - f"async_collect_ms={async_collect_ms:.1f} | " f"base_keywords_ms={keywords_base_ms:.1f} | " f"keyword_tail_ms={keyword_tail_ms:.1f} | " - f"style_intent_ms={style_intent_ms:.1f} | " - f"product_title_exclusion_ms={product_title_exclusion_ms:.1f} | " f"tail_sync_ms={tail_sync_ms:.1f}" ) if context: diff --git a/query/tokenization.py b/query/tokenization.py index e33a31d..dadd4dc 100644 --- a/query/tokenization.py +++ b/query/tokenization.py @@ -89,11 +89,18 @@ def _build_phrase_candidates(tokens: Sequence[str], max_ngram: int) -> List[str] return phrases -def _build_coarse_tokens(text: str, fine_tokens: Sequence[str]) -> List[str]: - coarse_tokens = _dedupe_preserve_order(simple_tokenize_query(text)) - if contains_han_text(text) and fine_tokens: - return list(_dedupe_preserve_order(fine_tokens)) - return coarse_tokens +def _build_coarse_tokens( + text: str, + *, + language_hint: Optional[str], + tokenizer_tokens: Sequence[str], +) -> List[str]: + normalized_language = normalize_query_text(language_hint) + if normalized_language == "zh" or (contains_han_text(text) and tokenizer_tokens): + # Chinese coarse tokenization should follow the model tokenizer rather than a + # regex that collapses the whole sentence into one CJK span. + return list(_dedupe_preserve_order(tokenizer_tokens)) + return _dedupe_preserve_order(simple_tokenize_query(text)) @dataclass(frozen=True) @@ -159,7 +166,11 @@ class QueryTextAnalysisCache: normalized_text = normalize_query_text(normalized_input) fine_raw = extract_token_strings(self.get_tokenizer_result(normalized_input)) fine_tokens = _dedupe_preserve_order(fine_raw) - coarse_tokens = _build_coarse_tokens(normalized_input, fine_tokens) + coarse_tokens = _build_coarse_tokens( + normalized_input, + language_hint=self.get_language_hint(normalized_input), + tokenizer_tokens=fine_tokens, + ) bundle = TokenizedText( text=normalized_input, diff --git a/search/searcher.py b/search/searcher.py index 37f4ffe..e1a1c72 100644 --- a/search/searcher.py +++ b/search/searcher.py @@ -446,6 +446,7 @@ class Searcher: rewritten_query=parsed_query.rewritten_query, detected_language=parsed_query.detected_language, translations=parsed_query.translations, + keywords_queries=parsed_query.keywords_queries, query_vector=parsed_query.query_vector.tolist() if parsed_query.query_vector is not None else None, ) context.metadata["feature_flags"]["style_intent_active"] = self._has_style_intent(parsed_query) @@ -454,6 +455,7 @@ class Searcher: f"查询解析完成 | 原查询: '{parsed_query.original_query}' | " f"重写后: '{parsed_query.rewritten_query}' | " f"语言: {parsed_query.detected_language} | " + f"关键词: {parsed_query.keywords_queries} | " f"文本向量: {'是' if parsed_query.query_vector is not None else '否'} | " f"图片向量: {'是' if getattr(parsed_query, 'image_query_vector', None) is not None else '否'}", extra={'reqid': context.reqid, 'uid': context.uid} @@ -1172,6 +1174,7 @@ class Searcher: "detected_language": context.query_analysis.detected_language, "index_languages": index_langs, "translations": context.query_analysis.translations, + "keywords_queries": context.query_analysis.keywords_queries, "has_vector": context.query_analysis.query_vector is not None, "has_image_vector": getattr(parsed_query, "image_query_vector", None) is not None, "query_tokens": getattr(parsed_query, "query_tokens", []), -- libgit2 0.21.2