Commit a77693fefb06e91a73e7493e836d5f4d6fd2e881

Authored by tangwang
1 parent 1f071951

调整配置目录结构

config/config_loader.py
@@ -104,10 +104,53 @@ class ConfigLoader: @@ -104,10 +104,53 @@ class ConfigLoader:
104 104
105 def __init__(self, config_dir: str = "config/schema"): 105 def __init__(self, config_dir: str = "config/schema"):
106 self.config_dir = Path(config_dir) 106 self.config_dir = Path(config_dir)
  107 +
  108 + def _load_rewrite_dictionary(self, customer_id: str) -> Dict[str, str]:
  109 + """
  110 + Load query rewrite dictionary from external file.
  111 +
  112 + Args:
  113 + customer_id: Customer identifier
  114 +
  115 + Returns:
  116 + Dictionary mapping query terms to rewritten queries
  117 + """
  118 + dict_file = self.config_dir / customer_id / "query_rewrite.dict"
  119 +
  120 + if not dict_file.exists():
  121 + # Dictionary file is optional, return empty dict if not found
  122 + return {}
  123 +
  124 + rewrite_dict = {}
  125 + try:
  126 + with open(dict_file, 'r', encoding='utf-8') as f:
  127 + for line_num, line in enumerate(f, 1):
  128 + line = line.strip()
  129 + # Skip empty lines and comments
  130 + if not line or line.startswith('#'):
  131 + continue
  132 +
  133 + # Parse tab-separated format
  134 + parts = line.split('\t')
  135 + if len(parts) != 2:
  136 + print(f"Warning: Invalid format in {dict_file} line {line_num}: {line}")
  137 + continue
  138 +
  139 + key, value = parts
  140 + rewrite_dict[key.strip()] = value.strip()
  141 + except Exception as e:
  142 + print(f"Error loading rewrite dictionary from {dict_file}: {e}")
  143 + return {}
  144 +
  145 + return rewrite_dict
107 146
108 def load_customer_config(self, customer_id: str) -> CustomerConfig: 147 def load_customer_config(self, customer_id: str) -> CustomerConfig:
109 """ 148 """
110 Load customer configuration from YAML file. 149 Load customer configuration from YAML file.
  150 +
  151 + Supports two directory structures:
  152 + 1. New structure: config/schema/{customer_id}/config.yaml
  153 + 2. Old structure: config/schema/{customer_id}_config.yaml (for backward compatibility)
111 154
112 Args: 155 Args:
113 customer_id: Customer identifier (used to find config file) 156 customer_id: Customer identifier (used to find config file)
@@ -118,8 +161,13 @@ class ConfigLoader: @@ -118,8 +161,13 @@ class ConfigLoader:
118 Raises: 161 Raises:
119 ConfigurationError: If config file not found or invalid 162 ConfigurationError: If config file not found or invalid
120 """ 163 """
121 - config_file = self.config_dir / f"{customer_id}_config.yaml"  
122 - 164 + # Try new directory structure first
  165 + config_file = self.config_dir / customer_id / "config.yaml"
  166 +
  167 + # Fall back to old structure if new one doesn't exist
  168 + if not config_file.exists():
  169 + config_file = self.config_dir / f"{customer_id}_config.yaml"
  170 +
123 if not config_file.exists(): 171 if not config_file.exists():
124 raise ConfigurationError(f"Configuration file not found: {config_file}") 172 raise ConfigurationError(f"Configuration file not found: {config_file}")
125 173
@@ -146,13 +194,17 @@ class ConfigLoader: @@ -146,13 +194,17 @@ class ConfigLoader:
146 194
147 # Parse query config 195 # Parse query config
148 query_config_data = config_data.get("query_config", {}) 196 query_config_data = config_data.get("query_config", {})
  197 +
  198 + # Load rewrite dictionary from external file instead of config
  199 + rewrite_dictionary = self._load_rewrite_dictionary(customer_id)
  200 +
149 query_config = QueryConfig( 201 query_config = QueryConfig(
150 supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), 202 supported_languages=query_config_data.get("supported_languages", ["zh", "en"]),
151 default_language=query_config_data.get("default_language", "zh"), 203 default_language=query_config_data.get("default_language", "zh"),
152 enable_translation=query_config_data.get("enable_translation", True), 204 enable_translation=query_config_data.get("enable_translation", True),
153 enable_text_embedding=query_config_data.get("enable_text_embedding", True), 205 enable_text_embedding=query_config_data.get("enable_text_embedding", True),
154 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), 206 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
155 - rewrite_dictionary=query_config_data.get("rewrite_dictionary", {}), 207 + rewrite_dictionary=rewrite_dictionary,
156 translation_api_key=query_config_data.get("translation_api_key"), 208 translation_api_key=query_config_data.get("translation_api_key"),
157 translation_service=query_config_data.get("translation_service", "deepl") 209 translation_service=query_config_data.get("translation_service", "deepl")
158 ) 210 )
@@ -342,13 +394,18 @@ class ConfigLoader: @@ -342,13 +394,18 @@ class ConfigLoader:
342 def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None: 394 def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None:
343 """ 395 """
344 Save customer configuration to YAML file. 396 Save customer configuration to YAML file.
  397 +
  398 + Note: rewrite_dictionary is saved separately to query_rewrite.dict file
345 399
346 Args: 400 Args:
347 config: Configuration to save 401 config: Configuration to save
348 - output_path: Optional output path (defaults to config dir) 402 + output_path: Optional output path (defaults to new directory structure)
349 """ 403 """
350 if output_path is None: 404 if output_path is None:
351 - output_path = self.config_dir / f"{config.customer_id}_config.yaml" 405 + # Use new directory structure by default
  406 + customer_dir = self.config_dir / config.customer_id
  407 + customer_dir.mkdir(parents=True, exist_ok=True)
  408 + output_path = customer_dir / "config.yaml"
352 409
353 # Convert config back to dictionary format 410 # Convert config back to dictionary format
354 config_dict = { 411 config_dict = {
@@ -366,7 +423,7 @@ class ConfigLoader: @@ -366,7 +423,7 @@ class ConfigLoader:
366 "enable_translation": config.query_config.enable_translation, 423 "enable_translation": config.query_config.enable_translation,
367 "enable_text_embedding": config.query_config.enable_text_embedding, 424 "enable_text_embedding": config.query_config.enable_text_embedding,
368 "enable_query_rewrite": config.query_config.enable_query_rewrite, 425 "enable_query_rewrite": config.query_config.enable_query_rewrite,
369 - "rewrite_dictionary": config.query_config.rewrite_dictionary, 426 + # rewrite_dictionary is stored in separate file, not in config
370 "translation_api_key": config.query_config.translation_api_key, 427 "translation_api_key": config.query_config.translation_api_key,
371 "translation_service": config.query_config.translation_service, 428 "translation_service": config.query_config.translation_service,
372 }, 429 },
@@ -383,6 +440,25 @@ class ConfigLoader: @@ -383,6 +440,25 @@ class ConfigLoader:
383 440
384 with open(output_path, 'w', encoding='utf-8') as f: 441 with open(output_path, 'w', encoding='utf-8') as f:
385 yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True) 442 yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True)
  443 +
  444 + # Save rewrite dictionary to separate file
  445 + self._save_rewrite_dictionary(config.customer_id, config.query_config.rewrite_dictionary)
  446 +
  447 + def _save_rewrite_dictionary(self, customer_id: str, rewrite_dict: Dict[str, str]) -> None:
  448 + """
  449 + Save rewrite dictionary to external file.
  450 +
  451 + Args:
  452 + customer_id: Customer identifier
  453 + rewrite_dict: Dictionary to save
  454 + """
  455 + customer_dir = self.config_dir / customer_id
  456 + customer_dir.mkdir(parents=True, exist_ok=True)
  457 + dict_file = customer_dir / "query_rewrite.dict"
  458 +
  459 + with open(dict_file, 'w', encoding='utf-8') as f:
  460 + for key, value in rewrite_dict.items():
  461 + f.write(f"{key}\t{value}\n")
386 462
387 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: 463 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]:
388 """Convert FieldConfig to dictionary.""" 464 """Convert FieldConfig to dictionary."""
config/schema/customer1_config.yaml renamed to config/schema/customer1/config.yaml
@@ -246,12 +246,6 @@ query_config: @@ -246,12 +246,6 @@ query_config:
246 enable_text_embedding: true 246 enable_text_embedding: true
247 enable_query_rewrite: true 247 enable_query_rewrite: true
248 248
249 - # Query Rewrite Dictionary  
250 - rewrite_dictionary:  
251 - "芭比": "brand:芭比 OR name:芭比娃娃"  
252 - "玩具": "category:玩具"  
253 - "消防": "category:消防 OR name:消防"  
254 -  
255 # Translation API (DeepL) 249 # Translation API (DeepL)
256 translation_service: "deepl" 250 translation_service: "deepl"
257 translation_api_key: null # Set via environment variable 251 translation_api_key: null # Set via environment variable
@@ -266,3 +260,4 @@ spu_config: @@ -266,3 +260,4 @@ spu_config:
266 enabled: false 260 enabled: false
267 spu_field: null 261 spu_field: null
268 inner_hits_size: 3 262 inner_hits_size: 3
  263 +
config/schema/customer1/query_rewrite.dict 0 → 100644
@@ -0,0 +1,4 @@ @@ -0,0 +1,4 @@
  1 +芭比 brand:芭比 OR name:芭比娃娃
  2 +玩具 category:玩具
  3 +消防 category:消防 OR name:消防
  4 +
@@ -6,6 +6,7 @@ RequestContext使用示例 @@ -6,6 +6,7 @@ RequestContext使用示例
6 6
7 import sys 7 import sys
8 import os 8 import os
  9 +import time
9 10
10 # 添加项目根目录到Python路径 11 # 添加项目根目录到Python路径
11 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 12 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -25,7 +26,6 @@ def example_basic_usage(): @@ -25,7 +26,6 @@ def example_basic_usage():
25 # 步骤1: 查询解析 26 # 步骤1: 查询解析
26 context.start_stage(RequestContextStage.QUERY_PARSING) 27 context.start_stage(RequestContextStage.QUERY_PARSING)
27 # 这里调用 query_parser.parse(query, context=context) 28 # 这里调用 query_parser.parse(query, context=context)
28 - import time  
29 time.sleep(0.05) # 模拟处理时间 29 time.sleep(0.05) # 模拟处理时间
30 context.end_stage(RequestContextStage.QUERY_PARSING) 30 context.end_stage(RequestContextStage.QUERY_PARSING)
31 31
@@ -97,7 +97,6 @@ def example_with_searcher(): @@ -97,7 +97,6 @@ def example_with_searcher():
97 97
98 # 模拟搜索流程 98 # 模拟搜索流程
99 context.start_stage(RequestContextStage.QUERY_PARSING) 99 context.start_stage(RequestContextStage.QUERY_PARSING)
100 - import time  
101 time.sleep(0.04) 100 time.sleep(0.04)
102 context.end_stage(RequestContextStage.QUERY_PARSING) 101 context.end_stage(RequestContextStage.QUERY_PARSING)
103 context.store_query_analysis( 102 context.store_query_analysis(
@@ -181,7 +180,6 @@ def example_performance_analysis(): @@ -181,7 +180,6 @@ def example_performance_analysis():
181 (RequestContextStage.RERANKING, 42.7) 180 (RequestContextStage.RERANKING, 42.7)
182 ] 181 ]
183 182
184 - import time  
185 for stage, duration_ms in stages_with_durations: 183 for stage, duration_ms in stages_with_durations:
186 context.start_stage(stage) 184 context.start_stage(stage)
187 time.sleep(duration_ms / 1000.0) # 转换为秒 185 time.sleep(duration_ms / 1000.0) # 转换为秒
indexer/data_transformer.py
@@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation. @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation.
6 6
7 import pandas as pd 7 import pandas as pd
8 import numpy as np 8 import numpy as np
  9 +import datetime
9 from typing import Dict, Any, List, Optional 10 from typing import Dict, Any, List, Optional
10 from config import CustomerConfig, FieldConfig, FieldType 11 from config import CustomerConfig, FieldConfig, FieldType
11 from embeddings import BgeEncoder, CLIPImageEncoder 12 from embeddings import BgeEncoder, CLIPImageEncoder
@@ -304,7 +305,6 @@ class DataTransformer: @@ -304,7 +305,6 @@ class DataTransformer:
304 elif isinstance(value, str): 305 elif isinstance(value, str):
305 # Try to parse string datetime and convert to ISO format 306 # Try to parse string datetime and convert to ISO format
306 try: 307 try:
307 - import datetime  
308 # Handle common datetime formats 308 # Handle common datetime formats
309 formats = [ 309 formats = [
310 '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09 310 '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09
@@ -12,19 +12,21 @@ import sys @@ -12,19 +12,21 @@ import sys
12 import os 12 import os
13 import argparse 13 import argparse
14 import json 14 import json
  15 +import pandas as pd
  16 +import uvicorn
15 17
16 # Add parent directory to path 18 # Add parent directory to path
17 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 19 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
18 20
  21 +from config import ConfigLoader
  22 +from utils import ESClient
  23 +from indexer import DataTransformer, IndexingPipeline
  24 +from embeddings import BgeEncoder, CLIPImageEncoder
  25 +from search import Searcher
  26 +
19 27
20 def cmd_ingest(args): 28 def cmd_ingest(args):
21 """Run data ingestion.""" 29 """Run data ingestion."""
22 - from config import ConfigLoader  
23 - from utils import ESClient  
24 - from indexer import DataTransformer, IndexingPipeline  
25 - from embeddings import BgeEncoder, CLIPImageEncoder  
26 - import pandas as pd  
27 -  
28 print(f"Starting ingestion for customer: {args.customer}") 30 print(f"Starting ingestion for customer: {args.customer}")
29 31
30 # Load config 32 # Load config
@@ -63,8 +65,6 @@ def cmd_ingest(args): @@ -63,8 +65,6 @@ def cmd_ingest(args):
63 65
64 def cmd_serve(args): 66 def cmd_serve(args):
65 """Start API service.""" 67 """Start API service."""
66 - import uvicorn  
67 -  
68 os.environ['CUSTOMER_ID'] = args.customer 68 os.environ['CUSTOMER_ID'] = args.customer
69 os.environ['ES_HOST'] = args.es_host 69 os.environ['ES_HOST'] = args.es_host
70 70
@@ -83,10 +83,6 @@ def cmd_serve(args): @@ -83,10 +83,6 @@ def cmd_serve(args):
83 83
84 def cmd_search(args): 84 def cmd_search(args):
85 """Test search from command line.""" 85 """Test search from command line."""
86 - from config import ConfigLoader  
87 - from utils import ESClient  
88 - from search import Searcher  
89 -  
90 # Load config 86 # Load config
91 config_loader = ConfigLoader("config/schema") 87 config_loader = ConfigLoader("config/schema")
92 config = config_loader.load_customer_config(args.customer) 88 config = config_loader.load_customer_config(args.customer)
scripts/run_tests.py
@@ -118,7 +118,6 @@ class TestRunner: @@ -118,7 +118,6 @@ class TestRunner:
118 118
119 # 检查Python环境 119 # 检查Python环境
120 try: 120 try:
121 - import sys  
122 python_version = sys.version 121 python_version = sys.version
123 self.logger.info(f"Python版本: {python_version}") 122 self.logger.info(f"Python版本: {python_version}")
124 checks.append(("Python", True, f"版本 {python_version}")) 123 checks.append(("Python", True, f"版本 {python_version}"))
test_aggregation_functionality.py
@@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen
5 5
6 import sys 6 import sys
7 import os 7 import os
  8 +import inspect
8 9
9 # Add the project root to the Python path 10 # Add the project root to the Python path
10 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 11 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -110,7 +111,6 @@ def test_searcher_integration(): @@ -110,7 +111,6 @@ def test_searcher_integration():
110 return False 111 return False
111 112
112 # We can't easily test the full searcher without ES, but we can check the method signature 113 # We can't easily test the full searcher without ES, but we can check the method signature
113 - import inspect  
114 search_method = getattr(Searcher, 'search', None) 114 search_method = getattr(Searcher, 'search', None)
115 115
116 if search_method: 116 if search_method: