Commit a77693fefb06e91a73e7493e836d5f4d6fd2e881

Authored by tangwang
1 parent 1f071951

调整配置目录结构

config/config_loader.py
... ... @@ -104,10 +104,53 @@ class ConfigLoader:
104 104  
105 105 def __init__(self, config_dir: str = "config/schema"):
106 106 self.config_dir = Path(config_dir)
  107 +
  108 + def _load_rewrite_dictionary(self, customer_id: str) -> Dict[str, str]:
  109 + """
  110 + Load query rewrite dictionary from external file.
  111 +
  112 + Args:
  113 + customer_id: Customer identifier
  114 +
  115 + Returns:
  116 + Dictionary mapping query terms to rewritten queries
  117 + """
  118 + dict_file = self.config_dir / customer_id / "query_rewrite.dict"
  119 +
  120 + if not dict_file.exists():
  121 + # Dictionary file is optional, return empty dict if not found
  122 + return {}
  123 +
  124 + rewrite_dict = {}
  125 + try:
  126 + with open(dict_file, 'r', encoding='utf-8') as f:
  127 + for line_num, line in enumerate(f, 1):
  128 + line = line.strip()
  129 + # Skip empty lines and comments
  130 + if not line or line.startswith('#'):
  131 + continue
  132 +
  133 + # Parse tab-separated format
  134 + parts = line.split('\t')
  135 + if len(parts) != 2:
  136 + print(f"Warning: Invalid format in {dict_file} line {line_num}: {line}")
  137 + continue
  138 +
  139 + key, value = parts
  140 + rewrite_dict[key.strip()] = value.strip()
  141 + except Exception as e:
  142 + print(f"Error loading rewrite dictionary from {dict_file}: {e}")
  143 + return {}
  144 +
  145 + return rewrite_dict
107 146  
108 147 def load_customer_config(self, customer_id: str) -> CustomerConfig:
109 148 """
110 149 Load customer configuration from YAML file.
  150 +
  151 + Supports two directory structures:
  152 + 1. New structure: config/schema/{customer_id}/config.yaml
  153 + 2. Old structure: config/schema/{customer_id}_config.yaml (for backward compatibility)
111 154  
112 155 Args:
113 156 customer_id: Customer identifier (used to find config file)
... ... @@ -118,8 +161,13 @@ class ConfigLoader:
118 161 Raises:
119 162 ConfigurationError: If config file not found or invalid
120 163 """
121   - config_file = self.config_dir / f"{customer_id}_config.yaml"
122   -
  164 + # Try new directory structure first
  165 + config_file = self.config_dir / customer_id / "config.yaml"
  166 +
  167 + # Fall back to old structure if new one doesn't exist
  168 + if not config_file.exists():
  169 + config_file = self.config_dir / f"{customer_id}_config.yaml"
  170 +
123 171 if not config_file.exists():
124 172 raise ConfigurationError(f"Configuration file not found: {config_file}")
125 173  
... ... @@ -146,13 +194,17 @@ class ConfigLoader:
146 194  
147 195 # Parse query config
148 196 query_config_data = config_data.get("query_config", {})
  197 +
  198 + # Load rewrite dictionary from external file instead of config
  199 + rewrite_dictionary = self._load_rewrite_dictionary(customer_id)
  200 +
149 201 query_config = QueryConfig(
150 202 supported_languages=query_config_data.get("supported_languages", ["zh", "en"]),
151 203 default_language=query_config_data.get("default_language", "zh"),
152 204 enable_translation=query_config_data.get("enable_translation", True),
153 205 enable_text_embedding=query_config_data.get("enable_text_embedding", True),
154 206 enable_query_rewrite=query_config_data.get("enable_query_rewrite", True),
155   - rewrite_dictionary=query_config_data.get("rewrite_dictionary", {}),
  207 + rewrite_dictionary=rewrite_dictionary,
156 208 translation_api_key=query_config_data.get("translation_api_key"),
157 209 translation_service=query_config_data.get("translation_service", "deepl")
158 210 )
... ... @@ -342,13 +394,18 @@ class ConfigLoader:
342 394 def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None:
343 395 """
344 396 Save customer configuration to YAML file.
  397 +
  398 + Note: rewrite_dictionary is saved separately to query_rewrite.dict file
345 399  
346 400 Args:
347 401 config: Configuration to save
348   - output_path: Optional output path (defaults to config dir)
  402 + output_path: Optional output path (defaults to new directory structure)
349 403 """
350 404 if output_path is None:
351   - output_path = self.config_dir / f"{config.customer_id}_config.yaml"
  405 + # Use new directory structure by default
  406 + customer_dir = self.config_dir / config.customer_id
  407 + customer_dir.mkdir(parents=True, exist_ok=True)
  408 + output_path = customer_dir / "config.yaml"
352 409  
353 410 # Convert config back to dictionary format
354 411 config_dict = {
... ... @@ -366,7 +423,7 @@ class ConfigLoader:
366 423 "enable_translation": config.query_config.enable_translation,
367 424 "enable_text_embedding": config.query_config.enable_text_embedding,
368 425 "enable_query_rewrite": config.query_config.enable_query_rewrite,
369   - "rewrite_dictionary": config.query_config.rewrite_dictionary,
  426 + # rewrite_dictionary is stored in separate file, not in config
370 427 "translation_api_key": config.query_config.translation_api_key,
371 428 "translation_service": config.query_config.translation_service,
372 429 },
... ... @@ -383,6 +440,25 @@ class ConfigLoader:
383 440  
384 441 with open(output_path, 'w', encoding='utf-8') as f:
385 442 yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True)
  443 +
  444 + # Save rewrite dictionary to separate file
  445 + self._save_rewrite_dictionary(config.customer_id, config.query_config.rewrite_dictionary)
  446 +
  447 + def _save_rewrite_dictionary(self, customer_id: str, rewrite_dict: Dict[str, str]) -> None:
  448 + """
  449 + Save rewrite dictionary to external file.
  450 +
  451 + Args:
  452 + customer_id: Customer identifier
  453 + rewrite_dict: Dictionary to save
  454 + """
  455 + customer_dir = self.config_dir / customer_id
  456 + customer_dir.mkdir(parents=True, exist_ok=True)
  457 + dict_file = customer_dir / "query_rewrite.dict"
  458 +
  459 + with open(dict_file, 'w', encoding='utf-8') as f:
  460 + for key, value in rewrite_dict.items():
  461 + f.write(f"{key}\t{value}\n")
386 462  
387 463 def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]:
388 464 """Convert FieldConfig to dictionary."""
... ...
config/schema/customer1_config.yaml renamed to config/schema/customer1/config.yaml
... ... @@ -246,12 +246,6 @@ query_config:
246 246 enable_text_embedding: true
247 247 enable_query_rewrite: true
248 248  
249   - # Query Rewrite Dictionary
250   - rewrite_dictionary:
251   - "芭比": "brand:芭比 OR name:芭比娃娃"
252   - "玩具": "category:玩具"
253   - "消防": "category:消防 OR name:消防"
254   -
255 249 # Translation API (DeepL)
256 250 translation_service: "deepl"
257 251 translation_api_key: null # Set via environment variable
... ... @@ -266,3 +260,4 @@ spu_config:
266 260 enabled: false
267 261 spu_field: null
268 262 inner_hits_size: 3
  263 +
... ...
config/schema/customer1/query_rewrite.dict 0 → 100644
... ... @@ -0,0 +1,4 @@
  1 +芭比 brand:芭比 OR name:芭比娃娃
  2 +玩具 category:玩具
  3 +消防 category:消防 OR name:消防
  4 +
... ...
example_usage.py
... ... @@ -6,6 +6,7 @@ RequestContext使用示例
6 6  
7 7 import sys
8 8 import os
  9 +import time
9 10  
10 11 # 添加项目根目录到Python路径
11 12 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
... ... @@ -25,7 +26,6 @@ def example_basic_usage():
25 26 # 步骤1: 查询解析
26 27 context.start_stage(RequestContextStage.QUERY_PARSING)
27 28 # 这里调用 query_parser.parse(query, context=context)
28   - import time
29 29 time.sleep(0.05) # 模拟处理时间
30 30 context.end_stage(RequestContextStage.QUERY_PARSING)
31 31  
... ... @@ -97,7 +97,6 @@ def example_with_searcher():
97 97  
98 98 # 模拟搜索流程
99 99 context.start_stage(RequestContextStage.QUERY_PARSING)
100   - import time
101 100 time.sleep(0.04)
102 101 context.end_stage(RequestContextStage.QUERY_PARSING)
103 102 context.store_query_analysis(
... ... @@ -181,7 +180,6 @@ def example_performance_analysis():
181 180 (RequestContextStage.RERANKING, 42.7)
182 181 ]
183 182  
184   - import time
185 183 for stage, duration_ms in stages_with_durations:
186 184 context.start_stage(stage)
187 185 time.sleep(duration_ms / 1000.0) # 转换为秒
... ...
indexer/data_transformer.py
... ... @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation.
6 6  
7 7 import pandas as pd
8 8 import numpy as np
  9 +import datetime
9 10 from typing import Dict, Any, List, Optional
10 11 from config import CustomerConfig, FieldConfig, FieldType
11 12 from embeddings import BgeEncoder, CLIPImageEncoder
... ... @@ -304,7 +305,6 @@ class DataTransformer:
304 305 elif isinstance(value, str):
305 306 # Try to parse string datetime and convert to ISO format
306 307 try:
307   - import datetime
308 308 # Handle common datetime formats
309 309 formats = [
310 310 '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09
... ...
... ... @@ -12,19 +12,21 @@ import sys
12 12 import os
13 13 import argparse
14 14 import json
  15 +import pandas as pd
  16 +import uvicorn
15 17  
16 18 # Add parent directory to path
17 19 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
18 20  
  21 +from config import ConfigLoader
  22 +from utils import ESClient
  23 +from indexer import DataTransformer, IndexingPipeline
  24 +from embeddings import BgeEncoder, CLIPImageEncoder
  25 +from search import Searcher
  26 +
19 27  
20 28 def cmd_ingest(args):
21 29 """Run data ingestion."""
22   - from config import ConfigLoader
23   - from utils import ESClient
24   - from indexer import DataTransformer, IndexingPipeline
25   - from embeddings import BgeEncoder, CLIPImageEncoder
26   - import pandas as pd
27   -
28 30 print(f"Starting ingestion for customer: {args.customer}")
29 31  
30 32 # Load config
... ... @@ -63,8 +65,6 @@ def cmd_ingest(args):
63 65  
64 66 def cmd_serve(args):
65 67 """Start API service."""
66   - import uvicorn
67   -
68 68 os.environ['CUSTOMER_ID'] = args.customer
69 69 os.environ['ES_HOST'] = args.es_host
70 70  
... ... @@ -83,10 +83,6 @@ def cmd_serve(args):
83 83  
84 84 def cmd_search(args):
85 85 """Test search from command line."""
86   - from config import ConfigLoader
87   - from utils import ESClient
88   - from search import Searcher
89   -
90 86 # Load config
91 87 config_loader = ConfigLoader("config/schema")
92 88 config = config_loader.load_customer_config(args.customer)
... ...
scripts/run_tests.py
... ... @@ -118,7 +118,6 @@ class TestRunner:
118 118  
119 119 # 检查Python环境
120 120 try:
121   - import sys
122 121 python_version = sys.version
123 122 self.logger.info(f"Python版本: {python_version}")
124 123 checks.append(("Python", True, f"版本 {python_version}"))
... ...
test_aggregation_functionality.py
... ... @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen
5 5  
6 6 import sys
7 7 import os
  8 +import inspect
8 9  
9 10 # Add the project root to the Python path
10 11 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
... ... @@ -110,7 +111,6 @@ def test_searcher_integration():
110 111 return False
111 112  
112 113 # We can't easily test the full searcher without ES, but we can check the method signature
113   - import inspect
114 114 search_method = getattr(Searcher, 'search', None)
115 115  
116 116 if search_method:
... ...