Commit a77693fefb06e91a73e7493e836d5f4d6fd2e881
1 parent: 1f071951
调整配置目录结构
Showing 8 changed files with 98 additions and 30 deletions
Show diff stats
config/config_loader.py
| @@ -104,10 +104,53 @@ class ConfigLoader: | @@ -104,10 +104,53 @@ class ConfigLoader: | ||
| 104 | 104 | ||
| 105 | def __init__(self, config_dir: str = "config/schema"): | 105 | def __init__(self, config_dir: str = "config/schema"): |
| 106 | self.config_dir = Path(config_dir) | 106 | self.config_dir = Path(config_dir) |
| 107 | + | ||
| 108 | + def _load_rewrite_dictionary(self, customer_id: str) -> Dict[str, str]: | ||
| 109 | + """ | ||
| 110 | + Load query rewrite dictionary from external file. | ||
| 111 | + | ||
| 112 | + Args: | ||
| 113 | + customer_id: Customer identifier | ||
| 114 | + | ||
| 115 | + Returns: | ||
| 116 | + Dictionary mapping query terms to rewritten queries | ||
| 117 | + """ | ||
| 118 | + dict_file = self.config_dir / customer_id / "query_rewrite.dict" | ||
| 119 | + | ||
| 120 | + if not dict_file.exists(): | ||
| 121 | + # Dictionary file is optional, return empty dict if not found | ||
| 122 | + return {} | ||
| 123 | + | ||
| 124 | + rewrite_dict = {} | ||
| 125 | + try: | ||
| 126 | + with open(dict_file, 'r', encoding='utf-8') as f: | ||
| 127 | + for line_num, line in enumerate(f, 1): | ||
| 128 | + line = line.strip() | ||
| 129 | + # Skip empty lines and comments | ||
| 130 | + if not line or line.startswith('#'): | ||
| 131 | + continue | ||
| 132 | + | ||
| 133 | + # Parse tab-separated format | ||
| 134 | + parts = line.split('\t') | ||
| 135 | + if len(parts) != 2: | ||
| 136 | + print(f"Warning: Invalid format in {dict_file} line {line_num}: {line}") | ||
| 137 | + continue | ||
| 138 | + | ||
| 139 | + key, value = parts | ||
| 140 | + rewrite_dict[key.strip()] = value.strip() | ||
| 141 | + except Exception as e: | ||
| 142 | + print(f"Error loading rewrite dictionary from {dict_file}: {e}") | ||
| 143 | + return {} | ||
| 144 | + | ||
| 145 | + return rewrite_dict | ||
| 107 | 146 | ||
| 108 | def load_customer_config(self, customer_id: str) -> CustomerConfig: | 147 | def load_customer_config(self, customer_id: str) -> CustomerConfig: |
| 109 | """ | 148 | """ |
| 110 | Load customer configuration from YAML file. | 149 | Load customer configuration from YAML file. |
| 150 | + | ||
| 151 | + Supports two directory structures: | ||
| 152 | + 1. New structure: config/schema/{customer_id}/config.yaml | ||
| 153 | + 2. Old structure: config/schema/{customer_id}_config.yaml (for backward compatibility) | ||
| 111 | 154 | ||
| 112 | Args: | 155 | Args: |
| 113 | customer_id: Customer identifier (used to find config file) | 156 | customer_id: Customer identifier (used to find config file) |
| @@ -118,8 +161,13 @@ class ConfigLoader: | @@ -118,8 +161,13 @@ class ConfigLoader: | ||
| 118 | Raises: | 161 | Raises: |
| 119 | ConfigurationError: If config file not found or invalid | 162 | ConfigurationError: If config file not found or invalid |
| 120 | """ | 163 | """ |
| 121 | - config_file = self.config_dir / f"{customer_id}_config.yaml" | ||
| 122 | - | 164 | + # Try new directory structure first |
| 165 | + config_file = self.config_dir / customer_id / "config.yaml" | ||
| 166 | + | ||
| 167 | + # Fall back to old structure if new one doesn't exist | ||
| 168 | + if not config_file.exists(): | ||
| 169 | + config_file = self.config_dir / f"{customer_id}_config.yaml" | ||
| 170 | + | ||
| 123 | if not config_file.exists(): | 171 | if not config_file.exists(): |
| 124 | raise ConfigurationError(f"Configuration file not found: {config_file}") | 172 | raise ConfigurationError(f"Configuration file not found: {config_file}") |
| 125 | 173 | ||
| @@ -146,13 +194,17 @@ class ConfigLoader: | @@ -146,13 +194,17 @@ class ConfigLoader: | ||
| 146 | 194 | ||
| 147 | # Parse query config | 195 | # Parse query config |
| 148 | query_config_data = config_data.get("query_config", {}) | 196 | query_config_data = config_data.get("query_config", {}) |
| 197 | + | ||
| 198 | + # Load rewrite dictionary from external file instead of config | ||
| 199 | + rewrite_dictionary = self._load_rewrite_dictionary(customer_id) | ||
| 200 | + | ||
| 149 | query_config = QueryConfig( | 201 | query_config = QueryConfig( |
| 150 | supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), | 202 | supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), |
| 151 | default_language=query_config_data.get("default_language", "zh"), | 203 | default_language=query_config_data.get("default_language", "zh"), |
| 152 | enable_translation=query_config_data.get("enable_translation", True), | 204 | enable_translation=query_config_data.get("enable_translation", True), |
| 153 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), | 205 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), |
| 154 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), | 206 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), |
| 155 | - rewrite_dictionary=query_config_data.get("rewrite_dictionary", {}), | 207 | + rewrite_dictionary=rewrite_dictionary, |
| 156 | translation_api_key=query_config_data.get("translation_api_key"), | 208 | translation_api_key=query_config_data.get("translation_api_key"), |
| 157 | translation_service=query_config_data.get("translation_service", "deepl") | 209 | translation_service=query_config_data.get("translation_service", "deepl") |
| 158 | ) | 210 | ) |
| @@ -342,13 +394,18 @@ class ConfigLoader: | @@ -342,13 +394,18 @@ class ConfigLoader: | ||
| 342 | def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None: | 394 | def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None: |
| 343 | """ | 395 | """ |
| 344 | Save customer configuration to YAML file. | 396 | Save customer configuration to YAML file. |
| 397 | + | ||
| 398 | + Note: rewrite_dictionary is saved separately to query_rewrite.dict file | ||
| 345 | 399 | ||
| 346 | Args: | 400 | Args: |
| 347 | config: Configuration to save | 401 | config: Configuration to save |
| 348 | - output_path: Optional output path (defaults to config dir) | 402 | + output_path: Optional output path (defaults to new directory structure) |
| 349 | """ | 403 | """ |
| 350 | if output_path is None: | 404 | if output_path is None: |
| 351 | - output_path = self.config_dir / f"{config.customer_id}_config.yaml" | 405 | + # Use new directory structure by default |
| 406 | + customer_dir = self.config_dir / config.customer_id | ||
| 407 | + customer_dir.mkdir(parents=True, exist_ok=True) | ||
| 408 | + output_path = customer_dir / "config.yaml" | ||
| 352 | 409 | ||
| 353 | # Convert config back to dictionary format | 410 | # Convert config back to dictionary format |
| 354 | config_dict = { | 411 | config_dict = { |
| @@ -366,7 +423,7 @@ class ConfigLoader: | @@ -366,7 +423,7 @@ class ConfigLoader: | ||
| 366 | "enable_translation": config.query_config.enable_translation, | 423 | "enable_translation": config.query_config.enable_translation, |
| 367 | "enable_text_embedding": config.query_config.enable_text_embedding, | 424 | "enable_text_embedding": config.query_config.enable_text_embedding, |
| 368 | "enable_query_rewrite": config.query_config.enable_query_rewrite, | 425 | "enable_query_rewrite": config.query_config.enable_query_rewrite, |
| 369 | - "rewrite_dictionary": config.query_config.rewrite_dictionary, | 426 | + # rewrite_dictionary is stored in separate file, not in config |
| 370 | "translation_api_key": config.query_config.translation_api_key, | 427 | "translation_api_key": config.query_config.translation_api_key, |
| 371 | "translation_service": config.query_config.translation_service, | 428 | "translation_service": config.query_config.translation_service, |
| 372 | }, | 429 | }, |
| @@ -383,6 +440,25 @@ class ConfigLoader: | @@ -383,6 +440,25 @@ class ConfigLoader: | ||
| 383 | 440 | ||
| 384 | with open(output_path, 'w', encoding='utf-8') as f: | 441 | with open(output_path, 'w', encoding='utf-8') as f: |
| 385 | yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True) | 442 | yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True) |
| 443 | + | ||
| 444 | + # Save rewrite dictionary to separate file | ||
| 445 | + self._save_rewrite_dictionary(config.customer_id, config.query_config.rewrite_dictionary) | ||
| 446 | + | ||
| 447 | + def _save_rewrite_dictionary(self, customer_id: str, rewrite_dict: Dict[str, str]) -> None: | ||
| 448 | + """ | ||
| 449 | + Save rewrite dictionary to external file. | ||
| 450 | + | ||
| 451 | + Args: | ||
| 452 | + customer_id: Customer identifier | ||
| 453 | + rewrite_dict: Dictionary to save | ||
| 454 | + """ | ||
| 455 | + customer_dir = self.config_dir / customer_id | ||
| 456 | + customer_dir.mkdir(parents=True, exist_ok=True) | ||
| 457 | + dict_file = customer_dir / "query_rewrite.dict" | ||
| 458 | + | ||
| 459 | + with open(dict_file, 'w', encoding='utf-8') as f: | ||
| 460 | + for key, value in rewrite_dict.items(): | ||
| 461 | + f.write(f"{key}\t{value}\n") | ||
| 386 | 462 | ||
| 387 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: | 463 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: |
| 388 | """Convert FieldConfig to dictionary.""" | 464 | """Convert FieldConfig to dictionary.""" |
config/schema/customer1_config.yaml renamed to config/schema/customer1/config.yaml
| @@ -246,12 +246,6 @@ query_config: | @@ -246,12 +246,6 @@ query_config: | ||
| 246 | enable_text_embedding: true | 246 | enable_text_embedding: true |
| 247 | enable_query_rewrite: true | 247 | enable_query_rewrite: true |
| 248 | 248 | ||
| 249 | - # Query Rewrite Dictionary | ||
| 250 | - rewrite_dictionary: | ||
| 251 | - "芭比": "brand:芭比 OR name:芭比娃娃" | ||
| 252 | - "玩具": "category:玩具" | ||
| 253 | - "消防": "category:消防 OR name:消防" | ||
| 254 | - | ||
| 255 | # Translation API (DeepL) | 249 | # Translation API (DeepL) |
| 256 | translation_service: "deepl" | 250 | translation_service: "deepl" |
| 257 | translation_api_key: null # Set via environment variable | 251 | translation_api_key: null # Set via environment variable |
| @@ -266,3 +260,4 @@ spu_config: | @@ -266,3 +260,4 @@ spu_config: | ||
| 266 | enabled: false | 260 | enabled: false |
| 267 | spu_field: null | 261 | spu_field: null |
| 268 | inner_hits_size: 3 | 262 | inner_hits_size: 3 |
| 263 | + |
example_usage.py
| @@ -6,6 +6,7 @@ RequestContext使用示例 | @@ -6,6 +6,7 @@ RequestContext使用示例 | ||
| 6 | 6 | ||
| 7 | import sys | 7 | import sys |
| 8 | import os | 8 | import os |
| 9 | +import time | ||
| 9 | 10 | ||
| 10 | # 添加项目根目录到Python路径 | 11 | # 添加项目根目录到Python路径 |
| 11 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | 12 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| @@ -25,7 +26,6 @@ def example_basic_usage(): | @@ -25,7 +26,6 @@ def example_basic_usage(): | ||
| 25 | # 步骤1: 查询解析 | 26 | # 步骤1: 查询解析 |
| 26 | context.start_stage(RequestContextStage.QUERY_PARSING) | 27 | context.start_stage(RequestContextStage.QUERY_PARSING) |
| 27 | # 这里调用 query_parser.parse(query, context=context) | 28 | # 这里调用 query_parser.parse(query, context=context) |
| 28 | - import time | ||
| 29 | time.sleep(0.05) # 模拟处理时间 | 29 | time.sleep(0.05) # 模拟处理时间 |
| 30 | context.end_stage(RequestContextStage.QUERY_PARSING) | 30 | context.end_stage(RequestContextStage.QUERY_PARSING) |
| 31 | 31 | ||
| @@ -97,7 +97,6 @@ def example_with_searcher(): | @@ -97,7 +97,6 @@ def example_with_searcher(): | ||
| 97 | 97 | ||
| 98 | # 模拟搜索流程 | 98 | # 模拟搜索流程 |
| 99 | context.start_stage(RequestContextStage.QUERY_PARSING) | 99 | context.start_stage(RequestContextStage.QUERY_PARSING) |
| 100 | - import time | ||
| 101 | time.sleep(0.04) | 100 | time.sleep(0.04) |
| 102 | context.end_stage(RequestContextStage.QUERY_PARSING) | 101 | context.end_stage(RequestContextStage.QUERY_PARSING) |
| 103 | context.store_query_analysis( | 102 | context.store_query_analysis( |
| @@ -181,7 +180,6 @@ def example_performance_analysis(): | @@ -181,7 +180,6 @@ def example_performance_analysis(): | ||
| 181 | (RequestContextStage.RERANKING, 42.7) | 180 | (RequestContextStage.RERANKING, 42.7) |
| 182 | ] | 181 | ] |
| 183 | 182 | ||
| 184 | - import time | ||
| 185 | for stage, duration_ms in stages_with_durations: | 183 | for stage, duration_ms in stages_with_durations: |
| 186 | context.start_stage(stage) | 184 | context.start_stage(stage) |
| 187 | time.sleep(duration_ms / 1000.0) # 转换为秒 | 185 | time.sleep(duration_ms / 1000.0) # 转换为秒 |
indexer/data_transformer.py
| @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation. | @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation. | ||
| 6 | 6 | ||
| 7 | import pandas as pd | 7 | import pandas as pd |
| 8 | import numpy as np | 8 | import numpy as np |
| 9 | +import datetime | ||
| 9 | from typing import Dict, Any, List, Optional | 10 | from typing import Dict, Any, List, Optional |
| 10 | from config import CustomerConfig, FieldConfig, FieldType | 11 | from config import CustomerConfig, FieldConfig, FieldType |
| 11 | from embeddings import BgeEncoder, CLIPImageEncoder | 12 | from embeddings import BgeEncoder, CLIPImageEncoder |
| @@ -304,7 +305,6 @@ class DataTransformer: | @@ -304,7 +305,6 @@ class DataTransformer: | ||
| 304 | elif isinstance(value, str): | 305 | elif isinstance(value, str): |
| 305 | # Try to parse string datetime and convert to ISO format | 306 | # Try to parse string datetime and convert to ISO format |
| 306 | try: | 307 | try: |
| 307 | - import datetime | ||
| 308 | # Handle common datetime formats | 308 | # Handle common datetime formats |
| 309 | formats = [ | 309 | formats = [ |
| 310 | '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09 | 310 | '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09 |
main.py
| @@ -12,19 +12,21 @@ import sys | @@ -12,19 +12,21 @@ import sys | ||
| 12 | import os | 12 | import os |
| 13 | import argparse | 13 | import argparse |
| 14 | import json | 14 | import json |
| 15 | +import pandas as pd | ||
| 16 | +import uvicorn | ||
| 15 | 17 | ||
| 16 | # Add parent directory to path | 18 | # Add parent directory to path |
| 17 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | 19 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| 18 | 20 | ||
| 21 | +from config import ConfigLoader | ||
| 22 | +from utils import ESClient | ||
| 23 | +from indexer import DataTransformer, IndexingPipeline | ||
| 24 | +from embeddings import BgeEncoder, CLIPImageEncoder | ||
| 25 | +from search import Searcher | ||
| 26 | + | ||
| 19 | 27 | ||
| 20 | def cmd_ingest(args): | 28 | def cmd_ingest(args): |
| 21 | """Run data ingestion.""" | 29 | """Run data ingestion.""" |
| 22 | - from config import ConfigLoader | ||
| 23 | - from utils import ESClient | ||
| 24 | - from indexer import DataTransformer, IndexingPipeline | ||
| 25 | - from embeddings import BgeEncoder, CLIPImageEncoder | ||
| 26 | - import pandas as pd | ||
| 27 | - | ||
| 28 | print(f"Starting ingestion for customer: {args.customer}") | 30 | print(f"Starting ingestion for customer: {args.customer}") |
| 29 | 31 | ||
| 30 | # Load config | 32 | # Load config |
| @@ -63,8 +65,6 @@ def cmd_ingest(args): | @@ -63,8 +65,6 @@ def cmd_ingest(args): | ||
| 63 | 65 | ||
| 64 | def cmd_serve(args): | 66 | def cmd_serve(args): |
| 65 | """Start API service.""" | 67 | """Start API service.""" |
| 66 | - import uvicorn | ||
| 67 | - | ||
| 68 | os.environ['CUSTOMER_ID'] = args.customer | 68 | os.environ['CUSTOMER_ID'] = args.customer |
| 69 | os.environ['ES_HOST'] = args.es_host | 69 | os.environ['ES_HOST'] = args.es_host |
| 70 | 70 | ||
| @@ -83,10 +83,6 @@ def cmd_serve(args): | @@ -83,10 +83,6 @@ def cmd_serve(args): | ||
| 83 | 83 | ||
| 84 | def cmd_search(args): | 84 | def cmd_search(args): |
| 85 | """Test search from command line.""" | 85 | """Test search from command line.""" |
| 86 | - from config import ConfigLoader | ||
| 87 | - from utils import ESClient | ||
| 88 | - from search import Searcher | ||
| 89 | - | ||
| 90 | # Load config | 86 | # Load config |
| 91 | config_loader = ConfigLoader("config/schema") | 87 | config_loader = ConfigLoader("config/schema") |
| 92 | config = config_loader.load_customer_config(args.customer) | 88 | config = config_loader.load_customer_config(args.customer) |
scripts/run_tests.py
| @@ -118,7 +118,6 @@ class TestRunner: | @@ -118,7 +118,6 @@ class TestRunner: | ||
| 118 | 118 | ||
| 119 | # 检查Python环境 | 119 | # 检查Python环境 |
| 120 | try: | 120 | try: |
| 121 | - import sys | ||
| 122 | python_version = sys.version | 121 | python_version = sys.version |
| 123 | self.logger.info(f"Python版本: {python_version}") | 122 | self.logger.info(f"Python版本: {python_version}") |
| 124 | checks.append(("Python", True, f"版本 {python_version}")) | 123 | checks.append(("Python", True, f"版本 {python_version}")) |
test_aggregation_functionality.py
| @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen | @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen | ||
| 5 | 5 | ||
| 6 | import sys | 6 | import sys |
| 7 | import os | 7 | import os |
| 8 | +import inspect | ||
| 8 | 9 | ||
| 9 | # Add the project root to the Python path | 10 | # Add the project root to the Python path |
| 10 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | 11 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| @@ -110,7 +111,6 @@ def test_searcher_integration(): | @@ -110,7 +111,6 @@ def test_searcher_integration(): | ||
| 110 | return False | 111 | return False |
| 111 | 112 | ||
| 112 | # We can't easily test the full searcher without ES, but we can check the method signature | 113 | # We can't easily test the full searcher without ES, but we can check the method signature |
| 113 | - import inspect | ||
| 114 | search_method = getattr(Searcher, 'search', None) | 114 | search_method = getattr(Searcher, 'search', None) |
| 115 | 115 | ||
| 116 | if search_method: | 116 | if search_method: |