Commit a77693fefb06e91a73e7493e836d5f4d6fd2e881
1 parent
1f071951
调整配置目录结构
Showing 8 changed files with 98 additions and 30 deletions
Show diff stats
config/config_loader.py
| ... | ... | @@ -104,10 +104,53 @@ class ConfigLoader: |
| 104 | 104 | |
| 105 | 105 | def __init__(self, config_dir: str = "config/schema"): |
| 106 | 106 | self.config_dir = Path(config_dir) |
| 107 | + | |
| 108 | + def _load_rewrite_dictionary(self, customer_id: str) -> Dict[str, str]: | |
| 109 | + """ | |
| 110 | + Load query rewrite dictionary from external file. | |
| 111 | + | |
| 112 | + Args: | |
| 113 | + customer_id: Customer identifier | |
| 114 | + | |
| 115 | + Returns: | |
| 116 | + Dictionary mapping query terms to rewritten queries | |
| 117 | + """ | |
| 118 | + dict_file = self.config_dir / customer_id / "query_rewrite.dict" | |
| 119 | + | |
| 120 | + if not dict_file.exists(): | |
| 121 | + # Dictionary file is optional, return empty dict if not found | |
| 122 | + return {} | |
| 123 | + | |
| 124 | + rewrite_dict = {} | |
| 125 | + try: | |
| 126 | + with open(dict_file, 'r', encoding='utf-8') as f: | |
| 127 | + for line_num, line in enumerate(f, 1): | |
| 128 | + line = line.strip() | |
| 129 | + # Skip empty lines and comments | |
| 130 | + if not line or line.startswith('#'): | |
| 131 | + continue | |
| 132 | + | |
| 133 | + # Parse tab-separated format | |
| 134 | + parts = line.split('\t') | |
| 135 | + if len(parts) != 2: | |
| 136 | + print(f"Warning: Invalid format in {dict_file} line {line_num}: {line}") | |
| 137 | + continue | |
| 138 | + | |
| 139 | + key, value = parts | |
| 140 | + rewrite_dict[key.strip()] = value.strip() | |
| 141 | + except Exception as e: | |
| 142 | + print(f"Error loading rewrite dictionary from {dict_file}: {e}") | |
| 143 | + return {} | |
| 144 | + | |
| 145 | + return rewrite_dict | |
| 107 | 146 | |
| 108 | 147 | def load_customer_config(self, customer_id: str) -> CustomerConfig: |
| 109 | 148 | """ |
| 110 | 149 | Load customer configuration from YAML file. |
| 150 | + | |
| 151 | + Supports two directory structures: | |
| 152 | + 1. New structure: config/schema/{customer_id}/config.yaml | |
| 153 | + 2. Old structure: config/schema/{customer_id}_config.yaml (for backward compatibility) | |
| 111 | 154 | |
| 112 | 155 | Args: |
| 113 | 156 | customer_id: Customer identifier (used to find config file) |
| ... | ... | @@ -118,8 +161,13 @@ class ConfigLoader: |
| 118 | 161 | Raises: |
| 119 | 162 | ConfigurationError: If config file not found or invalid |
| 120 | 163 | """ |
| 121 | - config_file = self.config_dir / f"{customer_id}_config.yaml" | |
| 122 | - | |
| 164 | + # Try new directory structure first | |
| 165 | + config_file = self.config_dir / customer_id / "config.yaml" | |
| 166 | + | |
| 167 | + # Fall back to old structure if new one doesn't exist | |
| 168 | + if not config_file.exists(): | |
| 169 | + config_file = self.config_dir / f"{customer_id}_config.yaml" | |
| 170 | + | |
| 123 | 171 | if not config_file.exists(): |
| 124 | 172 | raise ConfigurationError(f"Configuration file not found: {config_file}") |
| 125 | 173 | |
| ... | ... | @@ -146,13 +194,17 @@ class ConfigLoader: |
| 146 | 194 | |
| 147 | 195 | # Parse query config |
| 148 | 196 | query_config_data = config_data.get("query_config", {}) |
| 197 | + | |
| 198 | + # Load rewrite dictionary from external file instead of config | |
| 199 | + rewrite_dictionary = self._load_rewrite_dictionary(customer_id) | |
| 200 | + | |
| 149 | 201 | query_config = QueryConfig( |
| 150 | 202 | supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), |
| 151 | 203 | default_language=query_config_data.get("default_language", "zh"), |
| 152 | 204 | enable_translation=query_config_data.get("enable_translation", True), |
| 153 | 205 | enable_text_embedding=query_config_data.get("enable_text_embedding", True), |
| 154 | 206 | enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), |
| 155 | - rewrite_dictionary=query_config_data.get("rewrite_dictionary", {}), | |
| 207 | + rewrite_dictionary=rewrite_dictionary, | |
| 156 | 208 | translation_api_key=query_config_data.get("translation_api_key"), |
| 157 | 209 | translation_service=query_config_data.get("translation_service", "deepl") |
| 158 | 210 | ) |
| ... | ... | @@ -342,13 +394,18 @@ class ConfigLoader: |
| 342 | 394 | def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None: |
| 343 | 395 | """ |
| 344 | 396 | Save customer configuration to YAML file. |
| 397 | + | |
| 398 | + Note: rewrite_dictionary is saved separately to query_rewrite.dict file | |
| 345 | 399 | |
| 346 | 400 | Args: |
| 347 | 401 | config: Configuration to save |
| 348 | - output_path: Optional output path (defaults to config dir) | |
| 402 | + output_path: Optional output path (defaults to new directory structure) | |
| 349 | 403 | """ |
| 350 | 404 | if output_path is None: |
| 351 | - output_path = self.config_dir / f"{config.customer_id}_config.yaml" | |
| 405 | + # Use new directory structure by default | |
| 406 | + customer_dir = self.config_dir / config.customer_id | |
| 407 | + customer_dir.mkdir(parents=True, exist_ok=True) | |
| 408 | + output_path = customer_dir / "config.yaml" | |
| 352 | 409 | |
| 353 | 410 | # Convert config back to dictionary format |
| 354 | 411 | config_dict = { |
| ... | ... | @@ -366,7 +423,7 @@ class ConfigLoader: |
| 366 | 423 | "enable_translation": config.query_config.enable_translation, |
| 367 | 424 | "enable_text_embedding": config.query_config.enable_text_embedding, |
| 368 | 425 | "enable_query_rewrite": config.query_config.enable_query_rewrite, |
| 369 | - "rewrite_dictionary": config.query_config.rewrite_dictionary, | |
| 426 | + # rewrite_dictionary is stored in separate file, not in config | |
| 370 | 427 | "translation_api_key": config.query_config.translation_api_key, |
| 371 | 428 | "translation_service": config.query_config.translation_service, |
| 372 | 429 | }, |
| ... | ... | @@ -383,6 +440,25 @@ class ConfigLoader: |
| 383 | 440 | |
| 384 | 441 | with open(output_path, 'w', encoding='utf-8') as f: |
| 385 | 442 | yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True) |
| 443 | + | |
| 444 | + # Save rewrite dictionary to separate file | |
| 445 | + self._save_rewrite_dictionary(config.customer_id, config.query_config.rewrite_dictionary) | |
| 446 | + | |
| 447 | + def _save_rewrite_dictionary(self, customer_id: str, rewrite_dict: Dict[str, str]) -> None: | |
| 448 | + """ | |
| 449 | + Save rewrite dictionary to external file. | |
| 450 | + | |
| 451 | + Args: | |
| 452 | + customer_id: Customer identifier | |
| 453 | + rewrite_dict: Dictionary to save | |
| 454 | + """ | |
| 455 | + customer_dir = self.config_dir / customer_id | |
| 456 | + customer_dir.mkdir(parents=True, exist_ok=True) | |
| 457 | + dict_file = customer_dir / "query_rewrite.dict" | |
| 458 | + | |
| 459 | + with open(dict_file, 'w', encoding='utf-8') as f: | |
| 460 | + for key, value in rewrite_dict.items(): | |
| 461 | + f.write(f"{key}\t{value}\n") | |
| 386 | 462 | |
| 387 | 463 | def _field_to_dict(self, field: FieldConfig) -> Dict[str, Any]: |
| 388 | 464 | """Convert FieldConfig to dictionary.""" | ... | ... |
config/schema/customer1_config.yaml renamed to config/schema/customer1/config.yaml
| ... | ... | @@ -246,12 +246,6 @@ query_config: |
| 246 | 246 | enable_text_embedding: true |
| 247 | 247 | enable_query_rewrite: true |
| 248 | 248 | |
| 249 | - # Query Rewrite Dictionary | |
| 250 | - rewrite_dictionary: | |
| 251 | - "芭比": "brand:芭比 OR name:芭比娃娃" | |
| 252 | - "玩具": "category:玩具" | |
| 253 | - "消防": "category:消防 OR name:消防" | |
| 254 | - | |
| 255 | 249 | # Translation API (DeepL) |
| 256 | 250 | translation_service: "deepl" |
| 257 | 251 | translation_api_key: null # Set via environment variable |
| ... | ... | @@ -266,3 +260,4 @@ spu_config: |
| 266 | 260 | enabled: false |
| 267 | 261 | spu_field: null |
| 268 | 262 | inner_hits_size: 3 |
| 263 | + | ... | ... |
example_usage.py
| ... | ... | @@ -6,6 +6,7 @@ RequestContext使用示例 |
| 6 | 6 | |
| 7 | 7 | import sys |
| 8 | 8 | import os |
| 9 | +import time | |
| 9 | 10 | |
| 10 | 11 | # 添加项目根目录到Python路径 |
| 11 | 12 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| ... | ... | @@ -25,7 +26,6 @@ def example_basic_usage(): |
| 25 | 26 | # 步骤1: 查询解析 |
| 26 | 27 | context.start_stage(RequestContextStage.QUERY_PARSING) |
| 27 | 28 | # 这里调用 query_parser.parse(query, context=context) |
| 28 | - import time | |
| 29 | 29 | time.sleep(0.05) # 模拟处理时间 |
| 30 | 30 | context.end_stage(RequestContextStage.QUERY_PARSING) |
| 31 | 31 | |
| ... | ... | @@ -97,7 +97,6 @@ def example_with_searcher(): |
| 97 | 97 | |
| 98 | 98 | # 模拟搜索流程 |
| 99 | 99 | context.start_stage(RequestContextStage.QUERY_PARSING) |
| 100 | - import time | |
| 101 | 100 | time.sleep(0.04) |
| 102 | 101 | context.end_stage(RequestContextStage.QUERY_PARSING) |
| 103 | 102 | context.store_query_analysis( |
| ... | ... | @@ -181,7 +180,6 @@ def example_performance_analysis(): |
| 181 | 180 | (RequestContextStage.RERANKING, 42.7) |
| 182 | 181 | ] |
| 183 | 182 | |
| 184 | - import time | |
| 185 | 183 | for stage, duration_ms in stages_with_durations: |
| 186 | 184 | context.start_stage(stage) |
| 187 | 185 | time.sleep(duration_ms / 1000.0) # 转换为秒 | ... | ... |
indexer/data_transformer.py
| ... | ... | @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation. |
| 6 | 6 | |
| 7 | 7 | import pandas as pd |
| 8 | 8 | import numpy as np |
| 9 | +import datetime | |
| 9 | 10 | from typing import Dict, Any, List, Optional |
| 10 | 11 | from config import CustomerConfig, FieldConfig, FieldType |
| 11 | 12 | from embeddings import BgeEncoder, CLIPImageEncoder |
| ... | ... | @@ -304,7 +305,6 @@ class DataTransformer: |
| 304 | 305 | elif isinstance(value, str): |
| 305 | 306 | # Try to parse string datetime and convert to ISO format |
| 306 | 307 | try: |
| 307 | - import datetime | |
| 308 | 308 | # Handle common datetime formats |
| 309 | 309 | formats = [ |
| 310 | 310 | '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09 | ... | ... |
main.py
| ... | ... | @@ -12,19 +12,21 @@ import sys |
| 12 | 12 | import os |
| 13 | 13 | import argparse |
| 14 | 14 | import json |
| 15 | +import pandas as pd | |
| 16 | +import uvicorn | |
| 15 | 17 | |
| 16 | 18 | # Add parent directory to path |
| 17 | 19 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| 18 | 20 | |
| 21 | +from config import ConfigLoader | |
| 22 | +from utils import ESClient | |
| 23 | +from indexer import DataTransformer, IndexingPipeline | |
| 24 | +from embeddings import BgeEncoder, CLIPImageEncoder | |
| 25 | +from search import Searcher | |
| 26 | + | |
| 19 | 27 | |
| 20 | 28 | def cmd_ingest(args): |
| 21 | 29 | """Run data ingestion.""" |
| 22 | - from config import ConfigLoader | |
| 23 | - from utils import ESClient | |
| 24 | - from indexer import DataTransformer, IndexingPipeline | |
| 25 | - from embeddings import BgeEncoder, CLIPImageEncoder | |
| 26 | - import pandas as pd | |
| 27 | - | |
| 28 | 30 | print(f"Starting ingestion for customer: {args.customer}") |
| 29 | 31 | |
| 30 | 32 | # Load config |
| ... | ... | @@ -63,8 +65,6 @@ def cmd_ingest(args): |
| 63 | 65 | |
| 64 | 66 | def cmd_serve(args): |
| 65 | 67 | """Start API service.""" |
| 66 | - import uvicorn | |
| 67 | - | |
| 68 | 68 | os.environ['CUSTOMER_ID'] = args.customer |
| 69 | 69 | os.environ['ES_HOST'] = args.es_host |
| 70 | 70 | |
| ... | ... | @@ -83,10 +83,6 @@ def cmd_serve(args): |
| 83 | 83 | |
| 84 | 84 | def cmd_search(args): |
| 85 | 85 | """Test search from command line.""" |
| 86 | - from config import ConfigLoader | |
| 87 | - from utils import ESClient | |
| 88 | - from search import Searcher | |
| 89 | - | |
| 90 | 86 | # Load config |
| 91 | 87 | config_loader = ConfigLoader("config/schema") |
| 92 | 88 | config = config_loader.load_customer_config(args.customer) | ... | ... |
scripts/run_tests.py
test_aggregation_functionality.py
| ... | ... | @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen |
| 5 | 5 | |
| 6 | 6 | import sys |
| 7 | 7 | import os |
| 8 | +import inspect | |
| 8 | 9 | |
| 9 | 10 | # Add the project root to the Python path |
| 10 | 11 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| ... | ... | @@ -110,7 +111,6 @@ def test_searcher_integration(): |
| 110 | 111 | return False |
| 111 | 112 | |
| 112 | 113 | # We can't easily test the full searcher without ES, but we can check the method signature |
| 113 | - import inspect | |
| 114 | 114 | search_method = getattr(Searcher, 'search', None) |
| 115 | 115 | |
| 116 | 116 | if search_method: | ... | ... |