def _load_rewrite_dictionary(self, customer_id: str) -> Dict[str, str]:
    """
    Load query rewrite dictionary from external file.

    The file lives at ``<config_dir>/<customer_id>/query_rewrite.dict`` and
    holds one ``key<TAB>value`` entry per line. Blank lines and lines whose
    first non-blank character is ``#`` are ignored. Lines that do not split
    into exactly two tab-separated parts are reported and skipped.

    Args:
        customer_id: Customer identifier

    Returns:
        Dictionary mapping query terms to rewritten queries. Empty when the
        file is missing (the dictionary is optional) or cannot be read.
    """
    dict_file = self.config_dir / customer_id / "query_rewrite.dict"

    rewrite_dict: Dict[str, str] = {}
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which would otherwise become part of the first key.
        with open(dict_file, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                # Skip empty lines and comments
                if not line or line.startswith('#'):
                    continue

                # Parse tab-separated format
                parts = line.split('\t')
                if len(parts) != 2:
                    print(f"Warning: Invalid format in {dict_file} line {line_num}: {line}")
                    continue

                key, value = parts
                rewrite_dict[key.strip()] = value.strip()
    except FileNotFoundError:
        # Dictionary file is optional, return empty dict if not found.
        # Handling the exception (rather than checking exists() first) avoids
        # a race when the file disappears between the check and the open.
        return {}
    except (OSError, UnicodeError) as e:
        # Best effort: an unreadable or mis-encoded dictionary must not stop
        # the rest of the customer configuration from loading.
        print(f"Error loading rewrite dictionary from {dict_file}: {e}")
        return {}

    return rewrite_dict
Old structure: config/schema/{customer_id}_config.yaml (for backward compatibility) Args: customer_id: Customer identifier (used to find config file) @@ -118,8 +161,13 @@ class ConfigLoader: Raises: ConfigurationError: If config file not found or invalid """ - config_file = self.config_dir / f"{customer_id}_config.yaml" - + # Try new directory structure first + config_file = self.config_dir / customer_id / "config.yaml" + + # Fall back to old structure if new one doesn't exist + if not config_file.exists(): + config_file = self.config_dir / f"{customer_id}_config.yaml" + if not config_file.exists(): raise ConfigurationError(f"Configuration file not found: {config_file}") @@ -146,13 +194,17 @@ class ConfigLoader: # Parse query config query_config_data = config_data.get("query_config", {}) + + # Load rewrite dictionary from external file instead of config + rewrite_dictionary = self._load_rewrite_dictionary(customer_id) + query_config = QueryConfig( supported_languages=query_config_data.get("supported_languages", ["zh", "en"]), default_language=query_config_data.get("default_language", "zh"), enable_translation=query_config_data.get("enable_translation", True), enable_text_embedding=query_config_data.get("enable_text_embedding", True), enable_query_rewrite=query_config_data.get("enable_query_rewrite", True), - rewrite_dictionary=query_config_data.get("rewrite_dictionary", {}), + rewrite_dictionary=rewrite_dictionary, translation_api_key=query_config_data.get("translation_api_key"), translation_service=query_config_data.get("translation_service", "deepl") ) @@ -342,13 +394,18 @@ class ConfigLoader: def save_config(self, config: CustomerConfig, output_path: Optional[str] = None) -> None: """ Save customer configuration to YAML file. 
def _save_rewrite_dictionary(self, customer_id: str, rewrite_dict: Dict[str, str]) -> None:
    """
    Save rewrite dictionary to external file.

    Writes ``<config_dir>/<customer_id>/query_rewrite.dict`` with one
    ``key<TAB>value`` entry per line, creating the customer directory if
    needed. Keys and values are written with surrounding whitespace removed,
    matching what the loader produces when it reads the file back.

    Entries that the line-oriented, tab-separated format cannot represent are
    skipped with a warning instead of being written as lines the loader would
    reject or misread: an empty key or value, a key starting with ``#`` (read
    back as a comment), or a tab or line break inside either field.

    Args:
        customer_id: Customer identifier
        rewrite_dict: Dictionary to save
    """
    customer_dir = self.config_dir / customer_id
    customer_dir.mkdir(parents=True, exist_ok=True)
    dict_file = customer_dir / "query_rewrite.dict"

    lines = []
    for raw_key, raw_value in rewrite_dict.items():
        key = str(raw_key).strip()
        value = str(raw_value).strip()

        representable = (
            key
            and value
            and not key.startswith('#')
            and not any(ch in field for field in (key, value) for ch in '\t\r\n')
        )
        if not representable:
            print(f"Warning: Skipping rewrite entry that cannot be stored in {dict_file}: {raw_key!r}")
            continue

        lines.append(f"{key}\t{value}\n")

    # Open the file only after validation so it is written in a single pass.
    with open(dict_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)
"enSpuName" + analyzer: "english" + boost: 2.0 + index: true + store: true + + # Category and Brand + - name: "categoryName" + type: "TEXT" + source_table: "extension" + source_column: "categoryName" + analyzer: "chinese_ecommerce" + boost: 1.5 + index: true + store: true + + - name: "brandName" + type: "TEXT" + source_table: "extension" + source_column: "brandName" + analyzer: "chinese_ecommerce" + boost: 1.5 + index: true + store: true + + - name: "supplierName" + type: "TEXT" + source_table: "extension" + source_column: "supplierName" + analyzer: "chinese_ecommerce" + boost: 1.0 + index: true + store: true + + # Keyword Fields + - name: "categoryName_keyword" + type: "KEYWORD" + source_table: "extension" + source_column: "categoryName" + index: true + store: false + + - name: "brandName_keyword" + type: "KEYWORD" + source_table: "extension" + source_column: "brandName" + index: true + store: false + + - name: "supplierName_keyword" + type: "KEYWORD" + source_table: "extension" + source_column: "supplierName" + index: true + store: false + + # Price Fields + - name: "price" + type: "DOUBLE" + source_table: "extension" + source_column: "price" + index: true + store: true + + # Text Embedding Fields + - name: "name_embedding" + type: "TEXT_EMBEDDING" + source_table: "extension" + source_column: "name" + embedding_dims: 1024 + embedding_similarity: "dot_product" + index: true + + - name: "enSpuName_embedding" + type: "TEXT_EMBEDDING" + source_table: "extension" + source_column: "enSpuName" + embedding_dims: 1024 + embedding_similarity: "dot_product" + index: true + + # Image Fields + - name: "imageUrl" + type: "KEYWORD" + source_table: "extension" + source_column: "imageUrl" + index: true + store: true + + - name: "image_embedding" + type: "IMAGE_EMBEDDING" + source_table: "extension" + source_column: "imageUrl" + embedding_dims: 1024 + embedding_similarity: "dot_product" + nested: false + index: true + + # Metadata Fields + - name: "create_time" + type: "DATE" + 
source_table: "extension" + source_column: "create_time" + index: true + store: true + + - name: "days_since_last_update" + type: "INT" + source_table: "extension" + source_column: "days_since_last_update" + index: true + store: true + +# Index Structure (Query Domains) +indexes: + - name: "default" + label: "默认索引" + fields: + - "name" + - "enSpuName" + - "ruSkuName" + - "categoryName" + - "brandName" + analyzer: "chinese_ecommerce" + boost: 1.0 + example: 'query=default:"消防套"' + language_field_mapping: + zh: + - "name" + - "categoryName" + - "brandName" + en: + - "enSpuName" + ru: + - "ruSkuName" + + - name: "title" + label: "标题索引" + fields: + - "name" + - "enSpuName" + - "ruSkuName" + analyzer: "chinese_ecommerce" + boost: 2.0 + example: 'query=title:"芭比娃娃"' + language_field_mapping: + zh: + - "name" + en: + - "enSpuName" + ru: + - "ruSkuName" + + - name: "category" + label: "类目索引" + fields: + - "categoryName" + analyzer: "chinese_ecommerce" + boost: 1.5 + example: 'query=category:"玩具"' + + - name: "brand" + label: "品牌索引" + fields: + - "brandName" + analyzer: "chinese_ecommerce" + boost: 1.5 + example: 'query=brand:"ZHU LIN"' + +# Query Configuration +query_config: + supported_languages: + - "zh" + - "en" + - "ru" + default_language: "zh" + enable_translation: true + enable_text_embedding: true + enable_query_rewrite: true + + # Translation API (DeepL) + translation_service: "deepl" + translation_api_key: null # Set via environment variable + +# Ranking Configuration +ranking: + expression: "bm25() + 0.2*text_embedding_relevance()" + description: "BM25 text relevance combined with semantic embedding similarity" + +# SPU Aggregation (disabled for customer1) +spu_config: + enabled: false + spu_field: null + inner_hits_size: 3 + diff --git a/config/schema/customer1/query_rewrite.dict b/config/schema/customer1/query_rewrite.dict new file mode 100644 index 0000000..8e5ce37 --- /dev/null +++ b/config/schema/customer1/query_rewrite.dict @@ -0,0 +1,4 @@ +芭比 brand:芭比 OR 
name:芭比娃娃 +玩具 category:玩具 +消防 category:消防 OR name:消防 + diff --git a/config/schema/customer1_config.yaml b/config/schema/customer1_config.yaml deleted file mode 100644 index 27cd35c..0000000 --- a/config/schema/customer1_config.yaml +++ /dev/null @@ -1,268 +0,0 @@ -# Customer1 Configuration -# Test customer for cross-border e-commerce search - -customer_name: "Customer1 Test Instance" - -# MySQL Database Configuration -mysql_config: - host: "120.79.247.228" - port: 3316 - database: "saas" - username: "saas" - password: "P89cZHS5d7dFyc9R" - -# Table Configuration -main_table: "shoplazza_product_sku" -extension_table: "customer1_extension" - -# Elasticsearch Index -es_index_name: "search_customer1" - -# ES Index Settings -es_settings: - number_of_shards: 1 - number_of_replicas: 0 - refresh_interval: "30s" - -# Field Definitions -fields: - # Primary Key - - name: "skuId" - type: "LONG" - source_table: "main" - source_column: "id" - required: true - index: true - store: true - - # Text Fields - Multi-language - - name: "name" - type: "TEXT" - source_table: "extension" - source_column: "name" - analyzer: "chinese_ecommerce" - boost: 2.0 - index: true - store: true - - - name: "name_pinyin" - type: "TEXT" - source_table: "extension" - source_column: "name_pinyin" - analyzer: "standard" - boost: 1.5 - index: true - store: false - - - name: "ruSkuName" - type: "TEXT" - source_table: "extension" - source_column: "ruSkuName" - analyzer: "russian" - boost: 2.0 - index: true - store: true - - - name: "enSpuName" - type: "TEXT" - source_table: "extension" - source_column: "enSpuName" - analyzer: "english" - boost: 2.0 - index: true - store: true - - # Category and Brand - - name: "categoryName" - type: "TEXT" - source_table: "extension" - source_column: "categoryName" - analyzer: "chinese_ecommerce" - boost: 1.5 - index: true - store: true - - - name: "brandName" - type: "TEXT" - source_table: "extension" - source_column: "brandName" - analyzer: "chinese_ecommerce" - boost: 1.5 
- index: true - store: true - - - name: "supplierName" - type: "TEXT" - source_table: "extension" - source_column: "supplierName" - analyzer: "chinese_ecommerce" - boost: 1.0 - index: true - store: true - - # Keyword Fields - - name: "categoryName_keyword" - type: "KEYWORD" - source_table: "extension" - source_column: "categoryName" - index: true - store: false - - - name: "brandName_keyword" - type: "KEYWORD" - source_table: "extension" - source_column: "brandName" - index: true - store: false - - - name: "supplierName_keyword" - type: "KEYWORD" - source_table: "extension" - source_column: "supplierName" - index: true - store: false - - # Price Fields - - name: "price" - type: "DOUBLE" - source_table: "extension" - source_column: "price" - index: true - store: true - - # Text Embedding Fields - - name: "name_embedding" - type: "TEXT_EMBEDDING" - source_table: "extension" - source_column: "name" - embedding_dims: 1024 - embedding_similarity: "dot_product" - index: true - - - name: "enSpuName_embedding" - type: "TEXT_EMBEDDING" - source_table: "extension" - source_column: "enSpuName" - embedding_dims: 1024 - embedding_similarity: "dot_product" - index: true - - # Image Fields - - name: "imageUrl" - type: "KEYWORD" - source_table: "extension" - source_column: "imageUrl" - index: true - store: true - - - name: "image_embedding" - type: "IMAGE_EMBEDDING" - source_table: "extension" - source_column: "imageUrl" - embedding_dims: 1024 - embedding_similarity: "dot_product" - nested: false - index: true - - # Metadata Fields - - name: "create_time" - type: "DATE" - source_table: "extension" - source_column: "create_time" - index: true - store: true - - - name: "days_since_last_update" - type: "INT" - source_table: "extension" - source_column: "days_since_last_update" - index: true - store: true - -# Index Structure (Query Domains) -indexes: - - name: "default" - label: "默认索引" - fields: - - "name" - - "enSpuName" - - "ruSkuName" - - "categoryName" - - "brandName" - analyzer: 
"chinese_ecommerce" - boost: 1.0 - example: 'query=default:"消防套"' - language_field_mapping: - zh: - - "name" - - "categoryName" - - "brandName" - en: - - "enSpuName" - ru: - - "ruSkuName" - - - name: "title" - label: "标题索引" - fields: - - "name" - - "enSpuName" - - "ruSkuName" - analyzer: "chinese_ecommerce" - boost: 2.0 - example: 'query=title:"芭比娃娃"' - language_field_mapping: - zh: - - "name" - en: - - "enSpuName" - ru: - - "ruSkuName" - - - name: "category" - label: "类目索引" - fields: - - "categoryName" - analyzer: "chinese_ecommerce" - boost: 1.5 - example: 'query=category:"玩具"' - - - name: "brand" - label: "品牌索引" - fields: - - "brandName" - analyzer: "chinese_ecommerce" - boost: 1.5 - example: 'query=brand:"ZHU LIN"' - -# Query Configuration -query_config: - supported_languages: - - "zh" - - "en" - - "ru" - default_language: "zh" - enable_translation: true - enable_text_embedding: true - enable_query_rewrite: true - - # Query Rewrite Dictionary - rewrite_dictionary: - "芭比": "brand:芭比 OR name:芭比娃娃" - "玩具": "category:玩具" - "消防": "category:消防 OR name:消防" - - # Translation API (DeepL) - translation_service: "deepl" - translation_api_key: null # Set via environment variable - -# Ranking Configuration -ranking: - expression: "bm25() + 0.2*text_embedding_relevance()" - description: "BM25 text relevance combined with semantic embedding similarity" - -# SPU Aggregation (disabled for customer1) -spu_config: - enabled: false - spu_field: null - inner_hits_size: 3 diff --git a/example_usage.py b/example_usage.py index be009f1..1e48da2 100644 --- a/example_usage.py +++ b/example_usage.py @@ -6,6 +6,7 @@ RequestContext使用示例 import sys import os +import time # 添加项目根目录到Python路径 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -25,7 +26,6 @@ def example_basic_usage(): # 步骤1: 查询解析 context.start_stage(RequestContextStage.QUERY_PARSING) # 这里调用 query_parser.parse(query, context=context) - import time time.sleep(0.05) # 模拟处理时间 
context.end_stage(RequestContextStage.QUERY_PARSING) @@ -97,7 +97,6 @@ def example_with_searcher(): # 模拟搜索流程 context.start_stage(RequestContextStage.QUERY_PARSING) - import time time.sleep(0.04) context.end_stage(RequestContextStage.QUERY_PARSING) context.store_query_analysis( @@ -181,7 +180,6 @@ def example_performance_analysis(): (RequestContextStage.RERANKING, 42.7) ] - import time for stage, duration_ms in stages_with_durations: context.start_stage(stage) time.sleep(duration_ms / 1000.0) # 转换为秒 diff --git a/indexer/data_transformer.py b/indexer/data_transformer.py index 9ac6f16..bfb6832 100644 --- a/indexer/data_transformer.py +++ b/indexer/data_transformer.py @@ -6,6 +6,7 @@ Handles field mapping, type conversion, and embedding generation. import pandas as pd import numpy as np +import datetime from typing import Dict, Any, List, Optional from config import CustomerConfig, FieldConfig, FieldType from embeddings import BgeEncoder, CLIPImageEncoder @@ -304,7 +305,6 @@ class DataTransformer: elif isinstance(value, str): # Try to parse string datetime and convert to ISO format try: - import datetime # Handle common datetime formats formats = [ '%Y-%m-%d %H:%M:%S', # 2020-07-07 16:44:09 diff --git a/main.py b/main.py index 56bf88f..817ee70 100755 --- a/main.py +++ b/main.py @@ -12,19 +12,21 @@ import sys import os import argparse import json +import pandas as pd +import uvicorn # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from config import ConfigLoader +from utils import ESClient +from indexer import DataTransformer, IndexingPipeline +from embeddings import BgeEncoder, CLIPImageEncoder +from search import Searcher + def cmd_ingest(args): """Run data ingestion.""" - from config import ConfigLoader - from utils import ESClient - from indexer import DataTransformer, IndexingPipeline - from embeddings import BgeEncoder, CLIPImageEncoder - import pandas as pd - print(f"Starting ingestion for customer: {args.customer}") 
# Load config @@ -63,8 +65,6 @@ def cmd_ingest(args): def cmd_serve(args): """Start API service.""" - import uvicorn - os.environ['CUSTOMER_ID'] = args.customer os.environ['ES_HOST'] = args.es_host @@ -83,10 +83,6 @@ def cmd_serve(args): def cmd_search(args): """Test search from command line.""" - from config import ConfigLoader - from utils import ESClient - from search import Searcher - # Load config config_loader = ConfigLoader("config/schema") config = config_loader.load_customer_config(args.customer) diff --git a/scripts/run_tests.py b/scripts/run_tests.py index e0988c4..67d900f 100755 --- a/scripts/run_tests.py +++ b/scripts/run_tests.py @@ -118,7 +118,6 @@ class TestRunner: # 检查Python环境 try: - import sys python_version = sys.version self.logger.info(f"Python版本: {python_version}") checks.append(("Python", True, f"版本 {python_version}")) diff --git a/test_aggregation_functionality.py b/test_aggregation_functionality.py index 0b12bbc..5adb5fb 100644 --- a/test_aggregation_functionality.py +++ b/test_aggregation_functionality.py @@ -5,6 +5,7 @@ Simple test script to verify aggregation functionality without external dependen import sys import os +import inspect # Add the project root to the Python path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -110,7 +111,6 @@ def test_searcher_integration(): return False # We can't easily test the full searcher without ES, but we can check the method signature - import inspect search_method = getattr(Searcher, 'search', None) if search_method: -- libgit2 0.21.2