Commit e4a39cc844139bd7cd7adfa16c74f9cc09d6c672

Authored by tangwang
1 parent 775db2b0

索引隔离。 不同的tenant_id用不同的索引

api/routes/search.py
@@ -326,18 +326,36 @@ async def instant_search(
326 326
327 327
328 @router.get("/{doc_id}", response_model=DocumentResponse) 328 @router.get("/{doc_id}", response_model=DocumentResponse)
329 -async def get_document(doc_id: str): 329 +async def get_document(doc_id: str, http_request: Request):
330 """ 330 """
331 Get a single document by ID. 331 Get a single document by ID.
  332 +
  333 + Requires tenant_id in header (X-Tenant-ID) or query parameter (tenant_id).
332 """ 334 """
333 try: 335 try:
  336 + # Extract tenant_id (required)
  337 + tenant_id = http_request.headers.get('X-Tenant-ID')
  338 + if not tenant_id:
  339 + # Try to get from query string
  340 + from urllib.parse import parse_qs
  341 + query_string = http_request.url.query
  342 + if query_string:
  343 + params = parse_qs(query_string)
  344 + tenant_id = params.get('tenant_id', [None])[0]
  345 +
  346 + if not tenant_id:
  347 + raise HTTPException(
  348 + status_code=400,
  349 + detail="tenant_id is required. Provide it via header 'X-Tenant-ID' or query parameter 'tenant_id'"
  350 + )
  351 +
334 from api.app import get_searcher 352 from api.app import get_searcher
335 searcher = get_searcher() 353 searcher = get_searcher()
336 354
337 - doc = searcher.get_document(doc_id) 355 + doc = searcher.get_document(tenant_id=tenant_id, doc_id=doc_id)
338 356
339 if doc is None: 357 if doc is None:
340 - raise HTTPException(status_code=404, detail=f"Document {doc_id} not found") 358 + raise HTTPException(status_code=404, detail=f"Document {doc_id} not found for tenant {tenant_id}")
341 359
342 return DocumentResponse(id=doc_id, source=doc) 360 return DocumentResponse(id=doc_id, source=doc)
343 361
docs/亚马逊到店匠格式转换分析.md
@@ -368,3 +368,4 @@ python scripts/amazon_xlsx_to_shoplazza_xlsx.py \
368 368
369 369
370 370
  371 +
indexer/bulk_indexing_service.py
@@ -10,7 +10,7 @@ from sqlalchemy import Engine
10 from utils.es_client import ESClient 10 from utils.es_client import ESClient
11 from indexer.spu_transformer import SPUTransformer 11 from indexer.spu_transformer import SPUTransformer
12 from indexer.bulk_indexer import BulkIndexer 12 from indexer.bulk_indexer import BulkIndexer
13 -from indexer.mapping_generator import load_mapping, delete_index_if_exists, DEFAULT_INDEX_NAME 13 +from indexer.mapping_generator import load_mapping, delete_index_if_exists, get_tenant_index_name
14 from indexer.indexer_logger import ( 14 from indexer.indexer_logger import (
15 get_indexer_logger, log_index_request, log_index_result, log_bulk_index_batch 15 get_indexer_logger, log_index_request, log_index_result, log_bulk_index_batch
16 ) 16 )
@@ -33,13 +33,16 @@ class BulkIndexingService:
33 """ 33 """
34 self.db_engine = db_engine 34 self.db_engine = db_engine
35 self.es_client = es_client 35 self.es_client = es_client
36 - self.index_name = DEFAULT_INDEX_NAME 36 + # Index name is now generated dynamically per tenant, no longer stored here
37 37
38 def bulk_index(self, tenant_id: str, recreate_index: bool = False, batch_size: int = 500) -> Dict[str, Any]: 38 def bulk_index(self, tenant_id: str, recreate_index: bool = False, batch_size: int = 500) -> Dict[str, Any]:
39 """执行全量索引""" 39 """执行全量索引"""
40 import time 40 import time
41 start_time = time.time() 41 start_time = time.time()
42 42
  43 + # Generate tenant-specific index name
  44 + index_name = get_tenant_index_name(tenant_id)
  45 +
43 # 记录请求开始 46 # 记录请求开始
44 log_index_request( 47 log_index_request(
45 indexer_logger, 48 indexer_logger,
@@ -48,7 +51,7 @@ class BulkIndexingService:
48 request_params={ 51 request_params={
49 'recreate_index': recreate_index, 52 'recreate_index': recreate_index,
50 'batch_size': batch_size, 53 'batch_size': batch_size,
51 - 'index_name': self.index_name 54 + 'index_name': index_name
52 } 55 }
53 ) 56 )
54 57
@@ -61,45 +64,45 @@ class BulkIndexingService:
61 'index_type': 'bulk', 64 'index_type': 'bulk',
62 'tenant_id': tenant_id, 65 'tenant_id': tenant_id,
63 'operation': 'load_mapping', 66 'operation': 'load_mapping',
64 - 'index_name': self.index_name 67 + 'index_name': index_name
65 } 68 }
66 ) 69 )
67 mapping = load_mapping() 70 mapping = load_mapping()
68 71
69 # 2. 处理索引(删除并重建或创建) 72 # 2. 处理索引(删除并重建或创建)
70 if recreate_index: 73 if recreate_index:
71 - logger.info(f"[BulkIndexing] Recreating index: {self.index_name}") 74 + logger.info(f"[BulkIndexing] Recreating index: {index_name}")
72 indexer_logger.info( 75 indexer_logger.info(
73 - f"Recreating index: {self.index_name}", 76 + f"Recreating index: {index_name}",
74 extra={ 77 extra={
75 'index_type': 'bulk', 78 'index_type': 'bulk',
76 'tenant_id': tenant_id, 79 'tenant_id': tenant_id,
77 'operation': 'recreate_index', 80 'operation': 'recreate_index',
78 - 'index_name': self.index_name 81 + 'index_name': index_name
79 } 82 }
80 ) 83 )
81 - if self.es_client.index_exists(self.index_name):  
82 - if delete_index_if_exists(self.es_client, self.index_name):  
83 - logger.info(f"[BulkIndexing] Deleted existing index: {self.index_name}") 84 + if self.es_client.index_exists(index_name):
  85 + if delete_index_if_exists(self.es_client, index_name):
  86 + logger.info(f"[BulkIndexing] Deleted existing index: {index_name}")
84 else: 87 else:
85 - raise Exception(f"Failed to delete index: {self.index_name}") 88 + raise Exception(f"Failed to delete index: {index_name}")
86 89
87 - if not self.es_client.index_exists(self.index_name):  
88 - logger.info(f"[BulkIndexing] Creating index: {self.index_name}") 90 + if not self.es_client.index_exists(index_name):
  91 + logger.info(f"[BulkIndexing] Creating index: {index_name}")
89 indexer_logger.info( 92 indexer_logger.info(
90 - f"Creating index: {self.index_name}", 93 + f"Creating index: {index_name}",
91 extra={ 94 extra={
92 'index_type': 'bulk', 95 'index_type': 'bulk',
93 'tenant_id': tenant_id, 96 'tenant_id': tenant_id,
94 'operation': 'create_index', 97 'operation': 'create_index',
95 - 'index_name': self.index_name 98 + 'index_name': index_name
96 } 99 }
97 ) 100 )
98 - if not self.es_client.create_index(self.index_name, mapping):  
99 - raise Exception(f"Failed to create index: {self.index_name}")  
100 - logger.info(f"[BulkIndexing] Created index: {self.index_name}") 101 + if not self.es_client.create_index(index_name, mapping):
  102 + raise Exception(f"Failed to create index: {index_name}")
  103 + logger.info(f"[BulkIndexing] Created index: {index_name}")
101 else: 104 else:
102 - logger.info(f"[BulkIndexing] Index already exists: {self.index_name}") 105 + logger.info(f"[BulkIndexing] Index already exists: {index_name}")
103 106
104 # 3. 转换数据 107 # 3. 转换数据
105 logger.info(f"[BulkIndexing] Transforming data for tenant_id={tenant_id}") 108 logger.info(f"[BulkIndexing] Transforming data for tenant_id={tenant_id}")
@@ -109,7 +112,7 @@ class BulkIndexingService:
109 'index_type': 'bulk', 112 'index_type': 'bulk',
110 'tenant_id': tenant_id, 113 'tenant_id': tenant_id,
111 'operation': 'transform_data', 114 'operation': 'transform_data',
112 - 'index_name': self.index_name 115 + 'index_name': index_name
113 } 116 }
114 ) 117 )
115 transformer = SPUTransformer(self.db_engine, tenant_id) 118 transformer = SPUTransformer(self.db_engine, tenant_id)
@@ -126,7 +129,7 @@ class BulkIndexingService:
126 success_count=0, 129 success_count=0,
127 failed_count=0, 130 failed_count=0,
128 elapsed_time=elapsed_time, 131 elapsed_time=elapsed_time,
129 - index_name=self.index_name 132 + index_name=index_name
130 ) 133 )
131 return { 134 return {
132 "success": True, 135 "success": True,
@@ -135,7 +138,7 @@ class BulkIndexingService:
135 "failed": 0, 138 "failed": 0,
136 "elapsed_time": elapsed_time, 139 "elapsed_time": elapsed_time,
137 "message": "No documents to index", 140 "message": "No documents to index",
138 - "index_name": self.index_name, 141 + "index_name": index_name,
139 "tenant_id": tenant_id 142 "tenant_id": tenant_id
140 } 143 }
141 144
@@ -147,13 +150,13 @@ class BulkIndexingService:
147 'tenant_id': tenant_id, 150 'tenant_id': tenant_id,
148 'operation': 'transform_complete', 151 'operation': 'transform_complete',
149 'total_count': len(documents), 152 'total_count': len(documents),
150 - 'index_name': self.index_name 153 + 'index_name': index_name
151 } 154 }
152 ) 155 )
153 156
154 # 4. 批量导入 157 # 4. 批量导入
155 logger.info(f"[BulkIndexing] Indexing {len(documents)} documents (batch_size={batch_size})") 158 logger.info(f"[BulkIndexing] Indexing {len(documents)} documents (batch_size={batch_size})")
156 - indexer = BulkIndexer(self.es_client, self.index_name, batch_size=batch_size, max_retries=3) 159 + indexer = BulkIndexer(self.es_client, index_name, batch_size=batch_size, max_retries=3)
157 results = indexer.index_documents( 160 results = indexer.index_documents(
158 documents, 161 documents,
159 id_field="spu_id", 162 id_field="spu_id",
@@ -171,7 +174,7 @@ class BulkIndexingService:
171 success_count=results['success'], 174 success_count=results['success'],
172 failed_count=results['failed'], 175 failed_count=results['failed'],
173 elapsed_time=elapsed_time, 176 elapsed_time=elapsed_time,
174 - index_name=self.index_name, 177 + index_name=index_name,
175 errors=results.get('errors', []) 178 errors=results.get('errors', [])
176 ) 179 )
177 180
@@ -187,7 +190,7 @@ class BulkIndexingService:
187 "indexed": results['success'], 190 "indexed": results['success'],
188 "failed": results['failed'], 191 "failed": results['failed'],
189 "elapsed_time": elapsed_time, 192 "elapsed_time": elapsed_time,
190 - "index_name": self.index_name, 193 + "index_name": index_name,
191 "tenant_id": tenant_id 194 "tenant_id": tenant_id
192 } 195 }
193 196
@@ -203,7 +206,7 @@ class BulkIndexingService:
203 'operation': 'request_failed', 206 'operation': 'request_failed',
204 'error': error_msg, 207 'error': error_msg,
205 'elapsed_time': elapsed_time, 208 'elapsed_time': elapsed_time,
206 - 'index_name': self.index_name 209 + 'index_name': index_name
207 }, 210 },
208 exc_info=True 211 exc_info=True
209 ) 212 )
indexer/incremental_service.py
@@ -9,7 +9,7 @@ import numpy as np
9 from sqlalchemy import text, bindparam 9 from sqlalchemy import text, bindparam
10 from indexer.indexing_utils import load_category_mapping, create_document_transformer 10 from indexer.indexing_utils import load_category_mapping, create_document_transformer
11 from indexer.bulk_indexer import BulkIndexer 11 from indexer.bulk_indexer import BulkIndexer
12 -from indexer.mapping_generator import DEFAULT_INDEX_NAME 12 +from indexer.mapping_generator import get_tenant_index_name
13 from indexer.indexer_logger import ( 13 from indexer.indexer_logger import (
14 get_indexer_logger, log_index_request, log_index_result, log_spu_processing 14 get_indexer_logger, log_index_request, log_index_result, log_spu_processing
15 ) 15 )
@@ -393,7 +393,7 @@ class IncrementalIndexerService:
393 es_client, 393 es_client,
394 tenant_id: str, 394 tenant_id: str,
395 spu_ids: List[str], 395 spu_ids: List[str],
396 - index_name: str = DEFAULT_INDEX_NAME, 396 + index_name: str = None,
397 batch_size: int = 100, 397 batch_size: int = 100,
398 delete_spu_ids: List[str] = None 398 delete_spu_ids: List[str] = None
399 ) -> Dict[str, Any]: 399 ) -> Dict[str, Any]:
@@ -408,13 +408,16 @@ class IncrementalIndexerService:
408 es_client: Elasticsearch客户端 408 es_client: Elasticsearch客户端
409 tenant_id: 租户ID 409 tenant_id: 租户ID
410 spu_ids: SPU ID列表(要索引的) 410 spu_ids: SPU ID列表(要索引的)
411 - index_name: 索引名称 411 + index_name: 索引名称(可选,如果不提供则根据tenant_id自动生成)
412 batch_size: 批量写入ES的批次大小 412 batch_size: 批量写入ES的批次大小
413 delete_spu_ids: 显式指定要删除的SPU ID列表(可选) 413 delete_spu_ids: 显式指定要删除的SPU ID列表(可选)
414 414
415 Returns: 415 Returns:
416 包含成功/失败列表的字典,以及删除结果 416 包含成功/失败列表的字典,以及删除结果
417 """ 417 """
  418 + # Generate tenant-specific index name if not provided
  419 + if index_name is None:
  420 + index_name = get_tenant_index_name(tenant_id)
418 # 去重但保持顺序(避免重复DB/翻译/embedding/写ES) 421 # 去重但保持顺序(避免重复DB/翻译/embedding/写ES)
419 if spu_ids: 422 if spu_ids:
420 spu_ids = list(dict.fromkeys(spu_ids)) 423 spu_ids = list(dict.fromkeys(spu_ids))
indexer/mapping_generator.py
@@ -11,13 +11,26 @@ from pathlib import Path
11 11
12 logger = logging.getLogger(__name__) 12 logger = logging.getLogger(__name__)
13 13
14 -# Default index name 14 +# Default index name (deprecated, use get_tenant_index_name instead)
15 DEFAULT_INDEX_NAME = "search_products" 15 DEFAULT_INDEX_NAME = "search_products"
16 16
17 # Default mapping file path 17 # Default mapping file path
18 DEFAULT_MAPPING_FILE = Path(__file__).parent.parent / "mappings" / "search_products.json" 18 DEFAULT_MAPPING_FILE = Path(__file__).parent.parent / "mappings" / "search_products.json"
19 19
20 20
  21 +def get_tenant_index_name(tenant_id: str) -> str:
  22 + """
  23 + Generate index name for a tenant.
  24 +
  25 + Args:
  26 + tenant_id: Tenant ID
  27 +
  28 + Returns:
  29 + Index name in format: search_products_tenant_{tenant_id}
  30 + """
  31 + return f"search_products_tenant_{tenant_id}"
  32 +
  33 +
21 def load_mapping(mapping_file: str = None) -> Dict[str, Any]: 34 def load_mapping(mapping_file: str = None) -> Dict[str, Any]:
22 """ 35 """
23 Load Elasticsearch mapping from JSON file. 36 Load Elasticsearch mapping from JSON file.
search/searcher.py
@@ -20,6 +20,7 @@ from config.utils import get_match_fields_for_index
20 from context.request_context import RequestContext, RequestContextStage, create_request_context 20 from context.request_context import RequestContext, RequestContextStage, create_request_context
21 from api.models import FacetResult, FacetValue, FacetConfig 21 from api.models import FacetResult, FacetValue, FacetConfig
22 from api.result_formatter import ResultFormatter 22 from api.result_formatter import ResultFormatter
  23 +from indexer.mapping_generator import get_tenant_index_name
23 24
24 logger = logging.getLogger(__name__) 25 logger = logging.getLogger(__name__)
25 26
@@ -93,7 +94,7 @@ class Searcher:
93 """ 94 """
94 self.es_client = es_client 95 self.es_client = es_client
95 self.config = config 96 self.config = config
96 - self.index_name = config.es_index_name 97 + # Index name is now generated dynamically per tenant, no longer stored here
97 self.query_parser = query_parser or QueryParser(config) 98 self.query_parser = query_parser or QueryParser(config)
98 99
99 # Initialize components 100 # Initialize components
@@ -107,8 +108,9 @@ class Searcher:
107 self.source_fields = config.query_config.source_fields or [] 108 self.source_fields = config.query_config.source_fields or []
108 109
109 # Query builder - simplified single-layer architecture 110 # Query builder - simplified single-layer architecture
  111 + # index_name is no longer needed in query builder since we use tenant-specific indices
110 self.query_builder = ESQueryBuilder( 112 self.query_builder = ESQueryBuilder(
111 - index_name=self.index_name, 113 + index_name="", # Not used, kept for backward compatibility
112 match_fields=self.match_fields, 114 match_fields=self.match_fields,
113 text_embedding_field=self.text_embedding_field, 115 text_embedding_field=self.text_embedding_field,
114 image_embedding_field=self.image_embedding_field, 116 image_embedding_field=self.image_embedding_field,
@@ -271,10 +273,10 @@ class Searcher:
271 # Step 3: Query building 273 # Step 3: Query building
272 context.start_stage(RequestContextStage.QUERY_BUILDING) 274 context.start_stage(RequestContextStage.QUERY_BUILDING)
273 try: 275 try:
274 - # Add tenant_id to filters (required)  
275 - if filters is None:  
276 - filters = {}  
277 - filters['tenant_id'] = tenant_id 276 + # Generate tenant-specific index name
  277 + index_name = get_tenant_index_name(tenant_id)
  278 +
  279 + # No longer need to add tenant_id to filters since each tenant has its own index
278 280
279 es_query = self.query_builder.build_query( 281 es_query = self.query_builder.build_query(
280 query_text=parsed_query.rewritten_query or parsed_query.normalized_query, 282 query_text=parsed_query.rewritten_query or parsed_query.normalized_query,
@@ -332,8 +334,9 @@ class Searcher:
332 # Step 4: Elasticsearch search 334 # Step 4: Elasticsearch search
333 context.start_stage(RequestContextStage.ELASTICSEARCH_SEARCH) 335 context.start_stage(RequestContextStage.ELASTICSEARCH_SEARCH)
334 try: 336 try:
  337 + # Use tenant-specific index name
335 es_response = self.es_client.search( 338 es_response = self.es_client.search(
336 - index_name=self.index_name, 339 + index_name=index_name,
337 body=body_for_es, 340 body=body_for_es,
338 size=size, 341 size=size,
339 from_=from_ 342 from_=from_
@@ -496,10 +499,10 @@ class Searcher:
496 if image_vector is None: 499 if image_vector is None:
497 raise ValueError(f"Failed to encode image: {image_url}") 500 raise ValueError(f"Failed to encode image: {image_url}")
498 501
499 - # Add tenant_id to filters (required)  
500 - if filters is None:  
501 - filters = {}  
502 - filters['tenant_id'] = tenant_id 502 + # Generate tenant-specific index name
  503 + index_name = get_tenant_index_name(tenant_id)
  504 +
  505 + # No longer need to add tenant_id to filters since each tenant has its own index
503 506
504 # Build KNN query 507 # Build KNN query
505 es_query = { 508 es_query = {
@@ -529,7 +532,7 @@ class Searcher:
529 532
530 # Execute search 533 # Execute search
531 es_response = self.es_client.search( 534 es_response = self.es_client.search(
532 - index_name=self.index_name, 535 + index_name=index_name,
533 body=es_query, 536 body=es_query,
534 size=size 537 size=size
535 ) 538 )
@@ -576,24 +579,26 @@ class Searcher:
576 """ 579 """
577 return self.query_builder.get_domain_summary() 580 return self.query_builder.get_domain_summary()
578 581
579 - def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: 582 + def get_document(self, tenant_id: str, doc_id: str) -> Optional[Dict[str, Any]]:
580 """ 583 """
581 Get single document by ID. 584 Get single document by ID.
582 585
583 Args: 586 Args:
  587 + tenant_id: Tenant ID (required to determine which index to query)
584 doc_id: Document ID 588 doc_id: Document ID
585 589
586 Returns: 590 Returns:
587 Document or None if not found 591 Document or None if not found
588 """ 592 """
589 try: 593 try:
  594 + index_name = get_tenant_index_name(tenant_id)
590 response = self.es_client.client.get( 595 response = self.es_client.client.get(
591 - index=self.index_name, 596 + index=index_name,
592 id=doc_id 597 id=doc_id
593 ) 598 )
594 return response.get('_source') 599 return response.get('_source')
595 except Exception as e: 600 except Exception as e:
596 - logger.error(f"Failed to get document {doc_id}: {e}", exc_info=True) 601 + logger.error(f"Failed to get document {doc_id} from tenant {tenant_id}: {e}", exc_info=True)
597 return None 602 return None
598 603
599 def _standardize_facets( 604 def _standardize_facets(