Commit d6606d7aa4638d454684920b84db9d933951a9cf

Authored by tangwang
1 parent 5ac64fc7

Clean up old code, as follows:

1. Remove the IndexingPipeline class
File: indexer/bulk_indexer.py
Removed: the IndexingPipeline class (lines 201-259)
Removed: the load_mapping import, which is no longer needed
2. Remove old code from main.py
Removed: the cmd_ingest() function (in its entirety)
Removed: the ingest subcommand definition
Removed: the handling of the ingest command in main()
Removed: the pandas import, which is no longer needed
Updated: the module docstring, dropping the description of the ingest command
3. Remove the old data ingestion script
Removed: data/customer1/ingest_customer1.py (it depended on the deprecated DataTransformer and IndexingPipeline)
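
The replacement path, per the architecture notes updated below, goes through `SPUTransformer` plus the `BulkIndexingService` / `IncrementalIndexerService` services. Their exact interfaces are not part of this diff, so the following is only a rough sketch of what a full-index run might look like: the `BulkIndexingService` module path, constructor, and `run_full_index()` call are assumptions, while `ConfigLoader`, `ESClient`, and the `SPUTransformer` constructor arguments are taken from the removed cmd_ingest().

```python
# Rough sketch only: BulkIndexingService's module path, constructor, and
# run_full_index() are assumed names, not signatures confirmed by this commit.
from config import ConfigLoader
from utils import ESClient
from indexer.spu_transformer import SPUTransformer
from indexer.bulk_indexing_service import BulkIndexingService  # assumed module path

config = ConfigLoader("config/config.yaml").load_config()
es_client = ESClient(hosts=["http://localhost:9200"])

# SPUTransformer was wired as (config, text_encoder, image_encoder, use_cache)
# in the removed cmd_ingest(); encoders are omitted here for brevity.
transformer = SPUTransformer(config, None, None, use_cache=True)

service = BulkIndexingService(es_client, transformer)   # assumed constructor
stats = service.run_full_index(batch_size=100)          # assumed entry point
print(stats)
```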
@@ -132,7 +132,7 @@ python main.py search "query" --tenant-id 1 --size 10
 
 2. **Indexing Layer** (`indexer/`):
    - Reads from MySQL, applies transformations with embeddings
-   - Uses `DataTransformer` and `IndexingPipeline` for batch processing
+   - Uses `SPUTransformer`, `BulkIndexingService`, and `IncrementalIndexerService` for batch processing
    - Supports both full and incremental indexing with embedding caching
 
 3. **Query Processing Layer** (`query/`):
data/customer1/ingest_customer1.py deleted
@@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-"""
-Customer1 data ingestion script.
-
-Loads data from CSV and indexes into Elasticsearch with embeddings.
-"""
-
-import sys
-import os
-import pandas as pd
-import argparse
-from typing import Optional
-
-# Add parent directory to path (go up 3 levels: customer1 -> data -> SearchEngine -> root)
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.insert(0, project_root)
-
-from config import ConfigLoader
-from utils import ESClient, get_connection_from_config
-from indexer import DataTransformer, IndexingPipeline
-from embeddings import BgeEncoder, CLIPImageEncoder
-
-
-def load_csv_data(csv_path: str, limit: Optional[int] = None) -> pd.DataFrame:
-    """
-    Load data from CSV file.
-
-    Args:
-        csv_path: Path to CSV file
-        limit: Maximum number of rows to load (None for all)
-
-    Returns:
-        DataFrame with data
-    """
-    print(f"[Ingestion] Loading data from: {csv_path}")
-
-    df = pd.read_csv(csv_path)
-
-    if limit:
-        df = df.head(limit)
-
-    print(f"[Ingestion] Loaded {len(df)} rows")
-    print(f"[Ingestion] Columns: {df.columns.tolist()}")
-
-    return df
-
-
-def main():
-    """Main ingestion function."""
-    parser = argparse.ArgumentParser(description='Ingest customer1 data into Elasticsearch')
-    parser.add_argument('--config', default='customer1', help='Customer config name')
-    parser.add_argument('--csv', default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w',
-                        help='Path to CSV data file')
-    parser.add_argument('--limit', type=int, help='Limit number of documents to index')
-    parser.add_argument('--batch-size', type=int, default=100, help='Batch size for processing')
-    parser.add_argument('--recreate-index', action='store_true', help='Recreate index if exists')
-    parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
-    parser.add_argument('--es-username', default=None, help='Elasticsearch username (or set ES_USERNAME env var)')
-    parser.add_argument('--es-password', default=None, help='Elasticsearch password (or set ES_PASSWORD env var)')
-    parser.add_argument('--skip-embeddings', action='store_true', help='Skip embedding generation')
-    args = parser.parse_args()
-
-    print("="*60)
-    print("Customer1 Data Ingestion")
-    print("="*60)
-
-    # Load configuration
-    print(f"\n[1/6] Loading configuration: {args.config}")
-    config_loader = ConfigLoader("config/schema")
-    config = config_loader.load_customer_config(args.config)
-
-    # Validate configuration
-    errors = config_loader.validate_config(config)
-    if errors:
-        print(f"Configuration validation failed:")
-        for error in errors:
-            print(f" - {error}")
-        return 1
-
-    print(f"Configuration loaded successfully")
-    print(f" - Index: {config.es_index_name}")
-    print(f" - Fields: {len(config.fields)}")
-    print(f" - Indexes: {len(config.indexes)}")
-
-    # Initialize Elasticsearch client
-    print(f"\n[2/6] Connecting to Elasticsearch: {args.es_host}")
-
-    # Get credentials: prioritize command-line args, then environment variables, then .env file
-    es_username = args.es_username
-    es_password = args.es_password
-
-    # If not provided via args, try to load from .env file via env_config
-    if not es_username or not es_password:
-        try:
-            from config.env_config import get_es_config
-            es_config = get_es_config()
-            es_username = es_username or es_config.get('username')
-            es_password = es_password or es_config.get('password')
-        except Exception:
-            # Fallback to environment variables
-            es_username = es_username or os.getenv('ES_USERNAME')
-            es_password = es_password or os.getenv('ES_PASSWORD')
-
-    # Create ES client with credentials if available
-    if es_username and es_password:
-        print(f" Using authentication: {es_username}")
-        es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password)
-    else:
-        print(f" Warning: No authentication credentials found")
-        print(f" Attempting connection without authentication (will fail if ES requires auth)")
-        es_client = ESClient(hosts=[args.es_host])
-
-    if not es_client.ping():
-        print("Failed to connect to Elasticsearch")
-        print("\nTroubleshooting:")
-        print(" 1. Check if Elasticsearch is running: curl http://localhost:9200")
-        print(" 2. If ES requires authentication, provide credentials:")
-        print("    - Use --es-username and --es-password arguments, or")
-        print("    - Set ES_USERNAME and ES_PASSWORD environment variables")
-        print(" 3. Verify the host URL is correct: --es-host")
-        return 1
-
-    print("Connected to Elasticsearch successfully")
-
-    # Load data
-    print(f"\n[3/6] Loading data from CSV")
-    df = load_csv_data(args.csv, limit=args.limit)
-
-    # Initialize encoders (if not skipping embeddings)
-    text_encoder = None
-    image_encoder = None
-
-    if not args.skip_embeddings:
-        print(f"\n[4/6] Initializing embedding encoders")
-        print("This may take a few minutes on first run (downloading models)...")
-
-        try:
-            text_encoder = BgeEncoder()
-            print("Text encoder initialized")
-        except Exception as e:
-            print(f"Warning: Failed to initialize text encoder: {e}")
-            print("Continuing without text embeddings...")
-
-        try:
-            image_encoder = CLIPImageEncoder()
-            print("Image encoder initialized")
-        except Exception as e:
-            print(f"Warning: Failed to initialize image encoder: {e}")
-            print("Continuing without image embeddings...")
-    else:
-        print(f"\n[4/6] Skipping embedding generation (--skip-embeddings)")
-
-    # Initialize data transformer
-    print(f"\n[5/6] Initializing data transformation pipeline")
-    transformer = DataTransformer(
-        config=config,
-        text_encoder=text_encoder,
-        image_encoder=image_encoder,
-        use_cache=True
-    )
-
-    # Run indexing pipeline
-    print(f"\n[6/6] Starting indexing pipeline")
-    pipeline = IndexingPipeline(
-        config=config,
-        es_client=es_client,
-        data_transformer=transformer,
-        recreate_index=args.recreate_index
-    )
-
-    results = pipeline.run(df, batch_size=args.batch_size)
-
-    # Print summary
-    print("\n" + "="*60)
-    print("Ingestion Complete!")
-    print("="*60)
-    print(f"Total documents: {results['total']}")
-    print(f"Successfully indexed: {results['success']}")
-    print(f"Failed: {results['failed']}")
-    print(f"Time elapsed: {results['elapsed_time']:.2f}s")
-    print(f"Throughput: {results['docs_per_second']:.2f} docs/s")
-
-    if results['errors']:
-        print(f"\nFirst few errors:")
-        for error in results['errors'][:5]:
-            print(f" - {error}")
-
-    # Verify index
-    print(f"\nVerifying index...")
-    doc_count = es_client.count(config.es_index_name)
-    print(f"Documents in index: {doc_count}")
-
-    print("\nIngestion completed successfully!")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
docs/常用查询 - ES.md
@@ -233,7 +233,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/
 curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_count?pretty' -H 'Content-Type: application/json' -d '{
   "query": {
     "term": {
-      "tenant_id": "162"
+      "tenant_id": "170"
    }
  }
 }'
indexer/bulk_indexer.py
@@ -7,7 +7,7 @@ Handles batch indexing of documents with progress tracking and error handling.
 from typing import List, Dict, Any, Optional
 from elasticsearch.helpers import bulk, BulkIndexError
 from utils.es_client import ESClient
-from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME
+from indexer.mapping_generator import DEFAULT_INDEX_NAME
 import time
 
 
@@ -196,64 +196,3 @@ class BulkIndexer:
         except Exception as e:
             print(f"[BulkIndexer] Update by query failed: {e}")
             return 0
-
-
-class IndexingPipeline:
-    """Complete indexing pipeline from source data to ES."""
-
-    def __init__(
-        self,
-        es_client: ESClient,
-        data_transformer,
-        index_name: str = None,
-        recreate_index: bool = False
-    ):
-        """
-        Initialize indexing pipeline.
-
-        Args:
-            es_client: Elasticsearch client
-            data_transformer: Data transformer instance
-            index_name: Index name (defaults to DEFAULT_INDEX_NAME)
-            recreate_index: Whether to recreate index if exists
-        """
-        self.es_client = es_client
-        self.transformer = data_transformer
-        self.index_name = index_name or DEFAULT_INDEX_NAME
-        self.recreate_index = recreate_index
-
-    def run(self, df, batch_size: int = 100) -> Dict[str, Any]:
-        """
-        Run complete indexing pipeline.
-
-        Args:
-            df: Source dataframe
-            batch_size: Batch size for processing
-
-        Returns:
-            Indexing statistics
-        """
-        # Load and create index
-        mapping = load_mapping()
-
-        if self.recreate_index:
-            if self.es_client.index_exists(self.index_name):
-                print(f"[IndexingPipeline] Deleting existing index: {self.index_name}")
-                self.es_client.delete_index(self.index_name)
-
-        if not self.es_client.index_exists(self.index_name):
-            print(f"[IndexingPipeline] Creating index: {self.index_name}")
-            self.es_client.create_index(self.index_name, mapping)
-        else:
-            print(f"[IndexingPipeline] Using existing index: {self.index_name}")
-
-        # Transform data
-        print(f"[IndexingPipeline] Transforming {len(df)} documents...")
-        documents = self.transformer.transform_batch(df, batch_size=batch_size)
-        print(f"[IndexingPipeline] Transformed {len(documents)} documents")
-
-        # Bulk index
-        indexer = BulkIndexer(self.es_client, self.index_name, batch_size=500)
-        results = indexer.index_documents(documents, id_field="skuId")
-
-        return results
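
Note that `BulkIndexer` itself is untouched; only the `IndexingPipeline` wrapper above is removed. For reference, here is a minimal sketch of driving `BulkIndexer` directly, using only calls visible in the removed code (index creation via `ESClient`, then `index_documents` keyed on `skuId`); treat it as illustrative rather than the project's supported path, which is now the indexer services.

```python
# Minimal sketch based on the removed IndexingPipeline body above.
from utils.es_client import ESClient
from indexer.bulk_indexer import BulkIndexer
from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME

es_client = ESClient(hosts=["http://localhost:9200"])

# Create the index from the generated mapping if it does not exist yet.
if not es_client.index_exists(DEFAULT_INDEX_NAME):
    es_client.create_index(DEFAULT_INDEX_NAME, load_mapping())

# Assumed input: a list of already-transformed documents (dicts keyed by "skuId").
documents = []

indexer = BulkIndexer(es_client, DEFAULT_INDEX_NAME, batch_size=500)
results = indexer.index_documents(documents, id_field="skuId")
print(results)
```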
main.py
@@ -3,8 +3,8 @@
 Main entry point for SearchEngine operations.
 
 Provides a unified CLI for common operations:
-- ingest: Ingest data into Elasticsearch
-- serve: Start API service
+- serve: Start API service (search + admin routes)
+- serve-indexer: Start dedicated Indexer API service
 - search: Test search from command line
 """
 
@@ -12,7 +12,6 @@ import sys
 import os
 import argparse
 import json
-import pandas as pd
 import uvicorn
 
 # Add parent directory to path
@@ -23,53 +22,6 @@ from utils import ESClient
 from search import Searcher
 
 
-def cmd_ingest(args):
-    """Run data ingestion."""
-    # Local imports to avoid hard dependency at module import time
-    import pandas as pd
-    from embeddings import BgeEncoder, CLIPImageEncoder
-    from indexer.bulk_indexer import IndexingPipeline
-    # NOTE: DataTransformer was referenced historically, but the concrete
-    # implementation is now provided via customer-specific scripts
-    # (e.g. data/customer1/ingest_customer1.py). If you still need a generic
-    # ingestion pipeline here, you can wire your own transformer.
-    from indexer.spu_transformer import SPUTransformer as DataTransformer
-    print("Starting data ingestion")
-
-    # Load config
-    config_loader = ConfigLoader("config/config.yaml")
-    config = config_loader.load_config()
-
-    # Initialize ES
-    es_client = ESClient(hosts=[args.es_host])
-    if not es_client.ping():
-        print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}")
-        return 1
-
-    # Load data
-    df = pd.read_csv(args.csv_file)
-    if args.limit:
-        df = df.head(args.limit)
-    print(f"Loaded {len(df)} documents")
-
-    # Initialize encoders
-    text_encoder = None if args.skip_embeddings else BgeEncoder()
-    image_encoder = None if args.skip_embeddings else CLIPImageEncoder()
-
-    # Transform and index
-    transformer = DataTransformer(config, text_encoder, image_encoder, use_cache=True)
-    pipeline = IndexingPipeline(config, es_client, transformer, recreate_index=args.recreate)
-
-    results = pipeline.run(df, batch_size=args.batch_size)
-
-    print(f"\nIngestion complete:")
-    print(f" Success: {results['success']}")
-    print(f" Failed: {results['failed']}")
-    print(f" Time: {results['elapsed_time']:.2f}s")
-
-    return 0
-
-
 def cmd_serve(args):
     """Start API service."""
     os.environ['ES_HOST'] = args.es_host
@@ -154,15 +106,6 @@ def main():
 
     subparsers = parser.add_subparsers(dest='command', help='Command to execute')
 
-    # Ingest command
-    ingest_parser = subparsers.add_parser('ingest', help='Ingest data into Elasticsearch')
-    ingest_parser.add_argument('csv_file', help='Path to CSV data file')
-    ingest_parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
-    ingest_parser.add_argument('--limit', type=int, help='Limit number of documents')
-    ingest_parser.add_argument('--batch-size', type=int, default=100, help='Batch size')
-    ingest_parser.add_argument('--recreate', action='store_true', help='Recreate index')
-    ingest_parser.add_argument('--skip-embeddings', action='store_true', help='Skip embeddings')
-
     # Serve command
     serve_parser = subparsers.add_parser('serve', help='Start API service (multi-tenant)')
     serve_parser.add_argument('--host', default='0.0.0.0', help='Host to bind to')
@@ -197,9 +140,7 @@ def main():
         return 1
 
     # Execute command
-    if args.command == 'ingest':
-        return cmd_ingest(args)
-    elif args.command == 'serve':
+    if args.command == 'serve':
         return cmd_serve(args)
     elif args.command == 'serve-indexer':
         return cmd_serve_indexer(args)