Commit d6606d7aa4638d454684920b84db9d933951a9cf
1 parent: 5ac64fc7
Clean up old code, specifically:

1. Remove the IndexingPipeline class
   - File: indexer/bulk_indexer.py
   - Removed: the IndexingPipeline class (lines 201-259)
   - Removed: the no-longer-needed load_mapping import
2. Remove old code from main.py
   - Removed: the cmd_ingest() function (in its entirety)
   - Removed: the ingest subcommand definition
   - Removed: the handling of the ingest command in main()
   - Removed: the no-longer-needed pandas import
   - Updated: the module docstring, dropping the ingest command description
3. Remove the old data import script
   - Removed: data/customer1/ingest_customer1.py (it depended on the deprecated DataTransformer and IndexingPipeline)
Showing 5 changed files with 6 additions and 324 deletions
CLAUDE.md
@@ -132,7 +132,7 @@ python main.py search "query" --tenant-id 1 --size 10
 
 2. **Indexing Layer** (`indexer/`):
    - Reads from MySQL, applies transformations with embeddings
-   - Uses `DataTransformer` and `IndexingPipeline` for batch processing
+   - Uses `SPUTransformer`, `BulkIndexingService`, and `IncrementalIndexerService` for batch processing
    - Supports both full and incremental indexing with embedding caching
 
 3. **Query Processing Layer** (`query/`):
data/customer1/ingest_customer1.py deleted
@@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-"""
-Customer1 data ingestion script.
-
-Loads data from CSV and indexes into Elasticsearch with embeddings.
-"""
-
-import sys
-import os
-import pandas as pd
-import argparse
-from typing import Optional
-
-# Add parent directory to path (go up 3 levels: customer1 -> data -> SearchEngine -> root)
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.insert(0, project_root)
-
-from config import ConfigLoader
-from utils import ESClient, get_connection_from_config
-from indexer import DataTransformer, IndexingPipeline
-from embeddings import BgeEncoder, CLIPImageEncoder
-
-
-def load_csv_data(csv_path: str, limit: Optional[int] = None) -> pd.DataFrame:
-    """
-    Load data from CSV file.
-
-    Args:
-        csv_path: Path to CSV file
-        limit: Maximum number of rows to load (None for all)
-
-    Returns:
-        DataFrame with data
-    """
-    print(f"[Ingestion] Loading data from: {csv_path}")
-
-    df = pd.read_csv(csv_path)
-
-    if limit:
-        df = df.head(limit)
-
-    print(f"[Ingestion] Loaded {len(df)} rows")
-    print(f"[Ingestion] Columns: {df.columns.tolist()}")
-
-    return df
-
-
-def main():
-    """Main ingestion function."""
-    parser = argparse.ArgumentParser(description='Ingest customer1 data into Elasticsearch')
-    parser.add_argument('--config', default='customer1', help='Customer config name')
-    parser.add_argument('--csv', default='data/customer1/goods_with_pic.5years_congku.csv.shuf.1w',
-                        help='Path to CSV data file')
-    parser.add_argument('--limit', type=int, help='Limit number of documents to index')
-    parser.add_argument('--batch-size', type=int, default=100, help='Batch size for processing')
-    parser.add_argument('--recreate-index', action='store_true', help='Recreate index if exists')
-    parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
-    parser.add_argument('--es-username', default=None, help='Elasticsearch username (or set ES_USERNAME env var)')
-    parser.add_argument('--es-password', default=None, help='Elasticsearch password (or set ES_PASSWORD env var)')
-    parser.add_argument('--skip-embeddings', action='store_true', help='Skip embedding generation')
-    args = parser.parse_args()
-
-    print("="*60)
-    print("Customer1 Data Ingestion")
-    print("="*60)
-
-    # Load configuration
-    print(f"\n[1/6] Loading configuration: {args.config}")
-    config_loader = ConfigLoader("config/schema")
-    config = config_loader.load_customer_config(args.config)
-
-    # Validate configuration
-    errors = config_loader.validate_config(config)
-    if errors:
-        print(f"Configuration validation failed:")
-        for error in errors:
-            print(f" - {error}")
-        return 1
-
-    print(f"Configuration loaded successfully")
-    print(f" - Index: {config.es_index_name}")
-    print(f" - Fields: {len(config.fields)}")
-    print(f" - Indexes: {len(config.indexes)}")
-
-    # Initialize Elasticsearch client
-    print(f"\n[2/6] Connecting to Elasticsearch: {args.es_host}")
-
-    # Get credentials: prioritize command-line args, then environment variables, then .env file
-    es_username = args.es_username
-    es_password = args.es_password
-
-    # If not provided via args, try to load from .env file via env_config
-    if not es_username or not es_password:
-        try:
-            from config.env_config import get_es_config
-            es_config = get_es_config()
-            es_username = es_username or es_config.get('username')
-            es_password = es_password or es_config.get('password')
-        except Exception:
-            # Fallback to environment variables
-            es_username = es_username or os.getenv('ES_USERNAME')
-            es_password = es_password or os.getenv('ES_PASSWORD')
-
-    # Create ES client with credentials if available
-    if es_username and es_password:
-        print(f" Using authentication: {es_username}")
-        es_client = ESClient(hosts=[args.es_host], username=es_username, password=es_password)
-    else:
-        print(f" Warning: No authentication credentials found")
-        print(f" Attempting connection without authentication (will fail if ES requires auth)")
-        es_client = ESClient(hosts=[args.es_host])
-
-    if not es_client.ping():
-        print("Failed to connect to Elasticsearch")
-        print("\nTroubleshooting:")
-        print(" 1. Check if Elasticsearch is running: curl http://localhost:9200")
-        print(" 2. If ES requires authentication, provide credentials:")
-        print("    - Use --es-username and --es-password arguments, or")
-        print("    - Set ES_USERNAME and ES_PASSWORD environment variables")
-        print(" 3. Verify the host URL is correct: --es-host")
-        return 1
-
-    print("Connected to Elasticsearch successfully")
-
-    # Load data
-    print(f"\n[3/6] Loading data from CSV")
-    df = load_csv_data(args.csv, limit=args.limit)
-
-    # Initialize encoders (if not skipping embeddings)
-    text_encoder = None
-    image_encoder = None
-
-    if not args.skip_embeddings:
-        print(f"\n[4/6] Initializing embedding encoders")
-        print("This may take a few minutes on first run (downloading models)...")
-
-        try:
-            text_encoder = BgeEncoder()
-            print("Text encoder initialized")
-        except Exception as e:
-            print(f"Warning: Failed to initialize text encoder: {e}")
-            print("Continuing without text embeddings...")
-
-        try:
-            image_encoder = CLIPImageEncoder()
-            print("Image encoder initialized")
-        except Exception as e:
-            print(f"Warning: Failed to initialize image encoder: {e}")
-            print("Continuing without image embeddings...")
-    else:
-        print(f"\n[4/6] Skipping embedding generation (--skip-embeddings)")
-
-    # Initialize data transformer
-    print(f"\n[5/6] Initializing data transformation pipeline")
-    transformer = DataTransformer(
-        config=config,
-        text_encoder=text_encoder,
-        image_encoder=image_encoder,
-        use_cache=True
-    )
-
-    # Run indexing pipeline
-    print(f"\n[6/6] Starting indexing pipeline")
-    pipeline = IndexingPipeline(
-        config=config,
-        es_client=es_client,
-        data_transformer=transformer,
-        recreate_index=args.recreate_index
-    )
-
-    results = pipeline.run(df, batch_size=args.batch_size)
-
-    # Print summary
-    print("\n" + "="*60)
-    print("Ingestion Complete!")
-    print("="*60)
-    print(f"Total documents: {results['total']}")
-    print(f"Successfully indexed: {results['success']}")
-    print(f"Failed: {results['failed']}")
-    print(f"Time elapsed: {results['elapsed_time']:.2f}s")
-    print(f"Throughput: {results['docs_per_second']:.2f} docs/s")
-
-    if results['errors']:
-        print(f"\nFirst few errors:")
-        for error in results['errors'][:5]:
-            print(f" - {error}")
-
-    # Verify index
-    print(f"\nVerifying index...")
-    doc_count = es_client.count(config.es_index_name)
-    print(f"Documents in index: {doc_count}")
-
-    print("\nIngestion completed successfully!")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
docs/常用查询 - ES.md
@@ -233,7 +233,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/
 curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_count?pretty' -H 'Content-Type: application/json' -d '{
   "query": {
     "term": {
-      "tenant_id": "162"
+      "tenant_id": "170"
     }
   }
 }'
indexer/bulk_indexer.py
@@ -7,7 +7,7 @@ Handles batch indexing of documents with progress tracking and error handling.
 from typing import List, Dict, Any, Optional
 from elasticsearch.helpers import bulk, BulkIndexError
 from utils.es_client import ESClient
-from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME
+from indexer.mapping_generator import DEFAULT_INDEX_NAME
 import time
 
 
@@ -196,64 +196,3 @@ class BulkIndexer:
         except Exception as e:
             print(f"[BulkIndexer] Update by query failed: {e}")
             return 0
-
-
-class IndexingPipeline:
-    """Complete indexing pipeline from source data to ES."""
-
-    def __init__(
-        self,
-        es_client: ESClient,
-        data_transformer,
-        index_name: str = None,
-        recreate_index: bool = False
-    ):
-        """
-        Initialize indexing pipeline.
-
-        Args:
-            es_client: Elasticsearch client
-            data_transformer: Data transformer instance
-            index_name: Index name (defaults to DEFAULT_INDEX_NAME)
-            recreate_index: Whether to recreate index if exists
-        """
-        self.es_client = es_client
-        self.transformer = data_transformer
-        self.index_name = index_name or DEFAULT_INDEX_NAME
-        self.recreate_index = recreate_index
-
-    def run(self, df, batch_size: int = 100) -> Dict[str, Any]:
-        """
-        Run complete indexing pipeline.
-
-        Args:
-            df: Source dataframe
-            batch_size: Batch size for processing
-
-        Returns:
-            Indexing statistics
-        """
-        # Load and create index
-        mapping = load_mapping()
-
-        if self.recreate_index:
-            if self.es_client.index_exists(self.index_name):
-                print(f"[IndexingPipeline] Deleting existing index: {self.index_name}")
-                self.es_client.delete_index(self.index_name)
-
-        if not self.es_client.index_exists(self.index_name):
-            print(f"[IndexingPipeline] Creating index: {self.index_name}")
-            self.es_client.create_index(self.index_name, mapping)
-        else:
-            print(f"[IndexingPipeline] Using existing index: {self.index_name}")
-
-        # Transform data
-        print(f"[IndexingPipeline] Transforming {len(df)} documents...")
-        documents = self.transformer.transform_batch(df, batch_size=batch_size)
-        print(f"[IndexingPipeline] Transformed {len(documents)} documents")
-
-        # Bulk index
-        indexer = BulkIndexer(self.es_client, self.index_name, batch_size=500)
-        results = indexer.index_documents(documents, id_field="skuId")
-
-        return results
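Note: if anything still depends on the removed IndexingPipeline, its behaviour can be approximated with the pieces that survive this commit. The sketch below is illustrative only: the client calls, the `transform_batch` method, and the `skuId` id field are taken from the deleted code, while the `reindex_dataframe` wrapper itself is a hypothetical helper, not an API added by this commit.

```python
# Minimal sketch of what the deleted IndexingPipeline did, assembled from the
# pieces that remain after this commit. Treat it as an illustration, not a
# supported entry point.
from indexer.bulk_indexer import BulkIndexer
from indexer.mapping_generator import load_mapping, DEFAULT_INDEX_NAME
from utils.es_client import ESClient


def reindex_dataframe(df, es_client: ESClient, transformer,
                      index_name: str = DEFAULT_INDEX_NAME,
                      recreate_index: bool = False,
                      batch_size: int = 100):
    """Optionally recreate the index, transform rows, and bulk index them."""
    mapping = load_mapping()

    # Drop and recreate the index if requested, otherwise reuse the existing one.
    if recreate_index and es_client.index_exists(index_name):
        es_client.delete_index(index_name)
    if not es_client.index_exists(index_name):
        es_client.create_index(index_name, mapping)

    # Transform source rows into ES documents, then bulk index keyed by skuId.
    documents = transformer.transform_batch(df, batch_size=batch_size)
    indexer = BulkIndexer(es_client, index_name, batch_size=500)
    return indexer.index_documents(documents, id_field="skuId")
```

Going forward, the same flow is expected to be driven by `SPUTransformer` together with the `BulkIndexingService` / `IncrementalIndexerService` referenced in CLAUDE.md, so prefer those entry points where they are available.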
main.py
@@ -3,8 +3,8 @@
 Main entry point for SearchEngine operations.
 
 Provides a unified CLI for common operations:
-- ingest: Ingest data into Elasticsearch
-- serve: Start API service
+- serve: Start API service (search + admin routes)
+- serve-indexer: Start dedicated Indexer API service
 - search: Test search from command line
 """
 
@@ -12,7 +12,6 @@ import sys
 import os
 import argparse
 import json
-import pandas as pd
 import uvicorn
 
 # Add parent directory to path
@@ -23,53 +22,6 @@ from utils import ESClient
 from search import Searcher
 
 
-def cmd_ingest(args):
-    """Run data ingestion."""
-    # Local imports to avoid hard dependency at module import time
-    import pandas as pd
-    from embeddings import BgeEncoder, CLIPImageEncoder
-    from indexer.bulk_indexer import IndexingPipeline
-    # NOTE: DataTransformer was referenced historically, but the concrete
-    # implementation is now provided via customer-specific scripts
-    # (e.g. data/customer1/ingest_customer1.py). If you still need a generic
-    # ingestion pipeline here, you can wire your own transformer.
-    from indexer.spu_transformer import SPUTransformer as DataTransformer
-    print("Starting data ingestion")
-
-    # Load config
-    config_loader = ConfigLoader("config/config.yaml")
-    config = config_loader.load_config()
-
-    # Initialize ES
-    es_client = ESClient(hosts=[args.es_host])
-    if not es_client.ping():
-        print(f"ERROR: Cannot connect to Elasticsearch at {args.es_host}")
-        return 1
-
-    # Load data
-    df = pd.read_csv(args.csv_file)
-    if args.limit:
-        df = df.head(args.limit)
-    print(f"Loaded {len(df)} documents")
-
-    # Initialize encoders
-    text_encoder = None if args.skip_embeddings else BgeEncoder()
-    image_encoder = None if args.skip_embeddings else CLIPImageEncoder()
-
-    # Transform and index
-    transformer = DataTransformer(config, text_encoder, image_encoder, use_cache=True)
-    pipeline = IndexingPipeline(config, es_client, transformer, recreate_index=args.recreate)
-
-    results = pipeline.run(df, batch_size=args.batch_size)
-
-    print(f"\nIngestion complete:")
-    print(f" Success: {results['success']}")
-    print(f" Failed: {results['failed']}")
-    print(f" Time: {results['elapsed_time']:.2f}s")
-
-    return 0
-
-
 def cmd_serve(args):
     """Start API service."""
     os.environ['ES_HOST'] = args.es_host
@@ -154,15 +106,6 @@ def main():
 
     subparsers = parser.add_subparsers(dest='command', help='Command to execute')
 
-    # Ingest command
-    ingest_parser = subparsers.add_parser('ingest', help='Ingest data into Elasticsearch')
-    ingest_parser.add_argument('csv_file', help='Path to CSV data file')
-    ingest_parser.add_argument('--es-host', default='http://localhost:9200', help='Elasticsearch host')
-    ingest_parser.add_argument('--limit', type=int, help='Limit number of documents')
-    ingest_parser.add_argument('--batch-size', type=int, default=100, help='Batch size')
-    ingest_parser.add_argument('--recreate', action='store_true', help='Recreate index')
-    ingest_parser.add_argument('--skip-embeddings', action='store_true', help='Skip embeddings')
-
     # Serve command
     serve_parser = subparsers.add_parser('serve', help='Start API service (multi-tenant)')
     serve_parser.add_argument('--host', default='0.0.0.0', help='Host to bind to')
@@ -197,9 +140,7 @@ def main():
         return 1
 
     # Execute command
-    if args.command == 'ingest':
-        return cmd_ingest(args)
-    elif args.command == 'serve':
+    if args.command == 'serve':
         return cmd_serve(args)
     elif args.command == 'serve-indexer':
         return cmd_serve_indexer(args)