Commit 8810a6fa9c3779e2fa48766c049a8296618d496f

Authored by tangwang
1 parent e7f2b240

重构

.env.example
1 1 # ====================
2 2 # OpenAI Configuration
3 3 # ====================
4   -OPENAI_API_KEY=
5   -OPENAI_MODEL=gpt-4o-mini
6   -OPENAI_EMBEDDING_MODEL=text-embedding-3-small
  4 +OPENAI_API_KEY=your-dashscope-api-key
  5 +OPENAI_MODEL=qwen-plus
  6 +# Base URL for Qwen/DashScope (OpenAI-compatible API)
  7 +# 北京: https://dashscope.aliyuncs.com/compatible-mode/v1
  8 +# 弗吉尼亚: https://dashscope-us.aliyuncs.com/compatible-mode/v1
  9 +# 新加坡: https://dashscope-intl.aliyuncs.com/compatible-mode/v1
  10 +OPENAI_API_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
7 11 OPENAI_TEMPERATURE=1
8 12 OPENAI_MAX_TOKENS=1000
9 13  
10 14 # ====================
11   -# CLIP Server Configuration
12   -# ====================
13   -CLIP_SERVER_URL=grpc://localhost:51000
14   -
15   -# ====================
16   -# Milvus Configuration
17   -# ====================
18   -MILVUS_HOST=localhost
19   -MILVUS_PORT=19530
20   -
21   -# Collection settings
22   -TEXT_COLLECTION_NAME=text_embeddings
23   -IMAGE_COLLECTION_NAME=image_embeddings
24   -TEXT_DIM=1536
25   -IMAGE_DIM=512
26   -
27   -# ====================
28 15 # Search Configuration
29 16 # ====================
30 17 TOP_K_RESULTS=30
31 18 SIMILARITY_THRESHOLD=0.6
32 19  
  20 +# Search API (see docs/搜索API对接指南.md)
  21 +SEARCH_API_BASE_URL=http://120.76.41.98:6002
  22 +SEARCH_API_TENANT_ID=162
  23 +
33 24 # ====================
34 25 # Application Configuration
35 26 # ====================
... ...
.gitignore
... ... @@ -53,7 +53,6 @@ data/**
53 53 *.db
54 54 *.sqlite
55 55 *.sqlite3
56   -data/milvus_lite.db
57 56  
58 57 # Docker volumes
59 58 volumes/
... ...
README.md
... ... @@ -12,9 +12,9 @@ OmniShopAgent autonomously decides which tools to call, maintains conversation s
12 12  
13 13 **Key Features:**
14 14 - Autonomous tool selection and execution
15   -- Multi-modal search (text + image)
  15 +- Text search via Search API
16 16 - Conversational context awareness
17   -- Real-time visual analysis
  17 +- Real-time visual analysis (style extraction from images)
18 18  
19 19 ## Tech Stack
20 20  
... ... @@ -22,9 +22,7 @@ OmniShopAgent autonomously decides which tools to call, maintains conversation s
22 22 |-----------|-----------|
23 23 | **Agent Framework** | LangGraph |
24 24 | **LLM** | any LLM supported by LangChain |
25   -| **Text Embedding** | text-embedding-3-small |
26   -| **Image Embedding** | CLIP ViT-B/32 |
27   -| **Vector Database** | Milvus |
  25 +| **Search** | Search API (HTTP) |
28 26 | **Frontend** | Streamlit |
29 27 | **Dataset** | Kaggle Fashion Products |
30 28  
... ... @@ -52,8 +50,7 @@ graph LR
52 50 ```
53 51  
54 52 **Available Tools:**
55   -- `search_products(query)` - Text-based semantic search
56   -- `search_by_image(image_path)` - Visual similarity search
  53 +- `search_products(query)` - Text-based product search via Search API
57 54 - `analyze_image_style(image_path)` - VLM style analysis
58 55  
59 56  
... ... @@ -66,12 +63,6 @@ User: "winter coats for women"
66 63 Agent: search_products("winter coats women") → Returns 5 products
67 64 ```
68 65  
69   -**Image Upload:**
70   -```
71   -User: [uploads sneaker photo] "find similar"
72   -Agent: search_by_image(path) → Returns visually similar shoes
73   -```
74   -
75 66 **Style Analysis + Search:**
76 67 ```
77 68 User: [uploads vintage jacket] "what style is this? find matching pants"
... ... @@ -93,6 +84,8 @@ Agent: [remembers context] → search_products("red formal dresses") → Results
93 84 User: [uploads office outfit] "I like the shirt but need something more casual"
94 85 Agent: analyze_image_style(path) → Extracts shirt details
95 86 search_products("casual shirt [color] [style]") → Returns casual alternatives
96 87 ```
  88 +
  89 +**Note:** For "find similar" requests on uploaded images, call `analyze_image_style` first to extract attributes, then pass the resulting description to `search_products`.
97 90  
98 91 ## Installation
... ... @@ -100,7 +93,6 @@ Agent: analyze_image_style(path) → Extracts shirt details
100 93 **Prerequisites:**
101 94 - Python 3.12+ (LangChain 1.x 要求 Python 3.10+)
102 95 - OpenAI API Key
103   -- Docker & Docker Compose
104 96  
105 97 ### 1. Setup Environment
106 98 ```bash
... ... @@ -116,38 +108,14 @@ cp .env.example .env
116 108 # Edit .env and add your OPENAI_API_KEY
117 109 ```
118 110  
119   -### 2. Download Dataset
120   -Download the [Fashion Product Images Dataset](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset) from Kaggle and extract to `./data/`:
121   -
122   -```python
123   -python scripts/download_dataset.py
124   -```
125   -
126   -Expected structure:
127   -```
128   -data/
129   -├── images/ # ~44k product images
130   -├── styles.csv # Product metadata
131   -└── images.csv # Image filenames
132   -```
133   -
134   -### 3. Start Services
135   -
136   -```bash
137   -docker-compose up
138   -python -m clip_server
139   -```
140   -
141   -
142   -### 4. Index Data
  111 +### 2. (Optional) Download Dataset
  112 +For image style analysis, you may download the [Fashion Product Images Dataset](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset) from Kaggle:
143 113  
144 114 ```bash
145   -python scripts/index_data.py
  115 +python scripts/download_dataset.py
146 116 ```
147 117  
148   -This generates and stores text/image embeddings for all 44k products in Milvus.
149   -
150   -### 5. Launch Application
  118 +### 3. Launch Application
151 119 ```bash
152 120 # 使用启动脚本(推荐)
153 121 ./scripts/start.sh
... ... @@ -155,6 +123,9 @@ This generates and stores text/image embeddings for all 44k products in Milvus.
155 123 # 或直接运行
156 124 streamlit run app.py
157 125 ```
  126 +
  127 +Product search uses the external Search API. Configure `SEARCH_API_BASE_URL` and `SEARCH_API_TENANT_ID` in `.env` if needed.
  128 +
158 129 Opens at `http://localhost:8501`
159 130  
160 131 ### CentOS 8 部署
... ...
app/agents/shopping_agent.py
... ... @@ -52,11 +52,14 @@ class ShoppingAgent:
52 52 self.session_id = session_id or "default"
53 53  
54 54 # Initialize LLM
55   - self.llm = ChatOpenAI(
  55 + llm_kwargs = dict(
56 56 model=settings.openai_model,
57 57 temperature=settings.openai_temperature,
58 58 api_key=settings.openai_api_key,
59 59 )
  60 + if settings.openai_api_base_url:
  61 + llm_kwargs["base_url"] = settings.openai_api_base_url
  62 + self.llm = ChatOpenAI(**llm_kwargs)
60 63  
61 64 # Get tools and bind to model
62 65 self.tools = get_all_tools()
... ... @@ -73,12 +76,11 @@ class ShoppingAgent:
73 76 # System prompt for the agent
74 77 system_prompt = """You are an intelligent fashion shopping assistant. You can:
75 78 1. Search for products by text description (use search_products)
76   -2. Find visually similar products from images (use search_by_image)
77   -3. Analyze image style and attributes (use analyze_image_style)
  79 +2. Analyze image style and attributes (use analyze_image_style)
78 80  
79 81 When a user asks about products:
80 82 - For text queries: use search_products directly
81   -- For image uploads: decide if you need to analyze_image_style first, then search
  83 +- For image uploads: use analyze_image_style first to understand the product, then use search_products with the extracted description
82 84 - You can call multiple tools in sequence if needed
83 85 - Always provide helpful, friendly responses
84 86  
... ...
app/config.py
... ... @@ -4,6 +4,7 @@ Loads environment variables and provides configuration objects
4 4 """
5 5  
6 6 import os
  7 +from typing import Optional
7 8  
8 9 from pydantic_settings import BaseSettings
9 10  
... ... @@ -17,47 +18,20 @@ class Settings(BaseSettings):
17 18 # OpenAI Configuration
18 19 openai_api_key: str
19 20 openai_model: str = "gpt-4o-mini"
20   - openai_embedding_model: str = "text-embedding-3-small"
21 21 openai_temperature: float = 0.7
22 22 openai_max_tokens: int = 1000
23   -
24   - # CLIP Server Configuration
25   - clip_server_url: str = "grpc://localhost:51000"
26   -
27   - # Milvus Configuration
28   - milvus_uri: str = "http://localhost:19530"
29   - milvus_host: str = "localhost"
30   - milvus_port: int = 19530
31   - text_collection_name: str = "text_embeddings"
32   - image_collection_name: str = "image_embeddings"
33   - text_dim: int = 1536
34   - image_dim: int = 512
35   -
36   - @property
37   - def milvus_uri_absolute(self) -> str:
38   - """Get absolute path for Milvus URI
39   -
40   - Returns:
41   - - For http/https URIs: returns as-is (Milvus Standalone)
42   - - For file paths starting with ./: converts to absolute path (Milvus Lite)
43   - - For other paths: returns as-is
44   - """
45   - import os
46   -
47   - # If it's a network URI, return as-is (Milvus Standalone)
48   - if self.milvus_uri.startswith(("http://", "https://")):
49   - return self.milvus_uri
50   - # If it's a relative path, convert to absolute (Milvus Lite)
51   - if self.milvus_uri.startswith("./"):
52   - base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
53   - return os.path.join(base_dir, self.milvus_uri[2:])
54   - # Otherwise return as-is
55   - return self.milvus_uri
  23 + # Base URL for OpenAI-compatible APIs (e.g. Qwen/DashScope)
  24 + # Qwen 北京: https://dashscope.aliyuncs.com/compatible-mode/v1
  25 + openai_api_base_url: Optional[str] = None
56 26  
57 27 # Search Configuration
58 28 top_k_results: int = 10
59 29 similarity_threshold: float = 0.6
60 30  
  31 + # Search API (see docs/搜索API对接指南.md)
  32 + search_api_base_url: str = "http://120.76.41.98:6002"
  33 + search_api_tenant_id: str = "162"
  34 +
61 35 # Application Configuration
62 36 app_host: str = "0.0.0.0"
63 37 app_port: int = 8000
... ... @@ -73,6 +47,7 @@ class Settings(BaseSettings):
73 47 env_file = ".env"
74 48 env_file_encoding = "utf-8"
75 49 case_sensitive = False
  50 + extra = "ignore"
76 51  
77 52  
78 53 # Global settings instance
... ...
app/services/__init__.py
1 1 """
2 2 Services Module
3   -Provides database and embedding services for the application
4 3 """
5   -
6   -from app.services.embedding_service import EmbeddingService, get_embedding_service
7   -from app.services.milvus_service import MilvusService, get_milvus_service
8   -
9   -__all__ = [
10   - "EmbeddingService",
11   - "get_embedding_service",
12   - "MilvusService",
13   - "get_milvus_service",
14   -]
... ...
app/services/embedding_service.py deleted
... ... @@ -1,293 +0,0 @@
1   -"""
2   -Embedding Service for Text and Image Embeddings
3   -Supports OpenAI text embeddings and CLIP image embeddings
4   -"""
5   -
6   -import logging
7   -from pathlib import Path
8   -from typing import List, Optional, Union
9   -
10   -import numpy as np
11   -from clip_client import Client as ClipClient
12   -from openai import OpenAI
13   -
14   -from app.config import settings
15   -
16   -logger = logging.getLogger(__name__)
17   -
18   -
19   -class EmbeddingService:
20   - """Service for generating text and image embeddings"""
21   -
22   - def __init__(
23   - self,
24   - openai_api_key: Optional[str] = None,
25   - clip_server_url: Optional[str] = None,
26   - ):
27   - """Initialize embedding service
28   -
29   - Args:
30   - openai_api_key: OpenAI API key. If None, uses settings.openai_api_key
31   - clip_server_url: CLIP server URL. If None, uses settings.clip_server_url
32   - """
33   - # Initialize OpenAI client for text embeddings
34   - self.openai_api_key = openai_api_key or settings.openai_api_key
35   - self.openai_client = OpenAI(api_key=self.openai_api_key)
36   - self.text_embedding_model = settings.openai_embedding_model
37   -
38   - # Initialize CLIP client for image embeddings
39   - self.clip_server_url = clip_server_url or settings.clip_server_url
40   - self.clip_client: Optional[ClipClient] = None
41   -
42   - logger.info("Embedding service initialized")
43   -
44   - def connect_clip(self) -> None:
45   - """Connect to CLIP server"""
46   - try:
47   - self.clip_client = ClipClient(server=self.clip_server_url)
48   - logger.info(f"Connected to CLIP server at {self.clip_server_url}")
49   - except Exception as e:
50   - logger.error(f"Failed to connect to CLIP server: {e}")
51   - raise
52   -
53   - def disconnect_clip(self) -> None:
54   - """Disconnect from CLIP server"""
55   - if self.clip_client:
56   - # Note: clip_client doesn't have explicit close method
57   - self.clip_client = None
58   - logger.info("Disconnected from CLIP server")
59   -
60   - def get_text_embedding(self, text: str) -> List[float]:
61   - """Get embedding for a single text
62   -
63   - Args:
64   - text: Input text
65   -
66   - Returns:
67   - Embedding vector as list of floats
68   - """
69   - try:
70   - response = self.openai_client.embeddings.create(
71   - input=text, model=self.text_embedding_model
72   - )
73   - embedding = response.data[0].embedding
74   - logger.debug(f"Generated text embedding for: {text[:50]}...")
75   - return embedding
76   - except Exception as e:
77   - logger.error(f"Failed to generate text embedding: {e}")
78   - raise
79   -
80   - def get_text_embeddings_batch(
81   - self, texts: List[str], batch_size: int = 100
82   - ) -> List[List[float]]:
83   - """Get embeddings for multiple texts in batches
84   -
85   - Args:
86   - texts: List of input texts
87   - batch_size: Number of texts to process at once
88   -
89   - Returns:
90   - List of embedding vectors
91   - """
92   - all_embeddings = []
93   -
94   - for i in range(0, len(texts), batch_size):
95   - batch = texts[i : i + batch_size]
96   -
97   - try:
98   - response = self.openai_client.embeddings.create(
99   - input=batch, model=self.text_embedding_model
100   - )
101   -
102   - # Extract embeddings in the correct order
103   - embeddings = [item.embedding for item in response.data]
104   - all_embeddings.extend(embeddings)
105   -
106   - logger.info(
107   - f"Generated text embeddings for batch {i // batch_size + 1}: {len(embeddings)} embeddings"
108   - )
109   -
110   - except Exception as e:
111   - logger.error(
112   - f"Failed to generate text embeddings for batch {i // batch_size + 1}: {e}"
113   - )
114   - raise
115   -
116   - return all_embeddings
117   -
118   - def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]:
119   - """Get CLIP embedding for a single image
120   -
121   - Args:
122   - image_path: Path to image file
123   -
124   - Returns:
125   - Embedding vector as list of floats
126   - """
127   - if not self.clip_client:
128   - raise RuntimeError("CLIP client not connected. Call connect_clip() first.")
129   -
130   - image_path = Path(image_path)
131   - if not image_path.exists():
132   - raise FileNotFoundError(f"Image not found: {image_path}")
133   -
134   - try:
135   - # Get embedding from CLIP server using image path (as string)
136   - result = self.clip_client.encode([str(image_path)])
137   -
138   - # Extract embedding - result is numpy array
139   - import numpy as np
140   -
141   - if isinstance(result, np.ndarray):
142   - # If result is numpy array, use first element
143   - embedding = (
144   - result[0].tolist() if len(result.shape) > 1 else result.tolist()
145   - )
146   - else:
147   - # If result is DocumentArray
148   - embedding = result[0].embedding.tolist()
149   -
150   - logger.debug(f"Generated image embedding for: {image_path.name}")
151   - return embedding
152   -
153   - except Exception as e:
154   - logger.error(f"Failed to generate image embedding for {image_path}: {e}")
155   - raise
156   -
157   - def get_image_embeddings_batch(
158   - self, image_paths: List[Union[str, Path]], batch_size: int = 32
159   - ) -> List[Optional[List[float]]]:
160   - """Get CLIP embeddings for multiple images in batches
161   -
162   - Args:
163   - image_paths: List of paths to image files
164   - batch_size: Number of images to process at once
165   -
166   - Returns:
167   - List of embedding vectors (None for failed images)
168   - """
169   - if not self.clip_client:
170   - raise RuntimeError("CLIP client not connected. Call connect_clip() first.")
171   -
172   - all_embeddings = []
173   -
174   - for i in range(0, len(image_paths), batch_size):
175   - batch_paths = image_paths[i : i + batch_size]
176   - valid_paths = []
177   - valid_indices = []
178   -
179   - # Check which images exist
180   - for idx, path in enumerate(batch_paths):
181   - path = Path(path)
182   - if path.exists():
183   - valid_paths.append(str(path))
184   - valid_indices.append(idx)
185   - else:
186   - logger.warning(f"Image not found: {path}")
187   -
188   - # Get embeddings for valid images
189   - if valid_paths:
190   - try:
191   - # Send paths as strings to CLIP server
192   - result = self.clip_client.encode(valid_paths)
193   -
194   - # Create embeddings list with None for missing images
195   - batch_embeddings = [None] * len(batch_paths)
196   -
197   - # Handle result format - could be numpy array or DocumentArray
198   - import numpy as np
199   -
200   - if isinstance(result, np.ndarray):
201   - # Result is numpy array - shape (n_images, embedding_dim)
202   - for idx in range(len(result)):
203   - original_idx = valid_indices[idx]
204   - batch_embeddings[original_idx] = result[idx].tolist()
205   - else:
206   - # Result is DocumentArray
207   - for idx, doc in enumerate(result):
208   - original_idx = valid_indices[idx]
209   - batch_embeddings[original_idx] = doc.embedding.tolist()
210   -
211   - all_embeddings.extend(batch_embeddings)
212   -
213   - logger.info(
214   - f"Generated image embeddings for batch {i // batch_size + 1}: "
215   - f"{len(valid_paths)}/{len(batch_paths)} successful"
216   - )
217   -
218   - except Exception as e:
219   - logger.error(
220   - f"Failed to generate image embeddings for batch {i // batch_size + 1}: {e}"
221   - )
222   - # Add None for all images in failed batch
223   - all_embeddings.extend([None] * len(batch_paths))
224   - else:
225   - # All images in batch failed to load
226   - all_embeddings.extend([None] * len(batch_paths))
227   -
228   - return all_embeddings
229   -
230   - def get_text_embedding_from_image(
231   - self, image_path: Union[str, Path]
232   - ) -> List[float]:
233   - """Get text-based embedding by describing the image
234   - This is useful for cross-modal search
235   -
236   - Note: This is a placeholder for future implementation
237   - that could use vision models to generate text descriptions
238   -
239   - Args:
240   - image_path: Path to image file
241   -
242   - Returns:
243   - Text embedding vector
244   - """
245   - # For now, we just return the image embedding
246   - # In the future, this could use a vision-language model to generate
247   - # a text description and then embed that
248   - raise NotImplementedError("Text embedding from image not yet implemented")
249   -
250   - def cosine_similarity(
251   - self, embedding1: List[float], embedding2: List[float]
252   - ) -> float:
253   - """Calculate cosine similarity between two embeddings
254   -
255   - Args:
256   - embedding1: First embedding vector
257   - embedding2: Second embedding vector
258   -
259   - Returns:
260   - Cosine similarity score (0-1)
261   - """
262   - vec1 = np.array(embedding1)
263   - vec2 = np.array(embedding2)
264   -
265   - # Normalize vectors
266   - vec1_norm = vec1 / np.linalg.norm(vec1)
267   - vec2_norm = vec2 / np.linalg.norm(vec2)
268   -
269   - # Calculate cosine similarity
270   - similarity = np.dot(vec1_norm, vec2_norm)
271   -
272   - return float(similarity)
273   -
274   - def get_embedding_dimensions(self) -> dict:
275   - """Get the dimensions of text and image embeddings
276   -
277   - Returns:
278   - Dictionary with text_dim and image_dim
279   - """
280   - return {"text_dim": settings.text_dim, "image_dim": settings.image_dim}
281   -
282   -
283   -# Global instance
284   -_embedding_service: Optional[EmbeddingService] = None
285   -
286   -
287   -def get_embedding_service() -> EmbeddingService:
288   - """Get or create the global embedding service instance"""
289   - global _embedding_service
290   - if _embedding_service is None:
291   - _embedding_service = EmbeddingService()
292   - _embedding_service.connect_clip()
293   - return _embedding_service
app/services/milvus_service.py deleted
... ... @@ -1,480 +0,0 @@
1   -"""
2   -Milvus Service for Vector Storage and Similarity Search
3   -Manages text and image embeddings in separate collections
4   -"""
5   -
6   -import logging
7   -from typing import Any, Dict, List, Optional
8   -
9   -from pymilvus import (
10   - DataType,
11   - MilvusClient,
12   -)
13   -
14   -from app.config import settings
15   -
16   -logger = logging.getLogger(__name__)
17   -
18   -
19   -class MilvusService:
20   - """Service for managing vector embeddings in Milvus"""
21   -
22   - def __init__(self, uri: Optional[str] = None):
23   - """Initialize Milvus service
24   -
25   - Args:
26   - uri: Milvus connection URI. If None, uses settings.milvus_uri
27   - """
28   - if uri:
29   - self.uri = uri
30   - else:
31   - # Use absolute path for Milvus Lite
32   - self.uri = settings.milvus_uri_absolute
33   - self.text_collection_name = settings.text_collection_name
34   - self.image_collection_name = settings.image_collection_name
35   - self.text_dim = settings.text_dim
36   - self.image_dim = settings.image_dim
37   -
38   - # Use MilvusClient for simplified operations
39   - self._client: Optional[MilvusClient] = None
40   -
41   - logger.info(f"Initializing Milvus service with URI: {self.uri}")
42   -
43   - def is_connected(self) -> bool:
44   - """Check if connected to Milvus"""
45   - return self._client is not None
46   -
47   - def connect(self) -> None:
48   - """Connect to Milvus"""
49   - if self.is_connected():
50   - return
51   - try:
52   - self._client = MilvusClient(uri=self.uri)
53   - logger.info(f"Connected to Milvus at {self.uri}")
54   - except Exception as e:
55   - logger.error(f"Failed to connect to Milvus: {e}")
56   - raise
57   -
58   - def disconnect(self) -> None:
59   - """Disconnect from Milvus"""
60   - if self._client:
61   - self._client.close()
62   - self._client = None
63   - logger.info("Disconnected from Milvus")
64   -
65   - @property
66   - def client(self) -> MilvusClient:
67   - """Get the Milvus client"""
68   - if not self._client:
69   - raise RuntimeError("Milvus not connected. Call connect() first.")
70   - return self._client
71   -
72   - def create_text_collection(self, recreate: bool = False) -> None:
73   - """Create collection for text embeddings with product metadata
74   -
75   - Args:
76   - recreate: If True, drop existing collection and recreate
77   - """
78   - if recreate and self.client.has_collection(self.text_collection_name):
79   - self.client.drop_collection(self.text_collection_name)
80   - logger.info(f"Dropped existing collection: {self.text_collection_name}")
81   -
82   - if self.client.has_collection(self.text_collection_name):
83   - logger.info(f"Text collection already exists: {self.text_collection_name}")
84   - return
85   -
86   - # Create collection with schema (includes metadata fields)
87   - schema = MilvusClient.create_schema(
88   - auto_id=False,
89   - enable_dynamic_field=True, # Allow additional metadata fields
90   - )
91   -
92   - # Core fields
93   - schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
94   - schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=2000)
95   - schema.add_field(
96   - field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.text_dim
97   - )
98   -
99   - # Product metadata fields
100   - schema.add_field(
101   - field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500
102   - )
103   - schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50)
104   - schema.add_field(
105   - field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100
106   - )
107   - schema.add_field(
108   - field_name="subCategory", datatype=DataType.VARCHAR, max_length=100
109   - )
110   - schema.add_field(
111   - field_name="articleType", datatype=DataType.VARCHAR, max_length=100
112   - )
113   - schema.add_field(
114   - field_name="baseColour", datatype=DataType.VARCHAR, max_length=50
115   - )
116   - schema.add_field(field_name="season", datatype=DataType.VARCHAR, max_length=50)
117   - schema.add_field(field_name="usage", datatype=DataType.VARCHAR, max_length=50)
118   -
119   - # Create index parameters
120   - index_params = self.client.prepare_index_params()
121   - index_params.add_index(
122   - field_name="embedding",
123   - index_type="AUTOINDEX",
124   - metric_type="COSINE",
125   - )
126   -
127   - # Create collection
128   - self.client.create_collection(
129   - collection_name=self.text_collection_name,
130   - schema=schema,
131   - index_params=index_params,
132   - )
133   -
134   - logger.info(
135   - f"Created text collection with metadata: {self.text_collection_name}"
136   - )
137   -
138   - def create_image_collection(self, recreate: bool = False) -> None:
139   - """Create collection for image embeddings with product metadata
140   -
141   - Args:
142   - recreate: If True, drop existing collection and recreate
143   - """
144   - if recreate and self.client.has_collection(self.image_collection_name):
145   - self.client.drop_collection(self.image_collection_name)
146   - logger.info(f"Dropped existing collection: {self.image_collection_name}")
147   -
148   - if self.client.has_collection(self.image_collection_name):
149   - logger.info(
150   - f"Image collection already exists: {self.image_collection_name}"
151   - )
152   - return
153   -
154   - # Create collection with schema (includes metadata fields)
155   - schema = MilvusClient.create_schema(
156   - auto_id=False,
157   - enable_dynamic_field=True, # Allow additional metadata fields
158   - )
159   -
160   - # Core fields
161   - schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
162   - schema.add_field(
163   - field_name="image_path", datatype=DataType.VARCHAR, max_length=500
164   - )
165   - schema.add_field(
166   - field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.image_dim
167   - )
168   -
169   - # Product metadata fields
170   - schema.add_field(
171   - field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500
172   - )
173   - schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50)
174   - schema.add_field(
175   - field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100
176   - )
177   - schema.add_field(
178   - field_name="subCategory", datatype=DataType.VARCHAR, max_length=100
179   - )
180   - schema.add_field(
181   - field_name="articleType", datatype=DataType.VARCHAR, max_length=100
182   - )
183   - schema.add_field(
184   - field_name="baseColour", datatype=DataType.VARCHAR, max_length=50
185   - )
186   - schema.add_field(field_name="season", datatype=DataType.VARCHAR, max_length=50)
187   - schema.add_field(field_name="usage", datatype=DataType.VARCHAR, max_length=50)
188   -
189   - # Create index parameters
190   - index_params = self.client.prepare_index_params()
191   - index_params.add_index(
192   - field_name="embedding",
193   - index_type="AUTOINDEX",
194   - metric_type="COSINE",
195   - )
196   -
197   - # Create collection
198   - self.client.create_collection(
199   - collection_name=self.image_collection_name,
200   - schema=schema,
201   - index_params=index_params,
202   - )
203   -
204   - logger.info(
205   - f"Created image collection with metadata: {self.image_collection_name}"
206   - )
207   -
208   - def insert_text_embeddings(
209   - self,
210   - embeddings: List[Dict[str, Any]],
211   - ) -> int:
212   - """Insert text embeddings with metadata into collection
213   -
214   - Args:
215   - embeddings: List of dictionaries with keys:
216   - - id: unique ID (product ID)
217   - - text: the text that was embedded
218   - - embedding: the embedding vector
219   - - productDisplayName, gender, masterCategory, etc. (metadata)
220   -
221   - Returns:
222   - Number of inserted embeddings
223   - """
224   - if not embeddings:
225   - return 0
226   -
227   - try:
228   - # Insert data directly (all fields including metadata)
229   - # Milvus will accept all fields defined in schema + dynamic fields
230   - data = embeddings
231   -
232   - # Insert data
233   - result = self.client.insert(
234   - collection_name=self.text_collection_name,
235   - data=data,
236   - )
237   -
238   - logger.info(f"Inserted {len(data)} text embeddings")
239   - return len(data)
240   -
241   - except Exception as e:
242   - logger.error(f"Failed to insert text embeddings: {e}")
243   - raise
244   -
245   - def insert_image_embeddings(
246   - self,
247   - embeddings: List[Dict[str, Any]],
248   - ) -> int:
249   - """Insert image embeddings with metadata into collection
250   -
251   - Args:
252   - embeddings: List of dictionaries with keys:
253   - - id: unique ID (product ID)
254   - - image_path: path to the image file
255   - - embedding: the embedding vector
256   - - productDisplayName, gender, masterCategory, etc. (metadata)
257   -
258   - Returns:
259   - Number of inserted embeddings
260   - """
261   - if not embeddings:
262   - return 0
263   -
264   - try:
265   - # Insert data directly (all fields including metadata)
266   - # Milvus will accept all fields defined in schema + dynamic fields
267   - data = embeddings
268   -
269   - # Insert data
270   - result = self.client.insert(
271   - collection_name=self.image_collection_name,
272   - data=data,
273   - )
274   -
275   - logger.info(f"Inserted {len(data)} image embeddings")
276   - return len(data)
277   -
278   - except Exception as e:
279   - logger.error(f"Failed to insert image embeddings: {e}")
280   - raise
281   -
282   - def search_similar_text(
283   - self,
284   - query_embedding: List[float],
285   - limit: int = 10,
286   - filters: Optional[str] = None,
287   - output_fields: Optional[List[str]] = None,
288   - ) -> List[Dict[str, Any]]:
289   - """Search for similar text embeddings
290   -
291   - Args:
292   - query_embedding: Query embedding vector
293   - limit: Maximum number of results
294   - filters: Filter expression (e.g., "product_id in [1, 2, 3]")
295   - output_fields: List of fields to return
296   -
297   - Returns:
298   - List of search results with fields:
299   - - id: embedding ID
300   - - distance: similarity distance
301   - - entity: the matched entity with requested fields
302   - """
303   - try:
304   - if output_fields is None:
305   - output_fields = [
306   - "id",
307   - "text",
308   - "productDisplayName",
309   - "gender",
310   - "masterCategory",
311   - "subCategory",
312   - "articleType",
313   - "baseColour",
314   - ]
315   -
316   - search_params = {}
317   - if filters:
318   - search_params["expr"] = filters
319   -
320   - results = self.client.search(
321   - collection_name=self.text_collection_name,
322   - data=[query_embedding],
323   - limit=limit,
324   - output_fields=output_fields,
325   - search_params=search_params,
326   - )
327   -
328   - # Format results
329   - formatted_results = []
330   - if results and len(results) > 0:
331   - for hit in results[0]:
332   - result = {"id": hit.get("id"), "distance": hit.get("distance")}
333   - # Extract fields from entity
334   - entity = hit.get("entity", {})
335   - for field in output_fields:
336   - if field in entity:
337   - result[field] = entity.get(field)
338   - formatted_results.append(result)
339   -
340   - logger.debug(f"Found {len(formatted_results)} similar text embeddings")
341   - return formatted_results
342   -
343   - except Exception as e:
344   - logger.error(f"Failed to search similar text: {e}")
345   - raise
346   -
347   - def search_similar_images(
348   - self,
349   - query_embedding: List[float],
350   - limit: int = 10,
351   - filters: Optional[str] = None,
352   - output_fields: Optional[List[str]] = None,
353   - ) -> List[Dict[str, Any]]:
354   - """Search for similar image embeddings
355   -
356   - Args:
357   - query_embedding: Query embedding vector
358   - limit: Maximum number of results
359   - filters: Filter expression (e.g., "product_id in [1, 2, 3]")
360   - output_fields: List of fields to return
361   -
362   - Returns:
363   - List of search results with fields:
364   - - id: embedding ID
365   - - distance: similarity distance
366   - - entity: the matched entity with requested fields
367   - """
368   - try:
369   - if output_fields is None:
370   - output_fields = [
371   - "id",
372   - "image_path",
373   - "productDisplayName",
374   - "gender",
375   - "masterCategory",
376   - "subCategory",
377   - "articleType",
378   - "baseColour",
379   - ]
380   -
381   - search_params = {}
382   - if filters:
383   - search_params["expr"] = filters
384   -
385   - results = self.client.search(
386   - collection_name=self.image_collection_name,
387   - data=[query_embedding],
388   - limit=limit,
389   - output_fields=output_fields,
390   - search_params=search_params,
391   - )
392   -
393   - # Format results
394   - formatted_results = []
395   - if results and len(results) > 0:
396   - for hit in results[0]:
397   - result = {"id": hit.get("id"), "distance": hit.get("distance")}
398   - # Extract fields from entity
399   - entity = hit.get("entity", {})
400   - for field in output_fields:
401   - if field in entity:
402   - result[field] = entity.get(field)
403   - formatted_results.append(result)
404   -
405   - logger.debug(f"Found {len(formatted_results)} similar image embeddings")
406   - return formatted_results
407   -
408   - except Exception as e:
409   - logger.error(f"Failed to search similar images: {e}")
410   - raise
411   -
412   - def get_collection_stats(self, collection_name: str) -> Dict[str, Any]:
413   - """Get statistics for a collection
414   -
415   - Args:
416   - collection_name: Name of the collection
417   -
418   - Returns:
419   - Dictionary with collection statistics
420   - """
421   - try:
422   - stats = self.client.get_collection_stats(collection_name)
423   - return {
424   - "collection_name": collection_name,
425   - "row_count": stats.get("row_count", 0),
426   - }
427   - except Exception as e:
428   - logger.error(f"Failed to get collection stats: {e}")
429   - return {"collection_name": collection_name, "row_count": 0}
430   -
431   - def delete_by_ids(self, collection_name: str, ids: List[int]) -> int:
432   - """Delete embeddings by IDs
433   -
434   - Args:
435   - collection_name: Name of the collection
436   - ids: List of IDs to delete
437   -
438   - Returns:
439   - Number of deleted embeddings
440   - """
441   - if not ids:
442   - return 0
443   -
444   - try:
445   - self.client.delete(
446   - collection_name=collection_name,
447   - ids=ids,
448   - )
449   - logger.info(f"Deleted {len(ids)} embeddings from {collection_name}")
450   - return len(ids)
451   - except Exception as e:
452   - logger.error(f"Failed to delete embeddings: {e}")
453   - raise
454   -
455   - def clear_collection(self, collection_name: str) -> None:
456   - """Clear all data from a collection
457   -
458   - Args:
459   - collection_name: Name of the collection
460   - """
461   - try:
462   - if self.client.has_collection(collection_name):
463   - self.client.drop_collection(collection_name)
464   - logger.info(f"Dropped collection: {collection_name}")
465   - except Exception as e:
466   - logger.error(f"Failed to clear collection: {e}")
467   - raise
468   -
469   -
470   -# Global instance
471   -_milvus_service: Optional[MilvusService] = None
472   -
473   -
474   -def get_milvus_service() -> MilvusService:
475   - """Get or create the global Milvus service instance"""
476   - global _milvus_service
477   - if _milvus_service is None:
478   - _milvus_service = MilvusService()
479   - _milvus_service.connect()
480   - return _milvus_service
app/tools/__init__.py
... ... @@ -5,13 +5,11 @@ LangChain Tools for Product Search and Discovery
5 5 from app.tools.search_tools import (
6 6 analyze_image_style,
7 7 get_all_tools,
8   - search_by_image,
9 8 search_products,
10 9 )
11 10  
12 11 __all__ = [
13 12 "search_products",
14   - "search_by_image",
15 13 "analyze_image_style",
16 14 "get_all_tools",
17 15 ]
... ...
app/tools/search_tools.py
1 1 """
2 2 Search Tools for Product Discovery
3   -Provides text-based, image-based, and VLM reasoning capabilities
  3 +Provides text-based search via Search API and VLM style analysis
4 4 """
5 5  
6 6 import base64
... ... @@ -8,40 +8,24 @@ import logging
8 8 from pathlib import Path
9 9 from typing import Optional
10 10  
  11 +import requests
11 12 from langchain_core.tools import tool
12 13 from openai import OpenAI
13 14  
14 15 from app.config import settings
15   -from app.services.embedding_service import EmbeddingService
16   -from app.services.milvus_service import MilvusService
17 16  
18 17 logger = logging.getLogger(__name__)
19 18  
20   -# Initialize services as singletons
21   -_embedding_service: Optional[EmbeddingService] = None
22   -_milvus_service: Optional[MilvusService] = None
23 19 _openai_client: Optional[OpenAI] = None
24 20  
25 21  
26   -def get_embedding_service() -> EmbeddingService:
27   - global _embedding_service
28   - if _embedding_service is None:
29   - _embedding_service = EmbeddingService()
30   - return _embedding_service
31   -
32   -
33   -def get_milvus_service() -> MilvusService:
34   - global _milvus_service
35   - if _milvus_service is None:
36   - _milvus_service = MilvusService()
37   - _milvus_service.connect()
38   - return _milvus_service
39   -
40   -
41 22 def get_openai_client() -> OpenAI:
42 23 global _openai_client
43 24 if _openai_client is None:
44   - _openai_client = OpenAI(api_key=settings.openai_api_key)
  25 + kwargs = {"api_key": settings.openai_api_key}
  26 + if settings.openai_api_base_url:
  27 + kwargs["base_url"] = settings.openai_api_base_url
  28 + _openai_client = OpenAI(**kwargs)
45 29 return _openai_client
46 30  
47 31  
... ... @@ -64,30 +48,26 @@ def search_products(query: str, limit: int = 5) -> str:
64 48 try:
65 49 logger.info(f"Searching products: '{query}', limit: {limit}")
66 50  
67   - embedding_service = get_embedding_service()
68   - milvus_service = get_milvus_service()
69   -
70   - if not milvus_service.is_connected():
71   - milvus_service.connect()
72   -
73   - query_embedding = embedding_service.get_text_embedding(query)
74   -
75   - results = milvus_service.search_similar_text(
76   - query_embedding=query_embedding,
77   - limit=min(limit, 20),
78   - filters=None,
79   - output_fields=[
80   - "id",
81   - "productDisplayName",
82   - "gender",
83   - "masterCategory",
84   - "subCategory",
85   - "articleType",
86   - "baseColour",
87   - "season",
88   - "usage",
89   - ],
90   - )
  51 + url = f"{settings.search_api_base_url.rstrip('/')}/search/"
  52 + headers = {
  53 + "Content-Type": "application/json",
  54 + "X-Tenant-ID": settings.search_api_tenant_id,
  55 + }
  56 + payload = {
  57 + "query": query,
  58 + "size": min(limit, 20),
  59 + "from": 0,
  60 + "language": "zh",
  61 + }
  62 +
  63 + response = requests.post(url, json=payload, headers=headers, timeout=60)
  64 +
  65 + if response.status_code != 200:
  66 + logger.error(f"Search API error: {response.status_code} - {response.text}")
  67 + return f"Error searching products: API returned {response.status_code}"
  68 +
  69 + data = response.json()
  70 + results = data.get("results", [])
91 71  
92 72 if not results:
93 73 return "No products found matching your search."
... ... @@ -95,131 +75,40 @@ def search_products(query: str, limit: int = 5) -> str:
95 75 output = f"Found {len(results)} product(s):\n\n"
96 76  
97 77 for idx, product in enumerate(results, 1):
98   - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n"
99   - output += f" ID: {product.get('id', 'N/A')}\n"
100   - output += f" Category: {product.get('masterCategory', 'N/A')} > {product.get('subCategory', 'N/A')} > {product.get('articleType', 'N/A')}\n"
101   - output += f" Color: {product.get('baseColour', 'N/A')}\n"
102   - output += f" Gender: {product.get('gender', 'N/A')}\n"
103   -
104   - if product.get("season"):
105   - output += f" Season: {product.get('season')}\n"
106   - if product.get("usage"):
107   - output += f" Usage: {product.get('usage')}\n"
108   -
109   - if "distance" in product:
110   - similarity = 1 - product["distance"]
111   - output += f" Relevance: {similarity:.2%}\n"
  78 + output += f"{idx}. {product.get('title', 'Unknown Product')}\n"
  79 + output += f" ID: {product.get('spu_id', 'N/A')}\n"
  80 + output += f" Category: {product.get('category_path', product.get('category_name', 'N/A'))}\n"
  81 + if product.get("vendor"):
  82 + output += f" Brand: {product.get('vendor')}\n"
  83 + if product.get("price") is not None:
  84 + output += f" Price: {product.get('price')}\n"
  85 +
  86 + # 规格/颜色信息
  87 + specs = product.get("specifications", [])
  88 + if specs:
  89 + color_spec = next(
  90 + (s for s in specs if s.get("name") == "color"),
  91 + None,
  92 + )
  93 + if color_spec:
  94 + output += f" Color: {color_spec.get('value', 'N/A')}\n"
  95 +
  96 + if product.get("relevance_score") is not None:
  97 + output += f" Relevance: {product['relevance_score']:.2f}\n"
112 98  
113 99 output += "\n"
114 100  
115 101 return output.strip()
116 102  
  103 + except requests.exceptions.RequestException as e:
  104 + logger.error(f"Error searching products (network): {e}", exc_info=True)
  105 + return f"Error searching products: {str(e)}"
117 106 except Exception as e:
118 107 logger.error(f"Error searching products: {e}", exc_info=True)
119 108 return f"Error searching products: {str(e)}"
120 109  
121 110  
122 111 @tool
123   -def search_by_image(image_path: str, limit: int = 5) -> str:
124   - """Find similar fashion products using an image.
125   -
126   - Use when users want visually similar items:
127   - - User uploads an image and asks "find similar items"
128   - - "Show me products that look like this"
129   -
130   - Args:
131   - image_path: Path to the image file
132   - limit: Maximum number of results (1-20)
133   -
134   - Returns:
135   - Formatted string with similar products
136   - """
137   - try:
138   - logger.info(f"Image search: '{image_path}', limit: {limit}")
139   -
140   - img_path = Path(image_path)
141   - if not img_path.exists():
142   - return f"Error: Image file not found at '{image_path}'"
143   -
144   - embedding_service = get_embedding_service()
145   - milvus_service = get_milvus_service()
146   -
147   - if not milvus_service.is_connected():
148   - milvus_service.connect()
149   -
150   - if (
151   - not hasattr(embedding_service, "clip_client")
152   - or embedding_service.clip_client is None
153   - ):
154   - embedding_service.connect_clip()
155   -
156   - image_embedding = embedding_service.get_image_embedding(image_path)
157   -
158   - if image_embedding is None:
159   - return "Error: Failed to generate embedding for image"
160   -
161   - results = milvus_service.search_similar_images(
162   - query_embedding=image_embedding,
163   - limit=min(limit + 1, 21),
164   - filters=None,
165   - output_fields=[
166   - "id",
167   - "image_path",
168   - "productDisplayName",
169   - "gender",
170   - "masterCategory",
171   - "subCategory",
172   - "articleType",
173   - "baseColour",
174   - "season",
175   - "usage",
176   - ],
177   - )
178   -
179   - if not results:
180   - return "No similar products found."
181   -
182   - # Filter out the query image itself
183   - query_id = img_path.stem
184   - filtered_results = []
185   - for result in results:
186   - result_path = result.get("image_path", "")
187   - if Path(result_path).stem != query_id:
188   - filtered_results.append(result)
189   - if len(filtered_results) >= limit:
190   - break
191   -
192   - if not filtered_results:
193   - return "No similar products found."
194   -
195   - output = f"Found {len(filtered_results)} visually similar product(s):\n\n"
196   -
197   - for idx, product in enumerate(filtered_results, 1):
198   - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n"
199   - output += f" ID: {product.get('id', 'N/A')}\n"
200   - output += f" Category: {product.get('masterCategory', 'N/A')} > {product.get('subCategory', 'N/A')} > {product.get('articleType', 'N/A')}\n"
201   - output += f" Color: {product.get('baseColour', 'N/A')}\n"
202   - output += f" Gender: {product.get('gender', 'N/A')}\n"
203   -
204   - if product.get("season"):
205   - output += f" Season: {product.get('season')}\n"
206   - if product.get("usage"):
207   - output += f" Usage: {product.get('usage')}\n"
208   -
209   - if "distance" in product:
210   - similarity = 1 - product["distance"]
211   - output += f" Visual Similarity: {similarity:.2%}\n"
212   -
213   - output += "\n"
214   -
215   - return output.strip()
216   -
217   - except Exception as e:
218   - logger.error(f"Error in image search: {e}", exc_info=True)
219   - return f"Error searching by image: {str(e)}"
220   -
221   -
222   -@tool
223 112 def analyze_image_style(image_path: str) -> str:
224 113 """Analyze a fashion product image using AI vision to extract detailed style information.
225 114  
... ... @@ -291,4 +180,4 @@ Provide a comprehensive yet concise description (3-4 sentences)."""
291 180  
292 181 def get_all_tools():
293 182 """Get all available tools for the agent"""
294   - return [search_products, search_by_image, analyze_image_style]
  183 + return [search_products, analyze_image_style]
... ...
docker-compose.yml deleted
... ... @@ -1,76 +0,0 @@
1   -version: '3.5'
2   -
3   -services:
4   - etcd:
5   - container_name: milvus-etcd
6   - image: quay.io/coreos/etcd:v3.5.5
7   - environment:
8   - - ETCD_AUTO_COMPACTION_MODE=revision
9   - - ETCD_AUTO_COMPACTION_RETENTION=1000
10   - - ETCD_QUOTA_BACKEND_BYTES=4294967296
11   - - ETCD_SNAPSHOT_COUNT=50000
12   - volumes:
13   - - ./volumes/etcd:/etcd
14   - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
15   - healthcheck:
16   - test: ["CMD", "etcdctl", "endpoint", "health"]
17   - interval: 30s
18   - timeout: 20s
19   - retries: 3
20   -
21   - minio:
22   - container_name: milvus-minio
23   - image: minio/minio:RELEASE.2023-03-20T20-16-18Z
24   - environment:
25   - MINIO_ACCESS_KEY: minioadmin
26   - MINIO_SECRET_KEY: minioadmin
27   - ports:
28   - - "9001:9001"
29   - - "9000:9000"
30   - volumes:
31   - - ./volumes/minio:/minio_data
32   - command: minio server /minio_data --console-address ":9001"
33   - healthcheck:
34   - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
35   - interval: 30s
36   - timeout: 20s
37   - retries: 3
38   -
39   - standalone:
40   - container_name: milvus-standalone
41   - image: milvusdb/milvus:v2.4.0
42   - command: ["milvus", "run", "standalone"]
43   - security_opt:
44   - - seccomp:unconfined
45   - environment:
46   - ETCD_ENDPOINTS: etcd:2379
47   - MINIO_ADDRESS: minio:9000
48   - volumes:
49   - - ./volumes/milvus:/var/lib/milvus
50   - healthcheck:
51   - test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
52   - interval: 30s
53   - start_period: 90s
54   - timeout: 20s
55   - retries: 3
56   - ports:
57   - - "19530:19530"
58   - - "9091:9091"
59   - depends_on:
60   - - "etcd"
61   - - "minio"
62   -
63   - attu:
64   - container_name: milvus-attu
65   - image: zilliz/attu:v2.4
66   - environment:
67   - MILVUS_URL: milvus-standalone:19530
68   - ports:
69   - - "8000:3000"
70   - depends_on:
71   - - "standalone"
72   -
73   -networks:
74   - default:
75   - name: milvus
76   -
docs/DEPLOY_CENTOS8.md
1   -# OmniShopAgent centOS 8 部署指南
  1 +# OmniShopAgent CentOS 8 部署指南
2 2  
3 3 ## 一、环境要求
4 4  
... ... @@ -6,8 +6,8 @@
6 6 |------|------|
7 7 | 操作系统 | CentOS 8.x |
8 8 | Python | 3.12+(LangChain 1.x 要求 3.10+) |
9   -| 内存 | 建议 8GB+(Milvus + CLIP 较占内存) |
10   -| 磁盘 | 建议 20GB+(含数据集) |
  9 +| 内存 | 建议 4GB+ |
  10 +| 磁盘 | 建议 10GB+ |
11 11  
12 12 ## 二、快速部署步骤
13 13  
... ... @@ -21,7 +21,6 @@ chmod +x scripts/*.sh
21 21  
22 22 该脚本会:
23 23 - 安装系统依赖(gcc、openssl-devel 等)
24   -- 安装 Docker(用于 Milvus)
25 24 - 安装 Python 3.12(conda 或源码编译)
26 25 - 创建虚拟环境并安装 requirements.txt
27 26  
... ... @@ -59,17 +58,7 @@ make -j $(nproc)
59 58 sudo make altinstall
60 59 ```
61 60  
62   -#### 步骤 3:安装 Docker
63   -
64   -```bash
65   -sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
66   -sudo dnf install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
67   -sudo systemctl enable docker && sudo systemctl start docker
68   -sudo usermod -aG docker $USER
69   -# 执行 newgrp docker 或重新登录
70   -```
71   -
72   -#### 步骤 4:创建虚拟环境并安装依赖
  61 +#### 步骤 3:创建虚拟环境并安装依赖
73 62  
74 63 ```bash
75 64 cd /path/to/shop_agent
... ... @@ -79,46 +68,35 @@ pip install -U pip
79 68 pip install -r requirements.txt
80 69 ```
81 70  
82   -#### 步骤 5:配置环境变量
  71 +#### 步骤 4:配置环境变量
83 72  
84 73 ```bash
85 74 cp .env.example .env
86 75 # 编辑 .env,至少配置:
87 76 # OPENAI_API_KEY=sk-xxx
88   -# MILVUS_HOST=localhost
89   -# MILVUS_PORT=19530
90   -# CLIP_SERVER_URL=grpc://localhost:51000
  77 +# SEARCH_API_BASE_URL=http://120.76.41.98:6002
  78 +# SEARCH_API_TENANT_ID=162
91 79 ```
92 80  
93   -## 三、数据准备
  81 +## 三、数据准备(可选)
94 82  
95 83 ### 3.1 下载数据集
96 84  
  85 +如需图片风格分析功能,可下载 Kaggle 数据集:
  86 +
97 87 ```bash
98 88 # 需先配置 Kaggle API:~/.kaggle/kaggle.json
99 89 python scripts/download_dataset.py
100 90 ```
101 91  
102   -### 3.2 启动 Milvus 并索引数据
103   -
104   -```bash
105   -# 启动 Milvus
106   -./scripts/run_milvus.sh
107   -
108   -# 等待就绪后,创建索引
109   -python scripts/index_data.py
110   -```
111   -
112 92 ## 四、启动服务
113 93  
114 94 ### 4.1 启动脚本说明
115 95  
116 96 | 脚本 | 用途 |
117 97 |------|------|
118   -| `start.sh` | 主启动脚本:启动 Milvus + Streamlit |
119   -| `stop.sh` | 停止所有服务 |
120   -| `run_milvus.sh` | 仅启动 Milvus |
121   -| `run_clip.sh` | 仅启动 CLIP(图像搜索需此服务) |
  98 +| `start.sh` | 主启动脚本:启动 Streamlit |
  99 +| `stop.sh` | 停止 Streamlit |
122 100 | `check_services.sh` | 健康检查 |
123 101  
124 102 ### 4.2 启动应用
... ... @@ -127,14 +105,7 @@ python scripts/index_data.py
127 105 # 方式 1:使用 start.sh(推荐)
128 106 ./scripts/start.sh
129 107  
130   -# 方式 2:分步启动
131   -# 终端 1:Milvus
132   -./scripts/run_milvus.sh
133   -
134   -# 终端 2:CLIP(图像搜索需要)
135   -./scripts/run_clip.sh
136   -
137   -# 终端 3:Streamlit
  108 +# 方式 2:直接运行
138 109 source venv/bin/activate
139 110 streamlit run app.py --server.port=8501 --server.address=0.0.0.0
140 111 ```
... ... @@ -142,7 +113,6 @@ streamlit run app.py --server.port=8501 --server.address=0.0.0.0
142 113 ### 4.3 访问地址
143 114  
144 115 - **Streamlit 应用**:http://服务器IP:8501
145   -- **Milvus Attu 管理界面**:http://服务器IP:8000
146 116  
147 117 ## 五、生产部署建议
148 118  
... ... @@ -153,7 +123,7 @@ streamlit run app.py --server.port=8501 --server.address=0.0.0.0
153 123 ```ini
154 124 [Unit]
155 125 Description=OmniShopAgent Streamlit App
156   -After=network.target docker.service
  126 +After=network.target
157 127  
158 128 [Service]
159 129 Type=simple
... ... @@ -194,7 +164,6 @@ server {
194 164  
195 165 ```bash
196 166 sudo firewall-cmd --permanent --add-port=8501/tcp
197   -sudo firewall-cmd --permanent --add-port=19530/tcp
198 167 sudo firewall-cmd --reload
199 168 ```
200 169  
... ... @@ -203,14 +172,8 @@ sudo firewall-cmd --reload
203 172 ### Q: Python 3.12 编译失败?
204 173 A: 确保已安装 `openssl-devel`、`libffi-devel`,或直接使用 Miniconda。
205 174  
206   -### Q: Docker 权限不足?
207   -A: 执行 `sudo usermod -aG docker $USER` 后重新登录。
208   -
209   -### Q: Milvus 启动超时?
210   -A: 首次启动需拉取镜像,可能较慢。可检查 `docker compose logs -f standalone`。
211   -
212   -### Q: 图像搜索不可用?
213   -A: 需单独启动 CLIP 服务:`./scripts/run_clip.sh`。
  175 +### Q: Search API 连接失败?
  176 +A: 检查 `.env` 中 `SEARCH_API_BASE_URL` 和 `SEARCH_API_TENANT_ID` 配置,确保网络可访问搜索服务。
214 177  
215 178 ### Q: 健康检查?
216 179 A: 执行 `./scripts/check_services.sh` 查看各组件状态。
... ...
docs/Skills实现方案-LangChain1.0.md
... ... @@ -7,7 +7,7 @@ Agent 在 system prompt 中只看到技能摘要，按需加载详细技能内容
7 7  
8 8 | 技能 | 英文标识 | 职责 |
9 9 |------|----------|------|
10   -| 查找相关商品 | lookup_related | 基于文本/图片查找相似或相关商品 |
  10 +| 查找相关商品 | lookup_related | 基于文本/图片查找相似或相关商品（图片需先分析风格） |
11 11 | 搜索商品 | search_products | 按自然语言描述搜索商品 |
12 12 | 检验商品 | check_product | 检验商品是否符合用户要求 |
13 13 | 结果包装 | result_packaging | 格式化、排序、筛选并呈现结果 |
... ... @@ -24,7 +24,7 @@ Agent 在 system prompt 中只看到技能摘要，按需加载详细技能内容
24 24 | **方式 A：create_agent + 自定义 Skill 中间件** | 购物导购等业务 Agent | `langchain>=1.0`、`langgraph>=1.0` |
25 25 | **方式 B：Deep Agents + SKILL.md** | 依赖文件系统、多技能目录 | `deepagents` |
26 26  
27   -购物导购场景推荐**方式 A**，更易与现有 Milvus、CLIP 等服务集成。
  27 +购物导购场景推荐**方式 A**，更易与现有 Search API 等服务集成。
28 28  
29 29 ### 2.2 核心思路：Progressive Disclosure
30 30  
... ... @@ -58,7 +58,7 @@ class Skill(TypedDict):
58 58 SKILLS: list[Skill] = [
59 59 {
60 60 "name": "lookup_related",
61   - "description": "查找与某商品相关的其他商品，支持以图搜图、文本相似、同品类推荐。",
  61 + "description": "查找与某商品相关的其他商品，支持文本相似、同品类推荐。",
62 62 "content": """# 查找相关商品
63 63  
64 64 ## 适用场景
... ... @@ -67,12 +67,11 @@ SKILLS: list[Skill] = [
67 67 - 用户已有一件商品，想找相关款
68 68  
69 69 ## 操作步骤
70   -1. **有图片**：先调用 `analyze_image_style` 理解风格，再调用 `search_by_image` 或 `search_products`
  70 +1. **有图片**：先调用 `analyze_image_style` 理解风格，再调用 `search_products` 用描述搜索
71 71 2. **无图片**：用 `search_products` 描述品类+风格+颜色
72 72 3. 可结合上下文中的商品 ID、品类做同品类推荐
73 73  
74 74 ## 可用工具
75   -- `search_by_image(image_path, limit)`：以图搜图
76 75 - `search_products(query, limit)`：文本搜索
77 76 - `analyze_image_style(image_path)`：分析图片风格""",
78 77 },
... ... @@ -225,15 +224,14 @@ class ShoppingSkillMiddleware(AgentMiddleware):
225 224 from langchain.agents import create_agent
226 225 from langgraph.checkpoint.memory import MemorySaver
227 226  
228   -# 基础工具（搜索、以图搜图、风格分析等）
229   -from app.tools.search_tools import search_products, search_by_image, analyze_image_style
  227 +# 基础工具（搜索、风格分析等）
  228 +from app.tools.search_tools import search_products, analyze_image_style
230 229  
231 230 agent = create_agent(
232 231 model="gpt-4o-mini",
233 232 tools=[
234 233 load_skill, # 技能加载
235 234 search_products,
236   - search_by_image,
237 235 analyze_image_style,
238 236 ],
239 237 system_prompt="""你是智能时尚购物助手。根据用户需求，先判断使用哪个技能，必要时用 load_skill 加载技能详情。
... ... @@ -250,7 +248,7 @@ agent = create_agent(
250 248  
251 249 | 能力 | 技能 | 工具 |
252 250 |------|------|------|
253   -| 查找相关 | lookup_related | search_by_image, search_products, analyze_image_style |
  251 +| 查找相关 | lookup_related | search_products, analyze_image_style |
254 252 | 搜索商品 | search_products | search_products |
255 253 | 检验商品 | check_product | search_products（用 query 表达约束） |
256 254 | 结果包装 | result_packaging | 无（纯 prompt 约束） |
... ...
技术实现报告.md renamed to docs/技术实现报告.md
... ... @@ -7,7 +7,7 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模
7 7 ### 核心特性
8 8  
9 9 - **自主工具选择与执行**:Agent 根据用户意图自主选择并调用工具
10   -- **多模态搜索**:支持文本搜索 + 图像搜索
  10 +- **文本搜索**:通过 Search API 进行商品搜索
11 11 - **对话上下文感知**:多轮对话中保持上下文记忆
12 12 - **实时视觉分析**:基于 VLM 的图片风格分析
13 13  
... ... @@ -20,9 +20,7 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模
20 20 | 运行环境 | Python 3.12 |
21 21 | Agent 框架 | LangGraph 1.x |
22 22 | LLM 框架 | LangChain 1.x(支持任意 LLM,默认 gpt-4o-mini) |
23   -| 文本向量 | text-embedding-3-small |
24   -| 图像向量 | CLIP ViT-B/32 |
25   -| 向量数据库 | Milvus |
  23 +| 搜索服务 | Search API (HTTP) |
26 24 | 前端 | Streamlit |
27 25 | 数据集 | Kaggle Fashion Products |
28 26  
... ... @@ -45,23 +43,21 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模
45 43 │ │ START → Agent → [Has tool_calls?] → Tools → Agent → END │ │
46 44 │ └───────────────────────────────────────────────────────────┘ │
47 45 └─────────────────────────────────────────────────────────────────┘
48   - │ │ │
49   - ▼ ▼ ▼
50   -┌──────────────┐ ┌──────────────────┐ ┌─────────────────────┐
51   -│ search_ │ │ search_by_image │ │ analyze_image_style │
52   -│ products │ │ │ │ (OpenAI Vision) │
53   -└──────┬───────┘ └────────┬─────────┘ └──────────┬───────────┘
54   - │ │ │
55   - ▼ ▼ ▼
  46 + │ │
  47 + ▼ ▼
  48 +┌──────────────┐ ┌─────────────────────┐
  49 +│ search_ │ │ analyze_image_style │
  50 +│ products │ │ (OpenAI Vision) │
  51 +└──────┬───────┘ └──────────┬──────────┘
  52 + │ │
  53 + ▼ │
  54 +┌──────────────────┐ │
  55 +│ Search API │ │
  56 +│ (HTTP POST) │ │
  57 +└──────────────────┘ │
  58 + ▼
56 59 ┌─────────────────────────────────────────────────────────────────┐
57   -│ EmbeddingService (embedding_service.py) │
58   -│ OpenAI API (文本) │ CLIP Server (图像) │
59   -└─────────────────────────────────────────────────────────────────┘
60   - │
61   - ▼
62   -┌─────────────────────────────────────────────────────────────────┐
63   -│ MilvusService (milvus_service.py) │
64   -│ text_embeddings 集合 │ image_embeddings 集合 │
  60 +│ OpenAI API (VLM 风格分析) │
65 61 └─────────────────────────────────────────────────────────────────┘
66 62 ```
67 63  
... ... @@ -140,12 +136,11 @@ def _build_graph(self):
140 136 ```python
141 137 system_prompt = """You are an intelligent fashion shopping assistant. You can:
142 138 1. Search for products by text description (use search_products)
143   -2. Find visually similar products from images (use search_by_image)
144   -3. Analyze image style and attributes (use analyze_image_style)
  139 +2. Analyze image style and attributes (use analyze_image_style)
145 140  
146 141 When a user asks about products:
147 142 - For text queries: use search_products directly
148   -- For image uploads: decide if you need to analyze_image_style first, then search
  143 +- For image uploads: use analyze_image_style first to understand the product, then use search_products with the extracted description
149 144 - You can call multiple tools in sequence if needed
150 145 - Always provide helpful, friendly responses
151 146  
... ... @@ -198,41 +193,38 @@ def chat(self, query: str, image_path: Optional[str] = None) -> dict:
198 193  
199 194 ### 4.2 搜索工具实现(search_tools.py)
200 195  
201   -#### 4.2.1 文本语义搜索
  196 +#### 4.2.1 文本搜索(Search API)
202 197  
203 198 ```python
204 199 @tool
205 200 def search_products(query: str, limit: int = 5) -> str:
206 201 """Search for fashion products using natural language descriptions."""
207 202 try:
208   - embedding_service = get_embedding_service()
209   - milvus_service = get_milvus_service()
210   -
211   - query_embedding = embedding_service.get_text_embedding(query)
212   -
213   - results = milvus_service.search_similar_text(
214   - query_embedding=query_embedding,
215   - limit=min(limit, 20),
216   - filters=None,
217   - output_fields=[
218   - "id", "productDisplayName", "gender", "masterCategory",
219   - "subCategory", "articleType", "baseColour", "season", "usage",
220   - ],
221   - )
  203 + url = f"{settings.search_api_base_url.rstrip('/')}/search/"
  204 + headers = {
  205 + "Content-Type": "application/json",
  206 + "X-Tenant-ID": settings.search_api_tenant_id,
  207 + }
  208 + payload = {
  209 + "query": query,
  210 + "size": min(limit, 20),
  211 + "from": 0,
  212 + "language": "zh",
  213 + }
  214 +
  215 + response = requests.post(url, json=payload, headers=headers, timeout=60)
  216 + data = response.json()
  217 + results = data.get("results", [])
222 218  
223 219 if not results:
224 220 return "No products found matching your search."
225 221  
226 222 output = f"Found {len(results)} product(s):\n\n"
227 223 for idx, product in enumerate(results, 1):
228   - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n"
229   - output += f" ID: {product.get('id', 'N/A')}\n"
230   - output += f" Category: {product.get('masterCategory')} > {product.get('subCategory')} > {product.get('articleType')}\n"
231   - output += f" Color: {product.get('baseColour')}\n"
232   - output += f" Gender: {product.get('gender')}\n"
233   - if "distance" in product:
234   - similarity = 1 - product["distance"]
235   - output += f" Relevance: {similarity:.2%}\n"
  224 + output += f"{idx}. {product.get('title', 'Unknown Product')}\n"
  225 + output += f" ID: {product.get('spu_id', 'N/A')}\n"
  226 + output += f" Category: {product.get('category_path', 'N/A')}\n"
  227 + output += f" Price: {product.get('price')}\n"
236 228 output += "\n"
237 229  
238 230 return output.strip()
... ... @@ -240,38 +232,7 @@ def search_products(query: str, limit: int = 5) -> str:
240 232 return f"Error searching products: {str(e)}"
241 233 ```
242 234  
243   -#### 4.2.2 图像相似度搜索
244   -
245   -```python
246   -@tool
247   -def search_by_image(image_path: str, limit: int = 5) -> str:
248   - """Find similar fashion products using an image."""
249   - if not Path(image_path).exists():
250   - return f"Error: Image file not found at '{image_path}'"
251   -
252   - embedding_service = get_embedding_service()
253   - milvus_service = get_milvus_service()
254   -
255   - if not embedding_service.clip_client:
256   - embedding_service.connect_clip()
257   -
258   - image_embedding = embedding_service.get_image_embedding(image_path)
259   -
260   - results = milvus_service.search_similar_images(
261   - query_embedding=image_embedding,
262   - limit=min(limit + 1, 21),
263   - output_fields=[...],
264   - )
265   -
266   - # 过滤掉查询图像本身(如上传的是商品库中的图)
267   - query_id = Path(image_path).stem
268   - filtered_results = [r for r in results if Path(r.get("image_path", "")).stem != query_id]
269   - filtered_results = filtered_results[:limit]
270   -
271   -
272   -```
273   -
274   -#### 4.2.3 视觉分析(VLM)
  235 +#### 4.2.2 视觉分析(VLM)
275 236  
276 237 ```python
277 238 @tool
... ... @@ -310,161 +271,9 @@ Provide a comprehensive yet concise description (3-4 sentences)."""
310 271  
311 272 ---
312 273  
313   -### 4.3 向量服务实现
314   -
315   -#### 4.3.1 EmbeddingService(embedding_service.py)
316   -
317   -```python
318   -class EmbeddingService:
319   - def get_text_embedding(self, text: str) -> List[float]:
320   - """OpenAI text-embedding-3-small"""
321   - response = self.openai_client.embeddings.create(
322   - input=text, model=self.text_embedding_model
323   - )
324   - return response.data[0].embedding
325   -
326   - def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]:
327   - """CLIP 图像向量"""
328   - if not self.clip_client:
329   - raise RuntimeError("CLIP client not connected. Call connect_clip() first.")
330   - result = self.clip_client.encode([str(image_path)])
331   - if isinstance(result, np.ndarray):
332   - embedding = result[0].tolist() if len(result.shape) > 1 else result.tolist()
333   - else:
334   - embedding = result[0].embedding.tolist()
335   - return embedding
336   -
337   - def get_text_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
338   - """批量文本嵌入,用于索引"""
339   - for i in range(0, len(texts), batch_size):
340   - batch = texts[i : i + batch_size]
341   - response = self.openai_client.embeddings.create(input=batch, ...)
342   - embeddings = [item.embedding for item in response.data]
343   - all_embeddings.extend(embeddings)
344   - return all_embeddings
345   -```
346   -
347   -#### 4.3.2 MilvusService(milvus_service.py)
  274 +### 4.3 Streamlit 前端(app.py)
348 275  
349   -**文本集合 Schema:**
350   -
351   -```python
352   -schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)
353   -schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
354   -schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=2000)
355   -schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.text_dim) # 1536
356   -schema.add_field(field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500)
357   -schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50)
358   -schema.add_field(field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100)
359   -# ... 更多元数据字段
360   -```
361   -
362   -**图像集合 Schema:**
363   -
364   -```python
365   -schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
366   -schema.add_field(field_name="image_path", datatype=DataType.VARCHAR, max_length=500)
367   -schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.image_dim) # 512
368   -# ... 产品元数据
369   -```
370   -
371   -**相似度搜索:**
372   -
373   -```python
374   -def search_similar_text(self, query_embedding, limit=10, output_fields=None):
375   - results = self.client.search(
376   - collection_name=self.text_collection_name,
377   - data=[query_embedding],
378   - limit=limit,
379   - output_fields=output_fields,
380   - )
381   - formatted_results = []
382   - for hit in results[0]:
383   - result = {"id": hit.get("id"), "distance": hit.get("distance")}
384   - entity = hit.get("entity", {})
385   - for field in output_fields:
386   - if field in entity:
387   - result[field] = entity.get(field)
388   - formatted_results.append(result)
389   - return formatted_results
390   -```
391   -
392   ----
393   -
394   -### 4.4 数据索引脚本(index_data.py)
395   -
396   -#### 4.4.1 产品数据加载
397   -
398   -```python
399   -def _load_products_from_csv(self) -> Dict[int, Dict[str, Any]]:
400   - products = {}
401   - # 加载 images.csv 映射
402   - with open(self.images_csv, "r") as f:
403   - images_dict = {int(row["filename"].split(".")[0]): row["link"] for row in csv.DictReader(f)}
404   -
405   - # 加载 styles.csv
406   - with open(self.styles_csv, "r") as f:
407   - for row in csv.DictReader(f):
408   - product_id = int(row["id"])
409   - products[product_id] = {
410   - "id": product_id,
411   - "gender": row.get("gender", ""),
412   - "masterCategory": row.get("masterCategory", ""),
413   - "subCategory": row.get("subCategory", ""),
414   - "articleType": row.get("articleType", ""),
415   - "baseColour": row.get("baseColour", ""),
416   - "season": row.get("season", ""),
417   - "usage": row.get("usage", ""),
418   - "productDisplayName": row.get("productDisplayName", ""),
419   - "imagePath": f"{product_id}.jpg",
420   - }
421   - return products
422   -```
423   -
424   -#### 4.4.2 文本索引
425   -
426   -```python
427   -def _create_product_text(self, product: Dict[str, Any]) -> str:
428   - """构造产品文本用于 embedding"""
429   - parts = [
430   - product.get("productDisplayName", ""),
431   - f"Gender: {product.get('gender', '')}",
432   - f"Category: {product.get('masterCategory', '')} > {product.get('subCategory', '')}",
433   - f"Type: {product.get('articleType', '')}",
434   - f"Color: {product.get('baseColour', '')}",
435   - f"Season: {product.get('season', '')}",
436   - f"Usage: {product.get('usage', '')}",
437   - ]
438   - return " | ".join([p for p in parts if p and p != "Gender: " and p != "Color: "])
439   -```
440   -
441   -#### 4.4.3 批量索引流程
442   -
443   -```python
444   -# 文本索引
445   -texts = [self._create_product_text(p) for p in products]
446   -embeddings = self.embedding_service.get_text_embeddings_batch(texts, batch_size=50)
447   -milvus_data = [{
448   - "id": product_id,
449   - "text": text[:2000],
450   - "embedding": embedding,
451   - "productDisplayName": product["productDisplayName"][:500],
452   - "gender": product["gender"][:50],
453   - # ... 其他元数据
454   -} for product_id, text, embedding in zip(...)]
455   -self.milvus_service.insert_text_embeddings(milvus_data)
456   -
457   -# 图像索引
458   -image_paths = [self.image_dir / p["imagePath"] for p in products]
459   -embeddings = self.embedding_service.get_image_embeddings_batch(image_paths, batch_size=32)
460   -# 类似插入 image_embeddings 集合
461   -```
462   -
463   ----
464   -
465   -### 4.5 Streamlit 前端(app.py)
466   -
467   -#### 4.5.1 会话与 Agent 初始化
  276 +#### 4.3.1 会话与 Agent 初始化
468 277  
469 278 ```python
470 279 def initialize_session():
... ... @@ -478,7 +287,7 @@ def initialize_session():
478 287 st.session_state.uploaded_image = None
479 288 ```
480 289  
481   -#### 4.5.2 产品信息解析
  290 +#### 4.3.2 产品信息解析
482 291  
483 292 ```python
484 293 def extract_products_from_response(response: str) -> list:
... ... @@ -501,7 +310,7 @@ def extract_products_from_response(response: str) -> list:
501 310 return products
502 311 ```
503 312  
504   -#### 4.5.3 多轮对话中的图片引用
  313 +#### 4.3.3 多轮对话中的图片引用
505 314  
506 315 ```python
507 316 # 用户输入 "make them formal" 时,若上一条消息有图片,则引用该图片
... ... @@ -514,28 +323,14 @@ if any(ref in query_lower for ref in ["this", "that", "the image", "it"]):
514 323  
515 324 ---
516 325  
517   -### 4.6 配置管理(config.py)
  326 +### 4.4 配置管理(config.py)
518 327  
519 328 ```python
520 329 class Settings(BaseSettings):
521 330 openai_api_key: str
522 331 openai_model: str = "gpt-4o-mini"
523   - openai_embedding_model: str = "text-embedding-3-small"
524   - clip_server_url: str = "grpc://localhost:51000"
525   - milvus_uri: str = "http://localhost:19530"
526   - text_collection_name: str = "text_embeddings"
527   - image_collection_name: str = "image_embeddings"
528   - text_dim: int = 1536
529   - image_dim: int = 512
530   -
531   - @property
532   - def milvus_uri_absolute(self) -> str:
533   - """支持 Milvus Standalone 和 Milvus Lite"""
534   - if self.milvus_uri.startswith(("http://", "https://")):
535   - return self.milvus_uri
536   - if self.milvus_uri.startswith("./"):
537   - return os.path.join(base_dir, self.milvus_uri[2:])
538   - return self.milvus_uri
  332 + search_api_base_url: str = "http://120.76.41.98:6002"
  333 + search_api_tenant_id: str = "162"
539 334  
540 335 class Config:
541 336 env_file = ".env"
... ... @@ -547,35 +342,22 @@ class Settings(BaseSettings):
547 342  
548 343 ### 5.1 依赖服务
549 344  
550   -```yaml
551   -# docker-compose.yml 提供
552   -- etcd: 元数据存储
553   -- minio: 对象存储
554   -- milvus-standalone: 向量数据库
555   -- attu: Milvus 管理界面
556   -```
  345 +- **Search API**:外部搜索服务(HTTP)
  346 +- **OpenAI API**:LLM 与 VLM 图像分析
557 347  
558 348 ### 5.2 启动流程
559 349  
560 350 ```bash
561 351 # 1. 环境
562 352 pip install -r requirements.txt
563   -cp .env.example .env # 配置 OPENAI_API_KEY
  353 +cp .env.example .env # 配置 OPENAI_API_KEY、SEARCH_API_* 等
564 354  
565   -# 2. 下载数据
  355 +# 2. (可选)下载数据
566 356 python scripts/download_dataset.py # Kaggle Fashion Product Images Dataset
567 357  
568   -# 3. 启动 CLIP 服务(需单独运行)
569   -python -m clip_server
570   -
571   -# 4. 启动 Milvus
572   -docker-compose up
573   -
574   -# 5. 索引数据
575   -python scripts/index_data.py
576   -
577   -# 6. 启动应用
  358 +# 3. 启动应用
578 359 streamlit run app.py
  360 +# 或 ./scripts/start.sh
579 361 ```
580 362  
581 363 ---
... ... @@ -585,7 +367,6 @@ streamlit run app.py
585 367 | 场景 | 用户输入 | Agent 行为 | 工具调用 |
586 368 |------|----------|------------|----------|
587 369 | 文本搜索 | "winter coats for women" | 直接文本搜索 | `search_products("winter coats women")` |
588   -| 图像搜索 | [上传图片] "find similar" | 图像相似度搜索 | `search_by_image(path)` |
589 370 | 风格分析+搜索 | [上传复古夹克] "what style? find matching pants" | 先分析风格再搜索 | `analyze_image_style(path)` → `search_products("vintage pants casual")` |
590 371 | 多轮上下文 | [第1轮] "show me red dresses"<br>[第2轮] "make them formal" | 结合上下文 | `search_products("red formal dresses")` |
591 372  
... ... @@ -595,10 +376,9 @@ streamlit run app.py
595 376  
596 377 1. **ReAct 模式**:Agent 自主决定何时调用工具、调用哪些工具、是否继续调用。
597 378 2. **LangGraph 状态图**:`START → Agent → [条件] → Tools → Agent → END`,支持多轮工具调用。
598   -3. **多模态**:文本 + 图像 + VLM 分析,覆盖文本搜索、以图搜图、风格理解。
599   -4. **双向量集合**:Milvus 中 text_embeddings / image_embeddings 分别存储,支持不同模态的检索。
600   -5. **会话持久化**:`MemorySaver` + `thread_id` 实现多轮对话记忆。
601   -6. **格式约束**:System prompt 严格限制产品输出格式,便于前端解析和展示。
  379 +3. **搜索与风格分析**:Search API 文本搜索 + VLM 图像风格分析。
  380 +4. **会话持久化**:`MemorySaver` + `thread_id` 实现多轮对话记忆。
  381 +5. **格式约束**:System prompt 严格限制产品输出格式,便于前端解析和展示。
602 382  
603 383 ---
604 384  
... ... @@ -611,8 +391,6 @@ OmniShopAgent/
611 391 │ │ └── shopping_agent.py
612 392 │ ├── config.py
613 393 │ ├── services/
614   -│ │ ├── embedding_service.py
615   -│ │ └── milvus_service.py
616 394 │ └── tools/
617 395 │ └── search_tools.py
618 396 ├── scripts/
... ...
docs/搜索API对接指南.md 0 → 100644
... ... @@ -0,0 +1,1651 @@
  1 +# 搜索API接口对接指南
  2 +
  3 +本文档为搜索服务的使用方提供完整的API对接指南,包括接口说明、请求参数、响应格式和使用示例。
  4 +
  5 +## 目录
  6 +
  7 +1. [快速开始](#快速开始)
  8 + - 1.1 [基础信息](#11-基础信息)
  9 + - 1.2 [最简单的搜索请求](#12-最简单的搜索请求)
  10 + - 1.3 [带过滤与分页的搜索](#13-带过滤与分页的搜索)
  11 + - 1.4 [开启分面的搜索](#14-开启分面的搜索)
  12 +
  13 +2. [接口概览](#接口概览)
  14 +
  15 +3. [搜索接口](#搜索接口)
  16 + - 3.1 [接口信息](#31-接口信息)
  17 + - 3.2 [请求参数](#32-请求参数)
  18 + - 3.3 [过滤器详解](#33-过滤器详解)
  19 + - 3.4 [分面配置](#34-分面配置)
  20 + - 3.5 [SKU筛选维度](#35-sku筛选维度)
  21 + - 3.6 [布尔表达式语法](#36-布尔表达式语法)
  22 + - 3.7 [搜索建议接口](#37-搜索建议接口)
  23 + - 3.8 [即时搜索接口](#38-即时搜索接口)
  24 + - 3.9 [获取单个文档](#39-获取单个文档)
  25 +
  26 +4. [响应格式说明](#响应格式说明)
  27 + - 4.1 [标准响应结构](#41-标准响应结构)
  28 + - 4.2 [响应字段说明](#42-响应字段说明)
  29 + - 4.2.1 [query_info 说明](#421-query_info-说明)
  30 + - 4.3 [SpuResult字段说明](#43-spuresult字段说明)
  31 + - 4.4 [SkuResult字段说明](#44-skuresult字段说明)
  32 + - 4.5 [多语言字段说明](#45-多语言字段说明)
  33 +
  34 +5. [索引接口](#索引接口)
  35 + - 5.0 [为租户创建索引](#50-为租户创建索引)
  36 + - 5.1 [全量索引接口](#51-全量索引接口)
  37 + - 5.2 [增量索引接口](#52-增量索引接口)
  38 + - 5.3 [查询文档接口](#53-查询文档接口)
  39 + - 5.4 [索引健康检查接口](#54-索引健康检查接口)
  40 +
  41 +6. [管理接口](#管理接口)
  42 + - 6.1 [健康检查](#61-健康检查)
  43 + - 6.2 [获取配置](#62-获取配置)
  44 + - 6.3 [索引统计](#63-索引统计)
  45 +
  46 +7. [常见场景示例](#常见场景示例)
  47 + - 7.1 [基础搜索与排序](#71-基础搜索与排序)
  48 + - 7.2 [过滤搜索](#72-过滤搜索)
  49 + - 7.3 [分面搜索](#73-分面搜索)
  50 + - 7.4 [规格过滤与分面](#74-规格过滤与分面)
  51 + - 7.5 [SKU筛选](#75-sku筛选)
  52 + - 7.6 [布尔表达式搜索](#76-布尔表达式搜索)
  53 + - 7.7 [分页查询](#77-分页查询)
  54 +
  55 +8. [数据模型](#数据模型)
  56 + - 8.1 [商品字段定义](#81-商品字段定义)
  57 + - 8.2 [字段类型速查](#82-字段类型速查)
  58 + - 8.3 [常用字段列表](#83-常用字段列表)
  59 + - 8.4 [支持的分析器](#84-支持的分析器)
  60 +
  61 +---
  62 +
  63 +## 快速开始
  64 +
  65 +### 1.1 基础信息
  66 +
  67 +- **Base URL**: `http://120.76.41.98:6002`
  68 +- **协议**: HTTP/HTTPS
  69 +- **数据格式**: JSON
  70 +- **字符编码**: UTF-8
  71 +- **请求方法**: POST(搜索接口)
  72 +
  73 +**重要提示**: `tenant_id` 通过 HTTP Header `X-Tenant-ID` 传递,不在请求体中。
  74 +
  75 +### 1.2 最简单的搜索请求
  76 +
  77 +```bash
  78 +curl -X POST "http://120.76.41.98:6002/search/" \
  79 + -H "Content-Type: application/json" \
  80 + -H "X-Tenant-ID: 162" \
  81 + -d '{"query": "芭比娃娃"}'
  82 +```
  83 +
  84 +### 1.3 带过滤与分页的搜索
  85 +
  86 +```bash
  87 +curl -X POST "http://120.76.41.98:6002/search/" \
  88 + -H "Content-Type: application/json" \
  89 + -H "X-Tenant-ID: 162" \
  90 + -d '{
  91 + "query": "芭比娃娃",
  92 + "size": 5,
  93 + "from": 10,
  94 + "range_filters": {
  95 + "min_price": {
  96 + "gte": 50,
  97 + "lte": 200
  98 + },
  99 + "create_time": {
  100 + "gte": "2020-01-01T00:00:00Z"
  101 + }
  102 + },
  103 + "sort_by": "price",
  104 + "sort_order": "asc"
  105 + }'
  106 +```
  107 +
  108 +### 1.4 开启分面的搜索
  109 +
  110 +```bash
  111 +curl -X POST "http://120.76.41.98:6002/search/" \
  112 + -H "Content-Type: application/json" \
  113 + -H "X-Tenant-ID: 162" \
  114 + -d '{
  115 + "query": "芭比娃娃",
  116 + "facets": [
  117 + {"field": "category1_name", "size": 10, "type": "terms"},
  118 + {"field": "specifications.color", "size": 10, "type": "terms"},
  119 + {"field": "specifications.size", "size": 10, "type": "terms"}
  120 + ],
  121 + "min_score": 0.2
  122 + }'
  123 +```
  124 +
  125 +---
  126 +
  127 +## 接口概览
  128 +
  129 +| 接口 | HTTP Method | Endpoint | 说明 |
  130 +|------|------|------|------|
  131 +| 搜索 | POST | `/search/` | 执行搜索查询 |
  132 +| 搜索建议 | GET | `/search/suggestions` | 搜索建议(框架,暂未实现) ⚠️ TODO |
  133 +| 即时搜索 | GET | `/search/instant` | 边输入边搜索(框架) ⚠️ TODO |
  134 +| 获取文档 | GET | `/search/{doc_id}` | 获取单个文档 |
  135 +| 全量索引 | POST | `/indexer/reindex` | 全量索引接口(导入数据,不删除索引) |
  136 +| 增量索引 | POST | `/indexer/index` | 增量索引接口(指定SPU ID列表进行索引,支持自动检测删除和显式删除) |
  137 +| 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) |
  138 +| 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 |
  139 +| 健康检查 | GET | `/admin/health` | 服务健康检查 |
  140 +| 获取配置 | GET | `/admin/config` | 获取租户配置 |
  141 +| 索引统计 | GET | `/admin/stats` | 获取索引统计信息 |
  142 +
  143 +---
  144 +
  145 +## 搜索接口
  146 +
  147 +### 3.1 接口信息
  148 +
  149 +- **端点**: `POST /search/`
  150 +- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索
  151 +
  152 +### 3.2 请求参数
  153 +
  154 +#### 完整请求体结构
  155 +
  156 +```json
  157 +{
  158 + "query": "string (required)",
  159 + "size": 10,
  160 + "from": 0,
  161 + "language": "zh",
  162 + "filters": {},
  163 + "range_filters": {},
  164 + "facets": [],
  165 + "sort_by": "string",
  166 + "sort_order": "desc",
  167 + "min_score": 0.0,
  168 + "sku_filter_dimension": ["string"],
  169 + "debug": false,
  170 + "enable_rerank": false,
  171 + "rerank_query_template": "{query}",
  172 + "rerank_doc_template": "{title}",
  173 + "user_id": "string",
  174 + "session_id": "string"
  175 +}
  176 +```
  177 +
  178 +#### 参数详细说明
  179 +
  180 +| 参数 | 类型 | 必填 | 默认值 | 说明 |
  181 +|------|------|------|--------|------|
  182 +| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) |
  183 +| `size` | integer | N | 10 | 返回结果数量(1-100) |
  184 +| `from` | integer | N | 0 | 分页偏移量(用于分页) |
  185 +| `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 |
  186 +| `filters` | object | N | null | 精确匹配过滤器(见[过滤器详解](#33-过滤器详解)) |
  187 +| `range_filters` | object | N | null | 数值范围过滤器(见[过滤器详解](#33-过滤器详解)) |
  188 +| `facets` | array | N | null | 分面配置(见[分面配置](#34-分面配置)) |
  189 +| `sort_by` | string | N | null | 排序字段名。支持:`price`(价格)、`sales`(销量)、`create_time`(创建时间)、`update_time`(更新时间)。默认按相关性排序 |
  190 +| `sort_order` | string | N | "desc" | 排序方向:`asc`(升序)或 `desc`(降序)。注意:`price`+`asc`=价格从低到高,`price`+`desc`=价格从高到低(后端自动映射为min_price或max_price) |
  191 +| `min_score` | float | N | null | 最小相关性分数阈值 |
  192 +| `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) |
  193 +| `debug` | boolean | N | false | 是否返回调试信息 |
  194 +| `enable_rerank` | boolean | N | false | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。开启后,仅当 `from+size<=rerank_window` 时才会触发重排 |
  195 +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 |
  196 +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 |
  197 +| `user_id` | string | N | null | 用户ID(用于个性化,预留) |
  198 +| `session_id` | string | N | null | 会话ID(用于分析,预留) |
  199 +
  200 +### 3.3 过滤器详解
  201 +
  202 +#### 3.3.1 精确匹配过滤器 (filters)
  203 +
  204 +用于精确匹配或多值匹配。对于普通字段,数组表示 OR 逻辑(匹配任意一个值);对于 specifications 字段,按维度分组处理。**任意字段名加 `_all` 后缀**表示多值 AND 逻辑(必须同时匹配所有值)。
  205 +
  206 +**格式**:
  207 +```json
  208 +{
  209 + "filters": {
  210 + "category_name": "手机", // 单值或数组;为数组时匹配其中任意一个值(OR)
  211 + "category1_name": "服装", // 单值或数组;为数组时匹配其中任意一个值(OR)
  212 + "category2_name": "男装", // 单值或数组;为数组时匹配其中任意一个值(OR)
  213 + "category3_name": "衬衫", // 单值或数组;为数组时匹配其中任意一个值(OR)
  214 + "vendor.zh.keyword": ["奇乐", "品牌A"], // 单值或数组;为数组时匹配其中任意一个值(OR)
  215 + "tags": "手机", // 单值或数组;为数组时匹配其中任意一个值(OR)
  216 + "tags_all": ["手机", "促销", "新品"], // *_all:多值为 AND,必须同时包含所有标签
  217 + "category1_name_all": ["服装", "男装"], // 同上,适用于任意可过滤字段
  218 + // specifications 嵌套过滤(特殊格式)
  219 + "specifications": {
  220 + "name": "color",
  221 + "value": "white"
  222 + }
  223 + }
  224 +}
  225 +```
  226 +
  227 +**支持的值类型**:
  228 +- 字符串:精确匹配
  229 +- 整数:精确匹配
  230 +- 布尔值:精确匹配
  231 +- 数组:匹配任意值(OR 逻辑);若字段名以 `_all` 结尾,则数组表示 AND 逻辑(必须同时匹配所有值)
  232 +- 对象:specifications 嵌套过滤(见下文)
  233 +
  234 +**`*_all` 语义(多值 AND)**:
  235 +- 任意过滤字段均可使用 `_all` 后缀,对应 ES 字段名为去掉 `_all` 后的名称。
  236 +- 例如:`tags_all: ["A", "B"]` 表示文档的 `tags` 必须**同时包含** A 和 B;`vendor.zh.keyword_all: ["奇乐", "品牌A"]` 表示同时匹配两个品牌(通常用于 keyword 多值场景)。
  237 +- `specifications_all`:传列表 `[{"name":"color","value":"white"},{"name":"size","value":"256GB"}]` 时,表示所有列出的规格条件都要满足(与 `specifications` 多维度时的 AND 一致;若同维度多值则要求文档同时满足多个值,一般用于嵌套多值场景)。
  238 +
  239 +**Specifications 嵌套过滤**:
  240 +
  241 +`specifications` 是嵌套字段,支持按规格名称和值进行过滤。
  242 +
  243 +**单个规格过滤**:
  244 +```json
  245 +{
  246 + "filters": {
  247 + "specifications": {
  248 + "name": "color",
  249 + "value": "white"
  250 + }
  251 + }
  252 +}
  253 +```
  254 +查询规格名称为"color"且值为"white"的商品。
  255 +
  256 +**多个规格过滤(按维度分组)**:
  257 +```json
  258 +{
  259 + "filters": {
  260 + "specifications": [
  261 + {"name": "color", "value": "white"},
  262 + {"name": "size", "value": "256GB"}
  263 + ]
  264 + }
  265 +}
  266 +```
  267 +查询同时满足所有规格的商品(color=white **且** size=256GB)。
  268 +
  269 +**相同维度的多个值(OR 逻辑)**:
  270 +```json
  271 +{
  272 + "filters": {
  273 + "specifications": [
  274 + {"name": "size", "value": "3"},
  275 + {"name": "size", "value": "4"},
  276 + {"name": "size", "value": "5"},
  277 + {"name": "color", "value": "green"}
  278 + ]
  279 + }
  280 +}
  281 +```
  282 +查询满足 (size=3 **或** size=4 **或** size=5) **且** color=green 的商品。
  283 +
  284 +**过滤逻辑说明**:
  285 +- **不同维度**(不同的 `name`)之间是 **AND** 关系(求交集)
  286 +- **相同维度**(相同的 `name`)的多个值之间是 **OR** 关系(求并集)
  287 +
  288 +**常用过滤字段**(详见[常用字段列表](#83-常用字段列表)):
  289 +- `category_name`: 类目名称
  290 +- `category1_name`, `category2_name`, `category3_name`: 多级类目
  291 +- `category_id`: 类目ID
  292 +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段)
  293 +- `tags`: 标签(keyword类型,支持数组)
  294 +- `option1_name`, `option2_name`, `option3_name`: 选项名称
  295 +- `specifications`: 规格过滤(嵌套字段,格式见上文)
  296 +- 以上任意字段均可加 `_all` 后缀表示多值 AND,如 `tags_all`、`category1_name_all`。
  297 +
  298 +#### 3.3.2 范围过滤器 (range_filters)
  299 +
  300 +用于数值或日期时间字段的范围过滤(例如价格区间、创建时间)。
  301 +
  302 +**格式**:
  303 +```json
  304 +{
  305 + "range_filters": {
  306 + "min_price": {
  307 + "gte": 50, // 大于等于
  308 + "lte": 200 // 小于等于
  309 + },
  310 + "max_price": {
  311 + "gt": 100 // 大于
  312 + },
  313 + "create_time": {
  314 + "gte": "2024-01-01T00:00:00Z" // 日期时间字符串
  315 + }
  316 + }
  317 +}
  318 +```
  319 +
  320 +**支持的操作符**:
  321 +- `gte`: 大于等于 (>=)
  322 +- `gt`: 大于 (>)
  323 +- `lte`: 小于等于 (<=)
  324 +- `lt`: 小于 (<)
  325 +
  326 +**注意**: 至少需要指定一个操作符。
  327 +
  328 +**常用范围字段**(详见[常用字段列表](#83-常用字段列表)):
  329 +- `min_price`: 最低价格
  330 +- `max_price`: 最高价格
  331 +- `compare_at_price`: 原价
  332 +- `create_time`: 创建时间
  333 +- `update_time`: 更新时间
  334 +
  335 +### 3.4 分面配置
  336 +
  337 +用于生成分面统计(分组聚合),常用于构建筛选器UI。
  338 +
  339 +#### 3.4.1 配置格式
  340 +
  341 +```json
  342 +{
  343 + "facets": [
  344 + {
  345 + "field": "category1_name",
  346 + "size": 15,
  347 + "type": "terms",
  348 + "disjunctive": false
  349 + },
  350 + {
  351 + "field": "brand_name",
  352 + "size": 10,
  353 + "type": "terms",
  354 + "disjunctive": true
  355 + },
  356 + {
  357 + "field": "specifications.color",
  358 + "size": 20,
  359 + "type": "terms",
  360 + "disjunctive": true
  361 + },
  362 + {
  363 + "field": "min_price",
  364 + "type": "range",
  365 + "ranges": [
  366 + {"key": "0-50", "to": 50},
  367 + {"key": "50-100", "from": 50, "to": 100},
  368 + {"key": "100-200", "from": 100, "to": 200},
  369 + {"key": "200+", "from": 200}
  370 + ]
  371 + }
  372 + ]
  373 +}
  374 +```
  375 +
  376 +#### 3.4.2 Facet 字段说明
  377 +
  378 +| 字段 | 类型 | 必填 | 默认值 | 说明 |
  379 +|------|------|------|--------|------|
  380 +| `field` | string | 是 | - | 分面字段名 |
  381 +| `size` | int | 否 | 10 | 返回的分面值数量(1-100) |
  382 +| `type` | string | 否 | "terms" | 分面类型:`terms`(词条聚合)或 `range`(范围聚合) |
  383 +| `disjunctive` | bool | 否 | false | 是否支持多选(disjunctive faceting)。启用后,选中该分面的过滤器时,仍会显示其他可选项 |
  384 +| `ranges` | array | 否 | null | 范围配置(仅 `type="range"` 时需要) |
  385 +
  386 +#### 3.4.3 disjunctive字段说明
  387 +
  388 +**重要特性**: `disjunctive` 字段控制分面的行为模式。设置为 `true` 后,即使已选中该分面下的某个值,该分面仍会显示其他可选值。
  389 +
  390 +**标准模式 (disjunctive: false)**:
  391 +- **行为**: 选中某个分面值后,该分面只显示选中的值
  392 +- **适用场景**: 层级类目、互斥选择
  393 +- **示例**: 类目下钻(玩具 > 娃娃 > 芭比)
  394 +
  395 +**Multi-Select 模式 (disjunctive: true)** ⭐:
  396 +- **行为**: 选中某个分面值后,该分面仍显示所有可选项
  397 +- **适用场景**: 颜色、品牌、尺码等可切换属性
  398 +- **示例**: 选择了"红色"后,仍能看到"蓝色"、"绿色"等选项
  399 +
  400 +**推荐配置**:
  401 +
  402 +| 分面类型 | disjunctive | 原因 |
  403 +|---------|-------------|------|
  404 +| 颜色 | `true` | 用户需要切换颜色 |
  405 +| 品牌 | `true` | 用户需要比较品牌 |
  406 +| 尺码 | `true` | 用户需要查看其他尺码 |
  407 +| 类目 | `false` | 层级下钻 |
  408 +| 价格区间 | `false` | 互斥选择 |
  409 +
  410 +#### 3.4.4 规格分面说明
  411 +
  412 +`specifications` 是嵌套字段,支持两种分面模式:
  413 +
  414 +**模式1:所有规格名称的分面**:
  415 +```json
  416 +{
  417 + "facets": [
  418 + {
  419 + "field": "specifications",
  420 + "size": 10,
  421 + "type": "terms"
  422 + }
  423 + ]
  424 +}
  425 +```
  426 +返回所有规格名称(name)及其对应的值(value)列表。每个 name 会生成一个独立的分面结果。
  427 +
  428 +**模式2:指定规格名称的分面**:
  429 +```json
  430 +{
  431 + "facets": [
  432 + {
  433 + "field": "specifications.color",
  434 + "size": 20,
  435 + "type": "terms",
  436 + "disjunctive": true
  437 + },
  438 + {
  439 + "field": "specifications.size",
  440 + "size": 15,
  441 + "type": "terms",
  442 + "disjunctive": true
  443 + }
  444 + ]
  445 +}
  446 +```
  447 +只返回指定规格名称的值列表。格式:`specifications.{name}`,其中 `{name}` 是规格名称(如"color"、"size"、"material")。
  448 +
  449 +**返回格式示例**:
  450 +```json
  451 +{
  452 + "facets": [
  453 + {
  454 + "field": "specifications.color",
  455 + "label": "color",
  456 + "type": "terms",
  457 + "values": [
  458 + {"value": "white", "count": 50, "selected": true}, // ✓ selected 字段由后端标记
  459 + {"value": "black", "count": 30, "selected": false},
  460 + {"value": "red", "count": 20, "selected": false}
  461 + ]
  462 + },
  463 + {
  464 + "field": "specifications.size",
  465 + "label": "size",
  466 + "type": "terms",
  467 + "values": [
  468 + {"value": "256GB", "count": 40, "selected": false},
  469 + {"value": "512GB", "count": 20, "selected": false}
  470 + ]
  471 + }
  472 + ]
  473 +}
  474 +```
  475 +
  476 +### 3.5 SKU筛选维度
  477 +
  478 +**功能说明**:
  479 +`sku_filter_dimension` 用于控制搜索列表页中 **每个 SPU 下方可切换的子款式(子 SKU)维度**,为字符串列表。
  480 +在店铺的 **主题装修配置** 中,商家可以为店铺设置一个或多个子款式筛选维度(例如 `color`、`size`),前端列表页会在每个 SPU 下展示这些维度对应的子 SKU 列表,用户可以通过点击不同维度值(如不同颜色)来切换展示的子款式。
  481 +当指定 `sku_filter_dimension` 后,后端会根据店铺的这项配置,从所有 SKU 中筛选出这些维度组合对应的子 SKU 数据:系统会按指定维度**组合**对 SKU 进行分组,每个维度组合只返回第一个 SKU(从简实现,选择该组合下的第一款),其余不在这些维度组合中的子 SKU 将不返回。
  482 +
  483 +**支持的维度值**:
  484 +1. **直接选项字段**: `option1`、`option2`、`option3`
  485 + - 直接使用对应的 `option1_value`、`option2_value`、`option3_value` 字段进行分组
  486 +
  487 +2. **规格/选项名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配
  488 + - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: ["color"]` 来按颜色分组
  489 +
  490 +**示例**:
  491 +
  492 +**按颜色筛选(假设 option1_name = "color")**:
  493 +```json
  494 +{
  495 + "query": "芭比娃娃",
  496 + "sku_filter_dimension": ["color"]
  497 +}
  498 +```
  499 +
  500 +**按选项1筛选**:
  501 +```json
  502 +{
  503 + "query": "芭比娃娃",
  504 + "sku_filter_dimension": ["option1"]
  505 +}
  506 +```
  507 +
  508 +**按颜色 + 尺寸组合筛选(假设 option1_name = "color", option2_name = "size")**:
  509 +```json
  510 +{
  511 + "query": "芭比娃娃",
  512 + "sku_filter_dimension": ["color", "size"]
  513 +}
  514 +```
  515 +
  516 +### 3.6 布尔表达式语法
  517 +
  518 +搜索查询支持布尔表达式,提供更灵活的搜索能力。
  519 +
  520 +**支持的操作符**:
  521 +
  522 +| 操作符 | 描述 | 示例 |
  523 +|--------|------|------|
  524 +| `AND` | 所有词必须匹配 | `玩具 AND 乐高` |
  525 +| `OR` | 任意词匹配 | `芭比 OR 娃娃` |
  526 +| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` |
  527 +| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` |
  528 +| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` |
  529 +
  530 +**操作符优先级**(从高到低):
  531 +1. `()` - 括号
  532 +2. `ANDNOT` - 排除
  533 +3. `AND` - 与
  534 +4. `OR` - 或
  535 +5. `RANK` - 排序
  536 +
  537 +**示例**:
  538 +```
  539 +"芭比娃娃" // 简单查询
  540 +"玩具 AND 乐高" // AND 查询
  541 +"芭比 OR 娃娃" // OR 查询
  542 +"玩具 ANDNOT 电动" // 排除查询
  543 +"玩具 AND (乐高 OR 芭比)" // 复杂查询
  544 +```
  545 +
  546 +### 3.7 搜索建议接口
  547 +
  548 +> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。
  549 +
  550 +- **端点**: `GET /search/suggestions`
  551 +- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展。
  552 +
  553 +#### 查询参数
  554 +
  555 +| 参数 | 类型 | 必填 | 默认值 | 描述 |
  556 +|------|------|------|--------|------|
  557 +| `q` | string | Y | - | 查询字符串(至少 1 个字符) |
  558 +| `size` | integer | N | 5 | 返回建议数量(1-20) |
  559 +| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` |
  560 +
  561 +#### 响应示例
  562 +
  563 +```json
  564 +{
  565 + "query": "芭",
  566 + "suggestions": [
  567 + {
  568 + "text": "芭比娃娃",
  569 + "type": "query",
  570 + "highlight": "<em>芭</em>比娃娃",
  571 + "popularity": 850
  572 + }
  573 + ],
  574 + "took_ms": 5
  575 +}
  576 +```
  577 +
  578 +#### 请求示例
  579 +
  580 +```bash
  581 +curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product"
  582 +```
  583 +
  584 +### 3.8 即时搜索接口
  585 +
  586 +> ⚠️ **TODO**: 此接口当前为框架实现,尚未针对即时搜索做专门实现,内部直接调用标准搜索接口。后续需要优化即时搜索性能(添加防抖/节流、实现结果缓存、简化返回字段)。
  587 +
  588 +- **端点**: `GET /search/instant`
  589 +- **描述**: 边输入边搜索,采用轻量参数响应当前输入。底层复用标准搜索能力。
  590 +
  591 +#### 查询参数
  592 +
  593 +| 参数 | 类型 | 必填 | 默认值 | 描述 |
  594 +|------|------|------|--------|------|
  595 +| `q` | string | Y | - | 搜索查询(至少 2 个字符) |
  596 +| `size` | integer | N | 5 | 返回结果数量(1-20) |
  597 +
  598 +#### 请求示例
  599 +
  600 +```bash
  601 +curl "http://localhost:6002/search/instant?q=玩具&size=5"
  602 +```
  603 +
  604 +### 3.9 获取单个文档
  605 +
  606 +- **端点**: `GET /search/{doc_id}`
  607 +- **描述**: 根据文档 ID 获取单个商品详情,用于点击结果后的详情页或排查问题。
  608 +
  609 +#### 路径参数
  610 +
  611 +| 参数 | 类型 | 描述 |
  612 +|------|------|------|
  613 +| `doc_id` | string | 商品或文档 ID |
  614 +
  615 +#### 响应示例
  616 +
  617 +```json
  618 +{
  619 + "id": "12345",
  620 + "source": {
  621 + "title": {
  622 + "zh": "芭比时尚娃娃"
  623 + },
  624 + "min_price": 89.99,
  625 + "category1_name": "玩具"
  626 + }
  627 +}
  628 +```
  629 +
  630 +#### 请求示例
  631 +
  632 +```bash
  633 +curl "http://localhost:6002/search/12345"
  634 +```
  635 +
  636 +---
  637 +
  638 +## 响应格式说明
  639 +
  640 +### 4.1 标准响应结构
  641 +
  642 +```json
  643 +{
  644 + "results": [
  645 + {
  646 + "spu_id": "12345",
  647 + "title": "芭比时尚娃娃",
  648 + "brief": "高品质芭比娃娃",
  649 + "description": "详细描述...",
  650 + "vendor": "美泰",
  651 + "category": "玩具",
  652 + "category_path": "玩具/娃娃/时尚",
  653 + "category_name": "时尚",
  654 + "category_id": "cat_001",
  655 + "category_level": 3,
  656 + "category1_name": "玩具",
  657 + "category2_name": "娃娃",
  658 + "category3_name": "时尚",
  659 + "tags": ["娃娃", "玩具", "女孩"],
  660 + "price": 89.99,
  661 + "compare_at_price": 129.99,
  662 + "currency": "USD",
  663 + "image_url": "https://example.com/image.jpg",
  664 + "in_stock": true,
  665 + "sku_prices": [89.99, 99.99, 109.99],
  666 + "sku_weights": [100, 150, 200],
  667 + "sku_weight_units": ["g", "g", "g"],
  668 + "total_inventory": 500,
  669 + "option1_name": "color",
  670 + "option2_name": "size",
  671 + "option3_name": null,
  672 + "specifications": [
  673 + {"sku_id": "sku_001", "name": "color", "value": "pink"},
  674 + {"sku_id": "sku_001", "name": "size", "value": "standard"}
  675 + ],
  676 + "skus": [
  677 + {
  678 + "sku_id": "67890",
  679 + "price": 89.99,
  680 + "compare_at_price": 129.99,
  681 + "sku": "BARBIE-001",
  682 + "stock": 100,
  683 + "weight": 0.1,
  684 + "weight_unit": "kg",
  685 + "option1_value": "pink",
  686 + "option2_value": "standard",
  687 + "option3_value": null,
  688 + "image_src": "https://example.com/sku1.jpg"
  689 + }
  690 + ],
  691 + "relevance_score": 8.5
  692 + }
  693 + ],
  694 + "total": 118,
  695 + "max_score": 8.5,
  696 + "facets": [
  697 + {
  698 + "field": "category1_name",
  699 + "label": "category1_name",
  700 + "type": "terms",
  701 + "values": [
  702 + {
  703 + "value": "玩具",
  704 + "label": "玩具",
  705 + "count": 85,
  706 + "selected": false
  707 + }
  708 + ]
  709 + },
  710 + {
  711 + "field": "specifications.color",
  712 + "label": "color",
  713 + "type": "terms",
  714 + "values": [
  715 + {
  716 + "value": "pink",
  717 + "label": "pink",
  718 + "count": 30,
  719 + "selected": false
  720 + }
  721 + ]
  722 + }
  723 + ],
  724 + "query_info": {
  725 + "original_query": "芭比娃娃",
  726 + "query_normalized": "芭比娃娃",
  727 + "rewritten_query": "芭比娃娃",
  728 + "detected_language": "zh",
  729 + "translations": {
  730 + "en": "barbie doll"
  731 + },
  732 + "domain": "default"
  733 + },
  734 + "suggestions": [],
  735 + "related_searches": [],
  736 + "took_ms": 45,
  737 + "performance_info": null,
  738 + "debug_info": null
  739 +}
  740 +```
  741 +
  742 +### 4.2 响应字段说明
  743 +
  744 +| 字段 | 类型 | 说明 |
  745 +|------|------|------|
  746 +| `results` | array | 搜索结果列表(SpuResult对象数组) |
  747 +| `results[].spu_id` | string | SPU ID |
  748 +| `results[].title` | string | 商品标题 |
  749 +| `results[].price` | float | 价格(min_price) |
  750 +| `results[].skus` | array | SKU列表(如果指定了 `sku_filter_dimension`,则为按维度过滤后的SKU) |
  751 +| `results[].relevance_score` | float | 相关性分数 |
  752 +| `total` | integer | 匹配的总文档数 |
  753 +| `max_score` | float | 最高相关性分数 |
  754 +| `facets` | array | 分面统计结果 |
  755 +| `query_info` | object | query处理信息 |
  756 +| `took_ms` | integer | 搜索耗时(毫秒) |
  757 +
  758 +#### 4.2.1 query_info 说明
  759 +
  760 +`query_info` 包含本次搜索的查询解析与处理结果:
  761 +
  762 +| 子字段 | 类型 | 说明 |
  763 +|--------|------|------|
  764 +| `original_query` | string | 用户原始查询 |
  765 +| `query_normalized` | string | 归一化后的查询(去空白、大小写等预处理,用于后续解析与改写) |
  766 +| `rewritten_query` | string | 重写后的查询(同义词/词典扩展等) |
  767 +| `detected_language` | string | 检测到的查询语言(如 `zh`、`en`) |
  768 +| `translations` | object | 翻译结果,键为语言代码,值为翻译文本 |
  769 +| `domain` | string | 查询域(如 `default`、`title`、`brand` 等) |
  770 +
  771 +### 4.3 SpuResult字段说明
  772 +
  773 +| 字段 | 类型 | 说明 |
  774 +|------|------|------|
  775 +| `spu_id` | string | SPU ID |
  776 +| `title` | string | 商品标题(根据language参数自动选择 `title.zh` 或 `title.en`) |
  777 +| `brief` | string | 商品短描述(根据language参数自动选择) |
  778 +| `description` | string | 商品详细描述(根据language参数自动选择) |
  779 +| `vendor` | string | 供应商/品牌(根据language参数自动选择) |
  780 +| `category` | string | 类目(兼容字段,等同于category_name) |
  781 +| `category_path` | string | 类目路径(多级,用于面包屑,根据language参数自动选择) |
  782 +| `category_name` | string | 类目名称(展示用,根据language参数自动选择) |
  783 +| `category_id` | string | 类目ID |
  784 +| `category_level` | integer | 类目层级(1/2/3) |
  785 +| `category1_name` | string | 一级类目名称 |
  786 +| `category2_name` | string | 二级类目名称 |
  787 +| `category3_name` | string | 三级类目名称 |
  788 +| `tags` | array[string] | 标签列表 |
  789 +| `price` | float | 价格(min_price) |
  790 +| `compare_at_price` | float | 原价 |
  791 +| `currency` | string | 货币单位(默认USD) |
  792 +| `image_url` | string | 主图URL |
  793 +| `in_stock` | boolean | 是否有库存(任意SKU有库存即为true) |
  794 +| `sku_prices` | array[float] | 所有SKU价格列表 |
  795 +| `sku_weights` | array[integer] | 所有SKU重量列表 |
  796 +| `sku_weight_units` | array[string] | 所有SKU重量单位列表 |
  797 +| `total_inventory` | integer | 总库存 |
  798 +| `sales` | integer | 销量(展示销量) |
  799 +| `option1_name` | string | 选项1名称(如"color") |
  800 +| `option2_name` | string | 选项2名称(如"size") |
  801 +| `option3_name` | string | 选项3名称 |
  802 +| `specifications` | array[object] | 规格列表(与ES specifications字段对应) |
  803 +| `skus` | array | SKU 列表 |
  804 +| `relevance_score` | float | 相关性分数(默认为 ES 原始分数;当开启 AI 搜索时为融合后的最终分数) |
  805 +
  806 +### 4.4 SkuResult字段说明
  807 +
  808 +| 字段 | 类型 | 说明 |
  809 +|------|------|------|
  810 +| `sku_id` | string | SKU ID |
  811 +| `price` | float | 价格 |
  812 +| `compare_at_price` | float | 原价 |
  813 +| `sku` | string | SKU编码(sku_code) |
  814 +| `stock` | integer | 库存数量 |
  815 +| `weight` | float | 重量 |
  816 +| `weight_unit` | string | 重量单位 |
  817 +| `option1_value` | string | 选项1取值(如color值) |
  818 +| `option2_value` | string | 选项2取值(如size值) |
  819 +| `option3_value` | string | 选项3取值 |
  820 +| `image_src` | string | SKU图片地址 |
  821 +
  822 +### 4.5 多语言字段说明
  823 +
  824 +- `title`, `brief`, `description`, `vendor`, `category_path`, `category_name` 会根据请求的 `language` 参数自动选择对应的中英文字段
  825 +- `language="zh"`: 优先返回 `*_zh` 字段,如果为空则回退到 `*_en` 字段
  826 +- `language="en"`: 优先返回 `*_en` 字段,如果为空则回退到 `*_zh` 字段
  827 +
  828 +---
  829 +
  830 +## 索引接口
  831 +
  832 +### 5.0 为租户创建索引
  833 +
  834 +为租户创建索引需要两个步骤:
  835 +
  836 +1. **创建索引结构**(可选,仅在需要更新 mapping 时执行)
  837 + - 使用脚本创建 ES 索引结构(基于 `mappings/search_products.json`)
  838 + - 如果索引已存在,会提示用户确认(会删除现有数据)
  839 +
  840 +2. **导入数据**(必需)
  841 + - 使用全量索引接口 `/indexer/reindex` 导入数据
  842 +
  843 +**创建索引结构**:
  844 +
  845 +```bash
  846 +./scripts/create_tenant_index.sh 170
  847 +```
  848 +
  849 +脚本会自动从项目根目录的 `.env` 文件加载 ES 配置。
  850 +
  851 +**注意事项**:
  852 +- ⚠️ 如果索引已存在,脚本会提示确认,确认后会删除现有数据
  853 +- 创建索引后,**必须**调用 `/indexer/reindex` 导入数据
  854 +- 如果只是更新数据而不需要修改索引结构,直接使用 `/indexer/reindex` 即可
  855 +
  856 +---
  857 +
  858 +### 5.1 全量索引接口
  859 +
  860 +- **端点**: `POST /indexer/reindex`
  861 +- **描述**: 全量索引,将指定租户的所有SPU数据导入到ES索引(不会删除现有索引)
  862 +
  863 +#### 请求参数
  864 +
  865 +```json
  866 +{
  867 + "tenant_id": "162",
  868 + "batch_size": 500
  869 +}
  870 +```
  871 +
  872 +| 参数 | 类型 | 必填 | 默认值 | 说明 |
  873 +|------|------|------|--------|------|
  874 +| `tenant_id` | string | Y | - | 租户ID |
  875 +| `batch_size` | integer | N | 500 | 批量导入大小 |
  876 +
  877 +#### 响应格式
  878 +
  879 +**成功响应(200 OK)**:
  880 +```json
  881 +{
  882 + "success": true,
  883 + "total": 1000,
  884 + "indexed": 1000,
  885 + "failed": 0,
  886 + "elapsed_time": 12.34,
  887 + "index_name": "search_products",
  888 + "tenant_id": "162"
  889 +}
  890 +```
  891 +
  892 +**错误响应**:
  893 +- `400 Bad Request`: 参数错误
  894 +- `503 Service Unavailable`: 服务未初始化
  895 +
  896 +#### 请求示例
  897 +
  898 +**全量索引(不会删除现有索引)**:
  899 +```bash
  900 +curl -X POST "http://localhost:6004/indexer/reindex" \
  901 + -H "Content-Type: application/json" \
  902 + -d '{
  903 + "tenant_id": "162",
  904 + "batch_size": 500
  905 + }'
  906 +```
  907 +
  908 +**查看日志**:
  909 +```bash
  910 +# 查看API日志(包含索引操作日志)
  911 +tail -f logs/api.log
  912 +
  913 +# 或者查看所有日志文件
  914 +tail -f logs/*.log
  915 +```
  916 +
  917 +> ⚠️ **重要提示**:如需 **创建索引结构**,请参考 [5.0 为租户创建索引](#50-为租户创建索引) 章节,使用 `scripts/create_tenant_index.sh` 脚本。创建后需要调用 `/indexer/reindex` 导入数据。
  918 +
  919 +**查看索引日志**:
  920 +
  921 +索引操作的所有关键信息都会记录到 `logs/indexer.log` 文件中(JSON 格式),包括:
  922 +- 请求开始和结束时间
  923 +- 租户ID、SPU ID、操作类型
  924 +- 每个SPU的处理状态
  925 +- ES批量写入结果
  926 +- 成功/失败统计和详细错误信息
  927 +
  928 +```bash
  929 +# 实时查看索引日志(包含全量和增量索引的所有操作)
  930 +tail -f logs/indexer.log
  931 +
  932 +# 使用 grep 查询(简单方式)
  933 +# 查看全量索引日志
  934 +grep "\"index_type\":\"bulk\"" logs/indexer.log | tail -100
  935 +
  936 +# 查看增量索引日志
  937 +grep "\"index_type\":\"incremental\"" logs/indexer.log | tail -100
  938 +
  939 +# 查看特定租户的索引日志
  940 +grep "\"tenant_id\":\"162\"" logs/indexer.log | tail -100
  941 +
  942 +# 使用 jq 查询(推荐,更精确的 JSON 查询)
  943 +# 安装 jq: sudo apt-get install jq 或 brew install jq
  944 +
  945 +# 查看全量索引日志
  946 +cat logs/indexer.log | jq 'select(.index_type == "bulk")' | tail -100
  947 +
  948 +# 查看增量索引日志
  949 +cat logs/indexer.log | jq 'select(.index_type == "incremental")' | tail -100
  950 +
  951 +# 查看特定租户的索引日志
  952 +cat logs/indexer.log | jq 'select(.tenant_id == "162")' | tail -100
  953 +
  954 +# 查看失败的索引操作
  955 +cat logs/indexer.log | jq 'select(.operation == "request_complete" and .failed_count > 0)'
  956 +
  957 +# 查看特定SPU的处理日志
  958 +cat logs/indexer.log | jq 'select(.spu_id == "123")'
  959 +
  960 +# 查看最近的索引请求统计
  961 +cat logs/indexer.log | jq 'select(.operation == "request_complete") | {timestamp, index_type, tenant_id, total_count, success_count, failed_count, elapsed_time}'
  962 +```
  963 +
  964 +### 5.2 增量索引接口
  965 +
  966 +- **端点**: `POST /indexer/index`
  967 +- **描述**: 增量索引接口,根据指定的SPU ID列表进行索引,直接将数据写入ES。用于增量更新指定商品。
  968 +
  969 +**删除说明**:
  970 +- `spu_ids`中的SPU:如果数据库`deleted=1`,自动从ES删除,响应状态为`deleted`
  971 +- `delete_spu_ids`中的SPU:直接删除,响应状态为`deleted`、`not_found`或`failed`
  972 +
  973 +#### 请求参数
  974 +
  975 +```json
  976 +{
  977 + "tenant_id": "162",
  978 + "spu_ids": ["123", "456", "789"],
  979 + "delete_spu_ids": ["100", "101", "102"]
  980 +}
  981 +```
  982 +
  983 +| 参数 | 类型 | 必填 | 说明 |
  984 +|------|------|------|------|
  985 +| `tenant_id` | string | Y | 租户ID |
  986 +| `spu_ids` | array[string] | N | SPU ID列表(1-100个),要索引的SPU。如果为空,则只执行删除操作 |
  987 +| `delete_spu_ids` | array[string] | N | 显式指定要删除的SPU ID列表(1-100个),可选。无论数据库状态如何,都会从ES中删除这些SPU |
  988 +
  989 +**注意**:
  990 +- `spu_ids` 和 `delete_spu_ids` 不能同时为空
  991 +- 每个列表最多支持100个SPU ID
  992 +- 如果SPU在`spu_ids`中且数据库`deleted=1`,会自动从ES删除(自动检测删除)
  993 +
  994 +#### 响应格式
  995 +
  996 +```json
  997 +{
  998 + "spu_ids": [
  999 + {
  1000 + "spu_id": "123",
  1001 + "status": "indexed"
  1002 + },
  1003 + {
  1004 + "spu_id": "456",
  1005 + "status": "deleted"
  1006 + },
  1007 + {
  1008 + "spu_id": "789",
  1009 + "status": "failed",
  1010 + "msg": "SPU not found (unexpected)"
  1011 + }
  1012 + ],
  1013 + "delete_spu_ids": [
  1014 + {
  1015 + "spu_id": "100",
  1016 + "status": "deleted"
  1017 + },
  1018 + {
  1019 + "spu_id": "101",
  1020 + "status": "not_found"
  1021 + },
  1022 + {
  1023 + "spu_id": "102",
  1024 + "status": "failed",
  1025 + "msg": "Failed to delete from ES: Connection timeout"
  1026 + }
  1027 + ],
  1028 + "total": 6,
  1029 + "success_count": 4,
  1030 + "failed_count": 2,
  1031 + "elapsed_time": 1.23,
  1032 + "index_name": "search_products",
  1033 + "tenant_id": "162"
  1034 +}
  1035 +```
  1036 +
  1037 +| 字段 | 类型 | 说明 |
  1038 +|------|------|------|
  1039 +| `spu_ids` | array | spu_ids对应的响应列表,每个元素包含 `spu_id` 和 `status` |
  1040 +| `spu_ids[].status` | string | 状态:`indexed`(已索引)、`deleted`(已删除,自动检测)、`failed`(失败) |
  1041 +| `spu_ids[].msg` | string | 当status为`failed`时,包含失败原因(可选) |
  1042 +| `delete_spu_ids` | array | delete_spu_ids对应的响应列表,每个元素包含 `spu_id` 和 `status` |
  1043 +| `delete_spu_ids[].status` | string | 状态:`deleted`(已删除)、`not_found`(ES中不存在)、`failed`(失败) |
  1044 +| `delete_spu_ids[].msg` | string | 当status为`failed`时,包含失败原因(可选) |
  1045 +| `total` | integer | 总处理数量(spu_ids数量 + delete_spu_ids数量) |
  1046 +| `success_count` | integer | 成功数量(indexed + deleted + not_found) |
  1047 +| `failed_count` | integer | 失败数量 |
  1048 +| `elapsed_time` | float | 耗时(秒) |
  1049 +| `index_name` | string | 索引名称 |
  1050 +| `tenant_id` | string | 租户ID |
  1051 +
  1052 +**状态说明**:
  1053 +- `spu_ids` 的状态:
  1054 + - `indexed`: SPU已成功索引到ES
  1055 + - `deleted`: SPU在数据库中被标记为deleted=1,已从ES删除(自动检测)
  1056 + - `failed`: 处理失败,会包含`msg`字段说明失败原因
  1057 +- `delete_spu_ids` 的状态:
  1058 + - `deleted`: SPU已从ES成功删除
  1059 + - `not_found`: SPU在ES中不存在(也算成功,可能已经被删除过)
  1060 + - `failed`: 删除失败,会包含`msg`字段说明失败原因
  1061 +
  1062 +#### 请求示例
  1063 +
  1064 +**示例1:普通增量索引(自动检测删除)**:
  1065 +```bash
  1066 +curl -X POST "http://localhost:6004/indexer/index" \
  1067 + -H "Content-Type: application/json" \
  1068 + -d '{
  1069 + "tenant_id": "162",
  1070 + "spu_ids": ["123", "456", "789"]
  1071 + }'
  1072 +```
  1073 +说明:如果SPU 456在数据库中`deleted=1`,会自动从ES删除,在响应中`spu_ids`列表里456的状态为`deleted`。
  1074 +
  1075 +**示例2:显式删除(批量删除)**:
  1076 +```bash
  1077 +curl -X POST "http://localhost:6004/indexer/index" \
  1078 + -H "Content-Type: application/json" \
  1079 + -d '{
  1080 + "tenant_id": "162",
  1081 + "spu_ids": ["123", "456"],
  1082 + "delete_spu_ids": ["100", "101", "102"]
  1083 + }'
  1084 +```
  1085 +说明:SPU 100、101、102会被显式删除,无论数据库状态如何。
  1086 +
  1087 +**示例3:仅删除(不索引)**:
  1088 +```bash
  1089 +curl -X POST "http://localhost:6004/indexer/index" \
  1090 + -H "Content-Type: application/json" \
  1091 + -d '{
  1092 + "tenant_id": "162",
  1093 + "spu_ids": [],
  1094 + "delete_spu_ids": ["100", "101"]
  1095 + }'
  1096 +```
  1097 +说明:只执行删除操作,不进行索引。
  1098 +
  1099 +**示例4:混合操作(索引+删除)**:
  1100 +```bash
  1101 +curl -X POST "http://localhost:6004/indexer/index" \
  1102 + -H "Content-Type: application/json" \
  1103 + -d '{
  1104 + "tenant_id": "162",
  1105 + "spu_ids": ["123", "456", "789"],
  1106 + "delete_spu_ids": ["100", "101"]
  1107 + }'
  1108 +```
  1109 +说明:同时执行索引和删除操作。
  1110 +
  1111 +#### 日志说明
  1112 +
  1113 +增量索引操作的所有关键信息都会记录到 `logs/indexer.log` 文件中(JSON格式),包括:
  1114 +- 请求开始和结束时间
  1115 +- 每个SPU的处理状态(获取、转换、索引、删除)
  1116 +- ES批量写入结果
  1117 +- 成功/失败统计
  1118 +- 详细的错误信息
  1119 +
  1120 +日志查询方式请参考[5.1节查看索引日志](#51-全量索引接口)部分。
  1121 +
  1122 +### 5.3 查询文档接口
  1123 +
  1124 +- **端点**: `POST /indexer/documents`
  1125 +- **描述**: 查询文档接口,根据SPU ID列表获取ES文档数据(**不写入ES**)。用于查看、调试或验证SPU数据。
  1126 +
  1127 +#### 请求参数
  1128 +
  1129 +```json
  1130 +{
  1131 + "tenant_id": "162",
  1132 + "spu_ids": ["123", "456", "789"]
  1133 +}
  1134 +```
  1135 +
  1136 +| 参数 | 类型 | 必填 | 说明 |
  1137 +|------|------|------|------|
  1138 +| `tenant_id` | string | Y | 租户ID |
  1139 +| `spu_ids` | array[string] | Y | SPU ID列表(1-100个) |
  1140 +
  1141 +#### 响应格式
  1142 +
  1143 +```json
  1144 +{
  1145 + "success": [
  1146 + {
  1147 + "spu_id": "123",
  1148 + "document": {
  1149 + "tenant_id": "162",
  1150 + "spu_id": "123",
  1151 + "title": {
  1152 + "zh": "商品标题"
  1153 + },
  1154 + ...
  1155 + }
  1156 + },
  1157 + {
  1158 + "spu_id": "456",
  1159 + "document": {...}
  1160 + }
  1161 + ],
  1162 + "failed": [
  1163 + {
  1164 + "spu_id": "789",
  1165 + "error": "SPU not found or deleted"
  1166 + }
  1167 + ],
  1168 + "total": 3,
  1169 + "success_count": 2,
  1170 + "failed_count": 1
  1171 +}
  1172 +```
  1173 +
  1174 +| 字段 | 类型 | 说明 |
  1175 +|------|------|------|
  1176 +| `success` | array | 成功获取的SPU列表,每个元素包含 `spu_id` 和 `document`(完整的ES文档数据) |
  1177 +| `failed` | array | 失败的SPU列表,每个元素包含 `spu_id` 和 `error`(失败原因) |
  1178 +| `total` | integer | 总SPU数量 |
  1179 +| `success_count` | integer | 成功数量 |
  1180 +| `failed_count` | integer | 失败数量 |
  1181 +
  1182 +#### 请求示例
  1183 +
  1184 +**单个SPU查询**:
  1185 +```bash
  1186 +curl -X POST "http://localhost:6004/indexer/documents" \
  1187 + -H "Content-Type: application/json" \
  1188 + -d '{
  1189 + "tenant_id": "162",
  1190 + "spu_ids": ["123"]
  1191 + }'
  1192 +```
  1193 +
  1194 +**批量SPU查询**:
  1195 +```bash
  1196 +curl -X POST "http://localhost:6004/indexer/documents" \
  1197 + -H "Content-Type: application/json" \
  1198 + -d '{
  1199 + "tenant_id": "162",
  1200 + "spu_ids": ["123", "456", "789"]
  1201 + }'
  1202 +```
  1203 +
  1204 +#### 与 `/indexer/index` 的区别
  1205 +
  1206 +| 接口 | 功能 | 是否写入ES | 返回内容 |
  1207 +|------|------|-----------|----------|
  1208 +| `/indexer/documents` | 查询SPU文档数据 | 否 | 返回完整的ES文档数据 |
  1209 +| `/indexer/index` | 增量索引 | 是 | 返回成功/失败列表和统计信息 |
  1210 +
  1211 +**使用场景**:
  1212 +- `/indexer/documents`:用于查看、调试或验证SPU数据,不修改ES索引
  1213 +- `/indexer/index`:用于实际的增量索引操作,将更新的SPU数据同步到ES
  1214 +
  1215 +### 5.4 索引健康检查接口
  1216 +
  1217 +- **端点**: `GET /indexer/health`
  1218 +- **描述**: 检查索引服务的健康状态
  1219 +
  1220 +#### 响应格式
  1221 +
  1222 +```json
  1223 +{
  1224 + "status": "available",
  1225 + "database": "connected",
  1226 + "preloaded_data": {
  1227 + "category_mappings": 150
  1228 + }
  1229 +}
  1230 +```
  1231 +
  1232 +#### 请求示例
  1233 +
  1234 +```bash
  1235 +curl -X GET "http://localhost:6004/indexer/health"
  1236 +```
  1237 +
  1238 +---
  1239 +
  1240 +## 管理接口
  1241 +
  1242 +### 6.1 健康检查
  1243 +
  1244 +- **端点**: `GET /admin/health`
  1245 +- **描述**: 检查服务与依赖(如 Elasticsearch)状态。
  1246 +
  1247 +```json
  1248 +{
  1249 + "status": "healthy",
  1250 + "elasticsearch": "connected",
  1251 + "tenant_id": "tenant1"
  1252 +}
  1253 +```
  1254 +
  1255 +### 6.2 获取配置
  1256 +
  1257 +- **端点**: `GET /admin/config`
  1258 +- **描述**: 返回当前租户的脱敏配置,便于核对索引及排序表达式。
  1259 +
  1260 +```json
  1261 +{
  1262 + "tenant_id": "tenant1",
  1263 + "tenant_name": "Tenant1 Test Instance",
  1264 + "es_index_name": "search_tenant1",
  1265 + "num_fields": 20,
  1266 + "num_indexes": 4,
  1267 + "supported_languages": ["zh", "en", "ru"],
  1268 + "ranking_expression": "bm25() + 0.2*text_embedding_relevance()",
  1269 + "spu_enabled": false
  1270 +}
  1271 +```
  1272 +
  1273 +### 6.3 索引统计
  1274 +
  1275 +- **端点**: `GET /admin/stats`
  1276 +- **描述**: 获取索引文档数量与磁盘大小,方便监控。
  1277 +
  1278 +```json
  1279 +{
  1280 + "index_name": "search_tenant1",
  1281 + "document_count": 10000,
  1282 + "size_mb": 523.45
  1283 +}
  1284 +```
  1285 +
  1286 +---
  1287 +
  1288 +## 常见场景示例
  1289 +
  1290 +### 7.1 基础搜索与排序
  1291 +
  1292 +**按价格从低到高排序**:
  1293 +```json
  1294 +{
  1295 + "query": "玩具",
  1296 + "size": 20,
  1297 + "from": 0,
  1298 + "sort_by": "price",
  1299 + "sort_order": "asc"
  1300 +}
  1301 +```
  1302 +
  1303 +**按价格从高到低排序**:
  1304 +```json
  1305 +{
  1306 + "query": "玩具",
  1307 + "size": 20,
  1308 + "from": 0,
  1309 + "sort_by": "price",
  1310 + "sort_order": "desc"
  1311 +}
  1312 +```
  1313 +
  1314 +**按销量从高到低排序**:
  1315 +```json
  1316 +{
  1317 + "query": "玩具",
  1318 + "size": 20,
  1319 + "from": 0,
  1320 + "sort_by": "sales",
  1321 + "sort_order": "desc"
  1322 +}
  1323 +```
  1324 +
  1325 +**按默认(相关性)排序**:
  1326 +```json
  1327 +{
  1328 + "query": "玩具",
  1329 + "size": 20,
  1330 + "from": 0
  1331 +}
  1332 +```
  1333 +
  1334 +### 7.2 过滤搜索
  1335 +
  1336 +**需求**: 搜索"玩具",筛选类目为"益智玩具",价格在50-200之间
  1337 +
  1338 +```json
  1339 +{
  1340 + "query": "玩具",
  1341 + "size": 20,
  1342 + "language": "zh",
  1343 + "filters": {
  1344 + "category_name": "益智玩具"
  1345 + },
  1346 + "range_filters": {
  1347 + "min_price": {
  1348 + "gte": 50,
  1349 + "lte": 200
  1350 + }
  1351 + }
  1352 +}
  1353 +```
  1354 +
  1355 +**需求**: 搜索"手机",筛选多个品牌,价格范围
  1356 +
  1357 +```json
  1358 +{
  1359 + "query": "手机",
  1360 + "size": 20,
  1361 + "language": "zh",
  1362 + "filters": {
  1363 + "vendor.zh.keyword": ["品牌A", "品牌B"]
  1364 + },
  1365 + "range_filters": {
  1366 + "min_price": {
  1367 + "gte": 50,
  1368 + "lte": 200
  1369 + }
  1370 + }
  1371 +}
  1372 +```
  1373 +
  1374 +### 7.3 分面搜索
  1375 +
  1376 +**需求**: 搜索"玩具",获取类目和规格的分面统计,用于构建筛选器
  1377 +
  1378 +```json
  1379 +{
  1380 + "query": "玩具",
  1381 + "size": 20,
  1382 + "language": "zh",
  1383 + "facets": [
  1384 + {"field": "category1_name", "size": 15, "type": "terms"},
  1385 + {"field": "category2_name", "size": 10, "type": "terms"},
  1386 + {"field": "specifications", "size": 10, "type": "terms"}
  1387 + ]
  1388 +}
  1389 +```
  1390 +
  1391 +**需求**: 搜索"手机",获取价格区间和规格的分面统计
  1392 +
  1393 +```json
  1394 +{
  1395 + "query": "手机",
  1396 + "size": 20,
  1397 + "language": "zh",
  1398 + "facets": [
  1399 + {
  1400 + "field": "min_price",
  1401 + "type": "range",
  1402 + "ranges": [
  1403 + {"key": "0-50", "to": 50},
  1404 + {"key": "50-100", "from": 50, "to": 100},
  1405 + {"key": "100-200", "from": 100, "to": 200},
  1406 + {"key": "200+", "from": 200}
  1407 + ]
  1408 + },
  1409 + {
  1410 + "field": "specifications",
  1411 + "size": 10,
  1412 + "type": "terms"
  1413 + }
  1414 + ]
  1415 +}
  1416 +```
  1417 +
  1418 +### 7.4 规格过滤与分面
  1419 +
  1420 +**需求**: 搜索"手机",筛选color为"white"的商品
  1421 +
  1422 +```json
  1423 +{
  1424 + "query": "手机",
  1425 + "size": 20,
  1426 + "language": "zh",
  1427 + "filters": {
  1428 + "specifications": {
  1429 + "name": "color",
  1430 + "value": "white"
  1431 + }
  1432 + }
  1433 +}
  1434 +```
  1435 +
  1436 +**需求**: 搜索"手机",筛选color为"white"且size为"256GB"的商品
  1437 +
  1438 +```json
  1439 +{
  1440 + "query": "手机",
  1441 + "size": 20,
  1442 + "language": "zh",
  1443 + "filters": {
  1444 + "specifications": [
  1445 + {"name": "color", "value": "white"},
  1446 + {"name": "size", "value": "256GB"}
  1447 + ]
  1448 + }
  1449 +}
  1450 +```
  1451 +
  1452 +**需求**: 搜索"手机",筛选size为"3"、"4"或"5",且color为"green"的商品
  1453 +
  1454 +```json
  1455 +{
  1456 + "query": "手机",
  1457 + "size": 20,
  1458 + "language": "zh",
  1459 + "filters": {
  1460 + "specifications": [
  1461 + {"name": "size", "value": "3"},
  1462 + {"name": "size", "value": "4"},
  1463 + {"name": "size", "value": "5"},
  1464 + {"name": "color", "value": "green"}
  1465 + ]
  1466 + }
  1467 +}
  1468 +```
  1469 +
  1470 +**需求**: 搜索"手机",获取所有规格的分面统计
  1471 +
  1472 +```json
  1473 +{
  1474 + "query": "手机",
  1475 + "size": 20,
  1476 + "language": "zh",
  1477 + "facets": [
  1478 + {"field": "specifications", "size": 10, "type": "terms"}
  1479 + ]
  1480 +}
  1481 +```
  1482 +
  1483 +**需求**: 只获取"color"和"size"规格的分面统计
  1484 +
  1485 +```json
  1486 +{
  1487 + "query": "手机",
  1488 + "size": 20,
  1489 + "language": "zh",
  1490 + "facets": [
  1491 + {"field": "specifications.color", "size": 20, "type": "terms"},
  1492 + {"field": "specifications.size", "size": 15, "type": "terms"}
  1493 + ]
  1494 +}
  1495 +```
  1496 +
  1497 +**需求**: 搜索"手机",筛选类目和规格,并获取对应的分面统计
  1498 +
  1499 +```json
  1500 +{
  1501 + "query": "手机",
  1502 + "size": 20,
  1503 + "language": "zh",
  1504 + "filters": {
  1505 + "category_name": "手机",
  1506 + "specifications": {
  1507 + "name": "color",
  1508 + "value": "white"
  1509 + }
  1510 + },
  1511 + "facets": [
  1512 + {"field": "category1_name", "size": 15, "type": "terms"},
  1513 + {"field": "category2_name", "size": 10, "type": "terms"},
  1514 + {"field": "specifications.color", "size": 20, "type": "terms"},
  1515 + {"field": "specifications.size", "size": 15, "type": "terms"}
  1516 + ]
  1517 +}
  1518 +```
  1519 +
  1520 +### 7.5 SKU筛选
  1521 +
  1522 +**需求**: 搜索"芭比娃娃",每个SPU下按颜色筛选,每种颜色只显示一个SKU
  1523 +
  1524 +```json
  1525 +{
  1526 + "query": "芭比娃娃",
  1527 + "size": 20,
  1528 + "sku_filter_dimension": ["color"]
  1529 +}
  1530 +```
  1531 +
  1532 +**说明**:
  1533 +- 如果 `option1_name` 为 `"color"`,则使用 `sku_filter_dimension: ["color"]` 可以按颜色分组
  1534 +- 每个SPU下,每种颜色只会返回第一个SKU
  1535 +- 如果维度不匹配,返回所有SKU(不进行过滤)
  1536 +
  1537 +### 7.6 布尔表达式搜索
  1538 +
  1539 +**需求**: 搜索包含"手机"和"智能"的商品,排除"二手"
  1540 +
  1541 +```json
  1542 +{
  1543 + "query": "手机 AND 智能 ANDNOT 二手",
  1544 + "size": 20
  1545 +}
  1546 +```
  1547 +
  1548 +### 7.7 分页查询
  1549 +
  1550 +**需求**: 获取第2页结果(每页20条)
  1551 +
  1552 +```json
  1553 +{
  1554 + "query": "手机",
  1555 + "size": 20,
  1556 + "from": 20
  1557 +}
  1558 +```
  1559 +
  1560 +---
  1561 +
  1562 +## 数据模型
  1563 +
  1564 +### 8.1 商品字段定义
  1565 +
  1566 +| 字段名 | 类型 | 描述 |
  1567 +|--------|------|------|
  1568 +| `tenant_id` | keyword | 租户ID(多租户隔离) |
  1569 +| `spu_id` | keyword | SPU ID |
  1570 +| `title.<lang>` | object/text | 商品标题(多语言对象,如 `title.zh`, `title.en`) |
  1571 +| `brief.<lang>` | object/text | 商品短描述(多语言对象,如 `brief.zh`, `brief.en`) |
  1572 +| `description.<lang>` | object/text | 商品详细描述(多语言对象,如 `description.zh`, `description.en`) |
  1573 +| `vendor.<lang>` | object/text | 供应商/品牌(多语言对象,且带 keyword 子字段,如 `vendor.zh.keyword`) |
  1574 +| `category_path.<lang>` | object/text | 类目路径(多语言对象,用于搜索,如 `category_path.zh`) |
  1575 +| `category_name_text.<lang>` | object/text | 类目名称(多语言对象,用于搜索,如 `category_name_text.zh`) |
  1576 +| `category_id` | keyword | 类目ID |
  1577 +| `category_name` | keyword | 类目名称(用于过滤) |
  1578 +| `category_level` | integer | 类目层级 |
  1579 +| `category1_name`, `category2_name`, `category3_name` | keyword | 多级类目名称(用于过滤和分面) |
  1580 +| `tags` | keyword | 标签(数组) |
  1581 +| `specifications` | nested | 规格(嵌套对象数组) |
  1582 +| `option1_name`, `option2_name`, `option3_name` | keyword | 选项名称 |
  1583 +| `min_price`, `max_price` | float | 最低/最高价格 |
  1584 +| `compare_at_price` | float | 原价 |
  1585 +| `sku_prices` | float | SKU价格列表(数组) |
  1586 +| `sku_weights` | long | SKU重量列表(数组) |
  1587 +| `sku_weight_units` | keyword | SKU重量单位列表(数组) |
  1588 +| `total_inventory` | long | 总库存 |
  1589 +| `sales` | long | 销量(展示销量) |
  1590 +| `skus` | nested | SKU详细信息(嵌套对象数组) |
  1591 +| `create_time`, `update_time` | date | 创建/更新时间 |
  1592 +| `title_embedding` | dense_vector | 标题向量(1024维,仅用于搜索) |
  1593 +| `image_embedding` | nested | 图片向量(嵌套,仅用于搜索) |
  1594 +
  1595 +> 所有租户共享统一的索引结构。文本字段支持中英文双语,后端根据 `language` 参数自动选择对应字段返回。
  1596 +
  1597 +### 8.2 字段类型速查
  1598 +
  1599 +| 类型 | ES Mapping | 用途 |
  1600 +|------|------------|------|
  1601 +| `text` | `text` | 全文检索(支持中英文分析器) |
  1602 +| `keyword` | `keyword` | 精确匹配、聚合、排序 |
  1603 +| `integer` | `integer` | 整数 |
  1604 +| `long` | `long` | 长整数 |
  1605 +| `float` | `float` | 浮点数 |
  1606 +| `date` | `date` | 日期时间 |
  1607 +| `nested` | `nested` | 嵌套对象(specifications, skus, image_embedding) |
  1608 +| `dense_vector` | `dense_vector` | 向量字段(title_embedding,仅用于搜索) |
  1609 +
  1610 +### 8.3 常用字段列表
  1611 +
  1612 +#### 过滤字段
  1613 +
  1614 +- `category_name`: 类目名称
  1615 +- `category1_name`, `category2_name`, `category3_name`: 多级类目
  1616 +- `category_id`: 类目ID
  1617 +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段)
  1618 +- `tags`: 标签(keyword类型)
  1619 +- `option1_name`, `option2_name`, `option3_name`: 选项名称
  1620 +- `specifications`: 规格过滤(嵌套字段,格式见[过滤器详解](#33-过滤器详解))
  1621 +
  1622 +#### 范围字段
  1623 +
  1624 +- `min_price`: 最低价格
  1625 +- `max_price`: 最高价格
  1626 +- `compare_at_price`: 原价
  1627 +- `create_time`: 创建时间
  1628 +- `update_time`: 更新时间
  1629 +
  1630 +#### 排序字段
  1631 +
  1632 +- `price`: 价格(后端自动根据sort_order映射:asc→min_price,desc→max_price)
  1633 +- `sales`: 销量
  1634 +- `create_time`: 创建时间
  1635 +- `update_time`: 更新时间
  1636 +- `relevance_score`: 相关性分数(默认,不指定sort_by时使用)
  1637 +
  1638 +**注意**: 前端只需传 `price`,后端会自动处理:
  1639 +- `sort_by: "price"` + `sort_order: "asc"` → 按 `min_price` 升序(价格从低到高)
  1640 +- `sort_by: "price"` + `sort_order: "desc"` → 按 `max_price` 降序(价格从高到低)
  1641 +
  1642 +### 8.4 支持的分析器
  1643 +
  1644 +| 分析器 | 语言 | 描述 |
  1645 +|--------|------|------|
  1646 +| `index_ansj` | 中文 | 中文索引分析器(用于中文字段) |
  1647 +| `query_ansj` | 中文 | 中文查询分析器(用于中文字段) |
  1648 +| `hanlp_index` ⚠️ TODO(暂不支持) | 中文 | 中文索引分析器(用于中文字段) |
  1649 +| `hanlp_standard` ⚠️ TODO(暂不支持) | 中文 | 中文查询分析器(用于中文字段) |
  1650 +| `english` | 英文 | 标准英文分析器(用于英文字段) |
  1651 +| `lowercase` | - | 小写标准化器(用于keyword子字段) |
... ...
requirements.txt
... ... @@ -12,13 +12,9 @@ langchain-openai>=0.2.0
12 12 langgraph>=1.0.0
13 13 openai>=1.12.0
14 14  
15   -# Embeddings & Vision
16   -clip-client>=3.5.0 # CLIP-as-Service client
  15 +# Vision (VLM image analysis)
17 16 Pillow>=10.2.0 # Image processing
18 17  
19   -# Vector Database
20   -pymilvus>=2.3.6
21   -
22 18 # Databases
23 19 pymongo>=4.6.1
24 20  
... ...
scripts/check_services.sh
1 1 #!/usr/bin/env bash
2 2 # =============================================================================
3 3 # OmniShopAgent - 服务健康检查脚本
4   -# 检查 Milvus、CLIP、Streamlit 等依赖服务状态
  4 +# 检查 Streamlit、Search API 等依赖
5 5 # =============================================================================
6 6 set -euo pipefail
7 7  
... ... @@ -49,40 +49,16 @@ else
49 49 echo -e "${RED}FAIL${NC} 未找到"
50 50 fi
51 51  
52   -# 4. Milvus
53   -echo -n "[Milvus] "
54   -if command -v docker &>/dev/null; then
55   - if docker ps --format '{{.Names}}' 2>/dev/null | grep -q milvus-standalone; then
56   - if curl -s -o /dev/null -w "%{http_code}" http://localhost:9091/healthz 2>/dev/null | grep -q 200; then
57   - echo -e "${GREEN}OK${NC} localhost:19530"
58   - else
59   - echo -e "${YELLOW}WARN${NC} 容器运行中,健康检查未响应"
60   - fi
61   - else
62   - echo -e "${YELLOW}WARN${NC} 未运行 (docker compose up -d)"
63   - fi
64   -else
65   - echo -e "${YELLOW}SKIP${NC} Docker 未安装"
66   -fi
67   -
68   -# 5. CLIP 服务(可选)
69   -echo -n "[CLIP] "
70   -if timeout 2 bash -c 'echo >/dev/tcp/localhost/51000' 2>/dev/null; then
71   - echo -e "${GREEN}OK${NC} localhost:51000"
72   -else
73   - echo -e "${YELLOW}WARN${NC} 未运行 (图像搜索需启动: python -m clip_server launch)"
74   -fi
75   -
76   -# 6. 数据目录
  52 +# 4. 数据目录(可选,用于图片上传)
77 53 echo -n "[数据] "
78 54 if [ -d "$PROJECT_ROOT/data/images" ] && [ -f "$PROJECT_ROOT/data/styles.csv" ]; then
79 55 IMG_COUNT=$(find "$PROJECT_ROOT/data/images" -name "*.jpg" 2>/dev/null | wc -l)
80 56 echo -e "${GREEN}OK${NC} $IMG_COUNT 张图片"
81 57 else
82   - echo -e "${YELLOW}WARN${NC} 未找到 data/images 或 data/styles.csv (运行 download_dataset.py)"
  58 + echo -e "${YELLOW}WARN${NC} 未找到 data/images 或 data/styles.csv (可选,用于图片风格分析)"
83 59 fi
84 60  
85   -# 7. Streamlit
  61 +# 5. Streamlit
86 62 echo -n "[Streamlit] "
87 63 if pgrep -f "streamlit run app.py" >/dev/null 2>&1; then
88 64 echo -e "${GREEN}OK${NC} 运行中"
... ...
scripts/index_data.py deleted
... ... @@ -1,467 +0,0 @@
1   -"""
2   -Data Indexing Script
3   -Generates embeddings for products and stores them in Milvus
4   -"""
5   -
6   -import csv
7   -import logging
8   -import os
9   -import sys
10   -from pathlib import Path
11   -from typing import Any, Dict, Optional
12   -
13   -from tqdm import tqdm
14   -
15   -# Add parent directory to path
16   -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17   -
18   -# Import config and settings first
19   -# Direct imports from files to avoid __init__.py circular issues
20   -import importlib.util
21   -
22   -from app.config import get_absolute_path, settings
23   -
24   -
25   -def load_service_module(module_name, file_name):
26   - """Load a service module directly from file"""
27   - spec = importlib.util.spec_from_file_location(
28   - module_name,
29   - os.path.join(
30   - os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
31   - f"app/services/{file_name}",
32   - ),
33   - )
34   - module = importlib.util.module_from_spec(spec)
35   - spec.loader.exec_module(module)
36   - return module
37   -
38   -
39   -embedding_module = load_service_module("embedding_service", "embedding_service.py")
40   -milvus_module = load_service_module("milvus_service", "milvus_service.py")
41   -
42   -EmbeddingService = embedding_module.EmbeddingService
43   -MilvusService = milvus_module.MilvusService
44   -
45   -# Configure logging
46   -logging.basicConfig(
47   - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
48   -)
49   -logger = logging.getLogger(__name__)
50   -
51   -
52   -class DataIndexer:
53   - """Index product data by generating and storing embeddings"""
54   -
55   - def __init__(self):
56   - """Initialize services"""
57   - self.embedding_service = EmbeddingService()
58   - self.milvus_service = MilvusService()
59   -
60   - self.image_dir = Path(get_absolute_path(settings.image_data_path))
61   - self.styles_csv = get_absolute_path("./data/styles.csv")
62   - self.images_csv = get_absolute_path("./data/images.csv")
63   -
64   - # Load product data from CSV
65   - self.products = self._load_products_from_csv()
66   -
67   - def _load_products_from_csv(self) -> Dict[int, Dict[str, Any]]:
68   - """Load products from CSV files"""
69   - products = {}
70   -
71   - # Load images mapping
72   - images_dict = {}
73   - with open(self.images_csv, "r", encoding="utf-8") as f:
74   - reader = csv.DictReader(f)
75   - for row in reader:
76   - product_id = int(row["filename"].split(".")[0])
77   - images_dict[product_id] = row["link"]
78   -
79   - # Load styles/products
80   - with open(self.styles_csv, "r", encoding="utf-8") as f:
81   - reader = csv.DictReader(f)
82   - for row in reader:
83   - try:
84   - product_id = int(row["id"])
85   - products[product_id] = {
86   - "id": product_id,
87   - "gender": row.get("gender", ""),
88   - "masterCategory": row.get("masterCategory", ""),
89   - "subCategory": row.get("subCategory", ""),
90   - "articleType": row.get("articleType", ""),
91   - "baseColour": row.get("baseColour", ""),
92   - "season": row.get("season", ""),
93   - "year": int(row["year"]) if row.get("year") else 0,
94   - "usage": row.get("usage", ""),
95   - "productDisplayName": row.get("productDisplayName", ""),
96   - "imageUrl": images_dict.get(product_id, ""),
97   - "imagePath": f"{product_id}.jpg",
98   - }
99   - except (ValueError, KeyError) as e:
100   - logger.warning(f"Error loading product {row.get('id')}: {e}")
101   - continue
102   -
103   - logger.info(f"Loaded {len(products)} products from CSV")
104   - return products
105   -
106   - def setup(self) -> None:
107   - """Setup connections and collections"""
108   - logger.info("Setting up services...")
109   -
110   - # Connect to CLIP server
111   - self.embedding_service.connect_clip()
112   - logger.info("✓ CLIP server connected")
113   -
114   - # Connect to Milvus
115   - self.milvus_service.connect()
116   - logger.info("✓ Milvus connected")
117   -
118   - # Create Milvus collections
119   - self.milvus_service.create_text_collection(recreate=False)
120   - self.milvus_service.create_image_collection(recreate=False)
121   - logger.info("✓ Milvus collections ready")
122   -
123   - def teardown(self) -> None:
124   - """Close all connections"""
125   - logger.info("Closing connections...")
126   - self.embedding_service.disconnect_clip()
127   - self.milvus_service.disconnect()
128   - logger.info("✓ All connections closed")
129   -
130   - def index_text_embeddings(
131   - self, batch_size: int = 100, skip: int = 0, limit: Optional[int] = None
132   - ) -> Dict[str, int]:
133   - """Generate and store text embeddings for products
134   -
135   - Args:
136   - batch_size: Number of products to process at once
137   - skip: Number of products to skip
138   - limit: Maximum number of products to process (None for all)
139   -
140   - Returns:
141   - Dictionary with indexing statistics
142   - """
143   - logger.info("Starting text embedding indexing...")
144   -
145   - # Get products list
146   - product_ids = list(self.products.keys())[skip:]
147   - if limit:
148   - product_ids = product_ids[:limit]
149   -
150   - total_products = len(product_ids)
151   - processed = 0
152   - inserted = 0
153   - errors = 0
154   -
155   - with tqdm(total=total_products, desc="Indexing text embeddings") as pbar:
156   - while processed < total_products:
157   - # Get batch of products
158   - current_batch_size = min(batch_size, total_products - processed)
159   - batch_ids = product_ids[processed : processed + current_batch_size]
160   - products = [self.products[pid] for pid in batch_ids]
161   -
162   - if not products:
163   - break
164   -
165   - try:
166   - # Prepare texts for embedding
167   - texts = []
168   - text_mappings = []
169   -
170   - for product in products:
171   - # Create text representation of product
172   - text = self._create_product_text(product)
173   - texts.append(text)
174   - text_mappings.append(
175   - {"product_id": product["id"], "text": text}
176   - )
177   -
178   - # Generate embeddings
179   - embeddings = self.embedding_service.get_text_embeddings_batch(
180   - texts, batch_size=50 # OpenAI batch size
181   - )
182   -
183   - # Prepare data for Milvus (with metadata)
184   - milvus_data = []
185   - for idx, (mapping, embedding) in enumerate(
186   - zip(text_mappings, embeddings)
187   - ):
188   - product_id = mapping["product_id"]
189   - product = self.products[product_id]
190   -
191   - milvus_data.append(
192   - {
193   - "id": product_id,
194   - "text": mapping["text"][
195   - :2000
196   - ], # Truncate to max length
197   - "embedding": embedding,
198   - # Product metadata
199   - "productDisplayName": product["productDisplayName"][
200   - :500
201   - ],
202   - "gender": product["gender"][:50],
203   - "masterCategory": product["masterCategory"][:100],
204   - "subCategory": product["subCategory"][:100],
205   - "articleType": product["articleType"][:100],
206   - "baseColour": product["baseColour"][:50],
207   - "season": product["season"][:50],
208   - "usage": product["usage"][:50],
209   - "year": product["year"],
210   - "imageUrl": product["imageUrl"],
211   - "imagePath": product["imagePath"],
212   - }
213   - )
214   -
215   - # Insert into Milvus
216   - count = self.milvus_service.insert_text_embeddings(milvus_data)
217   - inserted += count
218   -
219   - except Exception as e:
220   - logger.error(
221   - f"Error processing text batch at offset {processed}: {e}"
222   - )
223   - errors += len(products)
224   -
225   - processed += len(products)
226   - pbar.update(len(products))
227   -
228   - stats = {"total_processed": processed, "inserted": inserted, "errors": errors}
229   -
230   - logger.info(f"Text embedding indexing completed: {stats}")
231   - return stats
232   -
233   - def index_image_embeddings(
234   - self, batch_size: int = 32, skip: int = 0, limit: Optional[int] = None
235   - ) -> Dict[str, int]:
236   - """Generate and store image embeddings for products
237   -
238   - Args:
239   - batch_size: Number of images to process at once
240   - skip: Number of products to skip
241   - limit: Maximum number of products to process (None for all)
242   -
243   - Returns:
244   - Dictionary with indexing statistics
245   - """
246   - logger.info("Starting image embedding indexing...")
247   -
248   - # Get products list
249   - product_ids = list(self.products.keys())[skip:]
250   - if limit:
251   - product_ids = product_ids[:limit]
252   -
253   - total_products = len(product_ids)
254   - processed = 0
255   - inserted = 0
256   - errors = 0
257   -
258   - with tqdm(total=total_products, desc="Indexing image embeddings") as pbar:
259   - while processed < total_products:
260   - # Get batch of products
261   - current_batch_size = min(batch_size, total_products - processed)
262   - batch_ids = product_ids[processed : processed + current_batch_size]
263   - products = [self.products[pid] for pid in batch_ids]
264   -
265   - if not products:
266   - break
267   -
268   - try:
269   - # Prepare image paths
270   - image_paths = []
271   - image_mappings = []
272   -
273   - for product in products:
274   - image_path = self.image_dir / product["imagePath"]
275   - image_paths.append(image_path)
276   - image_mappings.append(
277   - {
278   - "product_id": product["id"],
279   - "image_path": product["imagePath"],
280   - }
281   - )
282   -
283   - # Generate embeddings
284   - embeddings = self.embedding_service.get_image_embeddings_batch(
285   - image_paths, batch_size=batch_size
286   - )
287   -
288   - # Prepare data for Milvus (with metadata)
289   - milvus_data = []
290   - for idx, (mapping, embedding) in enumerate(
291   - zip(image_mappings, embeddings)
292   - ):
293   - if embedding is not None:
294   - product_id = mapping["product_id"]
295   - product = self.products[product_id]
296   -
297   - milvus_data.append(
298   - {
299   - "id": product_id,
300   - "image_path": mapping["image_path"],
301   - "embedding": embedding,
302   - # Product metadata
303   - "productDisplayName": product["productDisplayName"][
304   - :500
305   - ],
306   - "gender": product["gender"][:50],
307   - "masterCategory": product["masterCategory"][:100],
308   - "subCategory": product["subCategory"][:100],
309   - "articleType": product["articleType"][:100],
310   - "baseColour": product["baseColour"][:50],
311   - "season": product["season"][:50],
312   - "usage": product["usage"][:50],
313   - "year": product["year"],
314   - "imageUrl": product["imageUrl"],
315   - }
316   - )
317   - else:
318   - errors += 1
319   -
320   - # Insert into Milvus
321   - if milvus_data:
322   - count = self.milvus_service.insert_image_embeddings(milvus_data)
323   - inserted += count
324   -
325   - except Exception as e:
326   - logger.error(
327   - f"Error processing image batch at offset {processed}: {e}"
328   - )
329   - errors += len(products)
330   -
331   - processed += len(products)
332   - pbar.update(len(products))
333   -
334   - stats = {"total_processed": processed, "inserted": inserted, "errors": errors}
335   -
336   - logger.info(f"Image embedding indexing completed: {stats}")
337   - return stats
338   -
339   - def _create_product_text(self, product: Dict[str, Any]) -> str:
340   - """Create text representation of product for embedding
341   -
342   - Args:
343   - product: Product document
344   -
345   - Returns:
346   - Text representation
347   - """
348   - # Create a natural language description
349   - parts = [
350   - product.get("productDisplayName", ""),
351   - f"Gender: {product.get('gender', '')}",
352   - f"Category: {product.get('masterCategory', '')} > {product.get('subCategory', '')}",
353   - f"Type: {product.get('articleType', '')}",
354   - f"Color: {product.get('baseColour', '')}",
355   - f"Season: {product.get('season', '')}",
356   - f"Usage: {product.get('usage', '')}",
357   - ]
358   -
359   - text = " | ".join(
360   - [p for p in parts if p and p != "Gender: " and p != "Color: "]
361   - )
362   - return text
363   -
364   - def get_stats(self) -> Dict[str, Any]:
365   - """Get indexing statistics
366   -
367   - Returns:
368   - Dictionary with statistics
369   - """
370   - text_stats = self.milvus_service.get_collection_stats(
371   - self.milvus_service.text_collection_name
372   - )
373   - image_stats = self.milvus_service.get_collection_stats(
374   - self.milvus_service.image_collection_name
375   - )
376   -
377   - return {
378   - "total_products": len(self.products),
379   - "milvus_text": text_stats,
380   - "milvus_image": image_stats,
381   - }
382   -
383   -
384   -def main():
385   - """Main function"""
386   - import argparse
387   -
388   - parser = argparse.ArgumentParser(description="Index product data for search")
389   - parser.add_argument(
390   - "--mode",
391   - choices=["text", "image", "both"],
392   - default="both",
393   - help="Which embeddings to index",
394   - )
395   - parser.add_argument(
396   - "--batch-size", type=int, default=100, help="Batch size for processing"
397   - )
398   - parser.add_argument(
399   - "--skip", type=int, default=0, help="Number of products to skip"
400   - )
401   - parser.add_argument(
402   - "--limit", type=int, default=None, help="Maximum number of products to process"
403   - )
404   - parser.add_argument("--stats", action="store_true", help="Show statistics only")
405   -
406   - args = parser.parse_args()
407   -
408   - # Create indexer
409   - indexer = DataIndexer()
410   -
411   - try:
412   - # Setup services
413   - indexer.setup()
414   -
415   - if args.stats:
416   - # Show statistics
417   - stats = indexer.get_stats()
418   - print("\n=== Indexing Statistics ===")
419   - print(f"\nTotal Products in CSV: {stats['total_products']}")
420   -
421   - print("\nMilvus Text Embeddings:")
422   - print(f" Collection: {stats['milvus_text']['collection_name']}")
423   - print(f" Total embeddings: {stats['milvus_text']['row_count']}")
424   -
425   - print("\nMilvus Image Embeddings:")
426   - print(f" Collection: {stats['milvus_image']['collection_name']}")
427   - print(f" Total embeddings: {stats['milvus_image']['row_count']}")
428   -
429   - print(
430   - f"\nCoverage: {stats['milvus_image']['row_count'] / stats['total_products'] * 100:.1f}%"
431   - )
432   - else:
433   - # Index data
434   - if args.mode in ["text", "both"]:
435   - logger.info("=== Indexing Text Embeddings ===")
436   - text_stats = indexer.index_text_embeddings(
437   - batch_size=args.batch_size, skip=args.skip, limit=args.limit
438   - )
439   - print(f"\nText Indexing Results: {text_stats}")
440   -
441   - if args.mode in ["image", "both"]:
442   - logger.info("=== Indexing Image Embeddings ===")
443   - image_stats = indexer.index_image_embeddings(
444   - batch_size=min(args.batch_size, 32), # Smaller batch for images
445   - skip=args.skip,
446   - limit=args.limit,
447   - )
448   - print(f"\nImage Indexing Results: {image_stats}")
449   -
450   - # Show final statistics
451   - logger.info("\n=== Final Statistics ===")
452   - stats = indexer.get_stats()
453   - print(f"Total products: {stats['total_products']}")
454   - print(f"Text embeddings: {stats['milvus_text']['row_count']}")
455   - print(f"Image embeddings: {stats['milvus_image']['row_count']}")
456   -
457   - except KeyboardInterrupt:
458   - logger.info("\nIndexing interrupted by user")
459   - except Exception as e:
460   - logger.error(f"Error during indexing: {e}", exc_info=True)
461   - sys.exit(1)
462   - finally:
463   - indexer.teardown()
464   -
465   -
466   -if __name__ == "__main__":
467   - main()
scripts/run_clip.sh deleted
... ... @@ -1,22 +0,0 @@
1   -#!/usr/bin/env bash
2   -# =============================================================================
3   -# OmniShopAgent - 启动 CLIP 图像向量服务
4   -# 图像搜索、以图搜图功能依赖此服务
5   -# =============================================================================
6   -set -euo pipefail
7   -
8   -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9   -PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
10   -VENV_DIR="${VENV_DIR:-$PROJECT_ROOT/venv}"
11   -
12   -cd "$PROJECT_ROOT"
13   -
14   -if [ -d "$VENV_DIR" ]; then
15   - set +u
16   - source "$VENV_DIR/bin/activate"
17   - set -u
18   -fi
19   -
20   -echo "启动 CLIP 服务 (端口 51000)..."
21   -echo "按 Ctrl+C 停止"
22   -exec python -m clip_server launch
scripts/run_milvus.sh deleted
... ... @@ -1,31 +0,0 @@
1   -#!/usr/bin/env bash
2   -# =============================================================================
3   -# OmniShopAgent - 启动 Milvus 向量数据库
4   -# 使用 Docker Compose 启动 Milvus 及相关依赖
5   -# =============================================================================
6   -set -euo pipefail
7   -
8   -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9   -PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
10   -
11   -cd "$PROJECT_ROOT"
12   -
13   -if ! command -v docker &>/dev/null; then
14   - echo "错误: 未安装 Docker。请先运行 setup_env_centos8.sh"
15   - exit 1
16   -fi
17   -
18   -echo "启动 Milvus..."
19   -docker compose up -d 2>/dev/null || docker-compose up -d 2>/dev/null || {
20   - echo "错误: 无法执行 docker compose。请确保已安装 Docker Compose"
21   - exit 1
22   -}
23   -
24   -echo "等待 Milvus 就绪 (约 60 秒)..."
25   -sleep 60
26   -
27   -if curl -s -o /dev/null -w "%{http_code}" http://localhost:9091/healthz 2>/dev/null | grep -q 200; then
28   - echo "Milvus 已就绪: localhost:19530"
29   -else
30   - echo "提示: Milvus 可能仍在启动,请稍后执行 check_services.sh 检查"
31   -fi
scripts/setup_env_centos8.sh
... ... @@ -41,9 +41,9 @@ sudo dnf install -y \
41 41 tar
42 42  
43 43 # -----------------------------------------------------------------------------
44   -# 2. 安装 Docker(用于 Milvus)
  44 +# 2. 检查 Docker(可选)
45 45 # -----------------------------------------------------------------------------
46   -echo "[2/4] 检查/安装 Docker..."
  46 +echo "[2/4] 检查 Docker..."
47 47 if ! command -v docker &>/dev/null; then
48 48 echo " 安装 Docker..."
49 49 sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo 2>/dev/null || {
... ... @@ -142,11 +142,9 @@ echo "=========================================="
142 142 echo "环境准备完成!"
143 143 echo "=========================================="
144 144 echo "下一步:"
145   -echo " 1. 编辑 .env 配置 OPENAI_API_KEY"
146   -echo " 2. 下载数据: python scripts/download_dataset.py"
147   -echo " 3. 启动 Milvus: ./scripts/run_milvus.sh"
148   -echo " 4. 索引数据: python scripts/index_data.py"
149   -echo " 5. 启动应用: ./scripts/start.sh"
  145 +echo " 1. 编辑 .env 配置 OPENAI_API_KEY、SEARCH_API_BASE_URL 等"
  146 +echo " 2. (可选)下载数据: python scripts/download_dataset.py"
  147 +echo " 3. 启动应用: ./scripts/start.sh"
150 148 echo ""
151 149 echo "激活虚拟环境: source $VENV_DIR/bin/activate"
152 150 echo "=========================================="
... ...
scripts/start.sh
1 1 #!/usr/bin/env bash
2 2 # =============================================================================
3 3 # OmniShopAgent - 启动脚本
4   -# 启动 Milvus、CLIP(可选)、Streamlit 应用
  4 +# 启动 Streamlit 应用
5 5 # =============================================================================
6 6 set -euo pipefail
7 7  
8 8 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9 9 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
10 10 VENV_DIR="${VENV_DIR:-$PROJECT_ROOT/venv}"
11   -STREAMLIT_PORT="${STREAMLIT_PORT:-8501}"
  11 +STREAMLIT_PORT="${STREAMLIT_PORT:-6008}"
12 12 STREAMLIT_HOST="${STREAMLIT_HOST:-0.0.0.0}"
13 13  
14 14 cd "$PROJECT_ROOT"
... ... @@ -27,30 +27,7 @@ echo "=========================================="
27 27 echo "OmniShopAgent 启动"
28 28 echo "=========================================="
29 29  
30   -# 1. 启动 Milvus(Docker)
31   -if command -v docker &>/dev/null; then
32   - echo "[1/3] 检查 Milvus..."
33   - if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q milvus-standalone; then
34   - echo " 启动 Milvus (docker compose)..."
35   - docker compose up -d 2>/dev/null || docker-compose up -d 2>/dev/null || {
36   - echo " 警告: 无法启动 Milvus,请手动执行: docker compose up -d"
37   - }
38   - echo " 等待 Milvus 就绪 (30s)..."
39   - sleep 30
40   - else
41   - echo " Milvus 已运行"
42   - fi
43   -else
44   - echo "[1/3] 跳过 Milvus: 未安装 Docker"
45   -fi
46   -
47   -# 2. 检查 CLIP(可选,图像搜索需要)
48   -echo "[2/3] 检查 CLIP 服务..."
49   -echo " 提示: 图像搜索需 CLIP。若未启动,请另开终端执行: python -m clip_server launch"
50   -echo " 文本搜索可无需 CLIP。"
51   -
52   -# 3. 启动 Streamlit
53   -echo "[3/3] 启动 Streamlit (端口 $STREAMLIT_PORT)..."
  30 +echo "[1/1] 启动 Streamlit (端口 $STREAMLIT_PORT)..."
54 31 echo ""
55 32 echo " 访问: http://$STREAMLIT_HOST:$STREAMLIT_PORT"
56 33 echo " 按 Ctrl+C 停止"
... ...
scripts/stop.sh
1 1 #!/usr/bin/env bash
2 2 # =============================================================================
3 3 # OmniShopAgent - 停止脚本
4   -# 停止 Streamlit 进程及 Milvus 容器
  4 +# 停止 Streamlit 进程
5 5 # =============================================================================
6 6 set -euo pipefail
7 7  
8 8 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9 9 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
10   -STREAMLIT_PORT="${STREAMLIT_PORT:-8501}"
  10 +STREAMLIT_PORT="${STREAMLIT_PORT:-6008}"
11 11  
12 12 echo "=========================================="
13 13 echo "OmniShopAgent 停止"
14 14 echo "=========================================="
15 15  
16 16 # 1. 停止 Streamlit 进程
17   -echo "[1/2] 停止 Streamlit..."
  17 +echo "[1/1] 停止 Streamlit..."
18 18 if pgrep -f "streamlit run app.py" >/dev/null 2>&1; then
19 19 pkill -f "streamlit run app.py" 2>/dev/null || true
20 20 echo " Streamlit 已停止"
... ... @@ -31,16 +31,6 @@ if command -v lsof &>/dev/null; then
31 31 fi
32 32 fi
33 33  
34   -# 2. 可选:停止 Milvus 容器
35   -echo "[2/2] 停止 Milvus..."
36   -if command -v docker &>/dev/null; then
37   - cd "$PROJECT_ROOT"
38   - docker compose down 2>/dev/null || docker-compose down 2>/dev/null || true
39   - echo " Milvus 已停止"
40   -else
41   - echo " Docker 未安装,跳过"
42   -fi
43   -
44 34 echo "=========================================="
45 35 echo "OmniShopAgent 已停止"
46 36 echo "=========================================="
... ...