Commit 8810a6fa9c3779e2fa48766c049a8296618d496f
1 parent
e7f2b240
重构
Showing
23 changed files
with
1834 additions
and
2062 deletions
Show diff stats
.env.example
| 1 | 1 | # ==================== |
| 2 | 2 | # OpenAI Configuration |
| 3 | 3 | # ==================== |
| 4 | -OPENAI_API_KEY= | |
| 5 | -OPENAI_MODEL=gpt-4o-mini | |
| 6 | -OPENAI_EMBEDDING_MODEL=text-embedding-3-small | |
| 4 | +OPENAI_API_KEY=your-api-key-here | |
| 5 | +OPENAI_MODEL=qwen-plus | |
| 6 | +# Base URL for Qwen/DashScope (OpenAI-compatible API) | |
| 7 | +# 北京: https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 8 | +# 弗吉尼亚: https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 9 | +# 新加坡: https://dashscope-intl.aliyuncs.com/compatible-mode/v1 | |
| 10 | +OPENAI_API_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 7 | 11 | OPENAI_TEMPERATURE=1 |
| 8 | 12 | OPENAI_MAX_TOKENS=1000 |
| 9 | 13 | |
| 10 | 14 | # ==================== |
| 11 | -# CLIP Server Configuration | |
| 12 | -# ==================== | |
| 13 | -CLIP_SERVER_URL=grpc://localhost:51000 | |
| 14 | - | |
| 15 | -# ==================== | |
| 16 | -# Milvus Configuration | |
| 17 | -# ==================== | |
| 18 | -MILVUS_HOST=localhost | |
| 19 | -MILVUS_PORT=19530 | |
| 20 | - | |
| 21 | -# Collection settings | |
| 22 | -TEXT_COLLECTION_NAME=text_embeddings | |
| 23 | -IMAGE_COLLECTION_NAME=image_embeddings | |
| 24 | -TEXT_DIM=1536 | |
| 25 | -IMAGE_DIM=512 | |
| 26 | - | |
| 27 | -# ==================== | |
| 28 | 15 | # Search Configuration |
| 29 | 16 | # ==================== |
| 30 | 17 | TOP_K_RESULTS=30 |
| 31 | 18 | SIMILARITY_THRESHOLD=0.6 |
| 32 | 19 | |
| 20 | +# Search API (see docs/搜索API对接指南.md) | |
| 21 | +SEARCH_API_BASE_URL=http://120.76.41.98:6002 | |
| 22 | +SEARCH_API_TENANT_ID=162 | |
| 23 | + | |
| 33 | 24 | # ==================== |
| 34 | 25 | # Application Configuration |
| 35 | 26 | # ==================== | ... | ... |
.gitignore
README.md
| ... | ... | @@ -12,9 +12,9 @@ OmniShopAgent autonomously decides which tools to call, maintains conversation s |
| 12 | 12 | |
| 13 | 13 | **Key Features:** |
| 14 | 14 | - Autonomous tool selection and execution |
| 15 | -- Multi-modal search (text + image) | |
| 15 | +- Text search via Search API | |
| 16 | 16 | - Conversational context awareness |
| 17 | -- Real-time visual analysis | |
| 17 | +- Real-time visual analysis (style extraction from images) | |
| 18 | 18 | |
| 19 | 19 | ## Tech Stack |
| 20 | 20 | |
| ... | ... | @@ -22,9 +22,7 @@ OmniShopAgent autonomously decides which tools to call, maintains conversation s |
| 22 | 22 | |-----------|-----------| |
| 23 | 23 | | **Agent Framework** | LangGraph | |
| 24 | 24 | | **LLM** | any LLM supported by LangChain | |
| 25 | -| **Text Embedding** | text-embedding-3-small | | |
| 26 | -| **Image Embedding** | CLIP ViT-B/32 | | |
| 27 | -| **Vector Database** | Milvus | | |
| 25 | +| **Search** | Search API (HTTP) | | |
| 28 | 26 | | **Frontend** | Streamlit | |
| 29 | 27 | | **Dataset** | Kaggle Fashion Products | |
| 30 | 28 | |
| ... | ... | @@ -52,8 +50,7 @@ graph LR |
| 52 | 50 | ``` |
| 53 | 51 | |
| 54 | 52 | **Available Tools:** |
| 55 | -- `search_products(query)` - Text-based semantic search | |
| 56 | -- `search_by_image(image_path)` - Visual similarity search | |
| 53 | +- `search_products(query)` - Text-based product search via Search API | |
| 57 | 54 | - `analyze_image_style(image_path)` - VLM style analysis |
| 58 | 55 | |
| 59 | 56 | |
| ... | ... | @@ -66,12 +63,6 @@ User: "winter coats for women" |
| 66 | 63 | Agent: search_products("winter coats women") → Returns 5 products |
| 67 | 64 | ``` |
| 68 | 65 | |
| 69 | -**Image Upload:** | |
| 70 | -``` | |
| 71 | -User: [uploads sneaker photo] "find similar" | |
| 72 | -Agent: search_by_image(path) → Returns visually similar shoes | |
| 73 | -``` | |
| 74 | - | |
| 75 | 66 | **Style Analysis + Search:** |
| 76 | 67 | ``` |
| 77 | 68 | User: [uploads vintage jacket] "what style is this? find matching pants" |
| ... | ... | @@ -93,6 +84,8 @@ Agent: [remembers context] → search_products("red formal dresses") → Results |
| 93 | 84 | User: [uploads office outfit] "I like the shirt but need something more casual" |
| 94 | 85 | Agent: analyze_image_style(path) → Extracts shirt details |
| 95 | 86 | search_products("casual shirt [color] [style]") → Returns casual alternatives |
| 96 | 87 | ``` |
| 88 | + | |
| 89 | +**Note:** For image uploads "find similar", use analyze_image_style first to extract attributes, then search_products with the description. | |
| 97 | 90 | |
| 98 | 91 | ## Installation |
| ... | ... | @@ -100,7 +93,6 @@ Agent: analyze_image_style(path) → Extracts shirt details |
| 100 | 93 | **Prerequisites:** |
| 101 | 94 | - Python 3.12+ (LangChain 1.x 要求 Python 3.10+) |
| 102 | 95 | - OpenAI API Key |
| 103 | -- Docker & Docker Compose | |
| 104 | 96 | |
| 105 | 97 | ### 1. Setup Environment |
| 106 | 98 | ```bash |
| ... | ... | @@ -116,38 +108,14 @@ cp .env.example .env |
| 116 | 108 | # Edit .env and add your OPENAI_API_KEY |
| 117 | 109 | ``` |
| 118 | 110 | |
| 119 | -### 2. Download Dataset | |
| 120 | -Download the [Fashion Product Images Dataset](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset) from Kaggle and extract to `./data/`: | |
| 121 | - | |
| 122 | -```python | |
| 123 | -python scripts/download_dataset.py | |
| 124 | -``` | |
| 125 | - | |
| 126 | -Expected structure: | |
| 127 | -``` | |
| 128 | -data/ | |
| 129 | -├── images/ # ~44k product images | |
| 130 | -├── styles.csv # Product metadata | |
| 131 | -└── images.csv # Image filenames | |
| 132 | -``` | |
| 133 | - | |
| 134 | -### 3. Start Services | |
| 135 | - | |
| 136 | -```bash | |
| 137 | -docker-compose up | |
| 138 | -python -m clip_server | |
| 139 | -``` | |
| 140 | - | |
| 141 | - | |
| 142 | -### 4. Index Data | |
| 111 | +### 2. (Optional) Download Dataset | |
| 112 | +For image style analysis, you may download the [Fashion Product Images Dataset](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset) from Kaggle: | |
| 143 | 113 | |
| 144 | 114 | ```bash |
| 145 | -python scripts/index_data.py | |
| 115 | +python scripts/download_dataset.py | |
| 146 | 116 | ``` |
| 147 | 117 | |
| 148 | -This generates and stores text/image embeddings for all 44k products in Milvus. | |
| 149 | - | |
| 150 | -### 5. Launch Application | |
| 118 | +### 3. Launch Application | |
| 151 | 119 | ```bash |
| 152 | 120 | # 使用启动脚本(推荐) |
| 153 | 121 | ./scripts/start.sh |
| ... | ... | @@ -155,6 +123,9 @@ This generates and stores text/image embeddings for all 44k products in Milvus. |
| 155 | 123 | # 或直接运行 |
| 156 | 124 | streamlit run app.py |
| 157 | 125 | ``` |
| 126 | + | |
| 127 | +Product search uses the external Search API. Configure `SEARCH_API_BASE_URL` and `SEARCH_API_TENANT_ID` in `.env` if needed. | |
| 128 | + | |
| 158 | 129 | Opens at `http://localhost:8501` |
| 159 | 130 | |
| 160 | 131 | ### CentOS 8 部署 | ... | ... |
app/agents/shopping_agent.py
| ... | ... | @@ -52,11 +52,14 @@ class ShoppingAgent: |
| 52 | 52 | self.session_id = session_id or "default" |
| 53 | 53 | |
| 54 | 54 | # Initialize LLM |
| 55 | - self.llm = ChatOpenAI( | |
| 55 | + llm_kwargs = dict( | |
| 56 | 56 | model=settings.openai_model, |
| 57 | 57 | temperature=settings.openai_temperature, |
| 58 | 58 | api_key=settings.openai_api_key, |
| 59 | 59 | ) |
| 60 | + if settings.openai_api_base_url: | |
| 61 | + llm_kwargs["base_url"] = settings.openai_api_base_url | |
| 62 | + self.llm = ChatOpenAI(**llm_kwargs) | |
| 60 | 63 | |
| 61 | 64 | # Get tools and bind to model |
| 62 | 65 | self.tools = get_all_tools() |
| ... | ... | @@ -73,12 +76,11 @@ class ShoppingAgent: |
| 73 | 76 | # System prompt for the agent |
| 74 | 77 | system_prompt = """You are an intelligent fashion shopping assistant. You can: |
| 75 | 78 | 1. Search for products by text description (use search_products) |
| 76 | -2. Find visually similar products from images (use search_by_image) | |
| 77 | -3. Analyze image style and attributes (use analyze_image_style) | |
| 79 | +2. Analyze image style and attributes (use analyze_image_style) | |
| 78 | 80 | |
| 79 | 81 | When a user asks about products: |
| 80 | 82 | - For text queries: use search_products directly |
| 81 | -- For image uploads: decide if you need to analyze_image_style first, then search | |
| 83 | +- For image uploads: use analyze_image_style first to understand the product, then use search_products with the extracted description | |
| 82 | 84 | - You can call multiple tools in sequence if needed |
| 83 | 85 | - Always provide helpful, friendly responses |
| 84 | 86 | ... | ... |
app/config.py
| ... | ... | @@ -4,6 +4,7 @@ Loads environment variables and provides configuration objects |
| 4 | 4 | """ |
| 5 | 5 | |
| 6 | 6 | import os |
| 7 | +from typing import Optional | |
| 7 | 8 | |
| 8 | 9 | from pydantic_settings import BaseSettings |
| 9 | 10 | |
| ... | ... | @@ -17,47 +18,20 @@ class Settings(BaseSettings): |
| 17 | 18 | # OpenAI Configuration |
| 18 | 19 | openai_api_key: str |
| 19 | 20 | openai_model: str = "gpt-4o-mini" |
| 20 | - openai_embedding_model: str = "text-embedding-3-small" | |
| 21 | 21 | openai_temperature: float = 0.7 |
| 22 | 22 | openai_max_tokens: int = 1000 |
| 23 | - | |
| 24 | - # CLIP Server Configuration | |
| 25 | - clip_server_url: str = "grpc://localhost:51000" | |
| 26 | - | |
| 27 | - # Milvus Configuration | |
| 28 | - milvus_uri: str = "http://localhost:19530" | |
| 29 | - milvus_host: str = "localhost" | |
| 30 | - milvus_port: int = 19530 | |
| 31 | - text_collection_name: str = "text_embeddings" | |
| 32 | - image_collection_name: str = "image_embeddings" | |
| 33 | - text_dim: int = 1536 | |
| 34 | - image_dim: int = 512 | |
| 35 | - | |
| 36 | - @property | |
| 37 | - def milvus_uri_absolute(self) -> str: | |
| 38 | - """Get absolute path for Milvus URI | |
| 39 | - | |
| 40 | - Returns: | |
| 41 | - - For http/https URIs: returns as-is (Milvus Standalone) | |
| 42 | - - For file paths starting with ./: converts to absolute path (Milvus Lite) | |
| 43 | - - For other paths: returns as-is | |
| 44 | - """ | |
| 45 | - import os | |
| 46 | - | |
| 47 | - # If it's a network URI, return as-is (Milvus Standalone) | |
| 48 | - if self.milvus_uri.startswith(("http://", "https://")): | |
| 49 | - return self.milvus_uri | |
| 50 | - # If it's a relative path, convert to absolute (Milvus Lite) | |
| 51 | - if self.milvus_uri.startswith("./"): | |
| 52 | - base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| 53 | - return os.path.join(base_dir, self.milvus_uri[2:]) | |
| 54 | - # Otherwise return as-is | |
| 55 | - return self.milvus_uri | |
| 23 | + # Base URL for OpenAI-compatible APIs (e.g. Qwen/DashScope) | |
| 24 | + # Qwen 北京: https://dashscope.aliyuncs.com/compatible-mode/v1 | |
| 25 | + openai_api_base_url: Optional[str] = None | |
| 56 | 26 | |
| 57 | 27 | # Search Configuration |
| 58 | 28 | top_k_results: int = 10 |
| 59 | 29 | similarity_threshold: float = 0.6 |
| 60 | 30 | |
| 31 | + # Search API (see docs/搜索API对接指南.md) | |
| 32 | + search_api_base_url: str = "http://120.76.41.98:6002" | |
| 33 | + search_api_tenant_id: str = "162" | |
| 34 | + | |
| 61 | 35 | # Application Configuration |
| 62 | 36 | app_host: str = "0.0.0.0" |
| 63 | 37 | app_port: int = 8000 |
| ... | ... | @@ -73,6 +47,7 @@ class Settings(BaseSettings): |
| 73 | 47 | env_file = ".env" |
| 74 | 48 | env_file_encoding = "utf-8" |
| 75 | 49 | case_sensitive = False |
| 50 | + extra = "ignore" | |
| 76 | 51 | |
| 77 | 52 | |
| 78 | 53 | # Global settings instance | ... | ... |
app/services/__init__.py
| 1 | 1 | """ |
| 2 | 2 | Services Module |
| 3 | -Provides database and embedding services for the application | |
| 4 | 3 | """ |
| 5 | - | |
| 6 | -from app.services.embedding_service import EmbeddingService, get_embedding_service | |
| 7 | -from app.services.milvus_service import MilvusService, get_milvus_service | |
| 8 | - | |
| 9 | -__all__ = [ | |
| 10 | - "EmbeddingService", | |
| 11 | - "get_embedding_service", | |
| 12 | - "MilvusService", | |
| 13 | - "get_milvus_service", | |
| 14 | -] | ... | ... |
app/services/embedding_service.py deleted
| ... | ... | @@ -1,293 +0,0 @@ |
| 1 | -""" | |
| 2 | -Embedding Service for Text and Image Embeddings | |
| 3 | -Supports OpenAI text embeddings and CLIP image embeddings | |
| 4 | -""" | |
| 5 | - | |
| 6 | -import logging | |
| 7 | -from pathlib import Path | |
| 8 | -from typing import List, Optional, Union | |
| 9 | - | |
| 10 | -import numpy as np | |
| 11 | -from clip_client import Client as ClipClient | |
| 12 | -from openai import OpenAI | |
| 13 | - | |
| 14 | -from app.config import settings | |
| 15 | - | |
| 16 | -logger = logging.getLogger(__name__) | |
| 17 | - | |
| 18 | - | |
| 19 | -class EmbeddingService: | |
| 20 | - """Service for generating text and image embeddings""" | |
| 21 | - | |
| 22 | - def __init__( | |
| 23 | - self, | |
| 24 | - openai_api_key: Optional[str] = None, | |
| 25 | - clip_server_url: Optional[str] = None, | |
| 26 | - ): | |
| 27 | - """Initialize embedding service | |
| 28 | - | |
| 29 | - Args: | |
| 30 | - openai_api_key: OpenAI API key. If None, uses settings.openai_api_key | |
| 31 | - clip_server_url: CLIP server URL. If None, uses settings.clip_server_url | |
| 32 | - """ | |
| 33 | - # Initialize OpenAI client for text embeddings | |
| 34 | - self.openai_api_key = openai_api_key or settings.openai_api_key | |
| 35 | - self.openai_client = OpenAI(api_key=self.openai_api_key) | |
| 36 | - self.text_embedding_model = settings.openai_embedding_model | |
| 37 | - | |
| 38 | - # Initialize CLIP client for image embeddings | |
| 39 | - self.clip_server_url = clip_server_url or settings.clip_server_url | |
| 40 | - self.clip_client: Optional[ClipClient] = None | |
| 41 | - | |
| 42 | - logger.info("Embedding service initialized") | |
| 43 | - | |
| 44 | - def connect_clip(self) -> None: | |
| 45 | - """Connect to CLIP server""" | |
| 46 | - try: | |
| 47 | - self.clip_client = ClipClient(server=self.clip_server_url) | |
| 48 | - logger.info(f"Connected to CLIP server at {self.clip_server_url}") | |
| 49 | - except Exception as e: | |
| 50 | - logger.error(f"Failed to connect to CLIP server: {e}") | |
| 51 | - raise | |
| 52 | - | |
| 53 | - def disconnect_clip(self) -> None: | |
| 54 | - """Disconnect from CLIP server""" | |
| 55 | - if self.clip_client: | |
| 56 | - # Note: clip_client doesn't have explicit close method | |
| 57 | - self.clip_client = None | |
| 58 | - logger.info("Disconnected from CLIP server") | |
| 59 | - | |
| 60 | - def get_text_embedding(self, text: str) -> List[float]: | |
| 61 | - """Get embedding for a single text | |
| 62 | - | |
| 63 | - Args: | |
| 64 | - text: Input text | |
| 65 | - | |
| 66 | - Returns: | |
| 67 | - Embedding vector as list of floats | |
| 68 | - """ | |
| 69 | - try: | |
| 70 | - response = self.openai_client.embeddings.create( | |
| 71 | - input=text, model=self.text_embedding_model | |
| 72 | - ) | |
| 73 | - embedding = response.data[0].embedding | |
| 74 | - logger.debug(f"Generated text embedding for: {text[:50]}...") | |
| 75 | - return embedding | |
| 76 | - except Exception as e: | |
| 77 | - logger.error(f"Failed to generate text embedding: {e}") | |
| 78 | - raise | |
| 79 | - | |
| 80 | - def get_text_embeddings_batch( | |
| 81 | - self, texts: List[str], batch_size: int = 100 | |
| 82 | - ) -> List[List[float]]: | |
| 83 | - """Get embeddings for multiple texts in batches | |
| 84 | - | |
| 85 | - Args: | |
| 86 | - texts: List of input texts | |
| 87 | - batch_size: Number of texts to process at once | |
| 88 | - | |
| 89 | - Returns: | |
| 90 | - List of embedding vectors | |
| 91 | - """ | |
| 92 | - all_embeddings = [] | |
| 93 | - | |
| 94 | - for i in range(0, len(texts), batch_size): | |
| 95 | - batch = texts[i : i + batch_size] | |
| 96 | - | |
| 97 | - try: | |
| 98 | - response = self.openai_client.embeddings.create( | |
| 99 | - input=batch, model=self.text_embedding_model | |
| 100 | - ) | |
| 101 | - | |
| 102 | - # Extract embeddings in the correct order | |
| 103 | - embeddings = [item.embedding for item in response.data] | |
| 104 | - all_embeddings.extend(embeddings) | |
| 105 | - | |
| 106 | - logger.info( | |
| 107 | - f"Generated text embeddings for batch {i // batch_size + 1}: {len(embeddings)} embeddings" | |
| 108 | - ) | |
| 109 | - | |
| 110 | - except Exception as e: | |
| 111 | - logger.error( | |
| 112 | - f"Failed to generate text embeddings for batch {i // batch_size + 1}: {e}" | |
| 113 | - ) | |
| 114 | - raise | |
| 115 | - | |
| 116 | - return all_embeddings | |
| 117 | - | |
| 118 | - def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: | |
| 119 | - """Get CLIP embedding for a single image | |
| 120 | - | |
| 121 | - Args: | |
| 122 | - image_path: Path to image file | |
| 123 | - | |
| 124 | - Returns: | |
| 125 | - Embedding vector as list of floats | |
| 126 | - """ | |
| 127 | - if not self.clip_client: | |
| 128 | - raise RuntimeError("CLIP client not connected. Call connect_clip() first.") | |
| 129 | - | |
| 130 | - image_path = Path(image_path) | |
| 131 | - if not image_path.exists(): | |
| 132 | - raise FileNotFoundError(f"Image not found: {image_path}") | |
| 133 | - | |
| 134 | - try: | |
| 135 | - # Get embedding from CLIP server using image path (as string) | |
| 136 | - result = self.clip_client.encode([str(image_path)]) | |
| 137 | - | |
| 138 | - # Extract embedding - result is numpy array | |
| 139 | - import numpy as np | |
| 140 | - | |
| 141 | - if isinstance(result, np.ndarray): | |
| 142 | - # If result is numpy array, use first element | |
| 143 | - embedding = ( | |
| 144 | - result[0].tolist() if len(result.shape) > 1 else result.tolist() | |
| 145 | - ) | |
| 146 | - else: | |
| 147 | - # If result is DocumentArray | |
| 148 | - embedding = result[0].embedding.tolist() | |
| 149 | - | |
| 150 | - logger.debug(f"Generated image embedding for: {image_path.name}") | |
| 151 | - return embedding | |
| 152 | - | |
| 153 | - except Exception as e: | |
| 154 | - logger.error(f"Failed to generate image embedding for {image_path}: {e}") | |
| 155 | - raise | |
| 156 | - | |
| 157 | - def get_image_embeddings_batch( | |
| 158 | - self, image_paths: List[Union[str, Path]], batch_size: int = 32 | |
| 159 | - ) -> List[Optional[List[float]]]: | |
| 160 | - """Get CLIP embeddings for multiple images in batches | |
| 161 | - | |
| 162 | - Args: | |
| 163 | - image_paths: List of paths to image files | |
| 164 | - batch_size: Number of images to process at once | |
| 165 | - | |
| 166 | - Returns: | |
| 167 | - List of embedding vectors (None for failed images) | |
| 168 | - """ | |
| 169 | - if not self.clip_client: | |
| 170 | - raise RuntimeError("CLIP client not connected. Call connect_clip() first.") | |
| 171 | - | |
| 172 | - all_embeddings = [] | |
| 173 | - | |
| 174 | - for i in range(0, len(image_paths), batch_size): | |
| 175 | - batch_paths = image_paths[i : i + batch_size] | |
| 176 | - valid_paths = [] | |
| 177 | - valid_indices = [] | |
| 178 | - | |
| 179 | - # Check which images exist | |
| 180 | - for idx, path in enumerate(batch_paths): | |
| 181 | - path = Path(path) | |
| 182 | - if path.exists(): | |
| 183 | - valid_paths.append(str(path)) | |
| 184 | - valid_indices.append(idx) | |
| 185 | - else: | |
| 186 | - logger.warning(f"Image not found: {path}") | |
| 187 | - | |
| 188 | - # Get embeddings for valid images | |
| 189 | - if valid_paths: | |
| 190 | - try: | |
| 191 | - # Send paths as strings to CLIP server | |
| 192 | - result = self.clip_client.encode(valid_paths) | |
| 193 | - | |
| 194 | - # Create embeddings list with None for missing images | |
| 195 | - batch_embeddings = [None] * len(batch_paths) | |
| 196 | - | |
| 197 | - # Handle result format - could be numpy array or DocumentArray | |
| 198 | - import numpy as np | |
| 199 | - | |
| 200 | - if isinstance(result, np.ndarray): | |
| 201 | - # Result is numpy array - shape (n_images, embedding_dim) | |
| 202 | - for idx in range(len(result)): | |
| 203 | - original_idx = valid_indices[idx] | |
| 204 | - batch_embeddings[original_idx] = result[idx].tolist() | |
| 205 | - else: | |
| 206 | - # Result is DocumentArray | |
| 207 | - for idx, doc in enumerate(result): | |
| 208 | - original_idx = valid_indices[idx] | |
| 209 | - batch_embeddings[original_idx] = doc.embedding.tolist() | |
| 210 | - | |
| 211 | - all_embeddings.extend(batch_embeddings) | |
| 212 | - | |
| 213 | - logger.info( | |
| 214 | - f"Generated image embeddings for batch {i // batch_size + 1}: " | |
| 215 | - f"{len(valid_paths)}/{len(batch_paths)} successful" | |
| 216 | - ) | |
| 217 | - | |
| 218 | - except Exception as e: | |
| 219 | - logger.error( | |
| 220 | - f"Failed to generate image embeddings for batch {i // batch_size + 1}: {e}" | |
| 221 | - ) | |
| 222 | - # Add None for all images in failed batch | |
| 223 | - all_embeddings.extend([None] * len(batch_paths)) | |
| 224 | - else: | |
| 225 | - # All images in batch failed to load | |
| 226 | - all_embeddings.extend([None] * len(batch_paths)) | |
| 227 | - | |
| 228 | - return all_embeddings | |
| 229 | - | |
| 230 | - def get_text_embedding_from_image( | |
| 231 | - self, image_path: Union[str, Path] | |
| 232 | - ) -> List[float]: | |
| 233 | - """Get text-based embedding by describing the image | |
| 234 | - This is useful for cross-modal search | |
| 235 | - | |
| 236 | - Note: This is a placeholder for future implementation | |
| 237 | - that could use vision models to generate text descriptions | |
| 238 | - | |
| 239 | - Args: | |
| 240 | - image_path: Path to image file | |
| 241 | - | |
| 242 | - Returns: | |
| 243 | - Text embedding vector | |
| 244 | - """ | |
| 245 | - # For now, we just return the image embedding | |
| 246 | - # In the future, this could use a vision-language model to generate | |
| 247 | - # a text description and then embed that | |
| 248 | - raise NotImplementedError("Text embedding from image not yet implemented") | |
| 249 | - | |
| 250 | - def cosine_similarity( | |
| 251 | - self, embedding1: List[float], embedding2: List[float] | |
| 252 | - ) -> float: | |
| 253 | - """Calculate cosine similarity between two embeddings | |
| 254 | - | |
| 255 | - Args: | |
| 256 | - embedding1: First embedding vector | |
| 257 | - embedding2: Second embedding vector | |
| 258 | - | |
| 259 | - Returns: | |
| 260 | - Cosine similarity score (0-1) | |
| 261 | - """ | |
| 262 | - vec1 = np.array(embedding1) | |
| 263 | - vec2 = np.array(embedding2) | |
| 264 | - | |
| 265 | - # Normalize vectors | |
| 266 | - vec1_norm = vec1 / np.linalg.norm(vec1) | |
| 267 | - vec2_norm = vec2 / np.linalg.norm(vec2) | |
| 268 | - | |
| 269 | - # Calculate cosine similarity | |
| 270 | - similarity = np.dot(vec1_norm, vec2_norm) | |
| 271 | - | |
| 272 | - return float(similarity) | |
| 273 | - | |
| 274 | - def get_embedding_dimensions(self) -> dict: | |
| 275 | - """Get the dimensions of text and image embeddings | |
| 276 | - | |
| 277 | - Returns: | |
| 278 | - Dictionary with text_dim and image_dim | |
| 279 | - """ | |
| 280 | - return {"text_dim": settings.text_dim, "image_dim": settings.image_dim} | |
| 281 | - | |
| 282 | - | |
| 283 | -# Global instance | |
| 284 | -_embedding_service: Optional[EmbeddingService] = None | |
| 285 | - | |
| 286 | - | |
| 287 | -def get_embedding_service() -> EmbeddingService: | |
| 288 | - """Get or create the global embedding service instance""" | |
| 289 | - global _embedding_service | |
| 290 | - if _embedding_service is None: | |
| 291 | - _embedding_service = EmbeddingService() | |
| 292 | - _embedding_service.connect_clip() | |
| 293 | - return _embedding_service |
app/services/milvus_service.py deleted
| ... | ... | @@ -1,480 +0,0 @@ |
| 1 | -""" | |
| 2 | -Milvus Service for Vector Storage and Similarity Search | |
| 3 | -Manages text and image embeddings in separate collections | |
| 4 | -""" | |
| 5 | - | |
| 6 | -import logging | |
| 7 | -from typing import Any, Dict, List, Optional | |
| 8 | - | |
| 9 | -from pymilvus import ( | |
| 10 | - DataType, | |
| 11 | - MilvusClient, | |
| 12 | -) | |
| 13 | - | |
| 14 | -from app.config import settings | |
| 15 | - | |
| 16 | -logger = logging.getLogger(__name__) | |
| 17 | - | |
| 18 | - | |
| 19 | -class MilvusService: | |
| 20 | - """Service for managing vector embeddings in Milvus""" | |
| 21 | - | |
| 22 | - def __init__(self, uri: Optional[str] = None): | |
| 23 | - """Initialize Milvus service | |
| 24 | - | |
| 25 | - Args: | |
| 26 | - uri: Milvus connection URI. If None, uses settings.milvus_uri | |
| 27 | - """ | |
| 28 | - if uri: | |
| 29 | - self.uri = uri | |
| 30 | - else: | |
| 31 | - # Use absolute path for Milvus Lite | |
| 32 | - self.uri = settings.milvus_uri_absolute | |
| 33 | - self.text_collection_name = settings.text_collection_name | |
| 34 | - self.image_collection_name = settings.image_collection_name | |
| 35 | - self.text_dim = settings.text_dim | |
| 36 | - self.image_dim = settings.image_dim | |
| 37 | - | |
| 38 | - # Use MilvusClient for simplified operations | |
| 39 | - self._client: Optional[MilvusClient] = None | |
| 40 | - | |
| 41 | - logger.info(f"Initializing Milvus service with URI: {self.uri}") | |
| 42 | - | |
| 43 | - def is_connected(self) -> bool: | |
| 44 | - """Check if connected to Milvus""" | |
| 45 | - return self._client is not None | |
| 46 | - | |
| 47 | - def connect(self) -> None: | |
| 48 | - """Connect to Milvus""" | |
| 49 | - if self.is_connected(): | |
| 50 | - return | |
| 51 | - try: | |
| 52 | - self._client = MilvusClient(uri=self.uri) | |
| 53 | - logger.info(f"Connected to Milvus at {self.uri}") | |
| 54 | - except Exception as e: | |
| 55 | - logger.error(f"Failed to connect to Milvus: {e}") | |
| 56 | - raise | |
| 57 | - | |
| 58 | - def disconnect(self) -> None: | |
| 59 | - """Disconnect from Milvus""" | |
| 60 | - if self._client: | |
| 61 | - self._client.close() | |
| 62 | - self._client = None | |
| 63 | - logger.info("Disconnected from Milvus") | |
| 64 | - | |
| 65 | - @property | |
| 66 | - def client(self) -> MilvusClient: | |
| 67 | - """Get the Milvus client""" | |
| 68 | - if not self._client: | |
| 69 | - raise RuntimeError("Milvus not connected. Call connect() first.") | |
| 70 | - return self._client | |
| 71 | - | |
| 72 | - def create_text_collection(self, recreate: bool = False) -> None: | |
| 73 | - """Create collection for text embeddings with product metadata | |
| 74 | - | |
| 75 | - Args: | |
| 76 | - recreate: If True, drop existing collection and recreate | |
| 77 | - """ | |
| 78 | - if recreate and self.client.has_collection(self.text_collection_name): | |
| 79 | - self.client.drop_collection(self.text_collection_name) | |
| 80 | - logger.info(f"Dropped existing collection: {self.text_collection_name}") | |
| 81 | - | |
| 82 | - if self.client.has_collection(self.text_collection_name): | |
| 83 | - logger.info(f"Text collection already exists: {self.text_collection_name}") | |
| 84 | - return | |
| 85 | - | |
| 86 | - # Create collection with schema (includes metadata fields) | |
| 87 | - schema = MilvusClient.create_schema( | |
| 88 | - auto_id=False, | |
| 89 | - enable_dynamic_field=True, # Allow additional metadata fields | |
| 90 | - ) | |
| 91 | - | |
| 92 | - # Core fields | |
| 93 | - schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True) | |
| 94 | - schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=2000) | |
| 95 | - schema.add_field( | |
| 96 | - field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.text_dim | |
| 97 | - ) | |
| 98 | - | |
| 99 | - # Product metadata fields | |
| 100 | - schema.add_field( | |
| 101 | - field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500 | |
| 102 | - ) | |
| 103 | - schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50) | |
| 104 | - schema.add_field( | |
| 105 | - field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100 | |
| 106 | - ) | |
| 107 | - schema.add_field( | |
| 108 | - field_name="subCategory", datatype=DataType.VARCHAR, max_length=100 | |
| 109 | - ) | |
| 110 | - schema.add_field( | |
| 111 | - field_name="articleType", datatype=DataType.VARCHAR, max_length=100 | |
| 112 | - ) | |
| 113 | - schema.add_field( | |
| 114 | - field_name="baseColour", datatype=DataType.VARCHAR, max_length=50 | |
| 115 | - ) | |
| 116 | - schema.add_field(field_name="season", datatype=DataType.VARCHAR, max_length=50) | |
| 117 | - schema.add_field(field_name="usage", datatype=DataType.VARCHAR, max_length=50) | |
| 118 | - | |
| 119 | - # Create index parameters | |
| 120 | - index_params = self.client.prepare_index_params() | |
| 121 | - index_params.add_index( | |
| 122 | - field_name="embedding", | |
| 123 | - index_type="AUTOINDEX", | |
| 124 | - metric_type="COSINE", | |
| 125 | - ) | |
| 126 | - | |
| 127 | - # Create collection | |
| 128 | - self.client.create_collection( | |
| 129 | - collection_name=self.text_collection_name, | |
| 130 | - schema=schema, | |
| 131 | - index_params=index_params, | |
| 132 | - ) | |
| 133 | - | |
| 134 | - logger.info( | |
| 135 | - f"Created text collection with metadata: {self.text_collection_name}" | |
| 136 | - ) | |
| 137 | - | |
| 138 | - def create_image_collection(self, recreate: bool = False) -> None: | |
| 139 | - """Create collection for image embeddings with product metadata | |
| 140 | - | |
| 141 | - Args: | |
| 142 | - recreate: If True, drop existing collection and recreate | |
| 143 | - """ | |
| 144 | - if recreate and self.client.has_collection(self.image_collection_name): | |
| 145 | - self.client.drop_collection(self.image_collection_name) | |
| 146 | - logger.info(f"Dropped existing collection: {self.image_collection_name}") | |
| 147 | - | |
| 148 | - if self.client.has_collection(self.image_collection_name): | |
| 149 | - logger.info( | |
| 150 | - f"Image collection already exists: {self.image_collection_name}" | |
| 151 | - ) | |
| 152 | - return | |
| 153 | - | |
| 154 | - # Create collection with schema (includes metadata fields) | |
| 155 | - schema = MilvusClient.create_schema( | |
| 156 | - auto_id=False, | |
| 157 | - enable_dynamic_field=True, # Allow additional metadata fields | |
| 158 | - ) | |
| 159 | - | |
| 160 | - # Core fields | |
| 161 | - schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True) | |
| 162 | - schema.add_field( | |
| 163 | - field_name="image_path", datatype=DataType.VARCHAR, max_length=500 | |
| 164 | - ) | |
| 165 | - schema.add_field( | |
| 166 | - field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.image_dim | |
| 167 | - ) | |
| 168 | - | |
| 169 | - # Product metadata fields | |
| 170 | - schema.add_field( | |
| 171 | - field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500 | |
| 172 | - ) | |
| 173 | - schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50) | |
| 174 | - schema.add_field( | |
| 175 | - field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100 | |
| 176 | - ) | |
| 177 | - schema.add_field( | |
| 178 | - field_name="subCategory", datatype=DataType.VARCHAR, max_length=100 | |
| 179 | - ) | |
| 180 | - schema.add_field( | |
| 181 | - field_name="articleType", datatype=DataType.VARCHAR, max_length=100 | |
| 182 | - ) | |
| 183 | - schema.add_field( | |
| 184 | - field_name="baseColour", datatype=DataType.VARCHAR, max_length=50 | |
| 185 | - ) | |
| 186 | - schema.add_field(field_name="season", datatype=DataType.VARCHAR, max_length=50) | |
| 187 | - schema.add_field(field_name="usage", datatype=DataType.VARCHAR, max_length=50) | |
| 188 | - | |
| 189 | - # Create index parameters | |
| 190 | - index_params = self.client.prepare_index_params() | |
| 191 | - index_params.add_index( | |
| 192 | - field_name="embedding", | |
| 193 | - index_type="AUTOINDEX", | |
| 194 | - metric_type="COSINE", | |
| 195 | - ) | |
| 196 | - | |
| 197 | - # Create collection | |
| 198 | - self.client.create_collection( | |
| 199 | - collection_name=self.image_collection_name, | |
| 200 | - schema=schema, | |
| 201 | - index_params=index_params, | |
| 202 | - ) | |
| 203 | - | |
| 204 | - logger.info( | |
| 205 | - f"Created image collection with metadata: {self.image_collection_name}" | |
| 206 | - ) | |
| 207 | - | |
| 208 | - def insert_text_embeddings( | |
| 209 | - self, | |
| 210 | - embeddings: List[Dict[str, Any]], | |
| 211 | - ) -> int: | |
| 212 | - """Insert text embeddings with metadata into collection | |
| 213 | - | |
| 214 | - Args: | |
| 215 | - embeddings: List of dictionaries with keys: | |
| 216 | - - id: unique ID (product ID) | |
| 217 | - - text: the text that was embedded | |
| 218 | - - embedding: the embedding vector | |
| 219 | - - productDisplayName, gender, masterCategory, etc. (metadata) | |
| 220 | - | |
| 221 | - Returns: | |
| 222 | - Number of inserted embeddings | |
| 223 | - """ | |
| 224 | - if not embeddings: | |
| 225 | - return 0 | |
| 226 | - | |
| 227 | - try: | |
| 228 | - # Insert data directly (all fields including metadata) | |
| 229 | - # Milvus will accept all fields defined in schema + dynamic fields | |
| 230 | - data = embeddings | |
| 231 | - | |
| 232 | - # Insert data | |
| 233 | - result = self.client.insert( | |
| 234 | - collection_name=self.text_collection_name, | |
| 235 | - data=data, | |
| 236 | - ) | |
| 237 | - | |
| 238 | - logger.info(f"Inserted {len(data)} text embeddings") | |
| 239 | - return len(data) | |
| 240 | - | |
| 241 | - except Exception as e: | |
| 242 | - logger.error(f"Failed to insert text embeddings: {e}") | |
| 243 | - raise | |
| 244 | - | |
| 245 | - def insert_image_embeddings( | |
| 246 | - self, | |
| 247 | - embeddings: List[Dict[str, Any]], | |
| 248 | - ) -> int: | |
| 249 | - """Insert image embeddings with metadata into collection | |
| 250 | - | |
| 251 | - Args: | |
| 252 | - embeddings: List of dictionaries with keys: | |
| 253 | - - id: unique ID (product ID) | |
| 254 | - - image_path: path to the image file | |
| 255 | - - embedding: the embedding vector | |
| 256 | - - productDisplayName, gender, masterCategory, etc. (metadata) | |
| 257 | - | |
| 258 | - Returns: | |
| 259 | - Number of inserted embeddings | |
| 260 | - """ | |
| 261 | - if not embeddings: | |
| 262 | - return 0 | |
| 263 | - | |
| 264 | - try: | |
| 265 | - # Insert data directly (all fields including metadata) | |
| 266 | - # Milvus will accept all fields defined in schema + dynamic fields | |
| 267 | - data = embeddings | |
| 268 | - | |
| 269 | - # Insert data | |
| 270 | - result = self.client.insert( | |
| 271 | - collection_name=self.image_collection_name, | |
| 272 | - data=data, | |
| 273 | - ) | |
| 274 | - | |
| 275 | - logger.info(f"Inserted {len(data)} image embeddings") | |
| 276 | - return len(data) | |
| 277 | - | |
| 278 | - except Exception as e: | |
| 279 | - logger.error(f"Failed to insert image embeddings: {e}") | |
| 280 | - raise | |
| 281 | - | |
| 282 | - def search_similar_text( | |
| 283 | - self, | |
| 284 | - query_embedding: List[float], | |
| 285 | - limit: int = 10, | |
| 286 | - filters: Optional[str] = None, | |
| 287 | - output_fields: Optional[List[str]] = None, | |
| 288 | - ) -> List[Dict[str, Any]]: | |
| 289 | - """Search for similar text embeddings | |
| 290 | - | |
| 291 | - Args: | |
| 292 | - query_embedding: Query embedding vector | |
| 293 | - limit: Maximum number of results | |
| 294 | - filters: Filter expression (e.g., "product_id in [1, 2, 3]") | |
| 295 | - output_fields: List of fields to return | |
| 296 | - | |
| 297 | - Returns: | |
| 298 | - List of search results with fields: | |
| 299 | - - id: embedding ID | |
| 300 | - - distance: similarity distance | |
| 301 | - - entity: the matched entity with requested fields | |
| 302 | - """ | |
| 303 | - try: | |
| 304 | - if output_fields is None: | |
| 305 | - output_fields = [ | |
| 306 | - "id", | |
| 307 | - "text", | |
| 308 | - "productDisplayName", | |
| 309 | - "gender", | |
| 310 | - "masterCategory", | |
| 311 | - "subCategory", | |
| 312 | - "articleType", | |
| 313 | - "baseColour", | |
| 314 | - ] | |
| 315 | - | |
| 316 | - search_params = {} | |
| 317 | - if filters: | |
| 318 | - search_params["expr"] = filters | |
| 319 | - | |
| 320 | - results = self.client.search( | |
| 321 | - collection_name=self.text_collection_name, | |
| 322 | - data=[query_embedding], | |
| 323 | - limit=limit, | |
| 324 | - output_fields=output_fields, | |
| 325 | - search_params=search_params, | |
| 326 | - ) | |
| 327 | - | |
| 328 | - # Format results | |
| 329 | - formatted_results = [] | |
| 330 | - if results and len(results) > 0: | |
| 331 | - for hit in results[0]: | |
| 332 | - result = {"id": hit.get("id"), "distance": hit.get("distance")} | |
| 333 | - # Extract fields from entity | |
| 334 | - entity = hit.get("entity", {}) | |
| 335 | - for field in output_fields: | |
| 336 | - if field in entity: | |
| 337 | - result[field] = entity.get(field) | |
| 338 | - formatted_results.append(result) | |
| 339 | - | |
| 340 | - logger.debug(f"Found {len(formatted_results)} similar text embeddings") | |
| 341 | - return formatted_results | |
| 342 | - | |
| 343 | - except Exception as e: | |
| 344 | - logger.error(f"Failed to search similar text: {e}") | |
| 345 | - raise | |
| 346 | - | |
| 347 | - def search_similar_images( | |
| 348 | - self, | |
| 349 | - query_embedding: List[float], | |
| 350 | - limit: int = 10, | |
| 351 | - filters: Optional[str] = None, | |
| 352 | - output_fields: Optional[List[str]] = None, | |
| 353 | - ) -> List[Dict[str, Any]]: | |
| 354 | - """Search for similar image embeddings | |
| 355 | - | |
| 356 | - Args: | |
| 357 | - query_embedding: Query embedding vector | |
| 358 | - limit: Maximum number of results | |
| 359 | - filters: Filter expression (e.g., "product_id in [1, 2, 3]") | |
| 360 | - output_fields: List of fields to return | |
| 361 | - | |
| 362 | - Returns: | |
| 363 | - List of search results with fields: | |
| 364 | - - id: embedding ID | |
| 365 | - - distance: similarity distance | |
| 366 | - - entity: the matched entity with requested fields | |
| 367 | - """ | |
| 368 | - try: | |
| 369 | - if output_fields is None: | |
| 370 | - output_fields = [ | |
| 371 | - "id", | |
| 372 | - "image_path", | |
| 373 | - "productDisplayName", | |
| 374 | - "gender", | |
| 375 | - "masterCategory", | |
| 376 | - "subCategory", | |
| 377 | - "articleType", | |
| 378 | - "baseColour", | |
| 379 | - ] | |
| 380 | - | |
| 381 | - search_params = {} | |
| 382 | - if filters: | |
| 383 | - search_params["expr"] = filters | |
| 384 | - | |
| 385 | - results = self.client.search( | |
| 386 | - collection_name=self.image_collection_name, | |
| 387 | - data=[query_embedding], | |
| 388 | - limit=limit, | |
| 389 | - output_fields=output_fields, | |
| 390 | - search_params=search_params, | |
| 391 | - ) | |
| 392 | - | |
| 393 | - # Format results | |
| 394 | - formatted_results = [] | |
| 395 | - if results and len(results) > 0: | |
| 396 | - for hit in results[0]: | |
| 397 | - result = {"id": hit.get("id"), "distance": hit.get("distance")} | |
| 398 | - # Extract fields from entity | |
| 399 | - entity = hit.get("entity", {}) | |
| 400 | - for field in output_fields: | |
| 401 | - if field in entity: | |
| 402 | - result[field] = entity.get(field) | |
| 403 | - formatted_results.append(result) | |
| 404 | - | |
| 405 | - logger.debug(f"Found {len(formatted_results)} similar image embeddings") | |
| 406 | - return formatted_results | |
| 407 | - | |
| 408 | - except Exception as e: | |
| 409 | - logger.error(f"Failed to search similar images: {e}") | |
| 410 | - raise | |
| 411 | - | |
| 412 | - def get_collection_stats(self, collection_name: str) -> Dict[str, Any]: | |
| 413 | - """Get statistics for a collection | |
| 414 | - | |
| 415 | - Args: | |
| 416 | - collection_name: Name of the collection | |
| 417 | - | |
| 418 | - Returns: | |
| 419 | - Dictionary with collection statistics | |
| 420 | - """ | |
| 421 | - try: | |
| 422 | - stats = self.client.get_collection_stats(collection_name) | |
| 423 | - return { | |
| 424 | - "collection_name": collection_name, | |
| 425 | - "row_count": stats.get("row_count", 0), | |
| 426 | - } | |
| 427 | - except Exception as e: | |
| 428 | - logger.error(f"Failed to get collection stats: {e}") | |
| 429 | - return {"collection_name": collection_name, "row_count": 0} | |
| 430 | - | |
| 431 | - def delete_by_ids(self, collection_name: str, ids: List[int]) -> int: | |
| 432 | - """Delete embeddings by IDs | |
| 433 | - | |
| 434 | - Args: | |
| 435 | - collection_name: Name of the collection | |
| 436 | - ids: List of IDs to delete | |
| 437 | - | |
| 438 | - Returns: | |
| 439 | - Number of deleted embeddings | |
| 440 | - """ | |
| 441 | - if not ids: | |
| 442 | - return 0 | |
| 443 | - | |
| 444 | - try: | |
| 445 | - self.client.delete( | |
| 446 | - collection_name=collection_name, | |
| 447 | - ids=ids, | |
| 448 | - ) | |
| 449 | - logger.info(f"Deleted {len(ids)} embeddings from {collection_name}") | |
| 450 | - return len(ids) | |
| 451 | - except Exception as e: | |
| 452 | - logger.error(f"Failed to delete embeddings: {e}") | |
| 453 | - raise | |
| 454 | - | |
| 455 | - def clear_collection(self, collection_name: str) -> None: | |
| 456 | - """Clear all data from a collection | |
| 457 | - | |
| 458 | - Args: | |
| 459 | - collection_name: Name of the collection | |
| 460 | - """ | |
| 461 | - try: | |
| 462 | - if self.client.has_collection(collection_name): | |
| 463 | - self.client.drop_collection(collection_name) | |
| 464 | - logger.info(f"Dropped collection: {collection_name}") | |
| 465 | - except Exception as e: | |
| 466 | - logger.error(f"Failed to clear collection: {e}") | |
| 467 | - raise | |
| 468 | - | |
| 469 | - | |
| 470 | -# Global instance | |
| 471 | -_milvus_service: Optional[MilvusService] = None | |
| 472 | - | |
| 473 | - | |
| 474 | -def get_milvus_service() -> MilvusService: | |
| 475 | - """Get or create the global Milvus service instance""" | |
| 476 | - global _milvus_service | |
| 477 | - if _milvus_service is None: | |
| 478 | - _milvus_service = MilvusService() | |
| 479 | - _milvus_service.connect() | |
| 480 | - return _milvus_service |
app/tools/__init__.py
| ... | ... | @@ -5,13 +5,11 @@ LangChain Tools for Product Search and Discovery |
| 5 | 5 | from app.tools.search_tools import ( |
| 6 | 6 | analyze_image_style, |
| 7 | 7 | get_all_tools, |
| 8 | - search_by_image, | |
| 9 | 8 | search_products, |
| 10 | 9 | ) |
| 11 | 10 | |
| 12 | 11 | __all__ = [ |
| 13 | 12 | "search_products", |
| 14 | - "search_by_image", | |
| 15 | 13 | "analyze_image_style", |
| 16 | 14 | "get_all_tools", |
| 17 | 15 | ] | ... | ... |
app/tools/search_tools.py
| 1 | 1 | """ |
| 2 | 2 | Search Tools for Product Discovery |
| 3 | -Provides text-based, image-based, and VLM reasoning capabilities | |
| 3 | +Provides text-based search via Search API and VLM style analysis | |
| 4 | 4 | """ |
| 5 | 5 | |
| 6 | 6 | import base64 |
| ... | ... | @@ -8,40 +8,24 @@ import logging |
| 8 | 8 | from pathlib import Path |
| 9 | 9 | from typing import Optional |
| 10 | 10 | |
| 11 | +import requests | |
| 11 | 12 | from langchain_core.tools import tool |
| 12 | 13 | from openai import OpenAI |
| 13 | 14 | |
| 14 | 15 | from app.config import settings |
| 15 | -from app.services.embedding_service import EmbeddingService | |
| 16 | -from app.services.milvus_service import MilvusService | |
| 17 | 16 | |
| 18 | 17 | logger = logging.getLogger(__name__) |
| 19 | 18 | |
| 20 | -# Initialize services as singletons | |
| 21 | -_embedding_service: Optional[EmbeddingService] = None | |
| 22 | -_milvus_service: Optional[MilvusService] = None | |
| 23 | 19 | _openai_client: Optional[OpenAI] = None |
| 24 | 20 | |
| 25 | 21 | |
| 26 | -def get_embedding_service() -> EmbeddingService: | |
| 27 | - global _embedding_service | |
| 28 | - if _embedding_service is None: | |
| 29 | - _embedding_service = EmbeddingService() | |
| 30 | - return _embedding_service | |
| 31 | - | |
| 32 | - | |
| 33 | -def get_milvus_service() -> MilvusService: | |
| 34 | - global _milvus_service | |
| 35 | - if _milvus_service is None: | |
| 36 | - _milvus_service = MilvusService() | |
| 37 | - _milvus_service.connect() | |
| 38 | - return _milvus_service | |
| 39 | - | |
| 40 | - | |
| 41 | 22 | def get_openai_client() -> OpenAI: |
| 42 | 23 | global _openai_client |
| 43 | 24 | if _openai_client is None: |
| 44 | - _openai_client = OpenAI(api_key=settings.openai_api_key) | |
| 25 | + kwargs = {"api_key": settings.openai_api_key} | |
| 26 | + if settings.openai_api_base_url: | |
| 27 | + kwargs["base_url"] = settings.openai_api_base_url | |
| 28 | + _openai_client = OpenAI(**kwargs) | |
| 45 | 29 | return _openai_client |
| 46 | 30 | |
| 47 | 31 | |
| ... | ... | @@ -64,30 +48,26 @@ def search_products(query: str, limit: int = 5) -> str: |
| 64 | 48 | try: |
| 65 | 49 | logger.info(f"Searching products: '{query}', limit: {limit}") |
| 66 | 50 | |
| 67 | - embedding_service = get_embedding_service() | |
| 68 | - milvus_service = get_milvus_service() | |
| 69 | - | |
| 70 | - if not milvus_service.is_connected(): | |
| 71 | - milvus_service.connect() | |
| 72 | - | |
| 73 | - query_embedding = embedding_service.get_text_embedding(query) | |
| 74 | - | |
| 75 | - results = milvus_service.search_similar_text( | |
| 76 | - query_embedding=query_embedding, | |
| 77 | - limit=min(limit, 20), | |
| 78 | - filters=None, | |
| 79 | - output_fields=[ | |
| 80 | - "id", | |
| 81 | - "productDisplayName", | |
| 82 | - "gender", | |
| 83 | - "masterCategory", | |
| 84 | - "subCategory", | |
| 85 | - "articleType", | |
| 86 | - "baseColour", | |
| 87 | - "season", | |
| 88 | - "usage", | |
| 89 | - ], | |
| 90 | - ) | |
| 51 | + url = f"{settings.search_api_base_url.rstrip('/')}/search/" | |
| 52 | + headers = { | |
| 53 | + "Content-Type": "application/json", | |
| 54 | + "X-Tenant-ID": settings.search_api_tenant_id, | |
| 55 | + } | |
| 56 | + payload = { | |
| 57 | + "query": query, | |
| 58 | + "size": min(limit, 20), | |
| 59 | + "from": 0, | |
| 60 | + "language": "zh", | |
| 61 | + } | |
| 62 | + | |
| 63 | + response = requests.post(url, json=payload, headers=headers, timeout=60) | |
| 64 | + | |
| 65 | + if response.status_code != 200: | |
| 66 | + logger.error(f"Search API error: {response.status_code} - {response.text}") | |
| 67 | + return f"Error searching products: API returned {response.status_code}" | |
| 68 | + | |
| 69 | + data = response.json() | |
| 70 | + results = data.get("results", []) | |
| 91 | 71 | |
| 92 | 72 | if not results: |
| 93 | 73 | return "No products found matching your search." |
| ... | ... | @@ -95,131 +75,40 @@ def search_products(query: str, limit: int = 5) -> str: |
| 95 | 75 | output = f"Found {len(results)} product(s):\n\n" |
| 96 | 76 | |
| 97 | 77 | for idx, product in enumerate(results, 1): |
| 98 | - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n" | |
| 99 | - output += f" ID: {product.get('id', 'N/A')}\n" | |
| 100 | - output += f" Category: {product.get('masterCategory', 'N/A')} > {product.get('subCategory', 'N/A')} > {product.get('articleType', 'N/A')}\n" | |
| 101 | - output += f" Color: {product.get('baseColour', 'N/A')}\n" | |
| 102 | - output += f" Gender: {product.get('gender', 'N/A')}\n" | |
| 103 | - | |
| 104 | - if product.get("season"): | |
| 105 | - output += f" Season: {product.get('season')}\n" | |
| 106 | - if product.get("usage"): | |
| 107 | - output += f" Usage: {product.get('usage')}\n" | |
| 108 | - | |
| 109 | - if "distance" in product: | |
| 110 | - similarity = 1 - product["distance"] | |
| 111 | - output += f" Relevance: {similarity:.2%}\n" | |
| 78 | + output += f"{idx}. {product.get('title', 'Unknown Product')}\n" | |
| 79 | + output += f" ID: {product.get('spu_id', 'N/A')}\n" | |
| 80 | + output += f" Category: {product.get('category_path', product.get('category_name', 'N/A'))}\n" | |
| 81 | + if product.get("vendor"): | |
| 82 | + output += f" Brand: {product.get('vendor')}\n" | |
| 83 | + if product.get("price") is not None: | |
| 84 | + output += f" Price: {product.get('price')}\n" | |
| 85 | + | |
| 86 | + # 规格/颜色信息 | |
| 87 | + specs = product.get("specifications", []) | |
| 88 | + if specs: | |
| 89 | + color_spec = next( | |
| 90 | + (s for s in specs if s.get("name") == "color"), | |
| 91 | + None, | |
| 92 | + ) | |
| 93 | + if color_spec: | |
| 94 | + output += f" Color: {color_spec.get('value', 'N/A')}\n" | |
| 95 | + | |
| 96 | + if product.get("relevance_score") is not None: | |
| 97 | + output += f" Relevance: {product['relevance_score']:.2f}\n" | |
| 112 | 98 | |
| 113 | 99 | output += "\n" |
| 114 | 100 | |
| 115 | 101 | return output.strip() |
| 116 | 102 | |
| 103 | + except requests.exceptions.RequestException as e: | |
| 104 | + logger.error(f"Error searching products (network): {e}", exc_info=True) | |
| 105 | + return f"Error searching products: {str(e)}" | |
| 117 | 106 | except Exception as e: |
| 118 | 107 | logger.error(f"Error searching products: {e}", exc_info=True) |
| 119 | 108 | return f"Error searching products: {str(e)}" |
| 120 | 109 | |
| 121 | 110 | |
| 122 | 111 | @tool |
| 123 | -def search_by_image(image_path: str, limit: int = 5) -> str: | |
| 124 | - """Find similar fashion products using an image. | |
| 125 | - | |
| 126 | - Use when users want visually similar items: | |
| 127 | - - User uploads an image and asks "find similar items" | |
| 128 | - - "Show me products that look like this" | |
| 129 | - | |
| 130 | - Args: | |
| 131 | - image_path: Path to the image file | |
| 132 | - limit: Maximum number of results (1-20) | |
| 133 | - | |
| 134 | - Returns: | |
| 135 | - Formatted string with similar products | |
| 136 | - """ | |
| 137 | - try: | |
| 138 | - logger.info(f"Image search: '{image_path}', limit: {limit}") | |
| 139 | - | |
| 140 | - img_path = Path(image_path) | |
| 141 | - if not img_path.exists(): | |
| 142 | - return f"Error: Image file not found at '{image_path}'" | |
| 143 | - | |
| 144 | - embedding_service = get_embedding_service() | |
| 145 | - milvus_service = get_milvus_service() | |
| 146 | - | |
| 147 | - if not milvus_service.is_connected(): | |
| 148 | - milvus_service.connect() | |
| 149 | - | |
| 150 | - if ( | |
| 151 | - not hasattr(embedding_service, "clip_client") | |
| 152 | - or embedding_service.clip_client is None | |
| 153 | - ): | |
| 154 | - embedding_service.connect_clip() | |
| 155 | - | |
| 156 | - image_embedding = embedding_service.get_image_embedding(image_path) | |
| 157 | - | |
| 158 | - if image_embedding is None: | |
| 159 | - return "Error: Failed to generate embedding for image" | |
| 160 | - | |
| 161 | - results = milvus_service.search_similar_images( | |
| 162 | - query_embedding=image_embedding, | |
| 163 | - limit=min(limit + 1, 21), | |
| 164 | - filters=None, | |
| 165 | - output_fields=[ | |
| 166 | - "id", | |
| 167 | - "image_path", | |
| 168 | - "productDisplayName", | |
| 169 | - "gender", | |
| 170 | - "masterCategory", | |
| 171 | - "subCategory", | |
| 172 | - "articleType", | |
| 173 | - "baseColour", | |
| 174 | - "season", | |
| 175 | - "usage", | |
| 176 | - ], | |
| 177 | - ) | |
| 178 | - | |
| 179 | - if not results: | |
| 180 | - return "No similar products found." | |
| 181 | - | |
| 182 | - # Filter out the query image itself | |
| 183 | - query_id = img_path.stem | |
| 184 | - filtered_results = [] | |
| 185 | - for result in results: | |
| 186 | - result_path = result.get("image_path", "") | |
| 187 | - if Path(result_path).stem != query_id: | |
| 188 | - filtered_results.append(result) | |
| 189 | - if len(filtered_results) >= limit: | |
| 190 | - break | |
| 191 | - | |
| 192 | - if not filtered_results: | |
| 193 | - return "No similar products found." | |
| 194 | - | |
| 195 | - output = f"Found {len(filtered_results)} visually similar product(s):\n\n" | |
| 196 | - | |
| 197 | - for idx, product in enumerate(filtered_results, 1): | |
| 198 | - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n" | |
| 199 | - output += f" ID: {product.get('id', 'N/A')}\n" | |
| 200 | - output += f" Category: {product.get('masterCategory', 'N/A')} > {product.get('subCategory', 'N/A')} > {product.get('articleType', 'N/A')}\n" | |
| 201 | - output += f" Color: {product.get('baseColour', 'N/A')}\n" | |
| 202 | - output += f" Gender: {product.get('gender', 'N/A')}\n" | |
| 203 | - | |
| 204 | - if product.get("season"): | |
| 205 | - output += f" Season: {product.get('season')}\n" | |
| 206 | - if product.get("usage"): | |
| 207 | - output += f" Usage: {product.get('usage')}\n" | |
| 208 | - | |
| 209 | - if "distance" in product: | |
| 210 | - similarity = 1 - product["distance"] | |
| 211 | - output += f" Visual Similarity: {similarity:.2%}\n" | |
| 212 | - | |
| 213 | - output += "\n" | |
| 214 | - | |
| 215 | - return output.strip() | |
| 216 | - | |
| 217 | - except Exception as e: | |
| 218 | - logger.error(f"Error in image search: {e}", exc_info=True) | |
| 219 | - return f"Error searching by image: {str(e)}" | |
| 220 | - | |
| 221 | - | |
| 222 | -@tool | |
| 223 | 112 | def analyze_image_style(image_path: str) -> str: |
| 224 | 113 | """Analyze a fashion product image using AI vision to extract detailed style information. |
| 225 | 114 | |
| ... | ... | @@ -291,4 +180,4 @@ Provide a comprehensive yet concise description (3-4 sentences).""" |
| 291 | 180 | |
| 292 | 181 | def get_all_tools(): |
| 293 | 182 | """Get all available tools for the agent""" |
| 294 | - return [search_products, search_by_image, analyze_image_style] | |
| 183 | + return [search_products, analyze_image_style] | ... | ... |
docker-compose.yml deleted
| ... | ... | @@ -1,76 +0,0 @@ |
| 1 | -version: '3.5' | |
| 2 | - | |
| 3 | -services: | |
| 4 | - etcd: | |
| 5 | - container_name: milvus-etcd | |
| 6 | - image: quay.io/coreos/etcd:v3.5.5 | |
| 7 | - environment: | |
| 8 | - - ETCD_AUTO_COMPACTION_MODE=revision | |
| 9 | - - ETCD_AUTO_COMPACTION_RETENTION=1000 | |
| 10 | - - ETCD_QUOTA_BACKEND_BYTES=4294967296 | |
| 11 | - - ETCD_SNAPSHOT_COUNT=50000 | |
| 12 | - volumes: | |
| 13 | - - ./volumes/etcd:/etcd | |
| 14 | - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd | |
| 15 | - healthcheck: | |
| 16 | - test: ["CMD", "etcdctl", "endpoint", "health"] | |
| 17 | - interval: 30s | |
| 18 | - timeout: 20s | |
| 19 | - retries: 3 | |
| 20 | - | |
| 21 | - minio: | |
| 22 | - container_name: milvus-minio | |
| 23 | - image: minio/minio:RELEASE.2023-03-20T20-16-18Z | |
| 24 | - environment: | |
| 25 | - MINIO_ACCESS_KEY: minioadmin | |
| 26 | - MINIO_SECRET_KEY: minioadmin | |
| 27 | - ports: | |
| 28 | - - "9001:9001" | |
| 29 | - - "9000:9000" | |
| 30 | - volumes: | |
| 31 | - - ./volumes/minio:/minio_data | |
| 32 | - command: minio server /minio_data --console-address ":9001" | |
| 33 | - healthcheck: | |
| 34 | - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] | |
| 35 | - interval: 30s | |
| 36 | - timeout: 20s | |
| 37 | - retries: 3 | |
| 38 | - | |
| 39 | - standalone: | |
| 40 | - container_name: milvus-standalone | |
| 41 | - image: milvusdb/milvus:v2.4.0 | |
| 42 | - command: ["milvus", "run", "standalone"] | |
| 43 | - security_opt: | |
| 44 | - - seccomp:unconfined | |
| 45 | - environment: | |
| 46 | - ETCD_ENDPOINTS: etcd:2379 | |
| 47 | - MINIO_ADDRESS: minio:9000 | |
| 48 | - volumes: | |
| 49 | - - ./volumes/milvus:/var/lib/milvus | |
| 50 | - healthcheck: | |
| 51 | - test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] | |
| 52 | - interval: 30s | |
| 53 | - start_period: 90s | |
| 54 | - timeout: 20s | |
| 55 | - retries: 3 | |
| 56 | - ports: | |
| 57 | - - "19530:19530" | |
| 58 | - - "9091:9091" | |
| 59 | - depends_on: | |
| 60 | - - "etcd" | |
| 61 | - - "minio" | |
| 62 | - | |
| 63 | - attu: | |
| 64 | - container_name: milvus-attu | |
| 65 | - image: zilliz/attu:v2.4 | |
| 66 | - environment: | |
| 67 | - MILVUS_URL: milvus-standalone:19530 | |
| 68 | - ports: | |
| 69 | - - "8000:3000" | |
| 70 | - depends_on: | |
| 71 | - - "standalone" | |
| 72 | - | |
| 73 | -networks: | |
| 74 | - default: | |
| 75 | - name: milvus | |
| 76 | - |
docs/DEPLOY_CENTOS8.md
| 1 | -# OmniShopAgent centOS 8 部署指南 | |
| 1 | +# OmniShopAgent CentOS 8 部署指南 | |
| 2 | 2 | |
| 3 | 3 | ## 一、环境要求 |
| 4 | 4 | |
| ... | ... | @@ -6,8 +6,8 @@ |
| 6 | 6 | |------|------| |
| 7 | 7 | | 操作系统 | CentOS 8.x | |
| 8 | 8 | | Python | 3.12+(LangChain 1.x 要求 3.10+) | |
| 9 | -| 内存 | 建议 8GB+(Milvus + CLIP 较占内存) | | |
| 10 | -| 磁盘 | 建议 20GB+(含数据集) | | |
| 9 | +| 内存 | 建议 4GB+ | | |
| 10 | +| 磁盘 | 建议 10GB+ | | |
| 11 | 11 | |
| 12 | 12 | ## 二、快速部署步骤 |
| 13 | 13 | |
| ... | ... | @@ -21,7 +21,6 @@ chmod +x scripts/*.sh |
| 21 | 21 | |
| 22 | 22 | 该脚本会: |
| 23 | 23 | - 安装系统依赖(gcc、openssl-devel 等) |
| 24 | -- 安装 Docker(用于 Milvus) | |
| 25 | 24 | - 安装 Python 3.12(conda 或源码编译) |
| 26 | 25 | - 创建虚拟环境并安装 requirements.txt |
| 27 | 26 | |
| ... | ... | @@ -59,17 +58,7 @@ make -j $(nproc) |
| 59 | 58 | sudo make altinstall |
| 60 | 59 | ``` |
| 61 | 60 | |
| 62 | -#### 步骤 3:安装 Docker | |
| 63 | - | |
| 64 | -```bash | |
| 65 | -sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo | |
| 66 | -sudo dnf install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin | |
| 67 | -sudo systemctl enable docker && sudo systemctl start docker | |
| 68 | -sudo usermod -aG docker $USER | |
| 69 | -# 执行 newgrp docker 或重新登录 | |
| 70 | -``` | |
| 71 | - | |
| 72 | -#### 步骤 4:创建虚拟环境并安装依赖 | |
| 61 | +#### 步骤 3:创建虚拟环境并安装依赖 | |
| 73 | 62 | |
| 74 | 63 | ```bash |
| 75 | 64 | cd /path/to/shop_agent |
| ... | ... | @@ -79,46 +68,35 @@ pip install -U pip |
| 79 | 68 | pip install -r requirements.txt |
| 80 | 69 | ``` |
| 81 | 70 | |
| 82 | -#### 步骤 5:配置环境变量 | |
| 71 | +#### 步骤 4:配置环境变量 | |
| 83 | 72 | |
| 84 | 73 | ```bash |
| 85 | 74 | cp .env.example .env |
| 86 | 75 | # 编辑 .env,至少配置: |
| 87 | 76 | # OPENAI_API_KEY=sk-xxx |
| 88 | -# MILVUS_HOST=localhost | |
| 89 | -# MILVUS_PORT=19530 | |
| 90 | -# CLIP_SERVER_URL=grpc://localhost:51000 | |
| 77 | +# SEARCH_API_BASE_URL=http://120.76.41.98:6002 | |
| 78 | +# SEARCH_API_TENANT_ID=162 | |
| 91 | 79 | ``` |
| 92 | 80 | |
| 93 | -## 三、数据准备 | |
| 81 | +## 三、数据准备(可选) | |
| 94 | 82 | |
| 95 | 83 | ### 3.1 下载数据集 |
| 96 | 84 | |
| 85 | +如需图片风格分析功能,可下载 Kaggle 数据集: | |
| 86 | + | |
| 97 | 87 | ```bash |
| 98 | 88 | # 需先配置 Kaggle API:~/.kaggle/kaggle.json |
| 99 | 89 | python scripts/download_dataset.py |
| 100 | 90 | ``` |
| 101 | 91 | |
| 102 | -### 3.2 启动 Milvus 并索引数据 | |
| 103 | - | |
| 104 | -```bash | |
| 105 | -# 启动 Milvus | |
| 106 | -./scripts/run_milvus.sh | |
| 107 | - | |
| 108 | -# 等待就绪后,创建索引 | |
| 109 | -python scripts/index_data.py | |
| 110 | -``` | |
| 111 | - | |
| 112 | 92 | ## 四、启动服务 |
| 113 | 93 | |
| 114 | 94 | ### 4.1 启动脚本说明 |
| 115 | 95 | |
| 116 | 96 | | 脚本 | 用途 | |
| 117 | 97 | |------|------| |
| 118 | -| `start.sh` | 主启动脚本:启动 Milvus + Streamlit | | |
| 119 | -| `stop.sh` | 停止所有服务 | | |
| 120 | -| `run_milvus.sh` | 仅启动 Milvus | | |
| 121 | -| `run_clip.sh` | 仅启动 CLIP(图像搜索需此服务) | | |
| 98 | +| `start.sh` | 主启动脚本:启动 Streamlit | | |
| 99 | +| `stop.sh` | 停止 Streamlit | | |
| 122 | 100 | | `check_services.sh` | 健康检查 | |
| 123 | 101 | |
| 124 | 102 | ### 4.2 启动应用 |
| ... | ... | @@ -127,14 +105,7 @@ python scripts/index_data.py |
| 127 | 105 | # 方式 1:使用 start.sh(推荐) |
| 128 | 106 | ./scripts/start.sh |
| 129 | 107 | |
| 130 | -# 方式 2:分步启动 | |
| 131 | -# 终端 1:Milvus | |
| 132 | -./scripts/run_milvus.sh | |
| 133 | - | |
| 134 | -# 终端 2:CLIP(图像搜索需要) | |
| 135 | -./scripts/run_clip.sh | |
| 136 | - | |
| 137 | -# 终端 3:Streamlit | |
| 108 | +# 方式 2:直接运行 | |
| 138 | 109 | source venv/bin/activate |
| 139 | 110 | streamlit run app.py --server.port=8501 --server.address=0.0.0.0 |
| 140 | 111 | ``` |
| ... | ... | @@ -142,7 +113,6 @@ streamlit run app.py --server.port=8501 --server.address=0.0.0.0 |
| 142 | 113 | ### 4.3 访问地址 |
| 143 | 114 | |
| 144 | 115 | - **Streamlit 应用**:http://服务器IP:8501 |
| 145 | -- **Milvus Attu 管理界面**:http://服务器IP:8000 | |
| 146 | 116 | |
| 147 | 117 | ## 五、生产部署建议 |
| 148 | 118 | |
| ... | ... | @@ -153,7 +123,7 @@ streamlit run app.py --server.port=8501 --server.address=0.0.0.0 |
| 153 | 123 | ```ini |
| 154 | 124 | [Unit] |
| 155 | 125 | Description=OmniShopAgent Streamlit App |
| 156 | -After=network.target docker.service | |
| 126 | +After=network.target | |
| 157 | 127 | |
| 158 | 128 | [Service] |
| 159 | 129 | Type=simple |
| ... | ... | @@ -194,7 +164,6 @@ server { |
| 194 | 164 | |
| 195 | 165 | ```bash |
| 196 | 166 | sudo firewall-cmd --permanent --add-port=8501/tcp |
| 197 | -sudo firewall-cmd --permanent --add-port=19530/tcp | |
| 198 | 167 | sudo firewall-cmd --reload |
| 199 | 168 | ``` |
| 200 | 169 | |
| ... | ... | @@ -203,14 +172,8 @@ sudo firewall-cmd --reload |
| 203 | 172 | ### Q: Python 3.12 编译失败? |
| 204 | 173 | A: 确保已安装 `openssl-devel`、`libffi-devel`,或直接使用 Miniconda。 |
| 205 | 174 | |
| 206 | -### Q: Docker 权限不足? | |
| 207 | -A: 执行 `sudo usermod -aG docker $USER` 后重新登录。 | |
| 208 | - | |
| 209 | -### Q: Milvus 启动超时? | |
| 210 | -A: 首次启动需拉取镜像,可能较慢。可检查 `docker compose logs -f standalone`。 | |
| 211 | - | |
| 212 | -### Q: 图像搜索不可用? | |
| 213 | -A: 需单独启动 CLIP 服务:`./scripts/run_clip.sh`。 | |
| 175 | +### Q: Search API 连接失败? | |
| 176 | +A: 检查 `.env` 中 `SEARCH_API_BASE_URL` 和 `SEARCH_API_TENANT_ID` 配置,确保网络可访问搜索服务。 | |
| 214 | 177 | |
| 215 | 178 | ### Q: 健康检查? |
| 216 | 179 | A: 执行 `./scripts/check_services.sh` 查看各组件状态。 | ... | ... |
docs/Skills实现方案-LangChain1.0.md
| ... | ... | @@ -7,7 +7,7 @@ Agent 在 system prompt 中只看到技能摘要,按需加载详细技能内容
| 7 | 7 | 
| 8 | 8 | | 技能 | 英文标识 | 职责 |
| 9 | 9 | |------|----------|------|
| 10 | -| 查找相关商品 | lookup_related | 基于文本/图片查找相似或相关商品 | | |
| 10 | +| 查找相关商品 | lookup_related | 基于文本/图片查找相似或相关商品(图片需先分析风格) | | |
| 11 | 11 | | 搜索商品 | search_products | 按自然语言描述搜索商品 |
| 12 | 12 | | 检验商品 | check_product | 检验商品是否符合用户要求 |
| 13 | 13 | | 结果包装 | result_packaging | 格式化、排序、筛选并呈现结果 |
| ... | ... | @@ -24,7 +24,7 @@ Agent 在 system prompt 中只看到技能摘要,按需加载详细技能内容
| 24 | 24 | | **方式 A:create_agent + 自定义 Skill 中间件** | 购物导购等业务 Agent | `langchain>=1.0`、`langgraph>=1.0` |
| 25 | 25 | | **方式 B:Deep Agents + SKILL.md** | 依赖文件系统、多技能目录 | `deepagents` |
| 26 | 26 | 
| 27 | -购物导购场景推荐**方式 A**,更易与现有 Milvus、CLIP 等服务集成。 | |
| 27 | +购物导购场景推荐**方式 A**,更易与现有 Search API 等服务集成。
| 28 | 28 | 
| 29 | 29 | ### 2.2 核心思路:Progressive Disclosure
| 30 | 30 | 
| ... | ... | @@ -58,7 +58,7 @@ class Skill(TypedDict):
| 58 | 58 | SKILLS: list[Skill] = [
| 59 | 59 | {
| 60 | 60 | "name": "lookup_related",
| 61 | - "description": "查找与某商品相关的其他商品,支持以图搜图、文本相似、同品类推荐。", | |
| 61 | + "description": "查找与某商品相关的其他商品,支持文本相似、同品类推荐。", | |
| 62 | 62 | "content": """# 查找相关商品
| 63 | 63 | 
| 64 | 64 | ## 适用场景
| ... | ... | @@ -67,12 +67,11 @@ SKILLS: list[Skill] = [
| 67 | 67 | - 用户已有一件商品,想找相关款
| 68 | 68 | 
| 69 | 69 | ## 操作步骤
| 70 | -1. **有图片**:先调用 `analyze_image_style` 理解风格,再调用 `search_by_image` 或 `search_products` | |
| 70 | +1. **有图片**:先调用 `analyze_image_style` 理解风格,再调用 `search_products` 用描述搜索
| 71 | 71 | 2. **无图片**:用 `search_products` 描述品类+风格+颜色
| 72 | 72 | 3. 可结合上下文中的商品 ID、品类做同品类推荐
| 73 | 73 | 
| 74 | 74 | ## 可用工具
| 75 | -- `search_by_image(image_path, limit)`:以图搜图 | |
| 76 | 75 | - `search_products(query, limit)`:文本搜索
| 77 | 76 | - `analyze_image_style(image_path)`:分析图片风格""",
| 78 | 77 | },
| ... | ... | @@ -225,15 +224,14 @@ class ShoppingSkillMiddleware(AgentMiddleware):
| 225 | 224 | from langchain.agents import create_agent
| 226 | 225 | from langgraph.checkpoint.memory import MemorySaver
| 227 | 226 | 
| 228 | -# 基础工具(搜索、以图搜图、风格分析等) | |
| 229 | -from app.tools.search_tools import search_products, search_by_image, analyze_image_style | |
| 227 | +# 基础工具(搜索、风格分析等)
| 228 | +from app.tools.search_tools import search_products, analyze_image_style | |
| 230 | 229 | 
| 231 | 230 | agent = create_agent(
| 232 | 231 | model="gpt-4o-mini",
| 233 | 232 | tools=[
| 234 | 233 | load_skill, # 技能加载
| 235 | 234 | search_products,
| 236 | - search_by_image, | |
| 237 | 235 | analyze_image_style,
| 238 | 236 | ],
| 239 | 237 | system_prompt="""你是智能时尚购物助手。根据用户需求,先判断使用哪个技能,必要时用 load_skill 加载技能详情。
| ... | ... | @@ -250,7 +248,7 @@ agent = create_agent(
| 250 | 248 | 
| 251 | 249 | | 能力 | 技能 | 工具 |
| 252 | 250 | |------|------|------|
| 253 | -| 查找相关 | lookup_related | search_by_image, search_products, analyze_image_style | | |
| 251 | +| 查找相关 | lookup_related | search_products, analyze_image_style | | |
| 254 | 252 | | 搜索商品 | search_products | search_products |
| 255 | 253 | | 检验商品 | check_product | search_products(用 query 表达约束) |
| 256 | 254 | | 结果包装 | result_packaging | 无(纯 prompt 约束) | ... | ... |
技术实现报告.md renamed to docs/技术实现报告.md
| ... | ... | @@ -7,7 +7,7 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模 |
| 7 | 7 | ### 核心特性 |
| 8 | 8 | |
| 9 | 9 | - **自主工具选择与执行**:Agent 根据用户意图自主选择并调用工具 |
| 10 | -- **多模态搜索**:支持文本搜索 + 图像搜索 | |
| 10 | +- **文本搜索**:通过 Search API 进行商品搜索 | |
| 11 | 11 | - **对话上下文感知**:多轮对话中保持上下文记忆 |
| 12 | 12 | - **实时视觉分析**:基于 VLM 的图片风格分析 |
| 13 | 13 | |
| ... | ... | @@ -20,9 +20,7 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模 |
| 20 | 20 | | 运行环境 | Python 3.12 | |
| 21 | 21 | | Agent 框架 | LangGraph 1.x | |
| 22 | 22 | | LLM 框架 | LangChain 1.x(支持任意 LLM,默认 gpt-4o-mini) | |
| 23 | -| 文本向量 | text-embedding-3-small | | |
| 24 | -| 图像向量 | CLIP ViT-B/32 | | |
| 25 | -| 向量数据库 | Milvus | | |
| 23 | +| 搜索服务 | Search API (HTTP) | | |
| 26 | 24 | | 前端 | Streamlit | |
| 27 | 25 | | 数据集 | Kaggle Fashion Products | |
| 28 | 26 | |
| ... | ... | @@ -45,23 +43,21 @@ OmniShopAgent 是一个基于 **LangGraph** 和 **ReAct 模式** 的自主多模 |
| 45 | 43 | │ │ START → Agent → [Has tool_calls?] → Tools → Agent → END │ │ |
| 46 | 44 | │ └───────────────────────────────────────────────────────────┘ │ |
| 47 | 45 | └─────────────────────────────────────────────────────────────────┘ |
| 48 | - │ │ │ | |
| 49 | - ▼ ▼ ▼ | |
| 50 | -┌──────────────┐ ┌──────────────────┐ ┌─────────────────────┐ | |
| 51 | -│ search_ │ │ search_by_image │ │ analyze_image_style │ | |
| 52 | -│ products │ │ │ │ (OpenAI Vision) │ | |
| 53 | -└──────┬───────┘ └────────┬─────────┘ └──────────┬───────────┘ | |
| 54 | - │ │ │ | |
| 55 | - ▼ ▼ ▼ | |
| 46 | + │ │ | |
| 47 | + ▼ ▼ | |
| 48 | +┌──────────────┐ ┌─────────────────────┐ | |
| 49 | +│ search_ │ │ analyze_image_style │ | |
| 50 | +│ products │ │ (OpenAI Vision) │ | |
| 51 | +└──────┬───────┘ └──────────┬──────────┘ | |
| 52 | + │ │ | |
| 53 | + ▼ │ | |
| 54 | +┌──────────────────┐ │ | |
| 55 | +│ Search API │ │ | |
| 56 | +│ (HTTP POST) │ │ | |
| 57 | +└──────────────────┘ │ | |
| 58 | + ▼ | |
| 56 | 59 | ┌─────────────────────────────────────────────────────────────────┐ |
| 57 | -│ EmbeddingService (embedding_service.py) │ | |
| 58 | -│ OpenAI API (文本) │ CLIP Server (图像) │ | |
| 59 | -└─────────────────────────────────────────────────────────────────┘ | |
| 60 | - │ | |
| 61 | - ▼ | |
| 62 | -┌─────────────────────────────────────────────────────────────────┐ | |
| 63 | -│ MilvusService (milvus_service.py) │ | |
| 64 | -│ text_embeddings 集合 │ image_embeddings 集合 │ | |
| 60 | +│ OpenAI API (VLM 风格分析) │ | |
| 65 | 61 | └─────────────────────────────────────────────────────────────────┘ |
| 66 | 62 | ``` |
| 67 | 63 | |
| ... | ... | @@ -140,12 +136,11 @@ def _build_graph(self): |
| 140 | 136 | ```python |
| 141 | 137 | system_prompt = """You are an intelligent fashion shopping assistant. You can: |
| 142 | 138 | 1. Search for products by text description (use search_products) |
| 143 | -2. Find visually similar products from images (use search_by_image) | |
| 144 | -3. Analyze image style and attributes (use analyze_image_style) | |
| 139 | +2. Analyze image style and attributes (use analyze_image_style) | |
| 145 | 140 | |
| 146 | 141 | When a user asks about products: |
| 147 | 142 | - For text queries: use search_products directly |
| 148 | -- For image uploads: decide if you need to analyze_image_style first, then search | |
| 143 | +- For image uploads: use analyze_image_style first to understand the product, then use search_products with the extracted description | |
| 149 | 144 | - You can call multiple tools in sequence if needed |
| 150 | 145 | - Always provide helpful, friendly responses |
| 151 | 146 | |
| ... | ... | @@ -198,41 +193,38 @@ def chat(self, query: str, image_path: Optional[str] = None) -> dict: |
| 198 | 193 | |
| 199 | 194 | ### 4.2 搜索工具实现(search_tools.py) |
| 200 | 195 | |
| 201 | -#### 4.2.1 文本语义搜索 | |
| 196 | +#### 4.2.1 文本搜索(Search API) | |
| 202 | 197 | |
| 203 | 198 | ```python |
| 204 | 199 | @tool |
| 205 | 200 | def search_products(query: str, limit: int = 5) -> str: |
| 206 | 201 | """Search for fashion products using natural language descriptions.""" |
| 207 | 202 | try: |
| 208 | - embedding_service = get_embedding_service() | |
| 209 | - milvus_service = get_milvus_service() | |
| 210 | - | |
| 211 | - query_embedding = embedding_service.get_text_embedding(query) | |
| 212 | - | |
| 213 | - results = milvus_service.search_similar_text( | |
| 214 | - query_embedding=query_embedding, | |
| 215 | - limit=min(limit, 20), | |
| 216 | - filters=None, | |
| 217 | - output_fields=[ | |
| 218 | - "id", "productDisplayName", "gender", "masterCategory", | |
| 219 | - "subCategory", "articleType", "baseColour", "season", "usage", | |
| 220 | - ], | |
| 221 | - ) | |
| 203 | + url = f"{settings.search_api_base_url.rstrip('/')}/search/" | |
| 204 | + headers = { | |
| 205 | + "Content-Type": "application/json", | |
| 206 | + "X-Tenant-ID": settings.search_api_tenant_id, | |
| 207 | + } | |
| 208 | + payload = { | |
| 209 | + "query": query, | |
| 210 | + "size": min(limit, 20), | |
| 211 | + "from": 0, | |
| 212 | + "language": "zh", | |
| 213 | + } | |
| 214 | + | |
| 215 | + response = requests.post(url, json=payload, headers=headers, timeout=60) | |
| 216 | + data = response.json() | |
| 217 | + results = data.get("results", []) | |
| 222 | 218 | |
| 223 | 219 | if not results: |
| 224 | 220 | return "No products found matching your search." |
| 225 | 221 | |
| 226 | 222 | output = f"Found {len(results)} product(s):\n\n" |
| 227 | 223 | for idx, product in enumerate(results, 1): |
| 228 | - output += f"{idx}. {product.get('productDisplayName', 'Unknown Product')}\n" | |
| 229 | - output += f" ID: {product.get('id', 'N/A')}\n" | |
| 230 | - output += f" Category: {product.get('masterCategory')} > {product.get('subCategory')} > {product.get('articleType')}\n" | |
| 231 | - output += f" Color: {product.get('baseColour')}\n" | |
| 232 | - output += f" Gender: {product.get('gender')}\n" | |
| 233 | - if "distance" in product: | |
| 234 | - similarity = 1 - product["distance"] | |
| 235 | - output += f" Relevance: {similarity:.2%}\n" | |
| 224 | + output += f"{idx}. {product.get('title', 'Unknown Product')}\n" | |
| 225 | + output += f" ID: {product.get('spu_id', 'N/A')}\n" | |
| 226 | + output += f" Category: {product.get('category_path', 'N/A')}\n" | |
| 227 | + output += f" Price: {product.get('price')}\n" | |
| 236 | 228 | output += "\n" |
| 237 | 229 | |
| 238 | 230 | return output.strip() |
| ... | ... | @@ -240,38 +232,7 @@ def search_products(query: str, limit: int = 5) -> str: |
| 240 | 232 | return f"Error searching products: {str(e)}" |
| 241 | 233 | ``` |
| 242 | 234 | |
| 243 | -#### 4.2.2 图像相似度搜索 | |
| 244 | - | |
| 245 | -```python | |
| 246 | -@tool | |
| 247 | -def search_by_image(image_path: str, limit: int = 5) -> str: | |
| 248 | - """Find similar fashion products using an image.""" | |
| 249 | - if not Path(image_path).exists(): | |
| 250 | - return f"Error: Image file not found at '{image_path}'" | |
| 251 | - | |
| 252 | - embedding_service = get_embedding_service() | |
| 253 | - milvus_service = get_milvus_service() | |
| 254 | - | |
| 255 | - if not embedding_service.clip_client: | |
| 256 | - embedding_service.connect_clip() | |
| 257 | - | |
| 258 | - image_embedding = embedding_service.get_image_embedding(image_path) | |
| 259 | - | |
| 260 | - results = milvus_service.search_similar_images( | |
| 261 | - query_embedding=image_embedding, | |
| 262 | - limit=min(limit + 1, 21), | |
| 263 | - output_fields=[...], | |
| 264 | - ) | |
| 265 | - | |
| 266 | - # 过滤掉查询图像本身(如上传的是商品库中的图) | |
| 267 | - query_id = Path(image_path).stem | |
| 268 | - filtered_results = [r for r in results if Path(r.get("image_path", "")).stem != query_id] | |
| 269 | - filtered_results = filtered_results[:limit] | |
| 270 | - | |
| 271 | - | |
| 272 | -``` | |
| 273 | - | |
| 274 | -#### 4.2.3 视觉分析(VLM) | |
| 235 | +#### 4.2.2 视觉分析(VLM) | |
| 275 | 236 | |
| 276 | 237 | ```python |
| 277 | 238 | @tool |
| ... | ... | @@ -310,161 +271,9 @@ Provide a comprehensive yet concise description (3-4 sentences).""" |
| 310 | 271 | |
| 311 | 272 | --- |
| 312 | 273 | |
| 313 | -### 4.3 向量服务实现 | |
| 314 | - | |
| 315 | -#### 4.3.1 EmbeddingService(embedding_service.py) | |
| 316 | - | |
| 317 | -```python | |
| 318 | -class EmbeddingService: | |
| 319 | - def get_text_embedding(self, text: str) -> List[float]: | |
| 320 | - """OpenAI text-embedding-3-small""" | |
| 321 | - response = self.openai_client.embeddings.create( | |
| 322 | - input=text, model=self.text_embedding_model | |
| 323 | - ) | |
| 324 | - return response.data[0].embedding | |
| 325 | - | |
| 326 | - def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: | |
| 327 | - """CLIP 图像向量""" | |
| 328 | - if not self.clip_client: | |
| 329 | - raise RuntimeError("CLIP client not connected. Call connect_clip() first.") | |
| 330 | - result = self.clip_client.encode([str(image_path)]) | |
| 331 | - if isinstance(result, np.ndarray): | |
| 332 | - embedding = result[0].tolist() if len(result.shape) > 1 else result.tolist() | |
| 333 | - else: | |
| 334 | - embedding = result[0].embedding.tolist() | |
| 335 | - return embedding | |
| 336 | - | |
| 337 | - def get_text_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]: | |
| 338 | - """批量文本嵌入,用于索引""" | |
| 339 | - for i in range(0, len(texts), batch_size): | |
| 340 | - batch = texts[i : i + batch_size] | |
| 341 | - response = self.openai_client.embeddings.create(input=batch, ...) | |
| 342 | - embeddings = [item.embedding for item in response.data] | |
| 343 | - all_embeddings.extend(embeddings) | |
| 344 | - return all_embeddings | |
| 345 | -``` | |
| 346 | - | |
| 347 | -#### 4.3.2 MilvusService(milvus_service.py) | |
| 274 | +### 4.3 Streamlit 前端(app.py) | |
| 348 | 275 | |
| 349 | -**文本集合 Schema:** | |
| 350 | - | |
| 351 | -```python | |
| 352 | -schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True) | |
| 353 | -schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True) | |
| 354 | -schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=2000) | |
| 355 | -schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.text_dim) # 1536 | |
| 356 | -schema.add_field(field_name="productDisplayName", datatype=DataType.VARCHAR, max_length=500) | |
| 357 | -schema.add_field(field_name="gender", datatype=DataType.VARCHAR, max_length=50) | |
| 358 | -schema.add_field(field_name="masterCategory", datatype=DataType.VARCHAR, max_length=100) | |
| 359 | -# ... 更多元数据字段 | |
| 360 | -``` | |
| 361 | - | |
| 362 | -**图像集合 Schema:** | |
| 363 | - | |
| 364 | -```python | |
| 365 | -schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True) | |
| 366 | -schema.add_field(field_name="image_path", datatype=DataType.VARCHAR, max_length=500) | |
| 367 | -schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=self.image_dim) # 512 | |
| 368 | -# ... 产品元数据 | |
| 369 | -``` | |
| 370 | - | |
| 371 | -**相似度搜索:** | |
| 372 | - | |
| 373 | -```python | |
| 374 | -def search_similar_text(self, query_embedding, limit=10, output_fields=None): | |
| 375 | - results = self.client.search( | |
| 376 | - collection_name=self.text_collection_name, | |
| 377 | - data=[query_embedding], | |
| 378 | - limit=limit, | |
| 379 | - output_fields=output_fields, | |
| 380 | - ) | |
| 381 | - formatted_results = [] | |
| 382 | - for hit in results[0]: | |
| 383 | - result = {"id": hit.get("id"), "distance": hit.get("distance")} | |
| 384 | - entity = hit.get("entity", {}) | |
| 385 | - for field in output_fields: | |
| 386 | - if field in entity: | |
| 387 | - result[field] = entity.get(field) | |
| 388 | - formatted_results.append(result) | |
| 389 | - return formatted_results | |
| 390 | -``` | |
| 391 | - | |
| 392 | ---- | |
| 393 | - | |
| 394 | -### 4.4 数据索引脚本(index_data.py) | |
| 395 | - | |
| 396 | -#### 4.4.1 产品数据加载 | |
| 397 | - | |
| 398 | -```python | |
| 399 | -def _load_products_from_csv(self) -> Dict[int, Dict[str, Any]]: | |
| 400 | - products = {} | |
| 401 | - # 加载 images.csv 映射 | |
| 402 | - with open(self.images_csv, "r") as f: | |
| 403 | - images_dict = {int(row["filename"].split(".")[0]): row["link"] for row in csv.DictReader(f)} | |
| 404 | - | |
| 405 | - # 加载 styles.csv | |
| 406 | - with open(self.styles_csv, "r") as f: | |
| 407 | - for row in csv.DictReader(f): | |
| 408 | - product_id = int(row["id"]) | |
| 409 | - products[product_id] = { | |
| 410 | - "id": product_id, | |
| 411 | - "gender": row.get("gender", ""), | |
| 412 | - "masterCategory": row.get("masterCategory", ""), | |
| 413 | - "subCategory": row.get("subCategory", ""), | |
| 414 | - "articleType": row.get("articleType", ""), | |
| 415 | - "baseColour": row.get("baseColour", ""), | |
| 416 | - "season": row.get("season", ""), | |
| 417 | - "usage": row.get("usage", ""), | |
| 418 | - "productDisplayName": row.get("productDisplayName", ""), | |
| 419 | - "imagePath": f"{product_id}.jpg", | |
| 420 | - } | |
| 421 | - return products | |
| 422 | -``` | |
| 423 | - | |
| 424 | -#### 4.4.2 文本索引 | |
| 425 | - | |
| 426 | -```python | |
| 427 | -def _create_product_text(self, product: Dict[str, Any]) -> str: | |
| 428 | - """构造产品文本用于 embedding""" | |
| 429 | - parts = [ | |
| 430 | - product.get("productDisplayName", ""), | |
| 431 | - f"Gender: {product.get('gender', '')}", | |
| 432 | - f"Category: {product.get('masterCategory', '')} > {product.get('subCategory', '')}", | |
| 433 | - f"Type: {product.get('articleType', '')}", | |
| 434 | - f"Color: {product.get('baseColour', '')}", | |
| 435 | - f"Season: {product.get('season', '')}", | |
| 436 | - f"Usage: {product.get('usage', '')}", | |
| 437 | - ] | |
| 438 | - return " | ".join([p for p in parts if p and p != "Gender: " and p != "Color: "]) | |
| 439 | -``` | |
| 440 | - | |
| 441 | -#### 4.4.3 批量索引流程 | |
| 442 | - | |
| 443 | -```python | |
| 444 | -# 文本索引 | |
| 445 | -texts = [self._create_product_text(p) for p in products] | |
| 446 | -embeddings = self.embedding_service.get_text_embeddings_batch(texts, batch_size=50) | |
| 447 | -milvus_data = [{ | |
| 448 | - "id": product_id, | |
| 449 | - "text": text[:2000], | |
| 450 | - "embedding": embedding, | |
| 451 | - "productDisplayName": product["productDisplayName"][:500], | |
| 452 | - "gender": product["gender"][:50], | |
| 453 | - # ... 其他元数据 | |
| 454 | -} for product_id, text, embedding in zip(...)] | |
| 455 | -self.milvus_service.insert_text_embeddings(milvus_data) | |
| 456 | - | |
| 457 | -# 图像索引 | |
| 458 | -image_paths = [self.image_dir / p["imagePath"] for p in products] | |
| 459 | -embeddings = self.embedding_service.get_image_embeddings_batch(image_paths, batch_size=32) | |
| 460 | -# 类似插入 image_embeddings 集合 | |
| 461 | -``` | |
| 462 | - | |
| 463 | ---- | |
| 464 | - | |
| 465 | -### 4.5 Streamlit 前端(app.py) | |
| 466 | - | |
| 467 | -#### 4.5.1 会话与 Agent 初始化 | |
| 276 | +#### 4.3.1 会话与 Agent 初始化 | |
| 468 | 277 | |
| 469 | 278 | ```python |
| 470 | 279 | def initialize_session(): |
| ... | ... | @@ -478,7 +287,7 @@ def initialize_session(): |
| 478 | 287 | st.session_state.uploaded_image = None |
| 479 | 288 | ``` |
| 480 | 289 | |
| 481 | -#### 4.5.2 产品信息解析 | |
| 290 | +#### 4.3.2 产品信息解析 | |
| 482 | 291 | |
| 483 | 292 | ```python |
| 484 | 293 | def extract_products_from_response(response: str) -> list: |
| ... | ... | @@ -501,7 +310,7 @@ def extract_products_from_response(response: str) -> list: |
| 501 | 310 | return products |
| 502 | 311 | ``` |
| 503 | 312 | |
| 504 | -#### 4.5.3 多轮对话中的图片引用 | |
| 313 | +#### 4.3.3 多轮对话中的图片引用 | |
| 505 | 314 | |
| 506 | 315 | ```python |
| 507 | 316 | # 用户输入 "make them formal" 时,若上一条消息有图片,则引用该图片 |
| ... | ... | @@ -514,28 +323,14 @@ if any(ref in query_lower for ref in ["this", "that", "the image", "it"]): |
| 514 | 323 | |
| 515 | 324 | --- |
| 516 | 325 | |
| 517 | -### 4.6 配置管理(config.py) | |
| 326 | +### 4.4 配置管理(config.py) | |
| 518 | 327 | |
| 519 | 328 | ```python |
| 520 | 329 | class Settings(BaseSettings): |
| 521 | 330 | openai_api_key: str |
| 522 | 331 | openai_model: str = "gpt-4o-mini" |
| 523 | - openai_embedding_model: str = "text-embedding-3-small" | |
| 524 | - clip_server_url: str = "grpc://localhost:51000" | |
| 525 | - milvus_uri: str = "http://localhost:19530" | |
| 526 | - text_collection_name: str = "text_embeddings" | |
| 527 | - image_collection_name: str = "image_embeddings" | |
| 528 | - text_dim: int = 1536 | |
| 529 | - image_dim: int = 512 | |
| 530 | - | |
| 531 | - @property | |
| 532 | - def milvus_uri_absolute(self) -> str: | |
| 533 | - """支持 Milvus Standalone 和 Milvus Lite""" | |
| 534 | - if self.milvus_uri.startswith(("http://", "https://")): | |
| 535 | - return self.milvus_uri | |
| 536 | - if self.milvus_uri.startswith("./"): | |
| 537 | - return os.path.join(base_dir, self.milvus_uri[2:]) | |
| 538 | - return self.milvus_uri | |
| 332 | + search_api_base_url: str = "http://120.76.41.98:6002" | |
| 333 | + search_api_tenant_id: str = "162" | |
| 539 | 334 | |
| 540 | 335 | class Config: |
| 541 | 336 | env_file = ".env" |
| ... | ... | @@ -547,35 +342,22 @@ class Settings(BaseSettings): |
| 547 | 342 | |
| 548 | 343 | ### 5.1 依赖服务 |
| 549 | 344 | |
| 550 | -```yaml | |
| 551 | -# docker-compose.yml 提供 | |
| 552 | -- etcd: 元数据存储 | |
| 553 | -- minio: 对象存储 | |
| 554 | -- milvus-standalone: 向量数据库 | |
| 555 | -- attu: Milvus 管理界面 | |
| 556 | -``` | |
| 345 | +- **Search API**:外部搜索服务(HTTP) | |
| 346 | +- **OpenAI API**:LLM 与 VLM 图像分析 | |
| 557 | 347 | |
| 558 | 348 | ### 5.2 启动流程 |
| 559 | 349 | |
| 560 | 350 | ```bash |
| 561 | 351 | # 1. 环境 |
| 562 | 352 | pip install -r requirements.txt |
| 563 | -cp .env.example .env # 配置 OPENAI_API_KEY | |
| 353 | +cp .env.example .env # 配置 OPENAI_API_KEY、SEARCH_API_* 等 | |
| 564 | 354 | |
| 565 | -# 2. 下载数据 | |
| 355 | +# 2. (可选)下载数据 | |
| 566 | 356 | python scripts/download_dataset.py # Kaggle Fashion Product Images Dataset |
| 567 | 357 | |
| 568 | -# 3. 启动 CLIP 服务(需单独运行) | |
| 569 | -python -m clip_server | |
| 570 | - | |
| 571 | -# 4. 启动 Milvus | |
| 572 | -docker-compose up | |
| 573 | - | |
| 574 | -# 5. 索引数据 | |
| 575 | -python scripts/index_data.py | |
| 576 | - | |
| 577 | -# 6. 启动应用 | |
| 358 | +# 3. 启动应用 | |
| 578 | 359 | streamlit run app.py |
| 360 | +# 或 ./scripts/start.sh | |
| 579 | 361 | ``` |
| 580 | 362 | |
| 581 | 363 | --- |
| ... | ... | @@ -585,7 +367,6 @@ streamlit run app.py |
| 585 | 367 | | 场景 | 用户输入 | Agent 行为 | 工具调用 | |
| 586 | 368 | |------|----------|------------|----------| |
| 587 | 369 | | 文本搜索 | "winter coats for women" | 直接文本搜索 | `search_products("winter coats women")` | |
| 588 | -| 图像搜索 | [上传图片] "find similar" | 图像相似度搜索 | `search_by_image(path)` | | |
| 589 | 370 | | 风格分析+搜索 | [上传复古夹克] "what style? find matching pants" | 先分析风格再搜索 | `analyze_image_style(path)` → `search_products("vintage pants casual")` | |
| 590 | 371 | | 多轮上下文 | [第1轮] "show me red dresses"<br>[第2轮] "make them formal" | 结合上下文 | `search_products("red formal dresses")` | |
| 591 | 372 | |
| ... | ... | @@ -595,10 +376,9 @@ streamlit run app.py |
| 595 | 376 | |
| 596 | 377 | 1. **ReAct 模式**:Agent 自主决定何时调用工具、调用哪些工具、是否继续调用。 |
| 597 | 378 | 2. **LangGraph 状态图**:`START → Agent → [条件] → Tools → Agent → END`,支持多轮工具调用。 |
| 598 | -3. **多模态**:文本 + 图像 + VLM 分析,覆盖文本搜索、以图搜图、风格理解。 | |
| 599 | -4. **双向量集合**:Milvus 中 text_embeddings / image_embeddings 分别存储,支持不同模态的检索。 | |
| 600 | -5. **会话持久化**:`MemorySaver` + `thread_id` 实现多轮对话记忆。 | |
| 601 | -6. **格式约束**:System prompt 严格限制产品输出格式,便于前端解析和展示。 | |
| 379 | +3. **搜索与风格分析**:Search API 文本搜索 + VLM 图像风格分析。 | |
| 380 | +4. **会话持久化**:`MemorySaver` + `thread_id` 实现多轮对话记忆。 | |
| 381 | +5. **格式约束**:System prompt 严格限制产品输出格式,便于前端解析和展示。 | |
| 602 | 382 | |
| 603 | 383 | --- |
| 604 | 384 | |
| ... | ... | @@ -611,8 +391,6 @@ OmniShopAgent/ |
| 611 | 391 | │ │ └── shopping_agent.py |
| 612 | 392 | │ ├── config.py |
| 613 | 393 | │ ├── services/ |
| 614 | -│ │ ├── embedding_service.py | |
| 615 | -│ │ └── milvus_service.py | |
| 616 | 394 | │ └── tools/ |
| 617 | 395 | │ └── search_tools.py |
| 618 | 396 | ├── scripts/ | ... | ... |
| ... | ... | @@ -0,0 +1,1651 @@ |
| 1 | +# 搜索API接口对接指南 | |
| 2 | + | |
| 3 | +本文档为搜索服务的使用方提供完整的API对接指南,包括接口说明、请求参数、响应格式和使用示例。 | |
| 4 | + | |
| 5 | +## 目录 | |
| 6 | + | |
| 7 | +1. [快速开始](#快速开始) | |
| 8 | + - 1.1 [基础信息](#11-基础信息) | |
| 9 | + - 1.2 [最简单的搜索请求](#12-最简单的搜索请求) | |
| 10 | + - 1.3 [带过滤与分页的搜索](#13-带过滤与分页的搜索) | |
| 11 | + - 1.4 [开启分面的搜索](#14-开启分面的搜索) | |
| 12 | + | |
| 13 | +2. [接口概览](#接口概览) | |
| 14 | + | |
| 15 | +3. [搜索接口](#搜索接口) | |
| 16 | + - 3.1 [接口信息](#31-接口信息) | |
| 17 | + - 3.2 [请求参数](#32-请求参数) | |
| 18 | + - 3.3 [过滤器详解](#33-过滤器详解) | |
| 19 | + - 3.4 [分面配置](#34-分面配置) | |
| 20 | + - 3.5 [SKU筛选维度](#35-sku筛选维度) | |
| 21 | + - 3.6 [布尔表达式语法](#36-布尔表达式语法) | |
| 22 | + - 3.7 [搜索建议接口](#37-搜索建议接口) | |
| 23 | + - 3.8 [即时搜索接口](#38-即时搜索接口) | |
| 24 | + - 3.9 [获取单个文档](#39-获取单个文档) | |
| 25 | + | |
| 26 | +4. [响应格式说明](#响应格式说明) | |
| 27 | + - 4.1 [标准响应结构](#41-标准响应结构) | |
| 28 | + - 4.2 [响应字段说明](#42-响应字段说明) | |
| 29 | + - 4.2.1 [query_info 说明](#421-query_info-说明) | |
| 30 | + - 4.3 [SpuResult字段说明](#43-spuresult字段说明) | |
| 31 | + - 4.4 [SkuResult字段说明](#44-skuresult字段说明) | |
| 32 | + - 4.5 [多语言字段说明](#45-多语言字段说明) | |
| 33 | + | |
| 34 | +5. [索引接口](#索引接口) | |
| 35 | + - 5.0 [为租户创建索引](#50-为租户创建索引) | |
| 36 | + - 5.1 [全量索引接口](#51-全量索引接口) | |
| 37 | + - 5.2 [增量索引接口](#52-增量索引接口) | |
| 38 | + - 5.3 [查询文档接口](#53-查询文档接口) | |
| 39 | + - 5.4 [索引健康检查接口](#54-索引健康检查接口) | |
| 40 | + | |
| 41 | +6. [管理接口](#管理接口) | |
| 42 | + - 6.1 [健康检查](#61-健康检查) | |
| 43 | + - 6.2 [获取配置](#62-获取配置) | |
| 44 | + - 6.3 [索引统计](#63-索引统计) | |
| 45 | + | |
| 46 | +7. [常见场景示例](#常见场景示例) | |
| 47 | + - 7.1 [基础搜索与排序](#71-基础搜索与排序) | |
| 48 | + - 7.2 [过滤搜索](#72-过滤搜索) | |
| 49 | + - 7.3 [分面搜索](#73-分面搜索) | |
| 50 | + - 7.4 [规格过滤与分面](#74-规格过滤与分面) | |
| 51 | + - 7.5 [SKU筛选](#75-sku筛选) | |
| 52 | + - 7.6 [布尔表达式搜索](#76-布尔表达式搜索) | |
| 53 | + - 7.7 [分页查询](#77-分页查询) | |
| 54 | + | |
| 55 | +8. [数据模型](#数据模型) | |
| 56 | + - 8.1 [商品字段定义](#81-商品字段定义) | |
| 57 | + - 8.2 [字段类型速查](#82-字段类型速查) | |
| 58 | + - 8.3 [常用字段列表](#83-常用字段列表) | |
| 59 | + - 8.4 [支持的分析器](#84-支持的分析器) | |
| 60 | + | |
| 61 | +--- | |
| 62 | + | |
| 63 | +## 快速开始 | |
| 64 | + | |
| 65 | +### 1.1 基础信息 | |
| 66 | + | |
| 67 | +- **Base URL**: `http://120.76.41.98:6002` | |
| 68 | +- **协议**: HTTP/HTTPS | |
| 69 | +- **数据格式**: JSON | |
| 70 | +- **字符编码**: UTF-8 | |
| 71 | +- **请求方法**: POST(搜索接口) | |
| 72 | + | |
| 73 | +**重要提示**: `tenant_id` 通过 HTTP Header `X-Tenant-ID` 传递,不在请求体中。 | |
| 74 | + | |
| 75 | +### 1.2 最简单的搜索请求 | |
| 76 | + | |
| 77 | +```bash | |
| 78 | +curl -X POST "http://120.76.41.98:6002/search/" \ | |
| 79 | + -H "Content-Type: application/json" \ | |
| 80 | + -H "X-Tenant-ID: 162" \ | |
| 81 | + -d '{"query": "芭比娃娃"}' | |
| 82 | +``` | |
| 83 | + | |
| 84 | +### 1.3 带过滤与分页的搜索 | |
| 85 | + | |
| 86 | +```bash | |
| 87 | +curl -X POST "http://120.76.41.98:6002/search/" \ | |
| 88 | + -H "Content-Type: application/json" \ | |
| 89 | + -H "X-Tenant-ID: 162" \ | |
| 90 | + -d '{ | |
| 91 | + "query": "芭比娃娃", | |
| 92 | + "size": 5, | |
| 93 | + "from": 10, | |
| 94 | + "range_filters": { | |
| 95 | + "min_price": { | |
| 96 | + "gte": 50, | |
| 97 | + "lte": 200 | |
| 98 | + }, | |
| 99 | + "create_time": { | |
| 100 | + "gte": "2020-01-01T00:00:00Z" | |
| 101 | + } | |
| 102 | + }, | |
| 103 | + "sort_by": "price", | |
| 104 | + "sort_order": "asc" | |
| 105 | + }' | |
| 106 | +``` | |
| 107 | + | |
| 108 | +### 1.4 开启分面的搜索 | |
| 109 | + | |
| 110 | +```bash | |
| 111 | +curl -X POST "http://120.76.41.98:6002/search/" \ | |
| 112 | + -H "Content-Type: application/json" \ | |
| 113 | + -H "X-Tenant-ID: 162" \ | |
| 114 | + -d '{ | |
| 115 | + "query": "芭比娃娃", | |
| 116 | + "facets": [ | |
| 117 | + {"field": "category1_name", "size": 10, "type": "terms"}, | |
| 118 | + {"field": "specifications.color", "size": 10, "type": "terms"}, | |
| 119 | + {"field": "specifications.size", "size": 10, "type": "terms"} | |
| 120 | + ], | |
| 121 | + "min_score": 0.2 | |
| 122 | + }' | |
| 123 | +``` | |
| 124 | + | |
| 125 | +--- | |
| 126 | + | |
| 127 | +## 接口概览 | |
| 128 | + | |
| 129 | +| 接口 | HTTP Method | Endpoint | 说明 | | |
| 130 | +|------|------|------|------| | |
| 131 | +| 搜索 | POST | `/search/` | 执行搜索查询 | | |
| 132 | +| 搜索建议 | GET | `/search/suggestions` | 搜索建议(框架,暂未实现) ⚠️ TODO | | |
| 133 | +| 即时搜索 | GET | `/search/instant` | 边输入边搜索(框架) ⚠️ TODO | | |
| 134 | +| 获取文档 | GET | `/search/{doc_id}` | 获取单个文档 | | |
| 135 | +| 全量索引 | POST | `/indexer/reindex` | 全量索引接口(导入数据,不删除索引) | | |
| 136 | +| 增量索引 | POST | `/indexer/index` | 增量索引接口(指定SPU ID列表进行索引,支持自动检测删除和显式删除) | | |
| 137 | +| 查询文档 | POST | `/indexer/documents` | 查询SPU文档数据(不写入ES) | | |
| 138 | +| 索引健康检查 | GET | `/indexer/health` | 检查索引服务状态 | | |
| 139 | +| 健康检查 | GET | `/admin/health` | 服务健康检查 | | |
| 140 | +| 获取配置 | GET | `/admin/config` | 获取租户配置 | | |
| 141 | +| 索引统计 | GET | `/admin/stats` | 获取索引统计信息 | | |
| 142 | + | |
| 143 | +--- | |
| 144 | + | |
| 145 | +## 搜索接口 | |
| 146 | + | |
| 147 | +### 3.1 接口信息 | |
| 148 | + | |
| 149 | +- **端点**: `POST /search/` | |
| 150 | +- **描述**: 执行文本搜索查询,支持多语言、布尔表达式、过滤器和分面搜索 | |
| 151 | + | |
| 152 | +### 3.2 请求参数 | |
| 153 | + | |
| 154 | +#### 完整请求体结构 | |
| 155 | + | |
| 156 | +```json | |
| 157 | +{ | |
| 158 | + "query": "string (required)", | |
| 159 | + "size": 10, | |
| 160 | + "from": 0, | |
| 161 | + "language": "zh", | |
| 162 | + "filters": {}, | |
| 163 | + "range_filters": {}, | |
| 164 | + "facets": [], | |
| 165 | + "sort_by": "string", | |
| 166 | + "sort_order": "desc", | |
| 167 | + "min_score": 0.0, | |
| 168 | + "sku_filter_dimension": ["string"], | |
| 169 | + "debug": false, | |
| 170 | + "enable_rerank": false, | |
| 171 | + "rerank_query_template": "{query}", | |
| 172 | + "rerank_doc_template": "{title}", | |
| 173 | + "user_id": "string", | |
| 174 | + "session_id": "string" | |
| 175 | +} | |
| 176 | +``` | |
| 177 | + | |
| 178 | +#### 参数详细说明 | |
| 179 | + | |
| 180 | +| 参数 | 类型 | 必填 | 默认值 | 说明 | | |
| 181 | +|------|------|------|--------|------| | |
| 182 | +| `query` | string | Y | - | 搜索查询字符串,支持布尔表达式(AND, OR, RANK, ANDNOT) | | |
| 183 | +| `size` | integer | N | 10 | 返回结果数量(1-100) | | |
| 184 | +| `from` | integer | N | 0 | 分页偏移量(用于分页) | | |
| 185 | +| `language` | string | N | "zh" | 返回语言:`zh`(中文)或 `en`(英文)。后端会根据此参数选择对应的中英文字段返回 | | |
| 186 | +| `filters` | object | N | null | 精确匹配过滤器(见[过滤器详解](#33-过滤器详解)) | | |
| 187 | +| `range_filters` | object | N | null | 数值范围过滤器(见[过滤器详解](#33-过滤器详解)) | | |
| 188 | +| `facets` | array | N | null | 分面配置(见[分面配置](#34-分面配置)) | | |
| 189 | +| `sort_by` | string | N | null | 排序字段名。支持:`price`(价格)、`sales`(销量)、`create_time`(创建时间)、`update_time`(更新时间)。默认按相关性排序 | | |
| 190 | +| `sort_order` | string | N | "desc" | 排序方向:`asc`(升序)或 `desc`(降序)。注意:`price`+`asc`=价格从低到高,`price`+`desc`=价格从高到低(后端自动映射为min_price或max_price) | | |
| 191 | +| `min_score` | float | N | null | 最小相关性分数阈值 | | |
| 192 | +| `sku_filter_dimension` | array[string] | N | null | 子SKU筛选维度列表(见[SKU筛选维度](#35-sku筛选维度)) | | |
| 193 | +| `debug` | boolean | N | false | 是否返回调试信息 | | |
| 194 | +| `enable_rerank` | boolean | N | false | 是否开启重排(调用外部重排服务对 ES 结果进行二次排序)。开启后若 `from+size<=rerank_window` 才会触发重排 | | |
| 195 | +| `rerank_query_template` | string | N | null | 重排 query 模板(可选)。支持 `{query}` 占位符;不传则使用服务端配置 | | |
| 196 | +| `rerank_doc_template` | string | N | null | 重排 doc 模板(可选)。支持 `{title} {brief} {vendor} {description} {category_path}`;不传则使用服务端配置 | | |
| 197 | +| `user_id` | string | N | null | 用户ID(用于个性化,预留) | | |
| 198 | +| `session_id` | string | N | null | 会话ID(用于分析,预留) | | |
| 199 | + | |
| 200 | +### 3.3 过滤器详解 | |
| 201 | + | |
| 202 | +#### 3.3.1 精确匹配过滤器 (filters) | |
| 203 | + | |
| 204 | +用于精确匹配或多值匹配。对于普通字段,数组表示 OR 逻辑(匹配任意一个值);对于 specifications 字段,按维度分组处理。**任意字段名加 `_all` 后缀**表示多值 AND 逻辑(必须同时匹配所有值)。 | |
| 205 | + | |
| 206 | +**格式**: | |
| 207 | +```json | |
| 208 | +{ | |
| 209 | + "filters": { | |
| 210 | + "category_name": "手机", // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 211 | + "category1_name": "服装", // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 212 | + "category2_name": "男装", // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 213 | + "category3_name": "衬衫", // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 214 | + "vendor.zh.keyword": ["奇乐", "品牌A"], // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 215 | + "tags": "手机", // 可以为单值 或者 数组 匹配数组中任意一个(OR) | |
| 216 | + "tags_all": ["手机", "促销", "新品"], // *_all:多值为 AND,必须同时包含所有标签 | |
| 217 | + "category1_name_all": ["服装", "男装"], // 同上,适用于任意可过滤字段 | |
| 218 | + // specifications 嵌套过滤(特殊格式) | |
| 219 | + "specifications": { | |
| 220 | + "name": "color", | |
| 221 | + "value": "white" | |
| 222 | + } | |
| 223 | + } | |
| 224 | +} | |
| 225 | +``` | |
| 226 | + | |
| 227 | +**支持的值类型**: | |
| 228 | +- 字符串:精确匹配 | |
| 229 | +- 整数:精确匹配 | |
| 230 | +- 布尔值:精确匹配 | |
| 231 | +- 数组:匹配任意值(OR 逻辑);若字段名以 `_all` 结尾,则数组表示 AND 逻辑(必须同时匹配所有值) | |
| 232 | +- 对象:specifications 嵌套过滤(见下文) | |
| 233 | + | |
| 234 | +**`*_all` 语义(多值 AND)**: | |
| 235 | +- 任意过滤字段均可使用 `_all` 后缀,对应 ES 字段名为去掉 `_all` 后的名称。 | |
| 236 | +- 例如:`tags_all: ["A", "B"]` 表示文档的 `tags` 必须**同时包含** A 和 B;`vendor.zh.keyword_all: ["奇乐", "品牌A"]` 表示同时匹配两个品牌(通常用于 keyword 多值场景)。 | |
| 237 | +- `specifications_all`:传列表 `[{"name":"color","value":"white"},{"name":"size","value":"256GB"}]` 时,表示所有列出的规格条件都要满足(与 `specifications` 多维度时的 AND 一致;若同维度多值则要求文档同时满足多个值,一般用于嵌套多值场景)。 | |
| 238 | + | |
| 239 | +**Specifications 嵌套过滤**: | |
| 240 | + | |
| 241 | +`specifications` 是嵌套字段,支持按规格名称和值进行过滤。 | |
| 242 | + | |
| 243 | +**单个规格过滤**: | |
| 244 | +```json | |
| 245 | +{ | |
| 246 | + "filters": { | |
| 247 | + "specifications": { | |
| 248 | + "name": "color", | |
| 249 | + "value": "white" | |
| 250 | + } | |
| 251 | + } | |
| 252 | +} | |
| 253 | +``` | |
| 254 | +查询规格名称为"color"且值为"white"的商品。 | |
| 255 | + | |
| 256 | +**多个规格过滤(按维度分组)**: | |
| 257 | +```json | |
| 258 | +{ | |
| 259 | + "filters": { | |
| 260 | + "specifications": [ | |
| 261 | + {"name": "color", "value": "white"}, | |
| 262 | + {"name": "size", "value": "256GB"} | |
| 263 | + ] | |
| 264 | + } | |
| 265 | +} | |
| 266 | +``` | |
| 267 | +查询同时满足所有规格的商品(color=white **且** size=256GB)。 | |
| 268 | + | |
| 269 | +**相同维度的多个值(OR 逻辑)**: | |
| 270 | +```json | |
| 271 | +{ | |
| 272 | + "filters": { | |
| 273 | + "specifications": [ | |
| 274 | + {"name": "size", "value": "3"}, | |
| 275 | + {"name": "size", "value": "4"}, | |
| 276 | + {"name": "size", "value": "5"}, | |
| 277 | + {"name": "color", "value": "green"} | |
| 278 | + ] | |
| 279 | + } | |
| 280 | +} | |
| 281 | +``` | |
| 282 | +查询满足 (size=3 **或** size=4 **或** size=5) **且** color=green 的商品。 | |
| 283 | + | |
| 284 | +**过滤逻辑说明**: | |
| 285 | +- **不同维度**(不同的 `name`)之间是 **AND** 关系(求交集) | |
| 286 | +- **相同维度**(相同的 `name`)的多个值之间是 **OR** 关系(求并集) | |
| 287 | + | |
| 288 | +**常用过滤字段**(详见[常用字段列表](#83-常用字段列表)): | |
| 289 | +- `category_name`: 类目名称 | |
| 290 | +- `category1_name`, `category2_name`, `category3_name`: 多级类目 | |
| 291 | +- `category_id`: 类目ID | |
| 292 | +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段) | |
| 293 | +- `tags`: 标签(keyword类型,支持数组) | |
| 294 | +- `option1_name`, `option2_name`, `option3_name`: 选项名称 | |
| 295 | +- `specifications`: 规格过滤(嵌套字段,格式见上文) | |
| 296 | +- 以上任意字段均可加 `_all` 后缀表示多值 AND,如 `tags_all`、`category1_name_all`。 | |
| 297 | + | |
| 298 | +#### 3.3.2 范围过滤器 (range_filters) | |
| 299 | + | |
| 300 | +用于数值字段的范围过滤。 | |
| 301 | + | |
| 302 | +**格式**: | |
| 303 | +```json | |
| 304 | +{ | |
| 305 | + "range_filters": { | |
| 306 | + "min_price": { | |
| 307 | + "gte": 50, // 大于等于 | |
| 308 | + "lte": 200 // 小于等于 | |
| 309 | + }, | |
| 310 | + "max_price": { | |
| 311 | + "gt": 100 // 大于 | |
| 312 | + }, | |
| 313 | + "create_time": { | |
| 314 | + "gte": "2024-01-01T00:00:00Z" // 日期时间字符串 | |
| 315 | + } | |
| 316 | + } | |
| 317 | +} | |
| 318 | +``` | |
| 319 | + | |
| 320 | +**支持的操作符**: | |
| 321 | +- `gte`: 大于等于 (>=) | |
| 322 | +- `gt`: 大于 (>) | |
| 323 | +- `lte`: 小于等于 (<=) | |
| 324 | +- `lt`: 小于 (<) | |
| 325 | + | |
| 326 | +**注意**: 至少需要指定一个操作符。 | |
| 327 | + | |
| 328 | +**常用范围字段**(详见[常用字段列表](#83-常用字段列表)): | |
| 329 | +- `min_price`: 最低价格 | |
| 330 | +- `max_price`: 最高价格 | |
| 331 | +- `compare_at_price`: 原价 | |
| 332 | +- `create_time`: 创建时间 | |
| 333 | +- `update_time`: 更新时间 | |
| 334 | + | |
| 335 | +### 3.4 分面配置 | |
| 336 | + | |
| 337 | +用于生成分面统计(分组聚合),常用于构建筛选器UI。 | |
| 338 | + | |
| 339 | +#### 3.4.1 配置格式 | |
| 340 | + | |
| 341 | +```json | |
| 342 | +{ | |
| 343 | + "facets": [ | |
| 344 | + { | |
| 345 | + "field": "category1_name", | |
| 346 | + "size": 15, | |
| 347 | + "type": "terms", | |
| 348 | + "disjunctive": false | |
| 349 | + }, | |
| 350 | + { | |
| 351 | + "field": "brand_name", | |
| 352 | + "size": 10, | |
| 353 | + "type": "terms", | |
| 354 | + "disjunctive": true | |
| 355 | + }, | |
| 356 | + { | |
| 357 | + "field": "specifications.color", | |
| 358 | + "size": 20, | |
| 359 | + "type": "terms", | |
| 360 | + "disjunctive": true | |
| 361 | + }, | |
| 362 | + { | |
| 363 | + "field": "min_price", | |
| 364 | + "type": "range", | |
| 365 | + "ranges": [ | |
| 366 | + {"key": "0-50", "to": 50}, | |
| 367 | + {"key": "50-100", "from": 50, "to": 100}, | |
| 368 | + {"key": "100-200", "from": 100, "to": 200}, | |
| 369 | + {"key": "200+", "from": 200} | |
| 370 | + ] | |
| 371 | + } | |
| 372 | + ] | |
| 373 | +} | |
| 374 | +``` | |
| 375 | + | |
| 376 | +#### 3.4.2 Facet 字段说明 | |
| 377 | + | |
| 378 | +| 字段 | 类型 | 必填 | 默认值 | 说明 | | |
| 379 | +|------|------|------|--------|------| | |
| 380 | +| `field` | string | 是 | - | 分面字段名 | | |
| 381 | +| `size` | int | 否 | 10 | 返回的分面值数量(1-100) | | |
| 382 | +| `type` | string | 否 | "terms" | 分面类型:`terms`(词条聚合)或 `range`(范围聚合) | | |
| 383 | +| `disjunctive` | bool | 否 | false | 是否支持多选(disjunctive faceting)。启用后,选中该分面的过滤器时,仍会显示其他可选项 | | |
| 384 | +| `ranges` | array | 否 | null | 范围配置(仅 `type="range"` 时需要) | | |
| 385 | + | |
| 386 | +#### 3.4.3 disjunctive字段说明 | |
| 387 | + | |
| 388 | +**重要特性**: `disjunctive` 字段控制分面的行为模式。启用后,选中该分面的过滤器时,仍会显示其他可选项 | |
| 389 | + | |
| 390 | +**标准模式 (disjunctive: false)**: | |
| 391 | +- **行为**: 选中某个分面值后,该分面只显示选中的值 | |
| 392 | +- **适用场景**: 层级类目、互斥选择 | |
| 393 | +- **示例**: 类目下钻(玩具 > 娃娃 > 芭比) | |
| 394 | + | |
| 395 | +**Multi-Select 模式 (disjunctive: true)** ⭐: | |
| 396 | +- **行为**: 选中某个分面值后,该分面仍显示所有可选项 | |
| 397 | +- **适用场景**: 颜色、品牌、尺码等可切换属性 | |
| 398 | +- **示例**: 选择了"红色"后,仍能看到"蓝色"、"绿色"等选项 | |
| 399 | + | |
| 400 | +**推荐配置**: | |
| 401 | + | |
| 402 | +| 分面类型 | disjunctive | 原因 | | |
| 403 | +|---------|-------------|------| | |
| 404 | +| 颜色 | `true` | 用户需要切换颜色 | | |
| 405 | +| 品牌 | `true` | 用户需要比较品牌 | | |
| 406 | +| 尺码 | `true` | 用户需要查看其他尺码 | | |
| 407 | +| 类目 | `false` | 层级下钻 | | |
| 408 | +| 价格区间 | `false` | 互斥选择 | | |
| 409 | + | |
| 410 | +#### 3.4.4 规格分面说明 | |
| 411 | + | |
| 412 | +`specifications` 是嵌套字段,支持两种分面模式: | |
| 413 | + | |
| 414 | +**模式1:所有规格名称的分面**: | |
| 415 | +```json | |
| 416 | +{ | |
| 417 | + "facets": [ | |
| 418 | + { | |
| 419 | + "field": "specifications", | |
| 420 | + "size": 10, | |
| 421 | + "type": "terms" | |
| 422 | + } | |
| 423 | + ] | |
| 424 | +} | |
| 425 | +``` | |
| 426 | +返回所有规格名称(name)及其对应的值(value)列表。每个 name 会生成一个独立的分面结果。 | |
| 427 | + | |
| 428 | +**模式2:指定规格名称的分面**: | |
| 429 | +```json | |
| 430 | +{ | |
| 431 | + "facets": [ | |
| 432 | + { | |
| 433 | + "field": "specifications.color", | |
| 434 | + "size": 20, | |
| 435 | + "type": "terms", | |
| 436 | + "disjunctive": true | |
| 437 | + }, | |
| 438 | + { | |
| 439 | + "field": "specifications.size", | |
| 440 | + "size": 15, | |
| 441 | + "type": "terms", | |
| 442 | + "disjunctive": true | |
| 443 | + } | |
| 444 | + ] | |
| 445 | +} | |
| 446 | +``` | |
| 447 | +只返回指定规格名称的值列表。格式:`specifications.{name}`,其中 `{name}` 是规格名称(如"color"、"size"、"material")。 | |
| 448 | + | |
| 449 | +**返回格式示例**: | |
| 450 | +```json | |
| 451 | +{ | |
| 452 | + "facets": [ | |
| 453 | + { | |
| 454 | + "field": "specifications.color", | |
| 455 | + "label": "color", | |
| 456 | + "type": "terms", | |
| 457 | + "values": [ | |
| 458 | + {"value": "white", "count": 50, "selected": true}, // ✓ selected 字段由后端标记 | |
| 459 | + {"value": "black", "count": 30, "selected": false}, | |
| 460 | + {"value": "red", "count": 20, "selected": false} | |
| 461 | + ] | |
| 462 | + }, | |
| 463 | + { | |
| 464 | + "field": "specifications.size", | |
| 465 | + "label": "size", | |
| 466 | + "type": "terms", | |
| 467 | + "values": [ | |
| 468 | + {"value": "256GB", "count": 40, "selected": false}, | |
| 469 | + {"value": "512GB", "count": 20, "selected": false} | |
| 470 | + ] | |
| 471 | + } | |
| 472 | + ] | |
| 473 | +} | |
| 474 | +``` | |
| 475 | + | |
| 476 | +### 3.5 SKU筛选维度 | |
| 477 | + | |
| 478 | +**功能说明**: | |
| 479 | +`sku_filter_dimension` 用于控制搜索列表页中 **每个 SPU 下方可切换的子款式(子 SKU)维度**,为字符串列表。 | |
| 480 | +在店铺的 **主题装修配置** 中,商家可以为店铺设置一个或多个子款式筛选维度(例如 `color`、`size`),前端列表页会在每个 SPU 下展示这些维度对应的子 SKU 列表,用户可以通过点击不同维度值(如不同颜色)来切换展示的子款式。 | |
| 481 | +当指定 `sku_filter_dimension` 后,后端会根据店铺的这项配置,从所有 SKU 中筛选出这些维度组合对应的子 SKU 数据:系统会按指定维度**组合**对 SKU 进行分组,每个维度组合只返回第一个 SKU(从简实现,选择该组合下的第一款),其余不在这些维度组合中的子 SKU 将不返回。 | |
| 482 | + | |
| 483 | +**支持的维度值**: | |
| 484 | +1. **直接选项字段**: `option1`、`option2`、`option3` | |
| 485 | + - 直接使用对应的 `option1_value`、`option2_value`、`option3_value` 字段进行分组 | |
| 486 | + | |
| 487 | +2. **规格/选项名称**: 通过 `option1_name`、`option2_name`、`option3_name` 匹配 | |
| 488 | + - 例如:如果 `option1_name` 为 `"color"`,则可以使用 `sku_filter_dimension: ["color"]` 来按颜色分组 | |
| 489 | + | |
| 490 | +**示例**: | |
| 491 | + | |
| 492 | +**按颜色筛选(假设 option1_name = "color")**: | |
| 493 | +```json | |
| 494 | +{ | |
| 495 | + "query": "芭比娃娃", | |
| 496 | + "sku_filter_dimension": ["color"] | |
| 497 | +} | |
| 498 | +``` | |
| 499 | + | |
| 500 | +**按选项1筛选**: | |
| 501 | +```json | |
| 502 | +{ | |
| 503 | + "query": "芭比娃娃", | |
| 504 | + "sku_filter_dimension": ["option1"] | |
| 505 | +} | |
| 506 | +``` | |
| 507 | + | |
| 508 | +**按颜色 + 尺寸组合筛选(假设 option1_name = "color", option2_name = "size")**: | |
| 509 | +```json | |
| 510 | +{ | |
| 511 | + "query": "芭比娃娃", | |
| 512 | + "sku_filter_dimension": ["color", "size"] | |
| 513 | +} | |
| 514 | +``` | |
| 515 | + | |
| 516 | +### 3.6 布尔表达式语法 | |
| 517 | + | |
| 518 | +搜索查询支持布尔表达式,提供更灵活的搜索能力。 | |
| 519 | + | |
| 520 | +**支持的操作符**: | |
| 521 | + | |
| 522 | +| 操作符 | 描述 | 示例 | | |
| 523 | +|--------|------|------| | |
| 524 | +| `AND` | 所有词必须匹配 | `玩具 AND 乐高` | | |
| 525 | +| `OR` | 任意词匹配 | `芭比 OR 娃娃` | | |
| 526 | +| `ANDNOT` | 排除特定词 | `玩具 ANDNOT 电动` | | |
| 527 | +| `RANK` | 排序加权(不强制匹配) | `玩具 RANK 乐高` | | |
| 528 | +| `()` | 分组 | `玩具 AND (乐高 OR 芭比)` | | |
| 529 | + | |
| 530 | +**操作符优先级**(从高到低): | |
| 531 | +1. `()` - 括号 | |
| 532 | +2. `ANDNOT` - 排除 | |
| 533 | +3. `AND` - 与 | |
| 534 | +4. `OR` - 或 | |
| 535 | +5. `RANK` - 排序 | |
| 536 | + | |
| 537 | +**示例**: | |
| 538 | +``` | |
| 539 | +"芭比娃娃" // 简单查询 | |
| 540 | +"玩具 AND 乐高" // AND 查询 | |
| 541 | +"芭比 OR 娃娃" // OR 查询 | |
| 542 | +"玩具 ANDNOT 电动" // 排除查询 | |
| 543 | +"玩具 AND (乐高 OR 芭比)" // 复杂查询 | |
| 544 | +``` | |
| 545 | + | |
| 546 | +### 3.7 搜索建议接口 | |
| 547 | + | |
| 548 | +> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,仅返回空结果。接口和响应格式已经固定,可平滑扩展。 | |
| 549 | + | |
| 550 | +- **端点**: `GET /search/suggestions` | |
| 551 | +- **描述**: 返回搜索建议(自动补全/热词)。当前为框架实现,接口和响应格式已经固定,可平滑扩展。 | |
| 552 | + | |
| 553 | +#### 查询参数 | |
| 554 | + | |
| 555 | +| 参数 | 类型 | 必填 | 默认值 | 描述 | | |
| 556 | +|------|------|------|--------|------| | |
| 557 | +| `q` | string | Y | - | 查询字符串(至少 1 个字符) | | |
| 558 | +| `size` | integer | N | 5 | 返回建议数量(1-20) | | |
| 559 | +| `types` | string | N | `query` | 建议类型(逗号分隔):`query`, `product`, `category`, `brand` | | |
| 560 | + | |
| 561 | +#### 响应示例 | |
| 562 | + | |
| 563 | +```json | |
| 564 | +{ | |
| 565 | + "query": "芭", | |
| 566 | + "suggestions": [ | |
| 567 | + { | |
| 568 | + "text": "芭比娃娃", | |
| 569 | + "type": "query", | |
| 570 | + "highlight": "<em>芭</em>比娃娃", | |
| 571 | + "popularity": 850 | |
| 572 | + } | |
| 573 | + ], | |
| 574 | + "took_ms": 5 | |
| 575 | +} | |
| 576 | +``` | |
| 577 | + | |
| 578 | +#### 请求示例 | |
| 579 | + | |
| 580 | +```bash | |
| 581 | +curl "http://localhost:6002/search/suggestions?q=芭&size=5&types=query,product" | |
| 582 | +``` | |
| 583 | + | |
| 584 | +### 3.8 即时搜索接口 | |
| 585 | + | |
| 586 | +> ⚠️ **TODO**: 此接口当前为框架实现,功能暂未实现,调用标准搜索接口。后续需要优化即时搜索性能(添加防抖/节流、实现结果缓存、简化返回字段)。 | |
| 587 | + | |
| 588 | +- **端点**: `GET /search/instant` | |
| 589 | +- **描述**: 边输入边搜索,采用轻量参数响应当前输入。底层复用标准搜索能力。 | |
| 590 | + | |
| 591 | +#### 查询参数 | |
| 592 | + | |
| 593 | +| 参数 | 类型 | 必填 | 默认值 | 描述 | | |
| 594 | +|------|------|------|--------|------| | |
| 595 | +| `q` | string | Y | - | 搜索查询(至少 2 个字符) | | |
| 596 | +| `size` | integer | N | 5 | 返回结果数量(1-20) | | |
| 597 | + | |
| 598 | +#### 请求示例 | |
| 599 | + | |
| 600 | +```bash | |
| 601 | +curl "http://localhost:6002/search/instant?q=玩具&size=5" | |
| 602 | +``` | |
| 603 | + | |
| 604 | +### 3.9 获取单个文档 | |
| 605 | + | |
| 606 | +- **端点**: `GET /search/{doc_id}` | |
| 607 | +- **描述**: 根据文档 ID 获取单个商品详情,用于点击结果后的详情页或排查问题。 | |
| 608 | + | |
| 609 | +#### 路径参数 | |
| 610 | + | |
| 611 | +| 参数 | 类型 | 描述 | | |
| 612 | +|------|------|------| | |
| 613 | +| `doc_id` | string | 商品或文档 ID | | |
| 614 | + | |
| 615 | +#### 响应示例 | |
| 616 | + | |
| 617 | +```json | |
| 618 | +{ | |
| 619 | + "id": "12345", | |
| 620 | + "source": { | |
| 621 | + "title": { | |
| 622 | + "zh": "芭比时尚娃娃" | |
| 623 | + }, | |
| 624 | + "min_price": 89.99, | |
| 625 | + "category1_name": "玩具" | |
| 626 | + } | |
| 627 | +} | |
| 628 | +``` | |
| 629 | + | |
| 630 | +#### 请求示例 | |
| 631 | + | |
| 632 | +```bash | |
| 633 | +curl "http://localhost:6002/search/12345" | |
| 634 | +``` | |
| 635 | + | |
| 636 | +--- | |
| 637 | + | |
| 638 | +## 响应格式说明 | |
| 639 | + | |
| 640 | +### 4.1 标准响应结构 | |
| 641 | + | |
| 642 | +```json | |
| 643 | +{ | |
| 644 | + "results": [ | |
| 645 | + { | |
| 646 | + "spu_id": "12345", | |
| 647 | + "title": "芭比时尚娃娃", | |
| 648 | + "brief": "高品质芭比娃娃", | |
| 649 | + "description": "详细描述...", | |
| 650 | + "vendor": "美泰", | |
| 651 | + "category": "玩具", | |
| 652 | + "category_path": "玩具/娃娃/时尚", | |
| 653 | + "category_name": "时尚", | |
| 654 | + "category_id": "cat_001", | |
| 655 | + "category_level": 3, | |
| 656 | + "category1_name": "玩具", | |
| 657 | + "category2_name": "娃娃", | |
| 658 | + "category3_name": "时尚", | |
| 659 | + "tags": ["娃娃", "玩具", "女孩"], | |
| 660 | + "price": 89.99, | |
| 661 | + "compare_at_price": 129.99, | |
| 662 | + "currency": "USD", | |
| 663 | + "image_url": "https://example.com/image.jpg", | |
| 664 | + "in_stock": true, | |
| 665 | + "sku_prices": [89.99, 99.99, 109.99], | |
| 666 | + "sku_weights": [100, 150, 200], | |
| 667 | + "sku_weight_units": ["g", "g", "g"], | |
| 668 | + "total_inventory": 500, | |
| 669 | + "option1_name": "color", | |
| 670 | + "option2_name": "size", | |
| 671 | + "option3_name": null, | |
| 672 | + "specifications": [ | |
| 673 | + {"sku_id": "sku_001", "name": "color", "value": "pink"}, | |
| 674 | + {"sku_id": "sku_001", "name": "size", "value": "standard"} | |
| 675 | + ], | |
| 676 | + "skus": [ | |
| 677 | + { | |
| 678 | + "sku_id": "67890", | |
| 679 | + "price": 89.99, | |
| 680 | + "compare_at_price": 129.99, | |
| 681 | + "sku": "BARBIE-001", | |
| 682 | + "stock": 100, | |
| 683 | + "weight": 0.1, | |
| 684 | + "weight_unit": "kg", | |
| 685 | + "option1_value": "pink", | |
| 686 | + "option2_value": "standard", | |
| 687 | + "option3_value": null, | |
| 688 | + "image_src": "https://example.com/sku1.jpg" | |
| 689 | + } | |
| 690 | + ], | |
| 691 | + "relevance_score": 8.5 | |
| 692 | + } | |
| 693 | + ], | |
| 694 | + "total": 118, | |
| 695 | + "max_score": 8.5, | |
| 696 | + "facets": [ | |
| 697 | + { | |
| 698 | + "field": "category1_name", | |
| 699 | + "label": "category1_name", | |
| 700 | + "type": "terms", | |
| 701 | + "values": [ | |
| 702 | + { | |
| 703 | + "value": "玩具", | |
| 704 | + "label": "玩具", | |
| 705 | + "count": 85, | |
| 706 | + "selected": false | |
| 707 | + } | |
| 708 | + ] | |
| 709 | + }, | |
| 710 | + { | |
| 711 | + "field": "specifications.color", | |
| 712 | + "label": "color", | |
| 713 | + "type": "terms", | |
| 714 | + "values": [ | |
| 715 | + { | |
| 716 | + "value": "pink", | |
| 717 | + "label": "pink", | |
| 718 | + "count": 30, | |
| 719 | + "selected": false | |
| 720 | + } | |
| 721 | + ] | |
| 722 | + } | |
| 723 | + ], | |
| 724 | + "query_info": { | |
| 725 | + "original_query": "芭比娃娃", | |
| 726 | + "query_normalized": "芭比娃娃", | |
| 727 | + "rewritten_query": "芭比娃娃", | |
| 728 | + "detected_language": "zh", | |
| 729 | + "translations": { | |
| 730 | + "en": "barbie doll" | |
| 731 | + }, | |
| 732 | + "domain": "default" | |
| 733 | + }, | |
| 734 | + "suggestions": [], | |
| 735 | + "related_searches": [], | |
| 736 | + "took_ms": 45, | |
| 737 | + "performance_info": null, | |
| 738 | + "debug_info": null | |
| 739 | +} | |
| 740 | +``` | |
| 741 | + | |
| 742 | +### 4.2 响应字段说明 | |
| 743 | + | |
| 744 | +| 字段 | 类型 | 说明 | | |
| 745 | +|------|------|------| | |
| 746 | +| `results` | array | 搜索结果列表(SpuResult对象数组) | | |
| 747 | +| `results[].spu_id` | string | SPU ID | | |
| 748 | +| `results[].title` | string | 商品标题 | | |
| 749 | +| `results[].price` | float | 价格(min_price) | | |
| 750 | +| `results[].skus` | array | SKU列表(如果指定了`sku_filter_dimension`,则按维度过滤后的SKU) | | |
| 751 | +| `results[].relevance_score` | float | 相关性分数 | | |
| 752 | +| `total` | integer | 匹配的总文档数 | | |
| 753 | +| `max_score` | float | 最高相关性分数 | | |
| 754 | +| `facets` | array | 分面统计结果 | | |
| 755 | +| `query_info` | object | query处理信息 | | |
| 756 | +| `took_ms` | integer | 搜索耗时(毫秒) | | |
| 757 | + | |
| 758 | +#### 4.2.1 query_info 说明 | |
| 759 | + | |
| 760 | +`query_info` 包含本次搜索的查询解析与处理结果: | |
| 761 | + | |
| 762 | +| 子字段 | 类型 | 说明 | | |
| 763 | +|--------|------|------| | |
| 764 | +| `original_query` | string | 用户原始查询 | | |
| 765 | +| `query_normalized` | string | 归一化后的查询(去空白、大小写等预处理,用于后续解析与改写) | | |
| 766 | +| `rewritten_query` | string | 重写后的查询(同义词/词典扩展等) | | |
| 767 | +| `detected_language` | string | 检测到的查询语言(如 `zh`、`en`) | | |
| 768 | +| `translations` | object | 翻译结果,键为语言代码,值为翻译文本 | | |
| 769 | +| `domain` | string | 查询域(如 `default`、`title`、`brand` 等) | | |
| 770 | + | |
| 771 | +### 4.3 SpuResult字段说明 | |
| 772 | + | |
| 773 | +| 字段 | 类型 | 说明 | | |
| 774 | +|------|------|------| | |
| 775 | +| `spu_id` | string | SPU ID | | |
| 776 | +| `title` | string | 商品标题(根据language参数自动选择 `title.zh` 或 `title.en`) | | |
| 777 | +| `brief` | string | 商品短描述(根据language参数自动选择) | | |
| 778 | +| `description` | string | 商品详细描述(根据language参数自动选择) | | |
| 779 | +| `vendor` | string | 供应商/品牌(根据language参数自动选择) | | |
| 780 | +| `category` | string | 类目(兼容字段,等同于category_name) | | |
| 781 | +| `category_path` | string | 类目路径(多级,用于面包屑,根据language参数自动选择) | | |
| 782 | +| `category_name` | string | 类目名称(展示用,根据language参数自动选择) | | |
| 783 | +| `category_id` | string | 类目ID | | |
| 784 | +| `category_level` | integer | 类目层级(1/2/3) | | |
| 785 | +| `category1_name` | string | 一级类目名称 | | |
| 786 | +| `category2_name` | string | 二级类目名称 | | |
| 787 | +| `category3_name` | string | 三级类目名称 | | |
| 788 | +| `tags` | array[string] | 标签列表 | | |
| 789 | +| `price` | float | 价格(min_price) | | |
| 790 | +| `compare_at_price` | float | 原价 | | |
| 791 | +| `currency` | string | 货币单位(默认USD) | | |
| 792 | +| `image_url` | string | 主图URL | | |
| 793 | +| `in_stock` | boolean | 是否有库存(任意SKU有库存即为true) | | |
| 794 | +| `sku_prices` | array[float] | 所有SKU价格列表 | | |
| 795 | +| `sku_weights` | array[integer] | 所有SKU重量列表 | | |
| 796 | +| `sku_weight_units` | array[string] | 所有SKU重量单位列表 | | |
| 797 | +| `total_inventory` | integer | 总库存 | | |
| 798 | +| `sales` | integer | 销量(展示销量) | | |
| 799 | +| `option1_name` | string | 选项1名称(如"color") | | |
| 800 | +| `option2_name` | string | 选项2名称(如"size") | | |
| 801 | +| `option3_name` | string | 选项3名称 | | |
| 802 | +| `specifications` | array[object] | 规格列表(与ES specifications字段对应) | | |
| 803 | +| `skus` | array | SKU 列表 | | |
| 804 | +| `relevance_score` | float | 相关性分数(默认为 ES 原始分数;当开启 AI 搜索时为融合后的最终分数) | | |
| 805 | + | |
| 806 | +### 4.4 SkuResult字段说明 | |
| 807 | + | |
| 808 | +| 字段 | 类型 | 说明 | | |
| 809 | +|------|------|------| | |
| 810 | +| `sku_id` | string | SKU ID | | |
| 811 | +| `price` | float | 价格 | | |
| 812 | +| `compare_at_price` | float | 原价 | | |
| 813 | +| `sku` | string | SKU编码(sku_code) | | |
| 814 | +| `stock` | integer | 库存数量 | | |
| 815 | +| `weight` | float | 重量 | | |
| 816 | +| `weight_unit` | string | 重量单位 | | |
| 817 | +| `option1_value` | string | 选项1取值(如color值) | | |
| 818 | +| `option2_value` | string | 选项2取值(如size值) | | |
| 819 | +| `option3_value` | string | 选项3取值 | | |
| 820 | +| `image_src` | string | SKU图片地址 | | |
| 821 | + | |
| 822 | +### 4.5 多语言字段说明 | |
| 823 | + | |
| 824 | +- `title`, `brief`, `description`, `vendor`, `category_path`, `category_name` 会根据请求的 `language` 参数自动选择对应的中英文字段 | |
| 825 | +- `language="zh"`: 优先返回 `*_zh` 字段,如果为空则回退到 `*_en` 字段 | |
| 826 | +- `language="en"`: 优先返回 `*_en` 字段,如果为空则回退到 `*_zh` 字段 | |
| 827 | + | |
| 828 | +--- | |
| 829 | + | |
| 830 | +## 索引接口 | |
| 831 | + | |
| 832 | +### 5.0 为租户创建索引 | |
| 833 | + | |
| 834 | +为租户创建索引需要两个步骤: | |
| 835 | + | |
| 836 | +1. **创建索引结构**(可选,仅在需要更新 mapping 时执行) | |
| 837 | + - 使用脚本创建 ES 索引结构(基于 `mappings/search_products.json`) | |
| 838 | + - 如果索引已存在,会提示用户确认(会删除现有数据) | |
| 839 | + | |
| 840 | +2. **导入数据**(必需) | |
| 841 | + - 使用全量索引接口 `/indexer/reindex` 导入数据 | |
| 842 | + | |
| 843 | +**创建索引结构**: | |
| 844 | + | |
| 845 | +```bash | |
| 846 | +./scripts/create_tenant_index.sh 170 | |
| 847 | +``` | |
| 848 | + | |
| 849 | +脚本会自动从项目根目录的 `.env` 文件加载 ES 配置。 | |
| 850 | + | |
| 851 | +**注意事项**: | |
| 852 | +- ⚠️ 如果索引已存在,脚本会提示确认,确认后会删除现有数据 | |
| 853 | +- 创建索引后,**必须**调用 `/indexer/reindex` 导入数据 | |
| 854 | +- 如果只是更新数据而不需要修改索引结构,直接使用 `/indexer/reindex` 即可 | |
| 855 | + | |
| 856 | +--- | |
| 857 | + | |
| 858 | +### 5.1 全量索引接口 | |
| 859 | + | |
| 860 | +- **端点**: `POST /indexer/reindex` | |
| 861 | +- **描述**: 全量索引,将指定租户的所有SPU数据导入到ES索引(不会删除现有索引) | |
| 862 | + | |
| 863 | +#### 请求参数 | |
| 864 | + | |
| 865 | +```json | |
| 866 | +{ | |
| 867 | + "tenant_id": "162", | |
| 868 | + "batch_size": 500 | |
| 869 | +} | |
| 870 | +``` | |
| 871 | + | |
| 872 | +| 参数 | 类型 | 必填 | 默认值 | 说明 | | |
| 873 | +|------|------|------|--------|------| | |
| 874 | +| `tenant_id` | string | Y | - | 租户ID | | |
| 875 | +| `batch_size` | integer | N | 500 | 批量导入大小 | | |
| 876 | + | |
| 877 | +#### 响应格式 | |
| 878 | + | |
| 879 | +**成功响应(200 OK)**: | |
| 880 | +```json | |
| 881 | +{ | |
| 882 | + "success": true, | |
| 883 | + "total": 1000, | |
| 884 | + "indexed": 1000, | |
| 885 | + "failed": 0, | |
| 886 | + "elapsed_time": 12.34, | |
| 887 | + "index_name": "search_products", | |
| 888 | + "tenant_id": "162" | |
| 889 | +} | |
| 890 | +``` | |
| 891 | + | |
| 892 | +**错误响应**: | |
| 893 | +- `400 Bad Request`: 参数错误 | |
| 894 | +- `503 Service Unavailable`: 服务未初始化 | |
| 895 | + | |
| 896 | +#### 请求示例 | |
| 897 | + | |
| 898 | +**全量索引(不会删除现有索引)**: | |
| 899 | +```bash | |
| 900 | +curl -X POST "http://localhost:6004/indexer/reindex" \ | |
| 901 | + -H "Content-Type: application/json" \ | |
| 902 | + -d '{ | |
| 903 | + "tenant_id": "162", | |
| 904 | + "batch_size": 500 | |
| 905 | + }' | |
| 906 | +``` | |
| 907 | + | |
| 908 | +**查看日志**: | |
| 909 | +```bash | |
| 910 | +# 查看API日志(包含索引操作日志) | |
| 911 | +tail -f logs/api.log | |
| 912 | + | |
| 913 | +# 或者查看所有日志文件 | |
| 914 | +tail -f logs/*.log | |
| 915 | +``` | |
| 916 | + | |
| 917 | +> ⚠️ **重要提示**:如需 **创建索引结构**,请参考 [5.0 为租户创建索引](#50-为租户创建索引) 章节,使用 `scripts/create_tenant_index.sh` 脚本。创建后需要调用 `/indexer/reindex` 导入数据。 | 
| 918 | + | |
| 919 | +**查看索引日志**: | |
| 920 | + | |
| 921 | +索引操作的所有关键信息都会记录到 `logs/indexer.log` 文件中(JSON 格式),包括: | |
| 922 | +- 请求开始和结束时间 | |
| 923 | +- 租户ID、SPU ID、操作类型 | |
| 924 | +- 每个SPU的处理状态 | |
| 925 | +- ES批量写入结果 | |
| 926 | +- 成功/失败统计和详细错误信息 | |
| 927 | + | |
| 928 | +```bash | |
| 929 | +# 实时查看索引日志(包含全量和增量索引的所有操作) | |
| 930 | +tail -f logs/indexer.log | |
| 931 | + | |
| 932 | +# 使用 grep 查询(简单方式) | |
| 933 | +# 查看全量索引日志 | |
| 934 | +grep "\"index_type\":\"bulk\"" logs/indexer.log | tail -100 | |
| 935 | + | |
| 936 | +# 查看增量索引日志 | |
| 937 | +grep "\"index_type\":\"incremental\"" logs/indexer.log | tail -100 | |
| 938 | + | |
| 939 | +# 查看特定租户的索引日志 | |
| 940 | +grep "\"tenant_id\":\"162\"" logs/indexer.log | tail -100 | |
| 941 | + | |
| 942 | +# 使用 jq 查询(推荐,更精确的 JSON 查询) | |
| 943 | +# 安装 jq: sudo apt-get install jq 或 brew install jq | |
| 944 | + | |
| 945 | +# 查看全量索引日志 | |
| 946 | +cat logs/indexer.log | jq 'select(.index_type == "bulk")' | tail -100 | |
| 947 | + | |
| 948 | +# 查看增量索引日志 | |
| 949 | +cat logs/indexer.log | jq 'select(.index_type == "incremental")' | tail -100 | |
| 950 | + | |
| 951 | +# 查看特定租户的索引日志 | |
| 952 | +cat logs/indexer.log | jq 'select(.tenant_id == "162")' | tail -100 | |
| 953 | + | |
| 954 | +# 查看失败的索引操作 | |
| 955 | +cat logs/indexer.log | jq 'select(.operation == "request_complete" and .failed_count > 0)' | |
| 956 | + | |
| 957 | +# 查看特定SPU的处理日志 | |
| 958 | +cat logs/indexer.log | jq 'select(.spu_id == "123")' | |
| 959 | + | |
| 960 | +# 查看最近的索引请求统计 | |
| 961 | +cat logs/indexer.log | jq 'select(.operation == "request_complete") | {timestamp, index_type, tenant_id, total_count, success_count, failed_count, elapsed_time}' | |
| 962 | +``` | |
| 963 | + | |
| 964 | +### 5.2 增量索引接口 | |
| 965 | + | |
| 966 | +- **端点**: `POST /indexer/index` | |
| 967 | +- **描述**: 增量索引接口,根据指定的SPU ID列表进行索引,直接将数据写入ES。用于增量更新指定商品。 | |
| 968 | + | |
| 969 | +**删除说明**: | |
| 970 | +- `spu_ids`中的SPU:如果数据库`deleted=1`,自动从ES删除,响应状态为`deleted` | |
| 971 | +- `delete_spu_ids`中的SPU:直接删除,响应状态为`deleted`、`not_found`或`failed` | |
| 972 | + | |
| 973 | +#### 请求参数 | |
| 974 | + | |
| 975 | +```json | |
| 976 | +{ | |
| 977 | + "tenant_id": "162", | |
| 978 | + "spu_ids": ["123", "456", "789"], | |
| 979 | + "delete_spu_ids": ["100", "101"] | |
| 980 | +} | |
| 981 | +``` | |
| 982 | + | |
| 983 | +| 参数 | 类型 | 必填 | 说明 | | |
| 984 | +|------|------|------|------| | |
| 985 | +| `tenant_id` | string | Y | 租户ID | | |
| 986 | +| `spu_ids` | array[string] | N | SPU ID列表(1-100个),要索引的SPU。如果为空,则只执行删除操作 | | |
| 987 | +| `delete_spu_ids` | array[string] | N | 显式指定要删除的SPU ID列表(1-100个),可选。无论数据库状态如何,都会从ES中删除这些SPU | | |
| 988 | + | |
| 989 | +**注意**: | |
| 990 | +- `spu_ids` 和 `delete_spu_ids` 不能同时为空 | |
| 991 | +- 每个列表最多支持100个SPU ID | |
| 992 | +- 如果SPU在`spu_ids`中且数据库`deleted=1`,会自动从ES删除(自动检测删除) | |
| 993 | + | |
| 994 | +#### 响应格式 | |
| 995 | + | |
| 996 | +```json | |
| 997 | +{ | |
| 998 | + "spu_ids": [ | |
| 999 | + { | |
| 1000 | + "spu_id": "123", | |
| 1001 | + "status": "indexed" | |
| 1002 | + }, | |
| 1003 | + { | |
| 1004 | + "spu_id": "456", | |
| 1005 | + "status": "deleted" | |
| 1006 | + }, | |
| 1007 | + { | |
| 1008 | + "spu_id": "789", | |
| 1009 | + "status": "failed", | |
| 1010 | + "msg": "SPU not found (unexpected)" | |
| 1011 | + } | |
| 1012 | + ], | |
| 1013 | + "delete_spu_ids": [ | |
| 1014 | + { | |
| 1015 | + "spu_id": "100", | |
| 1016 | + "status": "deleted" | |
| 1017 | + }, | |
| 1018 | + { | |
| 1019 | + "spu_id": "101", | |
| 1020 | + "status": "not_found" | |
| 1021 | + }, | |
| 1022 | + { | |
| 1023 | + "spu_id": "102", | |
| 1024 | + "status": "failed", | |
| 1025 | + "msg": "Failed to delete from ES: Connection timeout" | |
| 1026 | + } | |
| 1027 | + ], | |
| 1028 | + "total": 6, | |
| 1029 | + "success_count": 4, | |
| 1030 | + "failed_count": 2, | |
| 1031 | + "elapsed_time": 1.23, | |
| 1032 | + "index_name": "search_products", | |
| 1033 | + "tenant_id": "162" | |
| 1034 | +} | |
| 1035 | +``` | |
| 1036 | + | |
| 1037 | +| 字段 | 类型 | 说明 | | |
| 1038 | +|------|------|------| | |
| 1039 | +| `spu_ids` | array | spu_ids对应的响应列表,每个元素包含 `spu_id` 和 `status` | | |
| 1040 | +| `spu_ids[].status` | string | 状态:`indexed`(已索引)、`deleted`(已删除,自动检测)、`failed`(失败) | | |
| 1041 | +| `spu_ids[].msg` | string | 当status为`failed`时,包含失败原因(可选) | | |
| 1042 | +| `delete_spu_ids` | array | delete_spu_ids对应的响应列表,每个元素包含 `spu_id` 和 `status` | | |
| 1043 | +| `delete_spu_ids[].status` | string | 状态:`deleted`(已删除)、`not_found`(ES中不存在)、`failed`(失败) | | |
| 1044 | +| `delete_spu_ids[].msg` | string | 当status为`failed`时,包含失败原因(可选) | | |
| 1045 | +| `total` | integer | 总处理数量(spu_ids数量 + delete_spu_ids数量) | | |
| 1046 | +| `success_count` | integer | 成功数量(indexed + deleted + not_found) | | |
| 1047 | +| `failed_count` | integer | 失败数量 | | |
| 1048 | +| `elapsed_time` | float | 耗时(秒) | | |
| 1049 | +| `index_name` | string | 索引名称 | | |
| 1050 | +| `tenant_id` | string | 租户ID | | |
| 1051 | + | |
| 1052 | +**状态说明**: | |
| 1053 | +- `spu_ids` 的状态: | |
| 1054 | + - `indexed`: SPU已成功索引到ES | |
| 1055 | + - `deleted`: SPU在数据库中被标记为deleted=1,已从ES删除(自动检测) | |
| 1056 | + - `failed`: 处理失败,会包含`msg`字段说明失败原因 | |
| 1057 | +- `delete_spu_ids` 的状态: | |
| 1058 | + - `deleted`: SPU已从ES成功删除 | |
| 1059 | + - `not_found`: SPU在ES中不存在(也算成功,可能已经被删除过) | |
| 1060 | + - `failed`: 删除失败,会包含`msg`字段说明失败原因 | |
| 1061 | + | |
| 1062 | +#### 请求示例 | |
| 1063 | + | |
| 1064 | +**示例1:普通增量索引(自动检测删除)**: | |
| 1065 | +```bash | |
| 1066 | +curl -X POST "http://localhost:6004/indexer/index" \ | |
| 1067 | + -H "Content-Type: application/json" \ | |
| 1068 | + -d '{ | |
| 1069 | + "tenant_id": "162", | |
| 1070 | + "spu_ids": ["123", "456", "789"] | |
| 1071 | + }' | |
| 1072 | +``` | |
| 1073 | +说明:如果SPU 456在数据库中`deleted=1`,会自动从ES删除,在响应中`spu_ids`列表里456的状态为`deleted`。 | |
| 1074 | + | |
| 1075 | +**示例2:显式删除(批量删除)**: | |
| 1076 | +```bash | |
| 1077 | +curl -X POST "http://localhost:6004/indexer/index" \ | |
| 1078 | + -H "Content-Type: application/json" \ | |
| 1079 | + -d '{ | |
| 1080 | + "tenant_id": "162", | |
| 1081 | + "spu_ids": ["123", "456"], | |
| 1082 | + "delete_spu_ids": ["100", "101", "102"] | |
| 1083 | + }' | |
| 1084 | +``` | |
| 1085 | +说明:SPU 100、101、102会被显式删除,无论数据库状态如何。 | |
| 1086 | + | |
| 1087 | +**示例3:仅删除(不索引)**: | |
| 1088 | +```bash | |
| 1089 | +curl -X POST "http://localhost:6004/indexer/index" \ | |
| 1090 | + -H "Content-Type: application/json" \ | |
| 1091 | + -d '{ | |
| 1092 | + "tenant_id": "162", | |
| 1093 | + "spu_ids": [], | |
| 1094 | + "delete_spu_ids": ["100", "101"] | |
| 1095 | + }' | |
| 1096 | +``` | |
| 1097 | +说明:只执行删除操作,不进行索引。 | |
| 1098 | + | |
| 1099 | +**示例4:混合操作(索引+删除)**: | |
| 1100 | +```bash | |
| 1101 | +curl -X POST "http://localhost:6004/indexer/index" \ | |
| 1102 | + -H "Content-Type: application/json" \ | |
| 1103 | + -d '{ | |
| 1104 | + "tenant_id": "162", | |
| 1105 | + "spu_ids": ["123", "456", "789"], | |
| 1106 | + "delete_spu_ids": ["100", "101"] | |
| 1107 | + }' | |
| 1108 | +``` | |
| 1109 | +说明:同时执行索引和删除操作。 | |
| 1110 | + | |
| 1111 | +#### 日志说明 | |
| 1112 | + | |
| 1113 | +增量索引操作的所有关键信息都会记录到 `logs/indexer.log` 文件中(JSON格式),包括: | |
| 1114 | +- 请求开始和结束时间 | |
| 1115 | +- 每个SPU的处理状态(获取、转换、索引、删除) | |
| 1116 | +- ES批量写入结果 | |
| 1117 | +- 成功/失败统计 | |
| 1118 | +- 详细的错误信息 | |
| 1119 | + | |
| 1120 | +日志查询方式请参考[5.1节查看索引日志](#51-全量索引接口)部分。 | 
| 1121 | + | |
| 1122 | +### 5.3 查询文档接口 | |
| 1123 | + | |
| 1124 | +- **端点**: `POST /indexer/documents` | |
| 1125 | +- **描述**: 查询文档接口,根据SPU ID列表获取ES文档数据(**不写入ES**)。用于查看、调试或验证SPU数据。 | |
| 1126 | + | |
| 1127 | +#### 请求参数 | |
| 1128 | + | |
| 1129 | +```json | |
| 1130 | +{ | |
| 1131 | + "tenant_id": "162", | |
| 1132 | + "spu_ids": ["123", "456", "789"] | |
| 1133 | +} | |
| 1134 | +``` | |
| 1135 | + | |
| 1136 | +| 参数 | 类型 | 必填 | 说明 | | |
| 1137 | +|------|------|------|------| | |
| 1138 | +| `tenant_id` | string | Y | 租户ID | | |
| 1139 | +| `spu_ids` | array[string] | Y | SPU ID列表(1-100个) | | |
| 1140 | + | |
| 1141 | +#### 响应格式 | |
| 1142 | + | |
| 1143 | +```json | |
| 1144 | +{ | |
| 1145 | + "success": [ | |
| 1146 | + { | |
| 1147 | + "spu_id": "123", | |
| 1148 | + "document": { | |
| 1149 | + "tenant_id": "162", | |
| 1150 | + "spu_id": "123", | |
| 1151 | + "title": { | |
| 1152 | + "zh": "商品标题" | |
| 1153 | + }, | |
| 1154 | + ... | |
| 1155 | + } | |
| 1156 | + }, | |
| 1157 | + { | |
| 1158 | + "spu_id": "456", | |
| 1159 | + "document": {...} | |
| 1160 | + } | |
| 1161 | + ], | |
| 1162 | + "failed": [ | |
| 1163 | + { | |
| 1164 | + "spu_id": "789", | |
| 1165 | + "error": "SPU not found or deleted" | |
| 1166 | + } | |
| 1167 | + ], | |
| 1168 | + "total": 3, | |
| 1169 | + "success_count": 2, | |
| 1170 | + "failed_count": 1 | |
| 1171 | +} | |
| 1172 | +``` | |
| 1173 | + | |
| 1174 | +| 字段 | 类型 | 说明 | | |
| 1175 | +|------|------|------| | |
| 1176 | +| `success` | array | 成功获取的SPU列表,每个元素包含 `spu_id` 和 `document`(完整的ES文档数据) | | |
| 1177 | +| `failed` | array | 失败的SPU列表,每个元素包含 `spu_id` 和 `error`(失败原因) | | |
| 1178 | +| `total` | integer | 总SPU数量 | | |
| 1179 | +| `success_count` | integer | 成功数量 | | |
| 1180 | +| `failed_count` | integer | 失败数量 | | |
| 1181 | + | |
| 1182 | +#### 请求示例 | |
| 1183 | + | |
| 1184 | +**单个SPU查询**: | |
| 1185 | +```bash | |
| 1186 | +curl -X POST "http://localhost:6004/indexer/documents" \ | |
| 1187 | + -H "Content-Type: application/json" \ | |
| 1188 | + -d '{ | |
| 1189 | + "tenant_id": "162", | |
| 1190 | + "spu_ids": ["123"] | |
| 1191 | + }' | |
| 1192 | +``` | |
| 1193 | + | |
| 1194 | +**批量SPU查询**: | |
| 1195 | +```bash | |
| 1196 | +curl -X POST "http://localhost:6004/indexer/documents" \ | |
| 1197 | + -H "Content-Type: application/json" \ | |
| 1198 | + -d '{ | |
| 1199 | + "tenant_id": "162", | |
| 1200 | + "spu_ids": ["123", "456", "789"] | |
| 1201 | + }' | |
| 1202 | +``` | |
| 1203 | + | |
| 1204 | +#### 与 `/indexer/index` 的区别 | |
| 1205 | + | |
| 1206 | +| 接口 | 功能 | 是否写入ES | 返回内容 | | |
| 1207 | +|------|------|-----------|----------| | |
| 1208 | +| `/indexer/documents` | 查询SPU文档数据 | 否 | 返回完整的ES文档数据 | | |
| 1209 | +| `/indexer/index` | 增量索引 | 是 | 返回成功/失败列表和统计信息 | | |
| 1210 | + | |
| 1211 | +**使用场景**: | |
| 1212 | +- `/indexer/documents`:用于查看、调试或验证SPU数据,不修改ES索引 | |
| 1213 | +- `/indexer/index`:用于实际的增量索引操作,将更新的SPU数据同步到ES | |
| 1214 | + | |
| 1215 | +### 5.4 索引健康检查接口 | |
| 1216 | + | |
| 1217 | +- **端点**: `GET /indexer/health` | |
| 1218 | +- **描述**: 检查索引服务的健康状态 | |
| 1219 | + | |
| 1220 | +#### 响应格式 | |
| 1221 | + | |
| 1222 | +```json | |
| 1223 | +{ | |
| 1224 | + "status": "available", | |
| 1225 | + "database": "connected", | |
| 1226 | + "preloaded_data": { | |
| 1227 | + "category_mappings": 150 | |
| 1228 | + } | |
| 1229 | +} | |
| 1230 | +``` | |
| 1231 | + | |
| 1232 | +#### 请求示例 | |
| 1233 | + | |
| 1234 | +```bash | |
| 1235 | +curl -X GET "http://localhost:6004/indexer/health" | |
| 1236 | +``` | |
| 1237 | + | |
| 1238 | +--- | |
| 1239 | + | |
| 1240 | +## 管理接口 | |
| 1241 | + | |
| 1242 | +### 6.1 健康检查 | |
| 1243 | + | |
| 1244 | +- **端点**: `GET /admin/health` | |
| 1245 | +- **描述**: 检查服务与依赖(如 Elasticsearch)状态。 | |
| 1246 | + | |
| 1247 | +```json | |
| 1248 | +{ | |
| 1249 | + "status": "healthy", | |
| 1250 | + "elasticsearch": "connected", | |
| 1251 | + "tenant_id": "tenant1" | |
| 1252 | +} | |
| 1253 | +``` | |
| 1254 | + | |
| 1255 | +### 6.2 获取配置 | |
| 1256 | + | |
| 1257 | +- **端点**: `GET /admin/config` | |
| 1258 | +- **描述**: 返回当前租户的脱敏配置,便于核对索引及排序表达式。 | |
| 1259 | + | |
| 1260 | +```json | |
| 1261 | +{ | |
| 1262 | + "tenant_id": "tenant1", | |
| 1263 | + "tenant_name": "Tenant1 Test Instance", | |
| 1264 | + "es_index_name": "search_tenant1", | |
| 1265 | + "num_fields": 20, | |
| 1266 | + "num_indexes": 4, | |
| 1267 | + "supported_languages": ["zh", "en", "ru"], | |
| 1268 | + "ranking_expression": "bm25() + 0.2*text_embedding_relevance()", | |
| 1269 | + "spu_enabled": false | |
| 1270 | +} | |
| 1271 | +``` | |
| 1272 | + | |
| 1273 | +### 6.3 索引统计 | |
| 1274 | + | |
| 1275 | +- **端点**: `GET /admin/stats` | |
| 1276 | +- **描述**: 获取索引文档数量与磁盘大小,方便监控。 | |
| 1277 | + | |
| 1278 | +```json | |
| 1279 | +{ | |
| 1280 | + "index_name": "search_tenant1", | |
| 1281 | + "document_count": 10000, | |
| 1282 | + "size_mb": 523.45 | |
| 1283 | +} | |
| 1284 | +``` | |
| 1285 | + | |
| 1286 | +--- | |
| 1287 | + | |
| 1288 | +## 常见场景示例 | |
| 1289 | + | |
| 1290 | +### 7.1 基础搜索与排序 | |
| 1291 | + | |
| 1292 | +**按价格从低到高排序**: | |
| 1293 | +```json | |
| 1294 | +{ | |
| 1295 | + "query": "玩具", | |
| 1296 | + "size": 20, | |
| 1297 | + "from": 0, | |
| 1298 | + "sort_by": "price", | |
| 1299 | + "sort_order": "asc" | |
| 1300 | +} | |
| 1301 | +``` | |
| 1302 | + | |
| 1303 | +**按价格从高到低排序**: | |
| 1304 | +```json | |
| 1305 | +{ | |
| 1306 | + "query": "玩具", | |
| 1307 | + "size": 20, | |
| 1308 | + "from": 0, | |
| 1309 | + "sort_by": "price", | |
| 1310 | + "sort_order": "desc" | |
| 1311 | +} | |
| 1312 | +``` | |
| 1313 | + | |
| 1314 | +**按销量从高到低排序**: | |
| 1315 | +```json | |
| 1316 | +{ | |
| 1317 | + "query": "玩具", | |
| 1318 | + "size": 20, | |
| 1319 | + "from": 0, | |
| 1320 | + "sort_by": "sales", | |
| 1321 | + "sort_order": "desc" | |
| 1322 | +} | |
| 1323 | +``` | |
| 1324 | + | |
| 1325 | +**按默认(相关性)排序**: | |
| 1326 | +```json | |
| 1327 | +{ | |
| 1328 | + "query": "玩具", | |
| 1329 | + "size": 20, | |
| 1330 | + "from": 0 | |
| 1331 | +} | |
| 1332 | +``` | |
| 1333 | + | |
| 1334 | +### 7.2 过滤搜索 | |
| 1335 | + | |
| 1336 | +**需求**: 搜索"玩具",筛选类目为"益智玩具",价格在50-200之间 | |
| 1337 | + | |
| 1338 | +```json | |
| 1339 | +{ | |
| 1340 | + "query": "玩具", | |
| 1341 | + "size": 20, | |
| 1342 | + "language": "zh", | |
| 1343 | + "filters": { | |
| 1344 | + "category_name": "益智玩具" | |
| 1345 | + }, | |
| 1346 | + "range_filters": { | |
| 1347 | + "min_price": { | |
| 1348 | + "gte": 50, | |
| 1349 | + "lte": 200 | |
| 1350 | + } | |
| 1351 | + } | |
| 1352 | +} | |
| 1353 | +``` | |
| 1354 | + | |
| 1355 | +**需求**: 搜索"手机",筛选多个品牌,价格范围 | |
| 1356 | + | |
| 1357 | +```json | |
| 1358 | +{ | |
| 1359 | + "query": "手机", | |
| 1360 | + "size": 20, | |
| 1361 | + "language": "zh", | |
| 1362 | + "filters": { | |
| 1363 | + "vendor.zh.keyword": ["品牌A", "品牌B"] | |
| 1364 | + }, | |
| 1365 | + "range_filters": { | |
| 1366 | + "min_price": { | |
| 1367 | + "gte": 50, | |
| 1368 | + "lte": 200 | |
| 1369 | + } | |
| 1370 | + } | |
| 1371 | +} | |
| 1372 | +``` | |
| 1373 | + | |
| 1374 | +### 7.3 分面搜索 | |
| 1375 | + | |
| 1376 | +**需求**: 搜索"玩具",获取类目和规格的分面统计,用于构建筛选器 | |
| 1377 | + | |
| 1378 | +```json | |
| 1379 | +{ | |
| 1380 | + "query": "玩具", | |
| 1381 | + "size": 20, | |
| 1382 | + "language": "zh", | |
| 1383 | + "facets": [ | |
| 1384 | + {"field": "category1_name", "size": 15, "type": "terms"}, | |
| 1385 | + {"field": "category2_name", "size": 10, "type": "terms"}, | |
| 1386 | + {"field": "specifications", "size": 10, "type": "terms"} | |
| 1387 | + ] | |
| 1388 | +} | |
| 1389 | +``` | |
| 1390 | + | |
| 1391 | +**需求**: 搜索"手机",获取价格区间和规格的分面统计 | |
| 1392 | + | |
| 1393 | +```json | |
| 1394 | +{ | |
| 1395 | + "query": "手机", | |
| 1396 | + "size": 20, | |
| 1397 | + "language": "zh", | |
| 1398 | + "facets": [ | |
| 1399 | + { | |
| 1400 | + "field": "min_price", | |
| 1401 | + "type": "range", | |
| 1402 | + "ranges": [ | |
| 1403 | + {"key": "0-50", "to": 50}, | |
| 1404 | + {"key": "50-100", "from": 50, "to": 100}, | |
| 1405 | + {"key": "100-200", "from": 100, "to": 200}, | |
| 1406 | + {"key": "200+", "from": 200} | |
| 1407 | + ] | |
| 1408 | + }, | |
| 1409 | + { | |
| 1410 | + "field": "specifications", | |
| 1411 | + "size": 10, | |
| 1412 | + "type": "terms" | |
| 1413 | + } | |
| 1414 | + ] | |
| 1415 | +} | |
| 1416 | +``` | |
| 1417 | + | |
| 1418 | +### 7.4 规格过滤与分面 | |
| 1419 | + | |
| 1420 | +**需求**: 搜索"手机",筛选color为"white"的商品 | |
| 1421 | + | |
| 1422 | +```json | |
| 1423 | +{ | |
| 1424 | + "query": "手机", | |
| 1425 | + "size": 20, | |
| 1426 | + "language": "zh", | |
| 1427 | + "filters": { | |
| 1428 | + "specifications": { | |
| 1429 | + "name": "color", | |
| 1430 | + "value": "white" | |
| 1431 | + } | |
| 1432 | + } | |
| 1433 | +} | |
| 1434 | +``` | |
| 1435 | + | |
| 1436 | +**需求**: 搜索"手机",筛选color为"white"且size为"256GB"的商品 | |
| 1437 | + | |
| 1438 | +```json | |
| 1439 | +{ | |
| 1440 | + "query": "手机", | |
| 1441 | + "size": 20, | |
| 1442 | + "language": "zh", | |
| 1443 | + "filters": { | |
| 1444 | + "specifications": [ | |
| 1445 | + {"name": "color", "value": "white"}, | |
| 1446 | + {"name": "size", "value": "256GB"} | |
| 1447 | + ] | |
| 1448 | + } | |
| 1449 | +} | |
| 1450 | +``` | |
| 1451 | + | |
| 1452 | +**需求**: 搜索"手机",筛选size为"3"、"4"或"5",且color为"green"的商品 | |
| 1453 | + | |
| 1454 | +```json | |
| 1455 | +{ | |
| 1456 | + "query": "手机", | |
| 1457 | + "size": 20, | |
| 1458 | + "language": "zh", | |
| 1459 | + "filters": { | |
| 1460 | + "specifications": [ | |
| 1461 | + {"name": "size", "value": "3"}, | |
| 1462 | + {"name": "size", "value": "4"}, | |
| 1463 | + {"name": "size", "value": "5"}, | |
| 1464 | + {"name": "color", "value": "green"} | |
| 1465 | + ] | |
| 1466 | + } | |
| 1467 | +} | |
| 1468 | +``` | |
| 1469 | + | |
| 1470 | +**需求**: 搜索"手机",获取所有规格的分面统计 | |
| 1471 | + | |
| 1472 | +```json | |
| 1473 | +{ | |
| 1474 | + "query": "手机", | |
| 1475 | + "size": 20, | |
| 1476 | + "language": "zh", | |
| 1477 | + "facets": [ | |
| 1478 | + {"field": "specifications", "size": 10, "type": "terms"} | |
| 1479 | + ] | |
| 1480 | +} | |
| 1481 | +``` | |
| 1482 | + | |
| 1483 | +**需求**: 只获取"color"和"size"规格的分面统计 | |
| 1484 | + | |
| 1485 | +```json | |
| 1486 | +{ | |
| 1487 | + "query": "手机", | |
| 1488 | + "size": 20, | |
| 1489 | + "language": "zh", | |
| 1490 | + "facets": [ | |
| 1491 | + {"field": "specifications.color", "size": 20, "type": "terms"}, | |
| 1492 | + {"field": "specifications.size", "size": 15, "type": "terms"} | |
| 1493 | + ] | |
| 1494 | +} | |
| 1495 | +``` | |
| 1496 | + | |
| 1497 | +**需求**: 搜索"手机",筛选类目和规格,并获取对应的分面统计 | |
| 1498 | + | |
| 1499 | +```json | |
| 1500 | +{ | |
| 1501 | + "query": "手机", | |
| 1502 | + "size": 20, | |
| 1503 | + "language": "zh", | |
| 1504 | + "filters": { | |
| 1505 | + "category_name": "手机", | |
| 1506 | + "specifications": { | |
| 1507 | + "name": "color", | |
| 1508 | + "value": "white" | |
| 1509 | + } | |
| 1510 | + }, | |
| 1511 | + "facets": [ | |
| 1512 | + {"field": "category1_name", "size": 15, "type": "terms"}, | |
| 1513 | + {"field": "category2_name", "size": 10, "type": "terms"}, | |
| 1514 | + {"field": "specifications.color", "size": 20, "type": "terms"}, | |
| 1515 | + {"field": "specifications.size", "size": 15, "type": "terms"} | |
| 1516 | + ] | |
| 1517 | +} | |
| 1518 | +``` | |
| 1519 | + | |
| 1520 | +### 7.5 SKU筛选 | |
| 1521 | + | |
| 1522 | +**需求**: 搜索"芭比娃娃",每个SPU下按颜色筛选,每种颜色只显示一个SKU | |
| 1523 | + | |
| 1524 | +```json | |
| 1525 | +{ | |
| 1526 | + "query": "芭比娃娃", | |
| 1527 | + "size": 20, | |
| 1528 | + "sku_filter_dimension": ["color"] | |
| 1529 | +} | |
| 1530 | +``` | |
| 1531 | + | |
| 1532 | +**说明**: | |
| 1533 | +- 如果 `option1_name` 为 `"color"`,则使用 `sku_filter_dimension: ["color"]` 可以按颜色分组 | |
| 1534 | +- 每个SPU下,每种颜色只会返回第一个SKU | |
| 1535 | +- 如果维度不匹配,返回所有SKU(不进行过滤) | |
| 1536 | + | |
| 1537 | +### 7.6 布尔表达式搜索 | |
| 1538 | + | |
| 1539 | +**需求**: 搜索包含"手机"和"智能"的商品,排除"二手" | |
| 1540 | + | |
| 1541 | +```json | |
| 1542 | +{ | |
| 1543 | + "query": "手机 AND 智能 ANDNOT 二手", | |
| 1544 | + "size": 20 | |
| 1545 | +} | |
| 1546 | +``` | |
| 1547 | + | |
| 1548 | +### 7.7 分页查询 | |
| 1549 | + | |
| 1550 | +**需求**: 获取第2页结果(每页20条) | |
| 1551 | + | |
| 1552 | +```json | |
| 1553 | +{ | |
| 1554 | + "query": "手机", | |
| 1555 | + "size": 20, | |
| 1556 | + "from": 20 | |
| 1557 | +} | |
| 1558 | +``` | |
| 1559 | + | |
| 1560 | +--- | |
| 1561 | + | |
| 1562 | +## 数据模型 | |
| 1563 | + | |
| 1564 | +### 8.1 商品字段定义 | |
| 1565 | + | |
| 1566 | +| 字段名 | 类型 | 描述 | | |
| 1567 | +|--------|------|------| | |
| 1568 | +| `tenant_id` | keyword | 租户ID(多租户隔离) | | |
| 1569 | +| `spu_id` | keyword | SPU ID | | |
| 1570 | +| `title.<lang>` | object/text | 商品标题(多语言对象,如 `title.zh`, `title.en`) | | |
| 1571 | +| `brief.<lang>` | object/text | 商品短描述(多语言对象,如 `brief.zh`, `brief.en`) | | |
| 1572 | +| `description.<lang>` | object/text | 商品详细描述(多语言对象,如 `description.zh`, `description.en`) | | |
| 1573 | +| `vendor.<lang>` | object/text | 供应商/品牌(多语言对象,且带 keyword 子字段,如 `vendor.zh.keyword`) | | |
| 1574 | +| `category_path.<lang>` | object/text | 类目路径(多语言对象,用于搜索,如 `category_path.zh`) | | |
| 1575 | +| `category_name_text.<lang>` | object/text | 类目名称(多语言对象,用于搜索,如 `category_name_text.zh`) | | |
| 1576 | +| `category_id` | keyword | 类目ID | | |
| 1577 | +| `category_name` | keyword | 类目名称(用于过滤) | | |
| 1578 | +| `category_level` | integer | 类目层级 | | |
| 1579 | +| `category1_name`, `category2_name`, `category3_name` | keyword | 多级类目名称(用于过滤和分面) | | |
| 1580 | +| `tags` | keyword | 标签(数组) | | |
| 1581 | +| `specifications` | nested | 规格(嵌套对象数组) | | |
| 1582 | +| `option1_name`, `option2_name`, `option3_name` | keyword | 选项名称 | | |
| 1583 | +| `min_price`, `max_price` | float | 最低/最高价格 | | |
| 1584 | +| `compare_at_price` | float | 原价 | | |
| 1585 | +| `sku_prices` | float | SKU价格列表(数组) | | |
| 1586 | +| `sku_weights` | long | SKU重量列表(数组) | | |
| 1587 | +| `sku_weight_units` | keyword | SKU重量单位列表(数组) | | |
| 1588 | +| `total_inventory` | long | 总库存 | | |
| 1589 | +| `sales` | long | 销量(展示销量) | | |
| 1590 | +| `skus` | nested | SKU详细信息(嵌套对象数组) | | |
| 1591 | +| `create_time`, `update_time` | date | 创建/更新时间 | | |
| 1592 | +| `title_embedding` | dense_vector | 标题向量(1024维,仅用于搜索) | | |
| 1593 | +| `image_embedding` | nested | 图片向量(嵌套,仅用于搜索) | | |
| 1594 | + | |
| 1595 | +> 所有租户共享统一的索引结构。文本字段支持多语言(以租户配置的 `supported_languages` 为准,常见为中文、英文),后端根据 `language` 参数自动选择对应字段返回。 | 
| 1596 | + | |
| 1597 | +### 8.2 字段类型速查 | |
| 1598 | + | |
| 1599 | +| 类型 | ES Mapping | 用途 | | |
| 1600 | +|------|------------|------| | |
| 1601 | +| `text` | `text` | 全文检索(支持中英文分析器) | | |
| 1602 | +| `keyword` | `keyword` | 精确匹配、聚合、排序 | | |
| 1603 | +| `integer` | `integer` | 整数 | | |
| 1604 | +| `long` | `long` | 长整数 | | |
| 1605 | +| `float` | `float` | 浮点数 | | |
| 1606 | +| `date` | `date` | 日期时间 | | |
| 1607 | +| `nested` | `nested` | 嵌套对象(specifications, skus, image_embedding) | | |
| 1608 | +| `dense_vector` | `dense_vector` | 向量字段(title_embedding,仅用于搜索) | | |
| 1609 | + | |
| 1610 | +### 8.3 常用字段列表 | |
| 1611 | + | |
| 1612 | +#### 过滤字段 | |
| 1613 | + | |
| 1614 | +- `category_name`: 类目名称 | |
| 1615 | +- `category1_name`, `category2_name`, `category3_name`: 多级类目 | |
| 1616 | +- `category_id`: 类目ID | |
| 1617 | +- `vendor.zh.keyword`, `vendor.en.keyword`: 供应商/品牌(使用keyword子字段) | |
| 1618 | +- `tags`: 标签(keyword类型) | |
| 1619 | +- `option1_name`, `option2_name`, `option3_name`: 选项名称 | |
| 1620 | +- `specifications`: 规格过滤(嵌套字段,格式见[过滤器详解](#33-过滤器详解)) | |
| 1621 | + | |
| 1622 | +#### 范围字段 | |
| 1623 | + | |
| 1624 | +- `min_price`: 最低价格 | |
| 1625 | +- `max_price`: 最高价格 | |
| 1626 | +- `compare_at_price`: 原价 | |
| 1627 | +- `create_time`: 创建时间 | |
| 1628 | +- `update_time`: 更新时间 | |
| 1629 | + | |
| 1630 | +#### 排序字段 | |
| 1631 | + | |
| 1632 | +- `price`: 价格(后端自动根据sort_order映射:asc→min_price,desc→max_price) | |
| 1633 | +- `sales`: 销量 | |
| 1634 | +- `create_time`: 创建时间 | |
| 1635 | +- `update_time`: 更新时间 | |
| 1636 | +- `relevance_score`: 相关性分数(默认,不指定sort_by时使用) | |
| 1637 | + | |
| 1638 | +**注意**: 前端只需传 `price`,后端会自动处理: | |
| 1639 | +- `sort_by: "price"` + `sort_order: "asc"` → 按 `min_price` 升序(价格从低到高) | |
| 1640 | +- `sort_by: "price"` + `sort_order: "desc"` → 按 `max_price` 降序(价格从高到低) | |
| 1641 | + | |
| 1642 | +### 8.4 支持的分析器 | |
| 1643 | + | |
| 1644 | +| 分析器 | 语言 | 描述 | | |
| 1645 | +|--------|------|------| | |
| 1646 | +| `index_ansj` | 中文 | 中文索引分析器(用于中文字段) | | |
| 1647 | +| `query_ansj` | 中文 | 中文查询分析器(用于中文字段) | | |
| 1648 | +| `hanlp_index` ⚠️ TODO(暂不支持) | 中文 | 中文索引分析器(用于中文字段) | | |
| 1649 | +| `hanlp_standard` ⚠️ TODO(暂不支持) | 中文 | 中文查询分析器(用于中文字段) | | |
| 1650 | +| `english` | 英文 | 标准英文分析器(用于英文字段) | | |
| 1651 | +| `lowercase` | - | 小写标准化器(用于keyword子字段) | | ... | ... |
requirements.txt
| ... | ... | @@ -12,13 +12,9 @@ langchain-openai>=0.2.0 |
| 12 | 12 | langgraph>=1.0.0 |
| 13 | 13 | openai>=1.12.0 |
| 14 | 14 | |
| 15 | -# Embeddings & Vision | |
| 16 | -clip-client>=3.5.0 # CLIP-as-Service client | |
| 15 | +# Vision (VLM image analysis) | |
| 17 | 16 | Pillow>=10.2.0 # Image processing |
| 18 | 17 | |
| 19 | -# Vector Database | |
| 20 | -pymilvus>=2.3.6 | |
| 21 | - | |
| 22 | 18 | # Databases |
| 23 | 19 | pymongo>=4.6.1 |
| 24 | 20 | ... | ... |
scripts/check_services.sh
| 1 | 1 | #!/usr/bin/env bash |
| 2 | 2 | # ============================================================================= |
| 3 | 3 | # OmniShopAgent - 服务健康检查脚本 |
| 4 | -# 检查 Milvus、CLIP、Streamlit 等依赖服务状态 | |
| 4 | +# 检查 Streamlit、Search API 等依赖 | |
| 5 | 5 | # ============================================================================= |
| 6 | 6 | set -euo pipefail |
| 7 | 7 | |
| ... | ... | @@ -49,40 +49,16 @@ else |
| 49 | 49 | echo -e "${RED}FAIL${NC} 未找到" |
| 50 | 50 | fi |
| 51 | 51 | |
| 52 | -# 4. Milvus | |
| 53 | -echo -n "[Milvus] " | |
| 54 | -if command -v docker &>/dev/null; then | |
| 55 | - if docker ps --format '{{.Names}}' 2>/dev/null | grep -q milvus-standalone; then | |
| 56 | - if curl -s -o /dev/null -w "%{http_code}" http://localhost:9091/healthz 2>/dev/null | grep -q 200; then | |
| 57 | - echo -e "${GREEN}OK${NC} localhost:19530" | |
| 58 | - else | |
| 59 | - echo -e "${YELLOW}WARN${NC} 容器运行中,健康检查未响应" | |
| 60 | - fi | |
| 61 | - else | |
| 62 | - echo -e "${YELLOW}WARN${NC} 未运行 (docker compose up -d)" | |
| 63 | - fi | |
| 64 | -else | |
| 65 | - echo -e "${YELLOW}SKIP${NC} Docker 未安装" | |
| 66 | -fi | |
| 67 | - | |
| 68 | -# 5. CLIP 服务(可选) | |
| 69 | -echo -n "[CLIP] " | |
| 70 | -if timeout 2 bash -c 'echo >/dev/tcp/localhost/51000' 2>/dev/null; then | |
| 71 | - echo -e "${GREEN}OK${NC} localhost:51000" | |
| 72 | -else | |
| 73 | - echo -e "${YELLOW}WARN${NC} 未运行 (图像搜索需启动: python -m clip_server launch)" | |
| 74 | -fi | |
| 75 | - | |
| 76 | -# 6. 数据目录 | |
| 52 | +# 4. 数据目录(可选,用于图片上传) | |
| 77 | 53 | echo -n "[数据] " |
| 78 | 54 | if [ -d "$PROJECT_ROOT/data/images" ] && [ -f "$PROJECT_ROOT/data/styles.csv" ]; then |
| 79 | 55 | IMG_COUNT=$(find "$PROJECT_ROOT/data/images" -name "*.jpg" 2>/dev/null | wc -l) |
| 80 | 56 | echo -e "${GREEN}OK${NC} $IMG_COUNT 张图片" |
| 81 | 57 | else |
| 82 | - echo -e "${YELLOW}WARN${NC} 未找到 data/images 或 data/styles.csv (运行 download_dataset.py)" | |
| 58 | + echo -e "${YELLOW}WARN${NC} 未找到 data/images 或 data/styles.csv (可选,用于图片风格分析)" | |
| 83 | 59 | fi |
| 84 | 60 | |
| 85 | -# 7. Streamlit | |
| 61 | +# 5. Streamlit | |
| 86 | 62 | echo -n "[Streamlit] " |
| 87 | 63 | if pgrep -f "streamlit run app.py" >/dev/null 2>&1; then |
| 88 | 64 | echo -e "${GREEN}OK${NC} 运行中" | ... | ... |
scripts/index_data.py deleted
| ... | ... | @@ -1,467 +0,0 @@ |
| 1 | -""" | |
| 2 | -Data Indexing Script | |
| 3 | -Generates embeddings for products and stores them in Milvus | |
| 4 | -""" | |
| 5 | - | |
| 6 | -import csv | |
| 7 | -import logging | |
| 8 | -import os | |
| 9 | -import sys | |
| 10 | -from pathlib import Path | |
| 11 | -from typing import Any, Dict, Optional | |
| 12 | - | |
| 13 | -from tqdm import tqdm | |
| 14 | - | |
| 15 | -# Add parent directory to path | |
| 16 | -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| 17 | - | |
| 18 | -# Import config and settings first | |
| 19 | -# Direct imports from files to avoid __init__.py circular issues | |
| 20 | -import importlib.util | |
| 21 | - | |
| 22 | -from app.config import get_absolute_path, settings | |
| 23 | - | |
| 24 | - | |
| 25 | -def load_service_module(module_name, file_name): | |
| 26 | - """Load a service module directly from file""" | |
| 27 | - spec = importlib.util.spec_from_file_location( | |
| 28 | - module_name, | |
| 29 | - os.path.join( | |
| 30 | - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), | |
| 31 | - f"app/services/{file_name}", | |
| 32 | - ), | |
| 33 | - ) | |
| 34 | - module = importlib.util.module_from_spec(spec) | |
| 35 | - spec.loader.exec_module(module) | |
| 36 | - return module | |
| 37 | - | |
| 38 | - | |
| 39 | -embedding_module = load_service_module("embedding_service", "embedding_service.py") | |
| 40 | -milvus_module = load_service_module("milvus_service", "milvus_service.py") | |
| 41 | - | |
| 42 | -EmbeddingService = embedding_module.EmbeddingService | |
| 43 | -MilvusService = milvus_module.MilvusService | |
| 44 | - | |
| 45 | -# Configure logging | |
| 46 | -logging.basicConfig( | |
| 47 | - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" | |
| 48 | -) | |
| 49 | -logger = logging.getLogger(__name__) | |
| 50 | - | |
| 51 | - | |
| 52 | -class DataIndexer: | |
| 53 | - """Index product data by generating and storing embeddings""" | |
| 54 | - | |
| 55 | - def __init__(self): | |
| 56 | - """Initialize services""" | |
| 57 | - self.embedding_service = EmbeddingService() | |
| 58 | - self.milvus_service = MilvusService() | |
| 59 | - | |
| 60 | - self.image_dir = Path(get_absolute_path(settings.image_data_path)) | |
| 61 | - self.styles_csv = get_absolute_path("./data/styles.csv") | |
| 62 | - self.images_csv = get_absolute_path("./data/images.csv") | |
| 63 | - | |
| 64 | - # Load product data from CSV | |
| 65 | - self.products = self._load_products_from_csv() | |
| 66 | - | |
| 67 | - def _load_products_from_csv(self) -> Dict[int, Dict[str, Any]]: | |
| 68 | - """Load products from CSV files""" | |
| 69 | - products = {} | |
| 70 | - | |
| 71 | - # Load images mapping | |
| 72 | - images_dict = {} | |
| 73 | - with open(self.images_csv, "r", encoding="utf-8") as f: | |
| 74 | - reader = csv.DictReader(f) | |
| 75 | - for row in reader: | |
| 76 | - product_id = int(row["filename"].split(".")[0]) | |
| 77 | - images_dict[product_id] = row["link"] | |
| 78 | - | |
| 79 | - # Load styles/products | |
| 80 | - with open(self.styles_csv, "r", encoding="utf-8") as f: | |
| 81 | - reader = csv.DictReader(f) | |
| 82 | - for row in reader: | |
| 83 | - try: | |
| 84 | - product_id = int(row["id"]) | |
| 85 | - products[product_id] = { | |
| 86 | - "id": product_id, | |
| 87 | - "gender": row.get("gender", ""), | |
| 88 | - "masterCategory": row.get("masterCategory", ""), | |
| 89 | - "subCategory": row.get("subCategory", ""), | |
| 90 | - "articleType": row.get("articleType", ""), | |
| 91 | - "baseColour": row.get("baseColour", ""), | |
| 92 | - "season": row.get("season", ""), | |
| 93 | - "year": int(row["year"]) if row.get("year") else 0, | |
| 94 | - "usage": row.get("usage", ""), | |
| 95 | - "productDisplayName": row.get("productDisplayName", ""), | |
| 96 | - "imageUrl": images_dict.get(product_id, ""), | |
| 97 | - "imagePath": f"{product_id}.jpg", | |
| 98 | - } | |
| 99 | - except (ValueError, KeyError) as e: | |
| 100 | - logger.warning(f"Error loading product {row.get('id')}: {e}") | |
| 101 | - continue | |
| 102 | - | |
| 103 | - logger.info(f"Loaded {len(products)} products from CSV") | |
| 104 | - return products | |
| 105 | - | |
| 106 | - def setup(self) -> None: | |
| 107 | - """Setup connections and collections""" | |
| 108 | - logger.info("Setting up services...") | |
| 109 | - | |
| 110 | - # Connect to CLIP server | |
| 111 | - self.embedding_service.connect_clip() | |
| 112 | - logger.info("✓ CLIP server connected") | |
| 113 | - | |
| 114 | - # Connect to Milvus | |
| 115 | - self.milvus_service.connect() | |
| 116 | - logger.info("✓ Milvus connected") | |
| 117 | - | |
| 118 | - # Create Milvus collections | |
| 119 | - self.milvus_service.create_text_collection(recreate=False) | |
| 120 | - self.milvus_service.create_image_collection(recreate=False) | |
| 121 | - logger.info("✓ Milvus collections ready") | |
| 122 | - | |
| 123 | - def teardown(self) -> None: | |
| 124 | - """Close all connections""" | |
| 125 | - logger.info("Closing connections...") | |
| 126 | - self.embedding_service.disconnect_clip() | |
| 127 | - self.milvus_service.disconnect() | |
| 128 | - logger.info("✓ All connections closed") | |
| 129 | - | |
| 130 | - def index_text_embeddings( | |
| 131 | - self, batch_size: int = 100, skip: int = 0, limit: Optional[int] = None | |
| 132 | - ) -> Dict[str, int]: | |
| 133 | - """Generate and store text embeddings for products | |
| 134 | - | |
| 135 | - Args: | |
| 136 | - batch_size: Number of products to process at once | |
| 137 | - skip: Number of products to skip | |
| 138 | - limit: Maximum number of products to process (None for all) | |
| 139 | - | |
| 140 | - Returns: | |
| 141 | - Dictionary with indexing statistics | |
| 142 | - """ | |
| 143 | - logger.info("Starting text embedding indexing...") | |
| 144 | - | |
| 145 | - # Get products list | |
| 146 | - product_ids = list(self.products.keys())[skip:] | |
| 147 | - if limit: | |
| 148 | - product_ids = product_ids[:limit] | |
| 149 | - | |
| 150 | - total_products = len(product_ids) | |
| 151 | - processed = 0 | |
| 152 | - inserted = 0 | |
| 153 | - errors = 0 | |
| 154 | - | |
| 155 | - with tqdm(total=total_products, desc="Indexing text embeddings") as pbar: | |
| 156 | - while processed < total_products: | |
| 157 | - # Get batch of products | |
| 158 | - current_batch_size = min(batch_size, total_products - processed) | |
| 159 | - batch_ids = product_ids[processed : processed + current_batch_size] | |
| 160 | - products = [self.products[pid] for pid in batch_ids] | |
| 161 | - | |
| 162 | - if not products: | |
| 163 | - break | |
| 164 | - | |
| 165 | - try: | |
| 166 | - # Prepare texts for embedding | |
| 167 | - texts = [] | |
| 168 | - text_mappings = [] | |
| 169 | - | |
| 170 | - for product in products: | |
| 171 | - # Create text representation of product | |
| 172 | - text = self._create_product_text(product) | |
| 173 | - texts.append(text) | |
| 174 | - text_mappings.append( | |
| 175 | - {"product_id": product["id"], "text": text} | |
| 176 | - ) | |
| 177 | - | |
| 178 | - # Generate embeddings | |
| 179 | - embeddings = self.embedding_service.get_text_embeddings_batch( | |
| 180 | - texts, batch_size=50 # OpenAI batch size | |
| 181 | - ) | |
| 182 | - | |
| 183 | - # Prepare data for Milvus (with metadata) | |
| 184 | - milvus_data = [] | |
| 185 | - for idx, (mapping, embedding) in enumerate( | |
| 186 | - zip(text_mappings, embeddings) | |
| 187 | - ): | |
| 188 | - product_id = mapping["product_id"] | |
| 189 | - product = self.products[product_id] | |
| 190 | - | |
| 191 | - milvus_data.append( | |
| 192 | - { | |
| 193 | - "id": product_id, | |
| 194 | - "text": mapping["text"][ | |
| 195 | - :2000 | |
| 196 | - ], # Truncate to max length | |
| 197 | - "embedding": embedding, | |
| 198 | - # Product metadata | |
| 199 | - "productDisplayName": product["productDisplayName"][ | |
| 200 | - :500 | |
| 201 | - ], | |
| 202 | - "gender": product["gender"][:50], | |
| 203 | - "masterCategory": product["masterCategory"][:100], | |
| 204 | - "subCategory": product["subCategory"][:100], | |
| 205 | - "articleType": product["articleType"][:100], | |
| 206 | - "baseColour": product["baseColour"][:50], | |
| 207 | - "season": product["season"][:50], | |
| 208 | - "usage": product["usage"][:50], | |
| 209 | - "year": product["year"], | |
| 210 | - "imageUrl": product["imageUrl"], | |
| 211 | - "imagePath": product["imagePath"], | |
| 212 | - } | |
| 213 | - ) | |
| 214 | - | |
| 215 | - # Insert into Milvus | |
| 216 | - count = self.milvus_service.insert_text_embeddings(milvus_data) | |
| 217 | - inserted += count | |
| 218 | - | |
| 219 | - except Exception as e: | |
| 220 | - logger.error( | |
| 221 | - f"Error processing text batch at offset {processed}: {e}" | |
| 222 | - ) | |
| 223 | - errors += len(products) | |
| 224 | - | |
| 225 | - processed += len(products) | |
| 226 | - pbar.update(len(products)) | |
| 227 | - | |
| 228 | - stats = {"total_processed": processed, "inserted": inserted, "errors": errors} | |
| 229 | - | |
| 230 | - logger.info(f"Text embedding indexing completed: {stats}") | |
| 231 | - return stats | |
| 232 | - | |
| 233 | - def index_image_embeddings( | |
| 234 | - self, batch_size: int = 32, skip: int = 0, limit: Optional[int] = None | |
| 235 | - ) -> Dict[str, int]: | |
| 236 | - """Generate and store image embeddings for products | |
| 237 | - | |
| 238 | - Args: | |
| 239 | - batch_size: Number of images to process at once | |
| 240 | - skip: Number of products to skip | |
| 241 | - limit: Maximum number of products to process (None for all) | |
| 242 | - | |
| 243 | - Returns: | |
| 244 | - Dictionary with indexing statistics | |
| 245 | - """ | |
| 246 | - logger.info("Starting image embedding indexing...") | |
| 247 | - | |
| 248 | - # Get products list | |
| 249 | - product_ids = list(self.products.keys())[skip:] | |
| 250 | - if limit: | |
| 251 | - product_ids = product_ids[:limit] | |
| 252 | - | |
| 253 | - total_products = len(product_ids) | |
| 254 | - processed = 0 | |
| 255 | - inserted = 0 | |
| 256 | - errors = 0 | |
| 257 | - | |
| 258 | - with tqdm(total=total_products, desc="Indexing image embeddings") as pbar: | |
| 259 | - while processed < total_products: | |
| 260 | - # Get batch of products | |
| 261 | - current_batch_size = min(batch_size, total_products - processed) | |
| 262 | - batch_ids = product_ids[processed : processed + current_batch_size] | |
| 263 | - products = [self.products[pid] for pid in batch_ids] | |
| 264 | - | |
| 265 | - if not products: | |
| 266 | - break | |
| 267 | - | |
| 268 | - try: | |
| 269 | - # Prepare image paths | |
| 270 | - image_paths = [] | |
| 271 | - image_mappings = [] | |
| 272 | - | |
| 273 | - for product in products: | |
| 274 | - image_path = self.image_dir / product["imagePath"] | |
| 275 | - image_paths.append(image_path) | |
| 276 | - image_mappings.append( | |
| 277 | - { | |
| 278 | - "product_id": product["id"], | |
| 279 | - "image_path": product["imagePath"], | |
| 280 | - } | |
| 281 | - ) | |
| 282 | - | |
| 283 | - # Generate embeddings | |
| 284 | - embeddings = self.embedding_service.get_image_embeddings_batch( | |
| 285 | - image_paths, batch_size=batch_size | |
| 286 | - ) | |
| 287 | - | |
| 288 | - # Prepare data for Milvus (with metadata) | |
| 289 | - milvus_data = [] | |
| 290 | - for idx, (mapping, embedding) in enumerate( | |
| 291 | - zip(image_mappings, embeddings) | |
| 292 | - ): | |
| 293 | - if embedding is not None: | |
| 294 | - product_id = mapping["product_id"] | |
| 295 | - product = self.products[product_id] | |
| 296 | - | |
| 297 | - milvus_data.append( | |
| 298 | - { | |
| 299 | - "id": product_id, | |
| 300 | - "image_path": mapping["image_path"], | |
| 301 | - "embedding": embedding, | |
| 302 | - # Product metadata | |
| 303 | - "productDisplayName": product["productDisplayName"][ | |
| 304 | - :500 | |
| 305 | - ], | |
| 306 | - "gender": product["gender"][:50], | |
| 307 | - "masterCategory": product["masterCategory"][:100], | |
| 308 | - "subCategory": product["subCategory"][:100], | |
| 309 | - "articleType": product["articleType"][:100], | |
| 310 | - "baseColour": product["baseColour"][:50], | |
| 311 | - "season": product["season"][:50], | |
| 312 | - "usage": product["usage"][:50], | |
| 313 | - "year": product["year"], | |
| 314 | - "imageUrl": product["imageUrl"], | |
| 315 | - } | |
| 316 | - ) | |
| 317 | - else: | |
| 318 | - errors += 1 | |
| 319 | - | |
| 320 | - # Insert into Milvus | |
| 321 | - if milvus_data: | |
| 322 | - count = self.milvus_service.insert_image_embeddings(milvus_data) | |
| 323 | - inserted += count | |
| 324 | - | |
| 325 | - except Exception as e: | |
| 326 | - logger.error( | |
| 327 | - f"Error processing image batch at offset {processed}: {e}" | |
| 328 | - ) | |
| 329 | - errors += len(products) | |
| 330 | - | |
| 331 | - processed += len(products) | |
| 332 | - pbar.update(len(products)) | |
| 333 | - | |
| 334 | - stats = {"total_processed": processed, "inserted": inserted, "errors": errors} | |
| 335 | - | |
| 336 | - logger.info(f"Image embedding indexing completed: {stats}") | |
| 337 | - return stats | |
| 338 | - | |
| 339 | - def _create_product_text(self, product: Dict[str, Any]) -> str: | |
| 340 | - """Create text representation of product for embedding | |
| 341 | - | |
| 342 | - Args: | |
| 343 | - product: Product document | |
| 344 | - | |
| 345 | - Returns: | |
| 346 | - Text representation | |
| 347 | - """ | |
| 348 | - # Create a natural language description | |
| 349 | - parts = [ | |
| 350 | - product.get("productDisplayName", ""), | |
| 351 | - f"Gender: {product.get('gender', '')}", | |
| 352 | - f"Category: {product.get('masterCategory', '')} > {product.get('subCategory', '')}", | |
| 353 | - f"Type: {product.get('articleType', '')}", | |
| 354 | - f"Color: {product.get('baseColour', '')}", | |
| 355 | - f"Season: {product.get('season', '')}", | |
| 356 | - f"Usage: {product.get('usage', '')}", | |
| 357 | - ] | |
| 358 | - | |
| 359 | - text = " | ".join( | |
| 360 | - [p for p in parts if p and p != "Gender: " and p != "Color: "] | |
| 361 | - ) | |
| 362 | - return text | |
| 363 | - | |
| 364 | - def get_stats(self) -> Dict[str, Any]: | |
| 365 | - """Get indexing statistics | |
| 366 | - | |
| 367 | - Returns: | |
| 368 | - Dictionary with statistics | |
| 369 | - """ | |
| 370 | - text_stats = self.milvus_service.get_collection_stats( | |
| 371 | - self.milvus_service.text_collection_name | |
| 372 | - ) | |
| 373 | - image_stats = self.milvus_service.get_collection_stats( | |
| 374 | - self.milvus_service.image_collection_name | |
| 375 | - ) | |
| 376 | - | |
| 377 | - return { | |
| 378 | - "total_products": len(self.products), | |
| 379 | - "milvus_text": text_stats, | |
| 380 | - "milvus_image": image_stats, | |
| 381 | - } | |
| 382 | - | |
| 383 | - | |
| 384 | -def main(): | |
| 385 | - """Main function""" | |
| 386 | - import argparse | |
| 387 | - | |
| 388 | - parser = argparse.ArgumentParser(description="Index product data for search") | |
| 389 | - parser.add_argument( | |
| 390 | - "--mode", | |
| 391 | - choices=["text", "image", "both"], | |
| 392 | - default="both", | |
| 393 | - help="Which embeddings to index", | |
| 394 | - ) | |
| 395 | - parser.add_argument( | |
| 396 | - "--batch-size", type=int, default=100, help="Batch size for processing" | |
| 397 | - ) | |
| 398 | - parser.add_argument( | |
| 399 | - "--skip", type=int, default=0, help="Number of products to skip" | |
| 400 | - ) | |
| 401 | - parser.add_argument( | |
| 402 | - "--limit", type=int, default=None, help="Maximum number of products to process" | |
| 403 | - ) | |
| 404 | - parser.add_argument("--stats", action="store_true", help="Show statistics only") | |
| 405 | - | |
| 406 | - args = parser.parse_args() | |
| 407 | - | |
| 408 | - # Create indexer | |
| 409 | - indexer = DataIndexer() | |
| 410 | - | |
| 411 | - try: | |
| 412 | - # Setup services | |
| 413 | - indexer.setup() | |
| 414 | - | |
| 415 | - if args.stats: | |
| 416 | - # Show statistics | |
| 417 | - stats = indexer.get_stats() | |
| 418 | - print("\n=== Indexing Statistics ===") | |
| 419 | - print(f"\nTotal Products in CSV: {stats['total_products']}") | |
| 420 | - | |
| 421 | - print("\nMilvus Text Embeddings:") | |
| 422 | - print(f" Collection: {stats['milvus_text']['collection_name']}") | |
| 423 | - print(f" Total embeddings: {stats['milvus_text']['row_count']}") | |
| 424 | - | |
| 425 | - print("\nMilvus Image Embeddings:") | |
| 426 | - print(f" Collection: {stats['milvus_image']['collection_name']}") | |
| 427 | - print(f" Total embeddings: {stats['milvus_image']['row_count']}") | |
| 428 | - | |
| 429 | - print( | |
| 430 | - f"\nCoverage: {stats['milvus_image']['row_count'] / stats['total_products'] * 100:.1f}%" | |
| 431 | - ) | |
| 432 | - else: | |
| 433 | - # Index data | |
| 434 | - if args.mode in ["text", "both"]: | |
| 435 | - logger.info("=== Indexing Text Embeddings ===") | |
| 436 | - text_stats = indexer.index_text_embeddings( | |
| 437 | - batch_size=args.batch_size, skip=args.skip, limit=args.limit | |
| 438 | - ) | |
| 439 | - print(f"\nText Indexing Results: {text_stats}") | |
| 440 | - | |
| 441 | - if args.mode in ["image", "both"]: | |
| 442 | - logger.info("=== Indexing Image Embeddings ===") | |
| 443 | - image_stats = indexer.index_image_embeddings( | |
| 444 | - batch_size=min(args.batch_size, 32), # Smaller batch for images | |
| 445 | - skip=args.skip, | |
| 446 | - limit=args.limit, | |
| 447 | - ) | |
| 448 | - print(f"\nImage Indexing Results: {image_stats}") | |
| 449 | - | |
| 450 | - # Show final statistics | |
| 451 | - logger.info("\n=== Final Statistics ===") | |
| 452 | - stats = indexer.get_stats() | |
| 453 | - print(f"Total products: {stats['total_products']}") | |
| 454 | - print(f"Text embeddings: {stats['milvus_text']['row_count']}") | |
| 455 | - print(f"Image embeddings: {stats['milvus_image']['row_count']}") | |
| 456 | - | |
| 457 | - except KeyboardInterrupt: | |
| 458 | - logger.info("\nIndexing interrupted by user") | |
| 459 | - except Exception as e: | |
| 460 | - logger.error(f"Error during indexing: {e}", exc_info=True) | |
| 461 | - sys.exit(1) | |
| 462 | - finally: | |
| 463 | - indexer.teardown() | |
| 464 | - | |
| 465 | - | |
| 466 | -if __name__ == "__main__": | |
| 467 | - main() |
scripts/run_clip.sh deleted
| ... | ... | @@ -1,22 +0,0 @@ |
| 1 | -#!/usr/bin/env bash | |
| 2 | -# ============================================================================= | |
| 3 | -# OmniShopAgent - 启动 CLIP 图像向量服务 | |
| 4 | -# 图像搜索、以图搜图功能依赖此服务 | |
| 5 | -# ============================================================================= | |
| 6 | -set -euo pipefail | |
| 7 | - | |
| 8 | -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | |
| 9 | -PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" | |
| 10 | -VENV_DIR="${VENV_DIR:-$PROJECT_ROOT/venv}" | |
| 11 | - | |
| 12 | -cd "$PROJECT_ROOT" | |
| 13 | - | |
| 14 | -if [ -d "$VENV_DIR" ]; then | |
| 15 | - set +u | |
| 16 | - source "$VENV_DIR/bin/activate" | |
| 17 | - set -u | |
| 18 | -fi | |
| 19 | - | |
| 20 | -echo "启动 CLIP 服务 (端口 51000)..." | |
| 21 | -echo "按 Ctrl+C 停止" | |
| 22 | -exec python -m clip_server launch |
scripts/run_milvus.sh deleted
| ... | ... | @@ -1,31 +0,0 @@ |
| 1 | -#!/usr/bin/env bash | |
| 2 | -# ============================================================================= | |
| 3 | -# OmniShopAgent - 启动 Milvus 向量数据库 | |
| 4 | -# 使用 Docker Compose 启动 Milvus 及相关依赖 | |
| 5 | -# ============================================================================= | |
| 6 | -set -euo pipefail | |
| 7 | - | |
| 8 | -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | |
| 9 | -PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" | |
| 10 | - | |
| 11 | -cd "$PROJECT_ROOT" | |
| 12 | - | |
| 13 | -if ! command -v docker &>/dev/null; then | |
| 14 | - echo "错误: 未安装 Docker。请先运行 setup_env_centos8.sh" | |
| 15 | - exit 1 | |
| 16 | -fi | |
| 17 | - | |
| 18 | -echo "启动 Milvus..." | |
| 19 | -docker compose up -d 2>/dev/null || docker-compose up -d 2>/dev/null || { | |
| 20 | - echo "错误: 无法执行 docker compose。请确保已安装 Docker Compose" | |
| 21 | - exit 1 | |
| 22 | -} | |
| 23 | - | |
| 24 | -echo "等待 Milvus 就绪 (约 60 秒)..." | |
| 25 | -sleep 60 | |
| 26 | - | |
| 27 | -if curl -s -o /dev/null -w "%{http_code}" http://localhost:9091/healthz 2>/dev/null | grep -q 200; then | |
| 28 | - echo "Milvus 已就绪: localhost:19530" | |
| 29 | -else | |
| 30 | - echo "提示: Milvus 可能仍在启动,请稍后执行 check_services.sh 检查" | |
| 31 | -fi |
scripts/setup_env_centos8.sh
| ... | ... | @@ -41,9 +41,9 @@ sudo dnf install -y \ |
| 41 | 41 | tar |
| 42 | 42 | |
| 43 | 43 | # ----------------------------------------------------------------------------- |
| 44 | -# 2. 安装 Docker(用于 Milvus) | |
| 44 | +# 2. 检查 Docker(可选,未安装时将自动安装) | |
| 45 | 45 | # ----------------------------------------------------------------------------- |
| 46 | -echo "[2/4] 检查/安装 Docker..." | |
| 46 | +echo "[2/4] 检查 Docker..." | |
| 47 | 47 | if ! command -v docker &>/dev/null; then |
| 48 | 48 | echo " 安装 Docker..." |
| 49 | 49 | sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo 2>/dev/null || { |
| ... | ... | @@ -142,11 +142,9 @@ echo "==========================================" |
| 142 | 142 | echo "环境准备完成!" |
| 143 | 143 | echo "==========================================" |
| 144 | 144 | echo "下一步:" |
| 145 | -echo " 1. 编辑 .env 配置 OPENAI_API_KEY" | |
| 146 | -echo " 2. 下载数据: python scripts/download_dataset.py" | |
| 147 | -echo " 3. 启动 Milvus: ./scripts/run_milvus.sh" | |
| 148 | -echo " 4. 索引数据: python scripts/index_data.py" | |
| 149 | -echo " 5. 启动应用: ./scripts/start.sh" | |
| 145 | +echo " 1. 编辑 .env 配置 OPENAI_API_KEY、SEARCH_API_BASE_URL 等" | |
| 146 | +echo " 2. (可选)下载数据: python scripts/download_dataset.py" | |
| 147 | +echo " 3. 启动应用: ./scripts/start.sh" | |
| 150 | 148 | echo "" |
| 151 | 149 | echo "激活虚拟环境: source $VENV_DIR/bin/activate" |
| 152 | 150 | echo "==========================================" | ... | ... |
scripts/start.sh
| 1 | 1 | #!/usr/bin/env bash |
| 2 | 2 | # ============================================================================= |
| 3 | 3 | # OmniShopAgent - 启动脚本 |
| 4 | -# 启动 Milvus、CLIP(可选)、Streamlit 应用 | |
| 4 | +# 启动 Streamlit 应用 | |
| 5 | 5 | # ============================================================================= |
| 6 | 6 | set -euo pipefail |
| 7 | 7 | |
| 8 | 8 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
| 9 | 9 | PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" |
| 10 | 10 | VENV_DIR="${VENV_DIR:-$PROJECT_ROOT/venv}" |
| 11 | -STREAMLIT_PORT="${STREAMLIT_PORT:-8501}" | |
| 11 | +STREAMLIT_PORT="${STREAMLIT_PORT:-6008}" | |
| 12 | 12 | STREAMLIT_HOST="${STREAMLIT_HOST:-0.0.0.0}" |
| 13 | 13 | |
| 14 | 14 | cd "$PROJECT_ROOT" |
| ... | ... | @@ -27,30 +27,7 @@ echo "==========================================" |
| 27 | 27 | echo "OmniShopAgent 启动" |
| 28 | 28 | echo "==========================================" |
| 29 | 29 | |
| 30 | -# 1. 启动 Milvus(Docker) | |
| 31 | -if command -v docker &>/dev/null; then | |
| 32 | - echo "[1/3] 检查 Milvus..." | |
| 33 | - if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q milvus-standalone; then | |
| 34 | - echo " 启动 Milvus (docker compose)..." | |
| 35 | - docker compose up -d 2>/dev/null || docker-compose up -d 2>/dev/null || { | |
| 36 | - echo " 警告: 无法启动 Milvus,请手动执行: docker compose up -d" | |
| 37 | - } | |
| 38 | - echo " 等待 Milvus 就绪 (30s)..." | |
| 39 | - sleep 30 | |
| 40 | - else | |
| 41 | - echo " Milvus 已运行" | |
| 42 | - fi | |
| 43 | -else | |
| 44 | - echo "[1/3] 跳过 Milvus: 未安装 Docker" | |
| 45 | -fi | |
| 46 | - | |
| 47 | -# 2. 检查 CLIP(可选,图像搜索需要) | |
| 48 | -echo "[2/3] 检查 CLIP 服务..." | |
| 49 | -echo " 提示: 图像搜索需 CLIP。若未启动,请另开终端执行: python -m clip_server launch" | |
| 50 | -echo " 文本搜索可无需 CLIP。" | |
| 51 | - | |
| 52 | -# 3. 启动 Streamlit | |
| 53 | -echo "[3/3] 启动 Streamlit (端口 $STREAMLIT_PORT)..." | |
| 30 | +echo "[1/1] 启动 Streamlit (端口 $STREAMLIT_PORT)..." | |
| 54 | 31 | echo "" |
| 55 | 32 | echo " 访问: http://$STREAMLIT_HOST:$STREAMLIT_PORT" |
| 56 | 33 | echo " 按 Ctrl+C 停止" | ... | ... |
scripts/stop.sh
| 1 | 1 | #!/usr/bin/env bash |
| 2 | 2 | # ============================================================================= |
| 3 | 3 | # OmniShopAgent - 停止脚本 |
| 4 | -# 停止 Streamlit 进程及 Milvus 容器 | |
| 4 | +# 停止 Streamlit 进程 | |
| 5 | 5 | # ============================================================================= |
| 6 | 6 | set -euo pipefail |
| 7 | 7 | |
| 8 | 8 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
| 9 | 9 | PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" |
| 10 | -STREAMLIT_PORT="${STREAMLIT_PORT:-8501}" | |
| 10 | +STREAMLIT_PORT="${STREAMLIT_PORT:-6008}" | |
| 11 | 11 | |
| 12 | 12 | echo "==========================================" |
| 13 | 13 | echo "OmniShopAgent 停止" |
| 14 | 14 | echo "==========================================" |
| 15 | 15 | |
| 16 | 16 | # 1. 停止 Streamlit 进程 |
| 17 | -echo "[1/2] 停止 Streamlit..." | |
| 17 | +echo "[1/1] 停止 Streamlit..." | |
| 18 | 18 | if pgrep -f "streamlit run app.py" >/dev/null 2>&1; then |
| 19 | 19 | pkill -f "streamlit run app.py" 2>/dev/null || true |
| 20 | 20 | echo " Streamlit 已停止" |
| ... | ... | @@ -31,16 +31,6 @@ if command -v lsof &>/dev/null; then |
| 31 | 31 | fi |
| 32 | 32 | fi |
| 33 | 33 | |
| 34 | -# 2. 可选:停止 Milvus 容器 | |
| 35 | -echo "[2/2] 停止 Milvus..." | |
| 36 | -if command -v docker &>/dev/null; then | |
| 37 | - cd "$PROJECT_ROOT" | |
| 38 | - docker compose down 2>/dev/null || docker-compose down 2>/dev/null || true | |
| 39 | - echo " Milvus 已停止" | |
| 40 | -else | |
| 41 | - echo " Docker 未安装,跳过" | |
| 42 | -fi | |
| 43 | - | |
| 44 | 34 | echo "==========================================" |
| 45 | 35 | echo "OmniShopAgent 已停止" |
| 46 | 36 | echo "==========================================" | ... | ... |