diff --git a/=0.1.9 b/=0.1.9
new file mode 100644
index 0000000..262ee14
--- /dev/null
+++ b/=0.1.9
@@ -0,0 +1,14 @@
+Looking in indexes: https://mirrors.aliyun.com/pypi/simple
+Collecting slowapi
+ Using cached https://mirrors.aliyun.com/pypi/packages/2b/bb/f71c4b7d7e7eb3fc1e8c0458a8979b912f40b58002b9fbf37729b8cb464b/slowapi-0.1.9-py3-none-any.whl (14 kB)
+Collecting limits>=2.3 (from slowapi)
+ Using cached https://mirrors.aliyun.com/pypi/packages/40/96/4fcd44aed47b8fcc457653b12915fcad192cd646510ef3f29fd216f4b0ab/limits-5.6.0-py3-none-any.whl (60 kB)
+Collecting deprecated>=1.2 (from limits>=2.3->slowapi)
+ Using cached https://mirrors.aliyun.com/pypi/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl (11 kB)
+Requirement already satisfied: packaging>=21 in /data/tw/miniconda3/envs/searchengine/lib/python3.10/site-packages (from limits>=2.3->slowapi) (25.0)
+Requirement already satisfied: typing-extensions in /data/tw/miniconda3/envs/searchengine/lib/python3.10/site-packages (from limits>=2.3->slowapi) (4.15.0)
+Collecting wrapt<3,>=1.10 (from deprecated>=1.2->limits>=2.3->slowapi)
+ Downloading https://mirrors.aliyun.com/pypi/packages/c6/93/5cf92edd99617095592af919cb81d4bff61c5dbbb70d3c92099425a8ec34/wrapt-2.0.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl (113 kB)
+Installing collected packages: wrapt, deprecated, limits, slowapi
+
+Successfully installed deprecated-1.3.1 limits-5.6.0 slowapi-0.1.9 wrapt-2.0.1
diff --git a/CLAUDE.md b/CLAUDE.md
index 49bd78e..a7e8cbe 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -109,3 +109,6 @@ The `searcher` supports:
4. **ES Similarity Configuration:** All text fields use modified BM25 with `b=0.0, k1=0.0` as the default similarity.
5. **Multi-Language Support:** The system is designed for cross-border e-commerce with at minimum Chinese and English support, with extensibility for other languages (Arabic, Spanish, Russian, Japanese).
+- 记住这个项目的环境是 conda 环境 `searchengine`,使用前依次执行以下命令激活:
+  - `source /home/tw/miniconda3/etc/profile.d/conda.sh`
+  - `conda activate searchengine`
\ No newline at end of file
diff --git a/SERVER_FIXES.md b/SERVER_FIXES.md
new file mode 100644
index 0000000..8b22f6f
--- /dev/null
+++ b/SERVER_FIXES.md
@@ -0,0 +1,142 @@
+# 服务器修复和优化文档
+
+## 修复的问题
+
+### 1. 前端服务器问题 (scripts/frontend_server.py)
+- **问题**: 接收到大量扫描器流量导致的错误日志
+- **原因**: SSL/TLS握手尝试、RDP连接扫描、二进制数据攻击
+- **解决方案**:
+ - 添加错误处理机制,优雅处理连接断开
+ - 实现速率限制 (100请求/分钟)
+ - 过滤扫描器噪音日志
+ - 添加安全HTTP头
+ - 使用线程服务器提高并发处理能力
+
+### 2. API服务器问题 (api/app.py)
+- **问题**: 缺乏安全性和错误处理机制
+- **解决方案**:
+ - 集成速率限制 (slowapi)
+ - 添加安全HTTP头
+ - 实现更好的异常处理
+ - 添加健康检查端点
+ - 增强日志记录
+ - 添加服务关闭处理
+
+## 主要改进
+
+### 安全性增强
+1. **速率限制**: 防止DDoS攻击和滥用
+2. **安全HTTP头**: 防止XSS、点击劫持等攻击
+3. **错误过滤**: 隐藏敏感错误信息
+4. **输入验证**: 更健壮的请求处理
+
+### 稳定性提升
+1. **连接错误处理**: 优雅处理连接重置和断开
+2. **异常处理**: 全局异常捕获,防止服务器崩溃
+3. **日志管理**: 过滤噪音,记录重要事件
+4. **监控功能**: 健康检查和状态监控
+
+### 性能优化
+1. **线程服务器**: 前端服务器支持并发请求
+2. **资源管理**: 更好的内存和连接管理
+3. **响应头优化**: 添加缓存和安全相关头
+
+## 使用方法
+
+### 安装依赖
+```bash
+# 安装服务器安全依赖
+./scripts/install_server_deps.sh
+
+# 或者手动安装
+pip install "slowapi>=0.1.9" "anyio>=3.7.0"
+```
+
+### 启动服务器
+
+#### 方法1: 使用管理脚本 (推荐)
+```bash
+# 启动所有服务器
+python scripts/start_servers.py --customer customer1 --es-host http://localhost:9200
+
+# 启动前检查依赖
+python scripts/start_servers.py --check-dependencies
+```
+
+#### 方法2: 分别启动
+```bash
+# 启动API服务器
+python main.py serve --customer customer1 --es-host http://localhost:9200
+
+# 启动前端服务器 (在另一个终端)
+python scripts/frontend_server.py
+```
+
+### 监控和日志
+
+#### 日志位置
+- API服务器日志: `/tmp/search_engine_api.log`
+- 启动日志: `/tmp/search_engine_startup.log`
+- 控制台输出: 实时显示重要信息
+
+#### 健康检查
+```bash
+# 检查API服务器健康状态
+curl http://localhost:6002/health
+
+# 检查前端服务器
+curl http://localhost:6003
+```
+
+## 配置选项
+
+### 环境变量
+- `CUSTOMER_ID`: 客户ID (默认: customer1)
+- `ES_HOST`: Elasticsearch主机 (默认: http://localhost:9200)
+
+### 速率限制配置
+- API服务器: 各端点不同限制 (60-120请求/分钟)
+- 前端服务器: 100请求/分钟
+
+## 故障排除
+
+### 常见问题
+
+1. **依赖缺失错误**
+ ```bash
+ pip install -r requirements_server.txt
+ ```
+
+2. **端口被占用**
+ ```bash
+ # 查看端口占用
+ lsof -i :6002
+ lsof -i :6003
+ ```
+
+3. **权限问题**
+ ```bash
+ chmod +x scripts/*.py scripts/*.sh
+ ```
+
+### 调试模式
+```bash
+# 启用详细日志
+export PYTHONUNBUFFERED=1
+python scripts/start_servers.py
+```
+
+## 生产环境建议
+
+1. **反向代理**: 使用nginx或Apache作为反向代理
+2. **SSL证书**: 配置HTTPS
+3. **防火墙**: 限制访问源IP
+4. **监控**: 集成监控和告警系统
+5. **日志轮转**: 配置日志轮转防止磁盘满
+
+## 维护说明
+
+- 定期检查日志文件大小
+- 监控服务器资源使用情况
+- 更新依赖包版本
+- 备份配置文件
\ No newline at end of file
diff --git a/api/app.py b/api/app.py
index 003e6b1..336351f 100644
--- a/api/app.py
+++ b/api/app.py
@@ -7,12 +7,34 @@ Usage:
import os
import sys
+import logging
+import time
+from collections import defaultdict, deque
from typing import Optional
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.trustedhost import TrustedHostMiddleware
+from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
+from slowapi import Limiter, _rate_limit_exceeded_handler
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
import argparse
+# Configure logging with better formatting
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ handlers=[
+ logging.StreamHandler(),
+ logging.FileHandler('/tmp/search_engine_api.log', mode='a')
+ ]
+)
+logger = logging.getLogger(__name__)
+
+# Initialize rate limiter
+limiter = Limiter(key_func=get_remote_address)
+
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -117,20 +139,44 @@ def get_query_parser() -> QueryParser:
return _query_parser
-# Create FastAPI app
+# Create FastAPI app with enhanced configuration
app = FastAPI(
title="E-Commerce Search API",
description="Configurable search engine for cross-border e-commerce",
- version="1.0.0"
+ version="1.0.0",
+ docs_url="/docs",
+ redoc_url="/redoc",
+ openapi_url="/openapi.json"
+)
+
+# Register the rate limiter on app state and install the RateLimitExceeded handler
+app.state.limiter = limiter
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+
+# Add trusted host middleware (currently allows every host; restrict to localhost and trusted domains in production)
+app.add_middleware(
+ TrustedHostMiddleware,
+ allowed_hosts=["*"] # Allow all hosts for development, restrict in production
)
-# Add CORS middleware
+# Add security headers middleware
+@app.middleware("http")
+async def add_security_headers(request: Request, call_next):
+ response = await call_next(request)
+ response.headers["X-Content-Type-Options"] = "nosniff"
+ response.headers["X-Frame-Options"] = "DENY"
+ response.headers["X-XSS-Protection"] = "1; mode=block"
+ response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
+ return response
+
+# Add CORS middleware (explicit method list; origins are still unrestricted)
app.add_middleware(
CORSMiddleware,
- allow_origins=["*"],
+ allow_origins=["*"], # Restrict in production to specific domains
allow_credentials=True,
- allow_methods=["*"],
+ allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
allow_headers=["*"],
+ expose_headers=["X-Total-Count"]
)
@@ -140,35 +186,100 @@ async def startup_event():
customer_id = os.getenv("CUSTOMER_ID", "customer1")
es_host = os.getenv("ES_HOST", "http://localhost:9200")
+ logger.info(f"Starting E-Commerce Search API")
+ logger.info(f"Customer ID: {customer_id}")
+ logger.info(f"Elasticsearch Host: {es_host}")
+
try:
init_service(customer_id=customer_id, es_host=es_host)
+ logger.info("Service initialized successfully")
except Exception as e:
- print(f"Failed to initialize service: {e}")
- print("Service will start but may not function correctly")
+ logger.error(f"Failed to initialize service: {e}")
+ logger.warning("Service will start but may not function correctly")
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+ """Cleanup on shutdown."""
+ logger.info("Shutting down E-Commerce Search API")
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
- """Global exception handler."""
+ """Global exception handler with detailed logging."""
+ client_ip = request.client.host if request.client else "unknown"
+ logger.error(f"Unhandled exception from {client_ip}: {exc}", exc_info=True)
+
return JSONResponse(
status_code=500,
content={
"error": "Internal server error",
- "detail": str(exc)
+ "detail": "An unexpected error occurred. Please try again later.",
+ "timestamp": int(time.time())
+ }
+ )
+
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException):
+ """HTTP exception handler."""
+ logger.warning(f"HTTP exception from {request.client.host if request.client else 'unknown'}: {exc.status_code} - {exc.detail}")
+
+ return JSONResponse(
+ status_code=exc.status_code,
+ content={
+ "error": exc.detail,
+ "status_code": exc.status_code,
+ "timestamp": int(time.time())
}
)
@app.get("/")
-async def root():
- """Root endpoint."""
+@limiter.limit("60/minute")
+async def root(request: Request):
+ """Root endpoint with rate limiting."""
+ client_ip = request.client.host if request.client else "unknown"
+ logger.info(f"Root endpoint accessed from {client_ip}")
+
return {
"service": "E-Commerce Search API",
"version": "1.0.0",
- "status": "running"
+ "status": "running",
+ "timestamp": int(time.time())
}
+@app.get("/health")
+@limiter.limit("120/minute")
+async def health_check(request: Request):
+ """Health check endpoint."""
+ try:
+ # Check if services are initialized
+ get_config()
+ get_es_client()
+
+ return {
+ "status": "healthy",
+ "services": {
+ "config": "initialized",
+ "elasticsearch": "connected",
+ "searcher": "initialized"
+ },
+ "timestamp": int(time.time())
+ }
+ except Exception as e:
+ logger.error(f"Health check failed: {e}")
+ return JSONResponse(
+ status_code=503,
+ content={
+ "status": "unhealthy",
+ "error": str(e),
+ "timestamp": int(time.time())
+ }
+ )
+
+
# Include routers
from .routes import search, admin
diff --git a/api/routes/search.py b/api/routes/search.py
index af6528a..78735d7 100644
--- a/api/routes/search.py
+++ b/api/routes/search.py
@@ -33,7 +33,7 @@ async def search(request: SearchRequest):
try:
# Get searcher from app state
- from main import get_searcher
+ from api.app import get_searcher
searcher = get_searcher()
# Execute search
@@ -70,7 +70,7 @@ async def search_by_image(request: ImageSearchRequest):
Uses image embeddings to find visually similar products.
"""
try:
- from main import get_searcher
+ from api.app import get_searcher
searcher = get_searcher()
# Execute image search
@@ -101,7 +101,7 @@ async def get_document(doc_id: str):
Get a single document by ID.
"""
try:
- from main import get_searcher
+ from api.app import get_searcher
searcher = get_searcher()
doc = searcher.get_document(doc_id)
diff --git a/environment.yml b/environment.yml
index 26a713d..0af287d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -42,6 +42,8 @@ dependencies:
- uvicorn[standard]>=0.23.0
- pydantic>=2.0.0
- python-multipart>=0.0.6
+ - slowapi>=0.1.9
+ - anyio>=3.7.0
# Translation
- requests>=2.31.0
diff --git a/frontend/index.html b/frontend/index.html
index f190956..6023808 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -51,9 +51,9 @@
-
+