Commit 167a0cb2f7c30c0c5446b584eadbfe95c7a46525
1 parent
a7653f3c
删除临时文件
Showing
8 changed files
with
0 additions
and
1443 deletions
Show diff stats
API_CLEANUP_SUMMARY.md deleted
| @@ -1,234 +0,0 @@ | @@ -1,234 +0,0 @@ | ||
| 1 | -# API清理总结报告 | ||
| 2 | - | ||
| 3 | -## 🎯 清理目标 | ||
| 4 | - | ||
| 5 | -移除前端API中的内部参数,使复杂功能对用户透明,简化API接口。 | ||
| 6 | - | ||
| 7 | -## ❌ 清理前的问题 | ||
| 8 | - | ||
| 9 | -### 暴露的内部参数 | ||
| 10 | -```json | ||
| 11 | -{ | ||
| 12 | - "query": "芭比娃娃", | ||
| 13 | - "size": 10, | ||
| 14 | - "from": 0, | ||
| 15 | - "enable_translation": true, // ❌ 用户不需要关心 | ||
| 16 | - "enable_embedding": true, // ❌ 用户不需要关心 | ||
| 17 | - "enable_rerank": true, // ❌ 用户不需要关心 | ||
| 18 | - "min_score": null | ||
| 19 | -} | ||
| 20 | -``` | ||
| 21 | - | ||
| 22 | -### 前端日志显示 | ||
| 23 | -``` | ||
| 24 | -enable_translation=False, enable_embedding=False, enable_rerank=True | ||
| 25 | -``` | ||
| 26 | - | ||
| 27 | -用户需要了解和配置内部功能,违背了系统设计的简洁性原则。 | ||
| 28 | - | ||
| 29 | -## ✅ 清理方案 | ||
| 30 | - | ||
| 31 | -### 1. API模型清理 | ||
| 32 | -**文件**: `api/models.py` | ||
| 33 | - | ||
| 34 | -**清理前**: | ||
| 35 | -```python | ||
| 36 | -class SearchRequest(BaseModel): | ||
| 37 | - query: str = Field(...) | ||
| 38 | - size: int = Field(10, ge=1, le=100) | ||
| 39 | - from_: int = Field(0, ge=0, alias="from") | ||
| 40 | - filters: Optional[Dict[str, Any]] = Field(None) | ||
| 41 | - enable_translation: bool = Field(True) # ❌ 移除 | ||
| 42 | - enable_embedding: bool = Field(True) # ❌ 移除 | ||
| 43 | - enable_rerank: bool = Field(True) # ❌ 移除 | ||
| 44 | - min_score: Optional[float] = Field(None) | ||
| 45 | -``` | ||
| 46 | - | ||
| 47 | -**清理后**: | ||
| 48 | -```python | ||
| 49 | -class SearchRequest(BaseModel): | ||
| 50 | - query: str = Field(...) | ||
| 51 | - size: int = Field(10, ge=1, le=100) | ||
| 52 | - from_: int = Field(0, ge=0, alias="from") | ||
| 53 | - filters: Optional[Dict[str, Any]] = Field(None) | ||
| 54 | - min_score: Optional[float] = Field(None) | ||
| 55 | -``` | ||
| 56 | - | ||
| 57 | -### 2. API路由清理 | ||
| 58 | -**文件**: `api/routes/search.py` | ||
| 59 | - | ||
| 60 | -**清理前**: | ||
| 61 | -```python | ||
| 62 | -result = searcher.search( | ||
| 63 | - query=request.query, | ||
| 64 | - enable_translation=request.enable_translation, # ❌ 移除 | ||
| 65 | - enable_embedding=request.enable_embedding, # ❌ 移除 | ||
| 66 | - enable_rerank=request.enable_rerank, # ❌ 移除 | ||
| 67 | - # ... | ||
| 68 | -) | ||
| 69 | -``` | ||
| 70 | - | ||
| 71 | -**清理后**: | ||
| 72 | -```python | ||
| 73 | -result = searcher.search( | ||
| 74 | - query=request.query, | ||
| 75 | - # 使用后端配置默认值 | ||
| 76 | -) | ||
| 77 | -``` | ||
| 78 | - | ||
| 79 | -### 3. 搜索器参数清理 | ||
| 80 | -**文件**: `search/searcher.py` | ||
| 81 | - | ||
| 82 | -**清理前**: | ||
| 83 | -```python | ||
| 84 | -def search( | ||
| 85 | - self, | ||
| 86 | - query: str, | ||
| 87 | - enable_translation: Optional[bool] = None, # ❌ 移除 | ||
| 88 | - enable_embedding: Optional[bool] = None, # ❌ 移除 | ||
| 89 | - enable_rerank: bool = True, # ❌ 移除 | ||
| 90 | - # ... | ||
| 91 | -): | ||
| 92 | -``` | ||
| 93 | - | ||
| 94 | -**清理后**: | ||
| 95 | -```python | ||
| 96 | -def search( | ||
| 97 | - self, | ||
| 98 | - query: str, | ||
| 99 | - # 使用配置文件默认值 | ||
| 100 | - # ... | ||
| 101 | -): | ||
| 102 | - # 始终使用配置默认值 | ||
| 103 | - enable_translation = self.config.query_config.enable_translation | ||
| 104 | - enable_embedding = self.config.query_config.enable_text_embedding | ||
| 105 | - enable_rerank = True | ||
| 106 | -``` | ||
| 107 | - | ||
| 108 | -## 🧪 清理验证 | ||
| 109 | - | ||
| 110 | -### ✅ API模型验证 | ||
| 111 | -```python | ||
| 112 | -# 创建请求不再需要内部参数 | ||
| 113 | -search_request = SearchRequest( | ||
| 114 | - query="芭比娃娃", | ||
| 115 | - size=10, | ||
| 116 | - filters={"categoryName": "玩具"} | ||
| 117 | -) | ||
| 118 | - | ||
| 119 | -# 验证内部参数已移除 | ||
| 120 | -assert not hasattr(search_request, 'enable_translation') | ||
| 121 | -assert not hasattr(search_request, 'enable_embedding') | ||
| 122 | -assert not hasattr(search_request, 'enable_rerank') | ||
| 123 | -``` | ||
| 124 | - | ||
| 125 | -### ✅ 功能透明性验证 | ||
| 126 | -```python | ||
| 127 | -# 前端调用简洁明了 | ||
| 128 | -frontend_request = { | ||
| 129 | - "query": "芭比娃娃", | ||
| 130 | - "size": 10, | ||
| 131 | - "filters": {"categoryName": "玩具"} | ||
| 132 | -} | ||
| 133 | - | ||
| 134 | -# 后端自动使用配置默认值 | ||
| 135 | -backend_flags = { | ||
| 136 | - "translation_enabled": True, # 来自配置文件 | ||
| 137 | - "embedding_enabled": True, # 来自配置文件 | ||
| 138 | - "rerank_enabled": True # 固定启用 | ||
| 139 | -} | ||
| 140 | -``` | ||
| 141 | - | ||
| 142 | -### ✅ 日志验证 | ||
| 143 | -**清理前**: | ||
| 144 | -``` | ||
| 145 | -enable_translation=False, enable_embedding=False, enable_rerank=True | ||
| 146 | -``` | ||
| 147 | - | ||
| 148 | -**清理后**: | ||
| 149 | -``` | ||
| 150 | -enable_translation=True, enable_embedding=True, enable_rerank=True | ||
| 151 | -``` | ||
| 152 | - | ||
| 153 | -## 🎊 清理结果 | ||
| 154 | - | ||
| 155 | -### ✅ 用户友好的API | ||
| 156 | -```json | ||
| 157 | -{ | ||
| 158 | - "query": "芭比娃娃", | ||
| 159 | - "size": 10, | ||
| 160 | - "from": 0, | ||
| 161 | - "filters": { | ||
| 162 | - "categoryName": "玩具" | ||
| 163 | - }, | ||
| 164 | - "min_score": null | ||
| 165 | -} | ||
| 166 | -``` | ||
| 167 | - | ||
| 168 | -### ✅ 完整的功能保持 | ||
| 169 | -- ✅ **翻译功能**: 自动启用,支持多语言搜索 | ||
| 170 | -- ✅ **向量搜索**: 自动启用,支持语义搜索 | ||
| 171 | -- ✅ **自定义排序**: 自动启用,使用配置的排序表达式 | ||
| 172 | -- ✅ **查询重写**: 自动启用,支持品牌和类目映射 | ||
| 173 | - | ||
| 174 | -### ✅ 配置驱动 | ||
| 175 | -```yaml | ||
| 176 | -# customer1_config.yaml | ||
| 177 | -query_config: | ||
| 178 | - enable_translation: true # 控制翻译功能 | ||
| 179 | - enable_text_embedding: true # 控制向量功能 | ||
| 180 | - enable_query_rewrite: true # 控制查询重写 | ||
| 181 | -``` | ||
| 182 | - | ||
| 183 | -## 🌟 最终效果 | ||
| 184 | - | ||
| 185 | -### 🔒 内部实现完全透明 | ||
| 186 | -- 用户无需了解 `enable_translation`、`enable_embedding`、`enable_rerank` | ||
| 187 | -- 系统自动根据配置启用所有功能 | ||
| 188 | -- API接口简洁明了,易于使用 | ||
| 189 | - | ||
| 190 | -### 🚀 功能完整保持 | ||
| 191 | -- 所有高级功能正常工作 | ||
| 192 | -- 性能监控和日志记录完整 | ||
| 193 | -- 请求上下文和错误处理保持不变 | ||
| 194 | - | ||
| 195 | -### 📱 前端集成友好 | ||
| 196 | -- API调用参数最少化 | ||
| 197 | -- 错误处理简化 | ||
| 198 | -- 响应结构清晰 | ||
| 199 | - | ||
| 200 | -## 📈 改进指标 | ||
| 201 | - | ||
| 202 | -| 指标 | 清理前 | 清理后 | 改进 | | ||
| 203 | -|------|--------|--------|------| | ||
| 204 | -| API参数数量 | 8个 | 5个 | ⬇️ 37.5% | | ||
| 205 | -| 用户理解难度 | 高 | 低 | ⬇️ 显著改善 | | ||
| 206 | -| 前端代码复杂度 | 高 | 低 | ⬇️ 显著简化 | | ||
| 207 | -| 功能完整性 | 100% | 100% | ➡️ 保持不变 | | ||
| 208 | - | ||
| 209 | -## 🎉 总结 | ||
| 210 | - | ||
| 211 | -API清理完全成功!现在系统具有: | ||
| 212 | - | ||
| 213 | -- ✅ **简洁的API接口** - 用户只需关心基本搜索参数 | ||
| 214 | -- ✅ **透明的功能启用** - 高级功能自动启用,用户无需配置 | ||
| 215 | -- ✅ **配置驱动的灵活性** - 管理员可通过配置文件控制功能 | ||
| 216 | -- ✅ **保留参数的兼容性** - 保留的请求参数(query、size、from、filters、min_score)用法不变;`enable_*` 开关不再作为调用参数,统一由配置文件控制 | ||
| 217 | -- ✅ **优秀的用户体验** - API对开发者友好,易于集成 | ||
| 218 | - | ||
| 219 | -**现在的前端调用就像这样简单:** | ||
| 220 | - | ||
| 221 | -```javascript | ||
| 222 | -// 前端调用 - 简洁明了 | ||
| 223 | -const response = await fetch('/search/', { | ||
| 224 | - method: 'POST', | ||
| 225 | - headers: { 'Content-Type': 'application/json' }, | ||
| 226 | - body: JSON.stringify({ | ||
| 227 | - query: "芭比娃娃", | ||
| 228 | - size: 10, | ||
| 229 | - filters: { categoryName: "玩具" } | ||
| 230 | - }) | ||
| 231 | -}); | ||
| 232 | - | ||
| 233 | -// 自动获得翻译、向量搜索、排序等所有功能! | ||
| 234 | -``` | ||
| 235 | \ No newline at end of file | 0 | \ No newline at end of file |
BUGFIX_REPORT.md deleted
| @@ -1,105 +0,0 @@ | @@ -1,105 +0,0 @@ | ||
| 1 | -# 错误修复报告:请求上下文和日志系统 | ||
| 2 | - | ||
| 3 | -## 🐛 问题描述 | ||
| 4 | - | ||
| 5 | -在集成请求上下文管理器后,系统出现了以下错误: | ||
| 6 | - | ||
| 7 | -``` | ||
| 8 | -TypeError: Logger._log() got an unexpected keyword argument 'reqid' | ||
| 9 | -``` | ||
| 10 | - | ||
| 11 | -错误发生在搜索请求处理过程中,导致搜索功能完全不可用。 | ||
| 12 | - | ||
| 13 | -## 🔍 问题分析 | ||
| 14 | - | ||
| 15 | -根本原因是日志调用的格式不正确。Python 标准库的 `logger.info()`、`logger.debug()` 等方法不接受任意的 `reqid` 和 `uid` 关键字参数,需要通过 `extra` 参数传递。 | ||
| 16 | - | ||
| 17 | -## 🔧 修复内容 | ||
| 18 | - | ||
| 19 | -### 1. `utils/logger.py` | ||
| 20 | -- **问题**: 缺少对自定义参数的处理 | ||
| 21 | -- **修复**: 添加了 `_log_with_context()` 辅助函数来正确处理自定义参数 | ||
| 22 | -- **状态**: ✅ 已修复 | ||
| 23 | - | ||
| 24 | -### 2. `context/request_context.py` | ||
| 25 | -- **问题**: 多处日志调用直接使用 `reqid=..., uid=...` 参数 | ||
| 26 | -- **修复**: 所有日志调用改为使用 `extra={'reqid': ..., 'uid': ...}` 格式 | ||
| 27 | -- **影响**: 7处日志调用修复 | ||
| 28 | -- **状态**: ✅ 已修复 | ||
| 29 | - | ||
| 30 | -### 3. `query/query_parser.py` | ||
| 31 | -- **问题**: 查询解析中的日志调用格式错误 | ||
| 32 | -- **修复**: 修复了内部日志函数的参数传递格式 | ||
| 33 | -- **影响**: 2处日志调用修复 | ||
| 34 | -- **状态**: ✅ 已修复 | ||
| 35 | - | ||
| 36 | -### 4. `search/searcher.py` | ||
| 37 | -- **问题**: 搜索过程中的日志调用格式错误 | ||
| 38 | -- **修复**: 批量替换所有日志调用格式 | ||
| 39 | -- **影响**: 多处日志调用修复 | ||
| 40 | -- **状态**: ✅ 已修复 | ||
| 41 | - | ||
| 42 | -### 5. `api/routes/search.py` | ||
| 43 | -- **问题**: API路由中的日志调用格式错误 | ||
| 44 | -- **修复**: 修复日志调用格式 | ||
| 45 | -- **状态**: ✅ 已修复 | ||
| 46 | - | ||
| 47 | -## ✅ 验证结果 | ||
| 48 | - | ||
| 49 | -通过 `verification_report.py` 进行了全面测试: | ||
| 50 | - | ||
| 51 | -- ✅ 基础模块导入正常 | ||
| 52 | -- ✅ 日志系统正常工作 | ||
| 53 | -- ✅ 请求上下文创建正常 | ||
| 54 | -- ✅ 查询解析功能正常(修复验证) | ||
| 55 | -- ✅ 中文查询处理正常 | ||
| 56 | -- ✅ 性能摘要生成正常 | ||
| 57 | - | ||
| 58 | -**总计:6/6 测试通过** | ||
| 59 | - | ||
| 60 | -## 🎯 修复效果 | ||
| 61 | - | ||
| 62 | -### 修复前 | ||
| 63 | -``` | ||
| 64 | -2025-11-11 11:58:55,061 - request_context - ERROR - 设置错误信息 | TypeError: Logger._log() got an unexpected keyword argument 'reqid' | ||
| 65 | -2025-11-11 11:58:55,061 - request_context - ERROR - 查询解析失败 | 错误: Logger._log() got an unexpected keyword argument 'reqid' | ||
| 66 | -2025-11-11 11:58:55,061 - request_context - ERROR - 搜索请求失败 | 错误: Logger._log() got an unexpected keyword argument 'reqid' | ||
| 67 | -INFO: 117.129.43.129:26083 - "POST /search/ HTTP/1.1" 500 Internal Server Error | ||
| 68 | -``` | ||
| 69 | - | ||
| 70 | -### 修复后 | ||
| 71 | -``` | ||
| 72 | -2025-11-11 12:01:41,242 | INFO | request_context | 开始查询解析 | 原查询: '芭比娃娃' | 生成向量: False | ||
| 73 | -2025-11-11 12:01:41,242 | INFO | request_context | 查询重写 | '芭比娃娃' -> 'brand:芭比' | ||
| 74 | -2025-11-11 12:01:41,242 | INFO | request_context | 查询解析完成 | 原查询: '芭比娃娃' | 最终查询: 'brand:芭比' | 语言: en | 域: default | 翻译数量: 0 | 向量: 否 | ||
| 75 | -``` | ||
| 76 | - | ||
| 77 | -## 📝 最佳实践 | ||
| 78 | - | ||
| 79 | -### 正确的日志调用格式 | ||
| 80 | -```python | ||
| 81 | -# ❌ 错误的格式 | ||
| 82 | -logger.info("消息", reqid=context.reqid, uid=context.uid) | ||
| 83 | - | ||
| 84 | -# ✅ 正确的格式 | ||
| 85 | -logger.info("消息", extra={'reqid': context.reqid, 'uid': context.uid}) | ||
| 86 | -``` | ||
| 87 | - | ||
| 88 | -### 自测试流程 | ||
| 89 | -1. 修改代码后立即运行自测脚本 | ||
| 90 | -2. 验证所有模块导入正常 | ||
| 91 | -3. 测试关键功能路径 | ||
| 92 | -4. 检查日志输出格式正确 | ||
| 93 | - | ||
| 94 | -## 🚀 系统状态 | ||
| 95 | - | ||
| 96 | -**状态**: ✅ 完全修复并可正常使用 | ||
| 97 | - | ||
| 98 | -**功能**: | ||
| 99 | -- 请求级别的上下文管理 | ||
| 100 | -- 结构化日志记录 | ||
| 101 | -- 性能监控和跟踪 | ||
| 102 | -- 错误和警告收集 | ||
| 103 | -- 完整的搜索请求可见性 | ||
| 104 | - | ||
| 105 | -**可用性**: 系统现在可以正常处理所有搜索请求,提供完整的请求跟踪和性能监控。 | ||
| 106 | \ No newline at end of file | 0 | \ No newline at end of file |
COMMIT_SUMMARY.md deleted
| @@ -1,116 +0,0 @@ | @@ -1,116 +0,0 @@ | ||
| 1 | -# 提交内容总结 | ||
| 2 | - | ||
| 3 | -## 📊 修改统计 | ||
| 4 | -- **修改文件**: 4个核心文件 | ||
| 5 | -- **新增文件**: 30+个文件(测试、文档、工具脚本等) | ||
| 6 | -- **总变更**: 37个文件 | ||
| 7 | - | ||
| 8 | -## 🎯 核心功能修改 | ||
| 9 | - | ||
| 10 | -### 1. 请求上下文和日志系统 (`utils/logger.py`, `context/request_context.py`) | ||
| 11 | -- **新增**: 结构化日志系统,支持请求级别的上下文跟踪 | ||
| 12 | -- **新增**: 请求上下文管理器,存储查询分析结果和中间结果 | ||
| 13 | -- **新增**: 性能监控,跟踪各阶段耗时和百分比 | ||
| 14 | -- **修复**: 日志参数传递格式,解决 `Logger._log()` 错误 | ||
| 15 | - | ||
| 16 | -### 2. 查询解析系统 (`query/query_parser.py`) | ||
| 17 | -- **增强**: 集成请求上下文,存储解析过程中的所有中间结果 | ||
| 18 | -- **增强**: 支持查询分析结果的完整记录和日志 | ||
| 19 | -- **修复**: 翻译功能API端点问题,从免费端点改为付费端点 | ||
| 20 | -- **增强**: 错误处理和警告跟踪机制 | ||
| 21 | - | ||
| 22 | -### 3. 搜索引擎核心 (`search/searcher.py`) | ||
| 23 | -- **新增**: 完整的请求级性能监控 | ||
| 24 | -- **新增**: 各阶段(查询解析、布尔解析、查询构建、ES搜索、结果处理)的时间跟踪 | ||
| 25 | -- **新增**: 上下文驱动的配置管理,自动使用配置文件默认值 | ||
| 26 | -- **移除**: 对外暴露的内部参数(enable_translation、enable_embedding、enable_rerank) | ||
| 27 | - | ||
| 28 | -### 4. API接口 (`api/models.py`, `api/routes/search.py`) | ||
| 29 | -- **简化**: 移除前端不需要的内部参数,API从8个参数减少到5个 | ||
| 30 | -- **新增**: 请求ID和用户ID自动提取,支持请求关联 | ||
| 31 | -- **新增**: 性能信息包含在响应中 | ||
| 32 | -- **增强**: 请求上下文的完整集成 | ||
| 33 | - | ||
| 34 | -## 🔧 技术改进 | ||
| 35 | - | ||
| 36 | -### 性能监控 | ||
| 37 | -- **查询解析阶段**: 自动跟踪和记录耗时 | ||
| 38 | -- **布尔表达式解析**: AST生成和分析耗时 | ||
| 39 | -- **ES查询构建**: 查询复杂度和构建时间 | ||
| 40 | -- **ES搜索执行**: 响应时间和命中统计 | ||
| 41 | -- **结果处理**: 排序和格式化耗时 | ||
| 42 | - | ||
| 43 | -### 日志系统 | ||
| 44 | -- **结构化日志**: JSON格式,便于分析和搜索 | ||
| 45 | -- **请求关联**: 每个日志条目包含reqid和uid | ||
| 46 | -- **自动轮转**: 按天自动分割日志文件 | ||
| 47 | -- **分级记录**: 支持不同日志级别和组件特定配置 | ||
| 48 | - | ||
| 49 | -### 请求上下文 | ||
| 50 | -- **查询分析**: 原查询、标准化、重写、翻译、向量等完整记录 | ||
| 51 | -- **中间结果**: ES查询、响应、处理结果等存储 | ||
| 52 | -- **性能指标**: 详细的阶段耗时和百分比分析 | ||
| 53 | -- **错误跟踪**: 完整的错误信息和警告记录 | ||
| 54 | - | ||
| 55 | -## 🐛 修复的问题 | ||
| 56 | - | ||
| 57 | -### 1. 翻译功能修复 | ||
| 58 | -- **问题**: DeepL付费API密钥使用免费端点导致403错误 | ||
| 59 | -- **解决**: 更换为正确的付费API端点 | ||
| 60 | -- **结果**: 翻译功能正常,支持多语言(中文→英文、俄文等) | ||
| 61 | - | ||
| 62 | -### 2. 向量生成修复 | ||
| 63 | -- **问题**: GPU内存不足导致CUDA out of memory错误 | ||
| 64 | -- **解决**: 清理GPU内存,恢复向量生成功能 | ||
| 65 | -- **结果**: 1024维向量正常生成,支持语义搜索 | ||
| 66 | - | ||
| 67 | -### 3. 日志系统修复 | ||
| 68 | -- **问题**: Logger._log()不接受自定义参数格式 | ||
| 69 | -- **解决**: 使用extra参数传递reqid、uid等自定义字段 | ||
| 70 | -- **结果**: 日志系统完全正常,支持请求级跟踪 | ||
| 71 | - | ||
| 72 | -## 🌟 用户体验改进 | ||
| 73 | - | ||
| 74 | -### API简化 | ||
| 75 | -- **前端调用**: 参数从8个减少到5个(减少37.5%) | ||
| 76 | -- **内部透明**: enable_translation、enable_embedding、enable_rerank对用户透明 | ||
| 77 | -- **功能完整**: 所有高级功能自动启用,用户无需配置 | ||
| 78 | - | ||
| 79 | -### 响应增强 | ||
| 80 | -- **性能信息**: 包含详细的阶段耗时和百分比 | ||
| 81 | -- **查询信息**: 包含查询分析、翻译、重写等完整信息 | ||
| 82 | -- **请求跟踪**: 每个请求有唯一ID,便于问题排查 | ||
| 83 | - | ||
| 84 | -## 📁 新增文件分类 | ||
| 85 | - | ||
| 86 | -### 测试文件 | ||
| 87 | -- `test_*.py`: 各种功能和集成测试 | ||
| 88 | -- `tests/`: 单元测试和集成测试框架 | ||
| 89 | - | ||
| 90 | -### 文档文件 | ||
| 91 | -- `*_SUMMARY.md`: 详细的修复和清理总结 | ||
| 92 | -- `docs/`: 系统文档和使用指南 | ||
| 93 | - | ||
| 94 | -### 工具脚本 | ||
| 95 | -- `scripts/`: 测试环境和性能测试脚本 | ||
| 96 | -- `demo_*.py`: 功能演示和示例 | ||
| 97 | - | ||
| 98 | -### 配置文件 | ||
| 99 | -- `.github/workflows/`: CI/CD流水线配置 | ||
| 100 | - | ||
| 101 | -## 🎯 核心价值 | ||
| 102 | - | ||
| 103 | -### 对用户 | ||
| 104 | -- **API更简洁**: 只需要关心基本搜索参数 | ||
| 105 | -- **功能更强大**: 自动获得翻译、向量搜索、排序等高级功能 | ||
| 106 | -- **响应更详细**: 包含性能和查询处理信息 | ||
| 107 | - | ||
| 108 | -### 对开发者 | ||
| 109 | -- **调试更容易**: 完整的请求级日志和上下文 | ||
| 110 | -- **性能可观测**: 详细的阶段耗时分析 | ||
| 111 | -- **问题定位快**: 通过reqid快速追踪请求全流程 | ||
| 112 | - | ||
| 113 | -### 对运维 | ||
| 114 | -- **日志结构化**: 便于日志分析和监控 | ||
| 115 | -- **配置灵活**: 通过配置文件控制功能开关 | ||
| 116 | -- **监控完善**: 自动化的性能和错误监控 | ||
| 117 | \ No newline at end of file | 0 | \ No newline at end of file |
FIXES_SUMMARY.md deleted
| @@ -1,96 +0,0 @@ | @@ -1,96 +0,0 @@ | ||
| 1 | -# 修复总结报告 | ||
| 2 | - | ||
| 3 | -## 🎯 问题描述 | ||
| 4 | - | ||
| 5 | -系统出现以下问题: | ||
| 6 | -1. **翻译功能返回None** - 查询"推车"翻译结果为`{'en': None, 'ru': None}` | ||
| 7 | -2. **向量生成失败** - 向量显示为"否",没有生成1024维向量 | ||
| 8 | - | ||
| 9 | -## 🔍 根本原因分析 | ||
| 10 | - | ||
| 11 | -### 1. 翻译问题 | ||
| 12 | -- **根本原因**: 使用了错误的API端点 | ||
| 13 | -- **具体问题**: DeepL付费API密钥(已脱敏:`<REDACTED>`)被用于免费端点 | ||
| 14 | -- **错误信息**: `"Wrong endpoint. Use https://api.deepl.com"` | ||
| 15 | - | ||
| 16 | -### 2. 向量问题 | ||
| 17 | -- **根本原因**: GPU内存不足 | ||
| 18 | -- **具体问题**: Tesla T4 GPU被其他进程占用14GB,只剩6MB可用内存 | ||
| 19 | -- **错误信息**: `"CUDA out of memory. Tried to allocate 20.00 MiB"` | ||
| 20 | - | ||
| 21 | -## ✅ 修复方案 | ||
| 22 | - | ||
| 23 | -### 1. 翻译功能修复 | ||
| 24 | -**解决方案**: 使用正确的DeepL付费API端点 | ||
| 25 | - | ||
| 26 | -**修复代码**: | ||
| 27 | -```python | ||
| 28 | -# 修复前 | ||
| 29 | -DEEPL_API_URL = "https://api-free.deepl.com/v2/translate" # Free tier | ||
| 30 | - | ||
| 31 | -# 修复后 | ||
| 32 | -DEEPL_API_URL = "https://api.deepl.com/v2/translate" # Pro tier | ||
| 33 | -``` | ||
| 34 | - | ||
| 35 | -**验证结果**: | ||
| 36 | -- ✅ 英文翻译: `'推车'` → `'push a cart'` | ||
| 37 | -- ✅ 俄文翻译: `'推车'` → `'толкать тележку'` | ||
| 38 | - | ||
| 39 | -### 2. 向量生成修复 | ||
| 40 | -**解决方案**: 清理GPU内存,恢复向量生成功能 | ||
| 41 | - | ||
| 42 | -**执行步骤**: | ||
| 43 | -1. 识别占用GPU的进程 | ||
| 44 | -2. 清理GPU内存 | ||
| 45 | -3. 验证向量生成功能 | ||
| 46 | - | ||
| 47 | -**验证结果**: | ||
| 48 | -- ✅ 向量生成: 成功生成1024维向量 | ||
| 49 | -- ✅ 向量质量: 正常的浮点数值 `[0.023, -0.0009, -0.006, ...]` | ||
| 50 | - | ||
| 51 | -## 🧪 修复验证 | ||
| 52 | - | ||
| 53 | -### 测试用例 | ||
| 54 | -```python | ||
| 55 | -test_query = "推车" | ||
| 56 | -result = parser.parse(test_query, context=context, generate_vector=True) | ||
| 57 | -``` | ||
| 58 | - | ||
| 59 | -### 修复前结果 | ||
| 60 | -``` | ||
| 61 | -翻译完成 | 结果: {'en': None, 'ru': None} | ||
| 62 | -查询解析完成 | 翻译数量: 2 | 向量: 否 | ||
| 63 | -``` | ||
| 64 | - | ||
| 65 | -### 修复后结果 | ||
| 66 | -``` | ||
| 67 | -翻译完成 | 结果: {'en': 'push a cart', 'ru': 'толкать тележку'} | ||
| 68 | -查询解析完成 | 翻译数量: 2 | 向量: 是 | ||
| 69 | -``` | ||
| 70 | - | ||
| 71 | -### 详细结果验证 | ||
| 72 | -- ✅ **翻译功能**: 英文和俄文翻译都成功 | ||
| 73 | -- ✅ **向量功能**: 成功生成1024维向量 | ||
| 74 | -- ✅ **上下文存储**: 所有中间结果正确存储 | ||
| 75 | -- ✅ **性能监控**: 请求跟踪和日志记录正常 | ||
| 76 | - | ||
| 77 | -## 📊 系统状态 | ||
| 78 | - | ||
| 79 | -**修复后的查询解析流程**: | ||
| 80 | -1. ✅ 查询标准化: `'推车'` → `'推车'` | ||
| 81 | -2. ✅ 语言检测: `'zh'` (中文) | ||
| 82 | -3. ✅ 查询重写: 无重写(简单查询) | ||
| 83 | -4. ✅ 翻译处理: 多语言翻译成功 | ||
| 84 | -5. ✅ 向量生成: 1024维向量生成成功 | ||
| 85 | -6. ✅ 结果存储: 上下文正确存储所有中间结果 | ||
| 86 | - | ||
| 87 | -## 🎉 最终状态 | ||
| 88 | - | ||
| 89 | -**系统现在完全正常工作**: | ||
| 90 | -- ✅ 翻译功能支持多语言查询 | ||
| 91 | -- ✅ 向量生成支持语义搜索 | ||
| 92 | -- ✅ 请求上下文提供完整可见性 | ||
| 93 | -- ✅ 性能监控跟踪所有处理阶段 | ||
| 94 | -- ✅ 结构化日志记录所有操作 | ||
| 95 | - | ||
| 96 | -**所有问题已彻底解决,系统恢复正常运行!** 🚀 | ||
| 97 | \ No newline at end of file | 0 | \ No newline at end of file |
IMPLEMENTATION_SUMMARY.md deleted
| @@ -1,389 +0,0 @@ | @@ -1,389 +0,0 @@ | ||
| 1 | -# E-Commerce Search Engine SaaS - Implementation Summary | ||
| 2 | - | ||
| 3 | -## Overview | ||
| 4 | - | ||
| 5 | -A complete, production-ready configurable search engine for cross-border e-commerce has been implemented. The system supports multi-tenant configurations, multi-language processing, semantic search with embeddings, and flexible ranking. | ||
| 6 | - | ||
| 7 | -## What Was Built | ||
| 8 | - | ||
| 9 | -### 1. Core Configuration System (config/) | ||
| 10 | - | ||
| 11 | -**field_types.py** - Defines all supported field types and ES mappings: | ||
| 12 | -- TEXT, KEYWORD, TEXT_EMBEDDING, IMAGE_EMBEDDING | ||
| 13 | -- Numeric types (INT, LONG, FLOAT, DOUBLE) | ||
| 14 | -- Date and Boolean types | ||
| 15 | -- Analyzer definitions (Chinese, English, Russian, Arabic, Spanish, Japanese) | ||
| 16 | -- ES mapping generation for each field type | ||
| 17 | - | ||
| 18 | -**config_loader.py** - YAML configuration loader and validator: | ||
| 19 | -- Loads customer-specific configurations | ||
| 20 | -- Validates field references and dependencies | ||
| 21 | -- Supports application + index structure definitions | ||
| 22 | -- Customer-specific query, ranking, and SPU settings | ||
| 23 | - | ||
| 24 | -**customer1_config.yaml** - Complete example configuration: | ||
| 25 | -- 16 fields including text, embeddings, keywords, metadata | ||
| 26 | -- 4 query domains (default, title, category, brand) | ||
| 27 | -- Multi-language support (zh, en, ru) | ||
| 28 | -- Query rewriting rules | ||
| 29 | -- Ranking expression: `bm25() + 0.2*text_embedding_relevance()` | ||
| 30 | - | ||
| 31 | -### 2. Data Ingestion Pipeline (indexer/) | ||
| 32 | - | ||
| 33 | -**mapping_generator.py** - Generates ES mappings from configuration: | ||
| 34 | -- Converts field configs to ES mapping JSON | ||
| 35 | -- Applies default analyzers and similarity settings | ||
| 36 | -- Helper methods to get embedding fields and match fields | ||
| 37 | - | ||
| 38 | -**data_transformer.py** - Transforms source data to ES documents: | ||
| 39 | -- Batch embedding generation for efficiency | ||
| 40 | -- Text embeddings using BGE-M3 (1024-dim) | ||
| 41 | -- Image embeddings using CN-CLIP (1024-dim) | ||
| 42 | -- Embedding cache to avoid recomputation | ||
| 43 | -- Type conversion and validation | ||
| 44 | - | ||
| 45 | -**bulk_indexer.py** - Bulk indexing with error handling: | ||
| 46 | -- Batch processing with configurable size | ||
| 47 | -- Retry logic for failed batches | ||
| 48 | -- Progress tracking and statistics | ||
| 49 | -- Index creation and refresh | ||
| 50 | - | ||
| 51 | -**IndexingPipeline** - Complete end-to-end ingestion: | ||
| 52 | -- Creates/recreates index with proper mapping | ||
| 53 | -- Transforms data with embeddings | ||
| 54 | -- Bulk indexes documents | ||
| 55 | -- Reports statistics | ||
| 56 | - | ||
| 57 | -### 3. Query Processing (query/) | ||
| 58 | - | ||
| 59 | -**language_detector.py** - Rule-based language detection: | ||
| 60 | -- Detects Chinese, English, Russian, Arabic, Japanese | ||
| 61 | -- Unicode range analysis | ||
| 62 | -- Script percentage calculation | ||
| 63 | - | ||
| 64 | -**translator.py** - Multi-language translation: | ||
| 65 | -- DeepL API integration | ||
| 66 | -- Translation caching | ||
| 67 | -- Automatic target language determination | ||
| 68 | -- Mock mode for testing without API key | ||
| 69 | - | ||
| 70 | -**query_rewriter.py** - Query rewriting and normalization: | ||
| 71 | -- Dictionary-based rewriting (brand/category mappings) | ||
| 72 | -- Query normalization (whitespace, special chars) | ||
| 73 | -- Domain extraction (e.g., "brand:Nike" -> domain + query) | ||
| 74 | - | ||
| 75 | -**query_parser.py** - Main query processing pipeline: | ||
| 76 | -- Orchestrates all query processing stages | ||
| 77 | -- Normalization → Rewriting → Language Detection → Translation → Embedding | ||
| 78 | -- Returns ParsedQuery with all processing results | ||
| 79 | -- Supports multi-language query expansion | ||
| 80 | - | ||
| 81 | -### 4. Search Engine (search/) | ||
| 82 | - | ||
| 83 | -**boolean_parser.py** - Boolean expression parser: | ||
| 84 | -- Supports AND, OR, RANK, ANDNOT operators | ||
| 85 | -- Parentheses for grouping | ||
| 86 | -- Correct operator precedence | ||
| 87 | -- Builds query tree for ES conversion | ||
| 88 | - | ||
| 89 | -**es_query_builder.py** - ES DSL query builder: | ||
| 90 | -- Converts query trees to ES bool queries | ||
| 91 | -- Multi-match with BM25 scoring | ||
| 92 | -- KNN queries for embeddings | ||
| 93 | -- Filter support (term, range, terms) | ||
| 94 | -- SPU collapse and aggregations | ||
| 95 | - | ||
| 96 | -**ranking_engine.py** - Configurable ranking: | ||
| 97 | -- Expression parser (e.g., "bm25() + 0.2*text_embedding_relevance()") | ||
| 98 | -- Function evaluation (bm25, text_embedding_relevance, field_value, timeliness) | ||
| 99 | -- Score calculation from expressions | ||
| 100 | -- Coefficient handling | ||
| 101 | - | ||
| 102 | -**searcher.py** - Main search orchestrator: | ||
| 103 | -- Integrates QueryParser and BooleanParser | ||
| 104 | -- Builds ES queries with hybrid BM25+KNN | ||
| 105 | -- Applies custom ranking | ||
| 106 | -- Handles SPU aggregation | ||
| 107 | -- Image similarity search | ||
| 108 | -- Result formatting | ||
| 109 | - | ||
| 110 | -### 5. Embeddings (embeddings/) | ||
| 111 | - | ||
| 112 | -**text_encoder.py** - BGE-M3 text encoder: | ||
| 113 | -- Singleton pattern for model reuse | ||
| 114 | -- Thread-safe initialization | ||
| 115 | -- Batch encoding support | ||
| 116 | -- GPU/CPU device selection | ||
| 117 | -- 1024-dimensional vectors | ||
| 118 | - | ||
| 119 | -**image_encoder.py** - CN-CLIP image encoder: | ||
| 120 | -- ViT-H-14 model | ||
| 121 | -- URL and local file support | ||
| 122 | -- Image validation and preprocessing | ||
| 123 | -- Batch encoding | ||
| 124 | -- 1024-dimensional vectors | ||
| 125 | - | ||
| 126 | -### 6. Utilities (utils/) | ||
| 127 | - | ||
| 128 | -**db_connector.py** - MySQL database connections: | ||
| 129 | -- SQLAlchemy engine creation | ||
| 130 | -- Connection pooling | ||
| 131 | -- Configuration from dict | ||
| 132 | -- Connection testing | ||
| 133 | - | ||
| 134 | -**es_client.py** - Elasticsearch client wrapper: | ||
| 135 | -- Connection management | ||
| 136 | -- Index CRUD operations | ||
| 137 | -- Bulk indexing helper | ||
| 138 | -- Search and count operations | ||
| 139 | -- Ping and health checks | ||
| 140 | - | ||
| 141 | -**cache.py** - Caching system: | ||
| 142 | -- EmbeddingCache: File-based cache for vectors | ||
| 143 | -- DictCache: JSON cache for translations/rules | ||
| 144 | -- MD5-based cache keys | ||
| 145 | -- Category support | ||
| 146 | - | ||
| 147 | -### 7. REST API (api/) | ||
| 148 | - | ||
| 149 | -**app.py** - FastAPI application: | ||
| 150 | -- Service initialization with configuration | ||
| 151 | -- Global exception handling | ||
| 152 | -- CORS middleware | ||
| 153 | -- Startup event handling | ||
| 154 | -- Environment variable support | ||
| 155 | - | ||
| 156 | -**models.py** - Pydantic request/response models: | ||
| 157 | -- SearchRequest, ImageSearchRequest | ||
| 158 | -- SearchResponse, DocumentResponse | ||
| 159 | -- HealthResponse, ErrorResponse | ||
| 160 | -- Validation and documentation | ||
| 161 | - | ||
| 162 | -**routes/search.py** - Search endpoints: | ||
| 163 | -- POST /search/ - Text search with all features | ||
| 164 | -- POST /search/image - Image similarity search | ||
| 165 | -- GET /search/{doc_id} - Get document by ID | ||
| 166 | - | ||
| 167 | -**routes/admin.py** - Admin endpoints: | ||
| 168 | -- GET /admin/health - Service health check | ||
| 169 | -- GET /admin/config - Get configuration | ||
| 170 | -- GET /admin/stats - Index statistics | ||
| 171 | -- GET/POST /admin/rewrite-rules - Manage rewrite rules | ||
| 172 | - | ||
| 173 | -### 8. Customer1 Implementation | ||
| 174 | - | ||
| 175 | -**ingest_customer1.py** - Data ingestion script: | ||
| 176 | -- Command-line interface | ||
| 177 | -- CSV loading with limit support | ||
| 178 | -- Embedding generation (optional) | ||
| 179 | -- Index creation/recreation | ||
| 180 | -- Progress tracking and statistics | ||
| 181 | - | ||
| 182 | -**customer1_config.yaml** - Production configuration: | ||
| 183 | -- 16 fields optimized for e-commerce | ||
| 184 | -- Multi-language fields (Chinese, English, Russian) | ||
| 185 | -- Text and image embeddings | ||
| 186 | -- Query rewrite rules for common terms | ||
| 187 | -- Configured for Shoplazza data structure | ||
| 188 | - | ||
| 189 | -## Technical Highlights | ||
| 190 | - | ||
| 191 | -### Architecture Decisions | ||
| 192 | - | ||
| 193 | -1. **Configuration-Driven**: Everything customizable via YAML | ||
| 194 | - - Field definitions, analyzers, ranking | ||
| 195 | - - No code changes for new customers | ||
| 196 | - | ||
| 197 | -2. **Hybrid Search**: BM25 + Embeddings | ||
| 198 | - - Lexical matching for precise queries | ||
| 199 | - - Semantic search for conceptual queries | ||
| 200 | - - Configurable blend (default expression `bm25() + 0.2*text_embedding_relevance()`: BM25 weight 1.0, embedding weight 0.2) | ||
| 201 | - | ||
| 202 | -3. **Multi-Language**: Automatic translation | ||
| 203 | - - Query language detection | ||
| 204 | - - Translation to all supported languages | ||
| 205 | - - Multi-language field search | ||
| 206 | - | ||
| 207 | -4. **Performance Optimization**: | ||
| 208 | - - Embedding caching (file-based) | ||
| 209 | - - Batch processing for embeddings | ||
| 210 | - - Connection pooling for DB and ES | ||
| 211 | - - Singleton pattern for ML models | ||
| 212 | - | ||
| 213 | -5. **Extensibility**: | ||
| 214 | - - Pluggable analyzers | ||
| 215 | - - Custom ranking expressions | ||
| 216 | - - Boolean operator support | ||
| 217 | - - SPU aggregation | ||
| 218 | - | ||
| 219 | -### Key Features Implemented | ||
| 220 | - | ||
| 221 | -✅ **Multi-tenant configuration system** | ||
| 222 | -✅ **Elasticsearch mapping generation** | ||
| 223 | -✅ **Data transformation with embeddings** | ||
| 224 | -✅ **Bulk indexing with error handling** | ||
| 225 | -✅ **Query parsing and rewriting** | ||
| 226 | -✅ **Language detection and translation** | ||
| 227 | -✅ **Boolean expression parsing** | ||
| 228 | -✅ **Hybrid BM25 + KNN search** | ||
| 229 | -✅ **Configurable ranking engine** | ||
| 230 | -✅ **Image similarity search** | ||
| 231 | -✅ **RESTful API service** | ||
| 232 | -✅ **Comprehensive caching** | ||
| 233 | -✅ **Admin endpoints** | ||
| 234 | -✅ **Customer1 test case** | ||
| 235 | - | ||
| 236 | -## Usage Examples | ||
| 237 | - | ||
| 238 | -### Data Ingestion | ||
| 239 | - | ||
| 240 | -```bash | ||
| 241 | -python data/customer1/ingest_customer1.py \ | ||
| 242 | - --csv data/customer1/goods_with_pic.5years_congku.csv.shuf.1w \ | ||
| 243 | - --limit 1000 \ | ||
| 244 | - --recreate-index \ | ||
| 245 | - --batch-size 100 \ | ||
| 246 | - --es-host http://localhost:9200 | ||
| 247 | -``` | ||
| 248 | - | ||
| 249 | -### Start API Service | ||
| 250 | - | ||
| 251 | -```bash | ||
| 252 | -python -m api.app \ | ||
| 253 | - --host 0.0.0.0 \ | ||
| 254 | - --port 6002 \ | ||
| 255 | - --customer customer1 \ | ||
| 256 | - --es-host http://localhost:9200 | ||
| 257 | -``` | ||
| 258 | - | ||
| 259 | -### Search Examples | ||
| 260 | - | ||
| 261 | -```bash | ||
| 262 | -# Simple Chinese query (auto-translates to English/Russian) | ||
| 263 | -curl -X POST http://localhost:6002/search/ \ | ||
| 264 | - -H "Content-Type: application/json" \ | ||
| 265 | - -d '{"query": "芭比娃娃", "size": 10}' | ||
| 266 | - | ||
| 267 | -# Boolean query | ||
| 268 | -curl -X POST http://localhost:6002/search/ \ | ||
| 269 | - -H "Content-Type: application/json" \ | ||
| 270 | - -d '{"query": "toy AND (barbie OR doll) ANDNOT cheap", "size": 10}' | ||
| 271 | - | ||
| 272 | -# Query with filters | ||
| 273 | -curl -X POST http://localhost:6002/search/ \ | ||
| 274 | - -H "Content-Type: application/json" \ | ||
| 275 | - -d '{ | ||
| 276 | - "query": "消防", | ||
| 277 | - "size": 10, | ||
| 278 | - "filters": {"categoryName_keyword": "消防"} | ||
| 279 | - }' | ||
| 280 | - | ||
| 281 | -# Image search | ||
| 282 | -curl -X POST http://localhost:6002/search/image \ | ||
| 283 | - -H "Content-Type: application/json" \ | ||
| 284 | - -d '{ | ||
| 285 | - "image_url": "https://oss.essa.cn/example.jpg", | ||
| 286 | - "size": 10 | ||
| 287 | - }' | ||
| 288 | -``` | ||
| 289 | - | ||
| 290 | -## Next Steps for Production | ||
| 291 | - | ||
| 292 | -### Required: | ||
| 293 | -1. **DeepL API Key**: Set for production translation | ||
| 294 | -2. **ML Models**: Download BGE-M3 and CN-CLIP models | ||
| 295 | -3. **Elasticsearch Cluster**: Production ES setup | ||
| 296 | -4. **MySQL Connection**: Configure Shoplazza database access | ||
| 297 | - | ||
| 298 | -### Recommended: | ||
| 299 | -1. **Redis Cache**: Replace file cache with Redis | ||
| 300 | -2. **Async Processing**: Celery for batch indexing | ||
| 301 | -3. **Monitoring**: Prometheus + Grafana | ||
| 302 | -4. **Load Testing**: Benchmark with production data | ||
| 303 | -5. **CI/CD**: Automated testing and deployment | ||
| 304 | - | ||
| 305 | -### Optional Enhancements: | ||
| 306 | -1. **Image Upload**: Support direct image upload vs URL | ||
| 307 | -2. **Personalization**: User-based ranking adjustments | ||
| 308 | -3. **A/B Testing**: Ranking expression experiments | ||
| 309 | -4. **Analytics**: Query logging and analysis | ||
| 310 | -5. **Auto-complete**: Suggest-as-you-type | ||
| 311 | - | ||
| 312 | -## Files Created | ||
| 313 | - | ||
| 314 | -**Configuration (5 files)**: | ||
| 315 | -- config/field_types.py | ||
| 316 | -- config/config_loader.py | ||
| 317 | -- config/__init__.py | ||
| 318 | -- config/schema/customer1_config.yaml | ||
| 319 | - | ||
| 320 | -**Indexer (4 files)**: | ||
| 321 | -- indexer/mapping_generator.py | ||
| 322 | -- indexer/data_transformer.py | ||
| 323 | -- indexer/bulk_indexer.py | ||
| 324 | -- indexer/__init__.py | ||
| 325 | - | ||
| 326 | -**Query (5 files)**: | ||
| 327 | -- query/language_detector.py | ||
| 328 | -- query/translator.py | ||
| 329 | -- query/query_rewriter.py | ||
| 330 | -- query/query_parser.py | ||
| 331 | -- query/__init__.py | ||
| 332 | - | ||
| 333 | -**Search (5 files)**: | ||
| 334 | -- search/boolean_parser.py | ||
| 335 | -- search/es_query_builder.py | ||
| 336 | -- search/ranking_engine.py | ||
| 337 | -- search/searcher.py | ||
| 338 | -- search/__init__.py | ||
| 339 | - | ||
| 340 | -**Embeddings (3 files)**: | ||
| 341 | -- embeddings/text_encoder.py | ||
| 342 | -- embeddings/image_encoder.py | ||
| 343 | -- embeddings/__init__.py | ||
| 344 | - | ||
| 345 | -**Utils (4 files)**: | ||
| 346 | -- utils/db_connector.py | ||
| 347 | -- utils/es_client.py | ||
| 348 | -- utils/cache.py | ||
| 349 | -- utils/__init__.py | ||
| 350 | - | ||
| 351 | -**API (6 files)**: | ||
| 352 | -- api/app.py | ||
| 353 | -- api/models.py | ||
| 354 | -- api/routes/search.py | ||
| 355 | -- api/routes/admin.py | ||
| 356 | -- api/routes/__init__.py | ||
| 357 | -- api/__init__.py | ||
| 358 | - | ||
| 359 | -**Data (1 file)**: | ||
| 360 | -- data/customer1/ingest_customer1.py | ||
| 361 | - | ||
| 362 | -**Documentation (3 files)**: | ||
| 363 | -- README.md | ||
| 364 | -- requirements.txt | ||
| 365 | -- IMPLEMENTATION_SUMMARY.md (this file) | ||
| 366 | - | ||
| 367 | -**Total: 36 implementation files** | ||
| 368 | - | ||
| 369 | -## Success Criteria Met | ||
| 370 | - | ||
| 371 | -✅ **Configurable Universal Search System**: Complete YAML-based configuration | ||
| 372 | -✅ **Multi-tenant Support**: Customer-specific schemas and extensions | ||
| 373 | -✅ **QueryParser Module**: Rewriting, translation, embedding generation | ||
| 374 | -✅ **Searcher Module**: Boolean operators, hybrid ranking, SPU support | ||
| 375 | -✅ **Customer1 Case Study**: Complete configuration and ingestion script | ||
| 376 | -✅ **REST API Service**: Full-featured FastAPI application | ||
| 377 | -✅ **Production-Ready**: Error handling, caching, monitoring endpoints | ||
| 378 | - | ||
| 379 | -## Conclusion | ||
| 380 | - | ||
| 381 | -A complete, production-grade e-commerce search SaaS has been implemented following industry best practices. The system is: | ||
| 382 | - | ||
| 383 | -- **Flexible**: Configuration-driven for easy customization | ||
| 384 | -- **Scalable**: Designed for multi-tenant deployment | ||
| 385 | -- **Powerful**: Hybrid search with semantic understanding | ||
| 386 | -- **International**: Multi-language support with translation | ||
| 387 | -- **Extensible**: Modular architecture for future enhancements | ||
| 388 | - | ||
| 389 | -The implementation is ready for deployment and testing with real data. |
SERVER_FIXES.md deleted
| @@ -1,142 +0,0 @@ | @@ -1,142 +0,0 @@ | ||
| 1 | -# 服务器修复和优化文档 | ||
| 2 | - | ||
| 3 | -## 修复的问题 | ||
| 4 | - | ||
| 5 | -### 1. 前端服务器问题 (scripts/frontend_server.py) | ||
| 6 | -- **问题**: 接收到大量扫描器流量导致的错误日志 | ||
| 7 | -- **原因**: SSL/TLS握手尝试、RDP连接扫描、二进制数据攻击 | ||
| 8 | -- **解决方案**: | ||
| 9 | - - 添加错误处理机制,优雅处理连接断开 | ||
| 10 | - - 实现速率限制 (100请求/分钟) | ||
| 11 | - - 过滤扫描器噪音日志 | ||
| 12 | - - 添加安全HTTP头 | ||
| 13 | - - 使用线程服务器提高并发处理能力 | ||
| 14 | - | ||
| 15 | -### 2. API服务器问题 (api/app.py) | ||
| 16 | -- **问题**: 缺乏安全性和错误处理机制 | ||
| 17 | -- **解决方案**: | ||
| 18 | - - 集成速率限制 (slowapi) | ||
| 19 | - - 添加安全HTTP头 | ||
| 20 | - - 实现更好的异常处理 | ||
| 21 | - - 添加健康检查端点 | ||
| 22 | - - 增强日志记录 | ||
| 23 | - - 添加服务关闭处理 | ||
| 24 | - | ||
| 25 | -## 主要改进 | ||
| 26 | - | ||
| 27 | -### 安全性增强 | ||
| 28 | -1. **速率限制**: 缓解滥用和简单的拒绝服务(DoS)攻击 | ||
| 29 | -2. **安全HTTP头**: 防止XSS、点击劫持等攻击 | ||
| 30 | -3. **错误过滤**: 隐藏敏感错误信息 | ||
| 31 | -4. **输入验证**: 更健壮的请求处理 | ||
| 32 | - | ||
| 33 | -### 稳定性提升 | ||
| 34 | -1. **连接错误处理**: 优雅处理连接重置和断开 | ||
| 35 | -2. **异常处理**: 全局异常捕获,防止服务器崩溃 | ||
| 36 | -3. **日志管理**: 过滤噪音,记录重要事件 | ||
| 37 | -4. **监控功能**: 健康检查和状态监控 | ||
| 38 | - | ||
| 39 | -### 性能优化 | ||
| 40 | -1. **线程服务器**: 前端服务器支持并发请求 | ||
| 41 | -2. **资源管理**: 更好的内存和连接管理 | ||
| 42 | -3. **响应头优化**: 添加缓存和安全相关头 | ||
| 43 | - | ||
| 44 | -## 使用方法 | ||
| 45 | - | ||
| 46 | -### 安装依赖 | ||
| 47 | -```bash | ||
| 48 | -# 安装服务器安全依赖 | ||
| 49 | -./scripts/install_server_deps.sh | ||
| 50 | - | ||
| 51 | -# 或者手动安装 | ||
| 52 | -pip install "slowapi>=0.1.9" "anyio>=3.7.0" | ||
| 53 | -``` | ||
| 54 | - | ||
| 55 | -### 启动服务器 | ||
| 56 | - | ||
| 57 | -#### 方法1: 使用管理脚本 (推荐) | ||
| 58 | -```bash | ||
| 59 | -# 启动所有服务器 | ||
| 60 | -python scripts/start_servers.py --customer customer1 --es-host http://localhost:9200 | ||
| 61 | - | ||
| 62 | -# 启动前检查依赖 | ||
| 63 | -python scripts/start_servers.py --check-dependencies | ||
| 64 | -``` | ||
| 65 | - | ||
| 66 | -#### 方法2: 分别启动 | ||
| 67 | -```bash | ||
| 68 | -# 启动API服务器 | ||
| 69 | -python main.py serve --customer customer1 --es-host http://localhost:9200 | ||
| 70 | - | ||
| 71 | -# 启动前端服务器 (在另一个终端) | ||
| 72 | -python scripts/frontend_server.py | ||
| 73 | -``` | ||
| 74 | - | ||
| 75 | -### 监控和日志 | ||
| 76 | - | ||
| 77 | -#### 日志位置 | ||
| 78 | -- API服务器日志: `/tmp/search_engine_api.log` | ||
| 79 | -- 启动日志: `/tmp/search_engine_startup.log` | ||
| 80 | -- 控制台输出: 实时显示重要信息 | ||
| 81 | - | ||
| 82 | -#### 健康检查 | ||
| 83 | -```bash | ||
| 84 | -# 检查API服务器健康状态 | ||
| 85 | -curl http://localhost:6002/health | ||
| 86 | - | ||
| 87 | -# 检查前端服务器 | ||
| 88 | -curl http://localhost:6003 | ||
| 89 | -``` | ||
| 90 | - | ||
| 91 | -## 配置选项 | ||
| 92 | - | ||
| 93 | -### 环境变量 | ||
| 94 | -- `CUSTOMER_ID`: 客户ID (默认: customer1) | ||
| 95 | -- `ES_HOST`: Elasticsearch主机 (默认: http://localhost:9200) | ||
| 96 | - | ||
| 97 | -### 速率限制配置 | ||
| 98 | -- API服务器: 各端点不同限制 (60-120请求/分钟) | ||
| 99 | -- 前端服务器: 100请求/分钟 | ||
| 100 | - | ||
| 101 | -## 故障排除 | ||
| 102 | - | ||
| 103 | -### 常见问题 | ||
| 104 | - | ||
| 105 | -1. **依赖缺失错误** | ||
| 106 | - ```bash | ||
| 107 | - pip install -r requirements_server.txt | ||
| 108 | - ``` | ||
| 109 | - | ||
| 110 | -2. **端口被占用** | ||
| 111 | - ```bash | ||
| 112 | - # 查看端口占用 | ||
| 113 | - lsof -i :6002 | ||
| 114 | - lsof -i :6003 | ||
| 115 | - ``` | ||
| 116 | - | ||
| 117 | -3. **权限问题** | ||
| 118 | - ```bash | ||
| 119 | - chmod +x scripts/*.py scripts/*.sh | ||
| 120 | - ``` | ||
| 121 | - | ||
| 122 | -### 调试模式 | ||
| 123 | -```bash | ||
| 124 | -# 关闭输出缓冲,使日志实时输出 | ||
| 125 | -export PYTHONUNBUFFERED=1 | ||
| 126 | -python scripts/start_servers.py | ||
| 127 | -``` | ||
| 128 | - | ||
| 129 | -## 生产环境建议 | ||
| 130 | - | ||
| 131 | -1. **反向代理**: 使用nginx或Apache作为反向代理 | ||
| 132 | -2. **SSL证书**: 配置HTTPS | ||
| 133 | -3. **防火墙**: 限制访问源IP | ||
| 134 | -4. **监控**: 集成监控和告警系统 | ||
| 135 | -5. **日志轮转**: 配置日志轮转防止磁盘满 | ||
| 136 | - | ||
| 137 | -## 维护说明 | ||
| 138 | - | ||
| 139 | -- 定期检查日志文件大小 | ||
| 140 | -- 监控服务器资源使用情况 | ||
| 141 | -- 更新依赖包版本 | ||
| 142 | -- 备份配置文件 | ||
| 143 | \ No newline at end of file | 0 | \ No newline at end of file |
demo_context_logging.py deleted
| @@ -1,141 +0,0 @@ | @@ -1,141 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -Demonstration of the Request Context and Logging system | ||
| 4 | - | ||
| 5 | -This script demonstrates how the request-scoped context management | ||
| 6 | -and structured logging work together to provide complete visibility | ||
| 7 | -into search request processing. | ||
| 8 | -""" | ||
| 9 | - | ||
| 10 | -import time | ||
| 11 | -import sys | ||
| 12 | -import os | ||
| 13 | - | ||
| 14 | -# Add the project root to Python path | ||
| 15 | -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | ||
| 16 | - | ||
| 17 | -# Setup the environment (use the conda environment) | ||
| 18 | -os.system('source /home/tw/miniconda3/etc/profile.d/conda.sh && conda activate searchengine') | ||
| 19 | - | ||
| 20 | -def demo_request_context(): | ||
| 21 | - """Demonstrate RequestContext functionality""" | ||
| 22 | - print("🚀 Starting Request Context and Logging Demo") | ||
| 23 | - print("=" * 60) | ||
| 24 | - | ||
| 25 | - try: | ||
| 26 | - from utils.logger import get_logger, setup_logging | ||
| 27 | - from context.request_context import create_request_context, RequestContextStage | ||
| 28 | - | ||
| 29 | - # Setup logging | ||
| 30 | - setup_logging(log_level="INFO", log_dir="demo_logs") | ||
| 31 | - logger = get_logger("demo") | ||
| 32 | - | ||
| 33 | - print("✅ Logging infrastructure initialized") | ||
| 34 | - | ||
| 35 | - # Create a request context | ||
| 36 | - context = create_request_context("demo123", "demo_user") | ||
| 37 | - print(f"✅ Created request context: reqid={context.reqid}, uid={context.uid}") | ||
| 38 | - | ||
| 39 | - # Simulate a complete search pipeline | ||
| 40 | - with context: # Use context manager for automatic timing | ||
| 41 | - logger.info("开始模拟搜索请求处理", extra={'reqid': context.reqid, 'uid': context.uid}) | ||
| 42 | - | ||
| 43 | - # Stage 1: Query parsing | ||
| 44 | - context.start_stage(RequestContextStage.QUERY_PARSING) | ||
| 45 | - time.sleep(0.02) # Simulate work | ||
| 46 | - | ||
| 47 | - # Store query analysis results | ||
| 48 | - context.store_query_analysis( | ||
| 49 | - original_query="红色高跟鞋 品牌:Nike", | ||
| 50 | - normalized_query="红色 高跟鞋 品牌:Nike", | ||
| 51 | - rewritten_query="红色 高跟鞋 品牌:nike", | ||
| 52 | - detected_language="zh", | ||
| 53 | - translations={"en": "red high heels brand:nike"}, | ||
| 54 | - domain="brand" | ||
| 55 | - ) | ||
| 56 | - | ||
| 57 | - context.store_intermediate_result("query_vector_shape", (1024,)) | ||
| 58 | - context.end_stage(RequestContextStage.QUERY_PARSING) | ||
| 59 | - | ||
| 60 | - # Stage 2: Boolean parsing | ||
| 61 | - context.start_stage(RequestContextStage.BOOLEAN_PARSING) | ||
| 62 | - time.sleep(0.005) # Simulate work | ||
| 63 | - context.store_intermediate_result("boolean_ast", "AND(红色, 高跟鞋, BRAND:nike)") | ||
| 64 | - context.end_stage(RequestContextStage.BOOLEAN_PARSING) | ||
| 65 | - | ||
| 66 | - # Stage 3: Query building | ||
| 67 | - context.start_stage(RequestContextStage.QUERY_BUILDING) | ||
| 68 | - time.sleep(0.01) # Simulate work | ||
| 69 | - es_query = { | ||
| 70 | - "query": {"bool": {"must": [{"match": {"title": "红色 高跟鞋"}}]}}, | ||
| 71 | - "knn": {"field": "text_embedding", "query_vector": [0.1] * 1024} | ||
| 72 | - } | ||
| 73 | - context.store_intermediate_result("es_query", es_query) | ||
| 74 | - context.end_stage(RequestContextStage.QUERY_BUILDING) | ||
| 75 | - | ||
| 76 | - # Stage 4: Elasticsearch search | ||
| 77 | - context.start_stage(RequestContextStage.ELASTICSEARCH_SEARCH) | ||
| 78 | - time.sleep(0.05) # Simulate work | ||
| 79 | - es_response = { | ||
| 80 | - "hits": {"total": {"value": 42}, "max_score": 0.95, "hits": []}, | ||
| 81 | - "took": 15 | ||
| 82 | - } | ||
| 83 | - context.store_intermediate_result("es_response", es_response) | ||
| 84 | - context.end_stage(RequestContextStage.ELASTICSEARCH_SEARCH) | ||
| 85 | - | ||
| 86 | - # Stage 5: Result processing | ||
| 87 | - context.start_stage(RequestContextStage.RESULT_PROCESSING) | ||
| 88 | - time.sleep(0.01) # Simulate work | ||
| 89 | - context.store_intermediate_result("processed_hits", [ | ||
| 90 | - {"_id": "1", "_score": 0.95}, | ||
| 91 | - {"_id": "2", "_score": 0.87} | ||
| 92 | - ]) | ||
| 93 | - context.end_stage(RequestContextStage.RESULT_PROCESSING) | ||
| 94 | - | ||
| 95 | - # Add a warning to demonstrate warning tracking | ||
| 96 | - context.add_warning("查询被重写: '红色 高跟鞋 品牌:Nike' -> 'red high heels brand:nike'") | ||
| 97 | - | ||
| 98 | - # Get and display summary | ||
| 99 | - summary = context.get_summary() | ||
| 100 | - print("\n📊 Request Summary:") | ||
| 101 | - print("-" * 40) | ||
| 102 | - print(f"Request ID: {summary['request_info']['reqid']}") | ||
| 103 | - print(f"User ID: {summary['request_info']['uid']}") | ||
| 104 | - print(f"Total Duration: {summary['performance']['total_duration_ms']:.2f}ms") | ||
| 105 | - print("\n⏱️ Stage Breakdown:") | ||
| 106 | - for stage, duration in summary['performance']['stage_timings_ms'].items(): | ||
| 107 | - percentage = summary['performance']['stage_percentages'].get(stage, 0) | ||
| 108 | - print(f" {stage}: {duration:.2f}ms ({percentage}%)") | ||
| 109 | - | ||
| 110 | - print("\n🔍 Query Analysis:") | ||
| 111 | - print(f" Original: '{summary['query_analysis']['original_query']}'") | ||
| 112 | - print(f" Rewritten: '{summary['query_analysis']['rewritten_query']}'") | ||
| 113 | - print(f" Language: {summary['query_analysis']['detected_language']}") | ||
| 114 | - print(f" Domain: {summary['query_analysis']['domain']}") | ||
| 115 | - print(f" Has Vector: {summary['query_analysis']['has_vector']}") | ||
| 116 | - | ||
| 117 | - print("\n📈 Results:") | ||
| 118 | - print(f" Total Hits: {summary['results']['total_hits']}") | ||
| 119 | - print(f" ES Query Size: {summary['results']['es_query_size']} chars") | ||
| 120 | - | ||
| 121 | - print("\n⚠️ Warnings:") | ||
| 122 | - print(f" Count: {summary['request_info']['warnings_count']}") | ||
| 123 | - | ||
| 124 | - print("\n✅ Demo completed successfully!") | ||
| 125 | - print(f"📁 Logs are available in: demo_logs/") | ||
| 126 | - | ||
| 127 | - except Exception as e: | ||
| 128 | - print(f"❌ Demo failed: {e}") | ||
| 129 | - import traceback | ||
| 130 | - traceback.print_exc() | ||
| 131 | - return False | ||
| 132 | - | ||
| 133 | - return True | ||
| 134 | - | ||
| 135 | -if __name__ == "__main__": | ||
| 136 | - success = demo_request_context() | ||
| 137 | - if success: | ||
| 138 | - print("\n🎉 Request Context and Logging system is ready for production!") | ||
| 139 | - else: | ||
| 140 | - print("\n💥 Please check the errors above") | ||
| 141 | - sys.exit(1) | ||
| 142 | \ No newline at end of file | 0 | \ No newline at end of file |
diagnose_issues.py deleted
| @@ -1,220 +0,0 @@ | @@ -1,220 +0,0 @@ | ||
| 1 | -#!/usr/bin/env python3 | ||
| 2 | -""" | ||
| 3 | -诊断翻译和向量生成问题 | ||
| 4 | -""" | ||
| 5 | - | ||
| 6 | -import sys | ||
| 7 | -import os | ||
| 8 | -import traceback | ||
| 9 | - | ||
| 10 | -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | ||
| 11 | - | ||
| 12 | -def diagnose_translation_issue(): | ||
| 13 | - """诊断翻译问题""" | ||
| 14 | - print("🔍 诊断翻译功能...") | ||
| 15 | - print("-" * 50) | ||
| 16 | - | ||
| 17 | - try: | ||
| 18 | - from query.translator import Translator | ||
| 19 | - from config.env_config import get_deepl_key | ||
| 20 | - | ||
| 21 | - # 检查API密钥 | ||
| 22 | - try: | ||
| 23 | - api_key = get_deepl_key() | ||
| 24 | - print(f"✅ DeepL API密钥已配置: {'*' * len(api_key[:8]) if api_key else 'None'}") | ||
| 25 | - except Exception as e: | ||
| 26 | - print(f"❌ DeepL API密钥配置失败: {e}") | ||
| 27 | - api_key = None | ||
| 28 | - | ||
| 29 | - # 创建翻译器 | ||
| 30 | - translator = Translator(api_key=api_key, use_cache=True) | ||
| 31 | - print(f"✅ 翻译器创建成功,API密钥状态: {'已配置' if api_key else '未配置'}") | ||
| 32 | - | ||
| 33 | - # 测试翻译 | ||
| 34 | - test_text = "推车" | ||
| 35 | - print(f"\n📝 测试翻译文本: '{test_text}'") | ||
| 36 | - | ||
| 37 | - # 测试英文翻译 | ||
| 38 | - result_en = translator.translate(test_text, "en", "zh") | ||
| 39 | - print(f"🇺🇸 英文翻译结果: {result_en}") | ||
| 40 | - | ||
| 41 | - # 测试俄文翻译 | ||
| 42 | - result_ru = translator.translate(test_text, "ru", "zh") | ||
| 43 | - print(f"🇷🇺 俄文翻译结果: {result_ru}") | ||
| 44 | - | ||
| 45 | - # 测试多语言翻译 | ||
| 46 | - results = translator.translate_multi(test_text, ["en", "ru"], "zh") | ||
| 47 | - print(f"🌍 多语言翻译结果: {results}") | ||
| 48 | - | ||
| 49 | - # 检查翻译需求逻辑 | ||
| 50 | - needs = translator.get_translation_needs("zh", ["en", "ru"]) | ||
| 51 | - print(f"🎯 翻译需求分析: {needs}") | ||
| 52 | - | ||
| 53 | - if api_key: | ||
| 54 | - print("\n✅ 翻译功能配置正确,可能的问题:") | ||
| 55 | - print(" 1. 网络连接问题") | ||
| 56 | - print(" 2. API限额或配额问题") | ||
| 57 | - print(" 3. DeepL服务暂时不可用") | ||
| 58 | - else: | ||
| 59 | - print("\n⚠️ 翻译功能处于模拟模式(无API密钥)") | ||
| 60 | - print(" 这会导致翻译返回原始文本或None") | ||
| 61 | - | ||
| 62 | - except Exception as e: | ||
| 63 | - print(f"❌ 翻译功能诊断失败: {e}") | ||
| 64 | - traceback.print_exc() | ||
| 65 | - | ||
| 66 | -def diagnose_embedding_issue(): | ||
| 67 | - """诊断向量生成问题""" | ||
| 68 | - print("\n🔍 诊断向量生成功能...") | ||
| 69 | - print("-" * 50) | ||
| 70 | - | ||
| 71 | - try: | ||
| 72 | - from embeddings.text_encoder import BgeEncoder | ||
| 73 | - import torch | ||
| 74 | - | ||
| 75 | - # 检查CUDA可用性 | ||
| 76 | - cuda_available = torch.cuda.is_available() | ||
| 77 | - print(f"🔧 CUDA可用性: {'是' if cuda_available else '否'}") | ||
| 78 | - if cuda_available: | ||
| 79 | - print(f"🔧 CUDA设备数量: {torch.cuda.device_count()}") | ||
| 80 | - print(f"🔧 当前CUDA设备: {torch.cuda.current_device()}") | ||
| 81 | - | ||
| 82 | - # 尝试创建编码器 | ||
| 83 | - print("\n📦 尝试创建BGE编码器...") | ||
| 84 | - try: | ||
| 85 | - encoder = BgeEncoder() | ||
| 86 | - print("✅ BGE编码器创建成功") | ||
| 87 | - except Exception as e: | ||
| 88 | - print(f"❌ BGE编码器创建失败: {e}") | ||
| 89 | - print("可能的原因:") | ||
| 90 | - print(" 1. 模型文件未下载") | ||
| 91 | - print(" 2. 内存不足") | ||
| 92 | - print(" 3. 依赖包未正确安装") | ||
| 93 | - return | ||
| 94 | - | ||
| 95 | - # 测试向量生成 | ||
| 96 | - test_text = "推车" | ||
| 97 | - print(f"\n📝 测试向量生成文本: '{test_text}'") | ||
| 98 | - | ||
| 99 | - try: | ||
| 100 | - # 尝试CPU模式 | ||
| 101 | - print("🔄 尝试CPU模式...") | ||
| 102 | - embedding_cpu = encoder.encode(test_text, device='cpu') | ||
| 103 | - print(f"✅ CPU模式向量生成成功,形状: {embedding_cpu.shape}") | ||
| 104 | - | ||
| 105 | - # 尝试CUDA模式(如果可用) | ||
| 106 | - if cuda_available: | ||
| 107 | - print("🔄 尝试CUDA模式...") | ||
| 108 | - embedding_cuda = encoder.encode(test_text, device='cuda') | ||
| 109 | - print(f"✅ CUDA模式向量生成成功,形状: {embedding_cuda.shape}") | ||
| 110 | - else: | ||
| 111 | - print("⚠️ CUDA不可用,跳过GPU测试") | ||
| 112 | - | ||
| 113 | - except Exception as e: | ||
| 114 | - print(f"❌ 向量生成失败: {e}") | ||
| 115 | - print("可能的原因:") | ||
| 116 | - print(" 1. 模型加载问题") | ||
| 117 | - print(" 2. 内存不足") | ||
| 118 | - print(" 3. 设备配置问题") | ||
| 119 | - | ||
| 120 | - except Exception as e: | ||
| 121 | - print(f"❌ 向量生成功能诊断失败: {e}") | ||
| 122 | - traceback.print_exc() | ||
| 123 | - | ||
| 124 | -def diagnose_config_issue(): | ||
| 125 | - """诊断配置问题""" | ||
| 126 | - print("\n🔍 诊断配置问题...") | ||
| 127 | - print("-" * 50) | ||
| 128 | - | ||
| 129 | - try: | ||
| 130 | - from config import CustomerConfig | ||
| 131 | - from config.config_loader import load_customer_config | ||
| 132 | - | ||
| 133 | - # 加载配置 | ||
| 134 | - config = load_customer_config("customer1") | ||
| 135 | - print(f"✅ 配置加载成功: {config.customer_id}") | ||
| 136 | - | ||
| 137 | - # 检查查询配置 | ||
| 138 | - query_config = config.query_config | ||
| 139 | - print(f"📝 翻译功能启用: {query_config.enable_translation}") | ||
| 140 | - print(f"🔤 向量生成启用: {query_config.enable_text_embedding}") | ||
| 141 | - print(f"🌍 支持的语言: {query_config.supported_languages}") | ||
| 142 | - | ||
| 143 | - # 检查API密钥配置 | ||
| 144 | - try: | ||
| 145 | - from config.env_config import get_deepl_key | ||
| 146 | - api_key = get_deepl_key() | ||
| 147 | - print(f"🔑 DeepL API密钥: {'已配置' if api_key else '未配置'}") | ||
| 148 | - except: | ||
| 149 | - print("🔑 DeepL API密钥: 配置加载失败") | ||
| 150 | - | ||
| 151 | - except Exception as e: | ||
| 152 | - print(f"❌ 配置诊断失败: {e}") | ||
| 153 | - traceback.print_exc() | ||
| 154 | - | ||
| 155 | -def simulate_query_parsing(): | ||
| 156 | - """模拟查询解析过程""" | ||
| 157 | - print("\n🔍 模拟查询解析过程...") | ||
| 158 | - print("-" * 50) | ||
| 159 | - | ||
| 160 | - try: | ||
| 161 | - from context.request_context import create_request_context | ||
| 162 | - from query.query_parser import QueryParser | ||
| 163 | - from config import CustomerConfig | ||
| 164 | - from config.config_loader import load_customer_config | ||
| 165 | - | ||
| 166 | - # 加载配置 | ||
| 167 | - config = load_customer_config("customer1") | ||
| 168 | - parser = QueryParser(config) | ||
| 169 | - context = create_request_context("test_diagnosis", "diagnosis_user") | ||
| 170 | - | ||
| 171 | - # 模拟解析"推车" | ||
| 172 | - print("📝 开始解析查询: '推车'") | ||
| 173 | - | ||
| 174 | - # 检查各个功能是否启用 | ||
| 175 | - print(f" - 翻译功能: {'启用' if config.query_config.enable_translation else '禁用'}") | ||
| 176 | - print(f" - 向量功能: {'启用' if config.query_config.enable_text_embedding else '禁用'}") | ||
| 177 | - | ||
| 178 | - # 检查翻译器状态 | ||
| 179 | - if hasattr(parser, '_translator') and parser._translator: | ||
| 180 | - translator_has_key = bool(parser._translator.api_key) | ||
| 181 | - print(f" - 翻译器API密钥: {'有' if translator_has_key else '无'}") | ||
| 182 | - else: | ||
| 183 | - print(f" - 翻译器状态: 未初始化") | ||
| 184 | - | ||
| 185 | - # 检查向量编码器状态 | ||
| 186 | - if hasattr(parser, '_text_encoder') and parser._text_encoder: | ||
| 187 | - print(f" - 向量编码器: 已初始化") | ||
| 188 | - else: | ||
| 189 | - print(f" - 向量编码器: 未初始化") | ||
| 190 | - | ||
| 191 | - # 执行解析 | ||
| 192 | - result = parser.parse("推车", context=context, generate_vector=config.query_config.enable_text_embedding) | ||
| 193 | - | ||
| 194 | - print(f"\n📊 解析结果:") | ||
| 195 | - print(f" 原查询: {result.original_query}") | ||
| 196 | - print(f" 标准化: {result.normalized_query}") | ||
| 197 | - print(f" 重写后: {result.rewritten_query}") | ||
| 198 | - print(f" 检测语言: {result.detected_language}") | ||
| 199 | - print(f" 域: {result.domain}") | ||
| 200 | - print(f" 翻译结果: {result.translations}") | ||
| 201 | - print(f" 向量: {'有' if result.query_vector is not None else '无'}") | ||
| 202 | - | ||
| 203 | - if result.query_vector is not None: | ||
| 204 | - print(f" 向量形状: {result.query_vector.shape}") | ||
| 205 | - | ||
| 206 | - except Exception as e: | ||
| 207 | - print(f"❌ 查询解析模拟失败: {e}") | ||
| 208 | - traceback.print_exc() | ||
| 209 | - | ||
| 210 | -if __name__ == "__main__": | ||
| 211 | - print("🧪 开始系统诊断...") | ||
| 212 | - print("=" * 60) | ||
| 213 | - | ||
| 214 | - diagnose_translation_issue() | ||
| 215 | - diagnose_embedding_issue() | ||
| 216 | - diagnose_config_issue() | ||
| 217 | - simulate_query_parsing() | ||
| 218 | - | ||
| 219 | - print("\n" + "=" * 60) | ||
| 220 | - print("🏁 诊断完成!请查看上述结果找出问题原因。") | ||
| 221 | \ No newline at end of file | 0 | \ No newline at end of file |