Commit 484adbfe9ccfae39f5ce92c0173011d724402d47

Authored by tangwang
1 parent a7920e17

adapt ubuntu; conda -> venv

1 # Elasticsearch Configuration 1 # Elasticsearch Configuration
2 ES_HOST=http://localhost:9200 2 ES_HOST=http://localhost:9200
3 -ES_USERNAME=essa 3 +ES_USERNAME=saas
4 ES_PASSWORD=4hOaLaf41y2VuI8y 4 ES_PASSWORD=4hOaLaf41y2VuI8y
5 5
6 # Redis Configuration (Optional) 6 # Redis Configuration (Optional)
@@ -30,7 +30,7 @@ IMAGE_MODEL_DIR=/data/tw/models/cn-clip # 已经改为web请求了,不使用… @@ -30,7 +30,7 @@ IMAGE_MODEL_DIR=/data/tw/models/cn-clip # 已经改为web请求了,不使用…
30 CACHE_DIR=.cache 30 CACHE_DIR=.cache
31 31
32 # Frontend API Base URL 32 # Frontend API Base URL
33 -API_BASE_URL=http://120.76.41.98:6002 33 +API_BASE_URL=http://43.166.252.75:6002
34 34
35 35
36 DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b 36 DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b
@@ -49,6 +49,7 @@ __pycache__ @@ -49,6 +49,7 @@ __pycache__
49 .history.txt 49 .history.txt
50 log/ 50 log/
51 logs/ 51 logs/
  52 +.venv/
52 nohup.out 53 nohup.out
53 temp/ 54 temp/
54 indexer_input* 55 indexer_input*
@@ -70,7 +70,7 @@ query anchor @@ -70,7 +70,7 @@ query anchor
70 70
71 对外: 71 对外:
72 embedding服务: 72 embedding服务:
73 - curl -X POST http://120.76.41.98:6005/embed/text \ 73 + curl -X POST http://43.166.252.75:6005/embed/text \
74 -H "Content-Type: application/json" \ 74 -H "Content-Type: application/json" \
75 -d '["衣服", "Bohemian Maxi Dress"]' 75 -d '["衣服", "Bohemian Maxi Dress"]'
76 76
@@ -91,7 +91,7 @@ localhost替换为 @@ -91,7 +91,7 @@ localhost替换为
91 服务器内网地址: 91 服务器内网地址:
92 10.0.163.168 92 10.0.163.168
93 公网地址: 93 公网地址:
94 -120.76.41.98 94 +43.166.252.75
95 95
96 96
97 # 电商搜索引擎 SaaS 97 # 电商搜索引擎 SaaS
@@ -101,15 +101,18 @@ localhost替换为 @@ -101,15 +101,18 @@ localhost替换为
101 101
102 ## 项目环境 102 ## 项目环境
103 103
104 -以项目根目录的 **`activate.sh`** 为准(会激活 Conda 环境 `searchengine` 并加载 `.env`): 104 +以项目根目录的 **`activate.sh`** 为准(**优先激活 venv:`./.venv`,并加载 `.env`;兼容 Conda 回退**):
105 105
106 ```bash 106 ```bash
107 -# 若在新机器且 Conda 不在默认路径,先设置:  
108 -# - 你的 conda 是 ~/anaconda3/bin/conda,则:export CONDA_ROOT=$HOME/anaconda3 107 +# 推荐:首次创建 venv(默认安装基础依赖)
  108 +./scripts/create_venv.sh
  109 +
  110 +# 如需本地向量/图片编码(会安装 torch/transformers 等重依赖)
  111 +# INSTALL_ML=1 ./scripts/create_venv.sh
109 source activate.sh 112 source activate.sh
110 ``` 113 ```
111 114
112 -新机器首次需创建环境,见 `docs/环境配置说明.md`(`conda env create -f environment.yml` 或 `pip install -r requirements.txt`)。 115 +新机器首次需创建环境,见 `docs/环境配置说明.md`(推荐 venv;Conda 为兼容旧流程)。
113 116
114 ## 测试pipeline 117 ## 测试pipeline
115 118
1 #!/bin/bash 1 #!/bin/bash
2 -# 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径  
3 -# 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3  
4 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
5 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
6 -conda activate searchengine 2 +#
  3 +# Unified environment activator (venv preferred, conda fallback).
  4 +#
  5 +# Usage:
  6 +# source activate.sh
  7 +#
  8 +# Priority:
  9 +# 1) ./.venv (Python venv)
  10 +# 2) conda env "searchengine" (legacy)
  11 +#
  12 +
  13 +# Must be sourced
  14 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  15 + echo "ERROR: Please source this script: source activate.sh" >&2
  16 + exit 1
  17 +fi
  18 +
  19 +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  20 +
  21 +# 1) venv (preferred)
  22 +VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate"
  23 +if [[ -f "${VENV_ACTIVATE}" ]]; then
  24 + # shellcheck disable=SC1090
  25 + source "${VENV_ACTIVATE}"
  26 + ENV_KIND="venv"
  27 +else
  28 + # 2) conda fallback (legacy)
  29 + # 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径
  30 + # 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3
  31 + CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"
  32 + if [[ -f "${CONDA_ROOT}/etc/profile.d/conda.sh" ]]; then
  33 + # shellcheck disable=SC1091
  34 + source "${CONDA_ROOT}/etc/profile.d/conda.sh"
  35 + conda activate searchengine
  36 + ENV_KIND="conda"
  37 + else
  38 + echo "ERROR: No .venv found and conda.sh not found at ${CONDA_ROOT}/etc/profile.d/conda.sh" >&2
  39 + echo " - Create venv: ./scripts/create_venv.sh" >&2
  40 + echo " - Or set CONDA_ROOT to your conda install path" >&2
  41 + return 1
  42 + fi
  43 +fi
7 44
8 # 如果需要加载 .env 中的环境变量 45 # 如果需要加载 .env 中的环境变量
9 -if [ -f .env ]; then 46 +ENV_FILE="${PROJECT_ROOT}/.env"
  47 +if [ -f "${ENV_FILE}" ]; then
10 set -a # 自动导出所有变量 48 set -a # 自动导出所有变量
11 - source <(grep -v '^#' .env | grep -v '^$' | sed 's/#.*$//' | sed 's/\r$//') 49 + # NOTE: This loader tolerates comments/blank lines and strips inline comments.
  50 + source <(grep -v '^#' "${ENV_FILE}" | grep -v '^$' | sed 's/#.*$//' | sed 's/\r$//')
12 set +a # 关闭自动导出 51 set +a # 关闭自动导出
13 fi 52 fi
14 53
15 -echo "Environment activated: searchengine" 54 +echo "Environment activated (${ENV_KIND}): ${VIRTUAL_ENV:-${CONDA_DEFAULT_ENV:-unknown}}"
docs/CNCLIP_SERVICE说明文档.md
@@ -12,7 +12,7 @@ normalize后的结果: @@ -12,7 +12,7 @@ normalize后的结果:
12 https://aisearch.cdn.bcebos.com/fileManager/GtB5doGAr1skTx38P7fb7Q/182.jpg?authorization=bce-auth-v1%2F7e22d8caf5af46cc9310f1e3021709f3%2F2025-12-30T04%3A45%3A38Z%2F86400%2Fhost%2Ffe222039926cb7ff593021af40268c782b8892598114e24773d0c1bfc976a8df 12 https://aisearch.cdn.bcebos.com/fileManager/GtB5doGAr1skTx38P7fb7Q/182.jpg?authorization=bce-auth-v1%2F7e22d8caf5af46cc9310f1e3021709f3%2F2025-12-30T04%3A45%3A38Z%2F86400%2Fhost%2Ffe222039926cb7ff593021af40268c782b8892598114e24773d0c1bfc976a8df
13 https://oss.essa.cn/2e353867-7496-4d4e-a7c8-0af50f49f6eb.jpg?x-oss-process=image/resize,m_lfit,w_2048,h_2048 13 https://oss.essa.cn/2e353867-7496-4d4e-a7c8-0af50f49f6eb.jpg?x-oss-process=image/resize,m_lfit,w_2048,h_2048
14 14
15 -curl -X POST "http://120.76.41.98:5000/embedding/generate_image_embeddings" -H "Content-Type: application/json" -d '[ 15 +curl -X POST "http://43.166.252.75:5000/embedding/generate_image_embeddings" -H "Content-Type: application/json" -d '[
16 { 16 {
17 "id": "test_1", 17 "id": "test_1",
18 "pic_url": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg" 18 "pic_url": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg"
docs/ES/ES_8.18/1_ES配置和使用.md
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 2
3 ## 相关链接 3 ## 相关链接
4 - 接口文档:http://rap.essa.top:88/workspace/myWorkspace.do?projectId=78#2187 4 - 接口文档:http://rap.essa.top:88/workspace/myWorkspace.do?projectId=78#2187
5 -- Kibana 控制台:http://120.76.41.98:5601/app/dev_tools#/console/shell 5 +- Kibana 控制台:http://43.166.252.75:5601/app/dev_tools#/console/shell
6 6
7 ## 分词方面 7 ## 分词方面
8 8
docs/ES/ES_8.18/2_kibana安装.md
@@ -47,5 +47,5 @@ systemctl enable kibana @@ -47,5 +47,5 @@ systemctl enable kibana
47 ``` 47 ```
48 48
49 在阿里云上面配置允许访问5601端口后,可以浏览器打开: 49 在阿里云上面配置允许访问5601端口后,可以浏览器打开:
50 -http://120.76.41.98:5601/ 50 +http://43.166.252.75:5601/
51 51
docs/Usage-Guide.md
@@ -27,10 +27,19 @@ @@ -27,10 +27,19 @@
27 27
28 #### 1. 安装 Python 依赖与激活环境 28 #### 1. 安装 Python 依赖与激活环境
29 29
30 -**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。新机器部署时若 Conda 不在默认路径,请先设置 `CONDA_ROOT`(例如你的 conda 是 `~/anaconda3/bin/conda`,则 `export CONDA_ROOT=$HOME/anaconda3`)。详见 `docs/环境配置说明.md`。 30 +**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。目前推荐 venv(`.venv`);Conda 仅作为兼容回退(Conda 不在默认路径时需设置 `CONDA_ROOT`)。详见 `docs/环境配置说明.md`。
31 31
32 ```bash 32 ```bash
33 cd /data/saas-search 33 cd /data/saas-search
  34 +./scripts/create_venv.sh # 首次创建 venv(只需执行一次)
  35 +source activate.sh
  36 +```
  37 +
  38 +如果需要本地 embedding / 图像编码(会安装 torch/transformers 等较重依赖):
  39 +
  40 +```bash
  41 +cd /data/saas-search
  42 +INSTALL_ML=1 ./scripts/create_venv.sh
34 source activate.sh 43 source activate.sh
35 ``` 44 ```
36 45
docs/temporary/sku_image_src问题诊断报告.md
@@ -98,7 +98,7 @@ else: @@ -98,7 +98,7 @@ else:
98 98
99 2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含: 99 2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含:
100 ```bash 100 ```bash
101 - curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \ 101 + curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \
102 -H 'Content-Type: application/json' \ 102 -H 'Content-Type: application/json' \
103 -d '{ 103 -d '{
104 "size": 1, 104 "size": 1,
docs/常用查询 - ES.md
@@ -8,7 +8,7 @@ @@ -8,7 +8,7 @@
8 # 一般情况下不需要在查询中再按 tenant_id 过滤(可选保留用于排查)。 8 # 一般情况下不需要在查询中再按 tenant_id 过滤(可选保留用于排查)。
9 9
10 ### 1. 根据 tenant_id / spu_id 查询 10 ### 1. 根据 tenant_id / spu_id 查询
11 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 11 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
12 "size": 11, 12 "size": 11,
13 "_source": ["title"], 13 "_source": ["title"],
14 "query": { 14 "query": {
@@ -21,7 +21,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -21,7 +21,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
21 }' 21 }'
22 22
23 23
24 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 24 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
25 "size": 100, 25 "size": 100,
26 "_source": ["title"], 26 "_source": ["title"],
27 "query": { 27 "query": {
@@ -30,7 +30,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -30,7 +30,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
30 }' 30 }'
31 31
32 32
33 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 33 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
34 "size": 5, 34 "size": 5,
35 "_source": ["title", "keywords", "tags"], 35 "_source": ["title", "keywords", "tags"],
36 "query": { 36 "query": {
@@ -43,7 +43,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -43,7 +43,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
43 }' 43 }'
44 44
45 45
46 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 46 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
47 "size": 1, 47 "size": 1,
48 "_source": ["title", "keywords", "tags"], 48 "_source": ["title", "keywords", "tags"],
49 "query": { 49 "query": {
@@ -65,7 +65,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -65,7 +65,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
65 }' 65 }'
66 66
67 67
68 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 68 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
69 "size": 1, 69 "size": 1,
70 "_source": ["title"], 70 "_source": ["title"],
71 "query": { 71 "query": {
@@ -86,17 +86,17 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -86,17 +86,17 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
86 } 86 }
87 }' 87 }'
88 88
89 -Curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ 89 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{
90 "analyzer": "index_ansj", 90 "analyzer": "index_ansj",
91 "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" 91 "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
92 }' 92 }'
93 93
94 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ 94 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{
95 "analyzer": "query_ansj", 95 "analyzer": "query_ansj",
96 "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" 96 "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝"
97 }' 97 }'
98 98
99 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 99 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
100 "size": 100, 100 "size": 100,
101 "from": 0, 101 "from": 0,
102 "query": { 102 "query": {
@@ -131,7 +131,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ @@ -131,7 +131,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_
131 } 131 }
132 }' 132 }'
133 133
134 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ 134 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{
135 "size": 1, 135 "size": 1,
136 "from": 0, 136 "from": 0,
137 "query": { 137 "query": {
@@ -258,7 +258,7 @@ GET /search_products_tenant_2/_search @@ -258,7 +258,7 @@ GET /search_products_tenant_2/_search
258 } 258 }
259 259
260 260
261 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ 261 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
262 "size": 5, 262 "size": 5,
263 "query": { 263 "query": {
264 "bool": { 264 "bool": {
@@ -271,7 +271,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ @@ -271,7 +271,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/
271 271
272 272
273 ### 2. 统计租户的总文档数 273 ### 2. 统计租户的总文档数
274 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_count?pretty' -H 'Content-Type: application/json' -d '{ 274 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_count?pretty' -H 'Content-Type: application/json' -d '{
275 "query": { 275 "query": {
276 "match_all": {} 276 "match_all": {}
277 } 277 }
@@ -285,7 +285,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -285,7 +285,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
285 ## 1. 检查ES文档的分面字段数据 285 ## 1. 检查ES文档的分面字段数据
286 286
287 ### 1.1 查询特定租户的商品,显示分面相关字段 287 ### 1.1 查询特定租户的商品,显示分面相关字段
288 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 288 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
289 "query": { 289 "query": {
290 "term": { 290 "term": {
291 "tenant_id": "162" 291 "tenant_id": "162"
@@ -306,7 +306,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -306,7 +306,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
306 }' 306 }'
307 307
308 ### 1.2 验证category1_name字段是否有数据 308 ### 1.2 验证category1_name字段是否有数据
309 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 309 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
310 "query": { 310 "query": {
311 "bool": { 311 "bool": {
312 "filter": [ 312 "filter": [
@@ -319,7 +319,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -319,7 +319,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
319 }' 319 }'
320 320
321 ### 1.3 验证specifications字段是否有数据 321 ### 1.3 验证specifications字段是否有数据
322 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 322 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
323 "query": { 323 "query": {
324 "bool": { 324 "bool": {
325 "filter": [ 325 "filter": [
@@ -334,7 +334,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -334,7 +334,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
334 ## 2. 分面聚合查询(Facet Aggregations) 334 ## 2. 分面聚合查询(Facet Aggregations)
335 335
336 ### 2.1 category1_name 分面聚合 336 ### 2.1 category1_name 分面聚合
337 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 337 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
338 "query": { 338 "query": {
339 "match_all": {} 339 "match_all": {}
340 }, 340 },
@@ -350,7 +350,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -350,7 +350,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
350 }' 350 }'
351 351
352 ### 2.2 specifications.color 分面聚合 352 ### 2.2 specifications.color 分面聚合
353 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 353 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
354 "query": { 354 "query": {
355 "match_all": {} 355 "match_all": {}
356 }, 356 },
@@ -382,7 +382,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -382,7 +382,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
382 }' 382 }'
383 383
384 ### 2.3 specifications.size 分面聚合 384 ### 2.3 specifications.size 分面聚合
385 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 385 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
386 "query": { 386 "query": {
387 "match_all": {} 387 "match_all": {}
388 }, 388 },
@@ -414,7 +414,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -414,7 +414,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
414 }' 414 }'
415 415
416 ### 2.4 specifications.material 分面聚合 416 ### 2.4 specifications.material 分面聚合
417 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 417 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
418 "query": { 418 "query": {
419 "match_all": {} 419 "match_all": {}
420 }, 420 },
@@ -446,7 +446,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -446,7 +446,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
446 }' 446 }'
447 447
448 ### 2.5 综合分面聚合(category + color + size + material) 448 ### 2.5 综合分面聚合(category + color + size + material)
449 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 449 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
450 "query": { 450 "query": {
451 "match_all": {} 451 "match_all": {}
452 }, 452 },
@@ -530,7 +530,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -530,7 +530,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
530 ## 3. 检查specifications嵌套字段的详细结构 530 ## 3. 检查specifications嵌套字段的详细结构
531 531
532 ### 3.1 查看specifications的name字段有哪些值 532 ### 3.1 查看specifications的name字段有哪些值
533 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ 533 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
534 "query": { 534 "query": {
535 "term": { 535 "term": {
536 "tenant_id": "162" 536 "tenant_id": "162"
@@ -555,7 +555,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s @@ -555,7 +555,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s
555 }' 555 }'
556 556
557 ### 3.2 查看某个商品的完整specifications数据 557 ### 3.2 查看某个商品的完整specifications数据
558 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ 558 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{
559 "query": { 559 "query": {
560 "bool": { 560 "bool": {
561 "filter": [ 561 "filter": [
@@ -571,7 +571,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s @@ -571,7 +571,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s
571 ## 4. 统计查询 571 ## 4. 统计查询
572 572
573 ### 4.1 统计有category1_name的文档数量 573 ### 4.1 统计有category1_name的文档数量
574 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ 574 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{
575 "query": { 575 "query": {
576 "bool": { 576 "bool": {
577 "filter": [ 577 "filter": [
@@ -582,7 +582,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -582,7 +582,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
582 }' 582 }'
583 583
584 ### 4.2 统计有specifications的文档数量 584 ### 4.2 统计有specifications的文档数量
585 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ 585 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{
586 "query": { 586 "query": {
587 "bool": { 587 "bool": {
588 "filter": [ 588 "filter": [
@@ -596,7 +596,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -596,7 +596,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
596 ## 5. 诊断问题场景 596 ## 5. 诊断问题场景
597 597
598 ### 5.1 查找没有category1_name但有category的文档(MySQL有数据但ES没有) 598 ### 5.1 查找没有category1_name但有category的文档(MySQL有数据但ES没有)
599 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 599 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
600 "query": { 600 "query": {
601 "bool": { 601 "bool": {
602 "filter": [ 602 "filter": [
@@ -612,7 +612,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te @@ -612,7 +612,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te
612 }' 612 }'
613 613
614 ### 5.2 查找有option但没有specifications的文档(数据转换问题) 614 ### 5.2 查找有option但没有specifications的文档(数据转换问题)
615 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ 615 +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{
616 "query": { 616 "query": {
617 "bool": { 617 "bool": {
618 "filter": [ 618 "filter": [
docs/搜索API对接指南.md
@@ -64,7 +64,7 @@ @@ -64,7 +64,7 @@
64 64
65 ### 1.1 基础信息 65 ### 1.1 基础信息
66 66
67 -- **Base URL**: `http://120.76.41.98:6002` 67 +- **Base URL**: `http://43.166.252.75:6002`
68 - **协议**: HTTP/HTTPS 68 - **协议**: HTTP/HTTPS
69 - **数据格式**: JSON 69 - **数据格式**: JSON
70 - **字符编码**: UTF-8 70 - **字符编码**: UTF-8
@@ -75,7 +75,7 @@ @@ -75,7 +75,7 @@
75 ### 1.2 最简单的搜索请求 75 ### 1.2 最简单的搜索请求
76 76
77 ```bash 77 ```bash
78 -curl -X POST "http://120.76.41.98:6002/search/" \ 78 +curl -X POST "http://43.166.252.75:6002/search/" \
79 -H "Content-Type: application/json" \ 79 -H "Content-Type: application/json" \
80 -H "X-Tenant-ID: 162" \ 80 -H "X-Tenant-ID: 162" \
81 -d '{"query": "芭比娃娃"}' 81 -d '{"query": "芭比娃娃"}'
@@ -84,7 +84,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ @@ -84,7 +84,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \
84 ### 1.3 带过滤与分页的搜索 84 ### 1.3 带过滤与分页的搜索
85 85
86 ```bash 86 ```bash
87 -curl -X POST "http://120.76.41.98:6002/search/" \ 87 +curl -X POST "http://43.166.252.75:6002/search/" \
88 -H "Content-Type: application/json" \ 88 -H "Content-Type: application/json" \
89 -H "X-Tenant-ID: 162" \ 89 -H "X-Tenant-ID: 162" \
90 -d '{ 90 -d '{
@@ -108,7 +108,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ @@ -108,7 +108,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \
108 ### 1.4 开启分面的搜索 108 ### 1.4 开启分面的搜索
109 109
110 ```bash 110 ```bash
111 -curl -X POST "http://120.76.41.98:6002/search/" \ 111 +curl -X POST "http://43.166.252.75:6002/search/" \
112 -H "Content-Type: application/json" \ 112 -H "Content-Type: application/json" \
113 -H "X-Tenant-ID: 162" \ 113 -H "X-Tenant-ID: 162" \
114 -d '{ 114 -d '{
docs/环境配置说明.md
@@ -11,14 +11,36 @@ @@ -11,14 +11,36 @@
11 11
12 ## 2. Python 运行环境 12 ## 2. Python 运行环境
13 13
14 -**推荐方式(与项目脚本一致)**:使用项目根目录下的 `activate.sh` 激活环境,会自动加载当前目录下的 `.env`(忽略注释与空行): 14 +本项目历史上使用 Conda 管理环境;目前推荐使用 **venv**(更轻量、对 CI/容器更友好)。项目根目录下的 `activate.sh` 已升级为 **优先激活 `.venv`,并兼容 Conda 回退**,且会自动加载项目根目录下的 `.env`(忽略注释与空行)。
  15 +
  16 +### 2.1 venv(推荐)
  17 +
  18 +首次创建 venv:
  19 +
  20 +```bash
  21 +cd /data/saas-search
  22 +./scripts/create_venv.sh
  23 +source activate.sh
  24 +```
  25 +
  26 +如需运行本地 embedding / 图像编码服务(torch/transformers 等依赖较重):
15 27
16 ```bash 28 ```bash
17 cd /data/saas-search 29 cd /data/saas-search
  30 +INSTALL_ML=1 ./scripts/create_venv.sh
18 source activate.sh 31 source activate.sh
19 ``` 32 ```
20 33
21 -`activate.sh` 会激活 Conda 环境 `searchengine`。若在新机器上部署,请先设置本机 Conda 路径再执行: 34 +日常使用:
  35 +
  36 +```bash
  37 +cd /data/saas-search
  38 +source activate.sh
  39 +```
  40 +
  41 +### 2.2 Conda(兼容旧流程)
  42 +
  43 +`activate.sh` 会在未发现 `.venv` 时回退激活 Conda 环境 `searchengine`。若在新机器上部署,请先设置本机 Conda 路径再执行:
22 44
23 ```bash 45 ```bash
24 # 你的 conda 在 ~/anaconda3/bin/conda,则 CONDA_ROOT=~/anaconda3 46 # 你的 conda 在 ~/anaconda3/bin/conda,则 CONDA_ROOT=~/anaconda3
@@ -26,7 +48,7 @@ export CONDA_ROOT=$HOME/anaconda3 # 或你的 Conda 安装路径(如 /home/u @@ -26,7 +48,7 @@ export CONDA_ROOT=$HOME/anaconda3 # 或你的 Conda 安装路径(如 /home/u
26 source activate.sh 48 source activate.sh
27 ``` 49 ```
28 50
29 -**新机器首次部署(创建环境)**:若本机尚未创建 `searchengine` 环境,任选其一: 51 +**新机器首次部署(创建 Conda 环境)**:若本机尚未创建 `searchengine` 环境,任选其一:
30 52
31 - **方式 A(推荐,与 environment.yml 一致)**: 53 - **方式 A(推荐,与 environment.yml 一致)**:
32 ```bash 54 ```bash
@@ -82,7 +104,7 @@ DB_PASSWORD=P89cZHS5d7dFyc9R @@ -82,7 +104,7 @@ DB_PASSWORD=P89cZHS5d7dFyc9R
82 104
83 # Elasticsearch 105 # Elasticsearch
84 ES_HOST=http://localhost:9200 106 ES_HOST=http://localhost:9200
85 -ES_USERNAME=essa 107 +ES_USERNAME=saas
86 ES_PASSWORD=4hOaLaf41y2VuI8y 108 ES_PASSWORD=4hOaLaf41y2VuI8y
87 109
88 # Redis(可选) 110 # Redis(可选)
@@ -105,7 +127,7 @@ API_PORT=6002 @@ -105,7 +127,7 @@ API_PORT=6002
105 | 项目 | 值 | 127 | 项目 | 值 |
106 |------|----| 128 |------|----|
107 | **MySQL** | host `120.79.247.228`, port `3316`, user `saas`, password `P89cZHS5d7dFyc9R` | 129 | **MySQL** | host `120.79.247.228`, port `3316`, user `saas`, password `P89cZHS5d7dFyc9R` |
108 -| **Elasticsearch** | host `http://localhost:9200`, user `essa`, password `4hOaLaf41y2VuI8y` | 130 +| **Elasticsearch** | host `http://localhost:9200`, user `saas`, password `4hOaLaf41y2VuI8y` |
109 | **Redis(可选)** | host `localhost`, port `6479`, password `BMfv5aI31kgHWtlx` | 131 | **Redis(可选)** | host `localhost`, port `6479`, password `BMfv5aI31kgHWtlx` |
110 | **DeepL** | `c9293ab4-ad25-479b-919f-ab4e63b429ed` | 132 | **DeepL** | `c9293ab4-ad25-479b-919f-ab4e63b429ed` |
111 133
frontend/README.md
@@ -82,7 +82,7 @@ bash scripts/start_backend.sh @@ -82,7 +82,7 @@ bash scripts/start_backend.sh
82 ``` 82 ```
83 83
84 2. **访问前端**: 84 2. **访问前端**:
85 -打开浏览器访问:`http://120.76.41.98:6002/` 85 +打开浏览器访问:`http://43.166.252.75:6002/`
86 86
87 ### 搜索示例 87 ### 搜索示例
88 88
@@ -97,7 +97,7 @@ bash scripts/start_backend.sh @@ -97,7 +97,7 @@ bash scripts/start_backend.sh
97 前端通过以下接口与后端通信: 97 前端通过以下接口与后端通信:
98 98
99 ```javascript 99 ```javascript
100 -POST http://120.76.41.98:6002/search/ 100 +POST http://43.166.252.75:6002/search/
101 101
102 请求体: 102 请求体:
103 { 103 {
@@ -198,7 +198,7 @@ categoryPath.set(categoryLang, translationCategoryPath) @@ -198,7 +198,7 @@ categoryPath.set(categoryLang, translationCategoryPath)
198 你当前要使用的翻译接口(Python 侧): 198 你当前要使用的翻译接口(Python 侧):
199 199
200 ```bash 200 ```bash
201 -curl -X POST http://120.76.41.98:6006/translate \ 201 +curl -X POST http://43.166.252.75:6006/translate \
202 -H "Content-Type: application/json" \ 202 -H "Content-Type: application/json" \
203 -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣", 203 -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣",
204 "target_lang":"en", 204 "target_lang":"en",
indexer/prompts.txt
@@ -22,7 +22,7 @@ java索引程序职责: @@ -22,7 +22,7 @@ java索引程序职责:
22 本模块: 22 本模块:
23 负责 MySQL 基础数据 → 索引结构的doc (包括缓存) 23 负责 MySQL 基础数据 → 索引结构的doc (包括缓存)
24 24
25 -翻译接口: curl -X POST http://120.76.41.98:6006/translate -H "Content-Type: application/json" -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣","target_lang":"en","source_lang":"auto"}' 25 +翻译接口: curl -X POST http://43.166.252.75:6006/translate -H "Content-Type: application/json" -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣","target_lang":"en","source_lang":"auto"}'
26 26
27 java的组织doc的逻辑都需要迁移过来。 27 java的组织doc的逻辑都需要迁移过来。
28 28
query/query_parser.py
@@ -8,7 +8,6 @@ from typing import Dict, List, Optional, Any, Union @@ -8,7 +8,6 @@ from typing import Dict, List, Optional, Any, Union
8 import numpy as np 8 import numpy as np
9 import logging 9 import logging
10 import re 10 import re
11 -import hanlp  
12 from concurrent.futures import Future, ThreadPoolExecutor, as_completed 11 from concurrent.futures import Future, ThreadPoolExecutor, as_completed
13 12
14 from embeddings import BgeEncoder 13 from embeddings import BgeEncoder
@@ -19,6 +18,10 @@ from .query_rewriter import QueryRewriter, QueryNormalizer @@ -19,6 +18,10 @@ from .query_rewriter import QueryRewriter, QueryNormalizer
19 18
20 logger = logging.getLogger(__name__) 19 logger = logging.getLogger(__name__)
21 20
  21 +try:
  22 + import hanlp # type: ignore
  23 +except Exception: # pragma: no cover
  24 + hanlp = None
22 25
23 class ParsedQuery: 26 class ParsedQuery:
24 """Container for parsed query results.""" 27 """Container for parsed query results."""
@@ -94,12 +97,22 @@ class QueryParser: @@ -94,12 +97,22 @@ class QueryParser:
94 self.language_detector = LanguageDetector() 97 self.language_detector = LanguageDetector()
95 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) 98 self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary)
96 99
97 - # Initialize HanLP components at startup  
98 - logger.info("Initializing HanLP components...")  
99 - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)  
100 - self._tok.config.output_spans = True  
101 - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)  
102 - logger.info("HanLP components initialized") 100 + # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer.
  101 + self._tok = None
  102 + self._pos_tag = None
  103 + if hanlp is not None:
  104 + try:
  105 + logger.info("Initializing HanLP components...")
  106 + self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
  107 + self._tok.config.output_spans = True
  108 + self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
  109 + logger.info("HanLP components initialized")
  110 + except Exception as e:
  111 + logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}")
  112 + self._tok = None
  113 + self._pos_tag = None
  114 + else:
  115 + logger.info("HanLP not installed; using simple tokenizer")
103 116
104 @property 117 @property
105 def text_encoder(self) -> BgeEncoder: 118 def text_encoder(self) -> BgeEncoder:
@@ -121,32 +134,51 @@ class QueryParser: @@ -121,32 +134,51 @@ class QueryParser:
121 translation_context=self.config.query_config.translation_context 134 translation_context=self.config.query_config.translation_context
122 ) 135 )
123 return self._translator 136 return self._translator
  137 +
  138 + def _simple_tokenize(self, text: str) -> List[str]:
  139 + """
  140 + Lightweight tokenizer fallback.
  141 +
  142 + - Groups consecutive CJK chars as a token
  143 + - Groups consecutive latin/digits/underscore/dash as a token
  144 + """
  145 + if not text:
  146 + return []
  147 + pattern = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*")
  148 + return pattern.findall(text)
124 149
125 def _extract_keywords(self, query: str) -> str: 150 def _extract_keywords(self, query: str) -> str:
126 """Extract keywords (nouns with length > 1) from query.""" 151 """Extract keywords (nouns with length > 1) from query."""
127 - tok_result = self._tok(query)  
128 - if not tok_result:  
129 - return ""  
130 -  
131 - words = [x[0] for x in tok_result]  
132 - pos_tags = self._pos_tag(words)  
133 -  
134 - keywords = []  
135 - for word, pos in zip(words, pos_tags):  
136 - if len(word) > 1 and pos.startswith('N'):  
137 - keywords.append(word)  
138 - 152 + if self._tok is not None and self._pos_tag is not None:
  153 + tok_result = self._tok(query)
  154 + if not tok_result:
  155 + return ""
  156 + words = [x[0] for x in tok_result]
  157 + pos_tags = self._pos_tag(words)
  158 + keywords = []
  159 + for word, pos in zip(words, pos_tags):
  160 + if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"):
  161 + keywords.append(word)
  162 + return " ".join(keywords)
  163 +
  164 + # Fallback: treat tokens with length > 1 as "keywords"
  165 + tokens = self._simple_tokenize(query)
  166 + keywords = [t for t in tokens if len(t) > 1]
139 return " ".join(keywords) 167 return " ".join(keywords)
140 168
141 def _get_token_count(self, query: str) -> int: 169 def _get_token_count(self, query: str) -> int:
142 - """Get token count using HanLP."""  
143 - tok_result = self._tok(query)  
144 - return len(tok_result) if tok_result else 0 170 + """Get token count (HanLP if available, otherwise simple)."""
  171 + if self._tok is not None:
  172 + tok_result = self._tok(query)
  173 + return len(tok_result) if tok_result else 0
  174 + return len(self._simple_tokenize(query))
145 175
146 def _get_query_tokens(self, query: str) -> List[str]: 176 def _get_query_tokens(self, query: str) -> List[str]:
147 - """Get token list using HanLP."""  
148 - tok_result = self._tok(query)  
149 - return [x[0] for x in tok_result] if tok_result else [] 177 + """Get token list (HanLP if available, otherwise simple)."""
  178 + if self._tok is not None:
  179 + tok_result = self._tok(query)
  180 + return [x[0] for x in tok_result] if tok_result else []
  181 + return self._simple_tokenize(query)
150 182
151 def parse( 183 def parse(
152 self, 184 self,
@@ -12,14 +12,13 @@ pandas>=2.0.0 @@ -12,14 +12,13 @@ pandas>=2.0.0
12 # Elasticsearch 12 # Elasticsearch
13 elasticsearch>=8.0.0,<9.0.0 13 elasticsearch>=8.0.0,<9.0.0
14 14
15 -# ML/Embeddings  
16 -torch>=2.0.0  
17 -sentence-transformers>=2.2.0  
18 -transformers>=4.30.0  
19 -modelscope>=1.9.0  
20 -cn-clip>=1.5.0 15 +# Redis (cache; used by translator/embeddings)
  16 +redis>=5.0.0
  17 +
  18 +# Math / vector utilities (used across modules)
21 numpy>=1.24.0 19 numpy>=1.24.0
22 -pillow>=10.0.0 20 +
  21 +# LLM/Translation clients
23 openai>=1.0.0 22 openai>=1.0.0
24 23
25 # API 24 # API
requirements_ml.txt 0 → 100644
@@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
  1 +# Optional heavy dependencies for local embedding/image encoding.
  2 +#
  3 +# Install when you need:
  4 +# - `./scripts/start_embedding_service.sh` (local embeddings server)
  5 +# - local BGE-M3 / CN-CLIP inference
  6 +#
  7 +# Notes:
  8 +# - `torch` wheels can be very large; if you want CPU-only wheels,
  9 +# consider installing torch separately with the official CPU index.
  10 +#
  11 +torch>=2.0.0
  12 +sentence-transformers>=2.2.0
  13 +transformers>=4.30.0
  14 +modelscope>=1.9.0
  15 +cn-clip>=1.5.0
  16 +pillow>=10.0.0
scripts/create_venv.sh 0 → 100644
@@ -0,0 +1,59 @@ @@ -0,0 +1,59 @@
  1 +#!/bin/bash
  2 +#
  3 +# Create and initialize Python venv for saas-search.
  4 +#
  5 +# Usage:
  6 +# ./scripts/create_venv.sh
  7 +#
  8 +set -euo pipefail
  9 +
  10 +PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
  11 +cd "${PROJECT_ROOT}"
  12 +
  13 +VENV_DIR="${PROJECT_ROOT}/.venv"
  14 +
  15 +PYTHON_BIN="${PYTHON_BIN:-python3.10}"
  16 +if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
  17 + PYTHON_BIN="python3"
  18 +fi
  19 +
  20 +echo "Using python: $(${PYTHON_BIN} --version)"
  21 +
  22 +if ! "${PYTHON_BIN}" -c "import ensurepip" >/dev/null 2>&1; then
  23 + echo "ERROR: ensurepip is not available for ${PYTHON_BIN}." >&2
  24 + echo "On Ubuntu/Debian, install the venv package first, e.g.:" >&2
  25 + echo " sudo apt-get update -y && sudo apt-get install -y python3-venv" >&2
  26 + echo "If you are using Python 3.12 specifically, you may need:" >&2
  27 + echo " sudo apt-get install -y python3.12-venv" >&2
  28 + exit 1
  29 +fi
  30 +
  31 +if [[ -d "${VENV_DIR}" ]]; then
  32 + if [[ -f "${VENV_DIR}/bin/activate" ]]; then
  33 + echo "venv already exists at ${VENV_DIR}"
  34 + else
  35 + echo "Found incomplete venv at ${VENV_DIR}, recreating..."
  36 + rm -rf "${VENV_DIR}"
  37 + "${PYTHON_BIN}" -m venv "${VENV_DIR}"
  38 + fi
  39 +else
  40 + echo "Creating venv at ${VENV_DIR} ..."
  41 + "${PYTHON_BIN}" -m venv "${VENV_DIR}"
  42 +fi
  43 +
  44 +# shellcheck disable=SC1091
  45 +source "${VENV_DIR}/bin/activate"
  46 +
  47 +python -m pip install --upgrade pip setuptools wheel
  48 +python -m pip install -r requirements.txt
  49 +
  50 +if [[ "${INSTALL_ML:-0}" == "1" ]]; then
  51 + echo
  52 + echo "INSTALL_ML=1 detected. Installing optional ML dependencies..."
  53 + python -m pip install -r requirements_ml.txt
  54 +fi
  55 +
  56 +echo
  57 +echo "Done."
  58 +echo "Next:"
  59 +echo " source activate.sh"
scripts/mock_data.sh
@@ -20,9 +20,7 @@ @@ -20,9 +20,7 @@
20 # ============================================================================ 20 # ============================================================================
21 21
22 cd "$(dirname "$0")/.." 22 cd "$(dirname "$0")/.."
23 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
24 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
25 -conda activate searchengine 23 +source ./activate.sh
26 24
27 GREEN='\033[0;32m' 25 GREEN='\033[0;32m'
28 YELLOW='\033[1;33m' 26 YELLOW='\033[1;33m'
@@ -33,13 +31,6 @@ echo -e "${GREEN}========================================${NC}" @@ -33,13 +31,6 @@ echo -e "${GREEN}========================================${NC}"
33 echo -e "${GREEN}Mock Data Script${NC}" 31 echo -e "${GREEN}Mock Data Script${NC}"
34 echo -e "${GREEN}========================================${NC}" 32 echo -e "${GREEN}========================================${NC}"
35 33
36 -# Load config from .env file if it exists  
37 -if [ -f .env ]; then  
38 - set -a  
39 - source .env  
40 - set +a  
41 -fi  
42 -  
43 # ============================================================================ 34 # ============================================================================
44 # 写死的配置参数(不需要配置化,这是测试数据构造脚本) 35 # 写死的配置参数(不需要配置化,这是测试数据构造脚本)
45 # ============================================================================ 36 # ============================================================================
scripts/start_backend.sh
@@ -5,9 +5,7 @@ @@ -5,9 +5,7 @@
5 set -e 5 set -e
6 6
7 cd "$(dirname "$0")/.." 7 cd "$(dirname "$0")/.."
8 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
9 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
10 -conda activate searchengine 8 +source ./activate.sh
11 9
12 GREEN='\033[0;32m' 10 GREEN='\033[0;32m'
13 YELLOW='\033[1;33m' 11 YELLOW='\033[1;33m'
@@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}" @@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}"
17 echo -e "${GREEN}Starting Backend API Service${NC}" 15 echo -e "${GREEN}Starting Backend API Service${NC}"
18 echo -e "${GREEN}========================================${NC}" 16 echo -e "${GREEN}========================================${NC}"
19 17
20 -# Load config from .env file if it exists  
21 -if [ -f .env ]; then  
22 - set -a  
23 - source .env  
24 - set +a  
25 -fi  
26 -  
27 echo -e "\n${YELLOW}Configuration:${NC}" 18 echo -e "\n${YELLOW}Configuration:${NC}"
28 echo " API Host: ${API_HOST:-0.0.0.0}" 19 echo " API Host: ${API_HOST:-0.0.0.0}"
29 echo " API Port: ${API_PORT:-6002}" 20 echo " API Port: ${API_PORT:-6002}"
scripts/start_embedding_service.sh
@@ -12,12 +12,7 @@ set -e @@ -12,12 +12,7 @@ set -e
12 12
13 cd "$(dirname "$0")/.." 13 cd "$(dirname "$0")/.."
14 14
15 -# Load conda env if available (keep consistent with other scripts)  
16 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
17 -if [ -f "$CONDA_ROOT/etc/profile.d/conda.sh" ]; then  
18 - source "$CONDA_ROOT/etc/profile.d/conda.sh"  
19 - conda activate searchengine  
20 -fi 15 +source ./activate.sh
21 16
22 EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)") 17 EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)")
23 EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)") 18 EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)")
scripts/start_frontend.sh
@@ -5,9 +5,7 @@ @@ -5,9 +5,7 @@
5 set -e 5 set -e
6 6
7 cd "$(dirname "$0")/.." 7 cd "$(dirname "$0")/.."
8 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
9 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
10 -conda activate searchengine 8 +source ./activate.sh
11 9
12 GREEN='\033[0;32m' 10 GREEN='\033[0;32m'
13 YELLOW='\033[1;33m' 11 YELLOW='\033[1;33m'
scripts/start_indexer.sh
@@ -5,9 +5,7 @@ @@ -5,9 +5,7 @@
5 set -e 5 set -e
6 6
7 cd "$(dirname "$0")/.." 7 cd "$(dirname "$0")/.."
8 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
9 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
10 -conda activate searchengine 8 +source ./activate.sh
11 9
12 GREEN='\033[0;32m' 10 GREEN='\033[0;32m'
13 YELLOW='\033[1;33m' 11 YELLOW='\033[1;33m'
@@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}" @@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}"
17 echo -e "${GREEN}Starting Indexer API Service${NC}" 15 echo -e "${GREEN}Starting Indexer API Service${NC}"
18 echo -e "${GREEN}========================================${NC}" 16 echo -e "${GREEN}========================================${NC}"
19 17
20 -# Load config from .env file if it exists  
21 -if [ -f .env ]; then  
22 - set -a  
23 - source .env  
24 - set +a  
25 -fi  
26 -  
27 echo -e "\n${YELLOW}Configuration:${NC}" 18 echo -e "\n${YELLOW}Configuration:${NC}"
28 echo " INDEXER Host: ${INDEXER_HOST:-0.0.0.0}" 19 echo " INDEXER Host: ${INDEXER_HOST:-0.0.0.0}"
29 echo " INDEXER Port: ${INDEXER_PORT:-6004}" 20 echo " INDEXER Port: ${INDEXER_PORT:-6004}"
scripts/tenant3__csv_to_shoplazza_xlsx.sh
1 -# 激活环境  
2 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
3 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
4 -conda activate searchengine 1 +#!/bin/bash
  2 +set -e
  3 +
  4 +cd "$(dirname "$0")/.."
  5 +source ./activate.sh
5 6
6 # # 基本使用(生成所有数据) 7 # # 基本使用(生成所有数据)
7 # python scripts/csv_to_excel.py 8 # python scripts/csv_to_excel.py
1 #!/bin/bash 1 #!/bin/bash
2 2
3 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"  
4 -source "$CONDA_ROOT/etc/profile.d/conda.sh"  
5 -  
6 # saas-search Setup and Startup Script 3 # saas-search Setup and Startup Script
7 # This script sets up the environment and starts all services 4 # This script sets up the environment and starts all services
8 5
@@ -22,24 +19,16 @@ echo -e "${GREEN}========================================${NC}" @@ -22,24 +19,16 @@ echo -e "${GREEN}========================================${NC}"
22 cd "$(dirname "$0")" 19 cd "$(dirname "$0")"
23 PROJECT_ROOT=$(pwd) 20 PROJECT_ROOT=$(pwd)
24 21
25 -echo -e "\n${YELLOW}Step 1: Setting up Conda environment${NC}"  
26 -# Check if conda is available  
27 -if ! command -v conda &> /dev/null; then  
28 - echo -e "${RED}Error: conda not found. Please install Miniconda or Anaconda${NC}"  
29 - exit 1  
30 -fi 22 +echo -e "\n${YELLOW}Step 1: Setting up Python environment (venv preferred)${NC}"
31 23
32 -# Check if environment exists  
33 -if conda env list | grep -q "searchengine"; then  
34 - echo -e "${GREEN}Environment 'searchengine' already exists${NC}"  
35 - conda activate searchengine  
36 -else  
37 - echo -e "${YELLOW}Creating conda environment 'searchengine'...${NC}"  
38 - conda env create -f environment.yml  
39 - conda activate searchengine  
40 - echo -e "${GREEN}Environment created successfully!${NC}" 24 +if [ ! -f "${PROJECT_ROOT}/.venv/bin/activate" ]; then
  25 + echo -e "${YELLOW}Creating venv and installing dependencies...${NC}"
  26 + ./scripts/create_venv.sh
41 fi 27 fi
42 28
  29 +# Activate environment + load .env
  30 +source ./activate.sh
  31 +
43 # Verify environment 32 # Verify environment
44 echo -e "\n${YELLOW}Current Python version:${NC}" 33 echo -e "\n${YELLOW}Current Python version:${NC}"
45 python --version 34 python --version
@@ -74,7 +63,7 @@ echo -e "${GREEN}Setup Complete!${NC}" @@ -74,7 +63,7 @@ echo -e "${GREEN}Setup Complete!${NC}"
74 echo -e "${GREEN}========================================${NC}" 63 echo -e "${GREEN}========================================${NC}"
75 echo "" 64 echo ""
76 echo -e "Next steps:" 65 echo -e "Next steps:"
77 -echo -e " 1. Ingest data: ${YELLOW}./scripts/ingest.sh${NC}"  
78 -echo -e " 2. Start backend: ${YELLOW}./scripts/start_backend.sh${NC}" 66 +echo -e " 1. Start backend: ${YELLOW}./scripts/start_backend.sh${NC}"
  67 +echo -e " 2. Start indexer: ${YELLOW}./scripts/start_indexer.sh${NC}"
79 echo -e " 3. Start frontend: ${YELLOW}./scripts/start_frontend.sh${NC}" 68 echo -e " 3. Start frontend: ${YELLOW}./scripts/start_frontend.sh${NC}"
80 echo "" 69 echo ""