From 484adbfe9ccfae39f5ce92c0173011d724402d47 Mon Sep 17 00:00:00 2001 From: tangwang Date: Fri, 6 Mar 2026 18:50:20 +0800 Subject: [PATCH] adapt ubuntu; conda -> venv --- .env | 4 ++-- .gitignore | 1 + README.md | 15 +++++++++------ activate.sh | 55 +++++++++++++++++++++++++++++++++++++++++++++++-------- docs/CNCLIP_SERVICE说明文档.md | 2 +- docs/ES/ES_8.18/1_ES配置和使用.md | 2 +- docs/ES/ES_8.18/2_kibana安装.md | 2 +- docs/Usage-Guide.md | 11 ++++++++++- docs/temporary/sku_image_src问题诊断报告.md | 2 +- docs/常用查询 - ES.md | 50 +++++++++++++++++++++++++------------------------- docs/搜索API对接指南.md | 8 ++++---- docs/环境配置说明.md | 32 +++++++++++++++++++++++++++----- frontend/README.md | 4 ++-- indexer/README.md | 2 +- indexer/prompts.txt | 2 +- query/query_parser.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------- requirements.txt | 13 ++++++------- requirements_ml.txt | 16 ++++++++++++++++ scripts/create_venv.sh | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/mock_data.sh | 11 +---------- scripts/start_backend.sh | 11 +---------- scripts/start_embedding_service.sh | 7 +------ scripts/start_frontend.sh | 4 +--- scripts/start_indexer.sh | 11 +---------- scripts/tenant3__csv_to_shoplazza_xlsx.sh | 9 +++++---- setup.sh | 29 +++++++++-------------------- 26 files changed, 290 insertions(+), 154 deletions(-) create mode 100644 requirements_ml.txt create mode 100644 scripts/create_venv.sh diff --git a/.env b/.env index 4370753..47cb794 100644 --- a/.env +++ b/.env @@ -1,6 +1,6 @@ # Elasticsearch Configuration ES_HOST=http://localhost:9200 -ES_USERNAME=essa +ES_USERNAME=saas ES_PASSWORD=4hOaLaf41y2VuI8y # Redis Configuration (Optional) @@ -30,7 +30,7 @@ IMAGE_MODEL_DIR=/data/tw/models/cn-clip # 已经改为web请求了,不使用 CACHE_DIR=.cache # Frontend API Base URL -API_BASE_URL=http://120.76.41.98:6002 +API_BASE_URL=http://43.166.252.75:6002 DASHSCOPE_API_KEY=sk-c3b8d4db061840aa8effb748df2a997b diff --git a/.gitignore b/.gitignore index 095cc8c..98f368c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ __pycache__ .history.txt log/ logs/ +.venv/ nohup.out temp/ indexer_input* diff --git a/README.md b/README.md index 34768fc..573f9d7 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ query anchor 对外: embedding服务: - curl -X POST http://120.76.41.98:6005/embed/text \ + curl -X POST http://43.166.252.75:6005/embed/text \ -H "Content-Type: application/json" \ -d '["衣服", "Bohemian Maxi Dress"]' @@ -91,7 +91,7 @@ localhost替换为 服务器内网地址: 10.0.163.168 公网地址: -120.76.41.98 +43.166.252.75 # 电商搜索引擎 SaaS @@ -101,15 +101,18 @@ localhost替换为 ## 项目环境 -以项目根目录的 **`activate.sh`** 为准(会激活 Conda 环境 `searchengine` 并加载 `.env`): +以项目根目录的 **`activate.sh`** 为准(**优先激活 venv:`./.venv`,并加载 `.env`;兼容 Conda 回退**): ```bash -# 若在新机器且 Conda 不在默认路径,先设置: -# - 你的 conda 是 ~/anaconda3/bin/conda,则:export CONDA_ROOT=$HOME/anaconda3 +# 推荐:首次创建 venv(默认安装基础依赖) +./scripts/create_venv.sh + +# 如需本地向量/图片编码(会安装 torch/transformers 等重依赖) +# INSTALL_ML=1 ./scripts/create_venv.sh source activate.sh ``` -新机器首次需创建环境,见 `docs/环境配置说明.md`(`conda env create -f environment.yml` 或 `pip install -r requirements.txt`)。 +新机器首次需创建环境,见 `docs/环境配置说明.md`(推荐 venv;Conda 为兼容旧流程)。 ## 测试pipeline diff --git a/activate.sh b/activate.sh index e2c8aa3..7e4555a 100644 --- a/activate.sh +++ b/activate.sh @@ -1,15 +1,54 @@ #!/bin/bash -# 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径 -# 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +# +# Unified environment activator (venv preferred, conda fallback). +# +# Usage: +# source activate.sh +# +# Priority: +# 1) ./.venv (Python venv) +# 2) conda env "searchengine" (legacy) +# + +# Must be sourced +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + echo "ERROR: Please source this script: source activate.sh" >&2 + exit 1 +fi + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# 1) venv (preferred) +VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate" +if [[ -f "${VENV_ACTIVATE}" ]]; then + # shellcheck disable=SC1090 + source "${VENV_ACTIVATE}" + ENV_KIND="venv" +else + # 2) conda fallback (legacy) + # 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径 + # 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3 + CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" + if [[ -f "${CONDA_ROOT}/etc/profile.d/conda.sh" ]]; then + # shellcheck disable=SC1091 + source "${CONDA_ROOT}/etc/profile.d/conda.sh" + conda activate searchengine + ENV_KIND="conda" + else + echo "ERROR: No .venv found and conda.sh not found at ${CONDA_ROOT}/etc/profile.d/conda.sh" >&2 + echo " - Create venv: ./scripts/create_venv.sh" >&2 + echo " - Or set CONDA_ROOT to your conda install path" >&2 + return 1 + fi +fi # 如果需要加载 .env 中的环境变量 -if [ -f .env ]; then +ENV_FILE="${PROJECT_ROOT}/.env" +if [ -f "${ENV_FILE}" ]; then set -a # 自动导出所有变量 - source <(grep -v '^#' .env | grep -v '^$' | sed 's/#.*$//' | sed 's/\r$//') + # NOTE: This loader tolerates comments/blank lines and strips inline comments. + source <(grep -v '^#' "${ENV_FILE}" | grep -v '^$' | sed 's/#.*$//' | sed 's/\r$//') set +a # 关闭自动导出 fi -echo "Environment activated: searchengine" +echo "Environment activated (${ENV_KIND}): ${VIRTUAL_ENV:-${CONDA_DEFAULT_ENV:-unknown}}" diff --git a/docs/CNCLIP_SERVICE说明文档.md b/docs/CNCLIP_SERVICE说明文档.md index c9ca65e..208f4eb 100644 --- a/docs/CNCLIP_SERVICE说明文档.md +++ b/docs/CNCLIP_SERVICE说明文档.md @@ -12,7 +12,7 @@ normlize后的结果: https://aisearch.cdn.bcebos.com/fileManager/GtB5doGAr1skTx38P7fb7Q/182.jpg?authorization=bce-auth-v1%2F7e22d8caf5af46cc9310f1e3021709f3%2F2025-12-30T04%3A45%3A38Z%2F86400%2Fhost%2Ffe222039926cb7ff593021af40268c782b8892598114e24773d0c1bfc976a8df https://oss.essa.cn/2e353867-7496-4d4e-a7c8-0af50f49f6eb.jpg?x-oss-process=image/resize,m_lfit,w_2048,h_2048 -curl -X POST "http://120.76.41.98:5000/embedding/generate_image_embeddings" -H "Content-Type: application/json" -d '[ +curl -X POST "http://43.166.252.75:5000/embedding/generate_image_embeddings" -H "Content-Type: application/json" -d '[ { "id": "test_1", "pic_url": "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg" diff --git a/docs/ES/ES_8.18/1_ES配置和使用.md b/docs/ES/ES_8.18/1_ES配置和使用.md index 569f12f..6debb96 100644 --- a/docs/ES/ES_8.18/1_ES配置和使用.md +++ b/docs/ES/ES_8.18/1_ES配置和使用.md @@ -2,7 +2,7 @@ ## 相关链接 - 接口文档:http://rap.essa.top:88/workspace/myWorkspace.do?projectId=78#2187 -- Kibana 控制台:http://120.76.41.98:5601/app/dev_tools#/console/shell +- Kibana 控制台:http://43.166.252.75:5601/app/dev_tools#/console/shell ## 分词方面 diff --git a/docs/ES/ES_8.18/2_kibana安装.md b/docs/ES/ES_8.18/2_kibana安装.md index e9c7a31..cfef31b 100644 --- a/docs/ES/ES_8.18/2_kibana安装.md +++ b/docs/ES/ES_8.18/2_kibana安装.md @@ -47,5 +47,5 @@ systemctl enable kibana ``` 在阿里云上面配置允许访问5601端口后,可以浏览器打开: -http://120.76.41.98:5601/ +http://43.166.252.75:5601/ diff --git a/docs/Usage-Guide.md b/docs/Usage-Guide.md index 4f51214..eb1671b 100644 --- a/docs/Usage-Guide.md +++ b/docs/Usage-Guide.md @@ -27,10 +27,19 @@ #### 1. 安装 Python 依赖与激活环境 -**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。新机器部署时若 Conda 不在默认路径,请先设置 `CONDA_ROOT`(例如你的 conda 是 `~/anaconda3/bin/conda`,则 `export CONDA_ROOT=$HOME/anaconda3`)。详见 `docs/环境配置说明.md`。 +**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。目前推荐 venv(`.venv`);Conda 仅作为兼容回退(需要 `CONDA_ROOT`)。详见 `docs/环境配置说明.md`。 ```bash cd /data/saas-search +./scripts/create_venv.sh # 首次创建 venv(只需执行一次) +source activate.sh +``` + +如果需要本地 embedding / 图像编码(会安装 torch/transformers 等较重依赖): + +```bash +cd /data/saas-search +INSTALL_ML=1 ./scripts/create_venv.sh source activate.sh ``` diff --git a/docs/temporary/sku_image_src问题诊断报告.md b/docs/temporary/sku_image_src问题诊断报告.md index 9a98b98..ddd3d31 100644 --- a/docs/temporary/sku_image_src问题诊断报告.md +++ b/docs/temporary/sku_image_src问题诊断报告.md @@ -98,7 +98,7 @@ else: 2. **验证修复**:重新索引后,查询 ES 验证 `image_src` 字段是否已包含: ```bash - curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \ + curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' \ -H 'Content-Type: application/json' \ -d '{ "size": 1, diff --git a/docs/常用查询 - ES.md b/docs/常用查询 - ES.md index 4be7823..91e90e9 100644 --- a/docs/常用查询 - ES.md +++ b/docs/常用查询 - ES.md @@ -8,7 +8,7 @@ # 一般情况下不需要在查询中再按 tenant_id 过滤(可选保留用于排查)。 ### 1. 根据 tenant_id / spu_id 查询 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 11, "_source": ["title"], "query": { @@ -21,7 +21,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 100, "_source": ["title"], "query": { @@ -30,7 +30,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 5, "_source": ["title", "keywords", "tags"], "query": { @@ -43,7 +43,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 1, "_source": ["title", "keywords", "tags"], "query": { @@ -65,7 +65,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 1, "_source": ["title"], "query": { @@ -86,17 +86,17 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ } }' -Curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ +Curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ "analyzer": "index_ansj", "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_analyze' -H 'Content-Type: application/json' -d '{ "analyzer": "query_ansj", "text": "14寸第4代-眼珠实身冰雪公仔带手动大推车,搪胶雪宝宝" }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 100, "from": 0, "query": { @@ -131,7 +131,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_ } }' -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 1, "from": 0, "query": { @@ -258,7 +258,7 @@ GET /search_products_tenant_2/_search } -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "size": 5, "query": { "bool": { @@ -271,7 +271,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/ ### 2. 统计租户的总文档数 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_count?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_170/_count?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} } @@ -285,7 +285,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te ## 1. 检查ES文档的分面字段数据 ### 1.1 查询特定租户的商品,显示分面相关字段 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "term": { "tenant_id": "162" @@ -306,7 +306,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 1.2 验证category1_name字段是否有数据 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -319,7 +319,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 1.3 验证specifications字段是否有数据 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -334,7 +334,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te ## 2. 分面聚合查询(Facet Aggregations) ### 2.1 category1_name 分面聚合 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, @@ -350,7 +350,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 2.2 specifications.color 分面聚合 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, @@ -382,7 +382,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 2.3 specifications.size 分面聚合 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, @@ -414,7 +414,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 2.4 specifications.material 分面聚合 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, @@ -446,7 +446,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 2.5 综合分面聚合(category + color + size + material) -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "match_all": {} }, @@ -530,7 +530,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te ## 3. 检查specifications嵌套字段的详细结构 ### 3.1 查看specifications的name字段有哪些值 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "term": { "tenant_id": "162" @@ -555,7 +555,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s }' ### 3.2 查看某个商品的完整specifications数据 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -571,7 +571,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products/_s ## 4. 统计查询 ### 4.1 统计有category1_name的文档数量 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -582,7 +582,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 4.2 统计有specifications的文档数量 -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_count?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -596,7 +596,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te ## 5. 诊断问题场景 ### 5.1 查找没有category1_name但有category的文档(MySQL有数据但ES没有) -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ @@ -612,7 +612,7 @@ curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_te }' ### 5.2 查找有option但没有specifications的文档(数据转换问题) -curl -u 'essa:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ +curl -u 'saas:4hOaLaf41y2VuI8y' -X GET 'http://localhost:9200/search_products_tenant_162/_search?pretty' -H 'Content-Type: application/json' -d '{ "query": { "bool": { "filter": [ diff --git a/docs/搜索API对接指南.md b/docs/搜索API对接指南.md index ee4ae55..9bcf1c4 100644 --- a/docs/搜索API对接指南.md +++ b/docs/搜索API对接指南.md @@ -64,7 +64,7 @@ ### 1.1 基础信息 -- **Base URL**: `http://120.76.41.98:6002` +- **Base URL**: `http://43.166.252.75:6002` - **协议**: HTTP/HTTPS - **数据格式**: JSON - **字符编码**: UTF-8 @@ -75,7 +75,7 @@ ### 1.2 最简单的搜索请求 ```bash -curl -X POST "http://120.76.41.98:6002/search/" \ +curl -X POST "http://43.166.252.75:6002/search/" \ -H "Content-Type: application/json" \ -H "X-Tenant-ID: 162" \ -d '{"query": "芭比娃娃"}' @@ -84,7 +84,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ ### 1.3 带过滤与分页的搜索 ```bash -curl -X POST "http://120.76.41.98:6002/search/" \ +curl -X POST "http://43.166.252.75:6002/search/" \ -H "Content-Type: application/json" \ -H "X-Tenant-ID: 162" \ -d '{ @@ -108,7 +108,7 @@ curl -X POST "http://120.76.41.98:6002/search/" \ ### 1.4 开启分面的搜索 ```bash -curl -X POST "http://120.76.41.98:6002/search/" \ +curl -X POST "http://43.166.252.75:6002/search/" \ -H "Content-Type: application/json" \ -H "X-Tenant-ID: 162" \ -d '{ diff --git a/docs/环境配置说明.md b/docs/环境配置说明.md index 50c1a15..f96025d 100644 --- a/docs/环境配置说明.md +++ b/docs/环境配置说明.md @@ -11,14 +11,36 @@ ## 2. Python 运行环境 -**推荐方式(与项目脚本一致)**:使用项目根目录下的 `activate.sh` 激活环境,会自动加载当前目录下的 `.env`(忽略注释与空行): +本项目历史上使用 Conda 管理环境;目前推荐使用 **venv**(更轻量、对 CI/容器更友好)。项目根目录下的 `activate.sh` 已升级为 **优先激活 `.venv`,并兼容 Conda 回退**,且会自动加载当前目录下的 `.env`(忽略注释与空行)。 + +### 2.1 venv(推荐) + +首次创建 venv: + +```bash +cd /data/saas-search +./scripts/create_venv.sh +source activate.sh +``` + +如需运行本地 embedding / 图像编码服务(torch/transformers 等依赖较重): ```bash cd /data/saas-search +INSTALL_ML=1 ./scripts/create_venv.sh source activate.sh ``` -`activate.sh` 会激活 Conda 环境 `searchengine`。若在新机器上部署,请先设置本机 Conda 路径再执行: +日常使用: + +```bash +cd /data/saas-search +source activate.sh +``` + +### 2.2 Conda(兼容旧流程) + +`activate.sh` 会在未发现 `.venv` 时回退激活 Conda 环境 `searchengine`。若在新机器上部署,请先设置本机 Conda 路径再执行: ```bash # 你的 conda 在 ~/anaconda3/bin/conda,则 CONDA_ROOT=~/anaconda3 @@ -26,7 +48,7 @@ export CONDA_ROOT=$HOME/anaconda3 # 或你的 Conda 安装路径(如 /home/u source activate.sh ``` -**新机器首次部署(创建环境)**:若本机尚未创建 `searchengine` 环境,任选其一: +**新机器首次部署(创建 Conda 环境)**:若本机尚未创建 `searchengine` 环境,任选其一: - **方式 A(推荐,与 environment.yml 一致)**: ```bash @@ -82,7 +104,7 @@ DB_PASSWORD=P89cZHS5d7dFyc9R # Elasticsearch ES_HOST=http://localhost:9200 -ES_USERNAME=essa +ES_USERNAME=saas ES_PASSWORD=4hOaLaf41y2VuI8y # Redis(可选) @@ -105,7 +127,7 @@ API_PORT=6002 | 项目 | 值 | |------|----| | **MySQL** | host `120.79.247.228`, port `3316`, user `saas`, password `P89cZHS5d7dFyc9R` | -| **Elasticsearch** | host `http://localhost:9200`, user `essa`, password `4hOaLaf41y2VuI8y` | +| **Elasticsearch** | host `http://localhost:9200`, user `saas`, password `4hOaLaf41y2VuI8y` | | **Redis(可选)** | host `localhost`, port `6479`, password `BMfv5aI31kgHWtlx` | | **DeepL** | `c9293ab4-ad25-479b-919f-ab4e63b429ed` | diff --git a/frontend/README.md b/frontend/README.md index 85db218..fed4290 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -82,7 +82,7 @@ bash scripts/start_backend.sh ``` 2. **访问前端**: -打开浏览器访问:`http://120.76.41.98:6002/` +打开浏览器访问:`http://43.166.252.75:6002/` ### 搜索示例 @@ -97,7 +97,7 @@ bash scripts/start_backend.sh 前端通过以下接口与后端通信: ```javascript -POST http://120.76.41.98:6002/search/ +POST http://43.166.252.75:6002/search/ 请求体: { diff --git a/indexer/README.md b/indexer/README.md index a173256..03f3f60 100644 --- a/indexer/README.md +++ b/indexer/README.md @@ -198,7 +198,7 @@ categoryPath.set(categoryLang, translationCategoryPath) 你当前要使用的翻译接口(Python 侧): ```bash -curl -X POST http://120.76.41.98:6006/translate \ +curl -X POST http://43.166.252.75:6006/translate \ -H "Content-Type: application/json" \ -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣", "target_lang":"en", diff --git a/indexer/prompts.txt b/indexer/prompts.txt index 24dded0..8003e7d 100644 --- a/indexer/prompts.txt +++ b/indexer/prompts.txt @@ -22,7 +22,7 @@ java索引程序职责: 本模块: 负责 msyql 基础数据 → 索引结构的doc (包括缓存) -翻译接口: curl -X POST http://120.76.41.98:6006/translate -H "Content-Type: application/json" -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣","target_lang":"en","source_lang":"auto"}' +翻译接口: curl -X POST http://43.166.252.75:6006/translate -H "Content-Type: application/json" -d '{"text":"儿童小男孩女孩开学 100 天衬衫短袖 搞笑图案字母印花庆祝上衣","target_lang":"en","source_lang":"auto"}' java的组织doc的逻辑都需要迁移过来。 diff --git a/query/query_parser.py b/query/query_parser.py index 452d1de..e042532 100644 --- a/query/query_parser.py +++ b/query/query_parser.py @@ -8,7 +8,6 @@ from typing import Dict, List, Optional, Any, Union import numpy as np import logging import re -import hanlp from concurrent.futures import Future, ThreadPoolExecutor, as_completed from embeddings import BgeEncoder @@ -19,6 +18,10 @@ from .query_rewriter import QueryRewriter, QueryNormalizer logger = logging.getLogger(__name__) +try: + import hanlp # type: ignore +except Exception: # pragma: no cover + hanlp = None class ParsedQuery: """Container for parsed query results.""" @@ -94,12 +97,22 @@ class QueryParser: self.language_detector = LanguageDetector() self.rewriter = QueryRewriter(config.query_config.rewrite_dictionary) - # Initialize HanLP components at startup - logger.info("Initializing HanLP components...") - self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) - self._tok.config.output_spans = True - self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) - logger.info("HanLP components initialized") + # Optional HanLP components (heavy). If unavailable, fall back to a lightweight tokenizer. + self._tok = None + self._pos_tag = None + if hanlp is not None: + try: + logger.info("Initializing HanLP components...") + self._tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF) + self._tok.config.output_spans = True + self._pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) + logger.info("HanLP components initialized") + except Exception as e: + logger.warning(f"HanLP init failed, falling back to simple tokenizer: {e}") + self._tok = None + self._pos_tag = None + else: + logger.info("HanLP not installed; using simple tokenizer") @property def text_encoder(self) -> BgeEncoder: @@ -121,32 +134,51 @@ class QueryParser: translation_context=self.config.query_config.translation_context ) return self._translator + + def _simple_tokenize(self, text: str) -> List[str]: + """ + Lightweight tokenizer fallback. + + - Groups consecutive CJK chars as a token + - Groups consecutive latin/digits/underscore/dash as a token + """ + if not text: + return [] + pattern = re.compile(r"[\u4e00-\u9fff]+|[A-Za-z0-9_]+(?:-[A-Za-z0-9_]+)*") + return pattern.findall(text) def _extract_keywords(self, query: str) -> str: """Extract keywords (nouns with length > 1) from query.""" - tok_result = self._tok(query) - if not tok_result: - return "" - - words = [x[0] for x in tok_result] - pos_tags = self._pos_tag(words) - - keywords = [] - for word, pos in zip(words, pos_tags): - if len(word) > 1 and pos.startswith('N'): - keywords.append(word) - + if self._tok is not None and self._pos_tag is not None: + tok_result = self._tok(query) + if not tok_result: + return "" + words = [x[0] for x in tok_result] + pos_tags = self._pos_tag(words) + keywords = [] + for word, pos in zip(words, pos_tags): + if len(word) > 1 and isinstance(pos, str) and pos.startswith("N"): + keywords.append(word) + return " ".join(keywords) + + # Fallback: treat tokens with length > 1 as "keywords" + tokens = self._simple_tokenize(query) + keywords = [t for t in tokens if len(t) > 1] return " ".join(keywords) def _get_token_count(self, query: str) -> int: - """Get token count using HanLP.""" - tok_result = self._tok(query) - return len(tok_result) if tok_result else 0 + """Get token count (HanLP if available, otherwise simple).""" + if self._tok is not None: + tok_result = self._tok(query) + return len(tok_result) if tok_result else 0 + return len(self._simple_tokenize(query)) def _get_query_tokens(self, query: str) -> List[str]: - """Get token list using HanLP.""" - tok_result = self._tok(query) - return [x[0] for x in tok_result] if tok_result else [] + """Get token list (HanLP if available, otherwise simple).""" + if self._tok is not None: + tok_result = self._tok(query) + return [x[0] for x in tok_result] if tok_result else [] + return self._simple_tokenize(query) def parse( self, diff --git a/requirements.txt b/requirements.txt index b05cf2a..19b98e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,14 +12,13 @@ pandas>=2.0.0 # Elasticsearch elasticsearch>=8.0.0,<9.0.0 -# ML/Embeddings -torch>=2.0.0 -sentence-transformers>=2.2.0 -transformers>=4.30.0 -modelscope>=1.9.0 -cn-clip>=1.5.0 +# Redis (cache; used by translator/embeddings) +redis>=5.0.0 + +# Math / vector utilities (used across modules) numpy>=1.24.0 -pillow>=10.0.0 + +# LLM/Translation clients openai>=1.0.0 # API diff --git a/requirements_ml.txt b/requirements_ml.txt new file mode 100644 index 0000000..e6e55d3 --- /dev/null +++ b/requirements_ml.txt @@ -0,0 +1,16 @@ +# Optional heavy dependencies for local embedding/image encoding. +# +# Install when you need: +# - `./scripts/start_embedding_service.sh` (local embeddings server) +# - local BGE-M3 / CN-CLIP inference +# +# Notes: +# - `torch` wheels can be very large; if you want CPU-only wheels, +# consider installing torch separately with the official CPU index. +# +torch>=2.0.0 +sentence-transformers>=2.2.0 +transformers>=4.30.0 +modelscope>=1.9.0 +cn-clip>=1.5.0 +pillow>=10.0.0 diff --git a/scripts/create_venv.sh b/scripts/create_venv.sh new file mode 100644 index 0000000..2f3c5d9 --- /dev/null +++ b/scripts/create_venv.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# +# Create and initialize Python venv for saas-search. +# +# Usage: +# ./scripts/create_venv.sh +# +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +cd "${PROJECT_ROOT}" + +VENV_DIR="${PROJECT_ROOT}/.venv" + +PYTHON_BIN="${PYTHON_BIN:-python3.10}" +if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then + PYTHON_BIN="python3" +fi + +echo "Using python: $(${PYTHON_BIN} --version)" + +if ! "${PYTHON_BIN}" -c "import ensurepip" >/dev/null 2>&1; then + echo "ERROR: ensurepip is not available for ${PYTHON_BIN}." >&2 + echo "On Ubuntu/Debian, install the venv package first, e.g.:" >&2 + echo " sudo apt-get update -y && sudo apt-get install -y python3-venv" >&2 + echo "If you are using Python 3.12 specifically, you may need:" >&2 + echo " sudo apt-get install -y python3.12-venv" >&2 + exit 1 +fi + +if [[ -d "${VENV_DIR}" ]]; then + if [[ -f "${VENV_DIR}/bin/activate" ]]; then + echo "venv already exists at ${VENV_DIR}" + else + echo "Found incomplete venv at ${VENV_DIR}, recreating..." + rm -rf "${VENV_DIR}" + "${PYTHON_BIN}" -m venv "${VENV_DIR}" + fi +else + echo "Creating venv at ${VENV_DIR} ..." + "${PYTHON_BIN}" -m venv "${VENV_DIR}" +fi + +# shellcheck disable=SC1091 +source "${VENV_DIR}/bin/activate" + +python -m pip install --upgrade pip setuptools wheel +python -m pip install -r requirements.txt + +if [[ "${INSTALL_ML:-0}" == "1" ]]; then + echo + echo "INSTALL_ML=1 detected. Installing optional ML dependencies..." + python -m pip install -r requirements_ml.txt +fi + +echo +echo "Done." +echo "Next:" +echo " source activate.sh" diff --git a/scripts/mock_data.sh b/scripts/mock_data.sh index 401c85f..c50bcaa 100755 --- a/scripts/mock_data.sh +++ b/scripts/mock_data.sh @@ -20,9 +20,7 @@ # ============================================================================ cd "$(dirname "$0")/.." -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +source ./activate.sh GREEN='\033[0;32m' YELLOW='\033[1;33m' @@ -33,13 +31,6 @@ echo -e "${GREEN}========================================${NC}" echo -e "${GREEN}Mock Data Script${NC}" echo -e "${GREEN}========================================${NC}" -# Load config from .env file if it exists -if [ -f .env ]; then - set -a - source .env - set +a -fi - # ============================================================================ # 写死的配置参数(不需要配置化,这是测试数据构造脚本) # ============================================================================ diff --git a/scripts/start_backend.sh b/scripts/start_backend.sh index 919e6da..49c2d27 100755 --- a/scripts/start_backend.sh +++ b/scripts/start_backend.sh @@ -5,9 +5,7 @@ set -e cd "$(dirname "$0")/.." -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +source ./activate.sh GREEN='\033[0;32m' YELLOW='\033[1;33m' @@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}" echo -e "${GREEN}Starting Backend API Service${NC}" echo -e "${GREEN}========================================${NC}" -# Load config from .env file if it exists -if [ -f .env ]; then - set -a - source .env - set +a -fi - echo -e "\n${YELLOW}Configuration:${NC}" echo " API Host: ${API_HOST:-0.0.0.0}" echo " API Port: ${API_PORT:-6002}" diff --git a/scripts/start_embedding_service.sh b/scripts/start_embedding_service.sh index 0a678d6..ff30a19 100755 --- a/scripts/start_embedding_service.sh +++ b/scripts/start_embedding_service.sh @@ -12,12 +12,7 @@ set -e cd "$(dirname "$0")/.." -# Load conda env if available (keep consistent with other scripts) -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -if [ -f "$CONDA_ROOT/etc/profile.d/conda.sh" ]; then - source "$CONDA_ROOT/etc/profile.d/conda.sh" - conda activate searchengine -fi +source ./activate.sh EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)") EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)") diff --git a/scripts/start_frontend.sh b/scripts/start_frontend.sh index 837a7ba..a5ce8c0 100755 --- a/scripts/start_frontend.sh +++ b/scripts/start_frontend.sh @@ -5,9 +5,7 @@ set -e cd "$(dirname "$0")/.." -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +source ./activate.sh GREEN='\033[0;32m' YELLOW='\033[1;33m' diff --git a/scripts/start_indexer.sh b/scripts/start_indexer.sh index 3a03bdb..8924ae8 100755 --- a/scripts/start_indexer.sh +++ b/scripts/start_indexer.sh @@ -5,9 +5,7 @@ set -e cd "$(dirname "$0")/.." -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +source ./activate.sh GREEN='\033[0;32m' YELLOW='\033[1;33m' @@ -17,13 +15,6 @@ echo -e "${GREEN}========================================${NC}" echo -e "${GREEN}Starting Indexer API Service${NC}" echo -e "${GREEN}========================================${NC}" -# Load config from .env file if it exists -if [ -f .env ]; then - set -a - source .env - set +a -fi - echo -e "\n${YELLOW}Configuration:${NC}" echo " INDEXER Host: ${INDEXER_HOST:-0.0.0.0}" echo " INDEXER Port: ${INDEXER_PORT:-6004}" diff --git a/scripts/tenant3__csv_to_shoplazza_xlsx.sh b/scripts/tenant3__csv_to_shoplazza_xlsx.sh index c3b042d..c0c85e8 100755 --- a/scripts/tenant3__csv_to_shoplazza_xlsx.sh +++ b/scripts/tenant3__csv_to_shoplazza_xlsx.sh @@ -1,7 +1,8 @@ -# 激活环境 -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" -conda activate searchengine +#!/bin/bash +set -e + +cd "$(dirname "$0")/.." +source ./activate.sh # # 基本使用(生成所有数据) # python scripts/csv_to_excel.py diff --git a/setup.sh b/setup.sh index 28e0d2c..6f26507 100755 --- a/setup.sh +++ b/setup.sh @@ -1,8 +1,5 @@ #!/bin/bash -CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" -source "$CONDA_ROOT/etc/profile.d/conda.sh" - # saas-search Setup and Startup Script # This script sets up the environment and starts all services @@ -22,24 +19,16 @@ echo -e "${GREEN}========================================${NC}" cd "$(dirname "$0")" PROJECT_ROOT=$(pwd) -echo -e "\n${YELLOW}Step 1: Setting up Conda environment${NC}" -# Check if conda is available -if ! command -v conda &> /dev/null; then - echo -e "${RED}Error: conda not found. Please install Miniconda or Anaconda${NC}" - exit 1 -fi +echo -e "\n${YELLOW}Step 1: Setting up Python environment (venv preferred)${NC}" -# Check if environment exists -if conda env list | grep -q "searchengine"; then - echo -e "${GREEN}Environment 'searchengine' already exists${NC}" - conda activate searchengine -else - echo -e "${YELLOW}Creating conda environment 'searchengine'...${NC}" - conda env create -f environment.yml - conda activate searchengine - echo -e "${GREEN}Environment created successfully!${NC}" +if [ ! -f "${PROJECT_ROOT}/.venv/bin/activate" ]; then + echo -e "${YELLOW}Creating venv and installing dependencies...${NC}" + ./scripts/create_venv.sh fi +# Activate environment + load .env +source ./activate.sh + # Verify environment echo -e "\n${YELLOW}Current Python version:${NC}" python --version @@ -74,7 +63,7 @@ echo -e "${GREEN}Setup Complete!${NC}" echo -e "${GREEN}========================================${NC}" echo "" echo -e "Next steps:" -echo -e " 1. Ingest data: ${YELLOW}./scripts/ingest.sh${NC}" -echo -e " 2. Start backend: ${YELLOW}./scripts/start_backend.sh${NC}" +echo -e " 1. Start backend: ${YELLOW}./scripts/start_backend.sh${NC}" +echo -e " 2. Start indexer: ${YELLOW}./scripts/start_indexer.sh${NC}" echo -e " 3. Start frontend: ${YELLOW}./scripts/start_frontend.sh${NC}" echo "" -- libgit2 0.21.2