Blame view

scripts/build_suggestions.sh 3.55 KB
f251cf2d   tangwang   suggestion全量索引程序跑通
1
2
3
4
5
  #!/usr/bin/env bash
  #
  # Convenience script to rebuild suggestion index for a tenant.
  #
  # Usage:
ff9efda0   tangwang   suggest
6
7
8
9
10
  #   # full rebuild + alias publish (default)
  #   ./scripts/build_suggestions.sh <tenant_id> --mode full --days 30
  #
  #   # incremental update from watermark
  #   ./scripts/build_suggestions.sh <tenant_id> --mode incremental
f251cf2d   tangwang   suggestion全量索引程序跑通
11
  #
e50924ed   tangwang   1. tags -> enrich...
12
13
14
  #   # full rebuild + incremental + ES/API smoke checks (same as legacy rebuild_suggestions.sh)
  #   ./scripts/build_suggestions.sh <tenant_id> --rebuild-and-verify
  #
f251cf2d   tangwang   suggestion全量索引程序跑通
15
16
17
18
  
  # Strict mode: abort on command failure, on use of an unset variable, and when
  # any stage of a pipeline fails.
  set -euo pipefail

  # The tenant id is mandatory. Without it, print usage to stderr (this is the
  # failure path, so stdout stays clean) and exit non-zero.
  if [ $# -lt 1 ]; then
    {
      echo "Usage: $0 <tenant_id> [--rebuild-and-verify | extra args for main.py build-suggestions...]"
      echo "Example (full): $0 162 --mode full --days 30 --publish-alias"
      echo "Example (incremental): $0 162 --mode incremental --overlap-minutes 30"
      echo "Example (pipeline + smoke): $0 162 --rebuild-and-verify"
    } >&2
    exit 1
  fi

  # First positional argument is the tenant id; the remaining ones stay in "$@".
  TENANT_ID="$1"
  # The guard above guarantees at least one argument, so shift cannot fail here.
  shift
  
e50924ed   tangwang   1. tags -> enrich...
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  # Split the remaining arguments: --rebuild-and-verify selects the fixed
  # rebuild + smoke-check pipeline; every other argument is collected, in order,
  # so it can be forwarded to `main.py build-suggestions`.
  REBUILD_VERIFY=false
  PASSTHROUGH_ARGS=()
  for arg in "$@"; do
    case "$arg" in
      --rebuild-and-verify) REBUILD_VERIFY=true ;;
      *) PASSTHROUGH_ARGS+=("$arg") ;;
    esac
  done

  # The pipeline mode runs with fixed build options, so extra arguments are
  # rejected instead of being silently dropped. Diagnostics go to stderr.
  if [ "$REBUILD_VERIFY" = true ] && [ "${#PASSTHROUGH_ARGS[@]}" -gt 0 ]; then
    echo "Error: --rebuild-and-verify cannot be combined with other build-suggestions arguments." >&2
    exit 1
  fi
  
f251cf2d   tangwang   suggestion全量索引程序跑通
44
45
46
47
  # Resolve the parent of the directory containing this script to an absolute
  # path, then make it the working directory for everything that follows.
  _script_dir="$(dirname "${BASH_SOURCE[0]}")"
  ROOT_DIR="$(cd "${_script_dir}/.." && pwd)"

  cd "$ROOT_DIR"
  
ff9efda0   tangwang   suggest
48
49
50
51
52
  # Print the interpreter to use: the candidate when it resolves to something
  # runnable (an executable path, or a command name found on PATH), otherwise
  # the literal "python3".
  # Arguments: $1 - candidate interpreter path or command name
  # Outputs:   the chosen interpreter on stdout
  _resolve_python_bin() {
    if command -v "$1" >/dev/null 2>&1; then
      printf '%s\n' "$1"
    else
      printf '%s\n' "python3"
    fi
  }

  # PYTHON_BIN overrides the interpreter under $ROOT_DIR/.venv. `command -v` is
  # used instead of `[ -x ]` so that a bare command name such as
  # PYTHON_BIN=python3.11 is honoured rather than silently replaced by python3.
  PY_BIN="$(_resolve_python_bin "${PYTHON_BIN:-$ROOT_DIR/.venv/bin/python}")"
  
e50924ed   tangwang   1. tags -> enrich...
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
  # --rebuild-and-verify pipeline: full rebuild, incremental update, then ES and
  # API smoke checks. Optional environment overrides read below: API_BASE_URL,
  # ES_HOST, ES_USERNAME, ES_PASSWORD, ES_INDEX_NAMESPACE.
  if [ "$REBUILD_VERIFY" = true ]; then
    # Fixed smoke-test queries and languages (no CLI args).
    SAMPLE_QUERIES=(s sh dress tshirt)
    SAMPLE_LANGS=(en zh)
    API_BASE="${API_BASE_URL:-http://localhost:6002}"

    # ES_HOST: environment first, then the .env file in the current directory,
    # then http://localhost:9200.
    if [ -z "${ES_HOST:-}" ]; then
      ES_HOST="$("$PY_BIN" - <<'PY'
  from dotenv import dotenv_values
  print(dotenv_values('.env').get('ES_HOST') or 'http://localhost:9200')
  PY
  )"
    fi

    # Credentials: values already in the environment win; anything missing is
    # filled from .env. The helper prints the username and the password on two
    # separate lines (either may be empty).
    if [ -z "${ES_USERNAME:-}" ] || [ -z "${ES_PASSWORD:-}" ]; then
      readarray -t _ES_CREDS < <("$PY_BIN" - <<'PY'
  from dotenv import dotenv_values
  cfg = dotenv_values('.env')
  print(cfg.get('ES_USERNAME') or '')
  print(cfg.get('ES_PASSWORD') or '')
  PY
  )
      # A process substitution does not propagate the helper's exit status, so a
      # failed lookup only shows up as missing output lines. Report that
      # explicitly instead of tripping `set -u` on an unset array element.
      if [ "${#_ES_CREDS[@]}" -lt 2 ]; then
        echo "Error: could not read ES_USERNAME/ES_PASSWORD from .env" >&2
        exit 1
      fi
      ES_USERNAME="${ES_USERNAME:-${_ES_CREDS[0]}}"
      ES_PASSWORD="${ES_PASSWORD:-${_ES_CREDS[1]}}"
    fi

    # Send basic auth only when both parts are present.
    # NOTE(review): the credentials are passed on curl's command line.
    if [ -n "${ES_USERNAME:-}" ] && [ -n "${ES_PASSWORD:-}" ]; then
      AUTH=(-u "${ES_USERNAME}:${ES_PASSWORD}")
    else
      AUTH=()
    fi

    ALIAS_NAME="${ES_INDEX_NAMESPACE:-}search_suggestions_tenant_${TENANT_ID}_current"

    echo "[1/4] Full rebuild tenant=${TENANT_ID} (versioned + alias publish)"
    "$PY_BIN" main.py build-suggestions \
      --tenant-id "$TENANT_ID" \
      --es-host "$ES_HOST" \
      --mode full \
      --days 365 \
      --batch-size 500 \
      --publish-alias \
      --keep-versions 2

    echo "[2/4] Incremental update tenant=${TENANT_ID}"
    "$PY_BIN" main.py build-suggestions \
      --tenant-id "$TENANT_ID" \
      --es-host "$ES_HOST" \
      --mode incremental \
      --overlap-minutes 30

    # The ${AUTH[@]+...} form expands to nothing for an empty array without
    # raising "unbound variable" under `set -u` on bash older than 4.4.
    # NOTE(review): curl runs without --fail, so an HTTP error status does not
    # abort the script; steps 3 and 4 only print the responses for inspection.
    echo "[3/4] ES count + sample"
    curl -sS ${AUTH[@]+"${AUTH[@]}"} "$ES_HOST/$ALIAS_NAME/_count?pretty"
    echo
    curl -sS ${AUTH[@]+"${AUTH[@]}"} "$ES_HOST/$ALIAS_NAME/_search?pretty" -H 'Content-Type: application/json' -d '{
    "size": 5,
    "query": {"match_all": {}},
    "_source": ["lang", "text", "rank_score", "sources", "query_count_30d"]
  }'
    echo

    echo "[4/4] API smoke test"
    for lang in "${SAMPLE_LANGS[@]}"; do
      for q in "${SAMPLE_QUERIES[@]}"; do
        echo "--- GET /search/suggestions?q=${q}&language=${lang} ---"
        curl -sS "$API_BASE/search/suggestions?q=${q}&size=10&language=${lang}" -H "X-Tenant-ID: ${TENANT_ID}"
        echo
      done
    done
    exit 0
  fi
  
ff9efda0   tangwang   suggest
125
  "$PY_BIN" main.py build-suggestions \
f251cf2d   tangwang   suggestion全量索引程序跑通
126
    --tenant-id "$TENANT_ID" \
e50924ed   tangwang   1. tags -> enrich...
127
    "${PASSTHROUGH_ARGS[@]}"