#!/usr/bin/env bash
#
# Convenience script to rebuild suggestion index for a tenant.
#
# Usage:
#   # full rebuild + alias publish (default)
#   ./scripts/build_suggestions.sh <tenant_id> --mode full --days 30
#
#   # incremental update from watermark
#   ./scripts/build_suggestions.sh <tenant_id> --mode incremental
#
#   # full rebuild + incremental + ES/API smoke checks (same as legacy rebuild_suggestions.sh)
#   ./scripts/build_suggestions.sh <tenant_id> --rebuild-and-verify
#

# Strict mode: abort on failing commands, unset variables and failed pipeline stages.
set -euo pipefail

# The tenant id is mandatory. When it is missing, print the usage text to
# stderr (it is a diagnostic, not program output) and exit non-zero.
if [ $# -lt 1 ]; then
  {
    echo "Usage: $0 <tenant_id> [--rebuild-and-verify | extra args for main.py build-suggestions...]"
    echo "Example (full): $0 162 --mode full --days 30 --publish-alias"
    echo "Example (incremental): $0 162 --mode incremental --overlap-minutes 30"
    echo "Example (pipeline + smoke): $0 162 --rebuild-and-verify"
  } >&2
  exit 1
fi

# The first positional argument is the tenant id. Every remaining argument is
# either the --rebuild-and-verify switch or is forwarded verbatim to
# `main.py build-suggestions`.
TENANT_ID="$1"
shift || true

REBUILD_VERIFY=false
PASSTHROUGH_ARGS=()
for arg in "$@"; do
  case "$arg" in
    --rebuild-and-verify)
      REBUILD_VERIFY=true
      ;;
    *)
      PASSTHROUGH_ARGS+=("$arg")
      ;;
  esac
done

# The pipeline mode runs with its own fixed arguments, so mixing it with
# forwarded arguments is rejected. The message goes to stderr.
if [ "$REBUILD_VERIFY" = true ] && [ "${#PASSTHROUGH_ARGS[@]}" -gt 0 ]; then
  echo "Error: --rebuild-and-verify cannot be combined with other build-suggestions arguments." >&2
  exit 1
fi

# Resolve the directory one level above this script and run from there, so the
# relative paths used later (main.py, .env) resolve against it.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

cd "$ROOT_DIR"

# Interpreter selection: $PYTHON_BIN if set, else .venv/bin/python under
# ROOT_DIR. `command -v` accepts both an executable path and a bare command
# name found on PATH (e.g. PYTHON_BIN=python3.11); a plain `-x` file test would
# reject the latter. Fall back to python3 when the candidate cannot be run.
PY_BIN="${PYTHON_BIN:-$ROOT_DIR/.venv/bin/python}"
if ! command -v "$PY_BIN" >/dev/null 2>&1; then
  PY_BIN="python3"
fi

# --rebuild-and-verify pipeline: full rebuild, incremental update, then
# Elasticsearch and API smoke checks. Always ends with `exit 0`, so the
# passthrough invocation at the bottom of the script is never reached.
if [ "$REBUILD_VERIFY" = true ]; then
  # Fixed smoke-test queries and languages (no CLI args).
  SAMPLE_QUERIES=(s sh dress tshirt)
  SAMPLE_LANGS=(en zh)
  API_BASE="${API_BASE_URL:-http://localhost:6002}"

  # ES_HOST from the environment wins; otherwise read it from .env through the
  # Python dotenv module, defaulting to http://localhost:9200.
  if [ -z "${ES_HOST:-}" ]; then
    ES_HOST="$("$PY_BIN" - <<'PY'
from dotenv import dotenv_values
print(dotenv_values('.env').get('ES_HOST') or 'http://localhost:9200')
PY
)"
  fi

  # Fill in whichever of ES_USERNAME / ES_PASSWORD is missing from .env.
  # The helper prints the username on line 1 and the password on line 2.
  if [ -z "${ES_USERNAME:-}" ] || [ -z "${ES_PASSWORD:-}" ]; then
    # Check the helper's exit status explicitly: a failing helper would
    # otherwise leave the array empty and surface only as an obscure
    # "unbound variable" error under `set -u`.
    if ! _ES_CREDS_RAW="$("$PY_BIN" - <<'PY'
from dotenv import dotenv_values
cfg = dotenv_values('.env')
print(cfg.get('ES_USERNAME') or '')
print(cfg.get('ES_PASSWORD') or '')
PY
)"; then
      echo "Error: failed to read ES credentials from .env" >&2
      exit 1
    fi
    readarray -t _ES_CREDS <<<"$_ES_CREDS_RAW"
    # Command substitution strips trailing newlines, so an empty password can
    # leave element 1 unset; `:-` keeps that case safe under `set -u`.
    ES_USERNAME="${ES_USERNAME:-${_ES_CREDS[0]:-}}"
    ES_PASSWORD="${ES_PASSWORD:-${_ES_CREDS[1]:-}}"
  fi

  # Use HTTP basic auth for the ES requests only when both parts are present.
  if [ -n "${ES_USERNAME:-}" ] && [ -n "${ES_PASSWORD:-}" ]; then
    AUTH=(-u "${ES_USERNAME}:${ES_PASSWORD}")
  else
    AUTH=()
  fi

  ALIAS_NAME="${ES_INDEX_NAMESPACE:-}search_suggestions_tenant_${TENANT_ID}_current"

  echo "[1/4] Full rebuild tenant=${TENANT_ID} (versioned + alias publish)"
  "$PY_BIN" main.py build-suggestions \
    --tenant-id "$TENANT_ID" \
    --es-host "$ES_HOST" \
    --mode full \
    --days 365 \
    --batch-size 500 \
    --publish-alias \
    --keep-versions 2

  echo "[2/4] Incremental update tenant=${TENANT_ID}"
  "$PY_BIN" main.py build-suggestions \
    --tenant-id "$TENANT_ID" \
    --es-host "$ES_HOST" \
    --mode incremental \
    --overlap-minutes 30

  # NOTE: curl runs without --fail, so an HTTP error status is printed but does
  # not abort the script; only transport-level failures do (via `set -e`).
  # ${AUTH[@]+"${AUTH[@]}"} expands to nothing for an empty array without
  # tripping `set -u` on bash releases older than 4.4.
  echo "[3/4] ES count + sample"
  curl -sS ${AUTH[@]+"${AUTH[@]}"} "$ES_HOST/$ALIAS_NAME/_count?pretty"
  echo
  curl -sS ${AUTH[@]+"${AUTH[@]}"} "$ES_HOST/$ALIAS_NAME/_search?pretty" -H 'Content-Type: application/json' -d '{
  "size": 5,
  "query": {"match_all": {}},
  "_source": ["lang", "text", "rank_score", "sources", "query_count_30d"]
}'
  echo

  echo "[4/4] API smoke test"
  for lang in "${SAMPLE_LANGS[@]}"; do
    for q in "${SAMPLE_QUERIES[@]}"; do
      echo "--- GET /search/suggestions?q=${q}&language=${lang} ---"
      curl -sS "$API_BASE/search/suggestions?q=${q}&size=10&language=${lang}" -H "X-Tenant-ID: ${TENANT_ID}"
      echo
    done
  done
  exit 0
fi

# Default path: run build-suggestions for the tenant, forwarding any extra CLI
# arguments unchanged. The ${arr[@]+"${arr[@]}"} form expands to nothing when
# no extra arguments were given, which avoids an "unbound variable" abort under
# `set -u` on bash releases older than 4.4.
"$PY_BIN" main.py build-suggestions \
  --tenant-id "$TENANT_ID" \
  ${PASSTHROUGH_ARGS[@]+"${PASSTHROUGH_ARGS[@]}"}