#!/bin/bash
# Xinference REST API 调用示例
# 演示如何通过 HTTP API 调用 Qwen3-Embedding 和 Qwen3-Reranker
#
# Usage: ./api_examples.sh   (requires a running Xinference server)

# -u: error on unset vars; pipefail: pipelines fail if any stage fails.
# Deliberately NOT -e: individual curl probes are allowed to fail (the
# health checks below rely on `&& ... || ...`).
set -uo pipefail

# 设置服务地址与模型 UID
readonly XINFERENCE_HOST="http://localhost:9997"
readonly MODEL_EMBEDDING="qwen3-embedding"
readonly MODEL_RERANKER="qwen3-reranker"

echo "========================================="
echo " Xinference REST API 调用示例"
echo "========================================="
echo ""

# 颜色定义
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
# shellcheck disable=SC2034 — kept for parity with sibling scripts
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Print a colored section banner.
# Arguments: $1 - section title
print_section() {
    echo ""
    echo -e "${BLUE}=========================================${NC}"
    echo -e "${BLUE} $1${NC}"
    echo -e "${BLUE}=========================================${NC}"
    echo ""
}

# Print an informational line with a green arrow prefix.
# Arguments: $1 - message
print_info() {
    echo -e "${GREEN}➜${NC} $1"
}

# ============================================
# 1. 查看服务状态
# ============================================
print_section "1. 查看服务状态和已部署模型"

print_info "查看所有已部署的模型:"
curl -s "${XINFERENCE_HOST}/v1/models" | python3 -m json.tool

echo ""
print_info "查看服务健康状态:"
curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务健康" || echo "❌ 服务异常"

# ============================================
# 2. Embedding API 调用
# ============================================
print_section "2. Qwen3-Embedding API 调用"

print_info "单个文本 embedding 生成:"
curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_EMBEDDING}"'",
        "input": ["适合老人用的智能手机大屏幕长续航"]
    }' | python3 -m json.tool

echo ""
print_info "批量文本 embedding 生成:"
curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_EMBEDDING}"'",
        "input": [
            "红米Note12 5000mAh大电量",
            "华为畅享60 6000mAh超长续航",
            "小米手环8 智能运动监测"
        ]
    }' | python3 -m json.tool

# ============================================
# 3. Reranker API 调用
# ============================================
print_section "3. Qwen3-Reranker API 调用"

print_info "精排候选商品:"
curl -X POST "${XINFERENCE_HOST}/v1/rerank" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_RERANKER}"'",
        "query": "适合老人用的智能手机大屏幕长续航",
        "documents": [
            "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
            "iPhone 15 Pro Max 专业摄影旗舰",
            "华为畅享60 6000mAh超长续航 护眼大屏",
            "OPPO A1 5000mAh电池 简易模式适合长辈",
            "小米手环8 智能运动监测"
        ],
        "top_n": 5
    }' | python3 -m json.tool

# ============================================
# 4. 电商搜索实战:两阶段检索
# ============================================
print_section "4. 电商搜索实战:完整两阶段检索流程"

# 阶段1: 密集检索
print_info "阶段1: 为用户 query 生成向量"
echo ""
echo "Query: 适合老人用的智能手机大屏幕长续航"
# BUGFIX: the original one-liner had an extra closing paren after
# ['embedding'], which raised a Python SyntaxError and left the
# variable empty.
QUERY_VECTOR=$(curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_EMBEDDING}"'",
        "input": ["适合老人用的智能手机大屏幕长续航"]
    }' | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin)['data'][0]['embedding']))")

echo "Query 向量维度: $(echo "$QUERY_VECTOR" | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")"

echo ""
print_info "为候选商品生成向量(在实际应用中,这些向量应预计算并存储)"
# 为简化演示,这里只显示部分候选商品的向量生成
CANDIDATES=(
    "红米Note12 5000mAh大电量"
    "华为畅享60 6000mAh超长续航"
    "小米手环8"
)

for candidate in "${CANDIDATES[@]}"; do
    echo ""
    echo "商品: $candidate"
    # BUGFIX: ${candidate} was expanded unquoted inside the -d payload;
    # names containing spaces word-split the JSON into multiple curl
    # arguments. Quoting the expansion keeps the body a single argument.
    curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
        -H "Content-Type: application/json" \
        -d '{
            "model": "'"${MODEL_EMBEDDING}"'",
            "input": ["'"${candidate}"'"]
        }' | python3 -c "import sys, json; data=json.load(sys.stdin); print(f\"  向量维度: {len(data['data'][0]['embedding'])}\")"
done

# 阶段2: 精排
echo ""
print_info "阶段2: 使用 Reranker 对召回结果进行精排"
curl -X POST "${XINFERENCE_HOST}/v1/rerank" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_RERANKER}"'",
        "query": "适合老人用的智能手机大屏幕长续航",
        "documents": [
            "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
            "华为畅享60 6000mAh超长续航 护眼大屏",
            "OPPO A1 5000mAh电池 简易模式适合长辈",
            "iPhone 15 Pro Max 专业摄影旗舰",
            "小米手环8 智能运动监测"
        ],
        "top_n": 3
    }' | python3 -m json.tool

# ============================================
# 5. 高级用法:批量处理
# ============================================
print_section "5. 批量 Embedding 生成(离线任务)"

print_info "为大量商品生成 embedding(模拟离线任务)"
echo "注意: 实际生产环境中,批量大小建议为 100-1000"

# 创建批量输入文件
# NOTE(review): the original here-doc was corrupted in the source file
# (the '<<EOF' operator and the head of the request body were lost).
# The JSON below is reconstructed from the product list used elsewhere
# in this script — confirm against the original intent.
cat > /tmp/batch_input.json <<EOF
{
    "model": "${MODEL_EMBEDDING}",
    "input": [
        "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
        "华为畅享60 6000mAh超长续航 护眼大屏",
        "OPPO A1 5000mAh电池 简易模式适合长辈",
        "iPhone 15 Pro Max 专业摄影旗舰",
        "小米手环8 智能运动监测"
    ]
}
EOF

curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d @/tmp/batch_input.json | python3 -c "import sys, json; data=json.load(sys.stdin); print(f'✅ 成功生成 {len(data[\"data\"])} 个 embedding')"

# ============================================
# 6. Python 客户端代码示例
# ============================================
print_section "6. Python 客户端代码参考"

print_info "以下 Python 代码演示如何封装 API 调用:"
# Quoted delimiter: print the example verbatim, no shell expansion.
cat <<'EOF'
import requests
import numpy as np

XINFERENCE_HOST = "http://localhost:9997"

# 单个 Embedding
def get_embedding(text: str) -> list:
    """获取文本的 embedding 向量"""
    response = requests.post(
        f"{XINFERENCE_HOST}/v1/embeddings",
        json={
            "model": "qwen3-embedding",
            "input": [text]
        }
    )
    return response.json()["data"][0]["embedding"]

# 批量 Embedding
def get_embeddings(texts: list) -> list:
    """批量获取 embedding 向量"""
    response = requests.post(
        f"{XINFERENCE_HOST}/v1/embeddings",
        json={
            "model": "qwen3-embedding",
            "input": texts
        }
    )
    return [item["embedding"] for item in response.json()["data"]]

# Reranker 调用
def rerank(query: str, documents: list, top_n: int = 10) -> list:
    """使用 reranker 对文档排序"""
    response = requests.post(
        f"{XINFERENCE_HOST}/v1/rerank",
        json={
            "model": "qwen3-reranker",
            "query": query,
            "documents": documents,
            "top_n": top_n
        }
    )
    return response.json()["results"]

# 计算余弦相似度
def cosine_similarity(vec1: list, vec2: list) -> float:
    """计算两个向量的余弦相似度"""
    v1 = np.array(vec1)
    v2 = np.array(vec2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

# 完整搜索流程
def search(query: str, products: list) -> list:
    """两阶段搜索"""
    # 阶段1: 密集检索(简化示例)
    query_vec = get_embedding(query)
    similarities = []
    for product in products:
        prod_vec = get_embedding(product)
        sim = cosine_similarity(query_vec, prod_vec)
        similarities.append((product, sim))
    # 取 Top-200
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_200 = [p for p, s in similarities[:200]]
    # 阶段2: 精排
    reranked = rerank(query, top_200, top_n=10)
    return reranked

# 使用示例
if __name__ == "__main__":
    query = "适合老人用的智能手机大屏幕长续航"
    products = [
        "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
        "华为畅享60 6000mAh超长续航 护眼大屏",
        "小米手环8 智能运动监测"
    ]
    results = search(query, products)
    for r in results:
        print(f"[{r['relevance_score']:.4f}] {r['document']}")
EOF

# ============================================
# 7. 性能测试
# ============================================
print_section "7. 性能测试"

print_info "测试 Embedding API 响应时间:"
echo ""
for i in {1..5}; do
    START=$(date +%s%N)   # nanoseconds (GNU date)
    curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
        -H "Content-Type: application/json" \
        -d '{
            "model": "'"${MODEL_EMBEDDING}"'",
            "input": ["测试文本"]
        }' > /dev/null
    END=$(date +%s%N)
    ELAPSED=$(( (END - START) / 1000000 ))  # ns -> ms
    echo "  请求 $i: ${ELAPSED}ms"
done

echo ""
print_info "测试 Reranker API 响应时间:"
for i in {1..5}; do
    START=$(date +%s%N)
    curl -s -X POST "${XINFERENCE_HOST}/v1/rerank" \
        -H "Content-Type: application/json" \
        -d '{
            "model": "'"${MODEL_RERANKER}"'",
            "query": "测试查询",
            "documents": ["文档1", "文档2", "文档3"],
            "top_n": 3
        }' > /dev/null
    END=$(date +%s%N)
    ELAPSED=$(( (END - START) / 1000000 ))
    echo "  请求 $i: ${ELAPSED}ms"
done

# ============================================
# 8. 常见问题排查
# ============================================
print_section "8. 常见问题排查"

print_info "检查服务是否运行:"
curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null && echo "✅ 服务正常" || echo "❌ 服务未启动,请运行: ./start.sh"

echo ""
print_info "检查模型是否部署:"
MODELS=$(curl -s "${XINFERENCE_HOST}/v1/models")
echo "$MODELS" | python3 -c "
import sys, json
try:
    payload = json.load(sys.stdin)
    # NOTE(review): depending on the Xinference version, /v1/models may
    # return either a bare list or an OpenAI-style {'data': [...]}
    # envelope — handle both; verify against the deployed server.
    models = payload.get('data', payload) if isinstance(payload, dict) else payload
    if models:
        print('✅ 已部署模型:')
        for m in models:
            print(f' - {m.get(\"model_type\")}: {m.get(\"model_uid\")}')
    else:
        print('❌ 没有已部署的模型,请运行: python deploy_models.py')
except Exception:
    print('❌ 无法获取模型信息')
"

echo ""
print_info "查看 GPU 使用情况:"
if command -v nvidia-smi &> /dev/null; then
    nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader | while IFS= read -r line; do
        echo "  GPU $line"
    done
else
    echo "  ⚠️ nvidia-smi 未安装,无法查看 GPU 信息"
fi

echo ""
echo "========================================="
echo " ✅ API 调用示例演示完成"
echo "========================================="