b401ef94
tangwang
third-party/xinfe...
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
|
#!/bin/bash
# Xinference REST API usage examples.
# Demonstrates calling Qwen3-Embedding and Qwen3-Reranker over plain HTTP.

# Service endpoint and the UIDs of the deployed models.
XINFERENCE_HOST="http://localhost:9997"
MODEL_EMBEDDING="qwen3-embedding"
MODEL_RERANKER="qwen3-reranker"

printf '%s\n' "========================================="
printf '%s\n' " Xinference REST API 调用示例"
printf '%s\n' "========================================="
printf '%s\n' ""

# ANSI escape sequences used by the print helpers below.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Print a blue section banner: blank line, bar, " <title>", bar, blank line.
# $1 - section title
print_section() {
  local title="$1"
  local bar="${BLUE}=========================================${NC}"
  echo ""
  echo -e "$bar"
  echo -e "${BLUE} ${title}${NC}"
  echo -e "$bar"
  echo ""
}
# Print one informational line prefixed with a green arrow.
# $1 - message text
print_info() {
  # %b interprets the \033 escapes in GREEN/NC, matching `echo -e`.
  printf '%b\n' "${GREEN}➜${NC} $1"
}
# ============================================
# 1. Service status
# ============================================
print_section "1. 查看服务状态和已部署模型"

print_info "查看所有已部署的模型:"
curl -s "${XINFERENCE_HOST}/v1/models" | python3 -m json.tool
echo ""

print_info "查看服务健康状态:"
# A successful GET on /v1/models is used as the health probe.
if curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null; then
  echo "✅ 服务健康"
else
  echo "❌ 服务异常"
fi
# ============================================
# 2. Embedding API calls
# ============================================
print_section "2. Qwen3-Embedding API 调用"

print_info "单个文本 embedding 生成:"
# FIX: the model UID spliced into the JSON body is double-quoted
# ('"${MODEL_EMBEDDING}"') so the -d argument stays one word even if the
# variable ever contains spaces or glob characters (SC2086).
curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL_EMBEDDING}"'",
    "input": ["适合老人用的智能手机大屏幕长续航"]
  }' | python3 -m json.tool
echo ""

print_info "批量文本 embedding 生成:"
curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL_EMBEDDING}"'",
    "input": [
      "红米Note12 5000mAh大电量",
      "华为畅享60 6000mAh超长续航",
      "小米手环8 智能运动监测"
    ]
  }' | python3 -m json.tool
# ============================================
# 3. Reranker API calls
# ============================================
print_section "3. Qwen3-Reranker API 调用"

print_info "精排候选商品:"
# FIX: double-quote the spliced model UID so the JSON body survives
# word-splitting (SC2086).
curl -X POST "${XINFERENCE_HOST}/v1/rerank" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL_RERANKER}"'",
    "query": "适合老人用的智能手机大屏幕长续航",
    "documents": [
      "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
      "iPhone 15 Pro Max 专业摄影旗舰",
      "华为畅享60 6000mAh超长续航 护眼大屏",
      "OPPO A1 5000mAh电池 简易模式适合长辈",
      "小米手环8 智能运动监测"
    ],
    "top_n": 5
  }' | python3 -m json.tool
# ============================================
# 4. E-commerce search: two-stage retrieval (stage 1: dense recall)
# ============================================
print_section "4. 电商搜索实战:完整两阶段检索流程"

# Stage 1: embed the user query.
print_info "阶段1: 为用户 query 生成向量"
echo ""
echo "Query: 适合老人用的智能手机大屏幕长续航"
# BUG FIX: the python3 -c one-liner had one closing parenthesis too many
# (")))" after 'embedding']"), which was a SyntaxError at runtime and left
# QUERY_VECTOR empty.
QUERY_VECTOR=$(curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL_EMBEDDING}"'",
    "input": ["适合老人用的智能手机大屏幕长续航"]
  }' | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin)['data'][0]['embedding']))")

# Quote the expansion: the vector JSON contains spaces after each comma.
echo "Query 向量维度: $(echo "$QUERY_VECTOR" | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")"
echo ""

print_info "为候选商品生成向量(在实际应用中,这些向量应预计算并存储)"
# For brevity, only a few candidates are embedded in this demo.
CANDIDATES=(
  "红米Note12 5000mAh大电量"
  "华为畅享60 6000mAh超长续航"
  "小米手环8"
)
for candidate in "${CANDIDATES[@]}"; do
  echo ""
  echo "商品: $candidate"
  # BUG FIX: ${candidate} must be double-quoted — the product names contain
  # spaces, so the unquoted expansion split the -d argument into several
  # words and produced an invalid request.
  curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d '{
      "model": "'"${MODEL_EMBEDDING}"'",
      "input": ["'"${candidate}"'"]
    }' | python3 -c "import sys, json; data=json.load(sys.stdin); print(f\" 向量维度: {len(data['data'][0]['embedding'])}\")"
done
# Stage 2: rerank the recalled candidates with the cross-encoder.
echo ""
print_info "阶段2: 使用 Reranker 对召回结果进行精排"
# FIX: double-quote the spliced model UID (SC2086).
curl -X POST "${XINFERENCE_HOST}/v1/rerank" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"${MODEL_RERANKER}"'",
    "query": "适合老人用的智能手机大屏幕长续航",
    "documents": [
      "红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
      "华为畅享60 6000mAh超长续航 护眼大屏",
      "OPPO A1 5000mAh电池 简易模式适合长辈",
      "iPhone 15 Pro Max 专业摄影旗舰",
      "小米手环8 智能运动监测"
    ],
    "top_n": 3
  }' | python3 -m json.tool
# ============================================
# 5. Advanced: batch processing
# ============================================
print_section "5. 批量 Embedding 生成(离线任务)"

print_info "为大量商品生成 embedding(模拟离线任务)"
echo "注意: 实际生产环境中,批量大小建议为 100-1000"

# FIX: use mktemp instead of the predictable fixed path /tmp/batch_input.json
# (avoids clobbering/symlink races when several users run the demo).
BATCH_INPUT=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
# Unquoted EOF: ${MODEL_EMBEDDING} is expanded inside the here-doc.
cat > "$BATCH_INPUT" <<EOF
{
"model": "${MODEL_EMBEDDING}",
"input": [
"商品1: 红米Note12 5000mAh大电量 6.67英寸大屏",
"商品2: iPhone 15 Pro Max 专业摄影旗舰",
"商品3: 华为畅享60 6000mAh超长续航 护眼大屏",
"商品4: OPPO A1 5000mAh电池 简易模式适合长辈",
"商品5: 小米手环8 智能运动监测",
"商品6: vivo Y78 5000mAh大电池 120Hz高刷屏",
"商品7: 三星Galaxy A54 5000mAh 防水防尘",
"商品8: 荣耀Play7T 6000mAh巨量电池",
"商品9: 真我11 Pro 2亿像素 100W快充",
"商品10: 诺基亚C31 5050mAh电池 耐用三防"
]
}
EOF

echo ""
echo "批量生成 10 个商品的 embedding..."
curl -X POST "${XINFERENCE_HOST}/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d @"$BATCH_INPUT" | python3 -c "import sys, json; data=json.load(sys.stdin); print(f'✅ 成功生成 {len(data[\"data\"])} 个向量'); print(f'向量维度: {len(data[\"data\"][0][\"embedding\"])}')"
rm -f -- "$BATCH_INPUT"
# ============================================
# 6. Python usage example
# ============================================
print_section "6. Python 调用示例"
# The quoted 'EOF' delimiter disables all shell expansion: the Python sample
# below is printed verbatim for the user to copy, it is NOT executed here.
# Do not edit the here-doc body — it is program output, not script code.
cat << 'EOF'
# Python 调用示例代码:
import requests
import numpy as np
XINFERENCE_HOST = "http://localhost:9997"
# Embedding 调用
def get_embedding(text: str) -> list:
"""获取文本的 embedding 向量"""
response = requests.post(
f"{XINFERENCE_HOST}/v1/embeddings",
json={
"model": "qwen3-embedding",
"input": [text]
}
)
return response.json()["data"][0]["embedding"]
# 批量 Embedding
def get_embeddings(texts: list) -> list:
"""批量获取 embedding 向量"""
response = requests.post(
f"{XINFERENCE_HOST}/v1/embeddings",
json={
"model": "qwen3-embedding",
"input": texts
}
)
return [item["embedding"] for item in response.json()["data"]]
# Reranker 调用
def rerank(query: str, documents: list, top_n: int = 10) -> list:
"""使用 reranker 对文档排序"""
response = requests.post(
f"{XINFERENCE_HOST}/v1/rerank",
json={
"model": "qwen3-reranker",
"query": query,
"documents": documents,
"top_n": top_n
}
)
return response.json()["results"]
# 计算余弦相似度
def cosine_similarity(vec1: list, vec2: list) -> float:
"""计算两个向量的余弦相似度"""
v1 = np.array(vec1)
v2 = np.array(vec2)
return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
# 完整搜索流程
def search(query: str, products: list) -> list:
"""两阶段搜索"""
# 阶段1: 密集检索(简化示例)
query_vec = get_embedding(query)
similarities = []
for product in products:
prod_vec = get_embedding(product)
sim = cosine_similarity(query_vec, prod_vec)
similarities.append((product, sim))
# 取 Top-200
similarities.sort(key=lambda x: x[1], reverse=True)
top_200 = [p for p, s in similarities[:200]]
# 阶段2: 精排
reranked = rerank(query, top_200, top_n=10)
return reranked
# 使用示例
if __name__ == "__main__":
query = "适合老人用的智能手机大屏幕长续航"
products = [
"红米Note12 5000mAh大电量 6.67英寸大屏 老人模式",
"华为畅享60 6000mAh超长续航 护眼大屏",
"小米手环8 智能运动监测"
]
results = search(query, products)
for r in results:
print(f"[{r['relevance_score']:.4f}] {r['document']}")
EOF
# ============================================
# 7. Performance test
# ============================================
print_section "7. 性能测试"

print_info "测试 Embedding API 响应时间:"
echo ""
# NOTE(review): `date +%s%N` is GNU coreutils; on macOS/BSD date it prints a
# literal 'N' — confirm the target platform if this script must be portable.
for i in {1..5}; do
  t0=$(date +%s%N)
  # FIX: double-quote the spliced model UID (SC2086), consistent with the
  # other API-call sections.
  curl -s -X POST "${XINFERENCE_HOST}/v1/embeddings" \
    -H "Content-Type: application/json" \
    -d '{
      "model": "'"${MODEL_EMBEDDING}"'",
      "input": ["测试文本"]
    }' > /dev/null
  t1=$(date +%s%N)
  ELAPSED=$(( (t1 - t0) / 1000000 ))  # nanoseconds -> milliseconds
  echo " 请求 $i: ${ELAPSED}ms"
done
echo ""

print_info "测试 Reranker API 响应时间:"
for i in {1..5}; do
  t0=$(date +%s%N)
  curl -s -X POST "${XINFERENCE_HOST}/v1/rerank" \
    -H "Content-Type: application/json" \
    -d '{
      "model": "'"${MODEL_RERANKER}"'",
      "query": "测试查询",
      "documents": ["文档1", "文档2", "文档3"],
      "top_n": 3
    }' > /dev/null
  t1=$(date +%s%N)
  ELAPSED=$(( (t1 - t0) / 1000000 ))  # nanoseconds -> milliseconds
  echo " 请求 $i: ${ELAPSED}ms"
done
# ============================================
# 8. Troubleshooting
# ============================================
print_section "8. 常见问题排查"

print_info "检查服务是否运行:"
if curl -s "${XINFERENCE_HOST}/v1/models" > /dev/null; then
  echo "✅ 服务正常"
else
  echo "❌ 服务未启动,请运行: ./start.sh"
fi
echo ""

print_info "检查模型是否部署:"
MODELS=$(curl -s "${XINFERENCE_HOST}/v1/models")
# NOTE(review): depending on the Xinference version, /v1/models may return a
# plain JSON list or an OpenAI-style {"data": [...]} envelope — handle both.
# Also: bare `except:` replaced with `except Exception:` so SystemExit /
# KeyboardInterrupt are not swallowed.
echo "$MODELS" | python3 -c "
import sys, json
try:
    payload = json.load(sys.stdin)
    models = payload['data'] if isinstance(payload, dict) and 'data' in payload else payload
    if models:
        print('✅ 已部署模型:')
        for m in models:
            print(f' - {m.get(\"model_type\")}: {m.get(\"model_uid\")}')
    else:
        print('❌ 没有已部署的模型,请运行: python deploy_models.py')
except Exception:
    print('❌ 无法获取模型信息')
"
echo ""

print_info "查看 GPU 使用情况:"
if command -v nvidia-smi &> /dev/null; then
  # FIX: IFS= read -r preserves leading whitespace and literal backslashes.
  nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader | while IFS= read -r line; do
    echo " GPU $line"
  done
else
  echo " ⚠️ nvidia-smi 未安装,无法查看 GPU 信息"
fi
echo ""

echo "========================================="
echo " ✅ API 调用示例演示完成"
echo "========================================="
|