"""Simple example: using the cloud text-embedding module.

Shows how to use CloudTextEncoder to turn text into embedding vectors:
a single text, a list of texts, a large list processed in batches, and a
cosine-similarity ranking of candidates against a query.
"""

import os
import sys
import traceback
from pathlib import Path

import numpy as np

# Add parent directory to path so the `embeddings` package resolves when this
# script is run directly. This has to happen before the project import below.
sys.path.insert(0, str(Path(__file__).parent.parent))

from embeddings.cloud_text_encoder import CloudTextEncoder  # noqa: E402

# Horizontal rule printed above and below every section title.
_BANNER = "=" * 60


def _print_header(title):
    """Print *title* framed by a banner line above and below."""
    print(_BANNER)
    print(title)
    print(_BANNER)


def _cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm, rather than dividing by
    zero and yielding NaN (which would make the ranking meaningless).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator


def example_single_text():
    """Example 1: embed a single text and print its shape and first values."""
    _print_header("示例1:单个文本向量化")

    encoder = CloudTextEncoder()

    text = "衣服的质量杠杠的"
    print(f"输入文本: {text}")

    # The result is indexed as embedding[0] below, i.e. one row per input
    # text even for a single string (assumed from usage; confirm in encoder).
    embedding = encoder.encode(text)
    print(f"向量维度: {embedding.shape}")
    print(f"向量前5个值: {embedding[0][:5]}")
    print()


def example_multiple_texts():
    """Example 2: embed a list of texts in a single encode() call."""
    _print_header("示例2:批量文本向量化")

    encoder = CloudTextEncoder()

    texts = [
        "Bohemian Maxi Dress",
        "Vintage Denim Jacket",
        "Minimalist Linen Trousers",
        "Gothic Black Boots",
        "Streetwear Oversized Hoodie",
    ]
    print(f"输入文本数量: {len(texts)}")
    for i, text in enumerate(texts, 1):
        print(f" {i}. {text}")

    embeddings = encoder.encode(texts)
    print(f"\n向量矩阵维度: {embeddings.shape}")
    print(f"第一个文本的向量前5个值: {embeddings[0][:5]}")
    print()


def example_batch_processing():
    """Example 3: embed many texts with encode_batch() and a batch size."""
    _print_header("示例3:大批量处理(自动分批)")

    encoder = CloudTextEncoder()

    # Single source of truth for the batch size that is both reported and used.
    batch_size = 10
    texts = [f"商品描述 {i}" for i in range(50)]
    print(f"输入文本数量: {len(texts)}")
    print(f"批大小: {batch_size}")

    embeddings = encoder.encode_batch(texts, batch_size=batch_size)
    print(f"向量矩阵维度: {embeddings.shape}")

    # The label says "average vector norm", so average the per-row L2 norms
    # (the previous code printed the mean of all elements instead).
    # Assumes embeddings is a 2-D (num_texts, dim) array -- confirm in encoder.
    mean_norm = np.linalg.norm(embeddings, axis=1).mean()
    print(f"平均向量范数: {mean_norm:.4f}")
    print()


def example_similarity_calculation():
    """Example 4: rank candidate texts by cosine similarity to a query."""
    _print_header("示例4:计算文本相似度")

    encoder = CloudTextEncoder()

    query = "夏季连衣裙"
    candidates = [
        "Summer maxi dress",
        "冬季羽绒服",
        "夏天长裙",
        "运动鞋",
        "女士连衣裙",
    ]

    print(f"查询文本: {query}")
    print("候选文本:")
    for i, text in enumerate(candidates, 1):
        print(f" {i}. {text}")

    query_embedding = encoder.encode(query)
    candidate_embeddings = encoder.encode(candidates)

    print("\n相似度分数:")
    similarities = []
    for candidate, candidate_emb in zip(candidates, candidate_embeddings):
        # query_embedding[0] is the single row produced for the query string.
        sim = _cosine_similarity(query_embedding[0], candidate_emb)
        similarities.append((sim, candidate))
        print(f" {candidate}: {sim:.4f}")

    # Pick the best match by score only, so tied scores never fall back to
    # comparing the candidate strings.
    best_score, best_text = max(similarities, key=lambda pair: pair[0])
    print(f"\n最相似的文本: {best_text} (相似度: {best_score:.4f})")
    print()


def main():
    """Run every example in order, after checking that the API key is set.

    Prints a usage hint and returns early when DASHSCOPE_API_KEY is not set.
    Any exception raised by an example is reported with its traceback rather
    than propagated.
    """
    if not os.getenv("DASHSCOPE_API_KEY"):
        print("错误: 请设置 DASHSCOPE_API_KEY 环境变量")
        print("示例: export DASHSCOPE_API_KEY='your-api-key'")
        return

    print("\n云端文本向量化示例\n")

    try:
        example_single_text()
        example_multiple_texts()
        example_batch_processing()
        example_similarity_calculation()

        _print_header("所有示例运行完成!")
    except Exception as e:
        # Top-level boundary of a demo script: report the failure with full
        # context instead of letting it escape.
        print(f"\n错误: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()