# cloud_embedding_example.py (4.17 KB)
"""
简单示例:使用云端文本向量化模块

展示如何使用 CloudTextEncoder 进行文本向量化。
"""

import os
import sys
from pathlib import Path

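# Prepend the directory one level above this script's folder to sys.path
# (presumably where the `embeddings` package imported below lives).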
sys.path.insert(0, str(Path(__file__).parent.parent))

from embeddings.cloud_text_encoder import CloudTextEncoder


def example_single_text():
    """Example 1: embed a single piece of text.

    Creates a ``CloudTextEncoder`` with default arguments, encodes one
    string, then prints the shape of the result and the first five values
    of its first row.
    """
    banner = "=" * 60
    print(banner)
    print("示例1:单个文本向量化")
    print(banner)

    sample = "衣服的质量杠杠的"

    # The encoder is built before the input is echoed, so a construction
    # failure surfaces before any per-example output.
    text_encoder = CloudTextEncoder()
    print(f"输入文本: {sample}")

    vector = text_encoder.encode(sample)

    print(f"向量维度: {vector.shape}")
    # The result is indexed as a 2-D structure: row 0 holds the embedding.
    leading_values = vector[0][:5]
    print(f"向量前5个值: {leading_values}")
    print()


def example_multiple_texts():
    """Example 2: embed several texts with one ``encode`` call.

    Lists the input texts, encodes them together, then prints the shape of
    the returned matrix and the first five values of the first row.
    """
    banner = "=" * 60
    print(banner)
    print("示例2:批量文本向量化")
    print(banner)

    text_encoder = CloudTextEncoder()

    product_names = [
        "Bohemian Maxi Dress",
        "Vintage Denim Jacket",
        "Minimalist Linen Trousers",
        "Gothic Black Boots",
        "Streetwear Oversized Hoodie",
    ]

    print(f"输入文本数量: {len(product_names)}")
    # Number the inputs starting from 1 for display.
    for position, name in enumerate(product_names, start=1):
        print(f"  {position}. {name}")

    matrix = text_encoder.encode(product_names)

    print(f"\n向量矩阵维度: {matrix.shape}")
    first_row_head = matrix[0][:5]
    print(f"第一个文本的向量前5个值: {first_row_head}")
    print()


def example_batch_processing():
    """Example 3: embed a large list of texts via ``encode_batch``.

    Generates 50 synthetic product descriptions, encodes them with
    ``encode_batch`` using a batch size of 10, and prints the shape of the
    result together with the average L2 norm of the embedding vectors.

    The original code printed ``embeddings.mean()`` (the mean of every
    element) under the label "average vector norm"; the value reported now
    is the mean of the per-row Euclidean norms, which matches the label.
    """
    # Function-scope import: only the examples that do math need numpy.
    import numpy as np

    print("=" * 60)
    print("示例3:大批量处理(自动分批)")
    print("=" * 60)

    encoder = CloudTextEncoder()

    # Single source of truth for the batch size, so the printed value can
    # never drift from the value actually passed to encode_batch.
    batch_size = 10
    texts = [f"商品描述 {i}" for i in range(50)]

    print(f"输入文本数量: {len(texts)}")
    print(f"批大小: {batch_size}")

    embeddings = encoder.encode_batch(texts, batch_size=batch_size)

    print(f"向量矩阵维度: {embeddings.shape}")
    # Assumes one embedding per row (2-D result) — TODO confirm against
    # CloudTextEncoder.encode_batch.
    mean_norm = np.linalg.norm(embeddings, axis=1).mean()
    print(f"平均向量范数: {mean_norm:.4f}")
    print()


def example_similarity_calculation():
    """Example 4: rank candidate texts by cosine similarity to a query.

    Encodes one query string and a list of candidates, prints the cosine
    similarity of each candidate to the query, and finally reports the
    candidate with the highest score.
    """
    banner = "=" * 60
    print(banner)
    print("示例4:计算文本相似度")
    print(banner)

    import numpy as np

    text_encoder = CloudTextEncoder()

    query = "夏季连衣裙"
    candidates = [
        "Summer maxi dress",
        "冬季羽绒服",
        "夏天长裙",
        "运动鞋",
        "女士连衣裙",
    ]

    print(f"查询文本: {query}")
    print("候选文本:")
    for position, label in enumerate(candidates, start=1):
        print(f"  {position}. {label}")

    query_embedding = text_encoder.encode(query)
    candidate_embeddings = text_encoder.encode(candidates)

    def _cosine(u, v):
        """Return dot(u, v) divided by the product of their L2 norms."""
        numerator = np.dot(u, v)
        denominator = np.linalg.norm(u) * np.linalg.norm(v)
        return numerator / denominator

    print("\n相似度分数:")
    scored = []
    for idx, candidate_vec in enumerate(candidate_embeddings):
        label = candidates[idx]
        # Row 0 of the query result is the query's embedding vector.
        score = _cosine(query_embedding[0], candidate_vec)
        scored.append((score, label))
        print(f"  {label}: {score:.4f}")

    # Descending sort on (score, text) tuples; the best match ends up first.
    ranked = sorted(scored, reverse=True)
    best_score, best_label = ranked[0][0], ranked[0][1]
    print(f"\n最相似的文本: {best_label} (相似度: {best_score:.4f})")
    print()


def main():
    """Entry point: check for the API key, then run every example in order.

    When the ``DASHSCOPE_API_KEY`` environment variable is missing or empty,
    a usage hint is printed and the function returns without running
    anything. Any exception raised by an example is printed together with
    its traceback rather than propagated to the caller.
    """
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        print("错误: 请设置 DASHSCOPE_API_KEY 环境变量")
        print("示例: export DASHSCOPE_API_KEY='your-api-key'")
        return

    print("\n云端文本向量化示例\n")

    divider = "=" * 60
    try:
        # Run the demos sequentially; the first failure aborts the rest.
        for run_example in (
            example_single_text,
            example_multiple_texts,
            example_batch_processing,
            example_similarity_calculation,
        ):
            run_example()

        print(divider)
        print("所有示例运行完成!")
        print(divider)
    except Exception as exc:
        # Top-level boundary of a demo script: report the error, don't crash.
        import traceback

        print(f"\n错误: {exc}")
        traceback.print_exc()


# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()