# user_profile_extractor.py
"""
用户画像信息提取器 - 从UserProfile中提取相关信息并生成描述
"""

import math
import re
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List, NamedTuple, Tuple

from src.services.user_profile import UserProfile
from config.logging_config import get_app_logger
from src.chat_search.dict_loader import DictLoader
from config.chat_search_config import USER_PROFILE_BEHAVIOR_CONFIG, SESSION_CONFIG, ATTR_STATIS_DISPLAY_MIN_OPTION_COUNT, ATTR_STATIS_DISPLAY_MIN_PRODUCT_COUNT, get_display_text, USER_BEHAVIOR_STAT_IN_PROMPT, USER_SEARCH_HISTORY_IN_PROMPT

logger = get_app_logger(__name__)


@dataclass
class BehaviorStatFieldConfig:
    """Configuration of a single field used in the behavior statistics.

    Instances are created from the per-field dictionaries stored in
    ``USER_PROFILE_BEHAVIOR_CONFIG`` (``BehaviorStatFieldConfig(**field_config)``).
    """
    field_name: str  # raw field name (attribute looked up on each behavior record)
    feature_prefix: str  # prefix of the generated feature keys
    display_name: str  # display name
    description_template: str  # description template
    max_items: int = 10  # maximum number of items to display
    is_repeated: bool = False  # whether this is a repeated (multi-valued) field
    is_numeric: bool = False  # whether this is a numeric field
    is_time: bool = False  # whether this is a time field
    bucket_size: int = 10  # bucket width (numeric fields only)
    enable: bool = True  # whether the field is enabled; enabled by default
    dict_name: Optional[str] = None  # dictionary name (optional)


@dataclass
class BehaviorStatsConfig:
    """Settings that drive the behavior statistics.

    Every field left as ``None`` is filled in from the central
    ``USER_PROFILE_BEHAVIOR_CONFIG`` mapping by ``__post_init__``.
    """
    # Weight applied to each behavior type
    behavior_weights: Optional[Dict[str, float]] = None

    # Fields whose value is counted directly
    direct_fields: Optional[List[BehaviorStatFieldConfig]] = None

    # Repeated (multi-valued) fields
    repeated_fields: Optional[List[BehaviorStatFieldConfig]] = None

    # Numeric fields
    numeric_fields: Optional[List[BehaviorStatFieldConfig]] = None

    # Time fields
    time_fields: Optional[List[BehaviorStatFieldConfig]] = None

    # Truncation limit of the behavior summary. ``None`` means "take it from
    # the central config"; the previous hard default of 1000 made the
    # config lookup in ``__post_init__`` unreachable.
    behavior_summary_truncate_limit: Optional[int] = None

    def __post_init__(self):
        """Fill every unset (``None``) field from the central configuration."""
        config = USER_PROFILE_BEHAVIOR_CONFIG

        if self.behavior_weights is None:
            self.behavior_weights = config['behavior_weights']

        # The four field groups are loaded the same way: one
        # BehaviorStatFieldConfig per configured dictionary.
        for group in ('direct_fields', 'repeated_fields', 'numeric_fields', 'time_fields'):
            if getattr(self, group) is None:
                setattr(
                    self,
                    group,
                    [BehaviorStatFieldConfig(**field_config) for field_config in config[group]],
                )

        if self.behavior_summary_truncate_limit is None:
            try:
                self.behavior_summary_truncate_limit = config['behavior_summary_truncate_limit']
            except KeyError:
                # Keep the historical default when the config does not define a limit.
                self.behavior_summary_truncate_limit = 1000


@dataclass
class UserProfileInfo:
    """Flat container for the user-profile attributes used by the extractor.

    List-valued fields default to a fresh empty list per instance; a list
    field explicitly passed as ``None`` is also normalised to ``[]``.
    """
    # Basic information
    sale_market_value: str = ""  # name of the main sales region
    nature_of_company_value: str = ""  # company nature name
    customer_type: str = ""  # company type code
    customer_type_value: str = ""  # company type name
    sell_channel_value: str = ""  # sales channel name
    stores_number: int = 0  # number of stores
    register_category_values: List[str] = field(default_factory=list)  # main purchase categories given at registration
    auth_category_values: List[str] = field(default_factory=list)  # main purchase categories given at certification
    purchase_quantity_by_year_value: str = ""  # purchase scale name
    customer_goods_structures: List[Dict[str, str]] = field(default_factory=list)  # customer goods structure
    brand_category_values: List[str] = field(default_factory=list)  # customer brand category names
    delivery_type_value: str = ""  # main delivery method name
    customs_import_scale: str = ""  # customs import scale
    purchase_quantity: int = 0  # cartons purchased per style
    tax_clearance_type: str = ""  # customs clearance method code
    tax_clearance_type_value: str = ""  # customs clearance method name
    category_values: List[str] = field(default_factory=list)  # business category names
    stores_number_offline: int = 0  # number of offline stores
    year_sales_amount: str = ""  # annual sales amount
    main_market_values: List[str] = field(default_factory=list)  # main target market names
    main_area_values: List[str] = field(default_factory=list)  # primary foreign-trade region names
    secondary_area_values: List[str] = field(default_factory=list)  # secondary foreign-trade region names
    country_value: str = ""  # country name

    # Recent search keywords
    # (original note: the 10 most recent keywords, excluding entries with isSearchFactory=true)
    recent_search_keywords: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Normalise list fields that were explicitly passed as ``None`` to ``[]``."""
        # Kept for backward compatibility with callers that pass None.
        for name in (
            'register_category_values',
            'auth_category_values',
            'customer_goods_structures',
            'brand_category_values',
            'category_values',
            'main_market_values',
            'main_area_values',
            'secondary_area_values',
            'recent_search_keywords',
        ):
            if getattr(self, name) is None:
                setattr(self, name, [])


class UserProfileExtractor:
    """用户画像信息提取器"""
    
    def __init__(self):
        """Create the extractor with a default statistics config and a dictionary loader."""
        stats_config = BehaviorStatsConfig()
        self.behavior_stats_config = stats_config
        loader = DictLoader()
        self.dict_loader = loader
    
    def extract_user_profile_info(self, user_profile: UserProfile) -> UserProfileInfo:
        """
        Copy the fields of ``user_profile.base_info`` into a ``UserProfileInfo``.

        Missing text values become ``""``, missing counts become ``0`` and list
        values are converted element by element with ``str`` (empty list when
        missing).

        Args:
            user_profile: UserProfile object

        Returns:
            UserProfileInfo: the extracted information; a default-constructed
            instance when ``user_profile`` or its ``base_info`` is missing
        """
        source = user_profile.base_info if user_profile else None
        if not source:
            logger.warning("[extract_user_profile_info] UserProfile or base_info is None")
            return UserProfileInfo()

        def text(value):
            # Falsy text values collapse to the empty string.
            return value or ""

        def count(value):
            # Falsy numeric values collapse to zero.
            return value or 0

        def names(values):
            # Stringify every element; falsy input yields an empty list.
            return [str(entry) for entry in values] if values else []

        extracted = UserProfileInfo(
            sale_market_value=text(source.saleMarketValue),
            nature_of_company_value=text(source.natureOfCompanyValue),
            customer_type=text(source.customerType),
            customer_type_value=text(source.customerTypeValue),
            sell_channel_value=text(source.sellChannelValue),
            stores_number=count(source.storesNumber),
            register_category_values=names(source.registerCategoryValues),
            auth_category_values=names(source.authCategoryValues),
            purchase_quantity_by_year_value=text(source.purchaseQuantityByYearValue),
            customer_goods_structures=self._extract_customer_goods_structures(source.customerGoodsStructure),
            brand_category_values=names(source.brandCategoryValues),
            delivery_type_value=text(source.deliveryTypeValue),
            customs_import_scale=text(source.customsImportScale),
            purchase_quantity=count(source.purchaseQuantity),
            tax_clearance_type=text(source.taxClearanceType),
            tax_clearance_type_value=text(source.taxClearanceTypeValue),
            category_values=names(source.categoryValues),
            stores_number_offline=count(source.storesNumberOffline),
            year_sales_amount=text(source.yearSalesAmount),
            main_market_values=names(source.mainMarketValues),
            main_area_values=names(source.mainAreaValues),
            secondary_area_values=names(source.secondaryAreaValues),
            country_value=text(source.countryValue),
            recent_search_keywords=self._extract_recent_search_keywords(user_profile),
        )

        logger.info(f"[UserProfileExtractor.extract_user_profile_info] Extracted user profile info: {extracted}")
        return extracted
    
    def _extract_customer_goods_structures(self, customer_goods_structures) -> List[Dict[str, str]]:
        """
        提取客户商品结构信息
        
        Args:
            customer_goods_structures: 客户商品结构列表
            
        Returns:
            List[Dict[str, str]]: 客户商品结构信息列表
        """
        if not customer_goods_structures:
            return []
        
        structures = []
        for structure in customer_goods_structures:
            structure_info = {
                'price_between': structure.priceBetween or "",
                'goods_grade': structure.goodsGrade or "",
                'package_type': structure.packageType or ""
            }
            structures.append(structure_info)
        
        return structures

        
    def generate_chat_search_intro(self, profile_info: UserProfileInfo) -> str:
        """
        生成导购语介绍
        
        Args:
            profile_info: UserProfileInfo对象
            
        Returns:
            str: 导购语介绍
        """
        if profile_info:
            customer_type_value = profile_info.customer_type_value
            # 地理位置信息
            location = profile_info.sale_market_value if profile_info.sale_market_value else profile_info.country_value
        else:
            customer_type_value = None
            location = None
        
        # 生成导购语
        if not location and not customer_type_value:
            return "你是一个跨境B2B选品顾问,请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
        elif not location:
            return f"你是一个跨境B2B选品顾问,了解“{customer_type_value}”类型客户的采购决策逻辑。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"  
        elif not customer_type_value:
            return f"你是一个跨境B2B选品顾问,熟悉{location}市场。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
        else:
            return f"你是一个跨境B2B选品顾问,熟悉{location}市场,了解“{customer_type_value}”类型客户的采购决策逻辑。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"


    def generate_natural_language_description(self, profile_info: UserProfileInfo) -> str:
        """
        生成用户基础信息的自然语言描述
        
        Args:
            profile_info: UserProfileInfo对象
            
        Returns:
            str: 自然语言描述
        """
        if not profile_info:
            return "暂无用户画像信息"
        
        description_parts = []
        
        # 基础公司信息
        if profile_info.customer_type_value:
            description_parts.append(f"公司类型:{profile_info.customer_type_value}")
        
        if profile_info.nature_of_company_value:
            description_parts.append(f"公司性质:{profile_info.nature_of_company_value}")
        
        if profile_info.sell_channel_value:
            description_parts.append(f"销售渠道:{profile_info.sell_channel_value}")
        
        # 地理位置信息
        location_parts = []
        if profile_info.country_value:
            location_parts.append(profile_info.country_value)
        if profile_info.sale_market_value:
            location_parts.append(profile_info.sale_market_value)
        if location_parts:
            description_parts.append(f"主要销售地区:{', '.join(location_parts)}")
        
        # 门店信息
        if profile_info.stores_number > 0:
            description_parts.append(f"门店数量:{profile_info.stores_number}家")
        if profile_info.stores_number_offline > 0:
            description_parts.append(f"线下门店:{profile_info.stores_number_offline}家")
        
        # 采购信息
        if profile_info.purchase_quantity_by_year_value:
            description_parts.append(f"采购规模:{profile_info.purchase_quantity_by_year_value}")
        
        if profile_info.purchase_quantity > 0:
            description_parts.append(f"单款采购箱数:{profile_info.purchase_quantity}箱")
        
        # 年销售额
        if profile_info.year_sales_amount:
            description_parts.append(f"年销售额:{profile_info.year_sales_amount}")
        
        # 类目信息
        if profile_info.register_category_values:
            description_parts.append(f"注册采购品类:{', '.join(str(item) for item in profile_info.register_category_values)}")
        
        if profile_info.auth_category_values:
            description_parts.append(f"认证采购品类:{', '.join(str(item) for item in profile_info.auth_category_values)}")
        
        if profile_info.category_values:
            description_parts.append(f"经营类目:{', '.join(str(item) for item in profile_info.category_values)}")
        
        # 品牌信息
        if profile_info.brand_category_values:
            description_parts.append(f"品牌品类:{', '.join(str(item) for item in profile_info.brand_category_values)}")
        
        # 市场信息
        if profile_info.main_market_values:
            description_parts.append(f"主攻市场:{', '.join(str(item) for item in profile_info.main_market_values)}")
        
        if profile_info.main_area_values:
            description_parts.append(f"外贸主攻区域:{', '.join(str(item) for item in profile_info.main_area_values)}")
        
        # 商品结构统计
        if profile_info.customer_goods_structures:
            structure_descriptions = []
            for structure in profile_info.customer_goods_structures[:USER_PROFILE_BEHAVIOR_CONFIG['max_customer_goods_structures']]:  # 只取前N个
                parts = []
                if structure['price_between']:
                    parts.append(f"价格区间{structure['price_between']}")
                if structure['goods_grade']:
                    parts.append(f"产品档次{structure['goods_grade']}")
                if structure['package_type']:
                    parts.append(f"包装类型{structure['package_type']}")
                if parts:
                    structure_descriptions.append('、'.join(parts))
            
            if structure_descriptions:
                description_parts.append(f"商品结构统计:{'; '.join(structure_descriptions)}")
        
        # 物流信息
        if profile_info.delivery_type_value:
            description_parts.append(f"主要出货方式:{profile_info.delivery_type_value}")
        
        if profile_info.tax_clearance_type_value:
            description_parts.append(f"清关方式:{profile_info.tax_clearance_type_value}")
        
        if profile_info.customs_import_scale:
            description_parts.append(f"海关进口规模:{profile_info.customs_import_scale}")
                
        # 组合成完整描述
        if description_parts:
            return "\n".join(description_parts)
        else:
            return "暂无用户画像信息(信息为空)"
    
    def extract_and_describe(self, user_profile: UserProfile) -> Tuple[str, str]:
        """
        Extract the user profile and build the texts placed in the prompt.

        Args:
            user_profile: UserProfile object

        Returns:
            Tuple[str, str]: ``(guide_intro, complete_description)`` where
            ``guide_intro`` is the shopping-guide introduction and
            ``complete_description`` is the customer background, followed
            (depending on the feature switches) by the behavior statistics
            and the recent search keywords
        """
        # Basic information and shopping-guide introduction
        profile_info = self.extract_user_profile_info(user_profile)
        guide_intro = self.generate_chat_search_intro(profile_info)

        if not user_profile:
            return guide_intro, "暂无用户画像信息"

        natural_description = self.generate_natural_language_description(profile_info)

        # Falls back to 'zh' when the instance has no ``language`` attribute.
        language = getattr(self, 'language', 'zh')

        complete_description = f"{get_display_text('customer_background', language)}:\n{natural_description}"

        # Behavior statistics are only computed when they are actually placed
        # in the prompt; previously they were always computed and then dropped
        # whenever the switch was off.
        if USER_BEHAVIOR_STAT_IN_PROMPT:
            common_attribute_distribution = self.extract_common_attribute_distribution(user_profile)
            item_specific_attributes = self.extract_item_specific_attributes(user_profile)

            common_attribute_description = self.generate_common_attribute_distribution_description(common_attribute_distribution)
            item_specific_attribute_description = self.generate_item_specific_attribute_description(item_specific_attributes)

            # Distribution of the general attributes
            if common_attribute_description:
                complete_description += f"\n\n{get_display_text('historical_purchase_general_attributes', language)}:\n{common_attribute_description}"

            # Category-specific attribute preferences
            if item_specific_attribute_description:
                complete_description += f"\n\n{get_display_text('historical_purchase_category_specific_attributes', language)}:\n{item_specific_attribute_description}"

        # Recent search keywords
        if USER_SEARCH_HISTORY_IN_PROMPT:
            recent_search_keywords = self._extract_recent_search_keywords(user_profile)
            if recent_search_keywords:
                complete_description += f"\n\n{get_display_text('recent_search_keywords', language)}:{', '.join(recent_search_keywords)}"

        return guide_intro, complete_description

    def extract_common_attribute_distribution(self, user_profile: UserProfile) -> Dict[str, Any]:
        """
        Compute weighted distributions of common attributes over the behavior history.

        Every click / add_cart / collect / purchase record contributes the
        weight configured for its behavior type in
        ``behavior_stats_config.behavior_weights``. For each enabled field the
        result holds, keyed by the field's ``feature_prefix``:

        * direct and repeated fields: ``<prefix>_weighted_counts``,
          ``<prefix>_total_weight``, ``<prefix>_top_items``
        * numeric fields: ``<prefix>_bucket_weighted_counts``,
          ``<prefix>_total_weight``, ``<prefix>_top_buckets``
        * time fields: ``<prefix>_time_bucket_weighted_counts``,
          ``<prefix>_total_weight``, ``<prefix>_top_time_buckets``

        Records without a usable value are summed under the ``'__empty__'``
        key. ``total_weighted_behaviors`` and ``behavior_type_counts`` are
        added as overall statistics.

        Args:
            user_profile: UserProfile object

        Returns:
            Dict[str, Any]: the statistics; an empty dict when the profile or
            its ``behavior_map`` is missing
        """
        if not user_profile or not user_profile.behavior_map:
            logger.warning("[extract_common_attribute_distribution] UserProfile or behavior_map is None")
            return {}

        behavior_map = user_profile.behavior_map
        stats_config = self.behavior_stats_config
        common_features = {}

        # A behavior list that is None is treated like an empty one.
        behavior_groups = [
            ('click', behavior_map.click or []),
            ('add_cart', behavior_map.add_cart or []),
            ('collect', behavior_map.collect or []),
            ('purchase', behavior_map.purchase or []),
        ]

        # Flatten all behaviors into (record, weight) pairs.
        all_behaviors = []
        for behavior_type, behaviors in behavior_groups:
            logger.info(f"[UserProfileExtractor.extract_common_attribute_distribution] Extracted behavior_type {behavior_type} with {len(behaviors)} behaviors")
            for behavior in behaviors:
                all_behaviors.append((behavior, stats_config.behavior_weights[behavior_type]))

        # Every record counts towards the total of every field (empty values
        # included), so the total is the same for all fields.
        total_weighted_behaviors = sum(weight for _, weight in all_behaviors)

        def store(prefix, counts_suffix, top_suffix, counter, empty_weight, top_n):
            # Empty weight is accumulated explicitly by the callers instead of
            # being derived as "total - sum(counter)": the subtraction is wrong
            # for repeated fields (one record can add its weight several times)
            # and can leave a float residue for fractional weights.
            if empty_weight > 0:
                counter['__empty__'] = empty_weight
            common_features[f'{prefix}_{counts_suffix}'] = dict(counter)
            common_features[f'{prefix}_total_weight'] = total_weighted_behaviors
            common_features[f'{prefix}_{top_suffix}'] = [item for item, _ in counter.most_common(top_n)]

        def to_datetime(value):
            # Strings are parsed with the fixed format; other values are used as given.
            if isinstance(value, str):
                return datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
            return value

        # 1. Fields whose value is counted directly
        for field_config in stats_config.direct_fields:
            if not field_config.enable:
                continue
            counter = Counter()
            empty_weight = 0

            for behavior, weight in all_behaviors:
                value = getattr(behavior, field_config.field_name, None)
                if value:
                    counter[str(value)] += weight
                else:
                    empty_weight += weight

            store(field_config.feature_prefix, 'weighted_counts', 'top_items', counter, empty_weight, 10)

        # 2. Repeated (multi-valued) fields
        for field_config in stats_config.repeated_fields:
            if not field_config.enable:
                continue
            counter = Counter()
            empty_weight = 0

            for behavior, weight in all_behaviors:
                values = getattr(behavior, field_config.field_name, None) or ()
                has_valid_value = False
                for value in values:
                    if value:
                        counter[str(value)] += weight
                        has_valid_value = True
                # A record is empty only when none of its values is usable.
                if not has_valid_value:
                    empty_weight += weight

            store(field_config.feature_prefix, 'weighted_counts', 'top_items', counter, empty_weight, 10)

        # 3. Numeric fields, bucketed by ``bucket_size``
        for field_config in stats_config.numeric_fields:
            if not field_config.enable:
                continue
            bucket_counter = Counter()
            empty_weight = 0

            for behavior, weight in all_behaviors:
                value = getattr(behavior, field_config.field_name, None)
                if value and value > 0:
                    bucket = int(value / field_config.bucket_size)
                    bucket_counter[str(bucket)] += weight
                else:
                    # Missing, zero or negative values count as empty.
                    empty_weight += weight

            store(field_config.feature_prefix, 'bucket_weighted_counts', 'top_buckets', bucket_counter, empty_weight, 10)

        # 4. Time fields: gap between the field's time and the behavior time
        for field_config in stats_config.time_fields:
            if not field_config.enable:
                continue
            time_bucket_counter = Counter()
            empty_weight = 0

            for behavior, weight in all_behaviors:
                time_value = getattr(behavior, field_config.field_name, None)
                behavior_time = getattr(behavior, 'behaviorTime', None)

                bucket = None
                if time_value and behavior_time:
                    try:
                        time_obj = to_datetime(time_value)
                        behavior_time_obj = to_datetime(behavior_time)
                        time_diff = behavior_time_obj - time_obj

                        # Test the sign on the timedelta itself: int(days / 30)
                        # truncates towards zero, which used to put gaps of
                        # 1-29 days in the future into the '0-6m' bucket.
                        if time_diff < timedelta(0):
                            bucket = 'future'
                        else:
                            months_diff = time_diff.days // 30
                            if months_diff <= 6:
                                bucket = '0-6m'
                            elif months_diff <= 12:
                                bucket = '6-12m'
                            elif months_diff <= 24:
                                bucket = '12-24m'
                            else:
                                bucket = '24m+'
                    except (ValueError, TypeError) as e:
                        logger.debug(f"Error parsing time for {field_config.field_name}: {e}")

                if bucket is None:
                    # Missing or unparsable times count as empty.
                    empty_weight += weight
                else:
                    time_bucket_counter[bucket] += weight

            store(field_config.feature_prefix, 'time_bucket_weighted_counts', 'top_time_buckets', time_bucket_counter, empty_weight, 5)

        # 5. Overall statistics
        common_features['total_weighted_behaviors'] = total_weighted_behaviors
        common_features['behavior_type_counts'] = {
            behavior_type: len(behaviors) for behavior_type, behaviors in behavior_groups
        }

        logger.info(f"Extracted behavior stats with {len(common_features)} feature groups")
        return common_features
    
    def extract_item_specific_attributes(self, user_profile: UserProfile) -> Dict[str, Any]:
        """
        从历史行为中提取每个商品的具体属性统计
        
        Args:
            user_profile: UserProfile对象
            
        Returns:
            Dict[str, Any]: 商品具体属性统计信息
        """
        if not user_profile or not user_profile.behavior_map:
            logger.warning("[extract_item_specific_attributes] UserProfile or behavior_map is None")
            return {}
        
        behavior_map = user_profile.behavior_map
        
        # 获取所有行为数据
        all_behaviors = []
        for behavior_type, behaviors in [
            ('click', behavior_map.click),
            ('add_cart', behavior_map.add_cart),
            ('collect', behavior_map.collect),
            ('purchase', behavior_map.purchase)
        ]:
            for behavior in behaviors:
                all_behaviors.append((behavior, self.behavior_stats_config.behavior_weights[behavior_type]))
        
        # 统计每个属性名称和属性值对应的权重
        attr_statistics = {}  # {attr_name: {option_name: weight}}
        
        for behavior, weight in all_behaviors:
            # 合并 spuAttributeList 和 skuAttributeList
            merged_attributes = []
            
            # 以 skuAttributeList 为基础
            if hasattr(behavior, 'skuAttributeList') and behavior.skuAttributeList:
                merged_attributes.extend(behavior.skuAttributeList)
            
            # 加入 spuAttributeList,如果 attributeId 已存在则跳过
            existing_attr_ids = set()
            if hasattr(behavior, 'skuAttributeList') and behavior.skuAttributeList:
                existing_attr_ids = {attr.attributeId for attr in behavior.skuAttributeList}
            
            if hasattr(behavior, 'spuAttributeList') and behavior.spuAttributeList:
                for attr in behavior.spuAttributeList:
                    if attr.attributeId not in existing_attr_ids:
                        merged_attributes.append(attr)
                        existing_attr_ids.add(attr.attributeId)
            
            # 统计合并后的属性
            for attr in merged_attributes:
                attr_id = attr.attributeId
                option_id = attr.optionId
                
                # 获取属性名称
                attr_name = self.dict_loader.get_name('spu_attribute', str(attr_id))
                if not attr_name:
                    attr_name = self.dict_loader.get_name('sku_attribute', str(attr_id))
                if not attr_name:
                    attr_name = f"属性{attr_id}"
                
                # 获取属性值名称
                option_name = self.dict_loader.get_name('spu_attribute_option', str(option_id))
                if not option_name:
                    option_name = self.dict_loader.get_name('sku_attribute_option', str(option_id))
                if not option_name:
                    option_name = f"选项{option_id}"
                
                # 跳过无效的属性值
                if option_name == '无' or not option_name:
                    continue
                
                # 统计
                if attr_name not in attr_statistics:
                    attr_statistics[attr_name] = {}
                
                if option_name not in attr_statistics[attr_name]:
                    attr_statistics[attr_name][option_name] = 0
                
                attr_statistics[attr_name][option_name] += weight
        
        if not attr_statistics:
            return {}
        
        # 生成属性统计特征
        attribute_features = {}
        
        # 计算每个属性的总权重并排序
        attr_with_total = [
            (attr_name, options_dict, sum(options_dict.values()))
            for attr_name, options_dict in attr_statistics.items()
        ]
        
        # 按总权重排序,取前10个属性
        sorted_attrs = sorted(attr_with_total, key=lambda x: x[2], reverse=True)
        
        for attr_name, options_dict, total_weight in sorted_attrs:
            # 按权重排序选项,取前5个
            sorted_options = sorted(options_dict.items(), key=lambda x: x[1], reverse=True)
            
            # 生成特征名称(使用属性名称的拼音或ID作为前缀)
            attr_feature_prefix = f"attr_{attr_name.replace(' ', '_').replace(':', '_')}"
            
            attribute_features[f'{attr_feature_prefix}_weighted_counts'] = dict(options_dict)
            attribute_features[f'{attr_feature_prefix}_total_weight'] = total_weight
            attribute_features[f'{attr_feature_prefix}_top_items'] = [item for item, count in sorted_options]
        
        # 添加总体属性统计
        total_attribute_weight = sum(attr[2] for attr in sorted_attrs)
        attribute_features['attribute_total_weight'] = total_attribute_weight
        attribute_features['attribute_attr_count'] = len(sorted_attrs)
        
        logger.info(f"Extracted attribute statistics with {len(attribute_features)} attribute feature groups")
        return attribute_features
    
    def generate_common_attribute_distribution_description(self, common_attribute_distribution: Dict[str, Any]) -> str:
        """
        Render the common attribute distribution statistics as natural-language text.

        The result has one line per section, in this order: an overall behavior
        summary (only when ``behavior_type_counts`` is present and sums to more
        than zero), followed by one line for every enabled field configured in
        ``self.behavior_stats_config`` (``direct_fields``, ``repeated_fields``,
        ``numeric_fields``, ``time_fields``). A field is rendered only when both
        its counts key and its ``<feature_prefix>_total_weight`` key are present
        and the total weight is positive.

        Args:
            common_attribute_distribution: Statistics keyed by feature name.

        Returns:
            str: The section lines joined with newlines. A fixed placeholder
            sentence is returned for empty input, and an empty string when the
            input is non-empty but no section produced any text.
        """
        if not common_attribute_distribution:
            return "暂无通用属性分布统计信息"

        stats_config = self.behavior_stats_config
        description_parts = []

        def categorical_label(field_config, item):
            # Direct/repeated fields: translate the raw value through the
            # field's dictionary when one is configured, else show it verbatim.
            if item == '__empty__':
                return '空值'
            if field_config.dict_name:
                return self.dict_loader.get_name(field_config.dict_name, str(item)) or str(item)
            return str(item)

        def numeric_label(field_config, bucket):
            # Numeric fields: a bucket index becomes the value range it covers.
            if bucket == '__empty__':
                return '空值'
            return f"{int(bucket)*field_config.bucket_size}-{(int(bucket)+1)*field_config.bucket_size}"

        time_bucket_labels = {
            '__empty__': '空值',
            '0-6m': '半年内',
            '6-12m': '半年到一年',
            '12-24m': '1-2年',
            '24m+': '2年+',
            'future': '错误时间',
        }

        def time_label(field_config, bucket):
            # Time fields: known bucket codes get a readable label, unknown
            # codes are shown as-is.
            bucket_str = str(bucket)
            return time_bucket_labels.get(bucket_str, bucket_str)

        def append_field_descriptions(field_configs, counts_suffix, label_for):
            # Shared renderer for all four field families; they differ only in
            # the counts key suffix and in how a key is turned into a label.
            for field_config in field_configs:
                if not field_config.enable:
                    continue
                counts_key = f'{field_config.feature_prefix}_{counts_suffix}'
                total_weight_key = f'{field_config.feature_prefix}_total_weight'
                if counts_key not in common_attribute_distribution:
                    continue
                if total_weight_key not in common_attribute_distribution:
                    continue

                weighted_counts = common_attribute_distribution[counts_key]
                total_weight = common_attribute_distribution[total_weight_key]
                if total_weight > 0:
                    # Heaviest entries first, truncated to the field's display limit.
                    ranked = sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)[:field_config.max_items]

                    values_with_percentage = []
                    for key, count in ranked:
                        # The share is relative to the field's total weight,
                        # not to the sum of the entries that are displayed.
                        percentage = (count / total_weight) * 100
                        values_with_percentage.append(f"{label_for(field_config, key)}({percentage:.1f}%)")

                    if values_with_percentage:
                        description_parts.append(
                            field_config.description_template.format(
                                display_name=field_config.display_name,
                                values=', '.join(values_with_percentage)
                            )
                        )

        # 0. Overall behavior summary (always the first line).
        if 'behavior_type_counts' in common_attribute_distribution:
            behavior_counts = common_attribute_distribution['behavior_type_counts']
            total_behaviors = sum(behavior_counts.values())

            if total_behaviors > 0:
                truncate_limit = stats_config.behavior_summary_truncate_limit
                # Once the total reaches the limit, only "more than <limit>" is reported.
                if total_behaviors >= truncate_limit:
                    behavior_summary = f"该用户有超过{truncate_limit}次行为"
                else:
                    behavior_summary = f"该用户有{total_behaviors}次行为"

                # Per-type breakdown, listing only the types that occurred.
                behavior_details = [
                    f"{behavior_counts[behavior_type]}次{behavior_label}"
                    for behavior_type, behavior_label in (
                        ('click', '点击'),
                        ('add_cart', '加购'),
                        ('collect', '收藏'),
                        ('purchase', '购买'),
                    )
                    if behavior_counts.get(behavior_type, 0) > 0
                ]
                if behavior_details:
                    behavior_summary += f"包括{', '.join(behavior_details)}"

                description_parts.append(behavior_summary)

        # 1. Direct-value fields.
        append_field_descriptions(stats_config.direct_fields, 'weighted_counts', categorical_label)
        # 2. Repeated fields.
        append_field_descriptions(stats_config.repeated_fields, 'weighted_counts', categorical_label)
        # 3. Numeric (bucketed) fields.
        append_field_descriptions(stats_config.numeric_fields, 'bucket_weighted_counts', numeric_label)
        # 4. Time (bucketed) fields.
        append_field_descriptions(stats_config.time_fields, 'time_bucket_weighted_counts', time_label)

        # Joining an empty list yields "", which is the "nothing to say" result.
        return "\n".join(description_parts)

    def generate_item_specific_attribute_description(self, item_specific_attributes: Dict[str, Any]) -> str:
        """
        Render the item-specific attribute statistics as natural-language text.

        Every ``attr_<name>_weighted_counts`` entry is treated as one attribute
        whose value maps option names to weights. Attributes are ordered by
        their summed weight (descending) and limited to
        ``USER_PROFILE_BEHAVIOR_CONFIG['max_attributes_display']``; within each
        attribute the heaviest
        ``USER_PROFILE_BEHAVIOR_CONFIG['max_options_per_attribute']`` options are
        listed with their share of the attribute's total weight.

        Args:
            item_specific_attributes: Item-specific attribute statistics.

        Returns:
            str: One bullet line per attribute, joined with newlines, or a
            placeholder sentence when there is nothing to describe.
        """
        if not item_specific_attributes:
            return "暂无商品具体属性统计信息。"

        prefix = 'attr_'
        suffix = '_weighted_counts'

        # Collect the per-attribute option weights.
        attr_features = {}
        for key, value in item_specific_attributes.items():
            if not (key.startswith(prefix) and key.endswith(suffix)):
                continue
            if len(key) < len(prefix) + len(suffix):
                # Prefix and suffix overlap, so there is no attribute name in between.
                continue
            # Strip only the leading prefix and the trailing suffix. str.replace
            # would also delete these markers from inside the attribute name.
            attr_features[key[len(prefix):-len(suffix)]] = value

        if not attr_features:
            return "暂无有效属性统计信息。"

        # Order attributes by total weight, heaviest first.
        sorted_attrs = sorted(
            (
                (attr_name, weighted_counts, sum(weighted_counts.values()))
                for attr_name, weighted_counts in attr_features.items()
            ),
            key=lambda x: x[2],
            reverse=True,
        )

        max_attrs = USER_PROFILE_BEHAVIOR_CONFIG['max_attributes_display']
        max_options = USER_PROFILE_BEHAVIOR_CONFIG['max_options_per_attribute']

        descriptions = []
        for attr_name, weighted_counts, total_weight in sorted_attrs[:max_attrs]:
            if not total_weight > 0:
                # No positive weight means no meaningful share can be computed
                # (and dividing would raise ZeroDivisionError); skip the attribute.
                continue

            # Heaviest options first, truncated to the display limit.
            sorted_options = sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)[:max_options]

            option_texts = [
                f"{option_name}({(weight / total_weight) * 100:.1f}%)"
                for option_name, weight in sorted_options
                if option_name != '__empty__'
            ]

            if option_texts:
                descriptions.append(f"• {attr_name}: {', '.join(option_texts)}")

        if descriptions:
            return "\n".join(descriptions)
        return "暂无有效属性统计信息。"
    
    def _extract_recent_search_keywords(self, user_profile: UserProfile) -> List[str]:
        """
        Collect the user's distinct, valid search keywords.

        Entries flagged with ``isSearchFactory`` and entries without a keyword
        are ignored. Keywords are stripped, validated with
        ``_is_valid_search_keyword`` and de-duplicated, keeping input order.
        Collection stops once ``SESSION_CONFIG['max_recent_search_keywords']``
        keywords have been gathered.

        NOTE(review): "recent" presumably relies on ``search_keyword`` being
        ordered most recent first; confirm against the producer.

        Args:
            user_profile: UserProfile object.

        Returns:
            List[str]: The collected keywords; empty when the profile, its
            behavior map, or its search history is missing.
        """
        if not (user_profile and user_profile.behavior_map):
            return []

        search_behaviors = user_profile.behavior_map.search_keyword
        if not search_behaviors:
            return []

        collected = []
        already_seen = set()
        for entry in search_behaviors:
            # Skip search-factory entries and entries without any keyword text.
            if entry.isSearchFactory or not entry.keyword:
                continue

            candidate = entry.keyword.strip()

            # Drop keywords that fail validation, then drop duplicates.
            if not self._is_valid_search_keyword(candidate) or candidate in already_seen:
                continue

            already_seen.add(candidate)
            collected.append(candidate)

            # Stop as soon as the configured maximum is reached.
            if len(collected) >= SESSION_CONFIG['max_recent_search_keywords']:
                break

        logger.info(f"[UserProfileExtractor._extract_recent_search_keywords] Extracted {len(collected)} recent search keywords")
        return collected
    
    def _is_valid_search_keyword(self, keyword: str) -> bool:
        """
        判断搜索关键词是否有效
        
        Args:
            keyword: 搜索关键词
            
        Returns:
            bool: 是否有效
        """
        if not keyword or keyword.strip() == '':
            return False
        
        # 过滤掉纯数字、下划线、减号、空白字符构成的关键词
        # 使用正则表达式匹配:只包含数字、下划线、减号、空白字符的字符串
        if re.match(r'^[\d\s_-]+$', keyword):
            return False
        
        # 只有一个单词(split后只有一个)、并且这个单词里面既包含数字又包含字母 (转小写后 既有小写字母、又有数字)
        if len(keyword.split()) == 1:
            if re.match(r'^[a-z0-9]+$', keyword.lower()):
                return False
            # 包含数字和-
            if re.match(r'^[0-9-]+$', keyword):
                return False
        
        return True