user_profile_extractor.py
47.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
"""
用户画像信息提取器 - 从UserProfile中提取相关信息并生成描述
"""
from typing import Dict, Any, Optional, List, NamedTuple
from dataclasses import dataclass
from datetime import datetime, timedelta
from collections import Counter
import re,math
from src.services.user_profile import UserProfile
from config.logging_config import get_app_logger
from src.chat_search.dict_loader import DictLoader
from config.chat_search_config import USER_PROFILE_BEHAVIOR_CONFIG, SESSION_CONFIG, ATTR_STATIS_DISPLAY_MIN_OPTION_COUNT, ATTR_STATIS_DISPLAY_MIN_PRODUCT_COUNT, get_display_text, USER_BEHAVIOR_STAT_IN_PROMPT, USER_SEARCH_HISTORY_IN_PROMPT
logger = get_app_logger(__name__)
@dataclass
class BehaviorStatFieldConfig:
"""行为统计字段配置"""
field_name: str # 原始字段名
feature_prefix: str # 特征前缀
display_name: str # 显示名称
description_template: str # 描述模板
max_items: int = 10 # 最大显示项目数
is_repeated: bool = False # 是否为重复字段
is_numeric: bool = False # 是否为数值字段
is_time: bool = False # 是否为时间字段
bucket_size: int = 10 # 分桶大小(仅用于数值字段)
enable: bool = True # 是否启用该字段,默认启用
dict_name: str = None # 词典名称(可选)
@dataclass
class BehaviorStatsConfig:
"""行为统计配置"""
# 行为权重定义
behavior_weights: Dict[str, float] = None
# 直接取值字段配置
direct_fields: List[BehaviorStatFieldConfig] = None
# 重复字段配置
repeated_fields: List[BehaviorStatFieldConfig] = None
# 数值字段配置
numeric_fields: List[BehaviorStatFieldConfig] = None
# 时间字段配置
time_fields: List[BehaviorStatFieldConfig] = None
# 行为统计配置
behavior_summary_truncate_limit: int = 1000 # 行为统计截断限制
def __post_init__(self):
"""初始化默认配置"""
# 从集中配置加载
config = USER_PROFILE_BEHAVIOR_CONFIG
if self.behavior_weights is None:
self.behavior_weights = config['behavior_weights']
if self.direct_fields is None:
self.direct_fields = [BehaviorStatFieldConfig(**field_config) for field_config in config['direct_fields']]
if self.repeated_fields is None:
self.repeated_fields = [BehaviorStatFieldConfig(**field_config) for field_config in config['repeated_fields']]
if self.numeric_fields is None:
self.numeric_fields = [BehaviorStatFieldConfig(**field_config) for field_config in config['numeric_fields']]
if self.time_fields is None:
self.time_fields = [BehaviorStatFieldConfig(**field_config) for field_config in config['time_fields']]
if self.behavior_summary_truncate_limit is None:
self.behavior_summary_truncate_limit = config['behavior_summary_truncate_limit']
@dataclass
class UserProfileInfo:
"""用户画像信息结构"""
# 基础信息
sale_market_value: str = "" # 主要销售地区名
nature_of_company_value: str = "" # 公司性质名
customer_type: str = "" # 公司类型编码
customer_type_value: str = "" # 公司类型名
sell_channel_value: str = "" # 销售渠道名
stores_number: int = 0 # 门店数量
register_category_values: List[str] = None # 注册主要采购品类名
auth_category_values: List[str] = None # 认证主要采购品类名
purchase_quantity_by_year_value: str = "" # 采购规模名
customer_goods_structures: List[Dict[str, str]] = None # 客户商品结构
brand_category_values: List[str] = None # 客户品牌品类名
delivery_type_value: str = "" # 主要出货方式名
customs_import_scale: str = "" # 海关进口规模
purchase_quantity: int = 0 # 单款采购箱数
tax_clearance_type: str = "" # 清关方式编码
tax_clearance_type_value: str = "" # 清关方式名
category_values: List[str] = None # 经营类目名
stores_number_offline: int = 0 # 线下门店数量
year_sales_amount: str = "" # 年销售额
main_market_values: List[str] = None # 主攻市场名
main_area_values: List[str] = None # 外贸主攻区域名
secondary_area_values: List[str] = None # 外贸次要区域名
country_value: str = "" # 国家名
# 最近搜索词
recent_search_keywords: List[str] = None # 最近10个搜索词(过滤掉isSearchFactory=true的)
def __post_init__(self):
"""初始化默认值"""
if self.register_category_values is None:
self.register_category_values = []
if self.auth_category_values is None:
self.auth_category_values = []
if self.customer_goods_structures is None:
self.customer_goods_structures = []
if self.brand_category_values is None:
self.brand_category_values = []
if self.category_values is None:
self.category_values = []
if self.main_market_values is None:
self.main_market_values = []
if self.main_area_values is None:
self.main_area_values = []
if self.secondary_area_values is None:
self.secondary_area_values = []
if self.recent_search_keywords is None:
self.recent_search_keywords = []
class UserProfileExtractor:
"""用户画像信息提取器"""
def __init__(self):
"""初始化提取器"""
self.behavior_stats_config = BehaviorStatsConfig()
self.dict_loader = DictLoader()
def extract_user_profile_info(self, user_profile: UserProfile) -> UserProfileInfo:
"""
从UserProfile中提取相关信息
Args:
user_profile: UserProfile对象
Returns:
UserProfileInfo: 提取的用户画像信息
"""
if not user_profile or not user_profile.base_info:
logger.warning("[extract_user_profile_info] UserProfile or base_info is None")
return UserProfileInfo()
base_info = user_profile.base_info
# 提取基础信息
profile_info = UserProfileInfo(
sale_market_value=base_info.saleMarketValue or "",
nature_of_company_value=base_info.natureOfCompanyValue or "",
customer_type=base_info.customerType or "",
customer_type_value=base_info.customerTypeValue or "",
sell_channel_value=base_info.sellChannelValue or "",
stores_number=base_info.storesNumber or 0,
register_category_values=[str(item) for item in base_info.registerCategoryValues] if base_info.registerCategoryValues else [],
auth_category_values=[str(item) for item in base_info.authCategoryValues] if base_info.authCategoryValues else [],
purchase_quantity_by_year_value=base_info.purchaseQuantityByYearValue or "",
customer_goods_structures=self._extract_customer_goods_structures(base_info.customerGoodsStructure),
brand_category_values=[str(item) for item in base_info.brandCategoryValues] if base_info.brandCategoryValues else [],
delivery_type_value=base_info.deliveryTypeValue or "",
customs_import_scale=base_info.customsImportScale or "",
purchase_quantity=base_info.purchaseQuantity or 0,
tax_clearance_type=base_info.taxClearanceType or "",
tax_clearance_type_value=base_info.taxClearanceTypeValue or "",
category_values=[str(item) for item in base_info.categoryValues] if base_info.categoryValues else [],
stores_number_offline=base_info.storesNumberOffline or 0,
year_sales_amount=base_info.yearSalesAmount or "",
main_market_values=[str(item) for item in base_info.mainMarketValues] if base_info.mainMarketValues else [],
main_area_values=[str(item) for item in base_info.mainAreaValues] if base_info.mainAreaValues else [],
secondary_area_values=[str(item) for item in base_info.secondaryAreaValues] if base_info.secondaryAreaValues else [],
country_value=base_info.countryValue or "",
recent_search_keywords=self._extract_recent_search_keywords(user_profile)
)
logger.info(f"[UserProfileExtractor.extract_user_profile_info] Extracted user profile info: {profile_info}")
return profile_info
def _extract_customer_goods_structures(self, customer_goods_structures) -> List[Dict[str, str]]:
"""
提取客户商品结构信息
Args:
customer_goods_structures: 客户商品结构列表
Returns:
List[Dict[str, str]]: 客户商品结构信息列表
"""
if not customer_goods_structures:
return []
structures = []
for structure in customer_goods_structures:
structure_info = {
'price_between': structure.priceBetween or "",
'goods_grade': structure.goodsGrade or "",
'package_type': structure.packageType or ""
}
structures.append(structure_info)
return structures
def generate_chat_search_intro(self, profile_info: UserProfileInfo) -> str:
"""
生成导购语介绍
Args:
profile_info: UserProfileInfo对象
Returns:
str: 导购语介绍
"""
if profile_info:
customer_type_value = profile_info.customer_type_value
# 地理位置信息
location = profile_info.sale_market_value if profile_info.sale_market_value else profile_info.country_value
else:
customer_type_value = None
location = None
# 生成导购语
if not location and not customer_type_value:
return "你是一个跨境B2B选品顾问,请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
elif not location:
return f"你是一个跨境B2B选品顾问,了解“{customer_type_value}”类型客户的采购决策逻辑。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
elif not customer_type_value:
return f"你是一个跨境B2B选品顾问,熟悉{location}市场。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
else:
return f"你是一个跨境B2B选品顾问,熟悉{location}市场,了解“{customer_type_value}”类型客户的采购决策逻辑。请基于客户背景信息、本次搜索query及其相关的搜索结果,按要求完成选品的思考和建议。"
def generate_natural_language_description(self, profile_info: UserProfileInfo) -> str:
"""
生成用户基础信息的自然语言描述
Args:
profile_info: UserProfileInfo对象
Returns:
str: 自然语言描述
"""
if not profile_info:
return "暂无用户画像信息"
description_parts = []
# 基础公司信息
if profile_info.customer_type_value:
description_parts.append(f"公司类型:{profile_info.customer_type_value}")
if profile_info.nature_of_company_value:
description_parts.append(f"公司性质:{profile_info.nature_of_company_value}")
if profile_info.sell_channel_value:
description_parts.append(f"销售渠道:{profile_info.sell_channel_value}")
# 地理位置信息
location_parts = []
if profile_info.country_value:
location_parts.append(profile_info.country_value)
if profile_info.sale_market_value:
location_parts.append(profile_info.sale_market_value)
if location_parts:
description_parts.append(f"主要销售地区:{', '.join(location_parts)}")
# 门店信息
if profile_info.stores_number > 0:
description_parts.append(f"门店数量:{profile_info.stores_number}家")
if profile_info.stores_number_offline > 0:
description_parts.append(f"线下门店:{profile_info.stores_number_offline}家")
# 采购信息
if profile_info.purchase_quantity_by_year_value:
description_parts.append(f"采购规模:{profile_info.purchase_quantity_by_year_value}")
if profile_info.purchase_quantity > 0:
description_parts.append(f"单款采购箱数:{profile_info.purchase_quantity}箱")
# 年销售额
if profile_info.year_sales_amount:
description_parts.append(f"年销售额:{profile_info.year_sales_amount}")
# 类目信息
if profile_info.register_category_values:
description_parts.append(f"注册采购品类:{', '.join(str(item) for item in profile_info.register_category_values)}")
if profile_info.auth_category_values:
description_parts.append(f"认证采购品类:{', '.join(str(item) for item in profile_info.auth_category_values)}")
if profile_info.category_values:
description_parts.append(f"经营类目:{', '.join(str(item) for item in profile_info.category_values)}")
# 品牌信息
if profile_info.brand_category_values:
description_parts.append(f"品牌品类:{', '.join(str(item) for item in profile_info.brand_category_values)}")
# 市场信息
if profile_info.main_market_values:
description_parts.append(f"主攻市场:{', '.join(str(item) for item in profile_info.main_market_values)}")
if profile_info.main_area_values:
description_parts.append(f"外贸主攻区域:{', '.join(str(item) for item in profile_info.main_area_values)}")
# 商品结构统计
if profile_info.customer_goods_structures:
structure_descriptions = []
for structure in profile_info.customer_goods_structures[:USER_PROFILE_BEHAVIOR_CONFIG['max_customer_goods_structures']]: # 只取前N个
parts = []
if structure['price_between']:
parts.append(f"价格区间{structure['price_between']}")
if structure['goods_grade']:
parts.append(f"产品档次{structure['goods_grade']}")
if structure['package_type']:
parts.append(f"包装类型{structure['package_type']}")
if parts:
structure_descriptions.append('、'.join(parts))
if structure_descriptions:
description_parts.append(f"商品结构统计:{'; '.join(structure_descriptions)}")
# 物流信息
if profile_info.delivery_type_value:
description_parts.append(f"主要出货方式:{profile_info.delivery_type_value}")
if profile_info.tax_clearance_type_value:
description_parts.append(f"清关方式:{profile_info.tax_clearance_type_value}")
if profile_info.customs_import_scale:
description_parts.append(f"海关进口规模:{profile_info.customs_import_scale}")
# 组合成完整描述
if description_parts:
return "\n".join(description_parts)
else:
return "暂无用户画像信息(信息为空)"
def extract_and_describe(self, user_profile: UserProfile) -> str:
"""
提取用户画像信息并生成完整的自然语言描述
Args:
user_profile: UserProfile对象
Returns:
导购语, 完整的用户画像自然语言描述
"""
# 提取基础信息
profile_info = self.extract_user_profile_info(user_profile)
# 生成导购语
guide_intro = self.generate_chat_search_intro(profile_info)
if not user_profile:
return guide_intro, "暂无用户画像信息"
natural_description = self.generate_natural_language_description(profile_info)
# 提取历史行为中的通用属性分布统计
common_attribute_distribution = self.extract_common_attribute_distribution(user_profile)
# 提取历史行为中每个商品的具体属性统计
item_specific_attributes = self.extract_item_specific_attributes(user_profile)
# 生成自然语言描述
common_attribute_description = self.generate_common_attribute_distribution_description(common_attribute_distribution)
item_specific_attribute_description = self.generate_item_specific_attribute_description(item_specific_attributes)
# 组织完整的描述
language = getattr(self, 'language', 'zh')
complete_description = f"{get_display_text('customer_background', language)}:\n{natural_description}"
# 添加通用属性分布描述
if USER_BEHAVIOR_STAT_IN_PROMPT:
if common_attribute_description:
complete_description += f"\n\n{get_display_text('historical_purchase_general_attributes', language)}:\n{common_attribute_description}"
# 添加具体属性偏好描述
if item_specific_attribute_description:
complete_description += f"\n\n{get_display_text('historical_purchase_category_specific_attributes', language)}:\n{item_specific_attribute_description}"
# 添加最近搜索词信息
# 提取最近搜索词
if USER_SEARCH_HISTORY_IN_PROMPT:
recent_search_keywords = self._extract_recent_search_keywords(user_profile)
if recent_search_keywords:
complete_description += f"\n\n{get_display_text('recent_search_keywords', language)}:{', '.join(recent_search_keywords)}"
return guide_intro, complete_description
def extract_common_attribute_distribution(self, user_profile: UserProfile) -> Dict[str, Any]:
"""
提取历史行为中的通用属性分布统计
Args:
user_profile: UserProfile对象
Returns:
Dict[str, Any]: 通用属性分布统计信息
"""
if not user_profile or not user_profile.behavior_map:
logger.warning("[extract_common_attribute_distribution] UserProfile or behavior_map is None")
return {}
behavior_map = user_profile.behavior_map
common_features = {}
# 获取所有行为数据
all_behaviors = []
for behavior_type, behaviors in [
('click', behavior_map.click),
('add_cart', behavior_map.add_cart),
('collect', behavior_map.collect),
('purchase', behavior_map.purchase)
]:
logger.info(f"[UserProfileExtractor.extract_common_attribute_distribution] Extracted behavior_type {behavior_type} with {len(behaviors)} behaviors")
for behavior in behaviors:
all_behaviors.append((behavior, self.behavior_stats_config.behavior_weights[behavior_type]))
# 1. 处理直接取值字段
for field_config in self.behavior_stats_config.direct_fields:
if not field_config.enable:
continue
counter = Counter()
total_weight_for_field = 0 # 该字段的总权重(包括空值)
for behavior, weight in all_behaviors:
total_weight_for_field += weight # 所有行为都计入总数
if hasattr(behavior, field_config.field_name):
value = getattr(behavior, field_config.field_name)
if value: # 确保值不为空
counter[str(value)] += weight # 转换为字符串
# 如果值为空,不加入counter,但已计入total_weight_for_field
# 计算空值权重
empty_weight = total_weight_for_field - sum(counter.values())
if empty_weight > 0:
counter['__empty__'] = empty_weight
# 保存统计结果
common_features[f'{field_config.feature_prefix}_weighted_counts'] = dict(counter)
common_features[f'{field_config.feature_prefix}_total_weight'] = total_weight_for_field
common_features[f'{field_config.feature_prefix}_top_items'] = [item for item, count in counter.most_common(10)]
# 2. 处理重复字段
for field_config in self.behavior_stats_config.repeated_fields:
if not field_config.enable:
continue
counter = Counter()
total_weight_for_field = 0 # 该字段的总权重(包括空值)
for behavior, weight in all_behaviors:
total_weight_for_field += weight # 所有行为都计入总数
if hasattr(behavior, field_config.field_name) and getattr(behavior, field_config.field_name):
values = getattr(behavior, field_config.field_name)
has_valid_value = False
for value in values:
if value:
counter[str(value)] += weight
has_valid_value = True
# 如果没有有效值,不加入counter,但已计入total_weight_for_field
# 如果字段不存在或为空,不加入counter,但已计入total_weight_for_field
# 计算空值权重
empty_weight = total_weight_for_field - sum(counter.values())
if empty_weight > 0:
counter['__empty__'] = empty_weight
common_features[f'{field_config.feature_prefix}_weighted_counts'] = dict(counter)
common_features[f'{field_config.feature_prefix}_total_weight'] = total_weight_for_field
common_features[f'{field_config.feature_prefix}_top_items'] = [item for item, count in counter.most_common(10)]
# 3. 处理数值字段分桶统计
for field_config in self.behavior_stats_config.numeric_fields:
if not field_config.enable:
continue
bucket_counter = Counter()
total_weight_for_field = 0 # 该字段的总权重(包括空值)
for behavior, weight in all_behaviors:
total_weight_for_field += weight # 所有行为都计入总数
if hasattr(behavior, field_config.field_name):
value = getattr(behavior, field_config.field_name)
if value and value > 0:
bucket = int(value / field_config.bucket_size)
bucket_counter[str(bucket)] += weight # 转换为字符串
# 如果值为空或<=0,不加入counter,但已计入total_weight_for_field
# 计算空值权重
empty_weight = total_weight_for_field - sum(bucket_counter.values())
if empty_weight > 0:
bucket_counter['__empty__'] = empty_weight
common_features[f'{field_config.feature_prefix}_bucket_weighted_counts'] = dict(bucket_counter)
common_features[f'{field_config.feature_prefix}_total_weight'] = total_weight_for_field
common_features[f'{field_config.feature_prefix}_top_buckets'] = [bucket for bucket, count in bucket_counter.most_common(10)]
# 4. 处理时间差统计
for field_config in self.behavior_stats_config.time_fields:
if not field_config.enable:
continue
time_bucket_counter = Counter()
total_weight_for_field = 0 # 该字段的总权重(包括空值)
for behavior, weight in all_behaviors:
total_weight_for_field += weight # 所有行为都计入总数
if hasattr(behavior, field_config.field_name) and hasattr(behavior, 'behaviorTime'):
time_value = getattr(behavior, field_config.field_name)
behavior_time = behavior.behaviorTime
if time_value and behavior_time:
try:
# 解析时间字符串
if isinstance(time_value, str):
time_obj = datetime.strptime(time_value, '%Y-%m-%d %H:%M:%S')
else:
time_obj = time_value
if isinstance(behavior_time, str):
behavior_time_obj = datetime.strptime(behavior_time, '%Y-%m-%d %H:%M:%S')
else:
behavior_time_obj = behavior_time
# 计算时间差(月数)
time_diff = behavior_time_obj - time_obj
months_diff = int(time_diff.days / 30)
# 分桶:0-6个月,6-12个月,12-24个月,24个月以上
if months_diff < 0:
bucket = 'future'
elif months_diff <= 6:
bucket = '0-6m'
elif months_diff <= 12:
bucket = '6-12m'
elif months_diff <= 24:
bucket = '12-24m'
else:
bucket = '24m+'
time_bucket_counter[bucket] += weight
except (ValueError, TypeError) as e:
logger.debug(f"Error parsing time for {field_config.field_name}: {e}")
continue
# 如果时间值为空或解析失败,不加入counter,但已计入total_weight_for_field
# 计算空值权重
empty_weight = total_weight_for_field - sum(time_bucket_counter.values())
if empty_weight > 0:
time_bucket_counter['__empty__'] = empty_weight
common_features[f'{field_config.feature_prefix}_time_bucket_weighted_counts'] = dict(time_bucket_counter)
common_features[f'{field_config.feature_prefix}_total_weight'] = total_weight_for_field
common_features[f'{field_config.feature_prefix}_top_time_buckets'] = [bucket for bucket, count in time_bucket_counter.most_common(5)]
# 5. 综合统计信息
total_weighted_behaviors = sum(weight for _, weight in all_behaviors)
common_features['total_weighted_behaviors'] = total_weighted_behaviors
# 各行为类型的统计
behavior_type_counts = Counter()
for behavior_type, behaviors in [
('click', behavior_map.click),
('add_cart', behavior_map.add_cart),
('collect', behavior_map.collect),
('purchase', behavior_map.purchase)
]:
behavior_type_counts[behavior_type] = len(behaviors)
common_features['behavior_type_counts'] = dict(behavior_type_counts)
logger.info(f"Extracted behavior stats with {len(common_features)} feature groups")
return common_features
def extract_item_specific_attributes(self, user_profile: UserProfile) -> Dict[str, Any]:
"""
从历史行为中提取每个商品的具体属性统计
Args:
user_profile: UserProfile对象
Returns:
Dict[str, Any]: 商品具体属性统计信息
"""
if not user_profile or not user_profile.behavior_map:
logger.warning("[extract_item_specific_attributes] UserProfile or behavior_map is None")
return {}
behavior_map = user_profile.behavior_map
# 获取所有行为数据
all_behaviors = []
for behavior_type, behaviors in [
('click', behavior_map.click),
('add_cart', behavior_map.add_cart),
('collect', behavior_map.collect),
('purchase', behavior_map.purchase)
]:
for behavior in behaviors:
all_behaviors.append((behavior, self.behavior_stats_config.behavior_weights[behavior_type]))
# 统计每个属性名称和属性值对应的权重
attr_statistics = {} # {attr_name: {option_name: weight}}
for behavior, weight in all_behaviors:
# 合并 spuAttributeList 和 skuAttributeList
merged_attributes = []
# 以 skuAttributeList 为基础
if hasattr(behavior, 'skuAttributeList') and behavior.skuAttributeList:
merged_attributes.extend(behavior.skuAttributeList)
# 加入 spuAttributeList,如果 attributeId 已存在则跳过
existing_attr_ids = set()
if hasattr(behavior, 'skuAttributeList') and behavior.skuAttributeList:
existing_attr_ids = {attr.attributeId for attr in behavior.skuAttributeList}
if hasattr(behavior, 'spuAttributeList') and behavior.spuAttributeList:
for attr in behavior.spuAttributeList:
if attr.attributeId not in existing_attr_ids:
merged_attributes.append(attr)
existing_attr_ids.add(attr.attributeId)
# 统计合并后的属性
for attr in merged_attributes:
attr_id = attr.attributeId
option_id = attr.optionId
# 获取属性名称
attr_name = self.dict_loader.get_name('spu_attribute', str(attr_id))
if not attr_name:
attr_name = self.dict_loader.get_name('sku_attribute', str(attr_id))
if not attr_name:
attr_name = f"属性{attr_id}"
# 获取属性值名称
option_name = self.dict_loader.get_name('spu_attribute_option', str(option_id))
if not option_name:
option_name = self.dict_loader.get_name('sku_attribute_option', str(option_id))
if not option_name:
option_name = f"选项{option_id}"
# 跳过无效的属性值
if option_name == '无' or not option_name:
continue
# 统计
if attr_name not in attr_statistics:
attr_statistics[attr_name] = {}
if option_name not in attr_statistics[attr_name]:
attr_statistics[attr_name][option_name] = 0
attr_statistics[attr_name][option_name] += weight
if not attr_statistics:
return {}
# 生成属性统计特征
attribute_features = {}
# 计算每个属性的总权重并排序
attr_with_total = [
(attr_name, options_dict, sum(options_dict.values()))
for attr_name, options_dict in attr_statistics.items()
]
# 按总权重排序,取前10个属性
sorted_attrs = sorted(attr_with_total, key=lambda x: x[2], reverse=True)
for attr_name, options_dict, total_weight in sorted_attrs:
# 按权重排序选项,取前5个
sorted_options = sorted(options_dict.items(), key=lambda x: x[1], reverse=True)
# 生成特征名称(使用属性名称的拼音或ID作为前缀)
attr_feature_prefix = f"attr_{attr_name.replace(' ', '_').replace(':', '_')}"
attribute_features[f'{attr_feature_prefix}_weighted_counts'] = dict(options_dict)
attribute_features[f'{attr_feature_prefix}_total_weight'] = total_weight
attribute_features[f'{attr_feature_prefix}_top_items'] = [item for item, count in sorted_options]
# 添加总体属性统计
total_attribute_weight = sum(attr[2] for attr in sorted_attrs)
attribute_features['attribute_total_weight'] = total_attribute_weight
attribute_features['attribute_attr_count'] = len(sorted_attrs)
logger.info(f"Extracted attribute statistics with {len(attribute_features)} attribute feature groups")
return attribute_features
def generate_common_attribute_distribution_description(self, common_attribute_distribution: Dict[str, Any]) -> str:
"""
生成通用属性分布统计的自然语言描述
Args:
common_attribute_distribution: 通用属性分布统计信息
Returns:
str: 自然语言描述
"""
if not common_attribute_distribution:
return "暂无通用属性分布统计信息"
description_parts = []
# 0. 行为总述(放在最前面)
if 'behavior_type_counts' in common_attribute_distribution:
behavior_counts = common_attribute_distribution['behavior_type_counts']
total_behaviors = sum(behavior_counts.values())
if total_behaviors > 0:
behavior_summary_parts = []
# 检查是否达到截断限制
if total_behaviors >= self.behavior_stats_config.behavior_summary_truncate_limit:
behavior_summary_parts.append(f"该用户有超过{self.behavior_stats_config.behavior_summary_truncate_limit}次行为")
else:
behavior_summary_parts.append(f"该用户有{total_behaviors}次行为")
# 添加具体行为类型统计
behavior_details = []
if behavior_counts.get('click', 0) > 0:
behavior_details.append(f"{behavior_counts['click']}次点击")
if behavior_counts.get('add_cart', 0) > 0:
behavior_details.append(f"{behavior_counts['add_cart']}次加购")
if behavior_counts.get('collect', 0) > 0:
behavior_details.append(f"{behavior_counts['collect']}次收藏")
if behavior_counts.get('purchase', 0) > 0:
behavior_details.append(f"{behavior_counts['purchase']}次购买")
if behavior_details:
behavior_summary_parts.append(f"包括{', '.join(behavior_details)}")
description_parts.append(''.join(behavior_summary_parts))
# 1. 处理直接取值字段描述
for field_config in self.behavior_stats_config.direct_fields:
if not field_config.enable:
continue
weighted_counts_key = f'{field_config.feature_prefix}_weighted_counts'
total_weight_key = f'{field_config.feature_prefix}_total_weight'
if weighted_counts_key in common_attribute_distribution and total_weight_key in common_attribute_distribution:
weighted_counts = common_attribute_distribution[weighted_counts_key]
total_weight = common_attribute_distribution[total_weight_key]
if total_weight > 0:
# 生成带占比的描述
items_with_percentage = []
for item, count in sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)[:field_config.max_items]:
percentage = (count / total_weight) * 100
# 词典映射
if item == '__empty__':
display_name = '空值'
elif field_config.dict_name:
display_name = self.dict_loader.get_name(field_config.dict_name, str(item)) or str(item)
else:
display_name = str(item)
items_with_percentage.append(f"{display_name}({percentage:.1f}%)")
if items_with_percentage:
description = field_config.description_template.format(
display_name=field_config.display_name,
values=', '.join(items_with_percentage)
)
description_parts.append(description)
# 2. 处理重复字段描述
for field_config in self.behavior_stats_config.repeated_fields:
if not field_config.enable:
continue
weighted_counts_key = f'{field_config.feature_prefix}_weighted_counts'
total_weight_key = f'{field_config.feature_prefix}_total_weight'
if weighted_counts_key in common_attribute_distribution and total_weight_key in common_attribute_distribution:
weighted_counts = common_attribute_distribution[weighted_counts_key]
total_weight = common_attribute_distribution[total_weight_key]
if total_weight > 0:
# 生成带占比的描述
items_with_percentage = []
for item, count in sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)[:field_config.max_items]:
percentage = (count / total_weight) * 100
# 词典映射
if item == '__empty__':
display_name = '空值'
elif field_config.dict_name:
display_name = self.dict_loader.get_name(field_config.dict_name, str(item)) or str(item)
else:
display_name = str(item)
items_with_percentage.append(f"{display_name}({percentage:.1f}%)")
if items_with_percentage:
description = field_config.description_template.format(
display_name=field_config.display_name,
values=', '.join(items_with_percentage)
)
description_parts.append(description)
# 3. 处理数值字段描述
for field_config in self.behavior_stats_config.numeric_fields:
if not field_config.enable:
continue
bucket_counts_key = f'{field_config.feature_prefix}_bucket_weighted_counts'
total_weight_key = f'{field_config.feature_prefix}_total_weight'
if bucket_counts_key in common_attribute_distribution and total_weight_key in common_attribute_distribution:
bucket_counts = common_attribute_distribution[bucket_counts_key]
total_weight = common_attribute_distribution[total_weight_key]
if total_weight > 0:
# 生成带占比的描述
ranges_with_percentage = []
for bucket, count in sorted(bucket_counts.items(), key=lambda x: x[1], reverse=True)[:field_config.max_items]:
percentage = (count / total_weight) * 100
if bucket == '__empty__':
range_desc = '空值'
else:
range_desc = f"{int(bucket)*field_config.bucket_size}-{(int(bucket)+1)*field_config.bucket_size}"
ranges_with_percentage.append(f"{range_desc}({percentage:.1f}%)")
if ranges_with_percentage:
description = field_config.description_template.format(
display_name=field_config.display_name,
values=', '.join(ranges_with_percentage)
)
description_parts.append(description)
# 4. 处理时间字段描述
for field_config in self.behavior_stats_config.time_fields:
if not field_config.enable:
continue
time_bucket_counts_key = f'{field_config.feature_prefix}_time_bucket_weighted_counts'
total_weight_key = f'{field_config.feature_prefix}_total_weight'
if time_bucket_counts_key in common_attribute_distribution and total_weight_key in common_attribute_distribution:
time_bucket_counts = common_attribute_distribution[time_bucket_counts_key]
total_weight = common_attribute_distribution[total_weight_key]
if total_weight > 0:
# 生成带占比的描述
time_descriptions_with_percentage = []
for bucket, count in sorted(time_bucket_counts.items(), key=lambda x: x[1], reverse=True)[:field_config.max_items]:
percentage = (count / total_weight) * 100
bucket_str = str(bucket)
if bucket_str == '__empty__':
time_desc = '空值'
elif bucket_str == '0-6m':
time_desc = '半年内'
elif bucket_str == '6-12m':
time_desc = '半年到一年'
elif bucket_str == '12-24m':
time_desc = '1-2年'
elif bucket_str == '24m+':
time_desc = '2年+'
elif bucket_str == 'future':
time_desc = '错误时间'
else:
time_desc = bucket_str
time_descriptions_with_percentage.append(f"{time_desc}({percentage:.1f}%)")
if time_descriptions_with_percentage:
description = field_config.description_template.format(
display_name=field_config.display_name,
values=', '.join(time_descriptions_with_percentage)
)
description_parts.append(description)
# 组合成完整描述
if description_parts:
return "\n".join(description_parts)
else:
return ""
def generate_item_specific_attribute_description(self, item_specific_attributes: Dict[str, Any]) -> str:
"""
生成商品具体属性统计的自然语言描述
Args:
item_specific_attributes: 商品具体属性统计信息
Returns:
str: 商品具体属性统计的自然语言描述
"""
if not item_specific_attributes:
return "暂无商品具体属性统计信息。"
descriptions = []
# 获取所有属性相关的特征
attr_features = {}
for key, value in item_specific_attributes.items():
if key.startswith('attr_') and key.endswith('_weighted_counts'):
attr_name = key.replace('_weighted_counts', '').replace('attr_', '')
attr_features[attr_name] = value
if not attr_features:
return "暂无有效属性统计信息。"
# 按总权重排序属性
sorted_attrs = []
for attr_name, weighted_counts in attr_features.items():
total_weight = sum(weighted_counts.values())
sorted_attrs.append((attr_name, weighted_counts, total_weight))
sorted_attrs.sort(key=lambda x: x[2], reverse=True)
# 生成描述
max_attrs = USER_PROFILE_BEHAVIOR_CONFIG['max_attributes_display']
max_options = USER_PROFILE_BEHAVIOR_CONFIG['max_options_per_attribute']
for attr_name, weighted_counts, total_weight in sorted_attrs[:max_attrs]: # 取前N个属性
# 按权重排序选项,取前N个
sorted_options = sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)[:max_options]
option_texts = []
for option_name, weight in sorted_options:
if option_name != '__empty__':
# 计算百分比
percentage = (weight / total_weight) * 100
option_texts.append(f"{option_name}({percentage:.1f}%)")
if option_texts:
desc = f"• {attr_name}: {', '.join(option_texts)}"
descriptions.append(desc)
if descriptions:
return "\n".join(descriptions)
return "暂无有效属性统计信息。"
def _extract_recent_search_keywords(self, user_profile: UserProfile) -> List[str]:
"""
提取最近10个搜索词(过滤掉isSearchFactory=true的)
Args:
user_profile: UserProfile对象
Returns:
List[str]: 最近10个搜索词列表
"""
if not user_profile or not user_profile.behavior_map:
return []
search_keywords = user_profile.behavior_map.search_keyword
if not search_keywords:
return []
# 过滤、去重并收集最近10个搜索词
seen_keywords = set()
recent_keywords = []
for search_behavior in search_keywords:
if not search_behavior.isSearchFactory and search_behavior.keyword:
keyword = search_behavior.keyword.strip()
# 过滤掉纯数字、下划线、减号、空白字符构成的关键词
if self._is_valid_search_keyword(keyword):
if keyword not in seen_keywords:
seen_keywords.add(keyword)
recent_keywords.append(keyword)
if len(recent_keywords) >= SESSION_CONFIG['max_recent_search_keywords']: # 达到最大数量就停止
break
logger.info(f"[UserProfileExtractor._extract_recent_search_keywords] Extracted {len(recent_keywords)} recent search keywords")
return recent_keywords
def _is_valid_search_keyword(self, keyword: str) -> bool:
"""
判断搜索关键词是否有效
Args:
keyword: 搜索关键词
Returns:
bool: 是否有效
"""
if not keyword or keyword.strip() == '':
return False
# 过滤掉纯数字、下划线、减号、空白字符构成的关键词
# 使用正则表达式匹配:只包含数字、下划线、减号、空白字符的字符串
if re.match(r'^[\d\s_-]+$', keyword):
return False
# 只有一个单词(split后只有一个)、并且这个单词里面既包含数字又包含字母 (转小写后 既有小写字母、又有数字)
if len(keyword.split()) == 1:
if re.match(r'^[a-z0-9]+$', keyword.lower()):
return False
# 包含数字和-
if re.match(r'^[0-9-]+$', keyword):
return False
return True