# test_keywords_query.py (3.73 KB)
import hanlp
from typing import List, Tuple, Dict, Any

class KeywordExtractor:
    """
    Noun keyword extractor built on HanLP.

    A query is tokenized (with character offsets), POS-tagged, and every
    token whose tag starts with ``noun_tag_prefix`` and whose length is at
    least ``min_length`` is kept, except for words in ``ignore_keywords``.

    The three class attributes below are the defaults; they can be
    overridden per instance through ``__init__`` or by plain assignment.
    """

    # Words that are never returned as keywords, even when tagged as nouns.
    ignore_keywords = frozenset({'玩具'})
    # Minimum number of characters a token needs to count as a keyword.
    min_length = 2
    # A token is treated as a noun when its POS tag starts with this prefix.
    noun_tag_prefix = 'N'

    def __init__(self, *, ignore_keywords=None, min_length=None):
        """
        Load the HanLP tokenizer and POS tagger.

        Args:
            ignore_keywords: optional iterable of words to drop from the
                result; when omitted the class default is used.
            min_length: optional minimum keyword length in characters;
                when omitted the class default is used.
        """
        # Tokenizer; span output is enabled so that every token is returned
        # together with its start/end offsets in the input text.
        self.tok = hanlp.load(hanlp.pretrained.tok.CTB9_TOK_ELECTRA_BASE_CRF)
        self.tok.config.output_spans = True

        # Part-of-speech tagger, applied to the token list.
        self.pos_tag = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)

        if ignore_keywords is not None:
            self.ignore_keywords = frozenset(ignore_keywords)
        if min_length is not None:
            self.min_length = min_length

    def extract_keywords(self, query: str) -> str:
        """
        Extract the noun keywords of a query.

        Args:
            query: input text; surrounding whitespace is stripped first.

        Returns:
            The kept keywords concatenated in their original order. Two
            keywords that were not adjacent in the query are separated by a
            single space. An empty string is returned for an empty or
            whitespace-only query, or when no token qualifies.
        """
        query = query.strip()
        if not query:
            # Nothing to analyse; avoid feeding empty input to the models.
            return ""

        # Tokens with positions: [[word, start, end], ...]
        spans = self.tok(query)
        if not spans:
            return ""

        words = [span[0] for span in spans]
        tags = self.pos_tag(words)

        parts = []
        last_end_pos = None
        for (word, start_pos, end_pos), tag in zip(spans, tags):
            if len(word) < self.min_length:
                continue
            if not tag.startswith(self.noun_tag_prefix):
                continue
            if word in self.ignore_keywords:
                # Skipped words do not advance last_end_pos, so the next
                # keyword is correctly seen as non-adjacent.
                continue
            # Insert a separator when this keyword does not start exactly
            # where the previously kept keyword ended.
            if parts and start_pos != last_end_pos:
                parts.append(" ")
            parts.append(word)
            last_end_pos = end_pos

        return "".join(parts).strip()


# Manual smoke test: run the extractor over a fixed set of sample queries
# and print each query next to the keywords extracted from it.
if __name__ == "__main__":
    sample_queries = (
        # Chinese (9 representative queries kept)
        "2.4G遥控大蛇",
        "充气的篮球",
        "遥控 塑料 飞船 汽车 ",
        "亚克力相框",
        "8寸 搪胶蘑菇钉",
        "7寸娃娃",
        "太空沙套装",
        "脚蹬工程车",
        "捏捏乐钥匙扣",

        # English (newly added)
        "plastic toy car",
        "remote control helicopter",
        "inflatable beach ball",
        "music keychain",
        "sand play set",
        # Common product searches
        "plastic dinosaur toy",
        "wireless bluetooth speaker",
        "4K action camera",
        "stainless steel water bottle",
        "baby stroller with cup holder",

        # Questions / natural language
        "what is the best smartphone under 500 dollars",
        "how to clean a laptop screen",
        "where can I buy organic coffee beans",

        # With digits and special characters
        "USB-C to HDMI adapter 4K",
        "LED strip lights 16.4ft",
        "Nintendo Switch OLED model",
        "iPhone 15 Pro Max case",

        # Short phrases
        "gaming mouse",
        "mechanical keyboard",
        "wireless earbuds",

        # Long-tail queries
        "rechargeable AA batteries with charger",
        "foldable picnic blanket waterproof",

        # Product attribute combinations
        "women's running shoes size 8",
        "men's cotton t-shirt crew neck",

        # Other languages (kept as-is for multilingual testing)
        "свет USB с пультом дистанционного управления красочные",  # Russian
    )

    keyword_extractor = KeywordExtractor()

    for text in sample_queries:
        print(f"{text:30} => {keyword_extractor.extract_keywords(text)}")