from query.tokenization import QueryTextAnalysisCache


def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """Coarse tokens for a Han-script query should mirror the model
    tokenizer's output, not fall back to the whole untokenized sentence.
    """
    sentence = "路上穿着女性的衣服是黑色的"
    # Stubbed tokenizer: returns (token, start, end) triples regardless of input.
    analysis_cache = QueryTextAnalysisCache(
        tokenizer=lambda _text: [
            ("路上", 0, 2),
            ("穿着", 2, 4),
            ("女性", 4, 6),
            ("黑色", 10, 12),
        ]
    )
    analysis_cache.set_language_hint(sentence, "zh")

    analysis = analysis_cache.get_tokenized_text(sentence)

    expected = ("路上", "穿着", "女性", "黑色")
    assert analysis.fine_tokens == expected
    assert analysis.coarse_tokens == expected