# test_tokenization.py (599 bytes)
from query.tokenization import QueryTextAnalysisCache


def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """Coarse tokens for a Han (zh) query mirror the model tokenizer's output.

    The stub tokenizer returns four (token, start, end) spans — including a
    gap between offsets 6 and 10 — so the whole sentence is never emitted as
    a single coarse token.
    """
    sentence = "路上穿着女性的衣服是黑色的"
    model_spans = [("路上", 0, 2), ("穿着", 2, 4), ("女性", 4, 6), ("黑色", 10, 12)]
    expected = ("路上", "穿着", "女性", "黑色")

    analysis_cache = QueryTextAnalysisCache(tokenizer=lambda text: model_spans)
    analysis_cache.set_language_hint(sentence, "zh")

    result = analysis_cache.get_tokenized_text(sentence)

    assert result.fine_tokens == expected
    assert result.coarse_tokens == expected