# test_tokenization.py
from query.tokenization import QueryTextAnalysisCache
def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
    """Coarse tokens for Han text should mirror the model tokenizer's output.

    Regression guard: the coarse tokenization must follow the tokens the
    injected tokenizer produced, rather than falling back to emitting the
    whole sentence as a single coarse token.
    """

    def fake_tokenizer(text):
        # Fixed segmentation with (token, start, end) character offsets.
        # The gap between offsets 6 and 10 is intentional: those characters
        # are dropped by the tokenizer and must not reappear downstream.
        return [("路上", 0, 2), ("穿着", 2, 4), ("女性", 4, 6), ("黑色", 10, 12)]

    sentence = "路上穿着女性的衣服是黑色的"
    analysis_cache = QueryTextAnalysisCache(tokenizer=fake_tokenizer)
    analysis_cache.set_language_hint(sentence, "zh")

    tokenized = analysis_cache.get_tokenized_text(sentence)

    expected = ("路上", "穿着", "女性", "黑色")
    assert tokenized.fine_tokens == expected
    assert tokenized.coarse_tokens == expected