Blame view

tests/test_tokenization.py 599 Bytes
45b39796   tangwang   qp性能优化
1
2
3
4
5
6
7
8
9
10
11
12
13
  from query.tokenization import QueryTextAnalysisCache
  
  
  def test_han_coarse_tokens_follow_model_tokens_instead_of_whole_sentence():
      """Coarse tokens for a Han-script query should mirror the model
      tokenizer's output, not collapse to the whole sentence as one token.

      The stub tokenizer deliberately skips part of the sentence (offsets
      6-10 are uncovered) so a whole-sentence fallback would be detectable.
      """
      query = "路上穿着女性的衣服是黑色的"
      analysis_cache = QueryTextAnalysisCache(
          tokenizer=lambda _text: [
              ("路上", 0, 2),
              ("穿着", 2, 4),
              ("女性", 4, 6),
              ("黑色", 10, 12),
          ]
      )
      analysis_cache.set_language_hint(query, "zh")

      result = analysis_cache.get_tokenized_text(query)

      expected = ("路上", "穿着", "女性", "黑色")
      assert result.fine_tokens == expected
      assert result.coarse_tokens == expected