Blame view

tests/test_llm_enrichment_batch_fill.py 2.08 KB
be3f0d46   tangwang   /indexer/enrich-c...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  from __future__ import annotations
  
  from typing import Any, Dict, List
  
  import pandas as pd
  
  from indexer.document_transformer import SPUDocumentTransformer
  
  
  def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch):
      """Check how ``fill_llm_attributes_batch`` drives ``analyze_products``.

      ``analyze_products`` in ``indexer.document_transformer`` is replaced by a
      recording stub. With 45 documents and index languages ``zh`` and ``en``,
      the test expects exactly one stub call per language, each receiving all
      45 products with ``batch_size=20`` and the documents' tenant id, and it
      expects the first document's ``qanchors`` entry for each language to
      equal the ``anchor_text`` the stub produced for product ``0``.
      """
      import indexer.document_transformer as transformer_module

      languages = ("zh", "en")
      recorded_calls: List[Dict[str, Any]] = []

      def _recording_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):
          # Record the arguments of every invocation so the test can verify
          # that the whole list is passed in a single call per language; any
          # splitting into chunks of 20 is left to the real analyze_products.
          recorded_calls.append(
              dict(
                  n=len(products),
                  target_lang=target_lang,
                  batch_size=batch_size,
                  tenant_id=tenant_id,
              )
          )
          analysed: List[Dict[str, Any]] = []
          for product in products:
              analysed.append(
                  {
                      "id": product["id"],
                      "lang": target_lang,
                      "title_input": product["title"],
                      "tags": "t1,t2",
                      "anchor_text": f"{target_lang}-anchor-{product['id']}",
                  }
              )
          return analysed

      monkeypatch.setattr(transformer_module, "analyze_products", _recording_analyze_products)

      # Translation and embedding features are all switched off for this test.
      transformer_kwargs: Dict[str, Any] = {
          "category_id_to_name": {},
          "searchable_option_dimensions": [],
          "tenant_config": {"index_languages": ["zh", "en"], "primary_language": "zh"},
          "translator": None,
          "encoder": None,
          "enable_title_embedding": False,
          "image_encoder": None,
          "enable_image_embedding": False,
      }
      transformer = SPUDocumentTransformer(**transformer_kwargs)

      # 45 items: more than two batches of 20, so per-batch calls would show up.
      docs: List[Dict[str, Any]] = [
          {"tenant_id": "162", "spu_id": str(index)} for index in range(45)
      ]
      rows: List[pd.Series] = [
          pd.Series({"id": index, "title": f"title-{index}"}) for index in range(45)
      ]

      transformer.fill_llm_attributes_batch(docs, rows)

      # One call per language, in order, each carrying the full list.
      expected_calls = [
          {"n": 45, "target_lang": language, "batch_size": 20, "tenant_id": "162"}
          for language in languages
      ]
      assert recorded_calls == expected_calls

      first_doc_anchors = {"zh": "zh-anchor-0", "en": "en-anchor-0"}
      for language, anchor in first_doc_anchors.items():
          assert docs[0]["qanchors"][language] == anchor