test_llm_enrichment_batch_fill.py
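"""Tests that SPUDocumentTransformer.fill_llm_attributes_batch delegates batching
to analyze_products rather than splitting the product list itself."""
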
from __future__ import annotations

from typing import Any, Dict, List

import pandas as pd

from indexer.document_transformer import SPUDocumentTransformer


def test_fill_llm_attributes_batch_calls_analyze_in_batches(monkeypatch):
    seen_calls: List[Dict[str, Any]] = []

    def _fake_analyze_products(products, target_lang="zh", batch_size=None, tenant_id=None):
        # should always request batch_size=20 and pass full list; internal splitter handles >20
        seen_calls.append(
            {
                "n": len(products),
                "target_lang": target_lang,
                "batch_size": batch_size,
                "tenant_id": tenant_id,
            }
        )
        return [
            {
                "id": p["id"],
                "lang": target_lang,
                "title_input": p["title"],
                "tags": "t1,t2",
                "anchor_text": f"{target_lang}-anchor-{p['id']}",
            }
            for p in products
        ]

    import indexer.document_transformer as doc_tr

    monkeypatch.setattr(doc_tr, "analyze_products", _fake_analyze_products)
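
    # Minimal transformer config: translation and embeddings are disabled so only the
    # LLM enrichment path runs; two index languages mean one analyze_products call each.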
    transformer = SPUDocumentTransformer(
        category_id_to_name={},
        searchable_option_dimensions=[],
        tenant_config={"index_languages": ["zh", "en"], "primary_language": "zh"},
        translator=None,
        encoder=None,
        enable_title_embedding=False,
        image_encoder=None,
        enable_image_embedding=False,
    )
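
    # 45 products exceed the expected batch size of 20, so any premature splitting
    # inside fill_llm_attributes_batch would surface as extra analyze_products calls.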
    docs: List[Dict[str, Any]] = []
    rows: List[pd.Series] = []
    for i in range(45):
        docs.append({"tenant_id": "162", "spu_id": str(i)})
        rows.append(pd.Series({"id": i, "title": f"title-{i}"}))

    transformer.fill_llm_attributes_batch(docs, rows)

    # called once per language, with full list; analyze_products handles splitting
    assert seen_calls == [
        {"n": 45, "target_lang": "zh", "batch_size": 20, "tenant_id": "162"},
        {"n": 45, "target_lang": "en", "batch_size": 20, "tenant_id": "162"},
    ]
    assert docs[0]["qanchors"]["zh"] == "zh-anchor-0"
    assert docs[0]["qanchors"]["en"] == "en-anchor-0"