mapping.py
3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Mapping generator for suggestion indices.
"""
from typing import Dict, Any, List
# Language code -> analyzer name placed on the per-language suggestion fields.
# Codes that are missing from this table are handled by the callers, which
# fall back to the "standard" analyzer.
ANALYZER_BY_LANG: Dict[str, str] = {
    # Chinese is the only entry that points at a custom analyzer; "index_ik"
    # is declared in the index settings built by build_suggestion_mapping().
    "zh": "index_ik",
    # Every other value is passed through verbatim as the analyzer name
    # (presumably the search engine's built-in language analyzers -- verify
    # against the cluster version in use).
    "en": "english",
    "ar": "arabic",
    "hy": "armenian",
    "eu": "basque",
    "pt_br": "brazilian",
    "bg": "bulgarian",
    "ca": "catalan",
    "cjk": "cjk",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "gl": "galician",
    "de": "german",
    "el": "greek",
    "hi": "hindi",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "no": "norwegian",
    "fa": "persian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "th": "thai",
}
def _completion_field(lang: str) -> Dict[str, Any]:
    """Return the ``completion`` field mapping for one language code.

    The analyzer name is looked up in ``ANALYZER_BY_LANG``; codes that are
    not listed there get ``"standard"``. For ``"zh"`` the mapping also
    carries ``search_analyzer: "query_ik"``, so a different analyzer is
    named for query time than for index time.
    """
    field: Dict[str, Any] = {
        "type": "completion",
        "analyzer": ANALYZER_BY_LANG.get(lang, "standard"),
    }
    if lang == "zh":
        # Only Chinese gets a dedicated search-time analyzer.
        field["search_analyzer"] = "query_ik"
    return field
def _sat_field(lang: str) -> Dict[str, Any]:
    """Return the ``search_as_you_type`` field mapping for one language code.

    The analyzer name comes from ``ANALYZER_BY_LANG``, with ``"standard"``
    used for any code that is not listed there. No separate search analyzer
    is set, for any language.
    """
    return {
        "type": "search_as_you_type",
        "analyzer": ANALYZER_BY_LANG.get(lang, "standard"),
    }
def build_suggestion_mapping(
    index_languages: List[str],
    *,
    number_of_shards: int = 1,
    number_of_replicas: int = 0,
    refresh_interval: str = "30s",
) -> Dict[str, Any]:
    """Build index settings+mappings for suggestion index.

    Args:
        index_languages: Language codes to create per-language suggestion
            sub-fields for. Falsy entries are dropped. If nothing remains
            (``None``, empty list, only falsy entries) the languages default
            to ``["en", "zh"]``. A bare string is treated as a single
            language code rather than being iterated character by character.
        number_of_shards: Value written to ``settings.number_of_shards``.
        number_of_replicas: Value written to ``settings.number_of_replicas``.
        refresh_interval: Value written to ``settings.refresh_interval``.

    Returns:
        A dict with two top-level keys: ``"settings"`` (shard/replica/refresh
        options plus the ``index_ik`` / ``query_ik`` analyzer definitions)
        and ``"mappings"`` (fixed document properties plus the
        ``completion.<lang>`` and ``sat.<lang>`` object fields, one sub-field
        per requested language).
    """
    # A lone string would otherwise be split into single characters and
    # produce one bogus field per letter.
    if isinstance(index_languages, str):
        index_languages = [index_languages]

    langs = [code for code in (index_languages or []) if code]
    if not langs:
        langs = ["en", "zh"]

    # One sub-field per language under each of the two suggestion objects.
    # Repeated codes collapse naturally because they share a dict key.
    completion_props: Dict[str, Any] = {
        lang: _completion_field(lang) for lang in langs
    }
    sat_props: Dict[str, Any] = {lang: _sat_field(lang) for lang in langs}

    # Custom analyzers referenced by the "zh" field mappings. They are
    # declared unconditionally, whichever languages were requested.
    # NOTE(review): the ik_* tokenizers presumably come from an IK analysis
    # plugin that must be installed on the cluster -- confirm.
    analysis = {
        "analyzer": {
            "index_ik": {
                "type": "custom",
                "tokenizer": "ik_max_word",
                "filter": ["lowercase", "asciifolding"],
            },
            "query_ik": {
                "type": "custom",
                "tokenizer": "ik_smart",
                "filter": ["lowercase", "asciifolding"],
            },
        }
    }

    settings = {
        "number_of_shards": number_of_shards,
        "number_of_replicas": number_of_replicas,
        "refresh_interval": refresh_interval,
        "analysis": analysis,
    }

    properties = {
        "tenant_id": {"type": "keyword"},
        "lang": {"type": "keyword"},
        "text": {"type": "keyword"},
        "text_norm": {"type": "keyword"},
        "sources": {"type": "keyword"},
        "title_doc_count": {"type": "integer"},
        "qanchor_doc_count": {"type": "integer"},
        "tag_doc_count": {"type": "integer"},
        "query_count_7d": {"type": "integer"},
        "query_count_30d": {"type": "integer"},
        "rank_score": {"type": "float"},
        "lang_confidence": {"type": "float"},
        "lang_source": {"type": "keyword"},
        "lang_conflict": {"type": "boolean"},
        "status": {"type": "byte"},
        "updated_at": {"type": "date"},
        "completion": {"properties": completion_props},
        "sat": {"properties": sat_props},
    }

    return {
        "settings": settings,
        "mappings": {"properties": properties},
    }