商品索引的 anchors 与语义属性

tangwang
1 parent d54b0467
Showing 1 changed file with 141 additions and 16 deletions Show diff stats
indexer/process_products.py
@@ -61,13 +61,141 @@ LANG_LABELS: Dict[str, str] = {
  
 SUPPORTED_LANGS = set(LANG_LABELS.keys())
  
+SYSTEM_MESSAGES: Dict[str, str] = {
+    "zh": (
+        "你是一名电商平台的商品标注员，你的工作是对输入的每个商品进行理解、分析和标注，"
+        "并按要求格式返回 Markdown 表格。所有输出内容必须为中文。"
+    ),
+    "en": (
+        "You are a product annotator for an e-commerce platform. "
+        "For each input product, you must understand, analyze and label it, "
+        "and return a Markdown table strictly following the requested format. "
+        "All output must be in English."
+    ),
+    "de": (
+        "Du bist ein Produktannotator für eine E‑Commerce‑Plattform. "
+        "Du sollst jedes Eingabeprodukt verstehen, analysieren und beschriften "
+        "und eine Markdown-Tabelle im geforderten Format zurückgeben. "
+        "Alle Ausgaben müssen auf Deutsch sein."
+    ),
+    "ru": (
+        "Вы — разметчик товаров для платформы электронной коммерции. "
+        "Ваша задача — понимать, анализировать и размечать каждый товар "
+        "и возвращать таблицу Markdown в требуемом формате. "
+        "Весь вывод должен быть на русском языке."
+    ),
+    "fr": (
+        "Vous êtes annotateur de produits pour une plateforme e‑commerce. "
+        "Pour chaque produit en entrée, vous devez le comprendre, l’analyser et l’annoter, "
+        "puis renvoyer un tableau Markdown au format demandé. "
+        "Toute la sortie doit être en français."
+    ),
+}
+
  
 def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> str:
-    """创建LLM提示词（根据目标语言输出）"""
-    lang_label = LANG_LABELS.get(target_lang, "对应语言")
-    prompt = f"""请对输入的每条商品标题，分析并提取以下信息，所有输出内容请使用{lang_label}：
+    """根据目标语言创建 LLM 提示词和表头说明。"""
+    if target_lang == "en":
+        prompt = """Please analyze each input product title and extract the following information:
+
+1. Product title: a natural English product name derived from the input title
+2. Category path: from broad to fine-grained category, separated by ">" (e.g. Clothing>Women>Dresses>Work Dress)
+3. Fine-grained tags: style / features / attributes (e.g. floral, waist-cinching, French style)
+4. Target audience: gender / age group, etc. (e.g. young women)
+5. Usage scene
+6. Applicable season
+7. Key attributes
+8. Material description
+9. Functional features
+10. Selling point: one concise key selling sentence for recommendation
+11. Anchor text: a set of words or phrases that could be used by users as search queries for this product, covering category, fine-grained tags, functional attributes, usage scenes, etc.
+
+Input product list:
+
+"""
+        prompt_tail = """
+Please strictly return a Markdown table in the following format. For any column that can contain multiple values, separate values with commas. Do not add any other explanations:
+
+| No. | Product title | Category path | Fine-grained tags | Target audience | Usage scene | Season | Key attributes | Material | Features | Selling point | Anchor text |
+|----|----|----|----|----|----|----|----|----|----|----|----|
+"""
+    elif target_lang == "de":
+        prompt = """Bitte analysiere jeden eingegebenen Produkttitel und extrahiere die folgenden Informationen:
+
+1. Produkttitel: ein natürlicher deutscher Produkttitel basierend auf dem Eingangstitel
+2. Kategoriepfad: von Oberkategorie bis Feinkategorie, getrennt durch ">" (z. B. Kleidung>Damen>Kleider>Businesskleid)
+3. Feinkörnige Tags: Stil / Merkmale / Eigenschaften (z. B. Blumenmuster, tailliert, französischer Stil)
+4. Zielgruppe: Geschlecht / Altersgruppe usw. (z. B. junge Frauen)
+5. Einsatzszenario
+6. Geeignete Saison
+7. Wichtige Attribute
+8. Materialbeschreibung
+9. Funktionale Merkmale
+10. Verkaufsargument: ein prägnanter, einzeiliger Haupt-Selling-Point für Empfehlungen
+11. Ankertexte: eine Menge von Wörtern oder Phrasen, die Nutzer als Suchanfragen für dieses Produkt verwenden könnten und die Kategorie, feine Tags, Funktion und Nutzungsszenarien abdecken.
+
+Eingabeliste der Produkte:
+
+"""
+        prompt_tail = """
+Gib bitte strikt eine Markdown-Tabelle im folgenden Format zurück. Mehrere Werte in einer Spalte werden durch Kommas getrennt. Füge keine weiteren Erklärungen hinzu:
+
+| Nr. | Produkttitel | Kategoriepfad | Feintags | Zielgruppe | Einsatzszenario | Saison | Wichtige Attribute | Material | Merkmale | Verkaufsargument | Ankertexte |
+|----|----|----|----|----|----|----|----|----|----|----|----|
+"""
+    elif target_lang == "ru":
+        prompt = """Пожалуйста, проанализируйте каждый входной заголовок товара и извлеките следующую информацию:
+
+1. Заголовок товара: естественное русскоязычное название товара на основе исходного заголовка
+2. Путь категории: от широкой до узкой категории, разделённый символом ">" (например: Одежда>Женская одежда>Платья>Деловое платье)
+3. Детализированные теги: стиль / особенности / характеристики (например: цветочный принт, приталенный, французский стиль)
+4. Целевая аудитория: пол / возрастная группа и т. п. (например: молодые женщины)
+5. Сценарий использования
+6. Подходящий сезон
+7. Ключевые характеристики
+8. Описание материала
+9. Функциональные особенности
+10. Торговое преимущество: одно краткое ключевое предложение для рекомендаций
+11. Якорные запросы: набор слов или фраз, которые пользователи могут использовать в качестве поисковых запросов для этого товара, покрывающих категорию, детализированные теги, функциональные характеристики, сценарии использования и т. д.
+
+Список входных товаров:
  
-1. 商品标题：将输入商品名称翻译为{lang_label}
+"""
+        prompt_tail = """
+Пожалуйста, строго верните Markdown‑таблицу в следующем формате. Для колонок с несколькими значениями разделяйте значения запятыми. Не добавляйте никаких дополнительных пояснений:
+
+| № | Заголовок товара | Путь категории | Детализированные теги | Целевая аудитория | Сценарий использования | Сезон | Ключевые характеристики | Материал | Особенности | Торговое преимущество | Якорные запросы |
+|----|----|----|----|----|----|----|----|----|----|----|----|
+"""
+    elif target_lang == "fr":
+        prompt = """Veuillez analyser chaque titre de produit en entrée et extraire les informations suivantes :
+
+1. Titre du produit : un titre de produit naturel en français basé sur le titre d’origine
+2. Chemin de catégorie : de la catégorie la plus large à la plus fine, séparées par ">" (par ex. Vêtements>Femme>Robes>Robe de travail)
+3. Tags détaillés : style / caractéristiques / attributs (par ex. fleuri, cintré, style français)
+4. Public cible : sexe / tranche d’âge, etc. (par ex. jeunes femmes)
+5. Scénario d’utilisation
+6. Saison adaptée
+7. Attributs clés
+8. Description du matériau
+9. Caractéristiques fonctionnelles
+10. Argument de vente : une phrase concise résumant le principal atout pour la recommandation
+11. Texte d’ancrage : un ensemble de mots ou d’expressions que les utilisateurs pourraient saisir comme requêtes de recherche pour ce produit, couvrant la catégorie, les tags détaillés, les fonctions, les scénarios d’usage, etc.
+
+Liste des produits en entrée :
+
+"""
+        prompt_tail = """
+Veuillez strictement renvoyer un tableau Markdown au format suivant. Pour toute colonne pouvant contenir plusieurs valeurs, séparez‑les par des virgules. N’ajoutez aucune autre explication :
+
+| N° | Titre du produit | Chemin de catégorie | Tags détaillés | Public cible | Scénario d’utilisation | Saison | Attributs clés | Matériau | Caractéristiques | Argument de vente | Texte d’ancrage |
+|----|----|----|----|----|----|----|----|----|----|----|----|
+"""
+    else:
+        # 默认中文版本
+        prompt = """请对输入的每条商品标题，分析并提取以下信息：
+
+1. 商品标题：将输入商品名称翻译为自然、完整的中文商品标题
 2. 品类路径：从大类到细分品类，用">"分隔（例如：服装>女装>裤子>工装裤）
 3. 细分标签：商品的风格、特点、功能等（例如：碎花，收腰，法式）
 4. 适用人群：性别/年龄段等（例如：年轻女性）
@@ -82,8 +210,7 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st
 输入商品列表：
  
 """
-
-    prompt_tail = """
+        prompt_tail = """
 请严格按照以下markdown表格格式返回，每列内部的多值内容都用逗号分隔，不要添加任何其他说明：
  
 | 序号 | 商品标题 | 品类路径 | 细分标签 | 适用人群 | 使用场景 | 适用季节 | 关键属性 | 材质说明 | 功能特点 | 商品卖点 | 锚文本 |
@@ -97,8 +224,8 @@ def create_prompt(products: List[Dict[str, str]], target_lang: str = "zh") -> st
     return prompt
  
  
-def call_llm(prompt: str) -> Tuple[str, str]:
-    """调用大模型API（带重试机制）"""
+def call_llm(prompt: str, target_lang: str = "zh") -> Tuple[str, str]:
+    """调用大模型API（带重试机制），按目标语言选择系统提示词。"""
     headers = {
         "Authorization": f"Bearer {API_KEY}",
         "Content-Type": "application/json"
@@ -109,7 +236,7 @@ def call_llm(prompt: str) -> Tuple[str, str]:
         "messages": [
             {
                 "role": "system",
-                "content": "你是一名电商平台的商品标注员，你的工作是对输入的每个商品进行理解、分析和标注，按要求格式返回Markdown表格。"
+                "content": SYSTEM_MESSAGES.get(target_lang, SYSTEM_MESSAGES["zh"])
             },
             {
                 "role": "user",
@@ -196,18 +323,16 @@ def parse_markdown_table(markdown_content: str) -> List[Dict[str, str]]:
         if not line:
             continue
  
-        # 跳过表头
+        # 表格行处理
         if line.startswith('|'):
-            # 跳过分隔行
+            # 分隔行（----）
             if set(line.replace('|', '').strip()) <= {'-', ':'}:
                 data_started = True
                 continue
  
-            # 跳过表头行
+            # 首个表头行：无论语言如何，统一跳过
             if not data_started:
-                if '序号' in line or '商品中文标题' in line:
-                    continue
-                data_started = True
+                # 等待下一行数据行
                 continue
  
             # 解析数据行
@@ -248,7 +373,7 @@ def process_batch(
  
     # 调用LLM
     try:
-        raw_response, full_response_json = call_llm(prompt)
+        raw_response, full_response_json = call_llm(prompt, target_lang=target_lang)
  
         # 解析结果
         parsed_results = parse_markdown_table(raw_response)