diff --git a/CLAUDE.md b/CLAUDE.md index 810cbb1..70cf005 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,12 +23,24 @@ This is a **production-ready Multi-Tenant E-Commerce Search SaaS** platform spec ## Development Environment -**Required Environment Setup:** Use project root `activate.sh` (activates conda env `searchengine` and loads `.env`). On a new machine, set `CONDA_ROOT` if conda is not at default path. +**Required Environment Setup:** Default to the project venv via root `activate.sh` (activates `./.venv` and loads `.env`). Do not default to system `python3`/`pip3` for repo work. ```bash -# Optional on new machine: if conda is ~/anaconda3/bin/conda → export CONDA_ROOT=$HOME/anaconda3 source activate.sh ``` -See `docs/QUICKSTART.md` §1.4–1.8 for first-time env creation and production credentials (venv: `./scripts/create_venv.sh`; conda: `conda env create -f environment.yml` or `pip install -r requirements.txt`). +See `docs/QUICKSTART.md` §1.4–1.8 for first-time env creation and production credentials (`./scripts/create_venv.sh` for the main venv). + +**Environment Resolution Rules:** +- Main app, backend, indexer, frontend, generic scripts, and most tests: `./.venv` +- Translator service runtime and local translation model tooling: `./.venv-translator` +- Embedding service runtime: `./.venv-embedding` +- Reranker service runtime: `./.venv-reranker` +- CN-CLIP service runtime: `./.venv-cnclip` +- Never assume the system interpreter reflects project dependencies; prefer `source activate.sh` or invoke the exact venv binary directly. + +**Operational Rule For Commands:** +- For repo-wide `pytest`, ad hoc Python scripts, and lightweight development commands, use the matching venv interpreter first. +- For isolated services, prefer the service scripts (`./scripts/start_translator.sh`, `./scripts/start_embedding_service.sh`, `./scripts/start_reranker.sh`, `./scripts/start_cnclip_service.sh`) because they already select the correct environment. +- If a dependency appears “missing”, check whether the command was run under the wrong venv before proposing installs. **Database Configuration:** ```yaml @@ -49,12 +61,14 @@ password: P89cZHS5d7dFyc9R ### Environment Setup ```bash -# Activate environment (canonical: use activate.sh) +# Activate main project environment (canonical) source activate.sh # First-time / new machine: create env and install deps -./setup.sh # or: conda env create -f environment.yml -# If pip-only: pip install -r requirements.txt +./setup.sh +# or: +./scripts/create_venv.sh +source activate.sh ``` ### Data Management @@ -83,12 +97,12 @@ python main.py serve --host 0.0.0.0 --port 6002 --reload ### Testing ```bash # Run all tests -python -m pytest tests/ +pytest tests/ # Run specific test types -python -m pytest tests/unit/ # Unit tests -python -m pytest tests/integration/ # Integration tests -python -m pytest -m "api" # API tests only +pytest tests/unit/ # Unit tests +pytest tests/integration/ # Integration tests +pytest -m "api" # API tests only # Test search from command line python main.py search "query" --tenant-id 1 --size 10 @@ -602,4 +616,3 @@ python main.py search "query" --tenant-id 1 # Quick search test 8. **Multi-tenant Architecture**: Single index with `tenant_id` isolation 9. **Hybrid Search**: BM25 + vector similarity with configurable weighting 10. **Production Ready**: Health checks, monitoring, graceful degradation - diff --git a/api/translator_app.py b/api/translator_app.py index a5152c1..b4e4f87 100644 --- a/api/translator_app.py +++ b/api/translator_app.py @@ -5,18 +5,24 @@ import logging import os import pathlib import time +import uuid from contextlib import asynccontextmanager from functools import lru_cache from logging.handlers import TimedRotatingFileHandler from typing import List, Optional, Union import uvicorn -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from pydantic import BaseModel, ConfigDict, Field from config.services_config import get_translation_config +from translation.logging_utils import ( + TranslationRequestFilter, + bind_translation_request_id, + reset_translation_request_id, +) from translation.service import TranslationService from translation.settings import ( get_enabled_translation_models, @@ -33,7 +39,8 @@ def configure_translator_logging() -> None: log_level = os.getenv("LOG_LEVEL", "INFO").upper() numeric_level = getattr(logging, log_level, logging.INFO) - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + formatter = logging.Formatter("%(asctime)s | reqid:%(reqid)s | %(name)s | %(levelname)s | %(message)s") + request_filter = TranslationRequestFilter() root_logger = logging.getLogger() root_logger.setLevel(numeric_level) @@ -42,6 +49,7 @@ def configure_translator_logging() -> None: console_handler = logging.StreamHandler() console_handler.setLevel(numeric_level) console_handler.setFormatter(formatter) + console_handler.addFilter(request_filter) root_logger.addHandler(console_handler) file_handler = TimedRotatingFileHandler( @@ -53,6 +61,7 @@ def configure_translator_logging() -> None: ) file_handler.setLevel(numeric_level) file_handler.setFormatter(formatter) + file_handler.addFilter(request_filter) root_logger.addHandler(file_handler) verbose_logger = logging.getLogger("translator.verbose") @@ -69,6 +78,7 @@ def configure_translator_logging() -> None: ) verbose_handler.setLevel(numeric_level) verbose_handler.setFormatter(formatter) + verbose_handler.addFilter(request_filter) verbose_logger.addHandler(verbose_handler) @@ -178,6 +188,13 @@ def _result_preview(translated: Union[str, List[Optional[str]], None]) -> str: return _text_preview(str(translated)) +def _resolve_request_id(http_request: Request) -> str: + header_value = http_request.headers.get("X-Request-ID") + if header_value and header_value.strip(): + return header_value.strip()[:32] + return str(uuid.uuid4())[:8] + + def _translate_batch( service: TranslationService, raw_text: List[str], @@ -189,15 +206,11 @@ def _translate_batch( ) -> List[Optional[str]]: backend = service.get_backend(model) logger.info( - "Translation batch dispatch | model=%s scene=%s target_lang=%s source_lang=%s count=%s lengths=%s first_preview=%s supports_batch=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation batch dispatch | execution=%s count=%s lengths=%s first_preview=%s", + "backend-batch" if getattr(backend, "supports_batch", False) else "per-item", len(raw_text), [len(str(item or "")) for item in raw_text], _text_preview(raw_text[0] if raw_text else ""), - bool(getattr(backend, "supports_batch", False)), ) if getattr(backend, "supports_batch", False): try: @@ -330,12 +343,13 @@ async def health_check(): @app.post("/translate", response_model=TranslationResponse) -async def translate(request: TranslationRequest): +async def translate(request: TranslationRequest, http_request: Request): _ensure_valid_text(request.text) if not request.target_lang: raise HTTPException(status_code=400, detail="target_lang is required") + _, request_token = bind_translation_request_id(_resolve_request_id(http_request)) request_started = time.perf_counter() try: service = get_translation_service() @@ -447,12 +461,14 @@ async def translate(request: TranslationRequest): raise except ValueError as e: latency_ms = (time.perf_counter() - request_started) * 1000 - logger.warning("Translation validation error | error=%s latency_ms=%.2f", e, latency_ms, exc_info=True) + logger.warning("Translation validation error | error=%s latency_ms=%.2f", e, latency_ms) raise HTTPException(status_code=400, detail=str(e)) from e except Exception as e: latency_ms = (time.perf_counter() - request_started) * 1000 logger.error("Translation error | error=%s latency_ms=%.2f", e, latency_ms, exc_info=True) raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}") + finally: + reset_translation_request_id(request_token) @app.get("/") diff --git a/tests/test_translation_local_backends.py b/tests/test_translation_local_backends.py index c9c6c58..d48edca 100644 --- a/tests/test_translation_local_backends.py +++ b/tests/test_translation_local_backends.py @@ -1,6 +1,10 @@ +import logging + +import pytest import torch from translation.backends.local_seq2seq import MarianMTTranslationBackend, NLLBTranslationBackend +from translation.backends.local_ctranslate2 import NLLBCTranslate2TranslationBackend from translation.service import TranslationService from translation.text_splitter import compute_safe_input_token_limit, split_text_for_translation @@ -44,11 +48,59 @@ class _FakeModel: return [[42]] +class _FakeCT2Tokenizer: + def __init__(self, src_lang=None): + self.src_lang = src_lang + self.pad_token = "" + self.eos_token = "" + self.last_call = None + + def __call__(self, texts, **kwargs): + self.last_call = {"texts": list(texts), **kwargs} + return {"input_ids": [[1, 2, 3] for _ in texts]} + + def convert_ids_to_tokens(self, ids): + del ids + return ["tok_a", "tok_b", "tok_c"] + + def convert_tokens_to_ids(self, tokens): + if isinstance(tokens, list): + return [1 for _ in tokens] + return 1 + + def decode(self, token_ids, skip_special_tokens=True): + del token_ids, skip_special_tokens + return "translated" + + +class _FakeCT2Result: + def __init__(self, tokens): + self.hypotheses = [tokens] + + +class _FakeCT2Translator: + def __init__(self): + self.last_translate_batch_kwargs = None + + def translate_batch(self, source_tokens, **kwargs): + self.last_translate_batch_kwargs = {"source_tokens": source_tokens, **kwargs} + target_prefix = kwargs.get("target_prefix") or [] + return [ + _FakeCT2Result((target_prefix[idx] or []) + ["translated_token"]) + for idx, _ in enumerate(source_tokens) + ] + + def _stub_load_model(self): self.tokenizer = _FakeTokenizer() self.seq2seq_model = _FakeModel() +def _stub_load_ct2_runtime(self): + self.tokenizer = _FakeCT2Tokenizer() + self.translator = _FakeCT2Translator() + + def test_marian_language_validation(monkeypatch): monkeypatch.setattr(MarianMTTranslationBackend, "_load_model", _stub_load_model) backend = MarianMTTranslationBackend( @@ -68,12 +120,8 @@ def test_marian_language_validation(monkeypatch): result = backend.translate("测试", source_lang="zh", target_lang="en") assert result == "translated" - try: + with pytest.raises(ValueError, match="source languages"): backend.translate("test", source_lang="en", target_lang="zh") - except ValueError as exc: - assert "source languages" in str(exc) - else: - raise AssertionError("Expected unsupported source language to raise") def test_nllb_uses_src_lang_and_forced_bos(monkeypatch): @@ -97,6 +145,61 @@ def test_nllb_uses_src_lang_and_forced_bos(monkeypatch): assert backend.seq2seq_model.last_generate_kwargs["forced_bos_token_id"] == 202 +def test_nllb_accepts_finnish_short_code(monkeypatch): + monkeypatch.setattr(NLLBTranslationBackend, "_load_model", _stub_load_model) + backend = NLLBTranslationBackend( + name="nllb-200-distilled-600m", + model_id="facebook/nllb-200-distilled-600M", + model_dir="./models/translation/facebook/nllb-200-distilled-600M", + device="cpu", + torch_dtype="float32", + batch_size=1, + max_input_length=16, + max_new_tokens=16, + num_beams=1, + ) + + result = backend.translate("test", source_lang="fi", target_lang="zh") + + assert result == "translated" + assert backend.tokenizer.src_lang == "fin_Latn" + assert backend.seq2seq_model.last_generate_kwargs["forced_bos_token_id"] == 202 + + +def test_nllb_ctranslate2_accepts_finnish_short_code(monkeypatch): + created_tokenizers = [] + + def _fake_from_pretrained(source, src_lang=None, **kwargs): + del source, kwargs + tokenizer = _FakeCT2Tokenizer(src_lang=src_lang) + created_tokenizers.append(tokenizer) + return tokenizer + + monkeypatch.setattr(NLLBCTranslate2TranslationBackend, "_load_runtime", _stub_load_ct2_runtime) + monkeypatch.setattr( + "translation.backends.local_ctranslate2.AutoTokenizer.from_pretrained", + _fake_from_pretrained, + ) + backend = NLLBCTranslate2TranslationBackend( + name="nllb-200-distilled-600m", + model_id="facebook/nllb-200-distilled-600M", + model_dir="./models/translation/facebook/nllb-200-distilled-600M", + device="cpu", + torch_dtype="float32", + batch_size=1, + max_input_length=16, + max_new_tokens=16, + num_beams=1, + ) + + result = backend.translate("test", source_lang="fi", target_lang="zh") + + assert result == "translated" + assert len(created_tokenizers) == 1 + assert created_tokenizers[0].src_lang == "fin_Latn" + assert backend.translator.last_translate_batch_kwargs["target_prefix"] == [["zho_Hans"]] + + def test_translation_service_preloads_enabled_backends(monkeypatch): created = [] @@ -245,7 +348,71 @@ def test_local_backend_splits_oversized_text_before_translation(): result = backend.translate(text, source_lang="en", target_lang="zh") assert result is not None - assert len(backend.translated_batches) == 1 - assert len(backend.translated_batches[0]) >= 2 - assert all(len(piece) <= 16 for piece in backend.translated_batches[0]) - assert result == "".join(f"<{piece.strip()}>" for piece in backend.translated_batches[0]) + all_segments = [piece for batch in backend.translated_batches for piece in batch] + assert len(all_segments) >= 2 + assert all(len(batch) <= backend.batch_size for batch in backend.translated_batches) + assert all(len(piece) <= 16 for piece in all_segments) + assert result == "".join(f"<{piece.strip()}>" for piece in all_segments) + + +def test_local_backend_batches_after_segmentation(): + backend = _SegmentingMarianBackend( + name="opus-mt-en-zh", + model_id="Helsinki-NLP/opus-mt-en-zh", + model_dir="./models/translation/Helsinki-NLP/opus-mt-en-zh", + device="cpu", + torch_dtype="float32", + batch_size=4, + max_input_length=24, + max_new_tokens=24, + num_beams=1, + source_langs=["en"], + target_langs=["zh"], + ) + + texts = [ + "alpha beta gamma delta, epsilon zeta eta theta, iota kappa lambda mu.", + "nu xi omicron pi, rho sigma tau upsilon, phi chi psi omega.", + "dress shirt coat pants, socks shoes belt scarf, hat gloves bag watch.", + ] + + result = backend.translate(texts, source_lang="en", target_lang="zh") + + assert isinstance(result, list) + assert len(result) == 3 + assert len(backend.translated_batches) >= 2 + assert all(len(batch) <= backend.batch_size for batch in backend.translated_batches) + assert sum(len(batch) for batch in backend.translated_batches) > backend.batch_size + assert all(item is not None for item in result) + + +def test_local_backend_logs_segmentation_and_inference_batches(caplog): + backend = _SegmentingMarianBackend( + name="opus-mt-en-zh", + model_id="Helsinki-NLP/opus-mt-en-zh", + model_dir="./models/translation/Helsinki-NLP/opus-mt-en-zh", + device="cpu", + torch_dtype="float32", + batch_size=2, + max_input_length=24, + max_new_tokens=24, + num_beams=1, + source_langs=["en"], + target_langs=["zh"], + ) + + texts = [ + "one two three four, five six seven eight, nine ten eleven twelve.", + "thirteen fourteen fifteen sixteen, seventeen eighteen nineteen twenty.", + ] + + with caplog.at_level(logging.INFO): + backend.translate(texts, source_lang="en", target_lang="zh") + + messages = [record.getMessage() for record in caplog.records] + + assert any(message.startswith("Translation segmentation summary |") for message in messages) + inference_logs = [ + message for message in messages if message.startswith("Translation inference batch |") + ] + assert len(inference_logs) >= 2 diff --git a/tests/test_translator_failure_semantics.py b/tests/test_translator_failure_semantics.py index ce9ab0f..22643be 100644 --- a/tests/test_translator_failure_semantics.py +++ b/tests/test_translator_failure_semantics.py @@ -1,4 +1,11 @@ +import logging + from translation.cache import TranslationCache +from translation.logging_utils import ( + TranslationRequestFilter, + bind_translation_request_id, + reset_translation_request_id, +) from translation.service import TranslationService @@ -107,3 +114,80 @@ def test_service_caches_all_capabilities(monkeypatch): ("opus-mt-zh-en", "en", "连衣裙", "opus-mt-zh-en:连衣裙"), ("opus-mt-zh-en", "en", "衬衫", "opus-mt-zh-en:衬衫"), ] + + +def test_translation_request_filter_injects_reqid(): + reqid, token = bind_translation_request_id("req-test-1234567890") + try: + record = logging.LogRecord( + name="translation.service", + level=logging.INFO, + pathname=__file__, + lineno=1, + msg="hello", + args=(), + exc_info=None, + ) + TranslationRequestFilter().filter(record) + + assert reqid == "req-test-1234567890" + assert record.reqid == "req-test-1234567890" + finally: + reset_translation_request_id(token) + + +def test_translation_route_log_focuses_on_routing_decision(monkeypatch, caplog): + monkeypatch.setattr(TranslationCache, "_init_redis_client", staticmethod(lambda: None)) + + def _fake_create_backend(self, *, name, backend_type, cfg): + del self, backend_type, cfg + + class _Backend: + model = name + + @property + def supports_batch(self): + return True + + def translate(self, text, target_lang, source_lang=None, scene=None): + del target_lang, source_lang, scene + return text + + return _Backend() + + monkeypatch.setattr(TranslationService, "_create_backend", _fake_create_backend) + service = TranslationService( + { + "service_url": "http://127.0.0.1:6006", + "timeout_sec": 10.0, + "default_model": "llm", + "default_scene": "general", + "capabilities": { + "llm": { + "enabled": True, + "backend": "llm", + "model": "dummy-llm", + "base_url": "https://example.com", + "timeout_sec": 10.0, + "use_cache": True, + } + }, + "cache": { + "ttl_seconds": 60, + "sliding_expiration": True, + }, + } + ) + + with caplog.at_level(logging.INFO): + service.translate("商品标题", target_lang="en", source_lang="zh", model="llm") + + route_messages = [ + record.getMessage() + for record in caplog.records + if record.name == "translation.service" and record.getMessage().startswith("Translation route |") + ] + + assert route_messages == [ + "Translation route | backend=llm request_type=single use_cache=True cache_available=False" + ] diff --git a/translation/backends/local_ctranslate2.py b/translation/backends/local_ctranslate2.py index 210bdbf..7d1f5d0 100644 --- a/translation/backends/local_ctranslate2.py +++ b/translation/backends/local_ctranslate2.py @@ -13,7 +13,12 @@ from typing import Dict, List, Optional, Sequence, Union from transformers import AutoTokenizer -from translation.languages import MARIAN_LANGUAGE_DIRECTIONS, NLLB_LANGUAGE_CODES +from translation.languages import ( + MARIAN_LANGUAGE_DIRECTIONS, + build_nllb_language_catalog, + normalize_language_key, + resolve_nllb_language_code, +) from translation.text_splitter import ( compute_safe_input_token_limit, join_translated_segments, @@ -23,6 +28,17 @@ from translation.text_splitter import ( logger = logging.getLogger(__name__) +def _text_preview(text: Optional[str], limit: int = 32) -> str: + return str(text or "").replace("\n", "\\n")[:limit] + + +def _summarize_lengths(values: Sequence[int]) -> str: + if not values: + return "[]" + total = sum(values) + return f"min={min(values)} max={max(values)} avg={total / len(values):.1f}" + + def _resolve_device(device: Optional[str]) -> str: value = str(device or "auto").strip().lower() if value not in {"auto", "cpu", "cuda"}: @@ -285,6 +301,17 @@ class LocalCTranslate2TranslationBackend: source_tokens = self._encode_source_tokens(texts, source_lang, target_lang) target_prefix = self._target_prefixes(len(source_tokens), source_lang, target_lang) max_decoding_length = self._resolve_max_decoding_length(source_tokens) + logger.info( + "Translation model batch detail | model=%s segment_count=%s token_lengths=%s max_decoding_length=%s batch_type=%s beam_size=%s target_lang=%s source_lang=%s", + self.model, + len(source_tokens), + _summarize_lengths([len(tokens) for tokens in source_tokens]), + max_decoding_length, + self.ct2_batch_type, + self.num_beams, + target_lang, + source_lang or "auto", + ) results = self.translator.translate_batch( source_tokens, target_prefix=target_prefix, @@ -336,6 +363,59 @@ class LocalCTranslate2TranslationBackend: ), ) + def _log_segmentation_summary( + self, + *, + texts: Sequence[str], + segment_plans: Sequence[Sequence[str]], + target_lang: str, + source_lang: Optional[str], + ) -> None: + non_empty_count = sum(1 for text in texts if text.strip()) + segment_counts = [len(segments) for segments in segment_plans if segments] + total_segments = sum(segment_counts) + segmented_inputs = sum(1 for count in segment_counts if count > 1) + logger.info( + "Translation segmentation summary | model=%s inputs=%s non_empty_inputs=%s segmented_inputs=%s total_segments=%s batch_size=%s target_lang=%s source_lang=%s segments_per_input=%s", + self.model, + len(texts), + non_empty_count, + segmented_inputs, + total_segments, + self.batch_size, + target_lang, + source_lang or "auto", + _summarize_lengths(segment_counts), + ) + + def _translate_segment_batches( + self, + segments: List[str], + target_lang: str, + source_lang: Optional[str] = None, + ) -> List[Optional[str]]: + if not segments: + return [] + outputs: List[Optional[str]] = [] + total_batches = (len(segments) + self.batch_size - 1) // self.batch_size + for batch_index, start in enumerate(range(0, len(segments), self.batch_size), start=1): + batch = segments[start:start + self.batch_size] + logger.info( + "Translation inference batch | model=%s batch_index=%s total_batches=%s segment_count=%s char_lengths=%s first_preview=%s target_lang=%s source_lang=%s", + self.model, + batch_index, + total_batches, + len(batch), + _summarize_lengths([len(segment) for segment in batch]), + _text_preview(batch[0] if batch else ""), + target_lang, + source_lang or "auto", + ) + outputs.extend( + self._translate_batch(batch, target_lang=target_lang, source_lang=source_lang) + ) + return outputs + def _translate_with_segmentation( self, texts: List[str], @@ -352,8 +432,15 @@ class LocalCTranslate2TranslationBackend: segment_plans.append(segments) flat_segments.extend(segments) + self._log_segmentation_summary( + texts=texts, + segment_plans=segment_plans, + target_lang=target_lang, + source_lang=source_lang, + ) + translated_segments = ( - self._translate_batch(flat_segments, target_lang=target_lang, source_lang=source_lang) + self._translate_segment_batches(flat_segments, target_lang=target_lang, source_lang=source_lang) if flat_segments else [] ) @@ -387,13 +474,10 @@ class LocalCTranslate2TranslationBackend: del scene is_single = isinstance(text, str) texts = self._normalize_texts(text) - outputs: List[Optional[str]] = [] - for start in range(0, len(texts), self.batch_size): - chunk = texts[start:start + self.batch_size] - if not any(item.strip() for item in chunk): - outputs.extend([None if not item.strip() else item for item in chunk]) # type: ignore[list-item] - continue - outputs.extend(self._translate_with_segmentation(chunk, target_lang=target_lang, source_lang=source_lang)) + if not any(item.strip() for item in texts): + outputs = [None if not item.strip() else item for item in texts] # type: ignore[list-item] + return outputs[0] if is_single else outputs + outputs = self._translate_with_segmentation(texts, target_lang=target_lang, source_lang=source_lang) return outputs[0] if is_single else outputs @@ -492,11 +576,7 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): ct2_decoding_length_extra: int = 0, ct2_decoding_length_min: int = 1, ) -> None: - overrides = language_codes or {} - self.language_codes = { - **NLLB_LANGUAGE_CODES, - **{str(k).strip().lower(): str(v).strip() for k, v in overrides.items() if str(k).strip()}, - } + self.language_codes = build_nllb_language_catalog(language_codes) self._tokenizers_by_source: Dict[str, object] = {} super().__init__( name=name, @@ -522,17 +602,17 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): ) def _validate_languages(self, source_lang: Optional[str], target_lang: str) -> None: - src = str(source_lang or "").strip().lower() - tgt = str(target_lang or "").strip().lower() - if not src: + if not str(source_lang or "").strip(): raise ValueError(f"Model '{self.model}' requires source_lang") - if src not in self.language_codes: + if resolve_nllb_language_code(source_lang, self.language_codes) is None: raise ValueError(f"Unsupported NLLB source language: {source_lang}") - if tgt not in self.language_codes: + if resolve_nllb_language_code(target_lang, self.language_codes) is None: raise ValueError(f"Unsupported NLLB target language: {target_lang}") def _get_tokenizer_for_source(self, source_lang: str): - src_code = self.language_codes[source_lang] + src_code = resolve_nllb_language_code(source_lang, self.language_codes) + if src_code is None: + raise ValueError(f"Unsupported NLLB source language: {source_lang}") with self._tokenizer_lock: tokenizer = self._tokenizers_by_source.get(src_code) if tokenizer is None: @@ -549,7 +629,7 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): target_lang: str, ) -> List[List[str]]: del target_lang - source_key = str(source_lang or "").strip().lower() + source_key = normalize_language_key(source_lang) tokenizer = self._get_tokenizer_for_source(source_key) encoded = tokenizer( texts, @@ -567,7 +647,9 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): target_lang: str, ) -> Optional[List[Optional[List[str]]]]: del source_lang - tgt_code = self.language_codes[str(target_lang).strip().lower()] + tgt_code = resolve_nllb_language_code(target_lang, self.language_codes) + if tgt_code is None: + raise ValueError(f"Unsupported NLLB target language: {target_lang}") return [[tgt_code] for _ in range(count)] def _postprocess_hypothesis( @@ -577,7 +659,9 @@ class NLLBCTranslate2TranslationBackend(LocalCTranslate2TranslationBackend): target_lang: str, ) -> List[str]: del source_lang - tgt_code = self.language_codes[str(target_lang).strip().lower()] + tgt_code = resolve_nllb_language_code(target_lang, self.language_codes) + if tgt_code is None: + raise ValueError(f"Unsupported NLLB target language: {target_lang}") if tokens and tokens[0] == tgt_code: return tokens[1:] return tokens diff --git a/translation/backends/local_seq2seq.py b/translation/backends/local_seq2seq.py index d15acbe..6a517c1 100644 --- a/translation/backends/local_seq2seq.py +++ b/translation/backends/local_seq2seq.py @@ -10,7 +10,11 @@ from typing import Dict, List, Optional, Sequence, Union import torch from transformers import AutoModelForSeq2SeqLM, AutoTokenizer -from translation.languages import MARIAN_LANGUAGE_DIRECTIONS, NLLB_LANGUAGE_CODES +from translation.languages import ( + MARIAN_LANGUAGE_DIRECTIONS, + build_nllb_language_catalog, + resolve_nllb_language_code, +) from translation.text_splitter import ( compute_safe_input_token_limit, join_translated_segments, @@ -20,6 +24,17 @@ from translation.text_splitter import ( logger = logging.getLogger(__name__) +def _text_preview(text: Optional[str], limit: int = 32) -> str: + return str(text or "").replace("\n", "\\n")[:limit] + + +def _summarize_lengths(values: Sequence[int]) -> str: + if not values: + return "[]" + total = sum(values) + return f"min={min(values)} max={max(values)} avg={total / len(values):.1f}" + + def _resolve_device(device: Optional[str]) -> str: value = str(device or "auto").strip().lower() if value == "auto": @@ -198,6 +213,59 @@ class LocalSeq2SeqTranslationBackend: ), ) + def _log_segmentation_summary( + self, + *, + texts: Sequence[str], + segment_plans: Sequence[Sequence[str]], + target_lang: str, + source_lang: Optional[str], + ) -> None: + non_empty_count = sum(1 for text in texts if text.strip()) + segment_counts = [len(segments) for segments in segment_plans if segments] + total_segments = sum(segment_counts) + segmented_inputs = sum(1 for count in segment_counts if count > 1) + logger.info( + "Translation segmentation summary | model=%s inputs=%s non_empty_inputs=%s segmented_inputs=%s total_segments=%s batch_size=%s target_lang=%s source_lang=%s segments_per_input=%s", + self.model, + len(texts), + non_empty_count, + segmented_inputs, + total_segments, + self.batch_size, + target_lang, + source_lang or "auto", + _summarize_lengths(segment_counts), + ) + + def _translate_segment_batches( + self, + segments: List[str], + target_lang: str, + source_lang: Optional[str] = None, + ) -> List[Optional[str]]: + if not segments: + return [] + outputs: List[Optional[str]] = [] + total_batches = (len(segments) + self.batch_size - 1) // self.batch_size + for batch_index, start in enumerate(range(0, len(segments), self.batch_size), start=1): + batch = segments[start:start + self.batch_size] + logger.info( + "Translation inference batch | model=%s batch_index=%s total_batches=%s segment_count=%s char_lengths=%s first_preview=%s target_lang=%s source_lang=%s", + self.model, + batch_index, + total_batches, + len(batch), + _summarize_lengths([len(segment) for segment in batch]), + _text_preview(batch[0] if batch else ""), + target_lang, + source_lang or "auto", + ) + outputs.extend( + self._translate_batch(batch, target_lang=target_lang, source_lang=source_lang) + ) + return outputs + def _translate_with_segmentation( self, texts: List[str], @@ -214,8 +282,15 @@ class LocalSeq2SeqTranslationBackend: segment_plans.append(segments) flat_segments.extend(segments) + self._log_segmentation_summary( + texts=texts, + segment_plans=segment_plans, + target_lang=target_lang, + source_lang=source_lang, + ) + translated_segments = ( - self._translate_batch(flat_segments, target_lang=target_lang, source_lang=source_lang) + self._translate_segment_batches(flat_segments, target_lang=target_lang, source_lang=source_lang) if flat_segments else [] ) @@ -249,13 +324,10 @@ class LocalSeq2SeqTranslationBackend: del scene is_single = isinstance(text, str) texts = self._normalize_texts(text) - outputs: List[Optional[str]] = [] - for start in range(0, len(texts), self.batch_size): - chunk = texts[start:start + self.batch_size] - if not any(item.strip() for item in chunk): - outputs.extend([None if not item.strip() else item for item in chunk]) # type: ignore[list-item] - continue - outputs.extend(self._translate_with_segmentation(chunk, target_lang=target_lang, source_lang=source_lang)) + if not any(item.strip() for item in texts): + outputs = [None if not item.strip() else item for item in texts] # type: ignore[list-item] + return outputs[0] if is_single else outputs + outputs = self._translate_with_segmentation(texts, target_lang=target_lang, source_lang=source_lang) return outputs[0] if is_single else outputs @@ -324,11 +396,7 @@ class NLLBTranslationBackend(LocalSeq2SeqTranslationBackend): language_codes: Optional[Dict[str, str]] = None, attn_implementation: Optional[str] = None, ) -> None: - overrides = language_codes or {} - self.language_codes = { - **NLLB_LANGUAGE_CODES, - **{str(k).strip().lower(): str(v).strip() for k, v in overrides.items() if str(k).strip()}, - } + self.language_codes = build_nllb_language_catalog(language_codes) super().__init__( name=name, model_id=model_id, @@ -343,24 +411,26 @@ class NLLBTranslationBackend(LocalSeq2SeqTranslationBackend): ) def _validate_languages(self, source_lang: Optional[str], target_lang: str) -> None: - src = str(source_lang or "").strip().lower() - tgt = str(target_lang or "").strip().lower() - if not src: + if not str(source_lang or "").strip(): raise ValueError(f"Model '{self.model}' requires source_lang") - if src not in self.language_codes: + if resolve_nllb_language_code(source_lang, self.language_codes) is None: raise ValueError(f"Unsupported NLLB source language: {source_lang}") - if tgt not in self.language_codes: + if resolve_nllb_language_code(target_lang, self.language_codes) is None: raise ValueError(f"Unsupported NLLB target language: {target_lang}") def _prepare_tokenizer(self, source_lang: Optional[str], target_lang: str) -> Dict[str, object]: del target_lang - src_code = self.language_codes[str(source_lang).strip().lower()] + src_code = resolve_nllb_language_code(source_lang, self.language_codes) + if src_code is None: + raise ValueError(f"Unsupported NLLB source language: {source_lang}") self.tokenizer.src_lang = src_code return {} def _build_generate_kwargs(self, source_lang: Optional[str], target_lang: str) -> Dict[str, object]: del source_lang - tgt_code = self.language_codes[str(target_lang).strip().lower()] + tgt_code = resolve_nllb_language_code(target_lang, self.language_codes) + if tgt_code is None: + raise ValueError(f"Unsupported NLLB target language: {target_lang}") forced_bos_token_id = None if hasattr(self.tokenizer, "lang_code_to_id"): forced_bos_token_id = self.tokenizer.lang_code_to_id.get(tgt_code) diff --git a/translation/cache.py b/translation/cache.py index 3f45907..72aec6e 100644 --- a/translation/cache.py +++ b/translation/cache.py @@ -40,10 +40,8 @@ class TranslationCache: try: value = self.redis_client.get(key) logger.info( - "Translation cache %s | model=%s target_lang=%s text_len=%s key=%s", + "Translation cache %s | text_len=%s key=%s", "hit" if value is not None else "miss", - model, - target_lang, len(str(source_text or "")), key, ) @@ -61,9 +59,7 @@ class TranslationCache: try: self.redis_client.setex(key, self.ttl_seconds, translated_text) logger.info( - "Translation cache write | model=%s target_lang=%s text_len=%s result_len=%s ttl_seconds=%s key=%s", - model, - target_lang, + "Translation cache write | text_len=%s result_len=%s ttl_seconds=%s key=%s", len(str(source_text or "")), len(str(translated_text or "")), self.ttl_seconds, diff --git a/translation/languages.py b/translation/languages.py index 79ee64e..815aff0 100644 --- a/translation/languages.py +++ b/translation/languages.py @@ -2,12 +2,13 @@ from __future__ import annotations -from typing import Dict, Tuple +from typing import Dict, Mapping, Optional, Tuple LANGUAGE_LABELS: Dict[str, str] = { "zh": "Chinese", "en": "English", + "fi": "Finnish", "ru": "Russian", "ar": "Arabic", "ja": "Japanese", @@ -49,6 +50,7 @@ DEEPL_LANGUAGE_CODES: Dict[str, str] = { NLLB_LANGUAGE_CODES: Dict[str, str] = { "en": "eng_Latn", + "fi": "fin_Latn", "zh": "zho_Hans", "ru": "rus_Cyrl", "ar": "arb_Arab", @@ -65,3 +67,56 @@ MARIAN_LANGUAGE_DIRECTIONS: Dict[str, Tuple[str, str]] = { "opus-mt-zh-en": ("zh", "en"), "opus-mt-en-zh": ("en", "zh"), } + + +NLLB_LANGUAGE_ALIASES: Dict[str, str] = { + "fi_fi": "fi", + "fin": "fi", + "fin_fin": "fi", + "zh_cn": "zh", + "zh_hans": "zh", +} + + +def normalize_language_key(language: Optional[str]) -> str: + return str(language or "").strip().lower().replace("-", "_") + + +def build_nllb_language_catalog( + overrides: Optional[Mapping[str, str]] = None, +) -> Dict[str, str]: + catalog = { + normalize_language_key(key): str(value).strip() + for key, value in NLLB_LANGUAGE_CODES.items() + if str(key).strip() + } + for key, value in (overrides or {}).items(): + normalized_key = normalize_language_key(key) + if normalized_key: + catalog[normalized_key] = str(value).strip() + return catalog + + +def resolve_nllb_language_code( + language: Optional[str], + language_codes: Optional[Mapping[str, str]] = None, +) -> Optional[str]: + normalized = normalize_language_key(language) + if not normalized: + return None + + catalog = build_nllb_language_catalog(language_codes) + direct = catalog.get(normalized) + if direct is not None: + return direct + + alias = NLLB_LANGUAGE_ALIASES.get(normalized) + if alias is not None: + aliased = catalog.get(normalize_language_key(alias)) + if aliased is not None: + return aliased + + for code in catalog.values(): + if normalize_language_key(code) == normalized: + return code + return None diff --git a/translation/logging_utils.py b/translation/logging_utils.py new file mode 100644 index 0000000..e4b6a26 --- /dev/null +++ b/translation/logging_utils.py @@ -0,0 +1,37 @@ +"""Shared translation logging context helpers.""" + +from __future__ import annotations + +import contextvars +import logging +import uuid +from typing import Optional + + +_translation_request_id_var: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar( + "translation_request_id", + default=None, +) + + +def current_translation_request_id() -> str: + return _translation_request_id_var.get() or "-1" + + +def bind_translation_request_id(request_id: Optional[str] = None) -> tuple[str, contextvars.Token]: + raw_value = str(request_id or "").strip() + normalized = raw_value[:32] if raw_value else str(uuid.uuid4())[:8] + return normalized, _translation_request_id_var.set(normalized) + + +def reset_translation_request_id(token: contextvars.Token) -> None: + _translation_request_id_var.reset(token) + + +class TranslationRequestFilter(logging.Filter): + """Inject a request id into translator logs when one is available.""" + + def filter(self, record: logging.LogRecord) -> bool: + if not hasattr(record, "reqid"): + record.reqid = current_translation_request_id() + return True diff --git a/translation/service.py b/translation/service.py index 5e5c7df..c0011bf 100644 --- a/translation/service.py +++ b/translation/service.py @@ -198,15 +198,10 @@ class TranslationService: active_scene = normalize_translation_scene(self.config, scene) capability_cfg = self._enabled_capabilities[normalized_model] use_cache = bool(capability_cfg.get("use_cache")) - text_count = 1 if isinstance(text, str) else len(list(text)) logger.info( - "Translation route | model=%s backend=%s scene=%s target_lang=%s source_lang=%s count=%s use_cache=%s cache_available=%s", - normalized_model, + "Translation route | backend=%s request_type=%s use_cache=%s cache_available=%s", getattr(backend, "model", normalized_model), - active_scene, - target_lang, - source_lang or "auto", - text_count, + "single" if isinstance(text, str) else "batch", use_cache, self._translation_cache.available, ) @@ -252,11 +247,7 @@ class TranslationService: cached = self._translation_cache.get(model=model, target_lang=target_lang, source_text=text) if cached is not None: logger.info( - "Translation cache served | model=%s scene=%s target_lang=%s source_lang=%s text_len=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation cache served | request_type=single text_len=%s", len(text), ) return cached @@ -274,21 +265,13 @@ class TranslationService: translated_text=translated, ) logger.info( - "Translation backend result cached | model=%s scene=%s target_lang=%s source_lang=%s text_len=%s result_len=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation backend result cached | request_type=single text_len=%s result_len=%s", len(text), len(str(translated)), ) else: logger.warning( - "Translation backend returned empty result | model=%s scene=%s target_lang=%s source_lang=%s text_len=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation backend returned empty result | request_type=single text_len=%s", len(text), ) return translated @@ -327,11 +310,7 @@ class TranslationService: miss_indices.append(idx) logger.info( - "Translation batch cache summary | model=%s scene=%s target_lang=%s source_lang=%s total=%s cache_hits=%s cache_misses=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation batch cache summary | total=%s cache_hits=%s cache_misses=%s", len(texts), cache_hits, len(misses), @@ -356,11 +335,7 @@ class TranslationService: ) else: logger.warning( - "Translation batch item returned empty result | model=%s scene=%s target_lang=%s source_lang=%s item_index=%s text_len=%s", - model, - scene, - target_lang, - source_lang or "auto", + "Translation batch item returned empty result | item_index=%s text_len=%s", idx, len(original_text), ) -- libgit2 0.21.2