Automatic translations

2025-12-01 12:00:00 -07:00
parent 3832c73da9
commit 5cdc5b7991
2 changed files with 293 additions and 42 deletions
@@ -5,17 +5,122 @@ import json
 import os
 import pathlib
 import xml.etree.ElementTree as ET
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from requests.adapters import HTTPAdapter
 from typing import cast
+from urllib3.util.retry import Retry

 import requests

 TRANSLATIONS_DIR = pathlib.Path(__file__).resolve().parent
 TRANSLATIONS_LANGUAGES = TRANSLATIONS_DIR / "languages.json"

-OPENAI_MODEL = "gpt-4"
+OPENAI_MODEL = "gpt-5"
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
-OPENAI_PROMPT = "You are a professional translator from English to {language} (ISO 639 language code). " + \
-                "The following sentence or word is in the GUI of a software called openpilot, translate it accordingly."
+
+FUN_LANG_KEYS = {"caveman", "duck", "frog", "pirate", "shakespearean"}
+
+FUN_PROMPT_TEMPLATE = """
+You are a playful *style translator* for openpilot. Translate the following message (an English source string) into the style '{language}'. Output ONLY the translated text, with no quotes or extra words.
+
+Output rules:
+- Input: one English UI string from a Qt .ts file.
+- Style key: {language}.
+- Output: a fun, stylized rewrite that keeps technical structure intact.
+
+Hard requirements:
+1) Preserve placeholders, variables, and markup exactly as written: {{name}}, {{0}}, {{icu}}, %1, %n, %(speed)d, $SPEED, <b>...</b>, <a href="...">...</a>, etc.
+2) Keep all non-translatable tokens unchanged: product/brand names (e.g., openpilot, ACC), file paths, error codes, part numbers.
+3) Do not add, remove, or reorder placeholders. If grammar absolutely requires reordering, keep all placeholders intact and still produce a correct sentence; prefer wordings that avoid reordering.
+4) Do not convert units or numbers (e.g., mph↔km/h). Translate unit labels only if standard in the style and not part of a preserved token.
+5) Maintain the same warning/priority level and imperative tone. Never soften or intensify safety messages ("Do not...", "Warning", "Critical").
+6) Preserve hotkeys/accelerators if present (e.g., &F, _O). If the exact letter is impossible, pick the nearest mnemonic but keep the marker.
+7) Follow style punctuation and casing norms while respecting all technical tokens.
+8) If ICU MessageFormat/plural/select syntax is present, keep the structure and variable names unchanged and rewrite only the human-readable text.
+9) Keep the output as concise as the source. Do not append notes, explanations, or metadata.
+
+Style Hints:
+- caveman: Short, blunt sentences. Simple words. Little grammar. Example: "Me want food. You come now."
+- duck: Quacky interjections, waddling rhythm, silly tone. Example: "Quack! What you mean? Waddle-waddle, quack!"
+- frog: Croaky, ribbit-filled speech, jumpy tone. Example: "Ribbit! I hop to help you. Croak, ribbit!"
+- pirate: Rough, nautical slang, dropped consonants, lots of "Arr!" Example: "Arr, ye scallywag! Hoist the sails 'n fetch me rum!"
+- shakespearean: Flowery, old-fashioned English, thee/thou, dramatic flair. Example: "Prithee, good sir, thou dost jest most cruelly!"
+
+Keep length close to source; avoid bloat. Respond with the styled string only.
+"""
+
+OPENAI_PROMPT = """
+You are a safety-critical UI translator for openpilot. Translate the following message (an English source string) into the locale '{language}'. Output ONLY the translated text, with no quotes or extra words.
+
+Hard requirements:
+1) Preserve placeholders, variables, and markup exactly as written: {{name}}, {{0}}, {{icu}}, %1, %n, %(speed)d, $SPEED, <b>...</b>, <a href="...">...</a>, etc.
+2) Keep all non-translatable tokens unchanged: product/brand names (e.g., openpilot, ACC), file paths, error codes, part numbers.
+3) Do not add, remove, or reorder placeholders. If grammar absolutely requires reordering, keep all placeholders intact and still produce a correct sentence; prefer wordings that avoid reordering.
+4) Do not convert units or numbers (e.g., mph↔km/h). Translate unit labels only if standard in the target locale and not part of a preserved token.
+5) Maintain the same warning/priority level and imperative tone. Never soften or intensify safety messages ("Do not...", "Warning", "Critical").
+6) Preserve hotkeys/accelerators if present (e.g., &F, _O). If the exact letter is impossible, pick the nearest mnemonic but keep the marker.
+7) Follow target-locale punctuation and casing norms while respecting all technical tokens.
+8) If ICU MessageFormat/plural/select syntax is present, keep the structure and variable names unchanged and translate only the human-readable text.
+9) Keep the translation as concise as the source. Do not append notes, explanations, or metadata.
+
+If the source is ambiguous or untranslatable without more context, choose the safest literal rendering that preserves meaning. If you cannot translate without risking meaning loss, return the source text unchanged.
+
+Your entire reply must be a single line containing only the final translation.
+"""
+
+OPENAI_EVAL_PROMPT = """
+You are a safety-critical reviewer for UI translations for openpilot. Your job is to compare two candidate translations (A and B) of an English source string and select the safest, most accurate option in the locale '{language}'.
+
+Output rules:
+- Return ONLY one line containing exactly one of these: the full text of Translation A, or the full text of Translation B, or the exact Source string.
+- Do not include quotes, labels, explanations, or whitespace beyond the chosen text.
+
+Decision criteria (apply in order):
+1) Hard correctness checks vs the Source:
+   - All placeholders/variables/markup from the Source must be preserved verbatim and remain valid: {{name}}, {{0}}, {{icu}}, %1, %n, %(speed)d, $SPEED, <b>...</b>, <a href="...">...</a>, &amp;, etc.
+   - Numbers and units must match; no unit conversion (mph↔km/h) and no number changes.
+   - Non-translatable tokens present in Source must remain unchanged (e.g., openpilot, ACC, file paths, error codes, part numbers).
+   - Hotkeys/accelerators such as &F or _O must be preserved with a sensible mnemonic letter in the target language; the marker must remain.
+   - ICU MessageFormat/plural/select syntax must keep the same structure and variable names; translate only human-readable text.
+   If only one candidate passes all hard checks, select it. If both fail, return the exact Source string.
+
+2) Meaning, tone, and severity:
+   - Preserve the precise meaning and intent; do not add, omit, soften, or intensify warnings/errors/imperatives.
+   - Keep safety language direct and unambiguous.
+   If only one candidate preserves meaning/tone precisely, select it.
+
+3) Target-language quality:
+   - Must be written in the target locale '{language}', idiomatic, grammatically correct, and concise while retaining meaning.
+   - Follow locale-appropriate punctuation, spacing, and capitalization, without altering required technical tokens.
+   Prefer the candidate that best satisfies these.
+
+4) Tie-breakers (when both are acceptable and equally accurate):
+   - Prefer the one closer in length to the Source and more readable on small UI.
+   - Prefer consistent terminology with common automotive/HMI usage in the target locale.
+   - Prefer minimal reordering of placeholders if both are valid.
+
+Remember:
+- Never fabricate or "improve" content. Choose A or B, or fall back to the Source if both are unsafe.
+- Your reply must be exactly the chosen string with no commentary.
+"""
+
+SESSION = requests.Session()
+
+def configure_session():
+  if OPENAI_API_KEY:
+    SESSION.headers.update({
+      "Authorization": f"Bearer {OPENAI_API_KEY}",
+      "Content-Type": "application/json"
+    })
+  retry = Retry(
+    total=10,
+    backoff_factor=1,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=frozenset(["POST"])
+  )
+  adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry)
+  SESSION.mount("https://", adapter)
+  SESSION.mount("http://", adapter)


 def get_language_files(languages: list[str] = None) -> dict[str, pathlib.Path]:
@@ -34,42 +139,69 @@ def get_language_files(languages: list[str] = None) -> dict[str, pathlib.Path]:
  return files


+def evaluate_translation(source: str, old: str, new: str, language: str) -> str:
+  try:
+    response = SESSION.post(
+      "https://api.openai.com/v1/chat/completions",
+      json={
+        "model": OPENAI_MODEL,
+        "messages": [
+          {"role": "system", "content": OPENAI_EVAL_PROMPT.format(language=language)},
+          {"role": "user", "content": f"Source: {source}\n\nTranslation A: {old}\n\nTranslation B: {new}"},
+        ],
+        "max_completion_tokens": 2048,
+        "reasoning_effort": "medium",
+        "verbosity": "low",
+      },
+      timeout=(10, 60)
+    )
+
+    if 400 <= response.status_code < 600:
+      raise requests.HTTPError(f'Error {response.status_code}: {response.text}', response=response)
+
+    data = response.json()
+    return cast(str, data["choices"][0]["message"]["content"])
+  except Exception as e:
+    print(f"Evaluation failed for '{source[:40]}...': {e}")
+    return old
+
+
 def translate_phrase(text: str, language: str) -> str:
-  response = requests.post(
-    "https://api.openai.com/v1/chat/completions",
-    json={
-      "model": OPENAI_MODEL,
-      "messages": [
-        {
-          "role": "system",
-          "content": OPENAI_PROMPT.format(language=language),
-        },
-        {
-          "role": "user",
-          "content": text,
-        },
-      ],
-      "temperature": 0.8,
-      "max_tokens": 1024,
-      "top_p": 1,
-    },
-    headers={
-      "Authorization": f"Bearer {OPENAI_API_KEY}",
-      "Content-Type": "application/json",
-    },
-  )
+  lang_key = language.strip().lower()
+  if lang_key in FUN_LANG_KEYS:
+    prompt = FUN_PROMPT_TEMPLATE.format(language=lang_key)
+  else:
+    prompt = OPENAI_PROMPT.format(language=language)

-  if 400 <= response.status_code < 600:
-    raise requests.HTTPError(f'Error {response.status_code}: {response.json()}', response=response)
+  try:
+    response = SESSION.post(
+      "https://api.openai.com/v1/chat/completions",
+      json={
+        "model": OPENAI_MODEL,
+        "messages": [
+          {"role": "system", "content": prompt},
+          {"role": "user", "content": text},
+        ],
+        "max_completion_tokens": 2048,
+        "reasoning_effort": "minimal",
+        "verbosity": "low",
+      },
+      timeout=(10, 60)
+    )

-  data = response.json()
+    if 400 <= response.status_code < 600:
+      print(f'Error {response.status_code}: {response.text}')
+      return ""

-  return cast(str, data["choices"][0]["message"]["content"])
+    data = response.json()
+    return cast(str, data["choices"][0]["message"]["content"])
+  except Exception as e:
+    print(f"Translation failed for '{text[:40]}...': {e}")
+    return ""


-def translate_file(path: pathlib.Path, language: str, all_: bool) -> None:
+def translate_file(path: pathlib.Path, language: str, all_: bool, vet_translations: bool) -> None:
  tree = ET.parse(path)
-
  root = tree.getroot()

  for context in root.findall("./context"):
@@ -79,6 +211,8 @@ def translate_file(path: pathlib.Path, language: str, all_: bool) -> None:

    print(f"Context: {name.text}")

+    work_items = []
+
    for message in context.findall("./message"):
      source = message.find("source")
      translation = message.find("translation")
@@ -86,16 +220,67 @@ def translate_file(path: pathlib.Path, language: str, all_: bool) -> None:
      if source is None or translation is None:
        raise ValueError("source or translation not found")

-      if not all_ and translation.attrib.get("type") != "unfinished":
-        continue
+      translation_type = translation.attrib.get("type", "")

-      llm_translation = translate_phrase(cast(str, source.text), language)
+      if vet_translations:
+        if "-generated" not in translation_type:
+          continue
+      elif not all_:
+        if translation_type != "unfinished":
+          if translation_type.endswith("-generated") and not translation_type.startswith(OPENAI_MODEL):
+            pass
+          else:
+            continue

-      print(f"Source: {source.text}\n" +
-            f"Current translation: {translation.text}\n" +
-            f"LLM translation: {llm_translation}")
+      text = cast(str, source.text)
+      numerus = (message.attrib.get("numerus") == "yes") or ("%n" in text)
+      old_translation = translation.text or ""

-      translation.text = llm_translation
+      work_items.append((message, translation, text, numerus, old_translation))
+
+    if not work_items:
+      continue
+
+    def worker(item):
+      message, translation, text, numerus, old_translation = item
+      llm_translation = translate_phrase(text, language)
+
+      if vet_translations:
+        best = evaluate_translation(text, old_translation, llm_translation, language)
+        return (message, translation, text, numerus, best, True)
+      else:
+        return (message, translation, text, numerus, llm_translation, False)
+
+    results = []
+    with ThreadPoolExecutor(max_workers=100) as executor:
+      future_map = {executor.submit(worker, item): item for item in work_items}
+      for future in as_completed(future_map):
+        try:
+          results.append(future.result())
+        except Exception as e:
+          item = future_map[future]
+          print(f"Task failed for '{item[2][:40]}...': {e}")
+
+    for message, translation, text, numerus, chosen_translation, was_vetted in results:
+      print(f"Source: {text}\nCurrent translation: {translation.text}\nLLM translation: {chosen_translation}")
+
+      if was_vetted:
+        print(f"Chosen translation: {chosen_translation}")
+        translation.text = chosen_translation
+      else:
+        translation.set("type", f"{OPENAI_MODEL}-generated")
+
+        if numerus:
+          translations = chosen_translation or (translation.text or text)
+          for child in list(translation):
+            translation.remove(child)
+
+          translation.text = None
+
+          ET.SubElement(translation, "numerusform").text = translations
+          ET.SubElement(translation, "numerusform").text = translations
+        else:
+          translation.text = chosen_translation

  with path.open("w", encoding="utf-8") as fp:
    fp.write('<?xml version="1.0" encoding="utf-8"?>\n' +
@@ -111,6 +296,7 @@ def main():
  group.add_argument("-f", "--file", nargs="+", help="Translate the selected files. (Example: -f fr de)")

  arg_parser.add_argument("-t", "--all-translations", action="store_true", default=False, help="Translate all sections. (Default: only unfinished)")
+  arg_parser.add_argument("-v", "--vet-translations", action="store_true", default=False, help="Re-evaluate AI-generated translations")

  args = arg_parser.parse_args()

@@ -119,6 +305,8 @@ def main():
          "If you don't have one go to: https://beta.openai.com/account/api-keys.")
    exit(1)

+  configure_session()
+
  files = get_language_files(None if args.all_files else args.file)

  if args.file:
@@ -127,11 +315,14 @@ def main():
      print(f"No language files found: {missing_files}")
      exit(1)

-  print(f"Translation mode: {'all' if args.all_translations else 'only unfinished'}. Files: {list(files)}")
+  if args.vet_translations:
+    print(f"Re-evaluating all translations with the '{OPENAI_MODEL}-generated' type.")
+  else:
+    print(f"Translation mode: {'all' if args.all_translations else 'only unfinished'}. Files: {list(files)}")

  for lang, path in files.items():
    print(f"Translate {lang} ({path})")
-    translate_file(path, lang, args.all_translations)
+    translate_file(path, lang, args.all_translations, args.vet_translations)


 if __name__ == "__main__":
@@ -2,8 +2,12 @@
 import argparse
 import json
 import os
+import xml.etree.ElementTree as ET

-from openpilot.common.basedir import BASEDIR
+if "BASEDIR" in os.environ:
+  BASEDIR = os.environ.get("BASEDIR")
+else:
+  from openpilot.common.basedir import BASEDIR

 UI_DIR = os.path.join(BASEDIR, "selfdrive", "ui")
 FROGPILOT_UI_DIR = os.path.join(BASEDIR, "frogpilot", "ui")
@@ -25,6 +29,47 @@ def generate_translations_include():
    f.write(content)


+def backup_translation_types(root):
+  backup = {}
+  for context in root.findall("context"):
+    context_name = context.findtext("name")
+    if not context_name:
+      continue
+
+    for message in context.findall("message"):
+      source_text = message.findtext("source")
+      translation = message.find("translation")
+
+      if not source_text or translation is None:
+        continue
+
+      type_attr = translation.attrib.get("type", "")
+      if type_attr.endswith("-generated"):
+        backup[(context_name, source_text)] = type_attr
+
+  return backup
+
+
+def restore_translation_types(root, backup):
+  for context in root.findall("context"):
+    context_name = context.findtext("name")
+    if not context_name:
+      continue
+
+    for message in context.findall("message"):
+      source_text = message.findtext("source")
+      if not source_text:
+        continue
+
+      key = (context_name, source_text)
+      if key not in backup:
+        continue
+
+      translation = message.find("translation")
+      if translation is not None:
+        translation.attrib["type"] = backup[key]
+
+
 def update_translations(vanish: bool = False, translation_files: None | list[str] = None, translations_dir: str = TRANSLATIONS_DIR):
  if translation_files is None:
    with open(LANGUAGES_FILE) as f:
@@ -32,6 +77,11 @@ def update_translations(vanish: bool = False, translation_files: None | list[str

  for file in translation_files:
    tr_file = os.path.join(translations_dir, f"{file}.ts")
+
+    tree = ET.parse(tr_file)
+    root = tree.getroot()
+    backup = backup_translation_types(root)
+
    args = f"lupdate -locations none -recursive {UI_DIR} {FROGPILOT_UI_DIR} -ts {tr_file} -I {BASEDIR}"
    if vanish:
      args += " -no-obsolete"
@@ -40,6 +90,16 @@ def update_translations(vanish: bool = False, translation_files: None | list[str
    ret = os.system(args)
    assert ret == 0

+    tree = ET.parse(tr_file)
+    root = tree.getroot()
+    restore_translation_types(root, backup)
+
+    with open(tr_file, "w", encoding="utf-8") as fp:
+      fp.write('<?xml version="1.0" encoding="utf-8"?>\n' +
+               '<!DOCTYPE TS>\n' +
+               ET.tostring(root, encoding="utf-8", short_empty_elements=False).decode() +
+               "\n")
+

 if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Update translation files for UI",