Skip to content

Commit 15f83e2

Browse files
authored
initial prompt and translate function for l10n (#6700)
* initial prompt and translate function for l10n * adjustments based on feedback
1 parent fb07c64 commit 15f83e2

File tree

5 files changed

+263
-1
lines changed

5 files changed

+263
-1
lines changed

kitsune/llm/l10n/__init__.py

Whitespace-only changes.

kitsune/llm/l10n/config.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Name of the LLM model used when generating localization (l10n) translations.
L10N_LLM_MODEL = "gemini-2.5-pro-preview-05-06"

# These are do-not-translate (DNT) strings, or strings that should be preserved unchanged
# when translating. Place shorter strings after any longer strings that contain them. So
# for example, place "Firefox" after "Firefox Relay" in this list, and "Mozilla account"
# after "Mozilla accounts".
L10N_PROTECTED_TERMS = [
    "Anonym",
    "Bugzilla",
    "ESR",
    "Extended Support Release",
    "Firefox Beta",
    "Firefox Developer Edition",
    "Firefox for Android",
    "Firefox for iOS",
    "Firefox for Enterprise",
    "Firefox Focus",
    "Firefox Relay",
    "Firefox Nightly",
    "Firefox",
    "Klar",
    "MDN Web Docs",
    "MDN Plus",
    "MDN",
    # Plural forms contain the singular forms, so the plurals must come first.
    "Mozilla accounts",
    "Mozilla account",
    "Mozilla Accounts",
    "Mozilla Account",
    "Mozilla Wordmark + Symbol",
    "Mozilla Wordmark",
    "Mozilla VPN",
    "Mozilla Monitor",
    "Mozilla",
    "Pocket",
    "Pontoon",
    "QMO",
    "Rapid Release",
    "SUMO",
    "Sync",
    "Thunderbird for Android",
    "Thunderbird",
    "{note}",
    "{/note}",
    "{warning}",
    "{/warning}",
    "{/for}",
    "__TOC__",
]

kitsune/llm/l10n/prompt.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
from typing import Any
2+
3+
from langchain.prompts import ChatPromptTemplate
4+
from langchain.schema import AIMessage
5+
6+
from kitsune.llm.l10n.config import L10N_PROTECTED_TERMS
7+
8+
9+
# System prompt (a Jinja2 template) describing how the LLM must translate a wiki text.
# Template variables: `source_language`, `target_language`, and `protected_terms`.
#
# NOTE: This must be a *raw* string. The prompt embeds regular expressions, and in a
# normal string literal Python would turn "\x00", "\x20", and "\x7f" into real control
# characters (and warn about invalid escapes such as "\[" and "\s"), so the model would
# see a corrupted pattern instead of the regular expression text.
TRANSLATION_INSTRUCTIONS = r"""
# Role and task
- You are an expert at translating technical documents written in Wiki syntax about Mozilla's products from {{ source_language }} to {{ target_language }}.
- Your task is to translate the given {{ source_language }} text into {{ target_language }}, **strictly obeying** the task instructions.
- You may be given a "prior translation" as well. If so, the task instructions will describe how to use the "prior translation" to complete your task.

# Definitions
Remember the following definitions. You will use them to complete your task.

## Definition of `wiki-hook`
A `wiki-hook` is a string that case-sensitively matches the regular expression pattern that follows:

```python
r"\[\[(Image|Video|V|Button|UI|Include|I|Template|T):.*?\]\]"
```

## Definition of `wiki-article-link`
A `wiki-article-link` is a string that case-sensitively matches the regular expression pattern that follows:

```python
r"\[\[(?!Image:|Video:|V:|Button:|UI:|Include:|I:|Template:|T:)[^|]+?(?:\|(?P<description>.+?))?\]\]"
```

## Definition of `wiki-external-link`
A `wiki-external-link` is a string that case-sensitively matches the regular expression pattern that follows:

```python
r"\[((mailto:|git://|irc://|https?://|ftp://|/)[^<>\]\[\x00-\x20\x7f]*)\s*(?P<description>.*?)\]"
```

## Definition of `prior-translation-wiki-map`
- The `prior-translation-wiki-map` is a Python `dict` built from the "prior translation", if it is provided.
- A Python `dict` maps keys to their values.
- Each `wiki-hook`, `wiki-article-link`, and `wiki-external-link` in the {{ source_language }} text of the "prior translation" becomes a key in the `prior-translation-wiki-map`, and each key's value is its corresponding translation found in the {{ target_language }} text of the "prior translation".
- If no "prior translation" is provided, the `prior-translation-wiki-map` should be set to an empty `dict`.

# Rules for translating special strings
- Each of the following rules describes how to translate special strings that may be present within the {{ source_language }} text that you are translating.
- **Strictly obey** all of these rules when freshly translating {{ source_language }} text.

1. **Preserve unchanged** each of the following strings (each is wrapped with single backticks):
{%- for term in protected_terms %}
- `{{ term }}`
{%- endfor %}

2. **Preserve unchanged** each string that case-sensitively matches the following regular expression:

```python
r"\{(for|key|filepath) .*?\}"
```

3. For each string that case-sensitively matches the following regular expression:

```python
r"\{(?:button|menu|pref) (?P<description>.*?)\}"
```

- Translate only the text matched by the named group `description` (**remember to obey rule #1 above**), and **preserve the rest of the string unchanged**.

4. For each `wiki-hook`, perform the following steps:
- First, check if the `wiki-hook` is a key within the `prior-translation-wiki-map`.
- If it is a key within the `prior-translation-wiki-map`, use its value from the `prior-translation-wiki-map` as its translation.
- If it is **not** a key within the `prior-translation-wiki-map`, **preserve it unchanged**.

5. For each `wiki-article-link`, perform the following steps:
- First, check if the `wiki-article-link` is a key within the `prior-translation-wiki-map`.
- If it is a key within the `prior-translation-wiki-map`, use its value from the `prior-translation-wiki-map` as its translation.
- If it is **not** a key within the `prior-translation-wiki-map`, translate only the text matched by the named group `description` (**remember to obey rule #1 above**), and **preserve the rest unchanged**.

6. For each `wiki-external-link`, perform the following steps:
- First, check if the `wiki-external-link` is a key within the `prior-translation-wiki-map`.
- If it is a key within the `prior-translation-wiki-map`, use its value from the `prior-translation-wiki-map` as its translation.
- If it is **not** a key within the `prior-translation-wiki-map`, translate only the text matched by the named group `description` (**remember to obey rule #1 above**), and **preserve the rest unchanged**.

# Task Instructions
1. **Build the `prior-translation-wiki-map`**. If no "prior translation" is provided, set the `prior-translation-wiki-map` to an empty `dict`.
2. **Compare** the {{ source_language }} text you've been asked to translate with the {{ source_language }} text of the "prior translation", if provided, and **determine which parts are the same and which parts are different**. If no "prior translation" was provided, consider the entire {{ source_language }} text you've been asked to translate as different.
3. For each part that is the same, **copy** its corresponding translation from the {{ target_language }} text of the "prior translation".
4. For each part that is different, **freshly translate** that part. **Remember to obey the `Rules for translating special strings`**.
5. **Combine** the copied parts and the freshly translated parts into a final translation.
6. In your response, include your final translation and an explanation describing what you did for each step.

# Response Format
Use the following template to format your response:
<<begin-translation>>
{ translation }
<<end-translation>>
<<begin-explanation>>
{ explanation }
<<end-explanation>>
"""

# Human message (a Jinja2 template) carrying the text to translate and, optionally, the
# prior translation. Template variables: `source_language`, `target_language`,
# `source_text`, and `prior_translation` (falsy, or an object providing `source_text`
# and `target_text`).
SOURCE_ARTICLE = """
# Prior translation

{%if prior_translation -%}
## The {{ source_language }} text of the prior translation

```wiki
{{ prior_translation.source_text|safe }}
```

## The {{ target_language }} text of the prior translation

```wiki
{{ prior_translation.target_text|safe }}
```
{%- else -%}
There is no prior translation.
{%- endif %}

# The {{ source_language }} text to translate

```wiki
{{ source_text|safe }}
```
"""
126+
127+
128+
def translation_parser(message: AIMessage) -> dict[str, Any]:
    """
    Parses the result from the LLM invocation for a translation, and returns a dictionary
    with the translation and the explanation.

    The response is expected to wrap each section in "<<begin-NAME>>" and "<<end-NAME>>"
    markers. This plain-text format is parsed by hand because special characters in the
    translation and the explanation often caused JSON decode errors when the
    StructuredOutputParser was used.

    Args:
        message: The LLM response. Its "content" may be a string, or a list of content
            blocks (strings, or dicts carrying a "text" entry), which are concatenated.

    Returns:
        A dict with the keys "translation" and "explanation", each mapped to the
        stripped text of its section. For each section, a missing begin marker means
        the section starts at the beginning of the content, and a missing end marker
        means it runs to the end of the content.
    """
    content = message.content
    if not isinstance(content, str):
        # Message content may be a list of content blocks rather than a plain string.
        # Keep only the textual parts, in order.
        content = "".join(
            block if isinstance(block, str) else (block.get("text") or "")
            for block in content
            if isinstance(block, (str, dict))
        )

    result: dict[str, Any] = {}
    for name in ("translation", "explanation"):
        # Use the FIRST begin marker: the explanation follows the translation and may
        # quote the translation markers, which must not be mistaken for the real ones.
        _, begin_marker, after_begin = content.partition(f"<<begin-{name}>>")
        section = after_begin if begin_marker else content
        result[name] = section.partition(f"<<end-{name}>>")[0].strip()
    return result
140+
141+
142+
# Chat prompt for translations: the system message holds the instructions and the human
# message holds the text to translate. Both are Jinja2 templates, and the protected
# terms are bound up front so callers need not supply them.
translation_prompt = ChatPromptTemplate(
    messages=[
        ("system", TRANSLATION_INSTRUCTIONS),
        ("human", SOURCE_ARTICLE),
    ],
    template_format="jinja2",
).partial(protected_terms=L10N_PROTECTED_TERMS)

kitsune/llm/l10n/translator.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from typing import TYPE_CHECKING, Any
2+
3+
from django.conf import settings
4+
5+
from kitsune.llm.l10n.prompt import translation_parser, translation_prompt
6+
from kitsune.llm.l10n.config import L10N_LLM_MODEL
7+
from kitsune.llm.utils import get_llm
8+
9+
if TYPE_CHECKING:
10+
from kitsune.wiki.models import Document
11+
12+
13+
def translate(doc: "Document", target_locale: str) -> dict[str, dict[str, Any]]:
    """
    Uses the LLM to translate the given document into the target locale.

    The summary, keywords, and content are always translated. The title is translated
    only when the document is not a template and has no existing translation in the
    target locale. The given document must be a parent document.

    Returns a dict that maps each translated attribute name to the parsed LLM result
    for that attribute.
    """
    llm = get_llm(model_name=L10N_LLM_MODEL)
    chain = translation_prompt | llm | translation_parser

    # Values shared by every invocation of the chain.
    languages: dict[str, Any] = {
        "source_language": settings.LOCALES[doc.locale].english,
        "target_language": settings.LOCALES[target_locale].english,
    }

    attributes = ["summary", "keywords", "content"]

    existing_translation = doc.translated_to(target_locale)

    # The title is only translated for non-template documents that do not
    # yet have a child document in the target locale.
    if not (doc.is_template or existing_translation):
        attributes.append("title")

    def source_text_for(attribute):
        # The title lives on the document itself; everything else on its revision.
        if attribute == "title":
            return doc.title
        return getattr(doc.latest_localizable_revision, attribute)

    def prior_translation_for(attribute):
        # A prior translation needs an existing translated document whose current
        # revision is based on a source revision. Titles never get one.
        if attribute == "title" or not existing_translation:
            return None
        translated_rev = existing_translation.current_revision
        if not translated_rev:
            return None
        basis_rev = translated_rev.based_on
        if not basis_rev:
            return None
        return {
            "source_text": getattr(basis_rev, attribute),
            "target_text": getattr(translated_rev, attribute),
        }

    translations: dict[str, dict[str, Any]] = {}

    for attribute in attributes:
        translations[attribute] = chain.invoke(
            {
                **languages,
                "source_text": source_text_for(attribute),
                "prior_translation": prior_translation_for(attribute),
            }
        )

    return translations

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ ignore =
1414
E231
1515
E203
1616
per-file-ignores =
17-
kitsune/llm/questions/prompt.py: E501
17+
kitsune/llm/questions/prompt.py: E501
18+
kitsune/llm/l10n/prompt.py: E501, W605

0 commit comments

Comments
 (0)