88 lines
2.2 KiB
Python
88 lines
2.2 KiB
Python
import os
|
|
import re
|
|
from openai import OpenAI
|
|
|
|
# Template for the marker line emitted before each block's text in the
# prompt; "{index}" is filled in with the block's "index" value. The
# response parser in this module looks for the same "<<<BLOCK_n>>>" shape.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
|
|
# Number of blocks sent to the model per request (chunk size used when
# splitting the block list into batches).
BATCH_SIZE = 50
|
|
|
|
|
|
def _build_batch_prompt(blocks):
    """Build the English-to-Danish translation prompt for one batch.

    Every block contributes three lines to the payload: its marker
    (``BLOCK_MARKER`` formatted with the block's ``"index"``), its
    ``"text"``, and an empty separator line. The payload is placed under a
    fixed instruction header and the finished prompt is stripped of leading
    and trailing whitespace.

    Args:
        blocks: Iterable of mappings providing ``"index"`` and ``"text"``.

    Returns:
        The complete prompt as a single string.
    """
    payload = "\n".join(
        line
        for entry in blocks
        for line in (
            BLOCK_MARKER.format(index=entry["index"]),
            entry["text"],
            "",
        )
    )

    prompt = f"""
You are translating subtitle text from English to Danish.

Rules:
- Translate naturally into short, readable Danish suitable for subtitles.
- Keep each block marker exactly unchanged.
- Do not add explanations.
- Do not remove markers.
- Return only the translated blocks.

Text to translate:

{payload}
"""
    return prompt.strip()
|
|
|
|
|
|
# Matches one block marker and captures its numeric index.
_BLOCK_MARKER_RE = re.compile(r"<<<BLOCK_(\d+)>>>")


def _parse_translated_response(translated_text):
    """Split a model response into ``{block index: translated text}``.

    The response is cut at every ``<<<BLOCK_n>>>`` marker, wherever it
    appears. A marker does not have to start a line, so an indented marker
    or two markers on the same line no longer cause the following block to
    be absorbed into the previous block's text. Anything before the first
    marker (for example an introductory sentence) is ignored.

    Args:
        translated_text: Raw text returned by the model. ``None`` and the
            empty string are accepted.

    Returns:
        A dict mapping each integer block index to its text with
        surrounding whitespace stripped. If an index occurs more than once,
        the last occurrence wins. Returns an empty dict when there is no
        text or no marker.
    """
    if not translated_text:
        return {}

    # With a capturing group, split() yields
    # [preamble, index_1, text_1, index_2, text_2, ...].
    pieces = _BLOCK_MARKER_RE.split(translated_text)
    indices = pieces[1::2]
    texts = pieces[2::2]

    return {int(index): text.strip() for index, text in zip(indices, texts)}
|
|
|
|
|
|
def _chunked(seq, size):
    """Yield consecutive slices of ``seq``, each at most ``size`` items long.

    Args:
        seq: A sequence supporting ``len()`` and slicing.
        size: Distance between slice starts, which is also the maximum
            length of each yielded slice.

    Yields:
        ``seq[start:start + size]`` for ``start`` stepping from 0 up to
        ``len(seq)`` in increments of ``size``.
    """
    yield from (
        seq[start:start + size]
        for start in range(0, len(seq), size)
    )
|
|
|
|
|
|
def translate_blocks(blocks):
    """Attach a ``"translated_text"`` entry to every subtitle block.

    Without an ``OPENAI_API_KEY`` in the environment (unset, empty or
    whitespace only) no request is made: each block gets its own text
    prefixed with ``"[DA] "``.

    Otherwise the blocks are sent to the model in batches of ``BATCH_SIZE``.
    The model name is read from ``OPENAI_MODEL`` and defaults to
    ``"gpt-4.1-mini"``. A block keeps its original text when the response
    has no translation for it, or when the translation came back empty.

    Args:
        blocks: Sequence of dicts with ``"index"`` and ``"text"`` keys. The
            dicts are modified in place.

    Returns:
        The same ``blocks`` object that was passed in.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()

    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    # Results are collected here first and only written onto the blocks
    # after the batch loop, so an exception raised by a request propagates
    # before any block has been modified.
    translated_map = {}

    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)

        prompt = _build_batch_prompt(batch)
        response = client.responses.create(
            model=model,
            input=prompt,
        )
        parsed = _parse_translated_response(response.output_text)

        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True,
        )

        for block in batch:
            # `or` instead of a .get() default: a marker that came back with
            # an empty body must fall back to the source text as well,
            # otherwise the subtitle would be blanked.
            translated_map[block["index"]] = parsed.get(block["index"]) or block["text"]

    for block in blocks:
        block["translated_text"] = translated_map.get(block["index"], block["text"])

    return blocks
|