subfox/app/services/translator.py

88 lines
2.2 KiB
Python

import os
import re
from openai import OpenAI
# Template for the marker line written before each block's text in the
# translation prompt; formatted with the block's index.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
# Number of blocks sent to the model per translation request.
BATCH_SIZE = 50
def _build_batch_prompt(blocks):
    """Build the English-to-Danish translation prompt for one batch.

    Each block contributes three lines to the payload: its marker line
    (``BLOCK_MARKER`` formatted with the block's ``"index"``), its
    ``"text"``, and an empty separator line.

    Args:
        blocks: Iterable of dicts with ``"index"`` and ``"text"`` entries.

    Returns:
        The instruction header followed by the marked-up block texts, with
        leading and trailing whitespace stripped.
    """
    payload_lines = []
    for entry in blocks:
        payload_lines.extend(
            (BLOCK_MARKER.format(index=entry["index"]), entry["text"], "")
        )
    payload = "\n".join(payload_lines)

    prompt_lines = (
        "You are translating subtitle text from English to Danish.",
        "Rules:",
        "- Translate naturally into short, readable Danish suitable for subtitles.",
        "- Keep each block marker exactly unchanged.",
        "- Do not add explanations.",
        "- Do not remove markers.",
        "- Return only the translated blocks.",
        "Text to translate:",
        payload,
    )
    # Stripping removes the trailing separator line(s) left by the last block.
    return "\n".join(prompt_lines).strip()
def _parse_translated_response(translated_text):
    """Extract per-block translations from a marker-delimited model reply.

    The reply is expected to contain ``<<<BLOCK_n>>>`` markers, each followed
    by the translation for block ``n``.  Every marker is treated as a
    boundary: a block's text runs from the end of its marker to the start of
    the next marker (or the end of the reply).  This keeps an empty
    translation, or markers that share a line, from swallowing the blocks
    that follow.

    Args:
        translated_text: Raw text returned by the model.  ``None`` or an
            empty string yields an empty result.

    Returns:
        A dict mapping each integer block index to its translation with
        surrounding whitespace stripped.  If an index appears more than once,
        the last occurrence wins.
    """
    if not translated_text:
        return {}

    # With one capture group, re.split returns
    # [preamble, id_1, text_1, id_2, text_2, ...]; any text before the first
    # marker (the preamble) is discarded.
    pieces = re.split(r"<<<BLOCK_(\d+)>>>", translated_text)
    return {
        int(block_id): text.strip()
        for block_id, text in zip(pieces[1::2], pieces[2::2])
    }
def _chunked(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items.

    Args:
        seq: A sequence supporting ``len()`` and slicing.
        size: Maximum number of items per slice; must be at least 1.

    Yields:
        Slices of ``seq`` in order; the last one may be shorter than ``size``.

    Raises:
        ValueError: If ``size`` is less than 1.  Because this is a generator,
            the error surfaces when iteration starts.
    """
    if size < 1:
        # A negative step makes range() empty, which would silently drop
        # every item; a zero step already raises ValueError in range().
        raise ValueError(f"size must be >= 1, got {size!r}")
    for start in range(0, len(seq), size):
        yield seq[start:start + size]
def translate_blocks(blocks):
    """Translate subtitle blocks from English to Danish, in place.

    Each block is a dict with an ``"index"`` and a ``"text"`` entry; the
    translation is stored under ``"translated_text"``.

    When ``OPENAI_API_KEY`` is unset or blank, no API call is made and each
    block gets a placeholder translation: its source text prefixed with
    ``"[DA] "``.  Otherwise the blocks are sent in batches of ``BATCH_SIZE``
    to the model named by ``OPENAI_MODEL`` (default ``"gpt-4.1-mini"``).

    A block whose translation is missing from the model's reply, or comes
    back empty, keeps its source text.

    Args:
        blocks: Sequence of block dicts; mutated in place.

    Returns:
        The same ``blocks`` object that was passed in.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()
    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    if not blocks:
        # Nothing to translate; skip constructing a client for an empty job.
        return blocks

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    translated_map = {}
    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)
        prompt = _build_batch_prompt(batch)
        response = client.responses.create(
            model=model,
            input=prompt,
        )
        parsed = _parse_translated_response(response.output_text)
        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )
        for block in batch:
            # NOTE(review): the parser keys its result by int; this lookup
            # uses block["index"] as-is, so non-int indices would always miss
            # and fall back to the source text -- confirm the caller's schema.
            # `or` (rather than a .get default) also covers a marker that
            # came back with no text, so a subtitle is never blanked out.
            translated_map[block["index"]] = parsed.get(block["index"]) or block["text"]

    # Results are applied only after every batch has completed, so an API
    # error part-way through leaves the blocks without partial translations.
    for block in blocks:
        block["translated_text"] = translated_map.get(block["index"], block["text"])
    return blocks