"""Translate subtitle blocks from English to Danish via the OpenAI API.

Blocks are sent in batches. Each block is preceded by an index-carrying
marker so the translated response can be mapped back to its source block.
"""

import os
import re
from typing import Dict, Iterator, List, Sequence

try:
    from openai import OpenAI
except ImportError:
    # Keep the module importable so the no-API-key fallback translator still
    # works on machines where the OpenAI SDK is not installed.
    OpenAI = None

# Marker template placed before every block in the prompt. It must carry the
# block index; without it the response cannot be matched back to its block.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
BATCH_SIZE = 50

# Captures (index, text) for every marker in a model response. The text runs
# lazily up to the next marker line or the end of the response.
_BLOCK_RESPONSE_RE = re.compile(
    r"<<<BLOCK_(\d+)>>>\n?(.*?)(?=\n<<<BLOCK_\d+>>>|\Z)",
    re.DOTALL,
)

_PROMPT_HEADER = "\n".join(
    [
        "You are translating subtitle text from English to Danish.",
        "",
        "Rules:",
        "- Translate naturally into short, readable Danish suitable for subtitles.",
        "- Keep each block marker exactly unchanged.",
        "- Do not add explanations.",
        "- Do not remove markers.",
        "- Return only the translated blocks.",
        "",
        "Text to translate:",
    ]
)


def _build_batch_prompt(blocks: Sequence[dict]) -> str:
    """Return the translation prompt for one batch of blocks.

    Each block contributes its marker line (built from ``block["index"]``),
    its ``block["text"]``, and a blank separator line.
    """
    parts = []
    for block in blocks:
        parts.append(BLOCK_MARKER.format(index=block["index"]))
        parts.append(block["text"])
        parts.append("")
    joined = "\n".join(parts)
    return f"{_PROMPT_HEADER}\n\n{joined}".strip()


def _parse_translated_response(translated_text: str) -> Dict[int, str]:
    """Map block index to translated text, parsed from a model response.

    Text that is not preceded by a recognizable marker is ignored, so a
    response without markers yields an empty dict. If an index appears more
    than once, the last occurrence wins.
    """
    return {
        int(block_id): text.strip()
        for block_id, text in _BLOCK_RESPONSE_RE.findall(translated_text)
    }


def _chunked(seq: Sequence, size: int) -> Iterator[Sequence]:
    """Yield consecutive slices of *seq* holding at most *size* items each."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]


def translate_blocks(blocks: List[dict]) -> List[dict]:
    """Add a ``"translated_text"`` entry to every block and return *blocks*.

    The block dicts are mutated in place. Each block must provide ``"index"``
    and ``"text"``; parsed translations are keyed by ``int``, so ``"index"``
    is assumed to be an int (a non-int index silently falls back to the
    source text).

    Without ``OPENAI_API_KEY`` in the environment, a placeholder translation
    (``"[DA] "`` + the source text) is used instead of calling the API. The
    model is read from ``OPENAI_MODEL`` and defaults to ``"gpt-4.1-mini"``.

    Raises:
        ImportError: if an API key is configured but the ``openai`` package
            is not installed.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()

    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    if OpenAI is None:
        raise ImportError(
            "the 'openai' package is required when OPENAI_API_KEY is set"
        )

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)

        response = client.responses.create(
            model=model,
            input=_build_batch_prompt(batch),
        )
        parsed = _parse_translated_response(response.output_text)

        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )

        # A block the model dropped (or whose marker it mangled) keeps its
        # source text rather than ending up without a translation.
        for block in batch:
            block["translated_text"] = parsed.get(block["index"], block["text"])

    return blocks