Make SubFox production-ready with parallel translation and UI controls
This commit is contained in:
parent
c40b8bed2b
commit
2b1d05f02c
6046 changed files with 798327 additions and 0 deletions
88
app/services/translator.py
Normal file
88
app/services/translator.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import os
|
||||
import re
|
||||
from openai import OpenAI
|
||||
|
||||
# Template for the marker line written before each block's text in the prompt;
# ``{index}`` is filled with the block's ``index`` value.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
|
||||
# Maximum number of blocks grouped into a single translation request.
BATCH_SIZE = 50
|
||||
|
||||
|
||||
def _build_batch_prompt(blocks):
    """Compose the English-to-Danish translation prompt for one batch.

    Every block contributes three lines to the payload: its marker
    (``BLOCK_MARKER`` formatted with the block's ``index``), its ``text``,
    and an empty separator line. The payload is appended to a fixed set of
    instructions and the whole prompt is stripped of surrounding whitespace.

    Args:
        blocks: Iterable of dicts providing ``index`` and ``text`` keys.

    Returns:
        The prompt as a single string.
    """
    sections = [
        BLOCK_MARKER.format(index=block["index"]) + "\n" + block["text"] + "\n"
        for block in blocks
    ]
    payload = "\n".join(sections)

    instructions = (
        "You are translating subtitle text from English to Danish.\n"
        "\n"
        "Rules:\n"
        "- Translate naturally into short, readable Danish suitable for subtitles.\n"
        "- Keep each block marker exactly unchanged.\n"
        "- Do not add explanations.\n"
        "- Do not remove markers.\n"
        "- Return only the translated blocks.\n"
        "\n"
        "Text to translate:\n"
        "\n"
    )

    # Stripping the combined text drops the trailing separator line (and the
    # dangling blank lines after the header when there are no blocks at all).
    return (instructions + payload).strip()
|
||||
|
||||
|
||||
def _parse_translated_response(translated_text):
|
||||
pattern = r"<<<BLOCK_(\d+)>>>\n?(.*?)(?=(?:\n<<<BLOCK_\d+>>>|\Z))"
|
||||
matches = re.findall(pattern, translated_text, re.DOTALL)
|
||||
|
||||
result = {}
|
||||
for block_id, text in matches:
|
||||
result[int(block_id)] = text.strip()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _chunked(seq, size):
|
||||
for i in range(0, len(seq), size):
|
||||
yield seq[i:i + size]
|
||||
|
||||
|
||||
def translate_blocks(blocks):
    """Add a ``translated_text`` entry to every subtitle block, in place.

    When ``OPENAI_API_KEY`` is unset or blank, no request is made and each
    block receives its source text prefixed with ``"[DA] "``. Otherwise the
    blocks are sent in batches of ``BATCH_SIZE`` through the OpenAI Responses
    API, using the model named by ``OPENAI_MODEL`` (default
    ``"gpt-4.1-mini"``).

    A block keeps its source text when the response contains no translation
    for its index, or when the translation comes back empty.

    Args:
        blocks: List of dicts providing ``index`` and ``text`` keys.

    Returns:
        The same ``blocks`` list, with ``translated_text`` set on each item.

    Raises:
        Any exception raised by the OpenAI client propagates to the caller;
        in that case no block has been modified.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()

    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    # Results are collected here and written back only after every batch has
    # succeeded, so a failing request never leaves the list half-translated.
    translated_map = {}

    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)

        response = client.responses.create(
            model=model,
            input=_build_batch_prompt(batch),
        )
        parsed = _parse_translated_response(response.output_text)

        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )

        for block in batch:
            # `or` covers both a missing index and an empty translation, so a
            # blank model answer cannot wipe out a subtitle line.
            translated_map[block["index"]] = parsed.get(block["index"]) or block["text"]

    for block in blocks:
        block["translated_text"] = translated_map[block["index"]]

    return blocks
|
||||
Loading…
Add table
Add a link
Reference in a new issue