Make SubFox production-ready with parallel translation and UI controls

This commit is contained in:
Eddie Nielsen 2026-03-25 11:24:54 +00:00
parent c40b8bed2b
commit 2b1d05f02c
6046 changed files with 798327 additions and 0 deletions

0
app/services/__init__.py Normal file
View file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1 @@
# Module-level dictionary, created empty when the module is imported.
# NOTE(review): presumably an in-memory registry of jobs keyed by job id --
# confirm against the code that reads and writes it.
jobs = {}

View file

@ -0,0 +1,10 @@
def build_srt(blocks):
    """Serialize subtitle blocks into SRT-formatted text.

    Every block contributes four lines: its index, a ``start --> end``
    timing line, its text (``translated_text`` when that key is present,
    otherwise ``text``) and an empty separator line.  All lines are joined
    with newline characters.
    """
    lines = []
    for block in blocks:
        lines.extend(
            [
                str(block["index"]),
                f'{block["start"]} --> {block["end"]}',
                block.get("translated_text", block["text"]),
                "",
            ]
        )
    return "\n".join(lines)

View file

@ -0,0 +1,35 @@
import re
def parse_srt(content: str):
    """Parse SRT subtitle text into a list of block dictionaries.

    Line endings are normalized to ``\\n`` and the text is split into blocks
    on blank lines.  A block is kept only when it has at least three
    non-empty lines, its first line is an integer index and its second line
    contains `` --> ``; every other block is skipped silently.

    Args:
        content: Raw SRT text, optionally starting with a byte-order mark.

    Returns:
        A list of dicts with the keys ``index`` (int), ``start``, ``end``
        and ``text`` (str; multi-line text is joined with ``\\n``).
    """
    # A byte-order mark is not whitespace, so strip() would leave it glued to
    # the first index and int() would reject it, dropping the first subtitle.
    content = content.lstrip("\ufeff")
    content = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for part in re.split(r"\n\s*\n", content):
        lines = [line.rstrip() for line in part.split("\n") if line.strip() != ""]
        if len(lines) < 3:
            continue

        # Only the index conversion can fail here, so catch just that error.
        try:
            index = int(lines[0].strip())
        except ValueError:
            continue

        times = lines[1].strip()
        if " --> " not in times:
            continue
        start, end = times.split(" --> ", 1)

        blocks.append(
            {
                "index": index,
                "start": start,
                "end": end,
                "text": "\n".join(lines[2:]).strip(),
            }
        )
    return blocks

View file

@ -0,0 +1,193 @@
import re
from app.translators.factory import get_translator
def parse_srt(content: str):
    """Parse SRT subtitle text into a list of block dictionaries.

    Line endings are normalized to ``\\n`` and the text is split into raw
    blocks on blank lines.  A raw block is kept only when it has at least
    three lines, its first line is an integer index and its second line
    contains ``-->``; anything else is skipped silently.

    Args:
        content: Raw SRT text, optionally starting with a byte-order mark.

    Returns:
        A list of dicts with the keys ``index`` (int), ``start``, ``end``
        and ``text`` (str; multi-line text is joined with ``\\n``).
    """
    # Drop a leading byte-order mark first: strip() does not treat it as
    # whitespace, and int() would otherwise reject the first block's index
    # so that the first subtitle silently disappears.
    content = content.lstrip("\ufeff")
    content = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for raw in re.split(r"\n\s*\n", content):
        lines = raw.strip().split("\n")
        if len(lines) < 3:
            continue
        try:
            index = int(lines[0].strip())
        except ValueError:
            continue
        if "-->" not in lines[1]:
            continue
        start, end = [x.strip() for x in lines[1].split("-->", 1)]
        blocks.append(
            {
                "index": index,
                "start": start,
                "end": end,
                "text": "\n".join(lines[2:]).strip(),
            }
        )
    return blocks
def build_srt(blocks):
    """Render subtitle blocks as SRT text.

    Each block becomes three newline-terminated lines (index, timing line
    and ``text``); the rendered blocks are then joined with a newline, which
    leaves one empty line between consecutive blocks.
    """
    return "\n".join(
        f"{b['index']}\n{b['start']} --> {b['end']}\n{b['text']}\n"
        for b in blocks
    )
def _make_translator(mode=None, source_lang="auto", target_lang="da"):
    """Create a translator via ``get_translator``, probing several signatures.

    ``get_translator`` is called with progressively fewer arguments
    (keyword and positional forms) until one call returns a value that is
    not ``None``.  Calls that raise are treated as "signature not supported"
    and the next form is tried.

    Args:
        mode: Translator mode forwarded to ``get_translator`` when accepted.
        source_lang: Source language forwarded when accepted.
        target_lang: Target language forwarded when accepted.

    Returns:
        The first non-``None`` object returned by ``get_translator``.

    Raises:
        RuntimeError: If every attempt raised or returned ``None``.  The
            last exception seen, if any, is attached as the cause.
    """
    attempts = [
        lambda: get_translator(mode=mode, source_lang=source_lang, target_lang=target_lang),
        lambda: get_translator(mode, source_lang, target_lang),
        lambda: get_translator(mode=mode),
        lambda: get_translator(mode),
        lambda: get_translator(),
    ]
    last_error = None
    for attempt in attempts:
        try:
            translator = attempt()
        except Exception as e:  # deliberate best-effort probing of signatures
            last_error = e
            continue
        if translator is not None:
            return translator
    # Chain the last failure so its traceback is not lost; the message alone
    # only carries str(last_error).
    raise RuntimeError(
        f"Could not create translator via get_translator(): {last_error}"
    ) from last_error
def _translate_blocks(
translator,
blocks,
source_lang="auto",
target_lang="da",
progress_callback=None,
):
if hasattr(translator, "translate_blocks"):
return translator.translate_blocks(
blocks,
source_lang,
target_lang,
progress_callback=progress_callback,
)
if hasattr(translator, "translate_batch"):
translated_texts = translator.translate_batch(blocks, source_lang, target_lang)
output = []
total = len(blocks)
for i, (block, translated_text) in enumerate(zip(blocks, translated_texts), start=1):
new_block = dict(block)
if isinstance(translated_text, dict):
translated_text = (
translated_text.get("text")
or translated_text.get("translated_text")
or translated_text.get("translation")
or block["text"]
)
new_block["text"] = str(translated_text).strip()
output.append(new_block)
if progress_callback:
progress_callback(i, total)
return output
if hasattr(translator, "translate"):
output = []
total = len(blocks)
for i, block in enumerate(blocks, start=1):
new_block = dict(block)
try:
translated = translator.translate(block["text"], source_lang, target_lang)
except TypeError:
translated = translator.translate(block["text"])
new_block["text"] = str(translated).strip()
output.append(new_block)
if progress_callback:
progress_callback(i, total)
return output
if callable(translator):
output = []
total = len(blocks)
for i, block in enumerate(blocks, start=1):
new_block = dict(block)
try:
translated = translator(block["text"], source_lang, target_lang)
except TypeError:
translated = translator(block["text"])
new_block["text"] = str(translated).strip()
output.append(new_block)
if progress_callback:
progress_callback(i, total)
return output
raise TypeError(f"Unsupported translator interface: {type(translator)}")
def translate_srt_content(
    content,
    translator=None,
    mode=None,
    source_lang="auto",
    target_lang="da",
    job_id=None,
    progress_callback=None,
    **kwargs,
):
    """Translate SRT text and return the rebuilt SRT string.

    The content is parsed with ``parse_srt``, translated through
    ``_translate_blocks`` and serialized with ``build_srt``.  When no
    ``translator`` is supplied, one is created with ``_make_translator``.
    ``job_id`` and any extra keyword arguments are only echoed in the debug
    output; progress of the run is printed to stdout as ``DEBUG:`` lines.

    Raises:
        ValueError: If no SRT block can be parsed from ``content``.
    """
    print("DEBUG: ===== START translate_srt_content =====")
    call_details = (
        ("DEBUG: mode:", mode),
        ("DEBUG: source_lang:", source_lang),
        ("DEBUG: target_lang:", target_lang),
        ("DEBUG: job_id:", job_id),
        ("DEBUG: extra kwargs:", kwargs),
    )
    for label, value in call_details:
        print(label, value)

    blocks = parse_srt(content)
    print("DEBUG: blocks count:", len(blocks))
    first_block = blocks[0] if blocks else "NONE"
    print("DEBUG: first block:", first_block)
    if not blocks:
        raise ValueError("No SRT blocks could be parsed from content")

    languages = {"source_lang": source_lang, "target_lang": target_lang}
    if translator is None:
        translator = _make_translator(mode=mode, **languages)
        print("DEBUG: translator created:", type(translator))

    translated_blocks = _translate_blocks(
        translator,
        blocks,
        progress_callback=progress_callback,
        **languages,
    )
    print("DEBUG: translated blocks count:", len(translated_blocks))
    print("DEBUG: ===== END translate_srt_content =====")
    return build_srt(translated_blocks)

View file

@ -0,0 +1,88 @@
import os
import re
from openai import OpenAI
# Template for the marker line written before each block's text in the
# prompt; ``{index}`` is filled with the block's index.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
# Number of subtitle blocks sent to the model in a single request.
BATCH_SIZE = 50
def _build_batch_prompt(blocks):
    """Build the translation prompt for one batch of subtitle blocks.

    Every block is rendered as its ``BLOCK_MARKER`` line, its text and an
    empty line.  The rendered blocks are appended to a fixed set of
    English-to-Danish translation instructions, and the prompt is returned
    with surrounding whitespace stripped.
    """
    segments = []
    for entry in blocks:
        segments += [BLOCK_MARKER.format(index=entry["index"]), entry["text"], ""]
    joined = "\n".join(segments)
    prompt = f"""
You are translating subtitle text from English to Danish.
Rules:
- Translate naturally into short, readable Danish suitable for subtitles.
- Keep each block marker exactly unchanged.
- Do not add explanations.
- Do not remove markers.
- Return only the translated blocks.
Text to translate:
{joined}
"""
    return prompt.strip()
def _parse_translated_response(translated_text):
pattern = r"<<<BLOCK_(\d+)>>>\n?(.*?)(?=(?:\n<<<BLOCK_\d+>>>|\Z))"
matches = re.findall(pattern, translated_text, re.DOTALL)
result = {}
for block_id, text in matches:
result[int(block_id)] = text.strip()
return result
def _chunked(seq, size):
for i in range(0, len(seq), size):
yield seq[i:i + size]
def translate_blocks(blocks):
    """Add a ``translated_text`` entry to every block and return ``blocks``.

    The blocks are modified in place.  Without an ``OPENAI_API_KEY`` in the
    environment a fallback is used that prefixes each text with ``"[DA] "``.
    Otherwise the blocks are sent to the OpenAI Responses API in batches of
    ``BATCH_SIZE`` using the model named by ``OPENAI_MODEL`` (default
    ``gpt-4.1-mini``); a block whose translation cannot be found in the
    parsed response keeps its original text.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()
    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for entry in blocks:
            entry["translated_text"] = "[DA] " + entry["text"]
        return blocks

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    translated_map = {}
    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)
        response = client.responses.create(
            model=model,
            input=_build_batch_prompt(batch),
        )
        parsed = _parse_translated_response(response.output_text)
        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )
        # Fall back to the source text for any block the model left out.
        translated_map.update(
            {item["index"]: parsed.get(item["index"], item["text"]) for item in batch}
        )

    for entry in blocks:
        entry["translated_text"] = translated_map.get(entry["index"], entry["text"])
    return blocks