Make SubFox production-ready with parallel translation and UI controls
This commit is contained in:
parent
c40b8bed2b
commit
2b1d05f02c
6046 changed files with 798327 additions and 0 deletions
0
app/services/__init__.py
Normal file
0
app/services/__init__.py
Normal file
BIN
app/services/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/job_store.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/job_store.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/srt_builder.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/srt_builder.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/srt_parser.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/srt_parser.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/subtitle_service.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/subtitle_service.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/translator.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/translator.cpython-310.pyc
Normal file
Binary file not shown.
1
app/services/job_store.py
Normal file
1
app/services/job_store.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# Module-level dictionary; every importer of this module shares the same object.
# NOTE(review): presumably maps job ids to job state/progress - confirm against callers.
jobs = {}
|
||||
10
app/services/srt_builder.py
Normal file
10
app/services/srt_builder.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
def build_srt(blocks):
    """Serialize subtitle blocks into SRT text.

    Each block is a mapping with "index", "start" and "end" entries plus the
    cue text. "translated_text" is emitted when it is present and not None;
    otherwise the block's "text" entry is used.

    Args:
        blocks: Iterable of block mappings.

    Returns:
        The SRT document as a single string. Every cue is followed by an
        empty line, so a non-empty result ends with a newline. An empty
        iterable yields "".

    Raises:
        KeyError: If a block lacks "index", "start" or "end", or has neither
            a usable "translated_text" nor a "text" entry.
    """
    lines = []
    for block in blocks:
        lines.append(str(block["index"]))
        lines.append(f'{block["start"]} --> {block["end"]}')

        # Resolve the fallback lazily: an eager block["text"] default would
        # raise KeyError for blocks that only carry "translated_text".
        text = block.get("translated_text")
        if text is None:
            text = block["text"]
        lines.append(str(text))

        # Blank separator line between cues.
        lines.append("")

    return "\n".join(lines)
|
||||
35
app/services/srt_parser.py
Normal file
35
app/services/srt_parser.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
import re
|
||||
|
||||
|
||||
def parse_srt(content: str):
    """Parse SRT text into a list of subtitle block dictionaries.

    The text is split into cues on blank lines. A cue is kept only when it
    has at least three non-empty lines, its first line is an integer index
    and its second line contains the "-->" timing arrow. Anything else is
    skipped.

    Args:
        content: Full text of an SRT document. CRLF and bare CR line endings
            are accepted, as is a leading byte order mark.

    Returns:
        A list of dicts with the keys "index" (int), "start", "end" and
        "text" (str, cue lines joined with "\\n").
    """
    # A byte order mark is not whitespace, so strip() would leave it in
    # place and int() would reject the first cue's index line.
    content = content.lstrip("\ufeff")
    content = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for part in re.split(r"\n\s*\n", content):
        lines = [line.rstrip() for line in part.split("\n") if line.strip()]
        if len(lines) < 3:
            continue

        try:
            index = int(lines[0].strip())
        except ValueError:
            # Not a cue header (for example stray text between cues).
            continue

        # Accept the arrow with or without surrounding spaces.
        start, arrow, end = lines[1].partition("-->")
        if not arrow:
            continue

        blocks.append({
            "index": index,
            "start": start.strip(),
            "end": end.strip(),
            "text": "\n".join(lines[2:]).strip(),
        })

    return blocks
|
||||
193
app/services/subtitle_service.py
Normal file
193
app/services/subtitle_service.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
import re
|
||||
from app.translators.factory import get_translator
|
||||
|
||||
|
||||
def parse_srt(content: str):
    """Parse SRT text into a list of subtitle block dictionaries.

    Cues are separated by blank lines. A cue is kept only when it has at
    least three lines, an integer index on the first line and a "-->" arrow
    on the second; other chunks are skipped.

    Args:
        content: Full text of an SRT document. CRLF and bare CR line endings
            are accepted, as is a leading byte order mark.

    Returns:
        A list of dicts with the keys "index" (int), "start", "end" and
        "text" (str, cue lines joined with "\\n").
    """
    # A byte order mark is not whitespace, so strip() alone would leave it
    # attached to the first index line and that cue would be discarded.
    content = content.lstrip("\ufeff")
    content = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for raw in re.split(r"\n\s*\n", content):
        lines = raw.strip().split("\n")
        if len(lines) < 3:
            continue

        try:
            index = int(lines[0].strip())
        except ValueError:
            continue

        start, arrow, end = lines[1].partition("-->")
        if not arrow:
            continue

        blocks.append(
            {
                "index": index,
                "start": start.strip(),
                "end": end.strip(),
                "text": "\n".join(lines[2:]).strip(),
            }
        )

    return blocks
|
||||
|
||||
|
||||
def build_srt(blocks):
    """Serialize subtitle blocks into SRT text.

    Args:
        blocks: Iterable of mappings with "index", "start", "end" and "text"
            entries. When a block also carries a non-None "translated_text"
            entry, that value is written instead of "text".

    Returns:
        The SRT document as one string: cues separated by a blank line, with
        a single trailing newline. An empty iterable yields "".

    Raises:
        KeyError: If a block lacks one of the required entries.
    """
    cues = []
    for b in blocks:
        # Prefer an explicit translation so that blocks produced by
        # translators which store their output under "translated_text" are
        # not emitted with the untranslated source text.
        text = b.get("translated_text")
        if text is None:
            text = b["text"]
        cues.append(f"{b['index']}\n{b['start']} --> {b['end']}\n{text}\n")
    return "\n".join(cues)
|
||||
|
||||
|
||||
def _make_translator(mode=None, source_lang="auto", target_lang="da"):
    """Create a translator through ``get_translator``, probing call styles.

    ``get_translator`` is called with progressively fewer arguments (keyword
    form, positional form, mode only, no arguments) until one call returns a
    non-None object. Any exception from an attempt is remembered and the
    next style is tried.

    Args:
        mode: Translator mode passed to ``get_translator`` where accepted.
        source_lang: Source language passed where accepted.
        target_lang: Target language passed where accepted.

    Returns:
        The first non-None object returned by ``get_translator``.

    Raises:
        RuntimeError: If no attempt produced a translator. The last exception
            seen, if any, is attached as the cause.
    """
    attempts = (
        lambda: get_translator(mode=mode, source_lang=source_lang, target_lang=target_lang),
        lambda: get_translator(mode, source_lang, target_lang),
        lambda: get_translator(mode=mode),
        lambda: get_translator(mode),
        lambda: get_translator(),
    )

    last_error = None
    for attempt in attempts:
        try:
            translator = attempt()
        except Exception as exc:
            # Deliberate best effort: the factory signature is unknown here,
            # so a failure only means "try the next call style".
            last_error = exc
            continue
        if translator is not None:
            return translator

    if last_error is None:
        raise RuntimeError(
            "Could not create translator via get_translator(): every call returned None"
        )

    # Chain the cause so the original traceback is not lost.
    raise RuntimeError(
        f"Could not create translator via get_translator(): {last_error}"
    ) from last_error
|
||||
|
||||
|
||||
def _invoke_translator(func, text, source_lang, target_lang):
    """Call ``func`` with language arguments, retrying with the text alone on TypeError."""
    try:
        return func(text, source_lang, target_lang)
    except TypeError:
        return func(text)


def _with_translated_text(block, translated):
    """Return a copy of ``block`` whose "text" holds the translation.

    Dict results are unwrapped via their "text", "translated_text" or
    "translation" entry. A missing (None) or empty unwrapped result keeps the
    block's original text.
    """
    new_block = dict(block)
    if isinstance(translated, dict):
        translated = (
            translated.get("text")
            or translated.get("translated_text")
            or translated.get("translation")
            or None
        )
    if translated is None:
        translated = block["text"]
    new_block["text"] = str(translated).strip()
    return new_block


def _translate_blocks(
    translator,
    blocks,
    source_lang="auto",
    target_lang="da",
    progress_callback=None,
):
    """Translate subtitle blocks with whichever interface ``translator`` offers.

    Interfaces are probed in this order:

    1. ``translator.translate_blocks(blocks, source_lang, target_lang,
       progress_callback=...)`` - its return value is passed through as is.
    2. ``translator.translate_batch(blocks, source_lang, target_lang)`` -
       results are paired with the blocks by position.
    3. ``translator.translate(text, source_lang, target_lang)``, falling back
       to ``translator.translate(text)`` on TypeError.
    4. ``translator(text, source_lang, target_lang)``, falling back to
       ``translator(text)`` on TypeError.

    For interfaces 2-4 the input blocks are not modified; copies with a new
    "text" entry are returned.

    Args:
        translator: Object or callable providing one of the interfaces above.
        blocks: Sized sequence of block dicts with a "text" entry.
        source_lang: Source language forwarded to the translator.
        target_lang: Target language forwarded to the translator.
        progress_callback: Optional callable invoked as ``(done, total)``
            after each block (interfaces 2-4).

    Returns:
        The translated blocks.

    Raises:
        TypeError: If ``translator`` offers none of the supported interfaces.
    """
    if hasattr(translator, "translate_blocks"):
        return translator.translate_blocks(
            blocks,
            source_lang,
            target_lang,
            progress_callback=progress_callback,
        )

    if hasattr(translator, "translate_batch"):
        batch_results = list(
            translator.translate_batch(blocks, source_lang, target_lang)
        )

        def result_for(position, block):
            # A short result list must not drop subtitles: blocks without a
            # result keep their original text instead of vanishing.
            if position < len(batch_results):
                return batch_results[position]
            return None

    elif hasattr(translator, "translate"):

        def result_for(position, block):
            return _invoke_translator(
                translator.translate, block["text"], source_lang, target_lang
            )

    elif callable(translator):

        def result_for(position, block):
            return _invoke_translator(
                translator, block["text"], source_lang, target_lang
            )

    else:
        raise TypeError(f"Unsupported translator interface: {type(translator)}")

    total = len(blocks)
    output = []
    for position, block in enumerate(blocks):
        output.append(_with_translated_text(block, result_for(position, block)))
        if progress_callback:
            progress_callback(position + 1, total)

    return output
|
||||
|
||||
|
||||
def translate_srt_content(
    content,
    translator=None,
    mode=None,
    source_lang="auto",
    target_lang="da",
    job_id=None,
    progress_callback=None,
    **kwargs,
):
    """Translate an SRT document and return the rebuilt SRT text.

    The content is parsed with ``parse_srt``, translated through
    ``_translate_blocks`` and serialized again with ``build_srt``. Debug
    information is printed to stdout along the way.

    Args:
        content: SRT text to translate.
        translator: Translator to use; when None one is created with
            ``_make_translator`` from ``mode`` and the language arguments.
        mode: Translator mode, used only when a translator has to be created.
        source_lang: Source language forwarded to the translator.
        target_lang: Target language forwarded to the translator.
        job_id: Only included in the debug output.
        progress_callback: Forwarded to ``_translate_blocks``.
        **kwargs: Extra keyword arguments; only included in the debug output.

    Returns:
        The translated document as SRT text.

    Raises:
        ValueError: If no SRT block could be parsed from ``content``.
    """
    print("DEBUG: ===== START translate_srt_content =====")
    debug_fields = (
        ("DEBUG: mode:", mode),
        ("DEBUG: source_lang:", source_lang),
        ("DEBUG: target_lang:", target_lang),
        ("DEBUG: job_id:", job_id),
        ("DEBUG: extra kwargs:", kwargs),
    )
    for label, value in debug_fields:
        print(label, value)

    parsed = parse_srt(content)
    first_block = parsed[0] if parsed else "NONE"
    print("DEBUG: blocks count:", len(parsed))
    print("DEBUG: first block:", first_block)

    if not parsed:
        raise ValueError("No SRT blocks could be parsed from content")

    engine = translator
    if engine is None:
        engine = _make_translator(
            mode=mode,
            source_lang=source_lang,
            target_lang=target_lang,
        )
        print("DEBUG: translator created:", type(engine))

    translated = _translate_blocks(
        engine,
        parsed,
        source_lang=source_lang,
        target_lang=target_lang,
        progress_callback=progress_callback,
    )

    print("DEBUG: translated blocks count:", len(translated))
    print("DEBUG: ===== END translate_srt_content =====")

    return build_srt(translated)
|
||||
88
app/services/translator.py
Normal file
88
app/services/translator.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import os
|
||||
import re
|
||||
from openai import OpenAI
|
||||
|
||||
# Template for the delimiter line written before each block's text in a
# translation prompt; "{index}" is filled in with the block's index.
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
|
||||
# Number of subtitle blocks placed in a single translation request.
BATCH_SIZE = 50
|
||||
|
||||
|
||||
def _build_batch_prompt(blocks):
    """Build the English-to-Danish translation prompt for a batch of blocks.

    Every block contributes its marker line (``BLOCK_MARKER`` formatted with
    the block's "index"), its "text" and an empty line. The fixed
    instructions precede that payload, and surrounding whitespace is stripped
    from the final prompt.

    Args:
        blocks: Iterable of block dicts with "index" and "text" entries.

    Returns:
        The prompt text.
    """
    segments = []
    for entry in blocks:
        segments.extend(
            (BLOCK_MARKER.format(index=entry["index"]), entry["text"], "")
        )
    payload = "\n".join(segments)

    instructions = (
        "You are translating subtitle text from English to Danish.\n"
        "\n"
        "Rules:\n"
        "- Translate naturally into short, readable Danish suitable for subtitles.\n"
        "- Keep each block marker exactly unchanged.\n"
        "- Do not add explanations.\n"
        "- Do not remove markers.\n"
        "- Return only the translated blocks.\n"
        "\n"
        "Text to translate:\n"
        "\n"
    )
    return (instructions + payload).strip()
|
||||
|
||||
|
||||
def _parse_translated_response(translated_text):
    """Extract per-block translations from a marker-delimited model reply.

    The reply is split on ``<<<BLOCK_n>>>`` markers wherever they occur, so
    markers that share a line with text (or with each other) are still
    recognised. Text before the first marker is ignored. When a marker
    number repeats, the last occurrence wins.

    Args:
        translated_text: Raw reply text; None or "" yields an empty result.

    Returns:
        A dict mapping the integer block number to its stripped text.
    """
    if not translated_text:
        return {}

    # With one capturing group, re.split returns
    # [preamble, id_1, text_1, id_2, text_2, ...].
    pieces = re.split(r"<<<BLOCK_(\d+)>>>", translated_text)

    result = {}
    for block_id, text in zip(pieces[1::2], pieces[2::2]):
        result[int(block_id)] = text.strip()
    return result
|
||||
|
||||
|
||||
def _chunked(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items.

    Args:
        seq: Sliceable sequence with a length.
        size: Maximum number of items per slice; must be at least 1.

    Yields:
        Slices of ``seq`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``size`` is smaller than 1. Raised when iteration
            starts, because this is a generator.
    """
    # A negative step would make range() empty and silently yield nothing,
    # which would discard every item; reject it like a zero step.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    for start in range(0, len(seq), size):
        yield seq[start:start + size]
|
||||
|
||||
|
||||
def translate_blocks(blocks):
    """Add a "translated_text" entry to every block, in place.

    Without an ``OPENAI_API_KEY`` environment variable a placeholder
    translation ("[DA] " followed by the source text) is used. Otherwise the
    blocks are sent to the OpenAI Responses API in batches of ``BATCH_SIZE``
    using the model named by ``OPENAI_MODEL`` (default "gpt-4.1-mini").

    A block whose translation is missing from, or empty in, the model reply
    keeps its source text as "translated_text".

    Args:
        blocks: List of block dicts with "index" and "text" entries. The
            dicts are modified in place.

    Returns:
        The same ``blocks`` object that was passed in.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()

    if not api_key:
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    # Collect (block, translation) pairs per batch rather than in one map
    # keyed by index: files with repeated cue numbers would otherwise have a
    # later batch overwrite the translation of an earlier block.
    pending = []

    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)

        prompt = _build_batch_prompt(batch)

        response = client.responses.create(
            model=model,
            input=prompt,
        )

        parsed = _parse_translated_response(response.output_text)

        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )

        for block in batch:
            # An empty reply for a block would blank the subtitle; fall back
            # to the source text exactly as for a missing marker.
            pending.append((block, parsed.get(block["index"]) or block["text"]))

    # Assign only after every request succeeded, so a failure part-way
    # through leaves the input blocks untouched.
    for block, translated in pending:
        block["translated_text"] = translated

    return blocks
|
||||
Loading…
Add table
Add a link
Reference in a new issue