Make SubFox production-ready with parallel translation and UI controls
This commit is contained in:
parent
c40b8bed2b
commit
2b1d05f02c
6046 changed files with 798327 additions and 0 deletions
BIN
app/__pycache__/main.cpython-310.pyc
Normal file
BIN
app/__pycache__/main.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/__pycache__/srt_parser.cpython-310.pyc
Normal file
BIN
app/__pycache__/srt_parser.cpython-310.pyc
Normal file
Binary file not shown.
0
app/app/__init__.py
Normal file
0
app/app/__init__.py
Normal file
7
app/app/main.py
Normal file
7
app/app/main.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
from fastapi import FastAPI

# Minimal application instance exposing only a liveness endpoint.
app = FastAPI()


@app.get("/")
def root():
    """Health-check endpoint confirming the service is up."""
    return {"status": "SubFox alive"}
|
||||
0
app/app/services/__init__.py
Normal file
0
app/app/services/__init__.py
Normal file
0
app/app/translators/__init__.py
Normal file
0
app/app/translators/__init__.py
Normal file
52
app/cache.py
Normal file
52
app/cache.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import os
|
||||
import json
|
||||
import hashlib
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
CACHE_ROOT = Path("/data/cache")
|
||||
|
||||
|
||||
def _hash_key(source_lang: str, target_lang: str, text: str) -> str:
|
||||
raw = f"{source_lang}:{target_lang}:{text.strip()}"
|
||||
return hashlib.sha256(raw.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _get_path(source_lang: str, target_lang: str, key: str) -> Path:
    """Return the cache file path for *key*, creating the language-pair folder."""
    pair_dir = CACHE_ROOT / f"{source_lang}_{target_lang}"
    pair_dir.mkdir(parents=True, exist_ok=True)
    return pair_dir / f"{key}.json"
|
||||
|
||||
|
||||
def get_cached(source_lang: str, target_lang: str, text: str):
    """Return the cached translation for *text*, or None on miss/corruption."""
    cache_file = _get_path(
        source_lang, target_lang, _hash_key(source_lang, target_lang, text)
    )

    if not cache_file.exists():
        return None

    # Any unreadable or malformed entry is treated as a cache miss.
    try:
        with open(cache_file, "r", encoding="utf-8") as handle:
            entry = json.load(handle)
    except Exception:
        return None
    return entry.get("translated")
|
||||
|
||||
|
||||
def set_cache(source_lang: str, target_lang: str, text: str, translated: str, model: str):
    """Persist one translation atomically (write a temp file, then rename)."""
    cache_file = _get_path(
        source_lang, target_lang, _hash_key(source_lang, target_lang, text)
    )

    payload = {
        "source": text,
        "translated": translated,
        "model": model,
        "created": int(time.time()),
    }

    tmp_path = cache_file.with_suffix(".tmp")
    with open(tmp_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False)

    # os.replace is atomic on POSIX: readers see the old entry or the new one.
    os.replace(tmp_path, cache_file)
|
||||
157
app/main.py
Normal file
157
app/main.py
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, UploadFile, File, Form, Request
|
||||
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from app.services.job_store import jobs
|
||||
from app.services.subtitle_service import translate_srt_content, parse_srt
|
||||
|
||||
app = FastAPI()
# Jinja2 templates served from app/templates (index.html lives there).
templates = Jinja2Templates(directory="app/templates")

# Translated files are written here. Created eagerly at import time so the
# first job never races directory creation.
# NOTE(review): this path is relative while app/cache.py uses absolute
# /data/cache — confirm the working directory is the project root in deploy.
OUTPUT_DIR = Path("data/output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the single-page upload UI."""
    context = {}
    return templates.TemplateResponse(
        request=request,
        name="index.html",
        context=context,
    )
|
||||
|
||||
|
||||
@app.post("/start")
async def start_translation(
    file: UploadFile = File(...),
    mode: str = Form("fast"),
    source_lang: str = Form("auto"),
    target_lang: str = Form("da"),
    model: str = Form("gpt-4o-mini"),
    workers: int = Form(4),
):
    """Accept an SRT upload, register a job, and translate on a daemon thread.

    Returns {"job_id": ...} immediately; progress is exposed via
    /status/{job_id} and the result via /download/{job_id}.
    """
    # utf-8-sig strips a leading BOM, which is common in SRT files.
    try:
        raw = await file.read()
        content = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Danish user-facing message: "Could not read the file as UTF-8/UTF-8-SIG".
        return JSONResponse(
            {"error": "Kunne ikke læse filen som UTF-8/UTF-8-SIG"},
            status_code=400,
        )

    # Clamp the UI-supplied worker count to a sane range.
    if workers < 1:
        workers = 1
    if workers > 16:
        workers = 16

    job_id = str(uuid.uuid4())
    output_path = OUTPUT_DIR / f"{job_id}.srt"

    # Pre-count blocks so the UI can show a total before translation starts;
    # a parse failure here is non-fatal (the worker parses again and reports).
    try:
        parsed_blocks = parse_srt(content)
        block_count = len(parsed_blocks)
    except Exception:
        block_count = 0

    # Job record read by /status and /download. In-memory and process-local.
    jobs[job_id] = {
        "status": "queued",
        "progress": 0,
        "filename": file.filename or "translated.srt",
        "output_path": str(output_path),
        "blocks": block_count,
        "error": "",
        "done": False,
        "mode": mode,
        "source_lang": source_lang,
        "target_lang": target_lang,
        "model": model,
        "workers": workers,
    }

    def progress_callback(done_blocks: int, total_blocks: int):
        # Invoked by the translator (possibly from worker threads) per block.
        if total_blocks <= 0:
            jobs[job_id]["progress"] = 1
            return

        percent = int((done_blocks / total_blocks) * 100)

        # Keep the bar visibly moving: at least 2% once work has started...
        if done_blocks > 0:
            percent = max(2, percent)

        # ...and never 100% until the output file is actually written.
        percent = min(99, percent)

        jobs[job_id]["progress"] = percent
        jobs[job_id]["status"] = "running"
        jobs[job_id]["blocks"] = total_blocks

    def worker():
        # Runs on a daemon thread so /start can return immediately.
        try:
            jobs[job_id]["status"] = "starting"
            jobs[job_id]["progress"] = 1

            translated_srt = translate_srt_content(
                content=content,
                mode=mode,
                source_lang=source_lang,
                target_lang=target_lang,
                job_id=job_id,
                progress_callback=progress_callback,
                model=model,
                workers=workers,
            )

            output_path.write_text(translated_srt, encoding="utf-8")

            jobs[job_id]["status"] = "done"
            jobs[job_id]["progress"] = 100
            jobs[job_id]["done"] = True

        except Exception as e:
            # Surface the failure to the UI through the job record.
            jobs[job_id]["status"] = "error"
            jobs[job_id]["progress"] = 0
            jobs[job_id]["error"] = str(e)
            jobs[job_id]["done"] = False

    threading.Thread(target=worker, daemon=True).start()

    return JSONResponse({"job_id": job_id})
|
||||
|
||||
|
||||
@app.get("/status/{job_id}")
async def get_status(job_id: str):
    """Return the current job record, or 404 for an unknown job id."""
    job = jobs.get(job_id)
    if not job:
        payload = {"status": "unknown", "progress": 0, "error": "Job not found"}
        return JSONResponse(payload, status_code=404)
    return JSONResponse(job)
|
||||
|
||||
|
||||
@app.get("/download/{job_id}")
async def download_result(job_id: str):
    """Stream the finished SRT file back to the browser."""
    job = jobs.get(job_id)
    if not job:
        return JSONResponse({"error": "Job not found"}, status_code=404)

    # Refuse downloads until the worker thread has marked the job complete.
    if not job.get("done"):
        return JSONResponse({"error": "File not ready yet"}, status_code=400)

    path = Path(job["output_path"])
    if not path.exists():
        return JSONResponse({"error": "Output file missing"}, status_code=404)

    download_name = f"translated_{job.get('filename', 'translated.srt')}"

    return FileResponse(
        path=path,
        media_type="application/x-subrip",
        filename=download_name,
    )
|
||||
0
app/services/__init__.py
Normal file
0
app/services/__init__.py
Normal file
BIN
app/services/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/job_store.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/job_store.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/srt_builder.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/srt_builder.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/srt_parser.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/srt_parser.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/subtitle_service.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/subtitle_service.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/services/__pycache__/translator.cpython-310.pyc
Normal file
BIN
app/services/__pycache__/translator.cpython-310.pyc
Normal file
Binary file not shown.
1
app/services/job_store.py
Normal file
1
app/services/job_store.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# In-memory registry of translation jobs, keyed by job id (UUID string).
# NOTE(review): process-local and unbounded — entries are never evicted;
# confirm single-process deployment before relying on this for state.
jobs = {}
|
||||
10
app/services/srt_builder.py
Normal file
10
app/services/srt_builder.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
def build_srt(blocks):
    """Serialise block dicts to SRT text, preferring 'translated_text' over 'text'."""
    lines = []
    for block in blocks:
        lines.extend(
            (
                str(block["index"]),
                f'{block["start"]} --> {block["end"]}',
                block.get("translated_text", block["text"]),
                "",
            )
        )
    return "\n".join(lines)
|
||||
35
app/services/srt_parser.py
Normal file
35
app/services/srt_parser.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
import re
|
||||
|
||||
|
||||
def parse_srt(content: str):
    """Parse SRT text into block dicts; malformed blocks are skipped silently."""
    normalized = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    blocks = []
    for chunk in re.split(r"\n\s*\n", normalized):
        # Drop blank lines inside a block; keep the rest right-trimmed.
        lines = [ln.rstrip() for ln in chunk.split("\n") if ln.strip() != ""]

        if len(lines) < 3:
            continue

        try:
            idx = int(lines[0].strip())
            timing = lines[1].strip()

            if " --> " not in timing:
                continue

            start, end = timing.split(" --> ", 1)
            blocks.append(
                {
                    "index": idx,
                    "start": start,
                    "end": end,
                    "text": "\n".join(lines[2:]).strip(),
                }
            )
        except Exception:
            continue

    return blocks
|
||||
193
app/services/subtitle_service.py
Normal file
193
app/services/subtitle_service.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
import re
|
||||
from app.translators.factory import get_translator
|
||||
|
||||
|
||||
def parse_srt(content: str):
    """Split SRT text on blank lines and parse each block into a dict."""
    normalized = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    parsed = []
    for chunk in re.split(r"\n\s*\n", normalized):
        lines = chunk.strip().split("\n")
        if len(lines) < 3:
            continue

        # First line must be the numeric block index.
        try:
            idx = int(lines[0].strip())
        except ValueError:
            continue

        # Second line must carry the timing arrow.
        if "-->" not in lines[1]:
            continue

        start, end = (part.strip() for part in lines[1].split("-->", 1))

        parsed.append(
            {
                "index": idx,
                "start": start,
                "end": end,
                "text": "\n".join(lines[2:]).strip(),
            }
        )

    return parsed
|
||||
|
||||
|
||||
def build_srt(blocks):
    """Render block dicts back to SRT text (blocks separated by blank lines)."""
    return "\n".join(
        f"{b['index']}\n{b['start']} --> {b['end']}\n{b['text']}\n" for b in blocks
    )
|
||||
|
||||
|
||||
def _make_translator(mode=None, source_lang="auto", target_lang="da"):
    """Instantiate a translator via get_translator(), probing call signatures.

    get_translator implementations vary, so several call shapes are tried
    from richest to poorest; the first that returns a non-None translator
    wins. Raises RuntimeError when none succeed.
    """
    call_shapes = (
        lambda: get_translator(mode=mode, source_lang=source_lang, target_lang=target_lang),
        lambda: get_translator(mode, source_lang, target_lang),
        lambda: get_translator(mode=mode),
        lambda: get_translator(mode),
        lambda: get_translator(),
    )

    failure = None
    for make in call_shapes:
        try:
            candidate = make()
        except Exception as exc:
            failure = exc
            continue
        if candidate is not None:
            return candidate

    raise RuntimeError(f"Could not create translator via get_translator(): {failure}")
|
||||
|
||||
|
||||
def _translate_blocks(
|
||||
translator,
|
||||
blocks,
|
||||
source_lang="auto",
|
||||
target_lang="da",
|
||||
progress_callback=None,
|
||||
):
|
||||
if hasattr(translator, "translate_blocks"):
|
||||
return translator.translate_blocks(
|
||||
blocks,
|
||||
source_lang,
|
||||
target_lang,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
if hasattr(translator, "translate_batch"):
|
||||
translated_texts = translator.translate_batch(blocks, source_lang, target_lang)
|
||||
|
||||
output = []
|
||||
total = len(blocks)
|
||||
|
||||
for i, (block, translated_text) in enumerate(zip(blocks, translated_texts), start=1):
|
||||
new_block = dict(block)
|
||||
|
||||
if isinstance(translated_text, dict):
|
||||
translated_text = (
|
||||
translated_text.get("text")
|
||||
or translated_text.get("translated_text")
|
||||
or translated_text.get("translation")
|
||||
or block["text"]
|
||||
)
|
||||
|
||||
new_block["text"] = str(translated_text).strip()
|
||||
output.append(new_block)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(i, total)
|
||||
|
||||
return output
|
||||
|
||||
if hasattr(translator, "translate"):
|
||||
output = []
|
||||
total = len(blocks)
|
||||
|
||||
for i, block in enumerate(blocks, start=1):
|
||||
new_block = dict(block)
|
||||
try:
|
||||
translated = translator.translate(block["text"], source_lang, target_lang)
|
||||
except TypeError:
|
||||
translated = translator.translate(block["text"])
|
||||
|
||||
new_block["text"] = str(translated).strip()
|
||||
output.append(new_block)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(i, total)
|
||||
|
||||
return output
|
||||
|
||||
if callable(translator):
|
||||
output = []
|
||||
total = len(blocks)
|
||||
|
||||
for i, block in enumerate(blocks, start=1):
|
||||
new_block = dict(block)
|
||||
try:
|
||||
translated = translator(block["text"], source_lang, target_lang)
|
||||
except TypeError:
|
||||
translated = translator(block["text"])
|
||||
|
||||
new_block["text"] = str(translated).strip()
|
||||
output.append(new_block)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(i, total)
|
||||
|
||||
return output
|
||||
|
||||
raise TypeError(f"Unsupported translator interface: {type(translator)}")
|
||||
|
||||
|
||||
def translate_srt_content(
    content,
    translator=None,
    mode=None,
    source_lang="auto",
    target_lang="da",
    job_id=None,
    progress_callback=None,
    **kwargs,
):
    """Translate a full SRT document and return the rebuilt SRT text.

    Parameters:
        content: raw SRT text (already decoded).
        translator: optional pre-built translator; when None one is created
            from *mode* via _make_translator().
        mode / source_lang / target_lang: translation settings.
        job_id: opaque id, used only for log context.
        progress_callback: optional fn(done, total) called as blocks finish.
        **kwargs: tolerated and ignored (callers pass extra UI settings such
            as model/workers).

    Raises:
        ValueError: when no SRT blocks can be parsed from *content*.

    Fix over the previous version: stdout `print("DEBUG: ...")` tracing is
    replaced with the logging module, so diagnostics can be filtered and
    routed by deployment configuration instead of polluting stdout.
    """
    import logging  # local import keeps this fix self-contained

    logger = logging.getLogger(__name__)
    logger.debug(
        "translate_srt_content start: mode=%s source=%s target=%s job=%s extra=%s",
        mode, source_lang, target_lang, job_id, kwargs,
    )

    blocks = parse_srt(content)
    logger.debug("parsed %d blocks", len(blocks))

    if not blocks:
        raise ValueError("No SRT blocks could be parsed from content")

    if translator is None:
        translator = _make_translator(
            mode=mode,
            source_lang=source_lang,
            target_lang=target_lang,
        )
        logger.debug("translator created: %s", type(translator))

    translated_blocks = _translate_blocks(
        translator,
        blocks,
        source_lang=source_lang,
        target_lang=target_lang,
        progress_callback=progress_callback,
    )

    logger.debug("translated %d blocks", len(translated_blocks))
    return build_srt(translated_blocks)
|
||||
88
app/services/translator.py
Normal file
88
app/services/translator.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import os
|
||||
import re
|
||||
from openai import OpenAI
|
||||
|
||||
BLOCK_MARKER = "<<<BLOCK_{index}>>>"
|
||||
BATCH_SIZE = 50
|
||||
|
||||
|
||||
def _build_batch_prompt(blocks):
|
||||
parts = []
|
||||
for block in blocks:
|
||||
parts.append(BLOCK_MARKER.format(index=block["index"]))
|
||||
parts.append(block["text"])
|
||||
parts.append("")
|
||||
|
||||
joined = "\n".join(parts)
|
||||
|
||||
return f"""
|
||||
You are translating subtitle text from English to Danish.
|
||||
|
||||
Rules:
|
||||
- Translate naturally into short, readable Danish suitable for subtitles.
|
||||
- Keep each block marker exactly unchanged.
|
||||
- Do not add explanations.
|
||||
- Do not remove markers.
|
||||
- Return only the translated blocks.
|
||||
|
||||
Text to translate:
|
||||
|
||||
{joined}
|
||||
""".strip()
|
||||
|
||||
|
||||
def _parse_translated_response(translated_text):
|
||||
pattern = r"<<<BLOCK_(\d+)>>>\n?(.*?)(?=(?:\n<<<BLOCK_\d+>>>|\Z))"
|
||||
matches = re.findall(pattern, translated_text, re.DOTALL)
|
||||
|
||||
result = {}
|
||||
for block_id, text in matches:
|
||||
result[int(block_id)] = text.strip()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _chunked(seq, size):
|
||||
for i in range(0, len(seq), size):
|
||||
yield seq[i:i + size]
|
||||
|
||||
|
||||
def translate_blocks(blocks):
    """Translate all blocks in place via batched OpenAI calls.

    Mutates each block dict by adding a "translated_text" key and returns
    the same list. Without OPENAI_API_KEY a deterministic "[DA] ..." prefix
    fallback is used instead of the API.
    """
    api_key = os.getenv("OPENAI_API_KEY", "").strip()

    if not api_key:
        # Offline fallback: mark lines instead of translating them.
        print("DEBUG: no OPENAI_API_KEY found, using fallback translator", flush=True)
        for block in blocks:
            block["translated_text"] = "[DA] " + block["text"]
        return blocks

    client = OpenAI(api_key=api_key)
    # NOTE(review): this module reads OPENAI_MODEL (default gpt-4.1-mini)
    # while fast_engine reads SUBFOX_MODEL (default gpt-4o-mini) — confirm
    # which env var deployments actually set.
    model = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")

    # index -> translated text, accumulated across batches.
    translated_map = {}

    for batch_num, batch in enumerate(_chunked(blocks, BATCH_SIZE), start=1):
        print(f"DEBUG: translating batch {batch_num} with {len(batch)} blocks", flush=True)

        prompt = _build_batch_prompt(batch)

        response = client.responses.create(
            model=model,
            input=prompt,
        )

        translated_output = response.output_text
        parsed = _parse_translated_response(translated_output)

        print(
            f"DEBUG: batch {batch_num} parsed translations = {len(parsed)}",
            flush=True
        )

        # Any block the model dropped falls back to its original text.
        for block in batch:
            translated_map[block["index"]] = parsed.get(block["index"], block["text"])

    for block in blocks:
        block["translated_text"] = translated_map.get(block["index"], block["text"])

    return blocks
|
||||
41
app/srt_parser.py
Normal file
41
app/srt_parser.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
# app/srt_parser.py
|
||||
import re
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
# One SRT block: numeric index, "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line,
# then the text body up to the next blank line (or end of input).
SRT_BLOCK_RE = re.compile(
    r"(\d+)\s*\n"
    r"(\d{2}:\d{2}:\d{2},\d{3})\s-->\s(\d{2}:\d{2}:\d{2},\d{3})\s*\n"
    r"(.*?)(?=\n{2,}|\Z)",
    re.DOTALL,
)


def parse_srt(content: str) -> List[Dict[str, str]]:
    """Parse SRT text into block dicts (index, start, end, text)."""
    normalized = content.replace("\r\n", "\n").replace("\r", "\n").strip()

    return [
        {
            "index": int(match.group(1)),
            "start": match.group(2),
            "end": match.group(3),
            "text": match.group(4).strip(),
        }
        for match in SRT_BLOCK_RE.finditer(normalized)
    ]
|
||||
|
||||
|
||||
def build_srt(blocks: List[Dict[str, str]]) -> str:
    """Serialise blocks back to SRT text, renumbering sequentially from 1."""
    rendered = [
        f"{position}\n{block['start']} --> {block['end']}\n{block['text']}"
        for position, block in enumerate(blocks, start=1)
    ]
    return "\n\n".join(rendered).strip() + "\n"
|
||||
254
app/templates/index.html
Normal file
254
app/templates/index.html
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>SubFox</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 760px;
margin: 40px auto;
padding: 0 16px;
line-height: 1.4;
}
h1 {
margin-bottom: 8px;
}
.card {
border: 1px solid #ddd;
border-radius: 12px;
padding: 16px;
margin-top: 20px;
}
/* Two-column form layout; .full spans both columns. */
.grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.full {
grid-column: 1 / -1;
}
label {
display: block;
font-weight: 600;
margin-bottom: 6px;
}
input, select, button {
width: 100%;
padding: 10px;
box-sizing: border-box;
}
button {
cursor: pointer;
font-weight: 700;
}
progress {
width: 100%;
height: 22px;
}
.muted {
color: #666;
font-size: 14px;
}
/* Toggled from the script to show/hide status, download, and error UI. */
.hidden {
display: none;
}
#statusBox {
margin-top: 16px;
}
#downloadLink {
display: inline-block;
margin-top: 12px;
font-weight: 700;
}
pre {
white-space: pre-wrap;
word-break: break-word;
background: #f7f7f7;
padding: 12px;
border-radius: 8px;
}
</style>
</head>
<body>
<h1>SubFox</h1>
<div class="muted">Subtitle translator with per-job settings</div>

<div class="card">
<!-- Field names must match the FastAPI /start Form(...) parameters. -->
<form id="uploadForm">
<div class="grid">
<div class="full">
<label for="file">SRT file</label>
<input id="file" name="file" type="file" accept=".srt" required />
</div>

<div>
<label for="mode">Mode</label>
<select id="mode" name="mode">
<option value="fast" selected>fast</option>
</select>
</div>

<div>
<label for="target_lang">Target language</label>
<select id="target_lang" name="target_lang">
<option value="da" selected>Danish</option>
<option value="en">English</option>
<option value="de">German</option>
<option value="sv">Swedish</option>
<option value="no">Norwegian</option>
</select>
</div>

<div>
<label for="source_lang">Source language</label>
<select id="source_lang" name="source_lang">
<option value="auto" selected>auto</option>
<option value="en">English</option>
<option value="da">Danish</option>
<option value="de">German</option>
<option value="sv">Swedish</option>
<option value="no">Norwegian</option>
</select>
</div>

<div>
<label for="model">Model</label>
<select id="model" name="model">
<option value="gpt-4o-mini" selected>gpt-4o-mini</option>
<option value="gpt-4.1-mini">gpt-4.1-mini</option>
</select>
</div>

<div>
<label for="workers">Workers</label>
<!-- Server also clamps workers to 1..16 in /start. -->
<input id="workers" name="workers" type="number" min="1" max="16" value="4" />
</div>

<div class="full">
<button type="submit">Start translation</button>
</div>
</div>
</form>

<!-- Populated by the polling script while a job runs. -->
<div id="statusBox" class="hidden">
<p><strong>Status:</strong> <span id="statusText">queued</span></p>
<p><strong>Progress:</strong> <span id="progressText">0%</span></p>
<progress id="progressBar" value="0" max="100"></progress>

<div class="card">
<div><strong>Job settings</strong></div>
<pre id="jobMeta"></pre>
</div>

<a id="downloadLink" class="hidden" href="#">Download translated file</a>

<div id="errorBox" class="hidden">
<strong>Error</strong>
<pre id="errorText"></pre>
</div>
</div>
</div>
||||
<script>
|
||||
const form = document.getElementById("uploadForm");
|
||||
const statusBox = document.getElementById("statusBox");
|
||||
const statusText = document.getElementById("statusText");
|
||||
const progressText = document.getElementById("progressText");
|
||||
const progressBar = document.getElementById("progressBar");
|
||||
const downloadLink = document.getElementById("downloadLink");
|
||||
const errorBox = document.getElementById("errorBox");
|
||||
const errorText = document.getElementById("errorText");
|
||||
const jobMeta = document.getElementById("jobMeta");
|
||||
|
||||
let pollTimer = null;
|
||||
|
||||
function setStatus(job) {
|
||||
statusBox.classList.remove("hidden");
|
||||
statusText.textContent = job.status ?? "unknown";
|
||||
progressText.textContent = `${job.progress ?? 0}%`;
|
||||
progressBar.value = job.progress ?? 0;
|
||||
|
||||
jobMeta.textContent = JSON.stringify({
|
||||
mode: job.mode,
|
||||
source_lang: job.source_lang,
|
||||
target_lang: job.target_lang,
|
||||
model: job.model,
|
||||
workers: job.workers,
|
||||
blocks: job.blocks
|
||||
}, null, 2);
|
||||
|
||||
if (job.done) {
|
||||
downloadLink.href = `/download/${jobId}`;
|
||||
downloadLink.classList.remove("hidden");
|
||||
} else {
|
||||
downloadLink.classList.add("hidden");
|
||||
}
|
||||
|
||||
if (job.status === "error") {
|
||||
errorBox.classList.remove("hidden");
|
||||
errorText.textContent = job.error || "Unknown error";
|
||||
} else {
|
||||
errorBox.classList.add("hidden");
|
||||
errorText.textContent = "";
|
||||
}
|
||||
}
|
||||
|
||||
let jobId = null;
|
||||
|
||||
async function pollStatus() {
|
||||
if (!jobId) return;
|
||||
|
||||
const res = await fetch(`/status/${jobId}`);
|
||||
const job = await res.json();
|
||||
setStatus(job);
|
||||
|
||||
if (job.done || job.status === "error") {
|
||||
if (pollTimer) {
|
||||
clearTimeout(pollTimer);
|
||||
pollTimer = null;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
pollTimer = setTimeout(pollStatus, 800);
|
||||
}
|
||||
|
||||
form.addEventListener("submit", async (e) => {
|
||||
e.preventDefault();
|
||||
|
||||
if (pollTimer) {
|
||||
clearTimeout(pollTimer);
|
||||
pollTimer = null;
|
||||
}
|
||||
|
||||
downloadLink.classList.add("hidden");
|
||||
errorBox.classList.add("hidden");
|
||||
progressBar.value = 0;
|
||||
progressText.textContent = "0%";
|
||||
statusText.textContent = "uploading...";
|
||||
statusBox.classList.remove("hidden");
|
||||
|
||||
const formData = new FormData(form);
|
||||
|
||||
const res = await fetch("/start", {
|
||||
method: "POST",
|
||||
body: formData
|
||||
});
|
||||
|
||||
const data = await res.json();
|
||||
|
||||
if (!res.ok) {
|
||||
statusText.textContent = "error";
|
||||
errorBox.classList.remove("hidden");
|
||||
errorText.textContent = data.error || "Upload failed";
|
||||
return;
|
||||
}
|
||||
|
||||
jobId = data.job_id;
|
||||
pollStatus();
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
0
app/translators/__init__.py
Normal file
0
app/translators/__init__.py
Normal file
BIN
app/translators/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
app/translators/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/translators/__pycache__/base.cpython-310.pyc
Normal file
BIN
app/translators/__pycache__/base.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/translators/__pycache__/factory.cpython-310.pyc
Normal file
BIN
app/translators/__pycache__/factory.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/translators/__pycache__/fast_engine.cpython-310.pyc
Normal file
BIN
app/translators/__pycache__/fast_engine.cpython-310.pyc
Normal file
Binary file not shown.
BIN
app/translators/__pycache__/smart_engine.cpython-310.pyc
Normal file
BIN
app/translators/__pycache__/smart_engine.cpython-310.pyc
Normal file
Binary file not shown.
13
app/translators/base.py
Normal file
13
app/translators/base.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
|
||||
class BaseTranslator(ABC):
    """Abstract interface every translation engine must implement."""

    @abstractmethod
    def translate_blocks(
        self,
        texts: List[str],
        source_lang: str = "auto",
        target_lang: str = "da",
    ) -> List[str]:
        """Translate *texts* in order, returning one translated string per input."""
        # NOTE(review): FastTranslator.translate_blocks operates on block
        # dicts rather than plain strings — implementations have drifted
        # from this ABC's signature; confirm which contract is intended.
        pass
|
||||
9
app/translators/factory.py
Normal file
9
app/translators/factory.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from .fast_engine import FastTranslator


def get_translator(mode: str = "fast", **kwargs):
    """Return a translator engine for *mode* ("smart", anything else -> fast).

    Fix: richer call sites (app.services.subtitle_service._make_translator
    passes source_lang/target_lang keywords) previously hit a TypeError and
    had to fall back through a retry chain; extra keyword arguments are now
    accepted and ignored so the preferred call shape works directly.
    """
    if mode == "smart":
        # Imported lazily: SmartTranslator requires OPENAI_API_KEY at init.
        from .smart_engine import SmartTranslator
        return SmartTranslator()

    return FastTranslator(max_chunk_chars=3500)
|
||||
151
app/translators/fast_engine.py
Normal file
151
app/translators/fast_engine.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Callable, Dict, List, Optional
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from app.cache import get_cached, set_cache
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
value = os.getenv(name)
|
||||
if value is None or value == "":
|
||||
return default
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def _env_float(name: str, default: float) -> float:
|
||||
value = os.getenv(name)
|
||||
if value is None or value == "":
|
||||
return default
|
||||
try:
|
||||
return float(value)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
class FastTranslator:
    """Per-line subtitle translator backed by OpenAI chat completions.

    Each block's text is translated independently and fanned out over a
    thread pool, with a disk cache (app.cache) in front of the API and
    exponential-backoff retries behind it.
    """

    def __init__(
        self,
        api_key=None,
        model=None,
        workers=None,
        max_retries=None,
        retry_base_delay=None,
        **kwargs,
    ):
        # Explicit arguments win; otherwise fall back to environment settings.
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model or os.getenv("SUBFOX_MODEL", "gpt-4o-mini")
        self.workers = workers if workers is not None else _env_int("SUBFOX_WORKERS", 4)
        self.max_retries = (
            max_retries if max_retries is not None else _env_int("SUBFOX_MAX_RETRIES", 3)
        )
        self.retry_base_delay = (
            retry_base_delay
            if retry_base_delay is not None
            else _env_float("SUBFOX_RETRY_BASE_DELAY", 1.0)
        )
        # Unknown keyword args (e.g. max_chunk_chars from the factory) are
        # stored for forward compatibility but never read here.
        self.kwargs = kwargs
        self.client = OpenAI(api_key=self.api_key) if self.api_key else OpenAI()

    def _translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate one text, consulting the disk cache first.

        Retries with exponential backoff and raises RuntimeError after
        max_retries failed attempts.
        """
        # NOTE(review): an empty cached value is treated as a miss here.
        cached = get_cached(source_lang, target_lang, text)
        if cached:
            return cached

        prompt = (
            f"Translate the following subtitle text from {source_lang} to {target_lang}. "
            "Preserve meaning, keep it natural, and return only the translated text.\n\n"
            f"{text}"
        )

        last_error = None

        for attempt in range(1, self.max_retries + 1):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a subtitle translator. "
                                "Return only the translated text with no explanations."
                            ),
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0,
                )

                content = response.choices[0].message.content or ""
                result = content.strip()

                # Only non-empty results are cached; an empty result is still
                # returned as-is to the caller.
                if result:
                    set_cache(source_lang, target_lang, text, result, self.model)

                return result

            except Exception as e:
                last_error = e

                if attempt >= self.max_retries:
                    break

                # Exponential backoff: base * 2^(attempt - 1).
                delay = self.retry_base_delay * (2 ** (attempt - 1))
                time.sleep(delay)

        raise RuntimeError(
            f"Translation failed after {self.max_retries} attempts: {last_error}"
        )

    def _translate_one(self, block: Dict, source_lang: str, target_lang: str) -> Dict:
        """Return a copy of *block* with its "text" replaced by the translation."""
        new_block = dict(block)
        new_block["text"] = self._translate_text(
            block["text"],
            source_lang,
            target_lang,
        )
        return new_block

    def translate_blocks(
        self,
        blocks: List[Dict],
        source_lang: str,
        target_lang: str,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[Dict]:
        """Translate all blocks in parallel, preserving their original order.

        progress_callback(done, total) is invoked from this thread as each
        future completes (completion order, not input order).
        """
        total = len(blocks)
        output: List[Optional[Dict]] = [None] * total

        with ThreadPoolExecutor(max_workers=self.workers) as executor:
            # Map each future back to its input position so results land in
            # place regardless of completion order.
            futures = {
                executor.submit(self._translate_one, block, source_lang, target_lang): i
                for i, block in enumerate(blocks)
            }

            done = 0
            for future in as_completed(futures):
                idx = futures[future]
                output[idx] = future.result()
                done += 1

                if progress_callback:
                    progress_callback(done, total)

        return [block for block in output if block is not None]

    def translate_batch(
        self,
        batch: List[Dict],
        source_lang: str,
        target_lang: str,
    ) -> List[str]:
        """Adapter: translate block dicts and return only the translated texts."""
        translated_blocks = self.translate_blocks(batch, source_lang, target_lang)
        return [block["text"] for block in translated_blocks]
|
||||
58
app/translators/smart_engine.py
Normal file
58
app/translators/smart_engine.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
# app/translators/smart_engine.py
|
||||
import json
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from .base import BaseTranslator
|
||||
|
||||
|
||||
class SmartTranslator(BaseTranslator):
    """LLM-backed batch translator that round-trips subtitle lines as JSON."""

    def __init__(self, api_key: str | None = None, model: str = "gpt-4.1-mini", batch_size: int = 40):
        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            # Danish: "OPENAI_API_KEY is missing for smart mode".
            raise ValueError("OPENAI_API_KEY mangler for smart mode")

        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.batch_size = batch_size

    def _translate_batch(self, batch: List[str], source_lang: str, target_lang: str) -> List[str]:
        """Translate one batch of lines; fall back to originals on parse problems.

        Fix: the previous version trusted the model to return items in order
        and with the full count — a reordered or partial response silently
        misaligned translations with their timestamps. Results are now
        re-aligned by the returned "i" index, with a per-line fallback to the
        source text for any missing or malformed item.
        """
        payload = [{"i": i, "text": t} for i, t in enumerate(batch)]

        prompt = (
            f"Translate these subtitle lines from {source_lang} to {target_lang}.\n"
            "Return ONLY valid JSON array.\n"
            'Each item must be like: {"i": 0, "text": "..."}\n'
            "Keep same order, keep line meaning natural and concise.\n\n"
            f"{json.dumps(payload, ensure_ascii=False)}"
        )

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )

        text = response.choices[0].message.content or ""

        try:
            data = json.loads(text)
            # Re-align by the returned "i" index instead of trusting order;
            # skip items that are not well-formed dicts.
            by_index = {
                int(item["i"]): str(item["text"])
                for item in data
                if isinstance(item, dict) and "i" in item and "text" in item
            }
            return [by_index.get(i, original) for i, original in enumerate(batch)]
        except Exception:
            # Whole-batch fallback: return the untranslated source lines.
            return batch

    def translate_blocks(
        self,
        texts: List[str],
        source_lang: str = "auto",
        target_lang: str = "da",
    ) -> List[str]:
        """Translate all texts batch-by-batch, preserving order."""
        out = []

        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            out.extend(self._translate_batch(batch, source_lang, target_lang))

        return out
|
||||
Loading…
Add table
Add a link
Reference in a new issue