feat: add named volume detection and image-aware classification

This commit is contained in:
Eddie Nielsen 2026-03-23 13:27:12 +00:00
parent 932c668e65
commit 483e2720f1
2 changed files with 770 additions and 221 deletions

View file

@ -1,291 +1,546 @@
from __future__ import annotations from __future__ import annotations
import json
import shutil
import subprocess
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
import yaml import yaml
from .models import MountEntry
CRITICAL_TARGET_PATTERNS = (
"/var/lib/mysql", # ----------------------------
"/var/lib/postgresql", # Image-aware rules
"/var/lib/postgres", # ----------------------------
"/var/lib/mariadb",
"/data", IMAGE_RULES = {
"mariadb": {
"/var/lib/mysql": "critical",
},
"mysql": {
"/var/lib/mysql": "critical",
},
"postgres": {
"/var/lib/postgresql/data": "critical",
},
"redis": {
"/data": "critical",
},
"grafana": {
"/var/lib/grafana": "critical",
},
"prometheus": {
"/prometheus": "critical",
},
"influxdb": {
"/var/lib/influxdb": "critical",
},
"nginx": {
"/var/log/nginx": "optional",
},
}
# ----------------------------
# Generic rules
# ----------------------------
CRITICAL_TARGETS = {
"/config", "/config",
"/var/www", "/data",
"/srv", "/var/lib/mysql",
"/app/data", "/var/lib/mariadb",
"/bitnami", "/var/lib/postgresql/data",
"/var/opt", "/bitnami/postgresql",
"/var/lib/redis", "/var/lib/redis",
"/redis", "/data/db",
"/var/lib/mongodb", "/var/lib/grafana",
"/mongodb", "/var/lib/influxdb",
"/var/lib/prometheus",
"/etc/letsencrypt",
"/acme.sh",
"/app/data",
"/srv",
}
REVIEW_TARGET_KEYWORDS = {
"backup",
"uploads",
"media",
"www",
"html",
"content",
"storage",
"files",
"database",
"db",
"config",
}
SKIP_TARGET_PREFIXES = (
"/tmp",
"/var/tmp",
"/run",
"/var/run",
"/dev",
) )
OPTIONAL_TARGET_PATTERNS = ( SKIP_TARGET_EXACT = {
"/var/log", "/var/log",
"/var/log/nginx",
"/logs", "/logs",
"/log", "/log",
"/tmp",
"/cache", "/cache",
"/var/cache",
"/run",
)
OPTIONAL_SOURCE_PATTERNS = (
"logs",
"log",
"cache",
"tmp",
"temp",
)
SKIP_TARGET_PATTERNS = (
"/dev",
"/proc",
"/sys",
"/run",
"/tmp", "/tmp",
) }
SKIP_SOURCE_PATTERNS = (
"/var/run/docker.sock",
"docker.sock",
)
def load_compose(compose_file: Path) -> dict[str, Any]: CLASS_PRIORITY = {
"critical": 3,
"review": 2,
"optional": 1,
"unknown": 0,
}
# ----------------------------
# Compose loader
# ----------------------------
def load_compose(compose_path: str | Path) -> dict[str, Any]:
    """Read a docker-compose YAML file and return its top-level mapping.

    ``~`` is expanded and the path resolved before opening. An empty file
    yields an empty dict; a document that is not a mapping raises ValueError.
    """
    compose_file = Path(compose_path).expanduser().resolve()
    with compose_file.open("r", encoding="utf-8") as fh:
        parsed = yaml.safe_load(fh) or {}
    if not isinstance(parsed, dict):
        raise ValueError(f"Compose file did not parse as a mapping: {compose_file}")
    return parsed
# ----------------------------
# Docker helpers
# ----------------------------
def docker_available() -> bool:
    """Return True when a ``docker`` executable can be found on PATH."""
    return bool(shutil.which("docker"))
def run_docker_volume_inspect(volume_name: str) -> dict[str, Any] | None:
    """Run ``docker volume inspect`` and return the first result record.

    Returns None when the docker CLI is unavailable, the command cannot be
    spawned or exits non-zero, or its stdout is not a non-empty JSON list
    whose first element is a mapping.
    """
    if not docker_available():
        return None

    try:
        proc = subprocess.run(
            ["docker", "volume", "inspect", volume_name],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        # e.g. binary vanished between the which() check and the exec
        return None

    if proc.returncode != 0:
        return None

    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return None

    if not (isinstance(payload, list) and payload):
        return None

    record = payload[0]
    return record if isinstance(record, dict) else None
if isinstance(volume, dict):
if volume.get("type") != "bind":
return None
source_raw = str(volume.get("source", "")).strip()
target = str(volume.get("target", "")).strip()
if not source_raw or not target:
return None
source = resolve_source_path(source_raw, project_root)
return {
"source": str(source),
"target": target,
}
return None
# ----------------------------
# Volume resolution
# ----------------------------
def infer_project_name(compose_path: Path, compose_data: dict[str, Any]) -> str:
    """Derive the compose project name.

    A non-blank top-level ``name:`` key wins (whitespace-stripped);
    otherwise the directory containing the compose file is used.
    """
    declared = compose_data.get("name")
    if isinstance(declared, str) and declared.strip():
        return declared.strip()
    return compose_path.parent.name
def normalize_top_level_volume_name(
    volume_key: str,
    compose_data: dict[str, Any],
) -> tuple[str | None, bool]:
    """Resolve naming overrides for a top-level compose volume.

    Returns ``(explicit_name, is_external)``:
      * *explicit_name* — the docker-level volume name declared via
        ``name:`` or ``external.name``, or None when no override exists;
      * *is_external* — whether the volume is declared ``external``.

    Malformed sections (non-mapping ``volumes:`` or volume config) yield
    ``(None, False)``.
    """
    volumes = compose_data.get("volumes", {})
    if not isinstance(volumes, dict):
        return None, False

    cfg = volumes.get(volume_key)
    if not isinstance(cfg, dict):
        return None, False

    explicit_name = cfg.get("name")
    if isinstance(explicit_name, str) and explicit_name.strip():
        # Fix: strip surrounding whitespace, matching external.name handling.
        explicit_name = explicit_name.strip()
    else:
        explicit_name = None

    external = cfg.get("external", False)
    is_external = False

    if isinstance(external, bool):
        is_external = external
    elif isinstance(external, dict):
        # Legacy mapping form ``external: {name: ...}`` implies external.
        is_external = True
        ext_name = external.get("name")
        if isinstance(ext_name, str) and ext_name.strip():
            explicit_name = ext_name.strip()

    return explicit_name, is_external
def build_volume_candidates(
    compose_name: str,
    compose_path: Path,
    compose_data: dict[str, Any],
) -> list[str]:
    """Return docker volume names to try, most specific first, deduplicated.

    Order: explicit ``name:``/``external.name`` override, the raw compose
    key (twice for external volumes, collapsed by dedup), then the compose
    default ``<project>_<key>`` naming.
    """
    project_name = infer_project_name(compose_path, compose_data)
    explicit_name, is_external = normalize_top_level_volume_name(compose_name, compose_data)

    ordered: list[str] = []
    if explicit_name:
        ordered.append(explicit_name)
    if is_external:
        ordered.append(compose_name)
    ordered.append(compose_name)
    ordered.append(f"{project_name}_{compose_name}")

    # dict.fromkeys keeps first-seen order while dropping duplicates
    return list(dict.fromkeys(ordered))
def resolve_named_volume(
    compose_name: str,
    compose_path: Path,
    compose_data: dict[str, Any],
) -> tuple[Path | None, str]:
    """Map a compose named volume to its host mountpoint via the docker CLI.

    Tries each candidate docker volume name in order and returns
    ``(mountpoint, reason)``; mountpoint is None when the docker CLI is
    missing or no candidate could be inspected.
    """
    if not docker_available():
        return None, "docker CLI not available"

    for candidate in build_volume_candidates(compose_name, compose_path, compose_data):
        record = run_docker_volume_inspect(candidate)
        if not record:
            continue
        mountpoint = record.get("Mountpoint")
        if isinstance(mountpoint, str) and mountpoint.strip():
            return Path(mountpoint), f"named volume '{compose_name}' -> docker volume '{candidate}'"

    return None, f"named volume '{compose_name}' could not be resolved"
def deduplicate_items(items: list[dict[str, str]]) -> list[dict[str, str]]: # ----------------------------
seen: set[tuple[str, str, str, str]] = set() # Parsing helpers
deduped: list[dict[str, str]] = [] # ----------------------------
for item in items: def _extract_image_name(image: str | None) -> str | None:
key = ( if not image or not isinstance(image, str):
item["service"], return None
item["source"],
item["target"],
item["bucket"],
)
if key in seen:
continue
seen.add(key)
deduped.append(item)
return deduped if "/" in image:
image = image.split("/")[-1]
if ":" in image:
image = image.split(":")[0]
return image.lower()
def sort_items(items: list[dict[str, str]]) -> list[dict[str, str]]: def _is_bind_source(source: str) -> bool:
priority_order = { return (
"critical": 0, source.startswith("/")
"high": 1, or source.startswith("./")
"medium": 2, or source.startswith("../")
"low": 3, or source.startswith("~/")
"optional": 4,
}
return sorted(
items,
key=lambda item: (
priority_order.get(item["priority"], 99),
item["service"],
item["source"],
item["target"],
),
) )
def classify_compose(compose_file: str | Path) -> dict[str, Any]: def _normalize_bind_path(source: str, compose_file: Path) -> Path:
compose_path = Path(compose_file).resolve() path = Path(source).expanduser()
data = load_compose(compose_path) if path.is_absolute():
return path.resolve()
return (compose_file.parent / path).resolve()
services = data.get("services", {})
if not isinstance(services, dict):
raise ValueError("Compose file does not contain a valid 'services' section")
def _parse_volume_string(spec: str) -> dict[str, str | None]:
    """Split a short-syntax volume spec into source/target/mode/kind fields.

    A spec with no colon is an anonymous volume (the whole string is the
    container target). Otherwise the first colon separates source from
    target and everything after a second colon is kept verbatim as the
    mode string.
    """
    head, sep, rest = spec.partition(":")
    if not sep:
        return {
            "source": None,
            "target": head,
            "mode": None,
            "kind": "anonymous",
        }

    target, mode_sep, mode = rest.partition(":")
    return {
        "source": head,
        "target": target,
        "mode": mode if mode_sep else None,
        "kind": "bind" if _is_bind_source(head) else "named",
    }
def _parse_volume_entry(entry: Any) -> dict[str, str | None]:
if isinstance(entry, str):
return _parse_volume_string(entry)
if isinstance(entry, dict):
entry_type = entry.get("type")
source = entry.get("source") or entry.get("src")
target = entry.get("target") or entry.get("dst") or entry.get("destination")
if entry_type == "bind":
kind = "bind"
elif entry_type == "volume":
kind = "named" if source else "anonymous"
else:
if isinstance(source, str) and source:
kind = "bind" if _is_bind_source(source) else "named"
else:
kind = "anonymous"
return {
"source": source,
"target": target,
"mode": None,
"kind": kind,
}
return {
"source": None,
"target": None,
"mode": None,
"kind": "unknown",
}
# ----------------------------
# Classification logic
# ----------------------------
def _classify_target(target_path: str | None, image_name: str | None = None) -> tuple[str, str]:
    """Classify a container target path as critical/optional/review.

    Image-specific rules take precedence over the generic tables (e.g. the
    nginx rule keeps /var/log/nginx optional even though "log"-like paths
    otherwise trigger review). Anything unrecognized falls back to
    "review". Returns ``(classification, reason)``.
    """
    if not target_path:
        return "review", "missing container target path"

    # Image-aware override.
    if image_name and image_name in IMAGE_RULES:
        rules = IMAGE_RULES[image_name]
        if target_path in rules:
            level = rules[target_path]
            if level == "critical":
                return "critical", f"{image_name} rule for {target_path}"
            if level == "optional":
                return "optional", f"{image_name} rule for {target_path}"

    if target_path in CRITICAL_TARGETS:
        return "critical", f"critical target path {target_path}"
    if target_path in SKIP_TARGET_EXACT:
        return "optional", f"non-essential target path {target_path}"
    # Fix: match whole path components, not raw string prefixes — a bare
    # startswith() would wrongly treat e.g. "/runtime" as living under "/run".
    if any(
        target_path == prefix or target_path.startswith(prefix + "/")
        for prefix in SKIP_TARGET_PREFIXES
    ):
        return "optional", f"ephemeral target path {target_path}"

    lowered = target_path.lower()
    for keyword in REVIEW_TARGET_KEYWORDS:
        if keyword in lowered:
            return "review", f"data-like target path {target_path} requires review"
    return "review", f"unknown target path {target_path}"
def _merge_reason(existing: str, new: str) -> str:
if not existing:
return new
if not new or new == existing:
return existing
parts = [p.strip() for p in existing.split(" | ") if p.strip()]
if new not in parts:
parts.append(new)
return " | ".join(parts)
def _prefer_entry(existing: MountEntry, new: MountEntry) -> MountEntry:
    """Merge two entries that share a host source, keeping the stronger one.

    The entry with the higher classification priority wins (ties go to the
    existing entry); the loser's reason and service usage are folded into
    the winner's reason, and ``exists`` is OR-ed. Mutates and returns the
    winning entry.
    """
    if CLASS_PRIORITY.get(new.classification, 0) > CLASS_PRIORITY.get(existing.classification, 0):
        winner, loser = new, existing
    else:
        winner, loser = existing, new

    winner.reason = _merge_reason(winner.reason, loser.reason)
    if loser.service and loser.service not in winner.reason:
        winner.reason = _merge_reason(
            winner.reason,
            f"also used by service={loser.service} target={loser.target}",
        )
    winner.exists = winner.exists or loser.exists
    return winner
def _dedupe_entries(entries: list[MountEntry]) -> list[MountEntry]:
    """Collapse entries that point at the same host source path.

    Keys on the resolved absolute source (relative sources key on their
    raw string); colliding entries are merged via _prefer_entry so the
    strongest classification survives. Insertion order is preserved.
    """
    merged: dict[str, MountEntry] = {}
    for entry in entries:
        if entry.source.is_absolute():
            key = str(entry.source.resolve())
        else:
            key = str(entry.source)
        current = merged.get(key)
        merged[key] = entry if current is None else _prefer_entry(current, entry)
    return list(merged.values())
def _make_entry(
source: Path,
service: str,
target: str | None,
classification: str,
reason: str,
) -> MountEntry:
return MountEntry(
source=source,
service=service,
target=target or "unknown",
classification=classification,
reason=reason,
exists=source.exists(),
)
# ----------------------------
# Main classifier
# ----------------------------
def classify_compose(compose_path: str | Path) -> list[MountEntry]:
    """Classify every service volume in a compose file into MountEntry records.

    Bind mounts are resolved to absolute host paths; named volumes are
    resolved through the docker CLI when possible. Anonymous, unresolved,
    and unrecognized volumes become "review" placeholder entries. Entries
    sharing a host source are deduplicated, keeping the strongest
    classification.
    """
    compose_file = Path(compose_path).expanduser().resolve()
    compose_data = load_compose(compose_file)

    services = compose_data.get("services", {})
    if not isinstance(services, dict):
        return []

    entries: list[MountEntry] = []
    for service_name, service_cfg in services.items():
        if not isinstance(service_cfg, dict):
            continue
        raw_volumes = service_cfg.get("volumes", [])
        if not isinstance(raw_volumes, list):
            continue

        # Image name drives the image-aware classification rules.
        image_name = _extract_image_name(service_cfg.get("image"))

        for raw_entry in raw_volumes:
            parsed = _parse_volume_entry(raw_entry)
            source = parsed.get("source")
            target = parsed.get("target")
            kind = parsed.get("kind")

            if kind == "anonymous":
                # No host path exists to back up; flag for manual review.
                entries.append(
                    MountEntry(
                        source=Path("/__anonymous_volume__"),
                        service=service_name,
                        target=target or "unknown",
                        classification="review",
                        reason="anonymous volume cannot be safely mapped to host path",
                        exists=False,
                    )
                )
            elif kind == "bind" and isinstance(source, str):
                host_path = _normalize_bind_path(source, compose_file)
                classification, base_reason = _classify_target(target, image_name)
                entries.append(
                    _make_entry(
                        source=host_path,
                        service=service_name,
                        target=target,
                        classification=classification,
                        reason=f"{base_reason}; bind mount source '{source}' -> '{host_path}'",
                    )
                )
            elif kind == "named" and isinstance(source, str):
                mountpoint, volume_reason = resolve_named_volume(source, compose_file, compose_data)
                if mountpoint is None:
                    # Keep a distinctive sentinel source so callers can spot it.
                    entries.append(
                        MountEntry(
                            source=Path(f"/__named_volume_unresolved__/{source}"),
                            service=service_name,
                            target=target or "unknown",
                            classification="review",
                            reason=volume_reason,
                            exists=False,
                        )
                    )
                else:
                    classification, base_reason = _classify_target(target, image_name)
                    entries.append(
                        _make_entry(
                            source=mountpoint,
                            service=service_name,
                            target=target,
                            classification=classification,
                            reason=f"{base_reason}; {volume_reason}; mountpoint '{mountpoint}'",
                        )
                    )
            else:
                entries.append(
                    MountEntry(
                        source=Path("/__unknown_volume__"),
                        service=service_name,
                        target=target or "unknown",
                        classification="review",
                        reason="unrecognized volume entry",
                        exists=False,
                    )
                )

    return _dedupe_entries(entries)

View file

@ -0,0 +1,294 @@
from __future__ import annotations
import textwrap
from pathlib import Path
import pytest
from dockervault.classifier import classify_compose
def write_compose(tmp_path: Path, content: str) -> Path:
compose_file = tmp_path / "docker-compose.yml"
compose_file.write_text(textwrap.dedent(content).strip() + "\n", encoding="utf-8")
return compose_file
def find_entry(entries, service: str, target: str):
    """Return the first entry matching service+target, or fail the test."""
    for candidate in entries:
        if candidate.service == service and candidate.target == target:
            return candidate
    raise AssertionError(f"No entry found for service={service!r} target={target!r}")
def test_bind_mount_relative_path_is_resolved_and_classified_critical(tmp_path: Path):
    """A relative ./db bind for mariadb's data dir resolves and is critical."""
    host_dir = tmp_path / "db"
    host_dir.mkdir()
    compose_file = write_compose(
        tmp_path,
        """
        services:
          db:
            image: mariadb:11
            volumes:
              - ./db:/var/lib/mysql
        """,
    )

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "db", "/var/lib/mysql")
    assert entry.service == "db"
    assert entry.source == host_dir.resolve()
    assert entry.exists is True
    assert entry.classification == "critical"
    assert "mariadb" in entry.reason or "critical" in entry.reason
def test_named_volume_resolves_and_is_classified_critical(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """A named volume resolves via docker inspect and classifies critical."""
    mountpoint = tmp_path / "docker-volumes" / "project_dbdata" / "_data"
    mountpoint.mkdir(parents=True)
    compose_file = write_compose(
        tmp_path,
        """
        services:
          db:
            image: mariadb:11
            volumes:
              - dbdata:/var/lib/mysql
        volumes:
          dbdata:
        """,
    )
    # Pretend docker exists and knows about exactly one volume: "dbdata".
    monkeypatch.setattr("dockervault.classifier.docker_available", lambda: True)
    monkeypatch.setattr(
        "dockervault.classifier.run_docker_volume_inspect",
        lambda volume_name: {"Mountpoint": str(mountpoint)} if volume_name == "dbdata" else None,
    )

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "db", "/var/lib/mysql")
    assert entry.source == mountpoint
    assert entry.exists is True
    assert entry.classification == "critical"
    assert "named volume 'dbdata'" in entry.reason
def test_named_volume_unresolved_falls_back_to_review(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """When docker cannot resolve the volume, classification drops to review."""
    compose_file = write_compose(
        tmp_path,
        """
        services:
          app:
            image: redis:7
            volumes:
              - cachedata:/data
        volumes:
          cachedata:
        """,
    )
    # Docker is "present" but inspect finds nothing for any candidate name.
    monkeypatch.setattr("dockervault.classifier.docker_available", lambda: True)
    monkeypatch.setattr("dockervault.classifier.run_docker_volume_inspect", lambda volume_name: None)

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "app", "/data")
    assert entry.exists is False
    assert entry.classification == "review"
    assert "__named_volume_unresolved__" in str(entry.source)
    assert "could not be resolved" in entry.reason
def test_named_volume_review_when_docker_not_available(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """Without a docker CLI, named volumes cannot be mapped and need review."""
    compose_file = write_compose(
        tmp_path,
        """
        services:
          app:
            image: redis:7
            volumes:
              - cachedata:/data
        volumes:
          cachedata:
        """,
    )
    monkeypatch.setattr("dockervault.classifier.docker_available", lambda: False)

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "app", "/data")
    assert entry.exists is False
    assert entry.classification == "review"
    assert "docker CLI not available" in entry.reason
def test_image_rule_overrides_generic_logic_for_nginx_logs(tmp_path: Path):
    """The nginx image rule downgrades /var/log/nginx to optional."""
    log_dir = tmp_path / "logs"
    log_dir.mkdir()
    compose_file = write_compose(
        tmp_path,
        """
        services:
          nginx:
            image: nginx:latest
            volumes:
              - ./logs:/var/log/nginx
        """,
    )

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "nginx", "/var/log/nginx")
    assert entry.classification == "optional"
    assert entry.source == log_dir.resolve()
    assert entry.exists is True
def test_dedupe_prefers_stronger_classification_for_same_source(tmp_path: Path):
    """Two services mounting one host dir collapse to the stronger entry."""
    shared = tmp_path / "shared"
    shared.mkdir()
    compose_file = write_compose(
        tmp_path,
        f"""
        services:
          db:
            image: mariadb:11
            volumes:
              - {shared}:/var/lib/mysql
          backup:
            image: busybox
            volumes:
              - {shared}:/backup
        """,
    )

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    (entry,) = entries
    assert entry.source == shared.resolve()
    assert entry.classification == "critical"
    assert entry.exists is True
    assert "mariadb" in entry.reason or "/var/lib/mysql" in entry.reason
def test_top_level_volume_name_override_is_used(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """A top-level ``name:`` override is the first candidate inspected."""
    mountpoint = tmp_path / "docker-volumes" / "real-db-volume" / "_data"
    mountpoint.mkdir(parents=True)
    compose_file = write_compose(
        tmp_path,
        """
        services:
          db:
            image: postgres:16
            volumes:
              - dbdata:/var/lib/postgresql/data
        volumes:
          dbdata:
            name: real-db-volume
        """,
    )
    inspected_names: list[str] = []

    def fake_inspect(volume_name: str):
        inspected_names.append(volume_name)
        return {"Mountpoint": str(mountpoint)} if volume_name == "real-db-volume" else None

    monkeypatch.setattr("dockervault.classifier.docker_available", lambda: True)
    monkeypatch.setattr("dockervault.classifier.run_docker_volume_inspect", fake_inspect)

    entries = classify_compose(compose_file)

    entry = find_entry(entries, "db", "/var/lib/postgresql/data")
    assert entry.source == mountpoint
    assert entry.classification == "critical"
    assert "real-db-volume" in entry.reason
    # The explicit name must be tried before any derived candidates.
    assert inspected_names[0] == "real-db-volume"
def test_external_volume_tries_raw_name(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """An ``external: true`` volume is looked up under its raw compose key."""
    mountpoint = tmp_path / "docker-volumes" / "shared-prod-data" / "_data"
    mountpoint.mkdir(parents=True)
    compose_file = write_compose(
        tmp_path,
        """
        services:
          app:
            image: redis:7
            volumes:
              - shareddata:/data
        volumes:
          shareddata:
            external: true
        """,
    )
    attempted: list[str] = []

    def fake_inspect(volume_name: str):
        attempted.append(volume_name)
        return {"Mountpoint": str(mountpoint)} if volume_name == "shareddata" else None

    monkeypatch.setattr("dockervault.classifier.docker_available", lambda: True)
    monkeypatch.setattr("dockervault.classifier.run_docker_volume_inspect", fake_inspect)

    entries = classify_compose(compose_file)

    entry = find_entry(entries, "app", "/data")
    assert entry.source == mountpoint
    assert entry.classification == "critical"
    assert "shareddata" in attempted
def test_anonymous_volume_becomes_review(tmp_path: Path):
    """A bare container path (anonymous volume) has no host side to back up."""
    compose_file = write_compose(
        tmp_path,
        """
        services:
          app:
            image: busybox
            volumes:
              - /data
        """,
    )

    entries = classify_compose(compose_file)

    assert len(entries) == 1
    entry = find_entry(entries, "app", "/data")
    assert entry.exists is False
    assert entry.classification == "review"
    assert "__anonymous_volume__" in str(entry.source) or "__anonymous__" in str(entry.source)