Make SubFox production-ready with parallel translation and UI controls

2026-03-25 11:24:54 +00:00 · 2026-03-25 11:24:54 +00:00 · 2b1d05f02c
commit 2b1d05f02c
parent c40b8bed2b
6046 changed files with 798327 additions and 0 deletions
--- a/.venv/lib/python3.10/site-packages/charset_normalizer/legacy.py
+++ b/.venv/lib/python3.10/site-packages/charset_normalizer/legacy.py
@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from warnings import warn
+
+from .api import from_bytes
+from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
+
+if TYPE_CHECKING:
+    from typing import TypedDict
+
+    class ResultDict(TypedDict):
+        encoding: str | None
+        language: str
+        confidence: float | None
+
+
+def detect(
+    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
+) -> ResultDict:
+    """
+    chardet legacy method
+    Detect the encoding of the given byte string. It should be mostly backward-compatible.
+    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
+    This function is deprecated and should be used to migrate your project easily, consult the documentation for
+    further information. Not planned for removal.
+
+    :param byte_str:     The byte sequence to examine.
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    """
+    if len(kwargs):
+        warn(
+            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
+        )
+
+    if not isinstance(byte_str, (bytearray, bytes)):
+        raise TypeError(  # pragma: nocover
+            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
+        )
+
+    if isinstance(byte_str, bytearray):
+        byte_str = bytes(byte_str)
+
+    r = from_bytes(byte_str).best()
+
+    encoding = r.encoding if r is not None else None
+    language = r.language if r is not None and r.language != "Unknown" else ""
+    confidence = 1.0 - r.chaos if r is not None else None
+
+    # automatically lower confidence
+    # on small bytes samples.
+    # https://github.com/jawah/charset_normalizer/issues/391
+    if (
+        confidence is not None
+        and confidence >= 0.9
+        and encoding
+        not in {
+            "utf_8",
+            "ascii",
+        }
+        and r.bom is False  # type: ignore[union-attr]
+        and len(byte_str) < TOO_SMALL_SEQUENCE
+    ):
+        confidence -= 0.2
+
+    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
+    # but chardet does return 'utf-8-sig' and it is a valid codec name.
+    if r is not None and encoding == "utf_8" and r.bom:
+        encoding += "_sig"
+
+    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
+        encoding = CHARDET_CORRESPONDENCE[encoding]
+
+    return {
+        "encoding": encoding,
+        "language": language,
+        "confidence": confidence,
+    }