Make SubFox production-ready with parallel translation and UI controls
This commit is contained in:
parent c40b8bed2b
commit 2b1d05f02c
6046 changed files with 798327 additions and 0 deletions
|
|
@@ -0,0 +1,48 @@
|
|||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, this package tries to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Other methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, is_binary
|
||||
from .legacy import detect
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"is_binary",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
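# Illustrative note (not part of the upstream module): the package is normally consumed
# through the names exported above. A minimal, hypothetical session could look like this,
# using from_bytes() for the modern API and detect() for the chardet-compatible legacy dict:
#
#     >>> from charset_normalizer import from_bytes, detect
#     >>> best = from_bytes("Déjà vu, naïveté".encode("cp1252")).best()
#     >>> best.encoding        # e.g. "cp1252" or an equivalent single-byte codec
#     >>> detect(b"hello")     # dict with 'encoding', 'language' and 'confidence' keys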
|
||||
|
|
@@ -0,0 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .cli import cli_detect
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
974
.venv/lib/python3.10/site-packages/charset_normalizer/api.py
Normal file
|
|
@@ -0,0 +1,974 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from os import PathLike
|
||||
from typing import BinaryIO
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import (
|
||||
IANA_SUPPORTED,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
TOO_BIG_SEQUENCE,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
TRACE,
|
||||
)
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
# Pre-compute a reordered encoding list: multibyte first, then single-byte.
|
||||
# This allows the mb_definitive_match optimization to fire earlier, skipping
|
||||
# all single-byte encodings for genuine CJK content. Multibyte codecs
|
||||
# hard-fail (UnicodeDecodeError) on single-byte data almost instantly, so
|
||||
# testing them first costs negligible time for non-CJK files.
|
||||
_mb_supported: list[str] = []
|
||||
_sb_supported: list[str] = []
|
||||
|
||||
for _supported_enc in IANA_SUPPORTED:
|
||||
try:
|
||||
if is_multi_byte_encoding(_supported_enc):
|
||||
_mb_supported.append(_supported_enc)
|
||||
else:
|
||||
_sb_supported.append(_supported_enc)
|
||||
except ImportError:
|
||||
_sb_supported.append(_supported_enc)
|
||||
|
||||
IANA_SUPPORTED_MB_FIRST: list[str] = _mb_supported + _sb_supported
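# Illustrative example (hypothetical ordering, not guaranteed): after the split above,
# the reordered list might begin with multibyte codecs such as ["big5", "cp932", "euc_jp",
# "gb18030", ...] followed by single-byte ones such as ["ascii", "cp1250", "cp1251", ...].
# Within each group, the order simply mirrors IANA_SUPPORTED.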
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes | bytearray,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
|
||||
If there are no results, it is a strong indicator that the source is binary, not text.
|
||||
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence.
|
||||
It will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
|
||||
but never takes it for granted. It can improve performance.
|
||||
|
||||
You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not set up any handler other than the NullHandler; if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: list[str] = []
|
||||
|
||||
specified_encoding: str | None = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: set[str] = set()
|
||||
tested_but_hard_failure: list[str] = []
|
||||
tested_but_soft_failure: list[str] = []
|
||||
soft_failure_skip: set[str] = set()
|
||||
success_fast_tracked: set[str] = set()
|
||||
|
||||
# Cache for decoded payload deduplication: hash(decoded_payload) -> (mean_mess_ratio, cd_ratios_merged, passed)
|
||||
# When multiple encodings decode to the exact same string, we can skip the expensive
|
||||
# mess_ratio and coherence_ratio analysis and reuse the results from the first encoding.
|
||||
payload_result_cache: dict[int, tuple[float, list[tuple[str, float]], bool]] = {}
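# Illustrative (hypothetical) scenario: if both cp1252 and iso8859_15 decode the same
# payload to the identical str, the second encoding hits this cache and reuses the first
# one's mess/coherence verdict instead of re-analysing the text.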
|
||||
|
||||
# When a definitive result (chaos=0.0 and good coherence) is found after testing
|
||||
# the prioritized encodings (ascii, utf_8), we can significantly reduce the remaining
|
||||
# work. Encodings that target completely different language families (e.g., Cyrillic
|
||||
# when the definitive match is Latin) are skipped entirely.
|
||||
# Additionally, for same-family encodings that pass chaos probing, we reuse the
|
||||
# definitive match's coherence ratios instead of recomputing them — a major savings
|
||||
# since coherence_ratio accounts for ~30% of total time on slow Latin files.
|
||||
definitive_match_found: bool = False
|
||||
definitive_target_languages: set[str] = set()
|
||||
# After the definitive match fires, we cap the number of additional same-family
|
||||
# single-byte encodings that pass chaos probing. Once we've accumulated enough
|
||||
# good candidates (N), further same-family SB encodings are unlikely to produce
|
||||
# a better best() result and just waste mess_ratio + coherence_ratio time.
|
||||
# The first encoding to trigger the definitive match is NOT counted (it's already in).
|
||||
post_definitive_sb_success_count: int = 0
|
||||
POST_DEFINITIVE_SB_CAP: int = 7
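# Illustrative walk-through (hypothetical): once e.g. cp1252 triggers the definitive match
# on a Latin text, up to 7 further low-mess Latin single-byte candidates (latin_1, cp1250,
# ...) may still be scored; later ones are skipped because they are very unlikely to beat
# the existing best().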
|
||||
|
||||
# When a non-UTF multibyte encoding passes chaos probing with significant multibyte
|
||||
# content (decoded length < 98% of raw length), skip all remaining single-byte encodings.
|
||||
# Rationale: multi-byte decoders (CJK) have strict byte-sequence validation — if they
|
||||
# decode without error AND pass chaos probing with substantial multibyte content, the
|
||||
# data is genuinely multibyte encoded. Single-byte encodings will always decode (every
|
||||
# byte maps to something) but waste time on mess_ratio before failing.
|
||||
# The 98% threshold prevents false triggers on files that happen to have a few valid
|
||||
# multibyte pairs (e.g., cp424/_ude_1.txt where big5 decodes with 99% ratio).
|
||||
mb_definitive_match_found: bool = False
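# Illustrative arithmetic (hypothetical numbers): a 10_000-byte EUC-JP payload typically
# decodes to roughly 6_000 characters (ratio 0.60 < 0.98), which triggers the skip; an
# ASCII-only file decodes 1:1 (ratio 1.0), so this guard never fires there.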
|
||||
|
||||
fallback_ascii: CharsetMatch | None = None
|
||||
fallback_u8: CharsetMatch | None = None
|
||||
fallback_specified: CharsetMatch | None = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
early_stop_results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED_MB_FIRST:
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: str | None = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# Skip encodings similar to ones that already soft-failed (high mess ratio).
|
||||
# Checked BEFORE the expensive decode attempt.
|
||||
if encoding_iana in soft_failure_skip:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to a code page that was already considered unsuited. Continuing!",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# Skip encodings that were already fast-tracked from a similar successful encoding.
|
||||
if encoding_iana in success_fast_tracked:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: already fast-tracked from a similar successful encoding.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError): # Defensive:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# When we've already found a definitive match (chaos=0.0 with good coherence)
|
||||
# after testing the prioritized encodings, skip encodings that target
|
||||
# completely different language families. This avoids running expensive
|
||||
# mess_ratio + coherence_ratio on clearly unrelated candidates (e.g., Cyrillic
|
||||
# when the definitive match is Latin-based).
|
||||
if definitive_match_found:
|
||||
if not is_multi_byte_decoder:
|
||||
enc_languages = set(encoding_languages(encoding_iana))
|
||||
else:
|
||||
enc_languages = set(mb_encoding_languages(encoding_iana))
|
||||
if not enc_languages.intersection(definitive_target_languages):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: definitive match already found, this encoding targets different languages (%s vs %s).",
|
||||
encoding_iana,
|
||||
enc_languages,
|
||||
definitive_target_languages,
|
||||
)
|
||||
continue
|
||||
|
||||
# After the definitive match, cap the number of additional same-family
|
||||
# single-byte encodings that pass chaos probing. This avoids testing the
|
||||
# tail of rare, low-value same-family encodings (mac_iceland, cp860, etc.)
|
||||
# that almost never change best() but each cost ~1-2ms of mess_ratio + coherence.
|
||||
if (
|
||||
definitive_match_found
|
||||
and not is_multi_byte_decoder
|
||||
and post_definitive_sb_success_count >= POST_DEFINITIVE_SB_CAP
|
||||
):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: already accumulated %d same-family results after definitive match (cap=%d).",
|
||||
encoding_iana,
|
||||
post_definitive_sb_success_count,
|
||||
POST_DEFINITIVE_SB_CAP,
|
||||
)
|
||||
continue
|
||||
|
||||
# When a multibyte encoding with significant multibyte content has already
|
||||
# passed chaos probing, skip all single-byte encodings. They will either fail
|
||||
# chaos probing (wasting mess_ratio time) or produce inferior results.
|
||||
if mb_definitive_match_found and not is_multi_byte_decoder:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping single-byte %s: multi-byte definitive match already found.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
# Payload-hash deduplication: if another encoding already decoded to the
|
||||
# exact same string, reuse its mess_ratio and coherence results entirely.
|
||||
# This is strictly more general than the old IANA_SUPPORTED_SIMILAR approach
|
||||
# because it catches ALL identical decoding, not just pre-mapped ones.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_hash: int = hash(decoded_payload)
|
||||
cached = payload_result_cache.get(payload_hash)
|
||||
if cached is not None:
|
||||
cached_mess, cached_cd, cached_passed = cached
|
||||
if cached_passed:
|
||||
# The previous encoding with identical output passed chaos probing.
|
||||
fast_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
cached_mess,
|
||||
bom_or_sig_available,
|
||||
cached_cd,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana
|
||||
in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
results.append(fast_match)
|
||||
success_fast_tracked.add(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).",
|
||||
encoding_iana,
|
||||
round(cached_mess * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and cached_mess < 0.1
|
||||
):
|
||||
if cached_mess == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
fast_match.encoding,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([fast_match])
|
||||
early_stop_results.append(fast_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
continue
|
||||
else:
|
||||
# The previous encoding with identical output failed chaos probing.
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).",
|
||||
encoding_iana,
|
||||
)
|
||||
# Prepare fallbacks for special encodings even when skipped.
|
||||
if enable_fallback and encoding_iana in [
|
||||
"ascii",
|
||||
"utf_8",
|
||||
specified_encoding,
|
||||
"utf_16",
|
||||
"utf_32",
|
||||
]:
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
bom_or_sig_available,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: list[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(
|
||||
mess_ratio(
|
||||
chunk,
|
||||
threshold,
|
||||
explain is True and 1 <= len(cp_isolation) <= 2,
|
||||
)
|
||||
)
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except (
|
||||
UnicodeDecodeError
|
||||
) as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if the initial MD tests pass
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
if encoding_iana in IANA_SUPPORTED_SIMILAR:
|
||||
soft_failure_skip.update(IANA_SUPPORTED_SIMILAR[encoding_iana])
|
||||
# Cache this soft-failure so identical decoding from other encodings
|
||||
# can be skipped immediately.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_result_cache.setdefault(
|
||||
hash(decoded_payload), (mean_mess_ratio, [], False)
|
||||
)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
enable_fallback
|
||||
and encoding_iana
|
||||
in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
bom_or_sig_available,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: list[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# Run coherence detection on all chunks. We previously tried limiting to
|
||||
# 1-2 chunks for post-definitive encodings to save time, but this caused
|
||||
# coverage regressions by producing unrepresentative coherence scores.
|
||||
# The SB cap and language-family skip optimizations provide sufficient
|
||||
# speedup without sacrificing coherence accuracy.
|
||||
if encoding_iana != "ascii":
|
||||
# We shall skip the CD when it's about ASCII
|
||||
# Most of the time it's not relevant to run "language-detection" on it.
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk,
|
||||
language_threshold,
|
||||
",".join(target_languages) if target_languages else None,
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
else:
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
current_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
|
||||
results.append(current_match)
|
||||
|
||||
# Cache the successful result for payload-hash deduplication.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_result_cache.setdefault(
|
||||
hash(decoded_payload),
|
||||
(mean_mess_ratio, cd_ratios_merged, True),
|
||||
)
|
||||
|
||||
# Count post-definitive same-family SB successes for the early termination cap.
|
||||
# Only count low-mess encodings (< 2%) toward the cap. High-mess encodings are
|
||||
# marginal results that shouldn't prevent better-quality candidates from being
|
||||
# tested. For example, iso8859_4 (mess=0%) should not be skipped just because
|
||||
# 7 high-mess Latin encodings (cp1252 at 8%, etc.) were tried first.
|
||||
if (
|
||||
definitive_match_found
|
||||
and not is_multi_byte_decoder
|
||||
and mean_mess_ratio < 0.02
|
||||
):
|
||||
post_definitive_sb_success_count += 1
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
# If md says nothing to worry about, then... stop immediately!
|
||||
if mean_mess_ratio == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
current_match.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([current_match])
|
||||
|
||||
early_stop_results.append(current_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding, # type: ignore[union-attr]
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
# Once we find a result with good coherence (>= 0.5) after testing the
|
||||
# prioritized encodings (ascii, utf_8), activate "definitive mode": skip
|
||||
# encodings that target completely different language families. This avoids
|
||||
# running expensive mess_ratio + coherence_ratio on clearly unrelated
|
||||
# candidates (e.g., Cyrillic encodings when the match is Latin-based).
|
||||
# We require coherence >= 0.5 to avoid false positives (e.g., cp1251 decoding
|
||||
# Hebrew text with 0.0 chaos but wrong language detection at coherence 0.33).
|
||||
if not definitive_match_found and not is_multi_byte_decoder:
|
||||
best_coherence = (
|
||||
max((v for _, v in cd_ratios_merged), default=0.0)
|
||||
if cd_ratios_merged
|
||||
else 0.0
|
||||
)
|
||||
if best_coherence >= 0.5 and "ascii" in tested and "utf_8" in tested:
|
||||
definitive_match_found = True
|
||||
definitive_target_languages.update(target_languages)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Definitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.",
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
best_coherence,
|
||||
)
|
||||
|
||||
# When a non-UTF multibyte encoding passes chaos probing with significant
|
||||
# multibyte content (decoded < 98% of raw), activate mb_definitive_match.
|
||||
# This skips all remaining single-byte encodings which would either soft-fail
|
||||
# (running expensive mess_ratio for nothing) or produce inferior results.
|
||||
if (
|
||||
not mb_definitive_match_found
|
||||
and is_multi_byte_decoder
|
||||
and multi_byte_bonus
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length * 0.98
|
||||
and encoding_iana
|
||||
not in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_be",
|
||||
"utf_32_le",
|
||||
"utf_7",
|
||||
}
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
mb_definitive_match_found = True
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Multi-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.",
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
len(decoded_payload),
|
||||
length,
|
||||
len(decoded_payload) / length * 100,
|
||||
)
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same as the function from_bytes, but using a file pointer that is already opened.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: str | bytes | PathLike, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same as the function from_bytes, but with one extra step: opening and reading the given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def is_binary(
|
||||
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
|
||||
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
|
||||
are disabled, in order to be stricter about content that is ASCII-compatible but unlikely to be text.
|
||||
"""
|
||||
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
||||
guesses = from_path(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
elif isinstance(
|
||||
fp_or_path_or_payload,
|
||||
(
|
||||
bytes,
|
||||
bytearray,
|
||||
),
|
||||
):
|
||||
guesses = from_bytes(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
else:
|
||||
guesses = from_fp(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
|
||||
return not guesses
|
||||
Binary file not shown.
454
.venv/lib/python3.10/site-packages/charset_normalizer/cd.py
Normal file
|
|
@@ -0,0 +1,454 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter
|
||||
|
||||
from .constant import (
|
||||
FREQUENCIES,
|
||||
KO_NAMES,
|
||||
LANGUAGE_SUPPORTED_COUNT,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
ZH_NAMES,
|
||||
_FREQUENCIES_SET,
|
||||
_FREQUENCIES_RANK,
|
||||
)
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Return the unicode ranges associated with a single-byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise OSError( # Defensive:
|
||||
"Function not supported on multi-byte code page"
|
||||
)
|
||||
|
||||
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: str | None = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> list[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: list[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: str | None = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> tuple[bool, bool]:
|
||||
"""
|
||||
Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: list[str], ignore_non_latin: bool = False
|
||||
) -> list[str]:
|
||||
"""
|
||||
Return the languages associated with the given characters.
|
||||
"""
|
||||
languages: list[tuple[str, float]] = []
|
||||
|
||||
characters_set: frozenset[str] = frozenset(characters)
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(_FREQUENCIES_SET[language] & characters_set)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: list[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine if an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
|
||||
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
||||
Beware that this function is not strict on the match, in order to ease the detection. (Meaning a close match counts as 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError(f"{language} not available") # Defensive:
|
||||
|
||||
character_approved_count: int = 0
|
||||
frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
|
||||
lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]
|
||||
|
||||
ordered_characters_count: int = len(ordered_characters)
|
||||
target_language_characters_count: int = len(FREQUENCIES[language])
|
||||
|
||||
large_alphabet: bool = target_language_characters_count > 26
|
||||
|
||||
expected_projection_ratio: float = (
|
||||
target_language_characters_count / ordered_characters_count
|
||||
)
|
||||
|
||||
# Pre-built rank dict for ordered_characters (avoids repeated list slicing).
|
||||
ordered_rank: dict[str, int] = {
|
||||
char: rank for rank, char in enumerate(ordered_characters)
|
||||
}
|
||||
|
||||
# Pre-compute characters common to both orderings.
|
||||
# Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
|
||||
common_chars: list[tuple[int, int]] = [
|
||||
(lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
|
||||
]
|
||||
|
||||
# Pre-extract lr and orr arrays for faster iteration in the inner loop.
|
||||
# Plain integer loops with local arrays are much faster under mypyc than
|
||||
# generator expression sums over a list of tuples.
|
||||
common_count: int = len(common_chars)
|
||||
common_lr: list[int] = [p[0] for p in common_chars]
|
||||
common_orr: list[int] = [p[1] for p in common_chars]
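# Illustrative example (hypothetical ranks): if lang_rank maps {"e": 0, "a": 1, "t": 2}
# and ordered_rank maps {"a": 0, "e": 1, "t": 2}, then common_chars is
# [(0, 1), (1, 0), (2, 2)] and common_lr / common_orr are its two unzipped columns.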
|
||||
|
||||
for character, character_rank in zip(
|
||||
ordered_characters, range(0, ordered_characters_count)
|
||||
):
|
||||
if character not in frequencies_language_set:
|
||||
continue
|
||||
|
||||
character_rank_in_language: int = lang_rank[character]
|
||||
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
||||
|
||||
if (
|
||||
large_alphabet is False
|
||||
and abs(character_rank_projection - character_rank_in_language) > 4
|
||||
):
|
||||
continue
|
||||
|
||||
if (
|
||||
large_alphabet is True
|
||||
and abs(character_rank_projection - character_rank_in_language)
|
||||
< target_language_characters_count / 3
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
# Count how many characters appear "before" in both orderings,
|
||||
# and how many appear "at or after" in both orderings.
|
||||
# Single pass over pre-extracted arrays — much faster under mypyc
|
||||
# than two generator expression sums.
|
||||
before_match_count: int = 0
|
||||
after_match_count: int = 0
|
||||
for i in range(common_count):
|
||||
lr_i: int = common_lr[i]
|
||||
orr_i: int = common_orr[i]
|
||||
if lr_i < character_rank_in_language:
|
||||
if orr_i < character_rank:
|
||||
before_match_count += 1
|
||||
else:
|
||||
if orr_i >= character_rank:
|
||||
after_match_count += 1
|
||||
|
||||
after_len: int = target_language_characters_count - character_rank_in_language
|
||||
|
||||
if character_rank_in_language == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if after_len == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
character_rank_in_language > 0
|
||||
and before_match_count / character_rank_in_language >= 0.4
|
||||
) or (after_len > 0 and after_match_count / after_len >= 0.4):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str, split by unicode range / alphabet.
|
||||
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
|
||||
one containing the Latin letters and the other the Hebrew ones.
|
||||
"""
|
||||
layers: dict[str, list[str]] = {}
|
||||
|
||||
# Fast path: track single-layer key to skip dict iteration for single-script text.
|
||||
single_layer_key: str | None = None
|
||||
multi_layer: bool = False
|
||||
|
||||
# Cache the last character_range and its resolved layer to avoid repeated
|
||||
# is_suspiciously_successive_range calls for consecutive same-range chars.
|
||||
prev_character_range: str | None = None
|
||||
prev_layer_target: str | None = None
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
# ASCII fast-path: a-z and A-Z are always "Basic Latin".
|
||||
# Avoids unicode_range() function call overhead for the most common case.
|
||||
character_ord: int = ord(character)
|
||||
if character_ord < 128:
|
||||
character_range: str | None = "Basic Latin"
|
||||
else:
|
||||
character_range = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
# Fast path: same range as previous character → reuse cached layer target.
|
||||
if character_range == prev_character_range:
|
||||
if prev_layer_target is not None:
|
||||
layers[prev_layer_target].append(character)
|
||||
continue
|
||||
|
||||
layer_target_range: str | None = None
|
||||
|
||||
if multi_layer:
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
elif single_layer_key is not None:
|
||||
if (
|
||||
is_suspiciously_successive_range(single_layer_key, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = single_layer_key
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = []
|
||||
if single_layer_key is None:
|
||||
single_layer_key = layer_target_range
|
||||
else:
|
||||
multi_layer = True
|
||||
|
||||
layers[layer_target_range].append(character)
|
||||
|
||||
# Cache for next iteration
|
||||
prev_character_range = character_range
|
||||
prev_layer_target = layer_target_range
|
||||
|
||||
return ["".join(chars).lower() for chars in layers.values()]
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merges results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: dict[str, list[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
||||
"""
|
||||
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
||||
of "English". This function only keeps the best match and remove the em-dash in it.
|
||||
"""
|
||||
index_results: dict[str, list[float]] = dict()
|
||||
|
||||
for result in results:
|
||||
language, ratio = result
|
||||
no_em_name: str = language.replace("—", "")
|
||||
|
||||
if no_em_name not in index_results:
|
||||
index_results[no_em_name] = []
|
||||
|
||||
index_results[no_em_name].append(ratio)
|
||||
|
||||
if any(len(index_results[e]) > 1 for e in index_results):
|
||||
filtered_results: CoherenceMatches = []
|
||||
|
||||
for language in index_results:
|
||||
filtered_results.append((language, max(index_results[language])))
|
||||
|
||||
return filtered_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
|
||||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results: list[tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = len(layer)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: list[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(
|
||||
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
||||
)
|
||||
|
|
@@ -0,0 +1,8 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .__main__ import cli_detect, query_yes_no
|
||||
|
||||
__all__ = (
|
||||
"cli_detect",
|
||||
"query_yes_no",
|
||||
)
|
||||
|
|
@@ -0,0 +1,362 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import typing
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool: # Defensive:
|
||||
"""Ask a yes/no question via input() and return the answer as a bool."""
|
||||
prompt = " [Y/n] " if default == "yes" else " [y/N] "
|
||||
|
||||
while True:
|
||||
choice = input(question + prompt).strip().lower()
|
||||
if not choice:
|
||||
return default == "yes"
|
||||
if choice in ("y", "yes"):
|
||||
return True
|
||||
if choice in ("n", "no"):
|
||||
return False
|
||||
print("Please respond with 'y' or 'n'.")
|
||||
|
||||
|
||||
class FileType:
|
||||
"""Factory for creating file object types
|
||||
|
||||
Instances of FileType are typically passed as type= arguments to the
|
||||
ArgumentParser add_argument() method.
|
||||
|
||||
Keyword Arguments:
|
||||
- mode -- A string indicating how the file is to be opened. Accepts the
|
||||
same values as the builtin open() function.
|
||||
- bufsize -- The file's desired buffer size. Accepts the same values as
|
||||
the builtin open() function.
|
||||
- encoding -- The file's encoding. Accepts the same values as the
|
||||
builtin open() function.
|
||||
- errors -- A string indicating how encoding and decoding errors are to
|
||||
be handled. Accepts the same value as the builtin open() function.
|
||||
|
||||
Backported from CPython 3.12
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "r",
|
||||
bufsize: int = -1,
|
||||
encoding: str | None = None,
|
||||
errors: str | None = None,
|
||||
):
|
||||
self._mode = mode
|
||||
self._bufsize = bufsize
|
||||
self._encoding = encoding
|
||||
self._errors = errors
|
||||
|
||||
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
|
||||
# the special argument "-" means sys.std{in,out}
|
||||
if string == "-":
|
||||
if "r" in self._mode:
|
||||
return sys.stdin.buffer if "b" in self._mode else sys.stdin
|
||||
elif any(c in self._mode for c in "wax"):
|
||||
return sys.stdout.buffer if "b" in self._mode else sys.stdout
|
||||
else:
|
||||
msg = f'argument "-" with mode {self._mode}'
|
||||
raise ValueError(msg)
|
||||
|
||||
# all other arguments are used as file names
|
||||
try:
|
||||
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
|
||||
except OSError as e:
|
||||
message = f"can't open '{string}': {e}"
|
||||
raise argparse.ArgumentTypeError(message)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
args = self._mode, self._bufsize
|
||||
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
|
||||
args_str = ", ".join(
|
||||
[repr(arg) for arg in args if arg != -1]
|
||||
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
|
||||
)
|
||||
return f"{type(self).__name__}({args_str})"
|
||||
|
||||
|
||||
def cli_detect(argv: list[str] | None = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else signals trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--no-preemptive",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="no_preemptive",
|
||||
help="Disable looking at a charset declaration to hint the detector.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
||||
__version__,
|
||||
python_version(),
|
||||
unidata_version,
|
||||
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
matches = from_fp(
|
||||
my_file,
|
||||
threshold=args.threshold,
|
||||
explain=args.verbose,
|
||||
preemptive_behaviour=args.no_preemptive is False,
|
||||
)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
(
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else ""
|
||||
),
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
cli_result = CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
x_.append(cli_result)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
dir_path = dirname(realpath(my_file.name))
|
||||
file_name = basename(realpath(my_file.name))
|
||||
|
||||
o_: list[str] = file_name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
cli_result.unicode_path = join(dir_path, ".".join(o_))
|
||||
|
||||
with open(cli_result.unicode_path, "wb") as fp:
|
||||
fp.write(best_guess.output())
|
||||
except OSError as e: # Defensive:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__": # Defensive:
|
||||
cli_detect()
|
||||
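A minimal sketch of driving cli_detect() above programmatically instead of through the installed console script (typically exposed as `normalizer`); "sample.txt" is a placeholder path and must point to an existing readable file, otherwise argparse rejects the argument.

from charset_normalizer.cli import cli_detect

# Roughly equivalent to running `normalizer --minimal sample.txt` from a shell:
# prints only the detected charset name and returns 0 when detection succeeded.
exit_code = cli_detect(["--minimal", "sample.txt"])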
Binary file not shown.
Binary file not shown.
2050
.venv/lib/python3.10/site-packages/charset_normalizer/constant.py
Normal file
File diff suppressed because it is too large
|
|
@@ -0,0 +1,79 @@
|
|||
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE

if TYPE_CHECKING:
    from typing import TypedDict

    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None


def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    """
    if len(kwargs):
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence is not None
        and confidence >= 0.9
        and encoding
        not in {
            "utf_8",
            "ascii",
        }
        and r.bom is False  # type: ignore[union-attr]
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
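A minimal usage sketch for the legacy detect() shim above; the sample bytes are illustrative and the printed values depend on the detection heuristics, so they are not guaranteed output.

from charset_normalizer import detect

result = detect("Où est la bibliothèque ?".encode("cp1252"))
# A chardet-style dict with "encoding", "language" and "confidence" keys.
print(result["encoding"], result["language"], result["confidence"])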
Binary file not shown.
936
.venv/lib/python3.10/site-packages/charset_normalizer/md.py
Normal file
|
|
@@ -0,0 +1,936 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
from logging import getLogger
|
||||
|
||||
if sys.version_info >= (3, 8):
|
||||
from typing import final
|
||||
else:
|
||||
try:
|
||||
from typing_extensions import final
|
||||
except ImportError:
|
||||
|
||||
def final(cls): # type: ignore[misc,no-untyped-def]
|
||||
return cls
|
||||
|
||||
|
||||
from .constant import (
|
||||
COMMON_CJK_CHARACTERS,
|
||||
COMMON_SAFE_ASCII_CHARACTERS,
|
||||
TRACE,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
_ACCENTUATED,
|
||||
_ARABIC,
|
||||
_ARABIC_ISOLATED_FORM,
|
||||
_CJK,
|
||||
_HANGUL,
|
||||
_HIRAGANA,
|
||||
_KATAKANA,
|
||||
_LATIN,
|
||||
_THAI,
|
||||
)
|
||||
from .utils import (
|
||||
_character_flags,
|
||||
is_emoticon,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
# Combined bitmask for CJK/Hangul/Katakana/Hiragana/Thai glyph detection.
|
||||
_GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI
|
||||
|
||||
|
||||
@final
|
||||
class CharInfo:
|
||||
"""Pre-computed character properties shared across all detectors.
|
||||
|
||||
Instantiated once and reused via :meth:`update` on every character
|
||||
in the hot loop so that redundant calls to str methods
|
||||
(``isalpha``, ``isupper``, …) and cached utility functions
|
||||
(``_character_flags``, ``is_punctuation``, …) are avoided when
|
||||
several plugins need the same information.
|
||||
"""
|
||||
|
||||
__slots__ = (
|
||||
"character",
|
||||
"printable",
|
||||
"alpha",
|
||||
"upper",
|
||||
"lower",
|
||||
"space",
|
||||
"digit",
|
||||
"is_ascii",
|
||||
"case_variable",
|
||||
"flags",
|
||||
"accentuated",
|
||||
"latin",
|
||||
"is_cjk",
|
||||
"is_arabic",
|
||||
"is_glyph",
|
||||
"punct",
|
||||
"sym",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.character: str = ""
|
||||
self.printable: bool = False
|
||||
self.alpha: bool = False
|
||||
self.upper: bool = False
|
||||
self.lower: bool = False
|
||||
self.space: bool = False
|
||||
self.digit: bool = False
|
||||
self.is_ascii: bool = False
|
||||
self.case_variable: bool = False
|
||||
self.flags: int = 0
|
||||
self.accentuated: bool = False
|
||||
self.latin: bool = False
|
||||
self.is_cjk: bool = False
|
||||
self.is_arabic: bool = False
|
||||
self.is_glyph: bool = False
|
||||
self.punct: bool = False
|
||||
self.sym: bool = False
|
||||
|
||||
def update(self, character: str) -> None:
|
||||
"""Update all properties for *character* (called once per character)."""
|
||||
self.character = character
|
||||
|
||||
# ASCII fast-path: for characters with ord < 128, we can skip
|
||||
# _character_flags() entirely and derive most properties from ord.
|
||||
o: int = ord(character)
|
||||
if o < 128:
|
||||
self.is_ascii = True
|
||||
self.accentuated = False
|
||||
self.is_cjk = False
|
||||
self.is_arabic = False
|
||||
self.is_glyph = False
|
||||
# ASCII alpha: a-z (97-122) or A-Z (65-90)
|
||||
if 65 <= o <= 90:
|
||||
# Uppercase ASCII letter
|
||||
self.alpha = True
|
||||
self.upper = True
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.printable = True
|
||||
self.case_variable = True
|
||||
self.flags = _LATIN
|
||||
self.latin = True
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif 97 <= o <= 122:
|
||||
# Lowercase ASCII letter
|
||||
self.alpha = True
|
||||
self.upper = False
|
||||
self.lower = True
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.printable = True
|
||||
self.case_variable = True
|
||||
self.flags = _LATIN
|
||||
self.latin = True
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif 48 <= o <= 57:
|
||||
# ASCII digit 0-9
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = True
|
||||
self.printable = True
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif o == 32 or (9 <= o <= 13):
|
||||
# Space, tab, newline, etc.
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = True
|
||||
self.digit = False
|
||||
self.printable = o == 32
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
else:
|
||||
# Other ASCII (punctuation, symbols, control chars)
|
||||
self.printable = character.isprintable()
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = is_punctuation(character) if self.printable else False
|
||||
self.sym = is_symbol(character) if self.printable else False
|
||||
else:
|
||||
# Non-ASCII path
|
||||
self.is_ascii = False
|
||||
self.printable = character.isprintable()
|
||||
self.alpha = character.isalpha()
|
||||
self.upper = character.isupper()
|
||||
self.lower = character.islower()
|
||||
self.space = character.isspace()
|
||||
self.digit = character.isdigit()
|
||||
self.case_variable = self.lower != self.upper
|
||||
|
||||
# Flag-based classification (single unicodedata.name() call, lru-cached)
|
||||
flags: int
|
||||
if self.alpha:
|
||||
flags = _character_flags(character)
|
||||
else:
|
||||
flags = 0
|
||||
self.flags = flags
|
||||
self.accentuated = bool(flags & _ACCENTUATED)
|
||||
self.latin = bool(flags & _LATIN)
|
||||
self.is_cjk = bool(flags & _CJK)
|
||||
self.is_arabic = bool(flags & _ARABIC)
|
||||
self.is_glyph = bool(flags & _GLYPH_MASK)
|
||||
|
||||
# Eagerly compute punct and sym (avoids property dispatch overhead
|
||||
# on 300K+ accesses in the hot loop).
|
||||
self.punct = is_punctuation(character) if self.printable else False
|
||||
self.sym = is_symbol(character) if self.printable else False
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
__slots__ = ()
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""
|
||||
The main routine to be executed upon character.
|
||||
Insert the logic by which the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # Defensive:
|
||||
|
||||
def reset(self) -> None: # Defensive:
|
||||
"""
|
||||
Permit to reset the plugin to the initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
|
||||
Must NOT be lower than 0.; No restriction gt 0.
|
||||
"""
|
||||
raise NotImplementedError # Defensive:
|
||||
|
||||
|
||||
@final
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_punctuation_count",
|
||||
"_symbol_count",
|
||||
"_character_count",
|
||||
"_last_printable_char",
|
||||
"_frenzy_symbol_in_word",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: str | None = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if info.punct:
|
||||
self._punctuation_count += 1
|
||||
elif not info.digit and info.sym and not is_emoticon(character):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_character_count", "_accentuated_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.accentuated:
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_unprintable_count", "_character_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
if (
|
||||
not info.space
|
||||
and not info.printable
|
||||
and character != "\x1a"
|
||||
and character != "\ufeff"
|
||||
):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0: # Defensive:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_successive_count",
|
||||
"_character_count",
|
||||
"_last_latin_character",
|
||||
"_last_was_accentuated",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: str | None = None
|
||||
self._last_was_accentuated: bool = False
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and info.accentuated
|
||||
and self._last_was_accentuated
|
||||
):
|
||||
if info.upper and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
self._last_was_accentuated = info.accentuated
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
self._last_was_accentuated = False
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_suspicious_successive_range_count",
|
||||
"_character_count",
|
||||
"_last_printable_seen",
|
||||
"_last_printable_range",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: str | None = None
|
||||
self._last_printable_range: str | None = None
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
|
||||
self._last_printable_seen = None
|
||||
self._last_printable_range = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
self._last_printable_range = unicode_range(character)
|
||||
return
|
||||
|
||||
unicode_range_a: str | None = self._last_printable_range
|
||||
unicode_range_b: str | None = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
self._last_printable_range = unicode_range_b
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
self._last_printable_range = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count <= 13:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
@final
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_word_count",
|
||||
"_bad_word_count",
|
||||
"_foreign_long_count",
|
||||
"_is_current_word_bad",
|
||||
"_foreign_long_watch",
|
||||
"_character_count",
|
||||
"_bad_character_count",
|
||||
"_buffer_length",
|
||||
"_buffer_last_char",
|
||||
"_buffer_last_char_accentuated",
|
||||
"_buffer_accent_count",
|
||||
"_buffer_glyph_count",
|
||||
"_buffer_upper_count",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer_length: int = 0
|
||||
self._buffer_last_char: str | None = None
|
||||
self._buffer_last_char_accentuated: bool = False
|
||||
self._buffer_accent_count: int = 0
|
||||
self._buffer_glyph_count: int = 0
|
||||
self._buffer_upper_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
if info.alpha:
|
||||
self._buffer_length += 1
|
||||
self._buffer_last_char = character
|
||||
|
||||
if info.upper:
|
||||
self._buffer_upper_count += 1
|
||||
|
||||
self._buffer_last_char_accentuated = info.accentuated
|
||||
|
||||
if info.accentuated:
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
not self._foreign_long_watch
|
||||
and (not info.latin or info.accentuated)
|
||||
and not info.is_glyph
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
if info.is_glyph:
|
||||
self._buffer_glyph_count += 1
|
||||
return
|
||||
if not self._buffer_length:
|
||||
return
|
||||
if info.space or info.punct or is_separator(character):
|
||||
self._word_count += 1
|
||||
buffer_length: int = self._buffer_length
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length >= 0.5:
|
||||
self._is_current_word_bad = True
|
||||
elif (
|
||||
self._buffer_last_char_accentuated
|
||||
and self._buffer_last_char.isupper() # type: ignore[union-attr]
|
||||
and self._buffer_upper_count != buffer_length
|
||||
):
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
elif self._buffer_glyph_count == 1:
|
||||
self._is_current_word_bad = True
|
||||
self._foreign_long_count += 1
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
probable_camel_cased: bool = (
|
||||
self._buffer_upper_count > 0
|
||||
and self._buffer_upper_count / buffer_length <= 0.3
|
||||
)
|
||||
|
||||
if not probable_camel_cased:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += buffer_length
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer_length = 0
|
||||
self._buffer_last_char = None
|
||||
self._buffer_last_char_accentuated = False
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
self._buffer_upper_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and not info.digit
|
||||
and info.sym
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer_length += 1
|
||||
self._buffer_last_char = character
|
||||
self._buffer_last_char_accentuated = False
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._buffer_length = 0
|
||||
self._buffer_last_char = None
|
||||
self._buffer_last_char_accentuated = False
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
self._buffer_upper_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class CjkUncommonPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
Detect messy CJK text that probably means nothing.
|
||||
"""
|
||||
|
||||
__slots__ = ("_character_count", "_uncommon_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._uncommon_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if character not in COMMON_CJK_CHARACTERS:
|
||||
self._uncommon_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._uncommon_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
uncommon_form_usage: float = self._uncommon_count / self._character_count
|
||||
|
||||
# we can be pretty sure it's garbage when uncommon characters are widely
|
||||
# used. otherwise it could just be traditional chinese for example.
|
||||
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_buf",
|
||||
"_character_count_since_last_sep",
|
||||
"_successive_upper_lower_count",
|
||||
"_successive_upper_lower_count_final",
|
||||
"_character_count",
|
||||
"_last_alpha_seen",
|
||||
"_last_alpha_seen_upper",
|
||||
"_last_alpha_seen_lower",
|
||||
"_current_ascii_only",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: str | None = None
|
||||
self._last_alpha_seen_upper: bool = False
|
||||
self._last_alpha_seen_lower: bool = False
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
is_concerned: bool = info.alpha and info.case_variable
|
||||
chunk_sep: bool = not is_concerned
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and not info.digit
|
||||
and not self._current_ascii_only
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only and not info.is_ascii:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (info.upper and self._last_alpha_seen_lower) or (
|
||||
info.lower and self._last_alpha_seen_upper
|
||||
):
|
||||
if self._buf:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
self._last_alpha_seen_upper = info.upper
|
||||
self._last_alpha_seen_lower = info.lower
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._last_alpha_seen_upper = False
|
||||
self._last_alpha_seen_lower = False
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0: # Defensive:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_character_count", "_isolated_form_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._isolated_form_count: int = 0
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._isolated_form_count = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.flags & _ARABIC_ISOLATED_FORM:
|
||||
self._isolated_form_count += 1
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
||||
|
||||
return isolated_form_usage
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: str | None, unicode_range_b: str | None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode ranges seen next to each other can be considered as suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = (
|
||||
unicode_range_a.split(" "),
|
||||
unicode_range_b.split(" "),
|
||||
)
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
|
||||
"""
|
||||
|
||||
seq_len: int = len(decoded_sequence)
|
||||
|
||||
if seq_len < 511:
|
||||
step: int = 32
|
||||
elif seq_len < 1024:
|
||||
step = 64
|
||||
else:
|
||||
step = 128
|
||||
|
||||
# Create each detector as a named local variable (unrolled from the generic loop).
|
||||
# This eliminates per-character iteration over the detector list and
|
||||
# per-character eligible() virtual dispatch, while keeping every plugin class
|
||||
# intact and fully readable.
|
||||
d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin()
|
||||
d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin()
|
||||
d_up: UnprintablePlugin = UnprintablePlugin()
|
||||
d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin()
|
||||
d_sr: SuspiciousRange = SuspiciousRange()
|
||||
d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin()
|
||||
d_cu: CjkUncommonPlugin = CjkUncommonPlugin()
|
||||
d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin()
|
||||
d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin()
|
||||
|
||||
# Local references for feed_info methods called in the hot loop.
|
||||
d_sp_feed = d_sp.feed_info
|
||||
d_ta_feed = d_ta.feed_info
|
||||
d_up_feed = d_up.feed_info
|
||||
d_sda_feed = d_sda.feed_info
|
||||
d_sr_feed = d_sr.feed_info
|
||||
d_sw_feed = d_sw.feed_info
|
||||
d_cu_feed = d_cu.feed_info
|
||||
d_au_feed = d_au.feed_info
|
||||
d_ai_feed = d_ai.feed_info
|
||||
|
||||
# Single reusable CharInfo object (avoids per-character allocation).
|
||||
info: CharInfo = CharInfo()
|
||||
info_update = info.update
|
||||
|
||||
mean_mess_ratio: float
|
||||
|
||||
for block_start in range(0, seq_len, step):
|
||||
for character in decoded_sequence[block_start : block_start + step]:
|
||||
# Pre-compute all character properties once (shared across all plugins).
|
||||
info_update(character)
|
||||
|
||||
# Detectors with eligible() == always True
|
||||
d_up_feed(character, info)
|
||||
d_sw_feed(character, info)
|
||||
d_au_feed(character, info)
|
||||
|
||||
# Detectors with eligible() == isprintable
|
||||
if info.printable:
|
||||
d_sp_feed(character, info)
|
||||
d_sr_feed(character, info)
|
||||
|
||||
# Detectors with eligible() == isalpha
|
||||
if info.alpha:
|
||||
d_ta_feed(character, info)
|
||||
# SuspiciousDuplicateAccent: isalpha() and is_latin()
|
||||
if info.latin:
|
||||
d_sda_feed(character, info)
|
||||
# CjkUncommon: is_cjk()
|
||||
if info.is_cjk:
|
||||
d_cu_feed(character, info)
|
||||
# ArabicIsolatedForm: is_arabic()
|
||||
if info.is_arabic:
|
||||
d_ai_feed(character, info)
|
||||
|
||||
mean_mess_ratio = (
|
||||
d_sp.ratio
|
||||
+ d_ta.ratio
|
||||
+ d_up.ratio
|
||||
+ d_sda.ratio
|
||||
+ d_sr.ratio
|
||||
+ d_sw.ratio
|
||||
+ d_cu.ratio
|
||||
+ d_au.ratio
|
||||
+ d_ai.ratio
|
||||
)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
else:
|
||||
# Flush last word buffer in SuperWeirdWordPlugin via trailing newline.
|
||||
info_update("\n")
|
||||
d_sw_feed("\n", info)
|
||||
d_au_feed("\n", info)
|
||||
d_up_feed("\n", info)
|
||||
|
||||
mean_mess_ratio = (
|
||||
d_sp.ratio
|
||||
+ d_ta.ratio
|
||||
+ d_up.ratio
|
||||
+ d_sda.ratio
|
||||
+ d_sr.ratio
|
||||
+ d_sw.ratio
|
||||
+ d_cu.ratio
|
||||
+ d_au.ratio
|
||||
+ d_ai.ratio
|
||||
)
|
||||
|
||||
if debug: # Defensive:
|
||||
logger = getLogger("charset_normalizer")
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Mess-detector extended-analysis start. "
|
||||
f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
|
||||
f"maximum_threshold={maximum_threshold}",
|
||||
)
|
||||
|
||||
if seq_len > 16:
|
||||
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
||||
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
||||
|
||||
for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]:
|
||||
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
||||
|
||||
return round(mean_mess_ratio, 3)
|
||||
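A small illustrative probe of mess_ratio() defined above; the exact scores depend on the heuristics, so only the relative ordering of the two samples is expected, not any particular number.

from charset_normalizer.md import mess_ratio

clean_score = mess_ratio("A plain, readable English sentence with nothing unusual in it.")
noisy_score = mess_ratio("ÃƒÂ©Ã‚Â§Ã¢â‚¬ÂÃƒÂ¯Ã‚Â¿")  # typical mojibake from a wrong decode
print(clean_score, noisy_score)  # the mojibake sample should score noticeably higher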
359
.venv/lib/python3.10/site-packages/charset_normalizer/models.py
Normal file
|
|
@@ -0,0 +1,359 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from encodings.aliases import aliases
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Iterator, List, Tuple
|
||||
|
||||
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes | bytearray,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: CoherenceMatches,
|
||||
decoded_payload: str | None = None,
|
||||
preemptive_declaration: str | None = None,
|
||||
):
|
||||
self._payload: bytes | bytearray = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: list[str] | None = None
|
||||
|
||||
self._leaves: list[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: bytes | None = None
|
||||
self._output_encoding: str | None = None
|
||||
|
||||
self._string: str | None = decoded_payload
|
||||
|
||||
self._preemptive_declaration: str | None = preemptive_declaration
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
if isinstance(other, str):
|
||||
return iana_name(other) == self.encoding
|
||||
return False
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Below 0.5% difference --> Use Coherence
|
||||
if chaos_difference < 0.005 and coherence_difference > 0.02:
|
||||
return self.coherence > other.coherence
|
||||
elif chaos_difference < 0.005 and coherence_difference <= 0.02:
|
||||
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
||||
# preserve RAM usage!
|
||||
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
||||
return self.chaos < other.chaos
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - (len(str(self)) / len(self.raw))
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"
|
||||
|
||||
def add_submatch(self, other: CharsetMatch) -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> list[str]:
|
||||
"""
|
||||
Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as: list[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> list[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# Its either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes | bytearray:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> list[CharsetMatch]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> list[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> list[str]:
|
||||
"""
|
||||
The complete list of encodings that output the exact SAME str result and therefore could be the originating
|
||||
encoding.
|
||||
This list does include the encoding available in property 'encoding'.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
||||
Any errors will be simply ignored by the encoder NOT replaced.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
decoded_string = str(self)
|
||||
if (
|
||||
self._preemptive_declaration is not None
|
||||
and self._preemptive_declaration.lower()
|
||||
not in ["utf-8", "utf8", "utf_8"]
|
||||
):
|
||||
patched_header = sub(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
||||
m.groups()[0],
|
||||
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
||||
),
|
||||
decoded_string[:8192],
|
||||
count=1,
|
||||
)
|
||||
|
||||
decoded_string = patched_header + decoded_string[8192:]
|
||||
|
||||
self._output_payload = decoded_string.encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> int:
|
||||
"""
|
||||
Retrieve a hash fingerprint of the decoded payload, used for deduplication.
|
||||
"""
|
||||
return hash(str(self))
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
||||
Acts like a list(iterable) but does not implement all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: list[CharsetMatch] | None = None):
|
||||
self._results: list[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: int | str) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. Will be inserted accordingly to preserve sort.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) < TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: str | None,
|
||||
encoding_aliases: list[str],
|
||||
alternative_encodings: list[str],
|
||||
language: str,
|
||||
alphabets: list[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: str | None,
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: str | None = unicode_path
|
||||
self.encoding: str | None = encoding
|
||||
self.encoding_aliases: list[str] = encoding_aliases
|
||||
self.alternative_encodings: list[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: list[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
||||
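A short sketch of how the CharsetMatch / CharsetMatches model above is normally consumed; the sample text is illustrative.

from charset_normalizer import from_bytes

matches = from_bytes("Договор подписан вчера вечером.".encode("cp1251"))
best = matches.best()  # a CharsetMatch, or None when nothing plausible was found
if best is not None:
    print(best.encoding, best.language, best.percent_chaos, best.percent_coherence)
    utf8_payload = best.output()  # re-encode the best match as UTF-8 bytes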
422
.venv/lib/python3.10/site-packages/charset_normalizer/utils.py
Normal file
|
|
@@ -0,0 +1,422 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
import unicodedata
|
||||
from bisect import bisect_right
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator
|
||||
|
||||
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
COMMON_CJK_CHARACTERS,
|
||||
_LATIN,
|
||||
_CJK,
|
||||
_HANGUL,
|
||||
_KATAKANA,
|
||||
_HIRAGANA,
|
||||
_THAI,
|
||||
_ARABIC,
|
||||
_ARABIC_ISOLATED_FORM,
|
||||
_ACCENT_KEYWORDS,
|
||||
_ACCENTUATED,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def _character_flags(character: str) -> int:
|
||||
"""Compute all name-based classification flags with a single unicodedata.name() call."""
|
||||
try:
|
||||
desc: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
flags: int = 0
|
||||
|
||||
if "LATIN" in desc:
|
||||
flags |= _LATIN
|
||||
if "CJK" in desc:
|
||||
flags |= _CJK
|
||||
if "HANGUL" in desc:
|
||||
flags |= _HANGUL
|
||||
if "KATAKANA" in desc:
|
||||
flags |= _KATAKANA
|
||||
if "HIRAGANA" in desc:
|
||||
flags |= _HIRAGANA
|
||||
if "THAI" in desc:
|
||||
flags |= _THAI
|
||||
if "ARABIC" in desc:
|
||||
flags |= _ARABIC
|
||||
if "ISOLATED FORM" in desc:
|
||||
flags |= _ARABIC_ISOLATED_FORM
|
||||
|
||||
for kw in _ACCENT_KEYWORDS:
|
||||
if kw in desc:
|
||||
flags |= _ACCENTUATED
|
||||
break
|
||||
|
||||
return flags
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
return bool(_character_flags(character) & _ACCENTUATED)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: list[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
# Pre-built sorted lookup table for O(log n) binary search in unicode_range().
|
||||
# Each entry is (range_start, range_end_exclusive, range_name).
|
||||
_UNICODE_RANGES_SORTED: list[tuple[int, int, str]] = sorted(
|
||||
(ord_range.start, ord_range.stop, name)
|
||||
for name, ord_range in UNICODE_RANGES_COMBINED.items()
|
||||
)
|
||||
_UNICODE_RANGE_STARTS: list[int] = [e[0] for e in _UNICODE_RANGES_SORTED]
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> str | None:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
# Binary search: find the rightmost range whose start <= character_ord
|
||||
idx = bisect_right(_UNICODE_RANGE_STARTS, character_ord) - 1
|
||||
if idx >= 0:
|
||||
start, stop, name = _UNICODE_RANGES_SORTED[idx]
|
||||
if character_ord < stop:
|
||||
return name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    return bool(_character_flags(character) & _LATIN)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    return bool(_character_flags(character) & _CJK)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    return bool(_character_flags(character) & _HIRAGANA)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    return bool(_character_flags(character) & _KATAKANA)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    return bool(_character_flags(character) & _HANGUL)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    return bool(_character_flags(character) & _THAI)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    return bool(_character_flags(character) & _ARABIC)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    return bool(_character_flags(character) & _ARABIC_ISOLATED_FORM)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
    return character not in COMMON_CJK_CHARACTERS


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1a"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python,
        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )


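# Illustrative behaviour of a few of the cached classifiers above (a sketch;
# results follow CPython's unicodedata categories):
#
#   >>> is_punctuation(",")      # category 'Po'
#   True
#   >>> is_separator(" ")        # whitespace short-circuits
#   True
#   >>> is_case_variable("a")    # islower() != isupper()
#   True
#   >>> is_unprintable("\x00")   # not printable, not whitespace, not \x1a / \ufeff
#   True

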
def any_specified_encoding(
    sequence: bytes | bytearray, search_zone: int = 8192
) -> str | None:
    """
    Extract any declared encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, (bytes, bytearray)):
        raise TypeError

    seq_len: int = len(sequence)

    results: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None


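# Illustrative usage of any_specified_encoding() (a sketch; the declared label is
# resolved through Python's encodings.aliases table, so the returned name is the
# normalized codec name rather than the literal declaration):
#
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
#   'latin_1'
#   >>> any_specified_encoding(b'plain bytes with no declaration') is None
#   True

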
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


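# Illustrative usage of is_multi_byte_encoding() (a sketch; anything outside the
# hard-coded UTF set is resolved by importing its codec and inspecting the decoder class):
#
#   >>> is_multi_byte_encoding("utf_8")
#   True
#   >>> is_multi_byte_encoding("big5")     # CJK codec, MultibyteIncrementalDecoder subclass
#   True
#   >>> is_multi_byte_encoding("latin_1")  # single-byte codec
#   False

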
def identify_sig_or_bom(sequence: bytes | bytearray) -> tuple[str | None, bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


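# Illustrative usage of the BOM helpers (a sketch; the byte marks are taken from
# ENCODING_MARKS in constant.py):
#
#   >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")   # UTF-8 signature
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> identify_sig_or_bom(b"hello")
#   (None, b'')
#   >>> should_strip_sig_or_bom("utf_8")            # utf_16 / utf_32 keep their BOM
#   True

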
def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (not the official IANA name)."""
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")

    return cp_name


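# Illustrative usage of iana_name() (a sketch; resolution goes through Python's
# encodings.aliases mapping):
#
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin-1")
#   'latin_1'
#   >>> iana_name("not-a-codec", strict=False)
#   'not_a_codec'

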
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(256):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 256


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


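# Illustrative usage of the code-page similarity helpers (a sketch; exact ratios
# depend on the codecs shipped with the interpreter, so no precise value is claimed):
#
#   >>> cp_similarity("cp1252", "latin_1") > 0.8    # single-byte pages sharing most of 0x00-0xFF
#   True
#   >>> cp_similarity("utf_8", "latin_1")           # multi-byte input short-circuits
#   0.0
#   >>> is_cp_similar("cp1252", "latin_1")          # pre-computed lookup in IANA_SUPPORTED_SIMILAR
#   True

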
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes | bytearray,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk


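# Illustrative usage of cut_sequence_chunks() (a sketch; the caller in api.py supplies
# the offsets, chunk size, and BOM bookkeeping, so the values below are made up for
# demonstration only):
#
#   >>> payload = "hello world " * 10
#   >>> data = payload.encode("utf_8")
#   >>> chunks = cut_sequence_chunks(
#   ...     data, "utf_8", range(0, len(data), 32), 32,
#   ...     bom_or_sig_available=False, strip_sig_or_bom=False,
#   ...     sig_payload=b"", is_multi_byte_decoder=True,
#   ...     decoded_payload=payload,
#   ... )
#   >>> all(chunk in payload for chunk in chunks)
#   True
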
@ -0,0 +1,8 @@
"""
Expose version
"""

from __future__ import annotations

__version__ = "3.4.6"
VERSION = __version__.split(".")