Make SubFox production-ready with parallel translation and UI controls
This commit is contained in:
parent c40b8bed2b
commit 2b1d05f02c
6046 changed files with 798327 additions and 0 deletions
|
|
@@ -0,0 +1,48 @@
|
|||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, this package tries to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Other methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, is_binary
|
||||
from .legacy import detect
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"is_binary",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
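# Illustrative note (not part of the upstream module): the package is normally consumed
# through the names exported above. A minimal, hypothetical session could look like this,
# using from_bytes() for the modern API and detect() for the chardet-compatible legacy dict:
#
#     >>> from charset_normalizer import from_bytes, detect
#     >>> best = from_bytes("Déjà vu, naïveté".encode("cp1252")).best()
#     >>> best.encoding        # e.g. "cp1252" or an equivalent single-byte codec
#     >>> detect(b"hello")     # dict with 'encoding', 'language' and 'confidence' keys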
|
||||
|
|
@@ -0,0 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .cli import cli_detect
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
974
.venv/lib/python3.10/site-packages/charset_normalizer/api.py
Normal file
|
|
@@ -0,0 +1,974 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from os import PathLike
|
||||
from typing import BinaryIO
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import (
|
||||
IANA_SUPPORTED,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
TOO_BIG_SEQUENCE,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
TRACE,
|
||||
)
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
# Pre-compute a reordered encoding list: multibyte first, then single-byte.
|
||||
# This allows the mb_definitive_match optimization to fire earlier, skipping
|
||||
# all single-byte encodings for genuine CJK content. Multibyte codecs
|
||||
# hard-fail (UnicodeDecodeError) on single-byte data almost instantly, so
|
||||
# testing them first costs negligible time for non-CJK files.
|
||||
_mb_supported: list[str] = []
|
||||
_sb_supported: list[str] = []
|
||||
|
||||
for _supported_enc in IANA_SUPPORTED:
|
||||
try:
|
||||
if is_multi_byte_encoding(_supported_enc):
|
||||
_mb_supported.append(_supported_enc)
|
||||
else:
|
||||
_sb_supported.append(_supported_enc)
|
||||
except ImportError:
|
||||
_sb_supported.append(_supported_enc)
|
||||
|
||||
IANA_SUPPORTED_MB_FIRST: list[str] = _mb_supported + _sb_supported
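# Illustrative example (hypothetical ordering, not guaranteed): after the split above,
# the reordered list might begin with multibyte codecs such as ["big5", "cp932", "euc_jp",
# "gb18030", ...] followed by single-byte ones such as ["ascii", "cp1250", "cp1251", ...].
# Within each group, the order simply mirrors IANA_SUPPORTED.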
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes | bytearray,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
|
||||
If there are no results, it is a strong indicator that the source is binary, not text.
|
||||
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence.
|
||||
It will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
|
||||
but never takes it for granted. It can improve performance.
|
||||
|
||||
You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not set up any handler other than the NullHandler; if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: list[str] = []
|
||||
|
||||
specified_encoding: str | None = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: set[str] = set()
|
||||
tested_but_hard_failure: list[str] = []
|
||||
tested_but_soft_failure: list[str] = []
|
||||
soft_failure_skip: set[str] = set()
|
||||
success_fast_tracked: set[str] = set()
|
||||
|
||||
# Cache for decoded payload deduplication: hash(decoded_payload) -> (mean_mess_ratio, cd_ratios_merged, passed)
|
||||
# When multiple encodings decode to the exact same string, we can skip the expensive
|
||||
# mess_ratio and coherence_ratio analysis and reuse the results from the first encoding.
|
||||
payload_result_cache: dict[int, tuple[float, list[tuple[str, float]], bool]] = {}
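# Illustrative (hypothetical) scenario: if both cp1252 and iso8859_15 decode the same
# payload to the identical str, the second encoding hits this cache and reuses the first
# one's mess/coherence verdict instead of re-analysing the text.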
|
||||
|
||||
# When a definitive result (chaos=0.0 and good coherence) is found after testing
|
||||
# the prioritized encodings (ascii, utf_8), we can significantly reduce the remaining
|
||||
# work. Encodings that target completely different language families (e.g., Cyrillic
|
||||
# when the definitive match is Latin) are skipped entirely.
|
||||
# Additionally, for same-family encodings that pass chaos probing, we reuse the
|
||||
# definitive match's coherence ratios instead of recomputing them — a major savings
|
||||
# since coherence_ratio accounts for ~30% of total time on slow Latin files.
|
||||
definitive_match_found: bool = False
|
||||
definitive_target_languages: set[str] = set()
|
||||
# After the definitive match fires, we cap the number of additional same-family
|
||||
# single-byte encodings that pass chaos probing. Once we've accumulated enough
|
||||
# good candidates (N), further same-family SB encodings are unlikely to produce
|
||||
# a better best() result and just waste mess_ratio + coherence_ratio time.
|
||||
# The first encoding to trigger the definitive match is NOT counted (it's already in).
|
||||
post_definitive_sb_success_count: int = 0
|
||||
POST_DEFINITIVE_SB_CAP: int = 7
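# Illustrative walk-through (hypothetical): once e.g. cp1252 triggers the definitive match
# on a Latin text, up to 7 further low-mess Latin single-byte candidates (latin_1, cp1250,
# ...) may still be scored; later ones are skipped because they are very unlikely to beat
# the existing best().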
|
||||
|
||||
# When a non-UTF multibyte encoding passes chaos probing with significant multibyte
|
||||
# content (decoded length < 98% of raw length), skip all remaining single-byte encodings.
|
||||
# Rationale: multi-byte decoders (CJK) have strict byte-sequence validation — if they
|
||||
# decode without error AND pass chaos probing with substantial multibyte content, the
|
||||
# data is genuinely multibyte encoded. Single-byte encodings will always decode (every
|
||||
# byte maps to something) but waste time on mess_ratio before failing.
|
||||
# The 98% threshold prevents false triggers on files that happen to have a few valid
|
||||
# multibyte pairs (e.g., cp424/_ude_1.txt where big5 decodes with 99% ratio).
|
||||
mb_definitive_match_found: bool = False
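# Illustrative arithmetic (hypothetical numbers): a 10_000-byte EUC-JP payload typically
# decodes to roughly 6_000 characters (ratio 0.60 < 0.98), which triggers the skip; an
# ASCII-only file decodes 1:1 (ratio 1.0), so this guard never fires there.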
|
||||
|
||||
fallback_ascii: CharsetMatch | None = None
|
||||
fallback_u8: CharsetMatch | None = None
|
||||
fallback_specified: CharsetMatch | None = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
early_stop_results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED_MB_FIRST:
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: str | None = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# Skip encodings similar to ones that already soft-failed (high mess ratio).
|
||||
# Checked BEFORE the expensive decode attempt.
|
||||
if encoding_iana in soft_failure_skip:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to a code page that was already considered unsuited. Continuing!",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# Skip encodings that were already fast-tracked from a similar successful encoding.
|
||||
if encoding_iana in success_fast_tracked:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: already fast-tracked from a similar successful encoding.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError): # Defensive:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
# When we've already found a definitive match (chaos=0.0 with good coherence)
|
||||
# after testing the prioritized encodings, skip encodings that target
|
||||
# completely different language families. This avoids running expensive
|
||||
# mess_ratio + coherence_ratio on clearly unrelated candidates (e.g., Cyrillic
|
||||
# when the definitive match is Latin-based).
|
||||
if definitive_match_found:
|
||||
if not is_multi_byte_decoder:
|
||||
enc_languages = set(encoding_languages(encoding_iana))
|
||||
else:
|
||||
enc_languages = set(mb_encoding_languages(encoding_iana))
|
||||
if not enc_languages.intersection(definitive_target_languages):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: definitive match already found, this encoding targets different languages (%s vs %s).",
|
||||
encoding_iana,
|
||||
enc_languages,
|
||||
definitive_target_languages,
|
||||
)
|
||||
continue
|
||||
|
||||
# After the definitive match, cap the number of additional same-family
|
||||
# single-byte encodings that pass chaos probing. This avoids testing the
|
||||
# tail of rare, low-value same-family encodings (mac_iceland, cp860, etc.)
|
||||
# that almost never change best() but each cost ~1-2ms of mess_ratio + coherence.
|
||||
if (
|
||||
definitive_match_found
|
||||
and not is_multi_byte_decoder
|
||||
and post_definitive_sb_success_count >= POST_DEFINITIVE_SB_CAP
|
||||
):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping %s: already accumulated %d same-family results after definitive match (cap=%d).",
|
||||
encoding_iana,
|
||||
post_definitive_sb_success_count,
|
||||
POST_DEFINITIVE_SB_CAP,
|
||||
)
|
||||
continue
|
||||
|
||||
# When a multibyte encoding with significant multibyte content has already
|
||||
# passed chaos probing, skip all single-byte encodings. They will either fail
|
||||
# chaos probing (wasting mess_ratio time) or produce inferior results.
|
||||
if mb_definitive_match_found and not is_multi_byte_decoder:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Skipping single-byte %s: multi-byte definitive match already found.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
# Payload-hash deduplication: if another encoding already decoded to the
|
||||
# exact same string, reuse its mess_ratio and coherence results entirely.
|
||||
# This is strictly more general than the old IANA_SUPPORTED_SIMILAR approach
|
||||
# because it catches ALL identical decoding, not just pre-mapped ones.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_hash: int = hash(decoded_payload)
|
||||
cached = payload_result_cache.get(payload_hash)
|
||||
if cached is not None:
|
||||
cached_mess, cached_cd, cached_passed = cached
|
||||
if cached_passed:
|
||||
# The previous encoding with identical output passed chaos probing.
|
||||
fast_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
cached_mess,
|
||||
bom_or_sig_available,
|
||||
cached_cd,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana
|
||||
in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
results.append(fast_match)
|
||||
success_fast_tracked.add(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).",
|
||||
encoding_iana,
|
||||
round(cached_mess * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and cached_mess < 0.1
|
||||
):
|
||||
if cached_mess == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
fast_match.encoding,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([fast_match])
|
||||
early_stop_results.append(fast_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding,
|
||||
)
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
continue
|
||||
else:
|
||||
# The previous encoding with identical output failed chaos probing.
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).",
|
||||
encoding_iana,
|
||||
)
|
||||
# Prepare fallbacks for special encodings even when skipped.
|
||||
if enable_fallback and encoding_iana in [
|
||||
"ascii",
|
||||
"utf_8",
|
||||
specified_encoding,
|
||||
"utf_16",
|
||||
"utf_32",
|
||||
]:
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
bom_or_sig_available,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: list[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(
|
||||
mess_ratio(
|
||||
chunk,
|
||||
threshold,
|
||||
explain is True and 1 <= len(cp_isolation) <= 2,
|
||||
)
|
||||
)
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except (
|
||||
UnicodeDecodeError
|
||||
) as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if the initial MD tests pass
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
if encoding_iana in IANA_SUPPORTED_SIMILAR:
|
||||
soft_failure_skip.update(IANA_SUPPORTED_SIMILAR[encoding_iana])
|
||||
# Cache this soft-failure so identical decoding from other encodings
|
||||
# can be skipped immediately.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_result_cache.setdefault(
|
||||
hash(decoded_payload), (mean_mess_ratio, [], False)
|
||||
)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
enable_fallback
|
||||
and encoding_iana
|
||||
in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
bom_or_sig_available,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: list[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# Run coherence detection on all chunks. We previously tried limiting to
|
||||
# 1-2 chunks for post-definitive encodings to save time, but this caused
|
||||
# coverage regressions by producing unrepresentative coherence scores.
|
||||
# The SB cap and language-family skip optimizations provide sufficient
|
||||
# speedup without sacrificing coherence accuracy.
|
||||
if encoding_iana != "ascii":
|
||||
# We shall skip the CD when it's about ASCII
|
||||
# Most of the time it's not relevant to run "language-detection" on it.
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk,
|
||||
language_threshold,
|
||||
",".join(target_languages) if target_languages else None,
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
else:
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
current_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
|
||||
results.append(current_match)
|
||||
|
||||
# Cache the successful result for payload-hash deduplication.
|
||||
if decoded_payload is not None and not is_multi_byte_decoder:
|
||||
payload_result_cache.setdefault(
|
||||
hash(decoded_payload),
|
||||
(mean_mess_ratio, cd_ratios_merged, True),
|
||||
)
|
||||
|
||||
# Count post-definitive same-family SB successes for the early termination cap.
|
||||
# Only count low-mess encodings (< 2%) toward the cap. High-mess encodings are
|
||||
# marginal results that shouldn't prevent better-quality candidates from being
|
||||
# tested. For example, iso8859_4 (mess=0%) should not be skipped just because
|
||||
# 7 high-mess Latin encodings (cp1252 at 8%, etc.) were tried first.
|
||||
if (
|
||||
definitive_match_found
|
||||
and not is_multi_byte_decoder
|
||||
and mean_mess_ratio < 0.02
|
||||
):
|
||||
post_definitive_sb_success_count += 1
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
# If md says nothing to worry about, then... stop immediately!
|
||||
if mean_mess_ratio == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
current_match.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([current_match])
|
||||
|
||||
early_stop_results.append(current_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding, # type: ignore[union-attr]
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
# Once we find a result with good coherence (>= 0.5) after testing the
|
||||
# prioritized encodings (ascii, utf_8), activate "definitive mode": skip
|
||||
# encodings that target completely different language families. This avoids
|
||||
# running expensive mess_ratio + coherence_ratio on clearly unrelated
|
||||
# candidates (e.g., Cyrillic encodings when the match is Latin-based).
|
||||
# We require coherence >= 0.5 to avoid false positives (e.g., cp1251 decoding
|
||||
# Hebrew text with 0.0 chaos but wrong language detection at coherence 0.33).
|
||||
if not definitive_match_found and not is_multi_byte_decoder:
|
||||
best_coherence = (
|
||||
max((v for _, v in cd_ratios_merged), default=0.0)
|
||||
if cd_ratios_merged
|
||||
else 0.0
|
||||
)
|
||||
if best_coherence >= 0.5 and "ascii" in tested and "utf_8" in tested:
|
||||
definitive_match_found = True
|
||||
definitive_target_languages.update(target_languages)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Definitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.",
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
best_coherence,
|
||||
)
|
||||
|
||||
# When a non-UTF multibyte encoding passes chaos probing with significant
|
||||
# multibyte content (decoded < 98% of raw), activate mb_definitive_match.
|
||||
# This skips all remaining single-byte encodings which would either soft-fail
|
||||
# (running expensive mess_ratio for nothing) or produce inferior results.
|
||||
if (
|
||||
not mb_definitive_match_found
|
||||
and is_multi_byte_decoder
|
||||
and multi_byte_bonus
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length * 0.98
|
||||
and encoding_iana
|
||||
not in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_be",
|
||||
"utf_32_le",
|
||||
"utf_7",
|
||||
}
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
mb_definitive_match_found = True
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Multi-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.",
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
len(decoded_payload),
|
||||
length,
|
||||
len(decoded_payload) / length * 100,
|
||||
)
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same as the function from_bytes, but using a file pointer that is already opened.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: str | bytes | PathLike, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same as the function from_bytes, but with one extra step: opening and reading the given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def is_binary(
|
||||
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
|
||||
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
|
||||
are disabled, in order to be stricter about content that is ASCII-compatible but unlikely to be text.
|
||||
"""
|
||||
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
||||
guesses = from_path(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
elif isinstance(
|
||||
fp_or_path_or_payload,
|
||||
(
|
||||
bytes,
|
||||
bytearray,
|
||||
),
|
||||
):
|
||||
guesses = from_bytes(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
else:
|
||||
guesses = from_fp(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
|
||||
return not guesses
|
||||
Binary file not shown.
454
.venv/lib/python3.10/site-packages/charset_normalizer/cd.py
Normal file
|
|
@@ -0,0 +1,454 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter
|
||||
|
||||
from .constant import (
|
||||
FREQUENCIES,
|
||||
KO_NAMES,
|
||||
LANGUAGE_SUPPORTED_COUNT,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
ZH_NAMES,
|
||||
_FREQUENCIES_SET,
|
||||
_FREQUENCIES_RANK,
|
||||
)
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Return the unicode ranges associated with a single-byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise OSError( # Defensive:
|
||||
"Function not supported on multi-byte code page"
|
||||
)
|
||||
|
||||
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: str | None = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> list[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: list[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: str | None = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> tuple[bool, bool]:
|
||||
"""
|
||||
Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: list[str], ignore_non_latin: bool = False
|
||||
) -> list[str]:
|
||||
"""
|
||||
Return the languages associated with the given characters.
|
||||
"""
|
||||
languages: list[tuple[str, float]] = []
|
||||
|
||||
characters_set: frozenset[str] = frozenset(characters)
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(_FREQUENCIES_SET[language] & characters_set)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: list[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine if an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
|
||||
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
||||
Beware that this function is not strict on the match, in order to ease the detection. (Meaning a close match counts as 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError(f"{language} not available") # Defensive:
|
||||
|
||||
character_approved_count: int = 0
|
||||
frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
|
||||
lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]
|
||||
|
||||
ordered_characters_count: int = len(ordered_characters)
|
||||
target_language_characters_count: int = len(FREQUENCIES[language])
|
||||
|
||||
large_alphabet: bool = target_language_characters_count > 26
|
||||
|
||||
expected_projection_ratio: float = (
|
||||
target_language_characters_count / ordered_characters_count
|
||||
)
|
||||
|
||||
# Pre-built rank dict for ordered_characters (avoids repeated list slicing).
|
||||
ordered_rank: dict[str, int] = {
|
||||
char: rank for rank, char in enumerate(ordered_characters)
|
||||
}
|
||||
|
||||
# Pre-compute characters common to both orderings.
|
||||
# Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
|
||||
common_chars: list[tuple[int, int]] = [
|
||||
(lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
|
||||
]
|
||||
|
||||
# Pre-extract lr and orr arrays for faster iteration in the inner loop.
|
||||
# Plain integer loops with local arrays are much faster under mypyc than
|
||||
# generator expression sums over a list of tuples.
|
||||
common_count: int = len(common_chars)
|
||||
common_lr: list[int] = [p[0] for p in common_chars]
|
||||
common_orr: list[int] = [p[1] for p in common_chars]
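# Illustrative example (hypothetical ranks): if lang_rank maps {"e": 0, "a": 1, "t": 2}
# and ordered_rank maps {"a": 0, "e": 1, "t": 2}, then common_chars is
# [(0, 1), (1, 0), (2, 2)] and common_lr / common_orr are its two unzipped columns.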
|
||||
|
||||
for character, character_rank in zip(
|
||||
ordered_characters, range(0, ordered_characters_count)
|
||||
):
|
||||
if character not in frequencies_language_set:
|
||||
continue
|
||||
|
||||
character_rank_in_language: int = lang_rank[character]
|
||||
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
||||
|
||||
if (
|
||||
large_alphabet is False
|
||||
and abs(character_rank_projection - character_rank_in_language) > 4
|
||||
):
|
||||
continue
|
||||
|
||||
if (
|
||||
large_alphabet is True
|
||||
and abs(character_rank_projection - character_rank_in_language)
|
||||
< target_language_characters_count / 3
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
# Count how many characters appear "before" in both orderings,
|
||||
# and how many appear "at or after" in both orderings.
|
||||
# Single pass over pre-extracted arrays — much faster under mypyc
|
||||
# than two generator expression sums.
|
||||
before_match_count: int = 0
|
||||
after_match_count: int = 0
|
||||
for i in range(common_count):
|
||||
lr_i: int = common_lr[i]
|
||||
orr_i: int = common_orr[i]
|
||||
if lr_i < character_rank_in_language:
|
||||
if orr_i < character_rank:
|
||||
before_match_count += 1
|
||||
else:
|
||||
if orr_i >= character_rank:
|
||||
after_match_count += 1
|
||||
|
||||
after_len: int = target_language_characters_count - character_rank_in_language
|
||||
|
||||
if character_rank_in_language == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if after_len == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
character_rank_in_language > 0
|
||||
and before_match_count / character_rank_in_language >= 0.4
|
||||
) or (after_len > 0 and after_match_count / after_len >= 0.4):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str, split by unicode range / alphabet.
|
||||
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
|
||||
one containing the Latin letters and the other the Hebrew ones.
|
||||
"""
|
||||
layers: dict[str, list[str]] = {}
|
||||
|
||||
# Fast path: track single-layer key to skip dict iteration for single-script text.
|
||||
single_layer_key: str | None = None
|
||||
multi_layer: bool = False
|
||||
|
||||
# Cache the last character_range and its resolved layer to avoid repeated
|
||||
# is_suspiciously_successive_range calls for consecutive same-range chars.
|
||||
prev_character_range: str | None = None
|
||||
prev_layer_target: str | None = None
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
# ASCII fast-path: a-z and A-Z are always "Basic Latin".
|
||||
# Avoids unicode_range() function call overhead for the most common case.
|
||||
character_ord: int = ord(character)
|
||||
if character_ord < 128:
|
||||
character_range: str | None = "Basic Latin"
|
||||
else:
|
||||
character_range = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
# Fast path: same range as previous character → reuse cached layer target.
|
||||
if character_range == prev_character_range:
|
||||
if prev_layer_target is not None:
|
||||
layers[prev_layer_target].append(character)
|
||||
continue
|
||||
|
||||
layer_target_range: str | None = None
|
||||
|
||||
if multi_layer:
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
elif single_layer_key is not None:
|
||||
if (
|
||||
is_suspiciously_successive_range(single_layer_key, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = single_layer_key
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = []
|
||||
if single_layer_key is None:
|
||||
single_layer_key = layer_target_range
|
||||
else:
|
||||
multi_layer = True
|
||||
|
||||
layers[layer_target_range].append(character)
|
||||
|
||||
# Cache for next iteration
|
||||
prev_character_range = character_range
|
||||
prev_layer_target = layer_target_range
|
||||
|
||||
return ["".join(chars).lower() for chars in layers.values()]
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merges results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: dict[str, list[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
||||
"""
|
||||
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
||||
of "English". This function only keeps the best match and remove the em-dash in it.
|
||||
"""
|
||||
index_results: dict[str, list[float]] = dict()
|
||||
|
||||
for result in results:
|
||||
language, ratio = result
|
||||
no_em_name: str = language.replace("—", "")
|
||||
|
||||
if no_em_name not in index_results:
|
||||
index_results[no_em_name] = []
|
||||
|
||||
index_results[no_em_name].append(ratio)
|
||||
|
||||
if any(len(index_results[e]) > 1 for e in index_results):
|
||||
filtered_results: CoherenceMatches = []
|
||||
|
||||
for language in index_results:
|
||||
filtered_results.append((language, max(index_results[language])))
|
||||
|
||||
return filtered_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
|
||||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results: list[tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = len(layer)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: list[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(
|
||||
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
||||
)
|
||||
|
|
@@ -0,0 +1,8 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .__main__ import cli_detect, query_yes_no
|
||||
|
||||
__all__ = (
|
||||
"cli_detect",
|
||||
"query_yes_no",
|
||||
)
|
||||
|
|
@@ -0,0 +1,362 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import typing
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool: # Defensive:
|
||||
"""Ask a yes/no question via input() and return the answer as a bool."""
|
||||
prompt = " [Y/n] " if default == "yes" else " [y/N] "
|
||||
|
||||
while True:
|
||||
choice = input(question + prompt).strip().lower()
|
||||
if not choice:
|
||||
return default == "yes"
|
||||
if choice in ("y", "yes"):
|
||||
return True
|
||||
if choice in ("n", "no"):
|
||||
return False
|
||||
print("Please respond with 'y' or 'n'.")
|
||||
|
||||
|
||||
class FileType:
|
||||
"""Factory for creating file object types
|
||||
|
||||
Instances of FileType are typically passed as type= arguments to the
|
||||
ArgumentParser add_argument() method.
|
||||
|
||||
Keyword Arguments:
|
||||
- mode -- A string indicating how the file is to be opened. Accepts the
|
||||
same values as the builtin open() function.
|
||||
- bufsize -- The file's desired buffer size. Accepts the same values as
|
||||
the builtin open() function.
|
||||
- encoding -- The file's encoding. Accepts the same values as the
|
||||
builtin open() function.
|
||||
- errors -- A string indicating how encoding and decoding errors are to
|
||||
be handled. Accepts the same value as the builtin open() function.
|
||||
|
||||
Backported from CPython 3.12
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "r",
|
||||
bufsize: int = -1,
|
||||
encoding: str | None = None,
|
||||
errors: str | None = None,
|
||||
):
|
||||
self._mode = mode
|
||||
self._bufsize = bufsize
|
||||
self._encoding = encoding
|
||||
self._errors = errors
|
||||
|
||||
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
|
||||
# the special argument "-" means sys.std{in,out}
|
||||
if string == "-":
|
||||
if "r" in self._mode:
|
||||
return sys.stdin.buffer if "b" in self._mode else sys.stdin
|
||||
elif any(c in self._mode for c in "wax"):
|
||||
return sys.stdout.buffer if "b" in self._mode else sys.stdout
|
||||
else:
|
||||
msg = f'argument "-" with mode {self._mode}'
|
||||
raise ValueError(msg)
|
||||
|
||||
# all other arguments are used as file names
|
||||
try:
|
||||
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
|
||||
except OSError as e:
|
||||
message = f"can't open '{string}': {e}"
|
||||
raise argparse.ArgumentTypeError(message)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
args = self._mode, self._bufsize
|
||||
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
|
||||
args_str = ", ".join(
|
||||
[repr(arg) for arg in args if arg != -1]
|
||||
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
|
||||
)
|
||||
return f"{type(self).__name__}({args_str})"
|
||||
|
||||
|
||||
def cli_detect(argv: list[str] | None = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else signals trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--no-preemptive",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="no_preemptive",
|
||||
help="Disable looking at a charset declaration to hint the detector.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
||||
__version__,
|
||||
python_version(),
|
||||
unidata_version,
|
||||
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
matches = from_fp(
|
||||
my_file,
|
||||
threshold=args.threshold,
|
||||
explain=args.verbose,
|
||||
preemptive_behaviour=args.no_preemptive is False,
|
||||
)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
(
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else ""
|
||||
),
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
cli_result = CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
x_.append(cli_result)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
dir_path = dirname(realpath(my_file.name))
|
||||
file_name = basename(realpath(my_file.name))
|
||||
|
||||
o_: list[str] = file_name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
cli_result.unicode_path = join(dir_path, ".".join(o_))
|
||||
|
||||
with open(cli_result.unicode_path, "wb") as fp:
|
||||
fp.write(best_guess.output())
|
||||
except OSError as e: # Defensive:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__": # Defensive:
|
||||
cli_detect()
|
||||
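A minimal sketch of driving cli_detect() above programmatically instead of through the installed console script (typically exposed as `normalizer`); "sample.txt" is a placeholder path and must point to an existing readable file, otherwise argparse rejects the argument.

from charset_normalizer.cli import cli_detect

# Roughly equivalent to running `normalizer --minimal sample.txt` from a shell:
# prints only the detected charset name and returns 0 when detection succeeded.
exit_code = cli_detect(["--minimal", "sample.txt"])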
Binary file not shown.
Binary file not shown.
2050
.venv/lib/python3.10/site-packages/charset_normalizer/constant.py
Normal file
File diff suppressed because it is too large
|
|
@@ -0,0 +1,79 @@
|
|||
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE

if TYPE_CHECKING:
    from typing import TypedDict

    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None


def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    """
    if len(kwargs):
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # automatically lower confidence
    # on small bytes samples.
    # https://github.com/jawah/charset_normalizer/issues/391
    if (
        confidence is not None
        and confidence >= 0.9
        and encoding
        not in {
            "utf_8",
            "ascii",
        }
        and r.bom is False  # type: ignore[union-attr]
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
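A minimal usage sketch for the legacy detect() shim above; the sample bytes are illustrative and the printed values depend on the detection heuristics, so they are not guaranteed output.

from charset_normalizer import detect

result = detect("Où est la bibliothèque ?".encode("cp1252"))
# A chardet-style dict with "encoding", "language" and "confidence" keys.
print(result["encoding"], result["language"], result["confidence"])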
Binary file not shown.
936
.venv/lib/python3.10/site-packages/charset_normalizer/md.py
Normal file
|
|
@@ -0,0 +1,936 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
from logging import getLogger
|
||||
|
||||
if sys.version_info >= (3, 8):
|
||||
from typing import final
|
||||
else:
|
||||
try:
|
||||
from typing_extensions import final
|
||||
except ImportError:
|
||||
|
||||
def final(cls): # type: ignore[misc,no-untyped-def]
|
||||
return cls
|
||||
|
||||
|
||||
from .constant import (
|
||||
COMMON_CJK_CHARACTERS,
|
||||
COMMON_SAFE_ASCII_CHARACTERS,
|
||||
TRACE,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
_ACCENTUATED,
|
||||
_ARABIC,
|
||||
_ARABIC_ISOLATED_FORM,
|
||||
_CJK,
|
||||
_HANGUL,
|
||||
_HIRAGANA,
|
||||
_KATAKANA,
|
||||
_LATIN,
|
||||
_THAI,
|
||||
)
|
||||
from .utils import (
|
||||
_character_flags,
|
||||
is_emoticon,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
# Combined bitmask for CJK/Hangul/Katakana/Hiragana/Thai glyph detection.
|
||||
_GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI
|
||||
|
||||
|
||||
@final
|
||||
class CharInfo:
|
||||
"""Pre-computed character properties shared across all detectors.
|
||||
|
||||
Instantiated once and reused via :meth:`update` on every character
|
||||
in the hot loop so that redundant calls to str methods
|
||||
(``isalpha``, ``isupper``, …) and cached utility functions
|
||||
(``_character_flags``, ``is_punctuation``, …) are avoided when
|
||||
several plugins need the same information.
|
||||
"""
|
||||
|
||||
__slots__ = (
|
||||
"character",
|
||||
"printable",
|
||||
"alpha",
|
||||
"upper",
|
||||
"lower",
|
||||
"space",
|
||||
"digit",
|
||||
"is_ascii",
|
||||
"case_variable",
|
||||
"flags",
|
||||
"accentuated",
|
||||
"latin",
|
||||
"is_cjk",
|
||||
"is_arabic",
|
||||
"is_glyph",
|
||||
"punct",
|
||||
"sym",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.character: str = ""
|
||||
self.printable: bool = False
|
||||
self.alpha: bool = False
|
||||
self.upper: bool = False
|
||||
self.lower: bool = False
|
||||
self.space: bool = False
|
||||
self.digit: bool = False
|
||||
self.is_ascii: bool = False
|
||||
self.case_variable: bool = False
|
||||
self.flags: int = 0
|
||||
self.accentuated: bool = False
|
||||
self.latin: bool = False
|
||||
self.is_cjk: bool = False
|
||||
self.is_arabic: bool = False
|
||||
self.is_glyph: bool = False
|
||||
self.punct: bool = False
|
||||
self.sym: bool = False
|
||||
|
||||
def update(self, character: str) -> None:
|
||||
"""Update all properties for *character* (called once per character)."""
|
||||
self.character = character
|
||||
|
||||
# ASCII fast-path: for characters with ord < 128, we can skip
|
||||
# _character_flags() entirely and derive most properties from ord.
|
||||
o: int = ord(character)
|
||||
if o < 128:
|
||||
self.is_ascii = True
|
||||
self.accentuated = False
|
||||
self.is_cjk = False
|
||||
self.is_arabic = False
|
||||
self.is_glyph = False
|
||||
# ASCII alpha: a-z (97-122) or A-Z (65-90)
|
||||
if 65 <= o <= 90:
|
||||
# Uppercase ASCII letter
|
||||
self.alpha = True
|
||||
self.upper = True
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.printable = True
|
||||
self.case_variable = True
|
||||
self.flags = _LATIN
|
||||
self.latin = True
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif 97 <= o <= 122:
|
||||
# Lowercase ASCII letter
|
||||
self.alpha = True
|
||||
self.upper = False
|
||||
self.lower = True
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.printable = True
|
||||
self.case_variable = True
|
||||
self.flags = _LATIN
|
||||
self.latin = True
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif 48 <= o <= 57:
|
||||
# ASCII digit 0-9
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = True
|
||||
self.printable = True
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
elif o == 32 or (9 <= o <= 13):
|
||||
# Space, tab, newline, etc.
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = True
|
||||
self.digit = False
|
||||
self.printable = o == 32
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = False
|
||||
self.sym = False
|
||||
else:
|
||||
# Other ASCII (punctuation, symbols, control chars)
|
||||
self.printable = character.isprintable()
|
||||
self.alpha = False
|
||||
self.upper = False
|
||||
self.lower = False
|
||||
self.space = False
|
||||
self.digit = False
|
||||
self.case_variable = False
|
||||
self.flags = 0
|
||||
self.latin = False
|
||||
self.punct = is_punctuation(character) if self.printable else False
|
||||
self.sym = is_symbol(character) if self.printable else False
|
||||
else:
|
||||
# Non-ASCII path
|
||||
self.is_ascii = False
|
||||
self.printable = character.isprintable()
|
||||
self.alpha = character.isalpha()
|
||||
self.upper = character.isupper()
|
||||
self.lower = character.islower()
|
||||
self.space = character.isspace()
|
||||
self.digit = character.isdigit()
|
||||
self.case_variable = self.lower != self.upper
|
||||
|
||||
# Flag-based classification (single unicodedata.name() call, lru-cached)
|
||||
flags: int
|
||||
if self.alpha:
|
||||
flags = _character_flags(character)
|
||||
else:
|
||||
flags = 0
|
||||
self.flags = flags
|
||||
self.accentuated = bool(flags & _ACCENTUATED)
|
||||
self.latin = bool(flags & _LATIN)
|
||||
self.is_cjk = bool(flags & _CJK)
|
||||
self.is_arabic = bool(flags & _ARABIC)
|
||||
self.is_glyph = bool(flags & _GLYPH_MASK)
|
||||
|
||||
# Eagerly compute punct and sym (avoids property dispatch overhead
|
||||
# on 300K+ accesses in the hot loop).
|
||||
self.punct = is_punctuation(character) if self.printable else False
|
||||
self.sym = is_symbol(character) if self.printable else False
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
__slots__ = ()
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""
|
||||
The main routine to be executed upon character.
|
||||
Insert the logic by which the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # Defensive:
|
||||
|
||||
def reset(self) -> None: # Defensive:
|
||||
"""
|
||||
Permit to reset the plugin to the initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
|
||||
Must NOT be lower than 0.; No restriction gt 0.
|
||||
"""
|
||||
raise NotImplementedError # Defensive:
|
||||
|
||||
|
||||
@final
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_punctuation_count",
|
||||
"_symbol_count",
|
||||
"_character_count",
|
||||
"_last_printable_char",
|
||||
"_frenzy_symbol_in_word",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: str | None = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if info.punct:
|
||||
self._punctuation_count += 1
|
||||
elif not info.digit and info.sym and not is_emoticon(character):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_character_count", "_accentuated_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.accentuated:
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_unprintable_count", "_character_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
if (
|
||||
not info.space
|
||||
and not info.printable
|
||||
and character != "\x1a"
|
||||
and character != "\ufeff"
|
||||
):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0: # Defensive:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_successive_count",
|
||||
"_character_count",
|
||||
"_last_latin_character",
|
||||
"_last_was_accentuated",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: str | None = None
|
||||
self._last_was_accentuated: bool = False
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and info.accentuated
|
||||
and self._last_was_accentuated
|
||||
):
|
||||
if info.upper and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
self._last_was_accentuated = info.accentuated
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
self._last_was_accentuated = False
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_suspicious_successive_range_count",
|
||||
"_character_count",
|
||||
"_last_printable_seen",
|
||||
"_last_printable_range",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: str | None = None
|
||||
self._last_printable_range: str | None = None
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS:
|
||||
self._last_printable_seen = None
|
||||
self._last_printable_range = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
self._last_printable_range = unicode_range(character)
|
||||
return
|
||||
|
||||
unicode_range_a: str | None = self._last_printable_range
|
||||
unicode_range_b: str | None = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
self._last_printable_range = unicode_range_b
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
self._last_printable_range = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count <= 13:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
@final
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_word_count",
|
||||
"_bad_word_count",
|
||||
"_foreign_long_count",
|
||||
"_is_current_word_bad",
|
||||
"_foreign_long_watch",
|
||||
"_character_count",
|
||||
"_bad_character_count",
|
||||
"_buffer_length",
|
||||
"_buffer_last_char",
|
||||
"_buffer_last_char_accentuated",
|
||||
"_buffer_accent_count",
|
||||
"_buffer_glyph_count",
|
||||
"_buffer_upper_count",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer_length: int = 0
|
||||
self._buffer_last_char: str | None = None
|
||||
self._buffer_last_char_accentuated: bool = False
|
||||
self._buffer_accent_count: int = 0
|
||||
self._buffer_glyph_count: int = 0
|
||||
self._buffer_upper_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
if info.alpha:
|
||||
self._buffer_length += 1
|
||||
self._buffer_last_char = character
|
||||
|
||||
if info.upper:
|
||||
self._buffer_upper_count += 1
|
||||
|
||||
self._buffer_last_char_accentuated = info.accentuated
|
||||
|
||||
if info.accentuated:
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
not self._foreign_long_watch
|
||||
and (not info.latin or info.accentuated)
|
||||
and not info.is_glyph
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
if info.is_glyph:
|
||||
self._buffer_glyph_count += 1
|
||||
return
|
||||
if not self._buffer_length:
|
||||
return
|
||||
if info.space or info.punct or is_separator(character):
|
||||
self._word_count += 1
|
||||
buffer_length: int = self._buffer_length
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length >= 0.5:
|
||||
self._is_current_word_bad = True
|
||||
elif (
|
||||
self._buffer_last_char_accentuated
|
||||
and self._buffer_last_char.isupper() # type: ignore[union-attr]
|
||||
and self._buffer_upper_count != buffer_length
|
||||
):
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
elif self._buffer_glyph_count == 1:
|
||||
self._is_current_word_bad = True
|
||||
self._foreign_long_count += 1
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
probable_camel_cased: bool = (
|
||||
self._buffer_upper_count > 0
|
||||
and self._buffer_upper_count / buffer_length <= 0.3
|
||||
)
|
||||
|
||||
if not probable_camel_cased:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += buffer_length
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer_length = 0
|
||||
self._buffer_last_char = None
|
||||
self._buffer_last_char_accentuated = False
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
self._buffer_upper_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and not info.digit
|
||||
and info.sym
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer_length += 1
|
||||
self._buffer_last_char = character
|
||||
self._buffer_last_char_accentuated = False
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._buffer_length = 0
|
||||
self._buffer_last_char = None
|
||||
self._buffer_last_char_accentuated = False
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
self._buffer_upper_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class CjkUncommonPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
Detect messy CJK text that probably means nothing.
|
||||
"""
|
||||
|
||||
__slots__ = ("_character_count", "_uncommon_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._uncommon_count: int = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if character not in COMMON_CJK_CHARACTERS:
|
||||
self._uncommon_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._uncommon_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
uncommon_form_usage: float = self._uncommon_count / self._character_count
|
||||
|
||||
# we can be pretty sure it's garbage when uncommon characters are widely
|
||||
# used. otherwise it could just be traditional chinese for example.
|
||||
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
|
||||
|
||||
|
||||
@final
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
__slots__ = (
|
||||
"_buf",
|
||||
"_character_count_since_last_sep",
|
||||
"_successive_upper_lower_count",
|
||||
"_successive_upper_lower_count_final",
|
||||
"_character_count",
|
||||
"_last_alpha_seen",
|
||||
"_last_alpha_seen_upper",
|
||||
"_last_alpha_seen_lower",
|
||||
"_current_ascii_only",
|
||||
)
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: str | None = None
|
||||
self._last_alpha_seen_upper: bool = False
|
||||
self._last_alpha_seen_lower: bool = False
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
is_concerned: bool = info.alpha and info.case_variable
|
||||
chunk_sep: bool = not is_concerned
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and not info.digit
|
||||
and not self._current_ascii_only
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only and not info.is_ascii:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (info.upper and self._last_alpha_seen_lower) or (
|
||||
info.lower and self._last_alpha_seen_upper
|
||||
):
|
||||
if self._buf:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
self._last_alpha_seen_upper = info.upper
|
||||
self._last_alpha_seen_lower = info.lower
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._last_alpha_seen_upper = False
|
||||
self._last_alpha_seen_lower = False
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0: # Defensive:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@final
|
||||
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
||||
__slots__ = ("_character_count", "_isolated_form_count")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._isolated_form_count: int = 0
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._isolated_form_count = 0
|
||||
|
||||
def feed_info(self, character: str, info: CharInfo) -> None:
|
||||
"""Optimized feed using pre-computed character info."""
|
||||
self._character_count += 1
|
||||
|
||||
if info.flags & _ARABIC_ISOLATED_FORM:
|
||||
self._isolated_form_count += 1
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
||||
|
||||
return isolated_form_usage
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: str | None, unicode_range_b: str | None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode ranges seen next to each other can be considered as suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = (
|
||||
unicode_range_a.split(" "),
|
||||
unicode_range_b.split(" "),
|
||||
)
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
|
||||
"""
|
||||
|
||||
seq_len: int = len(decoded_sequence)
|
||||
|
||||
if seq_len < 511:
|
||||
step: int = 32
|
||||
elif seq_len < 1024:
|
||||
step = 64
|
||||
else:
|
||||
step = 128
|
||||
|
||||
# Create each detector as a named local variable (unrolled from the generic loop).
|
||||
# This eliminates per-character iteration over the detector list and
|
||||
# per-character eligible() virtual dispatch, while keeping every plugin class
|
||||
# intact and fully readable.
|
||||
d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin()
|
||||
d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin()
|
||||
d_up: UnprintablePlugin = UnprintablePlugin()
|
||||
d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin()
|
||||
d_sr: SuspiciousRange = SuspiciousRange()
|
||||
d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin()
|
||||
d_cu: CjkUncommonPlugin = CjkUncommonPlugin()
|
||||
d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin()
|
||||
d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin()
|
||||
|
||||
# Local references for feed_info methods called in the hot loop.
|
||||
d_sp_feed = d_sp.feed_info
|
||||
d_ta_feed = d_ta.feed_info
|
||||
d_up_feed = d_up.feed_info
|
||||
d_sda_feed = d_sda.feed_info
|
||||
d_sr_feed = d_sr.feed_info
|
||||
d_sw_feed = d_sw.feed_info
|
||||
d_cu_feed = d_cu.feed_info
|
||||
d_au_feed = d_au.feed_info
|
||||
d_ai_feed = d_ai.feed_info
|
||||
|
||||
# Single reusable CharInfo object (avoids per-character allocation).
|
||||
info: CharInfo = CharInfo()
|
||||
info_update = info.update
|
||||
|
||||
mean_mess_ratio: float
|
||||
|
||||
for block_start in range(0, seq_len, step):
|
||||
for character in decoded_sequence[block_start : block_start + step]:
|
||||
# Pre-compute all character properties once (shared across all plugins).
|
||||
info_update(character)
|
||||
|
||||
# Detectors with eligible() == always True
|
||||
d_up_feed(character, info)
|
||||
d_sw_feed(character, info)
|
||||
d_au_feed(character, info)
|
||||
|
||||
# Detectors with eligible() == isprintable
|
||||
if info.printable:
|
||||
d_sp_feed(character, info)
|
||||
d_sr_feed(character, info)
|
||||
|
||||
# Detectors with eligible() == isalpha
|
||||
if info.alpha:
|
||||
d_ta_feed(character, info)
|
||||
# SuspiciousDuplicateAccent: isalpha() and is_latin()
|
||||
if info.latin:
|
||||
d_sda_feed(character, info)
|
||||
# CjkUncommon: is_cjk()
|
||||
if info.is_cjk:
|
||||
d_cu_feed(character, info)
|
||||
# ArabicIsolatedForm: is_arabic()
|
||||
if info.is_arabic:
|
||||
d_ai_feed(character, info)
|
||||
|
||||
mean_mess_ratio = (
|
||||
d_sp.ratio
|
||||
+ d_ta.ratio
|
||||
+ d_up.ratio
|
||||
+ d_sda.ratio
|
||||
+ d_sr.ratio
|
||||
+ d_sw.ratio
|
||||
+ d_cu.ratio
|
||||
+ d_au.ratio
|
||||
+ d_ai.ratio
|
||||
)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
else:
|
||||
# Flush last word buffer in SuperWeirdWordPlugin via trailing newline.
|
||||
info_update("\n")
|
||||
d_sw_feed("\n", info)
|
||||
d_au_feed("\n", info)
|
||||
d_up_feed("\n", info)
|
||||
|
||||
mean_mess_ratio = (
|
||||
d_sp.ratio
|
||||
+ d_ta.ratio
|
||||
+ d_up.ratio
|
||||
+ d_sda.ratio
|
||||
+ d_sr.ratio
|
||||
+ d_sw.ratio
|
||||
+ d_cu.ratio
|
||||
+ d_au.ratio
|
||||
+ d_ai.ratio
|
||||
)
|
||||
|
||||
if debug: # Defensive:
|
||||
logger = getLogger("charset_normalizer")
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Mess-detector extended-analysis start. "
|
||||
f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} "
|
||||
f"maximum_threshold={maximum_threshold}",
|
||||
)
|
||||
|
||||
if seq_len > 16:
|
||||
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
||||
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
||||
|
||||
for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]:
|
||||
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
||||
|
||||
return round(mean_mess_ratio, 3)
|
||||
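A small illustrative probe of mess_ratio() defined above; the exact scores depend on the heuristics, so only the relative ordering of the two samples is expected, not any particular number.

from charset_normalizer.md import mess_ratio

clean_score = mess_ratio("A plain, readable English sentence with nothing unusual in it.")
noisy_score = mess_ratio("ÃƒÂ©Ã‚Â§Ã¢â‚¬ÂÃƒÂ¯Ã‚Â¿")  # typical mojibake from a wrong decode
print(clean_score, noisy_score)  # the mojibake sample should score noticeably higher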
359
.venv/lib/python3.10/site-packages/charset_normalizer/models.py
Normal file
|
|
@@ -0,0 +1,359 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from encodings.aliases import aliases
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Iterator, List, Tuple
|
||||
|
||||
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes | bytearray,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: CoherenceMatches,
|
||||
decoded_payload: str | None = None,
|
||||
preemptive_declaration: str | None = None,
|
||||
):
|
||||
self._payload: bytes | bytearray = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: list[str] | None = None
|
||||
|
||||
self._leaves: list[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: bytes | None = None
|
||||
self._output_encoding: str | None = None
|
||||
|
||||
self._string: str | None = decoded_payload
|
||||
|
||||
self._preemptive_declaration: str | None = preemptive_declaration
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
if isinstance(other, str):
|
||||
return iana_name(other) == self.encoding
|
||||
return False
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Below 0.5% difference --> Use Coherence
|
||||
if chaos_difference < 0.005 and coherence_difference > 0.02:
|
||||
return self.coherence > other.coherence
|
||||
elif chaos_difference < 0.005 and coherence_difference <= 0.02:
|
||||
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
||||
# preserve RAM usage!
|
||||
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
||||
return self.chaos < other.chaos
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - (len(str(self)) / len(self.raw))
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"
|
||||
|
||||
def add_submatch(self, other: CharsetMatch) -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> list[str]:
|
||||
"""
|
||||
Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as: list[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> list[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# Its either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes | bytearray:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> list[CharsetMatch]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> list[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> list[str]:
|
||||
"""
|
||||
The complete list of encodings that output the exact SAME str result and therefore could be the originating
|
||||
encoding.
|
||||
This list does include the encoding available in property 'encoding'.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
||||
Any errors will be simply ignored by the encoder NOT replaced.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
decoded_string = str(self)
|
||||
if (
|
||||
self._preemptive_declaration is not None
|
||||
and self._preemptive_declaration.lower()
|
||||
not in ["utf-8", "utf8", "utf_8"]
|
||||
):
|
||||
patched_header = sub(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
||||
m.groups()[0],
|
||||
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
||||
),
|
||||
decoded_string[:8192],
|
||||
count=1,
|
||||
)
|
||||
|
||||
decoded_string = patched_header + decoded_string[8192:]
|
||||
|
||||
self._output_payload = decoded_string.encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> int:
|
||||
"""
|
||||
Retrieve a hash fingerprint of the decoded payload, used for deduplication.
|
||||
"""
|
||||
return hash(str(self))
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
||||
Acts like a list(iterable) but does not implement all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: list[CharsetMatch] | None = None):
|
||||
self._results: list[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: int | str) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. Will be inserted accordingly to preserve sort.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) < TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: str | None,
|
||||
encoding_aliases: list[str],
|
||||
alternative_encodings: list[str],
|
||||
language: str,
|
||||
alphabets: list[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: str | None,
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: str | None = unicode_path
|
||||
self.encoding: str | None = encoding
|
||||
self.encoding_aliases: list[str] = encoding_aliases
|
||||
self.alternative_encodings: list[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: list[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
||||
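A short sketch of how the CharsetMatch / CharsetMatches model above is normally consumed; the sample text is illustrative.

from charset_normalizer import from_bytes

matches = from_bytes("Договор подписан вчера вечером.".encode("cp1251"))
best = matches.best()  # a CharsetMatch, or None when nothing plausible was found
if best is not None:
    print(best.encoding, best.language, best.percent_chaos, best.percent_coherence)
    utf8_payload = best.output()  # re-encode the best match as UTF-8 bytes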
422
.venv/lib/python3.10/site-packages/charset_normalizer/utils.py
Normal file
|
|
@@ -0,0 +1,422 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
import unicodedata
|
||||
from bisect import bisect_right
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator
|
||||
|
||||
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
COMMON_CJK_CHARACTERS,
|
||||
_LATIN,
|
||||
_CJK,
|
||||
_HANGUL,
|
||||
_KATAKANA,
|
||||
_HIRAGANA,
|
||||
_THAI,
|
||||
_ARABIC,
|
||||
_ARABIC_ISOLATED_FORM,
|
||||
_ACCENT_KEYWORDS,
|
||||
_ACCENTUATED,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def _character_flags(character: str) -> int:
|
||||
"""Compute all name-based classification flags with a single unicodedata.name() call."""
|
||||
try:
|
||||
desc: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
flags: int = 0
|
||||
|
||||
if "LATIN" in desc:
|
||||
flags |= _LATIN
|
||||
if "CJK" in desc:
|
||||
flags |= _CJK
|
||||
if "HANGUL" in desc:
|
||||
flags |= _HANGUL
|
||||
if "KATAKANA" in desc:
|
||||
flags |= _KATAKANA
|
||||
if "HIRAGANA" in desc:
|
||||
flags |= _HIRAGANA
|
||||
if "THAI" in desc:
|
||||
flags |= _THAI
|
||||
if "ARABIC" in desc:
|
||||
flags |= _ARABIC
|
||||
if "ISOLATED FORM" in desc:
|
||||
flags |= _ARABIC_ISOLATED_FORM
|
||||
|
||||
for kw in _ACCENT_KEYWORDS:
|
||||
if kw in desc:
|
||||
flags |= _ACCENTUATED
|
||||
break
|
||||
|
||||
return flags
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
return bool(_character_flags(character) & _ACCENTUATED)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: list[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
# Pre-built sorted lookup table for O(log n) binary search in unicode_range().
|
||||
# Each entry is (range_start, range_end_exclusive, range_name).
|
||||
_UNICODE_RANGES_SORTED: list[tuple[int, int, str]] = sorted(
|
||||
(ord_range.start, ord_range.stop, name)
|
||||
for name, ord_range in UNICODE_RANGES_COMBINED.items()
|
||||
)
|
||||
_UNICODE_RANGE_STARTS: list[int] = [e[0] for e in _UNICODE_RANGES_SORTED]
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> str | None:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
# Binary search: find the rightmost range whose start <= character_ord
|
||||
idx = bisect_right(_UNICODE_RANGE_STARTS, character_ord) - 1
|
||||
if idx >= 0:
|
||||
start, stop, name = _UNICODE_RANGES_SORTED[idx]
|
||||
if character_ord < stop:
|
||||
return name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    return bool(_character_flags(character) & _LATIN)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    return bool(_character_flags(character) & _CJK)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    return bool(_character_flags(character) & _HIRAGANA)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    return bool(_character_flags(character) & _KATAKANA)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    return bool(_character_flags(character) & _HANGUL)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    return bool(_character_flags(character) & _THAI)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    return bool(_character_flags(character) & _ARABIC)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    return bool(_character_flags(character) & _ARABIC_ISOLATED_FORM)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
    return character not in COMMON_CJK_CHARACTERS


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1a"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python,
        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )


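# Illustrative behaviour of a few of the cached classifiers above (a sketch;
# results follow CPython's unicodedata categories):
#
#   >>> is_punctuation(",")      # category 'Po'
#   True
#   >>> is_separator(" ")        # whitespace short-circuits
#   True
#   >>> is_case_variable("a")    # islower() != isupper()
#   True
#   >>> is_unprintable("\x00")   # not printable, not whitespace, not \x1a / \ufeff
#   True

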
def any_specified_encoding(
    sequence: bytes | bytearray, search_zone: int = 8192
) -> str | None:
    """
    Extract any declared encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, (bytes, bytearray)):
        raise TypeError

    seq_len: int = len(sequence)

    results: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None


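# Illustrative usage of any_specified_encoding() (a sketch; the declared label is
# resolved through Python's encodings.aliases table, so the returned name is the
# normalized codec name rather than the literal declaration):
#
#   >>> any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
#   'latin_1'
#   >>> any_specified_encoding(b'plain bytes with no declaration') is None
#   True

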
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


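# Illustrative usage of is_multi_byte_encoding() (a sketch; anything outside the
# hard-coded UTF set is resolved by importing its codec and inspecting the decoder class):
#
#   >>> is_multi_byte_encoding("utf_8")
#   True
#   >>> is_multi_byte_encoding("big5")     # CJK codec, MultibyteIncrementalDecoder subclass
#   True
#   >>> is_multi_byte_encoding("latin_1")  # single-byte codec
#   False

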
def identify_sig_or_bom(sequence: bytes | bytearray) -> tuple[str | None, bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


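# Illustrative usage of the BOM helpers (a sketch; the byte marks are taken from
# ENCODING_MARKS in constant.py):
#
#   >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")   # UTF-8 signature
#   ('utf_8', b'\xef\xbb\xbf')
#   >>> identify_sig_or_bom(b"hello")
#   (None, b'')
#   >>> should_strip_sig_or_bom("utf_8")            # utf_16 / utf_32 keep their BOM
#   True

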
def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (not the official IANA name)."""
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")

    return cp_name


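# Illustrative usage of iana_name() (a sketch; resolution goes through Python's
# encodings.aliases mapping):
#
#   >>> iana_name("UTF-8")
#   'utf_8'
#   >>> iana_name("latin-1")
#   'latin_1'
#   >>> iana_name("not-a-codec", strict=False)
#   'not_a_codec'

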
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(256):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 256


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


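# Illustrative usage of the code-page similarity helpers (a sketch; exact ratios
# depend on the codecs shipped with the interpreter, so no precise value is claimed):
#
#   >>> cp_similarity("cp1252", "latin_1") > 0.8    # single-byte pages sharing most of 0x00-0xFF
#   True
#   >>> cp_similarity("utf_8", "latin_1")           # multi-byte input short-circuits
#   0.0
#   >>> is_cp_similar("cp1252", "latin_1")          # pre-computed lookup in IANA_SUPPORTED_SIMILAR
#   True

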
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes | bytearray,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk


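# Illustrative usage of cut_sequence_chunks() (a sketch; the caller in api.py supplies
# the offsets, chunk size, and BOM bookkeeping, so the values below are made up for
# demonstration only):
#
#   >>> payload = "hello world " * 10
#   >>> data = payload.encode("utf_8")
#   >>> chunks = cut_sequence_chunks(
#   ...     data, "utf_8", range(0, len(data), 32), 32,
#   ...     bom_or_sig_available=False, strip_sig_or_bom=False,
#   ...     sig_payload=b"", is_multi_byte_decoder=True,
#   ...     decoded_payload=payload,
#   ... )
#   >>> all(chunk in payload for chunk in chunks)
#   True
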
@ -0,0 +1,8 @@
"""
Expose version
"""

from __future__ import annotations

__version__ = "3.4.6"
VERSION = __version__.split(".")