from __future__ import annotations import sys from functools import lru_cache from logging import getLogger if sys.version_info >= (3, 8): from typing import final else: try: from typing_extensions import final except ImportError: def final(cls): # type: ignore[misc,no-untyped-def] return cls from .constant import ( COMMON_CJK_CHARACTERS, COMMON_SAFE_ASCII_CHARACTERS, TRACE, UNICODE_SECONDARY_RANGE_KEYWORD, _ACCENTUATED, _ARABIC, _ARABIC_ISOLATED_FORM, _CJK, _HANGUL, _HIRAGANA, _KATAKANA, _LATIN, _THAI, ) from .utils import ( _character_flags, is_emoticon, is_punctuation, is_separator, is_symbol, remove_accent, unicode_range, ) # Combined bitmask for CJK/Hangul/Katakana/Hiragana/Thai glyph detection. _GLYPH_MASK: int = _CJK | _HANGUL | _KATAKANA | _HIRAGANA | _THAI @final class CharInfo: """Pre-computed character properties shared across all detectors. Instantiated once and reused via :meth:`update` on every character in the hot loop so that redundant calls to str methods (``isalpha``, ``isupper``, …) and cached utility functions (``_character_flags``, ``is_punctuation``, …) are avoided when several plugins need the same information. """ __slots__ = ( "character", "printable", "alpha", "upper", "lower", "space", "digit", "is_ascii", "case_variable", "flags", "accentuated", "latin", "is_cjk", "is_arabic", "is_glyph", "punct", "sym", ) def __init__(self) -> None: self.character: str = "" self.printable: bool = False self.alpha: bool = False self.upper: bool = False self.lower: bool = False self.space: bool = False self.digit: bool = False self.is_ascii: bool = False self.case_variable: bool = False self.flags: int = 0 self.accentuated: bool = False self.latin: bool = False self.is_cjk: bool = False self.is_arabic: bool = False self.is_glyph: bool = False self.punct: bool = False self.sym: bool = False def update(self, character: str) -> None: """Update all properties for *character* (called once per character).""" self.character = character # ASCII fast-path: for characters with ord < 128, we can skip # _character_flags() entirely and derive most properties from ord. o: int = ord(character) if o < 128: self.is_ascii = True self.accentuated = False self.is_cjk = False self.is_arabic = False self.is_glyph = False # ASCII alpha: a-z (97-122) or A-Z (65-90) if 65 <= o <= 90: # Uppercase ASCII letter self.alpha = True self.upper = True self.lower = False self.space = False self.digit = False self.printable = True self.case_variable = True self.flags = _LATIN self.latin = True self.punct = False self.sym = False elif 97 <= o <= 122: # Lowercase ASCII letter self.alpha = True self.upper = False self.lower = True self.space = False self.digit = False self.printable = True self.case_variable = True self.flags = _LATIN self.latin = True self.punct = False self.sym = False elif 48 <= o <= 57: # ASCII digit 0-9 self.alpha = False self.upper = False self.lower = False self.space = False self.digit = True self.printable = True self.case_variable = False self.flags = 0 self.latin = False self.punct = False self.sym = False elif o == 32 or (9 <= o <= 13): # Space, tab, newline, etc. self.alpha = False self.upper = False self.lower = False self.space = True self.digit = False self.printable = o == 32 self.case_variable = False self.flags = 0 self.latin = False self.punct = False self.sym = False else: # Other ASCII (punctuation, symbols, control chars) self.printable = character.isprintable() self.alpha = False self.upper = False self.lower = False self.space = False self.digit = False self.case_variable = False self.flags = 0 self.latin = False self.punct = is_punctuation(character) if self.printable else False self.sym = is_symbol(character) if self.printable else False else: # Non-ASCII path self.is_ascii = False self.printable = character.isprintable() self.alpha = character.isalpha() self.upper = character.isupper() self.lower = character.islower() self.space = character.isspace() self.digit = character.isdigit() self.case_variable = self.lower != self.upper # Flag-based classification (single unicodedata.name() call, lru-cached) flags: int if self.alpha: flags = _character_flags(character) else: flags = 0 self.flags = flags self.accentuated = bool(flags & _ACCENTUATED) self.latin = bool(flags & _LATIN) self.is_cjk = bool(flags & _CJK) self.is_arabic = bool(flags & _ARABIC) self.is_glyph = bool(flags & _GLYPH_MASK) # Eagerly compute punct and sym (avoids property dispatch overhead # on 300K+ accesses in the hot loop). self.punct = is_punctuation(character) if self.printable else False self.sym = is_symbol(character) if self.printable else False class MessDetectorPlugin: """ Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. """ __slots__ = () def feed_info(self, character: str, info: CharInfo) -> None: """ The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. """ raise NotImplementedError # Defensive: def reset(self) -> None: # Defensive: """ Permit to reset the plugin to the initial state. """ raise NotImplementedError @property def ratio(self) -> float: """ Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. """ raise NotImplementedError # Defensive: @final class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): __slots__ = ( "_punctuation_count", "_symbol_count", "_character_count", "_last_printable_char", "_frenzy_symbol_in_word", ) def __init__(self) -> None: self._punctuation_count: int = 0 self._symbol_count: int = 0 self._character_count: int = 0 self._last_printable_char: str | None = None self._frenzy_symbol_in_word: bool = False def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if ( character != self._last_printable_char and character not in COMMON_SAFE_ASCII_CHARACTERS ): if info.punct: self._punctuation_count += 1 elif not info.digit and info.sym and not is_emoticon(character): self._symbol_count += 2 self._last_printable_char = character def reset(self) -> None: # Abstract self._punctuation_count = 0 self._character_count = 0 self._symbol_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0.0 ratio_of_punctuation: float = ( self._punctuation_count + self._symbol_count ) / self._character_count return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0 @final class TooManyAccentuatedPlugin(MessDetectorPlugin): __slots__ = ("_character_count", "_accentuated_count") def __init__(self) -> None: self._character_count: int = 0 self._accentuated_count: int = 0 def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if info.accentuated: self._accentuated_count += 1 def reset(self) -> None: # Abstract self._character_count = 0 self._accentuated_count = 0 @property def ratio(self) -> float: if self._character_count < 8: return 0.0 ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 @final class UnprintablePlugin(MessDetectorPlugin): __slots__ = ("_unprintable_count", "_character_count") def __init__(self) -> None: self._unprintable_count: int = 0 self._character_count: int = 0 def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" if ( not info.space and not info.printable and character != "\x1a" and character != "\ufeff" ): self._unprintable_count += 1 self._character_count += 1 def reset(self) -> None: # Abstract self._unprintable_count = 0 @property def ratio(self) -> float: if self._character_count == 0: # Defensive: return 0.0 return (self._unprintable_count * 8) / self._character_count @final class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): __slots__ = ( "_successive_count", "_character_count", "_last_latin_character", "_last_was_accentuated", ) def __init__(self) -> None: self._successive_count: int = 0 self._character_count: int = 0 self._last_latin_character: str | None = None self._last_was_accentuated: bool = False def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if ( self._last_latin_character is not None and info.accentuated and self._last_was_accentuated ): if info.upper and self._last_latin_character.isupper(): self._successive_count += 1 if remove_accent(character) == remove_accent(self._last_latin_character): self._successive_count += 1 self._last_latin_character = character self._last_was_accentuated = info.accentuated def reset(self) -> None: # Abstract self._successive_count = 0 self._character_count = 0 self._last_latin_character = None self._last_was_accentuated = False @property def ratio(self) -> float: if self._character_count == 0: return 0.0 return (self._successive_count * 2) / self._character_count @final class SuspiciousRange(MessDetectorPlugin): __slots__ = ( "_suspicious_successive_range_count", "_character_count", "_last_printable_seen", "_last_printable_range", ) def __init__(self) -> None: self._suspicious_successive_range_count: int = 0 self._character_count: int = 0 self._last_printable_seen: str | None = None self._last_printable_range: str | None = None def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if info.space or info.punct or character in COMMON_SAFE_ASCII_CHARACTERS: self._last_printable_seen = None self._last_printable_range = None return if self._last_printable_seen is None: self._last_printable_seen = character self._last_printable_range = unicode_range(character) return unicode_range_a: str | None = self._last_printable_range unicode_range_b: str | None = unicode_range(character) if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 self._last_printable_seen = character self._last_printable_range = unicode_range_b def reset(self) -> None: # Abstract self._character_count = 0 self._suspicious_successive_range_count = 0 self._last_printable_seen = None self._last_printable_range = None @property def ratio(self) -> float: if self._character_count <= 13: return 0.0 ratio_of_suspicious_range_usage: float = ( self._suspicious_successive_range_count * 2 ) / self._character_count return ratio_of_suspicious_range_usage @final class SuperWeirdWordPlugin(MessDetectorPlugin): __slots__ = ( "_word_count", "_bad_word_count", "_foreign_long_count", "_is_current_word_bad", "_foreign_long_watch", "_character_count", "_bad_character_count", "_buffer_length", "_buffer_last_char", "_buffer_last_char_accentuated", "_buffer_accent_count", "_buffer_glyph_count", "_buffer_upper_count", ) def __init__(self) -> None: self._word_count: int = 0 self._bad_word_count: int = 0 self._foreign_long_count: int = 0 self._is_current_word_bad: bool = False self._foreign_long_watch: bool = False self._character_count: int = 0 self._bad_character_count: int = 0 self._buffer_length: int = 0 self._buffer_last_char: str | None = None self._buffer_last_char_accentuated: bool = False self._buffer_accent_count: int = 0 self._buffer_glyph_count: int = 0 self._buffer_upper_count: int = 0 def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" if info.alpha: self._buffer_length += 1 self._buffer_last_char = character if info.upper: self._buffer_upper_count += 1 self._buffer_last_char_accentuated = info.accentuated if info.accentuated: self._buffer_accent_count += 1 if ( not self._foreign_long_watch and (not info.latin or info.accentuated) and not info.is_glyph ): self._foreign_long_watch = True if info.is_glyph: self._buffer_glyph_count += 1 return if not self._buffer_length: return if info.space or info.punct or is_separator(character): self._word_count += 1 buffer_length: int = self._buffer_length self._character_count += buffer_length if buffer_length >= 4: if self._buffer_accent_count / buffer_length >= 0.5: self._is_current_word_bad = True elif ( self._buffer_last_char_accentuated and self._buffer_last_char.isupper() # type: ignore[union-attr] and self._buffer_upper_count != buffer_length ): self._foreign_long_count += 1 self._is_current_word_bad = True elif self._buffer_glyph_count == 1: self._is_current_word_bad = True self._foreign_long_count += 1 if buffer_length >= 24 and self._foreign_long_watch: probable_camel_cased: bool = ( self._buffer_upper_count > 0 and self._buffer_upper_count / buffer_length <= 0.3 ) if not probable_camel_cased: self._foreign_long_count += 1 self._is_current_word_bad = True if self._is_current_word_bad: self._bad_word_count += 1 self._bad_character_count += buffer_length self._is_current_word_bad = False self._foreign_long_watch = False self._buffer_length = 0 self._buffer_last_char = None self._buffer_last_char_accentuated = False self._buffer_accent_count = 0 self._buffer_glyph_count = 0 self._buffer_upper_count = 0 elif ( character not in {"<", ">", "-", "=", "~", "|", "_"} and not info.digit and info.sym ): self._is_current_word_bad = True self._buffer_length += 1 self._buffer_last_char = character self._buffer_last_char_accentuated = False def reset(self) -> None: # Abstract self._buffer_length = 0 self._buffer_last_char = None self._buffer_last_char_accentuated = False self._is_current_word_bad = False self._foreign_long_watch = False self._bad_word_count = 0 self._word_count = 0 self._character_count = 0 self._bad_character_count = 0 self._foreign_long_count = 0 self._buffer_accent_count = 0 self._buffer_glyph_count = 0 self._buffer_upper_count = 0 @property def ratio(self) -> float: if self._word_count <= 10 and self._foreign_long_count == 0: return 0.0 return self._bad_character_count / self._character_count @final class CjkUncommonPlugin(MessDetectorPlugin): """ Detect messy CJK text that probably means nothing. """ __slots__ = ("_character_count", "_uncommon_count") def __init__(self) -> None: self._character_count: int = 0 self._uncommon_count: int = 0 def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if character not in COMMON_CJK_CHARACTERS: self._uncommon_count += 1 def reset(self) -> None: # Abstract self._character_count = 0 self._uncommon_count = 0 @property def ratio(self) -> float: if self._character_count < 8: return 0.0 uncommon_form_usage: float = self._uncommon_count / self._character_count # we can be pretty sure it's garbage when uncommon characters are widely # used. otherwise it could just be traditional chinese for example. return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0 @final class ArchaicUpperLowerPlugin(MessDetectorPlugin): __slots__ = ( "_buf", "_character_count_since_last_sep", "_successive_upper_lower_count", "_successive_upper_lower_count_final", "_character_count", "_last_alpha_seen", "_last_alpha_seen_upper", "_last_alpha_seen_lower", "_current_ascii_only", ) def __init__(self) -> None: self._buf: bool = False self._character_count_since_last_sep: int = 0 self._successive_upper_lower_count: int = 0 self._successive_upper_lower_count_final: int = 0 self._character_count: int = 0 self._last_alpha_seen: str | None = None self._last_alpha_seen_upper: bool = False self._last_alpha_seen_lower: bool = False self._current_ascii_only: bool = True def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" is_concerned: bool = info.alpha and info.case_variable chunk_sep: bool = not is_concerned if chunk_sep and self._character_count_since_last_sep > 0: if ( self._character_count_since_last_sep <= 64 and not info.digit and not self._current_ascii_only ): self._successive_upper_lower_count_final += ( self._successive_upper_lower_count ) self._successive_upper_lower_count = 0 self._character_count_since_last_sep = 0 self._last_alpha_seen = None self._buf = False self._character_count += 1 self._current_ascii_only = True return if self._current_ascii_only and not info.is_ascii: self._current_ascii_only = False if self._last_alpha_seen is not None: if (info.upper and self._last_alpha_seen_lower) or ( info.lower and self._last_alpha_seen_upper ): if self._buf: self._successive_upper_lower_count += 2 self._buf = False else: self._buf = True else: self._buf = False self._character_count += 1 self._character_count_since_last_sep += 1 self._last_alpha_seen = character self._last_alpha_seen_upper = info.upper self._last_alpha_seen_lower = info.lower def reset(self) -> None: # Abstract self._character_count = 0 self._character_count_since_last_sep = 0 self._successive_upper_lower_count = 0 self._successive_upper_lower_count_final = 0 self._last_alpha_seen = None self._last_alpha_seen_upper = False self._last_alpha_seen_lower = False self._buf = False self._current_ascii_only = True @property def ratio(self) -> float: if self._character_count == 0: # Defensive: return 0.0 return self._successive_upper_lower_count_final / self._character_count @final class ArabicIsolatedFormPlugin(MessDetectorPlugin): __slots__ = ("_character_count", "_isolated_form_count") def __init__(self) -> None: self._character_count: int = 0 self._isolated_form_count: int = 0 def reset(self) -> None: # Abstract self._character_count = 0 self._isolated_form_count = 0 def feed_info(self, character: str, info: CharInfo) -> None: """Optimized feed using pre-computed character info.""" self._character_count += 1 if info.flags & _ARABIC_ISOLATED_FORM: self._isolated_form_count += 1 @property def ratio(self) -> float: if self._character_count < 8: return 0.0 isolated_form_usage: float = self._isolated_form_count / self._character_count return isolated_form_usage @lru_cache(maxsize=1024) def is_suspiciously_successive_range( unicode_range_a: str | None, unicode_range_b: str | None ) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. """ if unicode_range_a is None or unicode_range_b is None: return True if unicode_range_a == unicode_range_b: return False if "Latin" in unicode_range_a and "Latin" in unicode_range_b: return False if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: return False # Latin characters can be accompanied with a combining diacritical mark # eg. Vietnamese. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( "Combining" in unicode_range_a or "Combining" in unicode_range_b ): return False keywords_range_a, keywords_range_b = ( unicode_range_a.split(" "), unicode_range_b.split(" "), ) for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: continue if el in keywords_range_b: return False # Japanese Exception range_a_jp_chars, range_b_jp_chars = ( unicode_range_a in ( "Hiragana", "Katakana", ), unicode_range_b in ("Hiragana", "Katakana"), ) if (range_a_jp_chars or range_b_jp_chars) and ( "CJK" in unicode_range_a or "CJK" in unicode_range_b ): return False if range_a_jp_chars and range_b_jp_chars: return False if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: return False if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": return False # Chinese/Japanese use dedicated range for punctuation and/or separators. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in ["Katakana", "Hiragana"] ): if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: return False if "Forms" in unicode_range_a or "Forms" in unicode_range_b: return False if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": return False return True @lru_cache(maxsize=2048) def mess_ratio( decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False ) -> float: """ Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ seq_len: int = len(decoded_sequence) if seq_len < 511: step: int = 32 elif seq_len < 1024: step = 64 else: step = 128 # Create each detector as a named local variable (unrolled from the generic loop). # This eliminates per-character iteration over the detector list and # per-character eligible() virtual dispatch, while keeping every plugin class # intact and fully readable. d_sp: TooManySymbolOrPunctuationPlugin = TooManySymbolOrPunctuationPlugin() d_ta: TooManyAccentuatedPlugin = TooManyAccentuatedPlugin() d_up: UnprintablePlugin = UnprintablePlugin() d_sda: SuspiciousDuplicateAccentPlugin = SuspiciousDuplicateAccentPlugin() d_sr: SuspiciousRange = SuspiciousRange() d_sw: SuperWeirdWordPlugin = SuperWeirdWordPlugin() d_cu: CjkUncommonPlugin = CjkUncommonPlugin() d_au: ArchaicUpperLowerPlugin = ArchaicUpperLowerPlugin() d_ai: ArabicIsolatedFormPlugin = ArabicIsolatedFormPlugin() # Local references for feed_info methods called in the hot loop. d_sp_feed = d_sp.feed_info d_ta_feed = d_ta.feed_info d_up_feed = d_up.feed_info d_sda_feed = d_sda.feed_info d_sr_feed = d_sr.feed_info d_sw_feed = d_sw.feed_info d_cu_feed = d_cu.feed_info d_au_feed = d_au.feed_info d_ai_feed = d_ai.feed_info # Single reusable CharInfo object (avoids per-character allocation). info: CharInfo = CharInfo() info_update = info.update mean_mess_ratio: float for block_start in range(0, seq_len, step): for character in decoded_sequence[block_start : block_start + step]: # Pre-compute all character properties once (shared across all plugins). info_update(character) # Detectors with eligible() == always True d_up_feed(character, info) d_sw_feed(character, info) d_au_feed(character, info) # Detectors with eligible() == isprintable if info.printable: d_sp_feed(character, info) d_sr_feed(character, info) # Detectors with eligible() == isalpha if info.alpha: d_ta_feed(character, info) # SuspiciousDuplicateAccent: isalpha() and is_latin() if info.latin: d_sda_feed(character, info) # CjkUncommon: is_cjk() if info.is_cjk: d_cu_feed(character, info) # ArabicIsolatedForm: is_arabic() if info.is_arabic: d_ai_feed(character, info) mean_mess_ratio = ( d_sp.ratio + d_ta.ratio + d_up.ratio + d_sda.ratio + d_sr.ratio + d_sw.ratio + d_cu.ratio + d_au.ratio + d_ai.ratio ) if mean_mess_ratio >= maximum_threshold: break else: # Flush last word buffer in SuperWeirdWordPlugin via trailing newline. info_update("\n") d_sw_feed("\n", info) d_au_feed("\n", info) d_up_feed("\n", info) mean_mess_ratio = ( d_sp.ratio + d_ta.ratio + d_up.ratio + d_sda.ratio + d_sr.ratio + d_sw.ratio + d_cu.ratio + d_au.ratio + d_ai.ratio ) if debug: # Defensive: logger = getLogger("charset_normalizer") logger.log( TRACE, "Mess-detector extended-analysis start. " f"intermediary_mean_mess_ratio_calc={step} mean_mess_ratio={mean_mess_ratio} " f"maximum_threshold={maximum_threshold}", ) if seq_len > 16: logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") for dt in [d_sp, d_ta, d_up, d_sda, d_sr, d_sw, d_cu, d_au, d_ai]: logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3)