using System.Buffers.Binary;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;

namespace Dalamud.Interface.Internal.ImGuiSeStringRenderer.TextProcessing;

/// <summary>Represents a single value to be used in a UTF-N byte sequence.</summary>
/// <remarks>
/// The three public fields overlay the same 4 bytes of storage; they are different views of one raw value.
/// The value is not validated, and may lie outside of the Unicode codepoint range.
/// </remarks>
[StructLayout(LayoutKind.Explicit, Size = 4)]
[DebuggerDisplay("0x{IntValue,h} ({CharValue})")]
internal readonly struct UtfValue : IEquatable<UtfValue>, IComparable<UtfValue>
{
    /// <summary>The unicode codepoint in int, that may not be in a valid range.</summary>
    [FieldOffset(0)]
    public readonly int IntValue;

    /// <summary>The unicode codepoint in uint, that may not be in a valid range.</summary>
    [FieldOffset(0)]
    public readonly uint UIntValue;

    /// <summary>The 16 bits stored at offset 0, viewed as a char.</summary>
    /// <remarks>On little-endian platforms these are the low 16 bits of the value, so anything outside BMP is cut
    /// off.</remarks>
    [FieldOffset(0)]
    public readonly char CharValue;

    /// <summary>Initializes a new instance of the <see cref="UtfValue"/> struct.</summary>
    /// <param name="value">The raw codepoint value.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public UtfValue(uint value) => this.UIntValue = value;

    /// <inheritdoc cref="UtfValue(uint)"/>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public UtfValue(int value) => this.IntValue = value;

    /// <summary>Gets the length of this codepoint, encoded in UTF-8.</summary>
    public int Length8
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => GetEncodedLength8(this);
    }

    /// <summary>Gets the length of this codepoint, encoded in UTF-16.</summary>
    public int Length16
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => GetEncodedLength16(this);
    }

    /// <summary>Gets the short name, if supported.</summary>
    /// <returns>The buffer containing the short name, or empty if unsupported.</returns>
    public ReadOnlySpan<char> ShortName
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => GetShortName(this);
    }

    public static implicit operator uint(UtfValue c) => c.UIntValue;

    public static implicit operator int(UtfValue c) => c.IntValue;

    public static implicit operator UtfValue(byte c) => new(c);

    public static implicit operator UtfValue(sbyte c) => new(c);

    public static implicit operator UtfValue(ushort c) => new(c);

    public static implicit operator UtfValue(short c) => new(c);

    public static implicit operator UtfValue(uint c) => new(c);

    public static implicit operator UtfValue(int c) => new(c);

    public static implicit operator UtfValue(char c) => new(c);

    public static implicit operator UtfValue(Rune c) => new(c.Value);

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator ==(UtfValue left, UtfValue right) => left.Equals(right);

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator !=(UtfValue left, UtfValue right) => !left.Equals(right);

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator <(UtfValue left, UtfValue right) => left.CompareTo(right) < 0;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator >(UtfValue left, UtfValue right) => left.CompareTo(right) > 0;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator <=(UtfValue left, UtfValue right) => left.CompareTo(right) <= 0;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool operator >=(UtfValue left, UtfValue right) => left.CompareTo(right) >= 0;

    /// <summary>Gets the short name of the codepoint, for some select codepoints.</summary>
    /// <param name="codepoint">The codepoint.</param>
    /// <returns>The short name, or an empty span if the codepoint has no entry in the table below.</returns>
    public static ReadOnlySpan<char> GetShortName(int codepoint) =>
        codepoint switch
        {
            // C0 control characters.
            0x00 => "NUL",
            0x01 => "SOH",
            0x02 => "STX",
            0x03 => "ETX",
            0x04 => "EOT",
            0x05 => "ENQ",
            0x06 => "ACK",
            0x07 => "BEL",
            0x08 => "BS",
            0x09 => "HT",
            0x0a => "LF",
            0x0b => "VT",
            0x0c => "FF",
            0x0d => "CR",
            0x0e => "SO",
            0x0f => "SI",

            0x10 => "DLE",
            0x11 => "DC1",
            0x12 => "DC2",
            0x13 => "DC3",
            0x14 => "DC4",
            0x15 => "NAK",
            0x16 => "SYN",
            0x17 => "ETB", // END OF TRANSMISSION BLOCK; was mislabelled as "SOH", which belongs to 0x01.
            0x18 => "CAN",
            0x19 => "EOM",
            0x1a => "SUB",
            0x1b => "ESC",
            0x1c => "FS",
            0x1d => "GS",
            0x1e => "RS",
            0x1f => "US",

            // C1 control characters.
            0x80 => "PAD",
            0x81 => "HOP",
            0x82 => "BPH",
            0x83 => "NBH",
            0x84 => "IND",
            0x85 => "NEL",
            0x86 => "SSA",
            0x87 => "ESA",
            0x88 => "HTS",
            0x89 => "HTJ",
            0x8a => "VTS",
            0x8b => "PLD",
            0x8c => "PLU",
            0x8d => "RI",
            0x8e => "SS2",
            0x8f => "SS3",

            0x90 => "DCS",
            0x91 => "PU1",
            0x92 => "PU2",
            0x93 => "STS",
            0x94 => "CCH",
            0x95 => "MW",
            0x96 => "SPA",
            0x97 => "EPA",
            0x98 => "SOS",
            0x99 => "SGC",
            0x9a => "SCI",
            0x9b => "CSI",
            0x9c => "ST",
            0x9d => "OSC",
            0x9e => "PM",
            0x9f => "APC",

            // Invisible Latin-1 characters.
            0xa0 => "NBSP",
            0xad => "SHY",

            _ => default,
        };

    /// <summary>Gets the length of the codepoint, when encoded in UTF-8.</summary>
    /// <param name="codepoint">The codepoint to encode. It is reinterpreted as an unsigned 32-bit value.</param>
    /// <returns>The length in bytes, from 1 to 7.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetEncodedLength8(int codepoint) =>
        (uint)codepoint switch
        {
            < 1u << 7 => 1,
            < 1u << 11 => 2,
            < 1u << 16 => 3,
            < 1u << 21 => 4,

            // Not a valid Unicode codepoint anymore below.
            < 1u << 26 => 5,
            < 1u << 31 => 6,
            _ => 7,
        };

    /// <summary>Gets the length of the codepoint, when encoded in UTF-16.</summary>
    /// <param name="codepoint">The codepoint to encode. It is reinterpreted as an unsigned 32-bit value.</param>
    /// <returns>The length in bytes: 2, 4, 6, or 8.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetEncodedLength16(int codepoint) =>
        (uint)codepoint switch
        {
            < 0x10000 => 2,
            < 0x10000 + (1 << 20) => 4,

            // Not a valid Unicode codepoint anymore below.
            < 0x10000 + (1 << 30) => 6,
            _ => 8,
        };

    /// <inheritdoc cref="TryDecode8(ReadOnlySpan{byte}, out UtfValue, out int)"/>
    /// <remarks>Trims <paramref name="source"/> at beginning by <paramref name="length"/>.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool TryDecode8(ref ReadOnlySpan<byte> source, out UtfValue value, out int length)
    {
        var v = TryDecode8(source, out value, out length);
        source = source[length..];
        return v;
    }

    /// <summary>Attempts to decode a value from a UTF-8 byte sequence.</summary>
    /// <param name="source">The span to decode from.</param>
    /// <param name="value">The decoded value, or <c>default</c> on failure.</param>
    /// <param name="length">The length of the consumed bytes. <c>0</c> if <paramref name="source"/> is empty.
    /// <c>1</c> if sequence is broken.</param>
    /// <returns><c>true</c> if <paramref name="source"/> is successfully decoded.</returns>
    /// <remarks>Codepoints that results in <c>false</c> from <see cref="Rune.IsValid(int)"/> can still be returned,
    /// including unpaired surrogate characters, or codepoints above U+10FFFF. Overlong forms are not rejected.
    /// This function returns a value only indicating whether the sequence could be decoded into a number, without
    /// being too short.</remarks>
    public static bool TryDecode8(ReadOnlySpan<byte> source, out UtfValue value, out int length)
    {
        if (source.IsEmpty)
        {
            value = default;
            length = 0;
            return false;
        }

        uint lead = source[0];
        if (lead < 0x80)
        {
            // Single byte (ASCII) form.
            value = lead;
            length = 1;
            return true;
        }

        // The number of leading 1 bits in the lead byte tells the total sequence length.
        var expected = lead switch
        {
            < 0xC0 => 0, // 10xxxxxx: a continuation byte cannot start a sequence.
            < 0xE0 => 2, // 110xxxxx
            < 0xF0 => 3, // 1110xxxx
            < 0xF8 => 4, // 11110xxx
            < 0xFC => 5, // 111110xx
            < 0xFE => 6, // 1111110x
            0xFE => 7,   // 11111110
            _ => 0,      // 11111111: never a lead byte.
        };

        var ok = expected != 0 && source.Length >= expected;

        // In the 7 byte form, only 32 - 30 = 2 bits remain for the first continuation byte.
        if (ok && expected == 7 && (source[1] & 0b11111100) != 0b10000000)
            ok = false;

        // Payload bits of the lead byte: mask is 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00 for lengths 2 to 7.
        var acc = lead & (0x7Fu >> expected);
        for (var i = 1; ok && i < expected; i++)
        {
            uint cont = source[i];
            if ((cont & 0b11000000) != 0b10000000)
                ok = false;
            else
                acc = (acc << 6) | (cont & 0x3F);
        }

        if (!ok)
        {
            value = default;
            length = 1;
            return false;
        }

        value = acc;
        length = expected;
        return true;
    }

    /// <inheritdoc cref="TryDecode16(ReadOnlySpan{byte}, bool, out UtfValue, out int)"/>
    /// <remarks>Trims <paramref name="source"/> at beginning by <paramref name="length"/>.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool TryDecode16(ref ReadOnlySpan<byte> source, bool be, out UtfValue value, out int length)
    {
        var v = TryDecode16(source, be, out value, out length);
        source = source[length..];
        return v;
    }

    /// <summary>Attempts to decode a value from a UTF-16 byte sequence.</summary>
    /// <param name="source">The span to decode from.</param>
    /// <param name="be">Whether to use big endian.</param>
    /// <param name="value">The decoded value, or <c>default</c> on failure.</param>
    /// <param name="length">The length of the consumed bytes. <c>0</c> or <c>1</c> if cut short.
    /// <c>2</c> if the value is an unpaired surrogate.</param>
    /// <returns><c>true</c> if <paramref name="source"/> is successfully decoded.</returns>
    /// <remarks>Codepoints that results in <c>false</c> from <see cref="Rune.IsValid(int)"/> can still be returned,
    /// including unpaired surrogate characters, or codepoints above U+10FFFF. This function returns a value only
    /// indicating whether the sequence could be decoded into a number, without being too short.</remarks>
    public static bool TryDecode16(ReadOnlySpan<byte> source, bool be, out UtfValue value, out int length)
    {
        if (source.Length < 2)
        {
            value = default;
            length = source.Length;
            return false;
        }

        // Every unit, including the lookahead ones, has to be read using the requested byte order.
        uint unit0 = ReadUnit16(source, be);
        if (char.IsHighSurrogate((char)unit0))
        {
            uint unit1 = source.Length >= 4 ? ReadUnit16(source[2..], be) : 0u;
            if (char.IsLowSurrogate((char)unit1))
            {
                // Not a valid Unicode codepoint anymore when more than one low surrogate follows.
                uint unit2 = source.Length >= 6 ? ReadUnit16(source[4..], be) : 0u;
                if (char.IsLowSurrogate((char)unit2))
                {
                    uint unit3 = source.Length >= 8 ? ReadUnit16(source[6..], be) : 0u;
                    if (char.IsLowSurrogate((char)unit3))
                    {
                        // 2 + 10 + 10 + 10 bits. The sum may wrap around; that is intended.
                        value = unchecked(
                            0x10000u + (((unit0 & 0x3u) << 30)
                                        | ((unit1 & 0x3FFu) << 20)
                                        | ((unit2 & 0x3FFu) << 10)
                                        | (unit3 & 0x3FFu)));
                        length = 8;
                        return true;
                    }

                    // 10 + 10 + 10 bits.
                    value = unchecked(
                        0x10000u + (((unit0 & 0x3FFu) << 20)
                                    | ((unit1 & 0x3FFu) << 10)
                                    | (unit2 & 0x3FFu)));
                    length = 6;
                    return true;
                }

                // Standard surrogate pair: 10 + 10 bits.
                value = 0x10000u + (((unit0 & 0x3FFu) << 10) | (unit1 & 0x3FFu));
                length = 4;
                return true;
            }
        }

        // Callers are supposed to handle unpaired surrogates.
        value = unit0;
        length = 2;
        return true;
    }

    /// <inheritdoc cref="TryDecode32(ReadOnlySpan{byte}, bool, out UtfValue, out int)"/>
    /// <remarks>Trims <paramref name="source"/> at beginning by <paramref name="length"/>.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool TryDecode32(ref ReadOnlySpan<byte> source, bool be, out UtfValue value, out int length)
    {
        var v = TryDecode32(source, be, out value, out length);
        source = source[length..];
        return v;
    }

    /// <summary>Attempts to decode a value from a UTF-32 byte sequence.</summary>
    /// <param name="source">The span to decode from.</param>
    /// <param name="be">Whether to use big endian.</param>
    /// <param name="value">The decoded value, or <c>default</c> on failure.</param>
    /// <param name="length">The length of the consumed bytes. <c>0</c> to <c>3</c> if cut short.
    /// <c>4</c> otherwise.</param>
    /// <returns><c>true</c> if <paramref name="source"/> is successfully decoded.</returns>
    /// <remarks>Codepoints that results in <c>false</c> from <see cref="Rune.IsValid(int)"/> can still be returned,
    /// including unpaired surrogate characters, or codepoints above U+10FFFF. This function returns a value only
    /// indicating whether the sequence could be decoded into a number, without being too short.</remarks>
    public static bool TryDecode32(ReadOnlySpan<byte> source, bool be, out UtfValue value, out int length)
    {
        if (source.Length < 4)
        {
            value = default;
            length = source.Length;
            return false;
        }

        // At least 4 bytes are available at this point, so the read cannot fail.
        value = be
                    ? BinaryPrimitives.ReadInt32BigEndian(source)
                    : BinaryPrimitives.ReadInt32LittleEndian(source);
        length = 4;
        return true;
    }

    /// <summary>Encodes the codepoint to the target in UTF-8.</summary>
    /// <param name="target">The target stream.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <returns>The length of the encoded data, which is the number of bytes written to the stream.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int Encode8(Stream target, int codepoint)
    {
        // 7 bytes is the longest sequence GetEncodedLength8 can report.
        Span<byte> buf = stackalloc byte[7];
        Encode8(buf, codepoint, out var length);
        target.Write(buf[..length]);
        return length;
    }

    /// <summary>Encodes the codepoint to the target in UTF-8.</summary>
    /// <param name="target">The target byte span.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <returns>The length of the encoded data.</returns>
    /// <remarks>Trims <paramref name="target"/> at beginning by the length.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int Encode8(ref Span<byte> target, int codepoint)
    {
        target = Encode8(target, codepoint, out var length);
        return length;
    }

    /// <summary>Encodes the codepoint to the target in UTF-8.</summary>
    /// <param name="target">The optional target byte span. If empty, nothing is written and only
    /// <paramref name="length"/> is calculated.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <param name="length">The length of the encoded data.</param>
    /// <returns>The remaining region of <paramref name="target"/>.</returns>
    /// <remarks>A non-empty <paramref name="target"/> shorter than <paramref name="length"/> makes the span
    /// indexer throw once the write position runs past the end.</remarks>
    public static Span<byte> Encode8(Span<byte> target, int codepoint, out int length)
    {
        var value = (uint)codepoint;
        length = GetEncodedLength8(codepoint);
        if (target.IsEmpty)
            return target;

        switch (length)
        {
            case 1:
                target[0] = (byte)value;
                break;

            case 2:
                target[0] = (byte)(0xC0 | ((value >> 6) & 0x1F));
                target[1] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            case 3:
                target[0] = (byte)(0xE0 | ((value >> 12) & 0x0F));
                target[1] = (byte)(0x80 | ((value >> 6) & 0x3F));
                target[2] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            case 4:
                target[0] = (byte)(0xF0 | ((value >> 18) & 0x07));
                target[1] = (byte)(0x80 | ((value >> 12) & 0x3F));
                target[2] = (byte)(0x80 | ((value >> 6) & 0x3F));
                target[3] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            // Not a valid Unicode codepoint anymore below.
            case 5:
                target[0] = (byte)(0xF8 | ((value >> 24) & 0x03));
                target[1] = (byte)(0x80 | ((value >> 18) & 0x3F));
                target[2] = (byte)(0x80 | ((value >> 12) & 0x3F));
                target[3] = (byte)(0x80 | ((value >> 6) & 0x3F));
                target[4] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            case 6:
                target[0] = (byte)(0xFC | ((value >> 30) & 0x01));
                target[1] = (byte)(0x80 | ((value >> 24) & 0x3F));
                target[2] = (byte)(0x80 | ((value >> 18) & 0x3F));
                target[3] = (byte)(0x80 | ((value >> 12) & 0x3F));
                target[4] = (byte)(0x80 | ((value >> 6) & 0x3F));
                target[5] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            case 7:
                // The lead byte has no payload bits; the top 2 bits go to the first continuation byte.
                target[0] = 0xFE;
                target[1] = (byte)(0x80 | ((value >> 30) & 0x03));
                target[2] = (byte)(0x80 | ((value >> 24) & 0x3F));
                target[3] = (byte)(0x80 | ((value >> 18) & 0x3F));
                target[4] = (byte)(0x80 | ((value >> 12) & 0x3F));
                target[5] = (byte)(0x80 | ((value >> 6) & 0x3F));
                target[6] = (byte)(0x80 | ((value >> 0) & 0x3F));
                break;

            default:
                Debug.Assert(false, $"{nameof(Length8)} property should have produced all possible cases.");
                return target;
        }

        return target[length..];
    }

    /// <summary>Encodes the codepoint to the target in UTF-16.</summary>
    /// <param name="target">The target stream.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <param name="be">Whether to use big endian.</param>
    /// <returns>The length of the encoded data, which is the number of bytes written to the stream.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int Encode16(Stream target, int codepoint, bool be)
    {
        // 8 bytes is the longest sequence GetEncodedLength16 can report.
        Span<byte> buf = stackalloc byte[8];
        Encode16(buf, codepoint, be, out var length);
        target.Write(buf[..length]);
        return length;
    }

    /// <summary>Encodes the codepoint to the target in UTF-16.</summary>
    /// <param name="target">The target byte span.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <param name="be">Whether to use big endian.</param>
    /// <returns>The length of the encoded data.</returns>
    /// <remarks>Trims <paramref name="target"/> at beginning by the length.</remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int Encode16(ref Span<byte> target, int codepoint, bool be)
    {
        target = Encode16(target, codepoint, be, out var length);
        return length;
    }

    /// <summary>Encodes the codepoint to the target in UTF-16.</summary>
    /// <param name="target">The optional target byte span. If empty, nothing is written and only
    /// <paramref name="length"/> is calculated.</param>
    /// <param name="codepoint">The codepoint to encode.</param>
    /// <param name="be">Whether to use big endian.</param>
    /// <param name="length">The length of the encoded data.</param>
    /// <returns>The remaining region of <paramref name="target"/>.</returns>
    public static Span<byte> Encode16(Span<byte> target, int codepoint, bool be, out int length)
    {
        var value = (uint)codepoint;
        length = GetEncodedLength16(codepoint);
        if (target.IsEmpty)
            return target;

        switch (length)
        {
            case 2:
                WriteUnit16(target, value, be);
                break;

            case 4:
                value -= 0x10000;
                WriteUnit16(target, 0xD800u | ((value >> 10) & 0x3FFu), be);
                WriteUnit16(target[2..], 0xDC00u | (value & 0x3FFu), be);
                break;

            // Not a valid Unicode codepoint anymore below.
            case 6:
                value -= 0x10000;
                WriteUnit16(target, 0xD800u | ((value >> 20) & 0x3FFu), be);
                WriteUnit16(target[2..], 0xDC00u | ((value >> 10) & 0x3FFu), be);
                WriteUnit16(target[4..], 0xDC00u | (value & 0x3FFu), be);
                break;

            case 8:
                value -= 0x10000;
                WriteUnit16(target, 0xD800u | ((value >> 30) & 0x3u), be);
                WriteUnit16(target[2..], 0xDC00u | ((value >> 20) & 0x3FFu), be);
                WriteUnit16(target[4..], 0xDC00u | ((value >> 10) & 0x3FFu), be);
                WriteUnit16(target[6..], 0xDC00u | (value & 0x3FFu), be);
                break;

            default:
                Debug.Assert(false, $"{nameof(Length16)} property should have produced all possible cases.");
                return target;
        }

        return target[length..];
    }

    /// <inheritdoc/>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int CompareTo(UtfValue other) => this.IntValue.CompareTo(other.IntValue);

    /// <inheritdoc/>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Equals(UtfValue other) => this.IntValue == other.IntValue;

    /// <inheritdoc/>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override bool Equals(object? obj) => obj is UtfValue other && this.Equals(other);

    /// <inheritdoc/>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override int GetHashCode() => this.IntValue;

    /// <summary>Attempts to get the corresponding rune.</summary>
    /// <param name="rune">The retrieved rune, or <c>default</c> if the value is not a valid rune.</param>
    /// <returns><c>true</c> if retrieved.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool TryGetRune(out Rune rune)
    {
        if (Rune.IsValid(this.IntValue))
        {
            rune = new(this.IntValue);
            return true;
        }

        rune = default;
        return false;
    }

    /// <summary>Encodes the codepoint to the target in UTF-8.</summary>
    /// <param name="target">The target byte span.</param>
    /// <returns>The remaining region of <paramref name="target"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Span<byte> Encode8(Span<byte> target) => Encode8(target, this, out _);

    /// <summary>Reads one UTF-16 code unit from the beginning of the span, in the requested byte order.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static ushort ReadUnit16(ReadOnlySpan<byte> source, bool be) =>
        be ? BinaryPrimitives.ReadUInt16BigEndian(source) : BinaryPrimitives.ReadUInt16LittleEndian(source);

    /// <summary>Writes the low 16 bits of the unit to the beginning of the span, in the requested byte order.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void WriteUnit16(Span<byte> target, uint unit, bool be)
    {
        if (be)
            BinaryPrimitives.WriteUInt16BigEndian(target, unchecked((ushort)unit));
        else
            BinaryPrimitives.WriteUInt16LittleEndian(target, unchecked((ushort)unit));
    }
}