123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
--module table: every function of this file is stored as a field of it.
--NOTE: this is a global assignment, so on Lua 5.3+ it replaces the built-in `utf8` library table.
---@class utf8
utf8 = {}
--raw forward step over a utf8 string; usable as the stateless iterator function of a generic `for`.
--  next_raw(s)    -> 1, true        first char (nil for the empty string).
--  next_raw(s, i) -> nexti, valid   byte index of the char that follows the char at byte index i.
--`valid` describes the char at byte index i (the one being stepped over), not the char at nexti,
--and it only says whether the byte at i is a legal utf8 lead byte: continuation bytes are not checked,
--so it must not be taken as full validation (use isvalid() for that).
--a byte that is not a legal lead byte is stepped over as a 1-byte char.
--returns nothing when i is past the end of s or when no char starts after the char at i.
function utf8.next_raw(s, i)
	if not i then
		if #s == 0 then return nil end
		return 1, true --placeholder flag: there is no previous char to describe
	end
	if i > #s then return end
	local c = s:byte(i)
	local valid = true
	if c >= 0x00 and c <= 0x7F then
		i = i + 1
	elseif c >= 0xC2 and c <= 0xDF then
		i = i + 2
	elseif c >= 0xE0 and c <= 0xEF then
		i = i + 3
	elseif c >= 0xF0 and c <= 0xF4 then
		i = i + 4
	else --invalid lead byte: treat it as a 1-byte char
		i = i + 1
		valid = false
	end
	--this check must also cover the invalid-byte case: an invalid byte at the very end of s
	--ends the iteration like any other last char instead of reporting the index #s + 1.
	if i > #s then return end
	return i, valid
end

--next() is the iterator used by the generic functions of this module and may be replaced
--with one that has different semantics. next_raw() itself must keep the semantics above.
utf8.next = utf8.next_raw
--iterator triplet for a generic `for` that walks s char by char, producing the byte index
--where each char starts (plus whatever extra values utf8.next returns).
--previ is passed to the iterator as its initial control value; leave it nil to start from the beginning.
function utf8.byte_indices(s, previ)
	local step = utf8.next --looked up on every call, so a replaced utf8.next is honored
	return step, s, previ
end
--count the chars of s, i.e. the number of steps taken by utf8.byte_indices().
--raises an error if s is nil (or false).
function utf8.len(s)
	assert(s, "bad argument #1 to 'len' (string expected, got nil)")
	local step, state, pos = utf8.byte_indices(s)
	local n = 0
	pos = step(state, pos)
	while pos ~= nil do
		n = n + 1
		pos = step(state, pos)
	end
	return n
end
--convert a char index into the byte index where that char starts.
--returns nothing if target_ci is below 1 or beyond the last char.
--raises 'invalid index' if the walk ends without a match although target_ci is not
--beyond the last char (e.g. a fractional index).
function utf8.byte_index(s, target_ci)
	if target_ci < 1 then
		return
	end
	local seen = 0 --chars visited so far
	for pos in utf8.byte_indices(s) do
		seen = seen + 1
		if seen == target_ci then return pos end
	end
	assert(target_ci > seen, "invalid index")
end
--convert a byte index into the index of the char that starts at that byte.
--returns nothing if target_i is outside 1..#s.
--raises 'invalid index' if no char starts at target_i (e.g. it points inside a multi-byte char).
function utf8.char_index(s, target_i)
	local out_of_range = target_i < 1 or target_i > #s
	if out_of_range then
		return
	end
	local nth = 0 --index of the char currently visited
	for start in utf8.byte_indices(s) do
		nth = nth + 1
		if start == target_i then return nth end
	end
	error("invalid index")
end
--backward step: byte index where the char preceding byte index nexti starts, followed by
--the flag that the forward iterator reported together with that index.
--nexti defaults to #s + 1, i.e. the result is the last char of s.
--returns nothing if nexti is outside 2..#s + 1; raises 'invalid index' if no char starts at nexti.
--NOTE: every call rescans s from the beginning, so unlike next() this is O(N).
function utf8.prev(s, nexti)
	local past_end = #s + 1
	local target = nexti or past_end
	if target <= 1 or target > past_end then
		return
	end
	--walk forward, remembering the step before the current one
	local prev_i, prev_flag = utf8.next(s)
	for cur_i, cur_flag in utf8.byte_indices(s) do
		if cur_i == target then
			return prev_i, prev_flag
		end
		prev_i, prev_flag = cur_i, cur_flag
	end
	if target ~= past_end then
		error("invalid index")
	end
	--the target is the end of the string: the last char visited is the answer
	return prev_i, prev_flag
end
--iterate the chars of s from last to first, producing the byte index where each char starts.
--nexti, when given, limits the iteration to the chars that start before byte index nexti.
function utf8.byte_indices_reverse(s, nexti)
	if #s < 200 then
		--short string: step with prev(), which rescans the string on every call.
		--quadratic overall, but cheap enough below 200 bytes and it allocates nothing.
		return utf8.prev, s, nexti
	end
	--long string: collect the start indices in one forward pass, then hand them out backwards.
	--a single pass over s, at the cost of a temporary table.
	local starts, n = {}, 0
	for pos in utf8.byte_indices(s) do
		if nexti and pos >= nexti then break end
		n = n + 1
		starts[n] = pos
	end
	local cursor = n + 1
	return function()
		cursor = cursor - 1
		return starts[cursor] --nil once the cursor drops below 1, which ends the loop
	end
end
--substring by char indices, both ends inclusive like string.sub(), except that indices can't be negative.
--start_ci can be 1..inf and end_ci can be 0..inf; a nil end_ci means up to the last char.
--the empty string is returned if start_ci is beyond the last char or if end_ci < start_ci.
--an end_ci beyond the last char is treated as the last char.
--raises 'invalid index' for an index that is inside the string but matches no char (e.g. fractional).
function utf8.sub(s, start_ci, end_ci)
	--assert for positive indices because we might implement negative indices in the future.
	assert(start_ci >= 1)
	assert(not end_ci or end_ci >= 0)
	--the char at end_ci is part of the result, so the cut is made right before the char
	--that follows it; cutting at the start of char end_ci itself would drop that char.
	local stop_ci = end_ci and end_ci + 1
	local ci = 0
	local start_i, end_i
	for i in utf8.byte_indices(s) do
		ci = ci + 1
		if ci == start_ci then
			start_i = i
		end
		if ci == stop_ci then
			end_i = i - 1 --last byte of the char at end_ci
		end
	end
	if not start_i then
		assert(start_ci > ci, 'invalid index')
		return ''
	end
	if end_ci and not end_i then
		if end_ci < start_ci then
			return ''
		end
		--no char follows end_ci: it must be the last char or lie beyond it
		assert(end_ci >= ci, 'invalid index')
	end
	return s:sub(start_i, end_i) --a nil end_i means up to the end of s
end
--test whether sub occurs in s starting exactly at byte index i, comparing byte by byte
--so that no temporary string is created.
--returns nil if i is outside 1..#s, true if every byte of sub matches (always the case
--for the empty string), false otherwise.
function utf8.contains(s, i, sub)
	if i < 1 or i > #s then
		return nil
	end
	local base = i - 1
	local k, n = 1, #sub
	while k <= n do
		--s:byte() gives nil past the end of s, which never equals a byte of sub
		if sub:byte(k) ~= s:byte(base + k) then
			return false
		end
		k = k + 1
	end
	return true
end
--count the non-overlapping occurrences of sub in s, scanning from the first char.
--sub must not be the empty string (asserted).
function utf8.count(s, sub)
	assert(#sub > 0)
	local width = #sub
	local hits = 0
	local pos = 1
	repeat
		if not utf8.contains(s, pos, sub) then
			--no match here: move on to the next char
			pos = utf8.next(s, pos)
		else
			hits = hits + 1
			--jump over the whole match so that occurrences never overlap
			pos = pos + width
			if pos > #s then break end
		end
	until not pos
	return hits
end
--utf8 validation and sanitization

--true if b is a byte value within lo..hi; false if b is nil (i.e. past the end of the string).
local function byte_in_range(b, lo, hi)
	return b ~= nil and b >= lo and b <= hi
end

--check if there's a valid utf8 codepoint at byte index i. always returns a boolean:
--false is returned for an index outside the string and for truncated sequences too.
--valid ranges for each utf8 byte are:
--   byte 1     2          3          4
--   ------------------------------------------
--   00 - 7F
--   C2 - DF    80 - BF
--   E0         A0 - BF    80 - BF
--   E1 - EC    80 - BF    80 - BF
--   ED         80 - 9F    80 - BF
--   EE - EF    80 - BF    80 - BF
--   F0         90 - BF    80 - BF    80 - BF
--   F1 - F3    80 - BF    80 - BF    80 - BF
--   F4         80 - 8F    80 - BF    80 - BF
--additionally, EF BF BE and EF BF BF (U+FFFE and U+FFFF, non-characters) are rejected.
function utf8.isvalid(s, i)
	local c = s:byte(i)
	if not c then
		return false
	elseif c <= 0x7F then
		return true
	elseif c < 0xC2 or c > 0xF4 then
		return false --not a lead byte listed in the table above
	end
	--only the 2nd byte has a range that depends on the lead byte
	local lo2, hi2 = 0x80, 0xBF
	if c == 0xE0 then
		lo2 = 0xA0
	elseif c == 0xED then
		hi2 = 0x9F
	elseif c == 0xF0 then
		lo2 = 0x90
	elseif c == 0xF4 then
		hi2 = 0x8F
	end
	local c2 = s:byte(i + 1)
	if not byte_in_range(c2, lo2, hi2) then
		return false
	end
	if c <= 0xDF then
		return true --2-byte sequence
	end
	local c3 = s:byte(i + 2)
	if not byte_in_range(c3, 0x80, 0xBF) then
		return false
	end
	if c <= 0xEF then
		--3-byte sequence: valid unless it encodes U+FFFE or U+FFFF
		return not (c == 0xEF and c2 == 0xBF and c3 >= 0xBE)
	end
	--4-byte sequence
	return byte_in_range(s:byte(i + 3), 0x80, 0xBF)
end
--byte index of the next valid utf8 char after the char at byte index i (a nil i starts
--the search at the beginning of s). invalid chars are skipped.
--returns nil when the end of the string is reached without finding a valid char.
function utf8.next_valid(s, i)
	--only the position is taken from the stepping functions. their second result describes
	--the char that was stepped over, not the char at the new position, so using it to judge
	--the new position would skip a valid char that comes right after an invalid byte.
	i = utf8.next_raw(s, i)
	while i and not utf8.isvalid(s, i) do
		i = utf8.next(s, i)
	end
	return i
end
--iterator pair for a generic `for` that visits only the valid chars of s, producing
--the byte index where each of them starts. stepping is done by utf8.next_valid().
function utf8.valid_byte_indices(s)
	local step = utf8.next_valid
	return step, s
end
--raise an error naming the byte index of the first position that fails validation:
--either the iterator reported a false flag together with it, or isvalid() rejects it.
--returns nothing if the whole string passes.
function utf8.validate(s)
	for pos, flag in utf8.byte_indices(s) do
		local ok = flag and utf8.isvalid(s, pos)
		if not ok then
			error(string.format('invalid utf8 char at #%d', pos))
		end
	end
end
--replacement callback used when utf8.replace() is given a table:
--the char s:sub(i, j) is the key and the value stored under it (if any) is the replacement.
local function table_lookup(s, i, j, t)
	local char = s:sub(i, j)
	return t[char]
end

--replace chars of s based on a callback f(s, i, j, ...) -> replacement_string | nil.
--f is called once per char with the byte range i..j of that char plus the extra arguments;
--a true-ish result is put in place of the char, otherwise the char is kept.
--f can also be a table mapping chars to their replacements.
function utf8.replace(s, f, ...)
	if type(f) == 'table' then
		return utf8.replace(s, table_lookup, f)
	end
	if s == '' then
		return s
	end
	local parts, n = {}, 0
	local pending = 1 --first byte of s that was not copied into parts yet
	for i in utf8.byte_indices(s) do
		local after = utf8.next(s, i) or #s + 1
		local repl = f(s, i, after - 1, ...)
		if repl then
			--flush the untouched text before this char, then the replacement
			parts[n + 1] = s:sub(pending, i - 1)
			parts[n + 2] = repl
			n = n + 2
			pending = after
		end
	end
	parts[n + 1] = s:sub(pending) --untouched tail
	return table.concat(parts)
end
--replacement callback for utf8.sanitize(): gives repl_char for a char that fails isvalid()
--and nothing for a valid one (which makes utf8.replace() keep it).
local function replace_invalid(s, i, j, repl_char)
	if utf8.isvalid(s, i) then
		return
	end
	return repl_char
end

--return s with every invalid utf8 char replaced by repl_char,
--which defaults to the replacement character \uFFFD.
function utf8.sanitize(s, repl_char)
	if not repl_char then
		repl_char = '�' --\uFFFD
	end
	return utf8.replace(s, replace_invalid, repl_char)
end
|