utf8_Tips.lua 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  --module table: every function of this utf8 helper library is stored as a field of it.
  --NOTE: this is a global named `utf8`, so on Lua versions that ship a built-in utf8 library
  --(5.3+) the assignment replaces that library table for the whole program.
  ---@class utf8
  utf8 = {}
  --raw char stepping: byte index of the char that follows the char at byte index i, followed by
  --a flag telling whether the byte at i looked like a valid utf8 lead byte.
  --  s: the string to walk.
  --  i: byte index of the current char; nil asks for the first char of the string.
  --returns nothing (nil) when there is no further char inside the string.
  --the size of a char is taken from its lead byte alone (continuation bytes are not checked),
  --and a byte that cannot start a char is stepped over as a 1-byte char.
  function utf8.next_raw(s, i)
    local size = #s
    if not i then
      if size == 0 then return nil end
      return 1, true --fake flag (doesn't matter since this flag is not to be taken as full validation)
    end
    if i > size then return end
    local c = s:byte(i)
    local valid = true
    if c <= 0x7F then
      i = i + 1
    elseif c >= 0xC2 and c <= 0xDF then
      i = i + 2
    elseif c >= 0xE0 and c <= 0xEF then
      i = i + 3
    elseif c >= 0xF0 and c <= 0xF4 then
      i = i + 4
    else --invalid lead byte: step over it as a 1-byte char
      i = i + 1
      valid = false
    end
    --never hand out an index past the end of the string: a trailing invalid byte must not
    --yield #s + 1, or every iterator built on this function would visit one phantom char.
    if i > size then return end
    return i, valid
  end
  --next() is the generic iterator and can be replaced for different semantics. next_raw() must preserve its semantics.
  utf8.next = utf8.next_raw
  --iterator factory for the generic `for`: walks the chars of s and yields the byte index
  --where each char starts (plus whatever extra values utf8.next returns).
  --previ is optional: the byte index of the char to resume after; nil starts at the first char.
  function utf8.byte_indices(s, previ)
    local step = utf8.next
    return step, s, previ
  end
  --count the chars in s, i.e. the number of positions produced by utf8.byte_indices(s).
  --raises an assertion error when s is nil.
  function utf8.len(s)
    assert(s, "bad argument #1 to 'len' (string expected, got nil)")
    local total = 0
    --drive the iterator by hand; same protocol the generic `for` uses.
    local step, state, pos = utf8.byte_indices(s)
    pos = step(state, pos)
    while pos ~= nil do
      total = total + 1
      pos = step(state, pos)
    end
    return total
  end
  --translate a char index into the byte index where that char starts.
  --returns nothing when target_ci is below 1 or past the last char of s.
  --raises "invalid index" when the walk finishes without a match although target_ci is not
  --past the last char (e.g. a fractional index).
  function utf8.byte_index(s, target_ci)
    if target_ci < 1 then return end
    local nchars = 0
    for pos in utf8.byte_indices(s) do
      nchars = nchars + 1
      if nchars == target_ci then
        return pos
      end
    end
    --whole string walked without a hit: only legitimate if the target lies beyond the end.
    assert(target_ci > nchars, "invalid index")
  end
  --translate a byte index into the index of the char that starts at that byte.
  --returns nothing when target_i is outside 1..#s.
  --raises "invalid index" when target_i is inside the string but no char starts there.
  function utf8.char_index(s, target_i)
    local outside = target_i < 1 or target_i > #s
    if outside then return end
    local nchars = 0
    for pos in utf8.byte_indices(s) do
      nchars = nchars + 1
      if pos == target_i then
        return nchars
      end
    end
    error("invalid index")
  end
  --byte index of the char that precedes the char starting at byte index nexti, together with
  --the flag the iterator reported for that preceding position.
  --nexti defaults to #s + 1, i.e. "give me the last char".
  --returns nothing when nexti is outside the 2..#s+1 range; raises "invalid index" when nexti
  --is in range but is not the start of a char.
  --NOTE: the string is scanned from its beginning on every call, so this is O(N), unlike next().
  function utf8.prev(s, nexti)
    local past_end = #s + 1
    if not nexti then nexti = past_end end
    if nexti <= 1 or nexti > past_end then return end
    local prev_i, prev_valid = utf8.next(s)
    for cur_i, cur_valid in utf8.byte_indices(s) do
      if cur_i == nexti then
        return prev_i, prev_valid
      end
      prev_i, prev_valid = cur_i, cur_valid
    end
    --the scan ended without meeting nexti: that is only fine for the one-past-the-end position.
    if nexti ~= past_end then
      error("invalid index")
    end
    return prev_i, prev_valid
  end
  --iterate the chars of s from last to first, yielding the byte index where each char starts.
  --nexti is optional: iteration begins with the char before byte index nexti.
  function utf8.byte_indices_reverse(s, nexti)
    if #s < 200 then
      --short string: stepping with prev() rescans from the start each time (quadratic overall),
      --which the original author judged acceptable below 200 bytes.
      return utf8.prev, s, nexti
    end
    --long string: record every char start in one forward pass, then hand them out backwards.
    --costs a temporary table (memory + garbage) in exchange for a single linear scan.
    local starts, n = {}, 0
    for i in utf8.byte_indices(s) do
      if nexti and i >= nexti then break end
      n = n + 1
      starts[n] = i
    end
    local cursor = n + 1
    return function()
      cursor = cursor - 1
      return starts[cursor] --nil once the cursor drops below 1, which ends the loop
    end
  end
  --sub based on char indices, which, unlike with standard string.sub(), can't be negative.
  --  s:        the string to slice.
  --  start_ci: char index of the first char to keep, 1..inf.
  --  end_ci:   char index of the last char to keep (inclusive, like string.sub), 0..inf;
  --            nil means "up to the last char".
  --if start_ci is past the last char or end_ci < start_ci, the empty string is returned.
  --if end_ci is at or past the last char, the result runs to the end of the string.
  --raises 'invalid index' for an index that is in range but never matches a char position
  --(e.g. a fractional index), and an assertion error for start_ci < 1 or end_ci < 0.
  function utf8.sub(s, start_ci, end_ci)
    --assert for positive indices because we might implement negative indices in the future.
    assert(start_ci >= 1)
    assert(not end_ci or end_ci >= 0)
    --the slice must stop on the last byte of char end_ci, which is the byte just before the
    --start of char end_ci + 1. (stopping just before char end_ci itself would drop that char.)
    local after_end_ci = end_ci and end_ci + 1
    local ci = 0
    local start_i, stop_i
    for i in utf8.byte_indices(s) do
      ci = ci + 1
      if ci == start_ci then
        start_i = i
      end
      if ci == after_end_ci then
        stop_i = i - 1
      end
    end
    if not start_i then
      assert(start_ci > ci, 'invalid index')
      return ''
    end
    if end_ci then
      if end_ci < start_ci then
        return ''
      end
      if not stop_i then
        --no char follows end_ci: legitimate only when end_ci is the last char or beyond.
        assert(end_ci >= ci, 'invalid index')
      end
    end
    --stop_i == nil lets string.sub run to the end of the string.
    return s:sub(start_i, stop_i)
  end
  --test whether the bytes of sub appear in s starting at byte index i, comparing byte by byte
  --so that no temporary substring is created.
  --returns nil when i is outside 1..#s, true on a match (always true for an empty sub),
  --false otherwise (including when sub would run past the end of s).
  function utf8.contains(s, i, sub)
    if i < 1 or i > #s then return nil end
    local k = 1
    local last = #sub
    while k <= last do
      local theirs = sub:byte(k)
      local ours = s:byte(i + k - 1) --nil past the end of s, which never equals a byte
      if ours ~= theirs then
        return false
      end
      k = k + 1
    end
    return true
  end
  --count the non-overlapping occurrences of sub in s, scanning left to right.
  --sub must not be the empty string (asserted).
  function utf8.count(s, sub)
    assert(#sub > 0)
    local sub_size = #sub
    local hits = 0
    local pos = 1
    while pos do
      if utf8.contains(s, pos, sub) then
        hits = hits + 1
        --jump over the whole match so occurrences cannot overlap.
        pos = pos + sub_size
        if pos > #s then break end
      else
        --no match here: move on to the next char (nil at the end stops the loop).
        pos = utf8.next(s, pos)
      end
    end
    return hits
  end
  --utf8 validation and sanitization
  --true when byte b exists and lies in lo..hi; always a real boolean, never nil.
  local function byte_in_range(b, lo, hi)
    return b ~= nil and b >= lo and b <= hi
  end
  --check if there's a valid utf8 codepoint at byte index i. valid ranges for each utf8 byte are:
  --  byte 1     2         3         4
  --  ------------------------------------------
  --  00 - 7F
  --  C2 - DF    80 - BF
  --  E0         A0 - BF   80 - BF
  --  E1 - EC    80 - BF   80 - BF
  --  ED         80 - 9F   80 - BF
  --  EE - EF    80 - BF   80 - BF
  --  F0         90 - BF   80 - BF   80 - BF
  --  F1 - F3    80 - BF   80 - BF   80 - BF
  --  F4         80 - 8F   80 - BF   80 - BF
  --in addition, the encodings of U+FFFE and U+FFFF (EF BF BE / EF BF BF) are rejected.
  --always returns true or false; a sequence cut short by the end of the string is false.
  function utf8.isvalid(s, i)
    local c = s:byte(i)
    if not c then
      return false
    end
    if c <= 0x7F then
      return true
    end
    if c >= 0xC2 and c <= 0xDF then
      return byte_in_range(s:byte(i + 1), 0x80, 0xBF)
    end
    --for 3- and 4-byte chars only the second byte has a lead-dependent range;
    --every later byte is a plain continuation byte (80 - BF).
    local lo2, hi2 = 0x80, 0xBF
    if c >= 0xE0 and c <= 0xEF then
      local c2 = s:byte(i + 1)
      local c3 = s:byte(i + 2)
      if c == 0xE0 then
        lo2 = 0xA0 --excludes overlong encodings
      elseif c == 0xED then
        hi2 = 0x9F --excludes the surrogate range
      elseif c == 0xEF and c2 == 0xBF and (c3 == 0xBE or c3 == 0xBF) then
        return false --uFFFE and uFFFF non-characters
      end
      return byte_in_range(c2, lo2, hi2)
        and byte_in_range(c3, 0x80, 0xBF)
    end
    if c >= 0xF0 and c <= 0xF4 then
      local c2 = s:byte(i + 1)
      local c3 = s:byte(i + 2)
      local c4 = s:byte(i + 3)
      if c == 0xF0 then
        lo2 = 0x90 --excludes overlong encodings
      elseif c == 0xF4 then
        hi2 = 0x8F --excludes codepoints above U+10FFFF
      end
      return byte_in_range(c2, lo2, hi2)
        and byte_in_range(c3, 0x80, 0xBF)
        and byte_in_range(c4, 0x80, 0xBF)
    end
    --80 - C1 and F5 - FF can never start a char.
    return false
  end
  --byte index of the next valid utf8 char after the char at byte index i (nil i = start of s).
  --nil if indices go out of range. invalid characters are skipped.
  --stepping always uses next_raw(), not the replaceable utf8.next, so this keeps working when
  --utf8.next is swapped for an iterator with different semantics.
  function utf8.next_valid(s, i)
    i = utf8.next_raw(s, i)
    --the flag returned by next_raw() describes the char being stepped over, not the char at
    --the returned index, so it must not be used to reject the new position: doing so would
    --drop the valid char that directly follows an invalid byte. isvalid() is the only judge.
    while i and not utf8.isvalid(s, i) do
      i = utf8.next_raw(s, i)
    end
    return i
  end
  --iterator factory for the generic `for`: like byte_indices(), but stepping with
  --utf8.next_valid, so only the start indices of valid chars are produced.
  function utf8.valid_byte_indices(s)
    local step = utf8.next_valid
    return step, s
  end
  --raise an error naming the byte index of the first char of s that is not valid utf8;
  --returns nothing when every char passes.
  function utf8.validate(s)
    for pos, lead_ok in utf8.byte_indices(s) do
      --a position passes only if the iterator's flag is set and isvalid() accepts the char.
      local ok = lead_ok and utf8.isvalid(s, pos)
      if not ok then
        error(string.format('invalid utf8 char at #%d', pos))
      end
    end
  end
  --replace() callback for table-driven replacement: the text spanning bytes i..j of s is used
  --as the key into t; a missing key gives nil, which leaves the char untouched.
  local function table_lookup(s, i, j, t)
    local key = s:sub(i, j)
    return t[key]
  end
  --replace characters in string based on a function f(s, i, j, ...) -> replacement_string | nil
  --  s:   the string to process.
  --  f:   called once per char with the string and the first/last byte index of the char, plus
  --       any extra arguments given to replace(). a truthy result replaces the char; nil or
  --       false keeps it. f may also be a table, in which case each char is looked up in it.
  --  ...: extra arguments forwarded to f.
  --returns the new string (s itself when s is empty).
  function utf8.replace(s, f, ...)
    if type(f) == 'table' then
      return utf8.replace(s, table_lookup, f)
    end
    if s == '' then
      return s
    end
    local size = #s
    local parts = {}
    local lasti = 1 --first byte not yet copied to the output
    for i in utf8.byte_indices(s) do
      --guard against an iterator that yields a position past the end of the string:
      --there is no char there, so f must not be asked about it.
      if i > size then break end
      local nexti = utf8.next(s, i) or size + 1
      local repl = f(s, i, nexti - 1, ...)
      if repl then
        --copy the untouched run before this char, then the replacement.
        parts[#parts + 1] = s:sub(lasti, i - 1)
        parts[#parts + 1] = repl
        lasti = nexti
      end
    end
    parts[#parts + 1] = s:sub(lasti)
    return table.concat(parts)
  end
  --replace() callback used by sanitize(): yields repl_char for a char that fails isvalid()
  --and nothing for a valid one, so valid chars are kept as they are. j is unused.
  local function replace_invalid(s, i, j, repl_char)
    if utf8.isvalid(s, i) then
      return
    end
    return repl_char
  end
  --return a copy of s in which every char that is not valid utf8 is replaced by repl_char.
  --repl_char is optional and defaults to the replacement character (\uFFFD).
  function utf8.sanitize(s, repl_char)
    if not repl_char then
      repl_char = '�' --\uFFFD
    end
    return utf8.replace(s, replace_invalid, repl_char)
  end
  287. end