|
defmodule ElixirLS.LanguageServer.CodeUnit do
  @moduledoc """
  Code unit and offset conversions.

  The LSP protocol speaks in positions, which define where something happens in a document.
  Positions have a start and an end, which are defined as code unit _offsets_ from the
  beginning of a line. This module helps to convert between UTF-8, which most of the world
  speaks natively, and UTF-16, which has been forced upon us by Microsoft.

  Every conversion is a single O(n) walk over the line. ASCII bytes are matched directly;
  for any other character the number of UTF-8 / UTF-16 code units is derived from the
  codepoint value, so the character never has to be re-encoded just to measure it.

  All functions expect `binary` to be valid UTF-8 (a `t:String.t/0`). Input that is not
  valid UTF-8 matches no clause and raises `FunctionClauseError`.
  """

  @type utf8_code_unit :: non_neg_integer()
  @type utf16_code_unit :: non_neg_integer()
  @type utf8_offset :: non_neg_integer()
  @type utf16_offset :: non_neg_integer()

  @type error :: {:error, :misaligned} | {:error, :out_of_bounds}

  # Public

  @doc """
  Converts a character position in `binary` into a UTF-16 code unit offset.

  `character_position` counts whole characters (codepoints) from the start of `binary`.
  The result is clamped: a position past the end of `binary` yields the offset of the end
  of the line, and a position of zero or less yields `0`.

  ## Examples

      iex> ElixirLS.LanguageServer.CodeUnit.utf16_offset("do", 1)
      1

      iex> ElixirLS.LanguageServer.CodeUnit.utf16_offset("do", 10)
      2

      iex> ElixirLS.LanguageServer.CodeUnit.utf16_offset("b\u{1F3B8}", 2)
      3

  """
  @spec utf16_offset(String.t(), utf8_offset()) :: utf16_offset()
  def utf16_offset(binary, character_position) do
    do_utf16_offset(binary, character_position, 0)
  end

  @doc """
  Converts a UTF-16 code unit offset into a UTF-8 (byte) offset.

  The result is clamped: an offset past the end of `binary` yields the byte offset of the
  end of the line, and an offset of zero or less yields `0`. An offset that falls inside a
  surrogate pair resolves to the byte offset just after that character.

  ## Examples

      iex> ElixirLS.LanguageServer.CodeUnit.utf8_offset("do", 1)
      1

      iex> ElixirLS.LanguageServer.CodeUnit.utf8_offset("do", 10)
      2

      iex> ElixirLS.LanguageServer.CodeUnit.utf8_offset("b\u{1F3B8}", 3)
      5

  """
  @spec utf8_offset(String.t(), utf16_offset()) :: utf8_offset()
  def utf8_offset(binary, character_position) do
    do_utf8_offset(binary, character_position, 0)
  end

  @doc """
  Converts a count of UTF-16 code units into the equivalent count of UTF-8 code units.

  Unlike `utf8_offset/2` this function does not clamp. It returns:

    * `{:ok, utf8_unit}` when `utf16_unit` falls on a character boundary of `binary`
    * `{:error, :misaligned}` when `utf16_unit` falls inside a character
    * `{:error, :out_of_bounds}` when `utf16_unit` is past the end of `binary`

  ## Examples

      iex> ElixirLS.LanguageServer.CodeUnit.to_utf8("abc", 2)
      {:ok, 2}

      iex> ElixirLS.LanguageServer.CodeUnit.to_utf8("abc", 4)
      {:error, :out_of_bounds}

  """
  @spec to_utf8(String.t(), utf16_code_unit()) :: {:ok, utf8_code_unit()} | error
  def to_utf8(binary, utf16_unit) do
    do_to_utf8(binary, utf16_unit, 0)
  end

  @doc """
  Converts a count of UTF-8 code units (bytes) into the equivalent count of UTF-16 code
  units.

  Unlike `utf16_offset/2` this function does not clamp. It returns:

    * `{:ok, utf16_unit}` when `utf8_unit` falls on a character boundary of `binary`
    * `{:error, :misaligned}` when `utf8_unit` falls inside a multi-byte character
    * `{:error, :out_of_bounds}` when `utf8_unit` is past the end of `binary`

  ## Examples

      iex> ElixirLS.LanguageServer.CodeUnit.to_utf16("abc", 2)
      {:ok, 2}

      iex> ElixirLS.LanguageServer.CodeUnit.to_utf16("abc", 4)
      {:error, :out_of_bounds}

  """
  @spec to_utf16(String.t(), utf8_code_unit()) :: {:ok, utf16_code_unit()} | error
  def to_utf16(binary, utf8_unit) do
    do_to_utf16(binary, utf8_unit, 0)
  end

  @doc """
  Counts the code units `binary` occupies in the given encoding.

  Only `:utf16` is supported.

  ## Examples

      iex> ElixirLS.LanguageServer.CodeUnit.count(:utf16, "abc")
      3

  """
  @spec count(:utf16, String.t()) :: non_neg_integer()
  def count(:utf16, binary) do
    do_count_utf16(binary, 0)
  end

  # Private

  # UTF-16

  # Implementation detail of `count/2`. It stays a `def` so that any existing external
  # caller keeps working, but it is hidden from the generated documentation.
  @doc false
  def do_count_utf16(<<>>, count) do
    count
  end

  def do_count_utf16(<<c, rest::binary>>, count) when c < 128 do
    do_count_utf16(rest, count + 1)
  end

  def do_count_utf16(<<c::utf8, rest::binary>>, count) do
    do_count_utf16(rest, count + utf16_size(c))
  end

  # `remaining` is the number of characters still to be skipped. The `<= 0` guard also
  # covers a negative starting position, which would otherwise walk the whole line.
  defp do_utf16_offset(_, remaining, offset) when remaining <= 0 do
    offset
  end

  defp do_utf16_offset(<<>>, _, offset) do
    # this clause pegs the offset at the end of the string
    # no matter the character index
    offset
  end

  defp do_utf16_offset(<<c, rest::binary>>, remaining, offset) when c < 128 do
    do_utf16_offset(rest, remaining - 1, offset + 1)
  end

  defp do_utf16_offset(<<c::utf8, rest::binary>>, remaining, offset) do
    do_utf16_offset(rest, remaining - 1, offset + utf16_size(c))
  end

  defp do_to_utf16(_, 0, utf16_unit) do
    {:ok, utf16_unit}
  end

  defp do_to_utf16(_, utf8_unit, _) when utf8_unit < 0 do
    # the requested unit was consumed part-way through a multi-byte character
    {:error, :misaligned}
  end

  defp do_to_utf16(<<>>, _remaining, _utf16_unit) do
    {:error, :out_of_bounds}
  end

  defp do_to_utf16(<<c, rest::binary>>, utf8_unit, utf16_unit) when c < 128 do
    do_to_utf16(rest, utf8_unit - 1, utf16_unit + 1)
  end

  defp do_to_utf16(<<c::utf8, rest::binary>>, utf8_unit, utf16_unit) do
    do_to_utf16(rest, utf8_unit - utf8_size(c), utf16_unit + utf16_size(c))
  end

  # UTF-8

  # `remaining` is the number of UTF-16 code units still to be skipped. A character that
  # needs a surrogate pair consumes two units at once, so `remaining` can step over zero
  # and become negative; the `<= 0` guard stops the walk right after that character
  # instead of running on to the end of the line.
  defp do_utf8_offset(_, remaining, offset) when remaining <= 0 do
    offset
  end

  defp do_utf8_offset(<<>>, _, offset) do
    # this clause pegs the offset at the end of the string
    # no matter the character index
    offset
  end

  defp do_utf8_offset(<<c, rest::binary>>, remaining, offset) when c < 128 do
    do_utf8_offset(rest, remaining - 1, offset + 1)
  end

  defp do_utf8_offset(<<c::utf8, rest::binary>>, remaining, offset) do
    do_utf8_offset(rest, remaining - utf16_size(c), offset + utf8_size(c))
  end

  defp do_to_utf8(_, 0, utf8_unit) do
    {:ok, utf8_unit}
  end

  defp do_to_utf8(_, utf16_unit, _) when utf16_unit < 0 do
    # the requested unit was consumed part-way through a surrogate pair
    {:error, :misaligned}
  end

  defp do_to_utf8(<<>>, _remaining, _utf8_unit) do
    {:error, :out_of_bounds}
  end

  defp do_to_utf8(<<c, rest::binary>>, utf16_unit, utf8_unit) when c < 128 do
    do_to_utf8(rest, utf16_unit - 1, utf8_unit + 1)
  end

  defp do_to_utf8(<<c::utf8, rest::binary>>, utf16_unit, utf8_unit) do
    do_to_utf8(rest, utf16_unit - utf16_size(c), utf8_unit + utf8_size(c))
  end

  # Code unit widths

  # Number of UTF-16 code units needed to encode `codepoint`: codepoints above U+FFFF
  # are encoded as a surrogate pair, everything else fits in a single unit.
  defp utf16_size(codepoint) when codepoint > 0xFFFF, do: 2
  defp utf16_size(_codepoint), do: 1

  # Number of UTF-8 code units (bytes) needed to encode `codepoint`.
  defp utf8_size(codepoint) when codepoint < 0x80, do: 1
  defp utf8_size(codepoint) when codepoint < 0x800, do: 2
  defp utf8_size(codepoint) when codepoint < 0x10000, do: 3
  defp utf8_size(_codepoint), do: 4
end
0 commit comments