refactor: extract out utf16 map from lsp to cli::util module (#27950)

2025-03-03 17:34:47 -05:00 · 2025-02-03 20:24:26 -05:00 · 2025-02-03 20:24:26 -05:00 · 073caf5fe9
commit 073caf5fe9
parent 108b6a8bfb
3 changed files with 322 additions and 350 deletions
--- a/cli/lsp/text.rs
+++ b/cli/lsp/text.rs
@ -1,9 +1,6 @@
 // Copyright 2018-2025 the Deno authors. MIT license.

-use std::collections::HashMap;
-
 use deno_core::error::AnyError;
-use deno_error::JsErrorBox;
 use dissimilar::diff;
 use dissimilar::Chunk;
 use text_size::TextRange;
@ -12,112 +9,17 @@ use tower_lsp::jsonrpc;
 use tower_lsp::lsp_types as lsp;
 use tower_lsp::lsp_types::TextEdit;

-fn partition_point<T, P>(slice: &[T], mut predicate: P) -> usize
-where
-  P: FnMut(&T) -> bool,
-{
-  let mut left = 0;
-  let mut right = slice.len() - 1;
-
-  while left != right {
-    let mid = left + (right - left) / 2;
-    // SAFETY:
-    // When left < right, left <= mid < right.
-    // Therefore left always increases and right always decreases,
-    // and either of them is selected.
-    // In both cases left <= right is satisfied.
-    // Therefore if left < right in a step,
-    // left <= right is satisfied in the next step.
-    // Therefore as long as left != right, 0 <= left < right < len is satisfied
-    // and if this case 0 <= mid < len is satisfied too.
-    let value = unsafe { slice.get_unchecked(mid) };
-    if predicate(value) {
-      left = mid + 1;
-    } else {
-      right = mid;
-    }
-  }
-
-  left
-}
-
-#[derive(Debug, Clone, Eq, PartialEq, Hash)]
-pub struct Utf16Char {
-  pub start: TextSize,
-  pub end: TextSize,
-}
-
-impl Utf16Char {
-  fn len(&self) -> TextSize {
-    self.end - self.start
-  }
-
-  fn len_utf16(&self) -> usize {
-    if self.len() == TextSize::from(4) {
-      2
-    } else {
-      1
-    }
-  }
-}
+use crate::util::text_encoding::Utf16Map;

 #[derive(Debug, Clone, Default, Eq, PartialEq)]
 pub struct LineIndex {
-  utf8_offsets: Vec<TextSize>,
-  utf16_lines: HashMap<u32, Vec<Utf16Char>>,
-  utf16_offsets: Vec<TextSize>,
+  inner: Utf16Map,
 }

 impl LineIndex {
  pub fn new(text: &str) -> LineIndex {
-    let mut utf16_lines = HashMap::new();
-    let mut utf16_chars = Vec::new();
-
-    let mut utf8_offsets = vec![0.into()];
-    let mut utf16_offsets = vec![0.into()];
-    let mut curr_row = 0.into();
-    let mut curr_col = 0.into();
-    let mut curr_offset_u16 = 0.into();
-    let mut line = 0;
-    for c in text.chars() {
-      let c_len = TextSize::of(c);
-      curr_row += c_len;
-      curr_offset_u16 += TextSize::from(c.len_utf16() as u32);
-      if c == '\n' {
-        utf8_offsets.push(curr_row);
-        utf16_offsets.push(curr_offset_u16);
-
-        if !utf16_chars.is_empty() {
-          utf16_lines.insert(line, utf16_chars);
-          utf16_chars = Vec::new();
-        }
-
-        curr_col = 0.into();
-        line += 1;
-        continue;
-      }
-
-      if !c.is_ascii() {
-        utf16_chars.push(Utf16Char {
-          start: curr_col,
-          end: curr_col + c_len,
-        });
-      }
-      curr_col += c_len;
-    }
-
-    // utf8_offsets and utf16_offsets length is equal to (# of lines + 1)
-    utf8_offsets.push(curr_row);
-    utf16_offsets.push(curr_offset_u16);
-
-    if !utf16_chars.is_empty() {
-      utf16_lines.insert(line, utf16_chars);
-    }
-
    LineIndex {
-      utf8_offsets,
-      utf16_lines,
-      utf16_offsets,
+      inner: Utf16Map::new(text),
    }
  }

@ -133,79 +35,35 @@ impl LineIndex {

  /// Return a u8 offset based on a u16 position.
  pub fn offset(&self, position: lsp::Position) -> Result<TextSize, AnyError> {
-    let col = self.utf16_to_utf8_col(position.line, position.character);
-    if let Some(line_offset) = self.utf8_offsets.get(position.line as usize) {
-      Ok(line_offset + col)
-    } else {
-      Err(JsErrorBox::new("OutOfRange", "The position is out of range.").into())
-    }
+    self.inner.offset(position.line, position.character)
  }

  /// Convert an lsp Position into a tsc/TypeScript "position", which is really
  /// an u16 byte offset from the start of the string represented as an u32.
  pub fn offset_tsc(&self, position: lsp::Position) -> jsonrpc::Result<u32> {
    self
-      .offset_utf16(position)
+      .inner
+      .offset_utf16(position.line, position.character)
      .map(|ts| ts.into())
      .map_err(|err| jsonrpc::Error::invalid_params(err.to_string()))
  }

-  fn offset_utf16(
-    &self,
-    position: lsp::Position,
-  ) -> Result<TextSize, AnyError> {
-    if let Some(line_offset) = self.utf16_offsets.get(position.line as usize) {
-      Ok(line_offset + TextSize::from(position.character))
-    } else {
-      Err(JsErrorBox::new("OutOfRange", "The position is out of range.").into())
-    }
-  }
-
  /// Returns a u16 position based on a u16 offset, which TypeScript offsets are
  /// returned as u16.
-  pub fn position_tsc(&self, offset: TextSize) -> lsp::Position {
-    let line = partition_point(&self.utf16_offsets, |&it| it <= offset) - 1;
-    let line_start_offset = self.utf16_offsets[line];
-    let col = offset - line_start_offset;
-
-    lsp::Position {
-      line: line as u32,
-      character: col.into(),
-    }
-  }
-
-  /// Returns a u16 position based on a u8 offset.
  pub fn position_utf16(&self, offset: TextSize) -> lsp::Position {
-    let line = partition_point(&self.utf16_offsets, |&it| it <= offset) - 1;
-    let line_start_offset = self.utf16_offsets[line];
-    let col = offset - line_start_offset;
-
+    let lc = self.inner.position_utf16(offset);
    lsp::Position {
-      line: line as u32,
-      character: col.into(),
+      line: lc.line_index as u32,
+      character: lc.column_index as u32,
    }
  }

  pub fn line_length_utf16(&self, line: u32) -> TextSize {
-    self.utf16_offsets[(line + 1) as usize] - self.utf16_offsets[line as usize]
+    self.inner.line_length_utf16(line)
  }

  pub fn text_content_length_utf16(&self) -> TextSize {
-    *self.utf16_offsets.last().unwrap()
-  }
-
-  fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
-    if let Some(utf16_chars) = self.utf16_lines.get(&line) {
-      for c in utf16_chars {
-        if col > u32::from(c.start) {
-          col += u32::from(c.len()) - c.len_utf16() as u32;
-        } else {
-          break;
-        }
-      }
-    }
-
-    col.into()
+    self.inner.text_content_length_utf16()
  }
 }

@ -217,7 +75,7 @@ pub fn get_edits(a: &str, b: &str, line_index: &LineIndex) -> Vec<TextEdit> {
  }
  // Heuristic to detect things like minified files. `diff()` is expensive.
  if b.chars().filter(|c| *c == '\n').count()
-    > line_index.utf8_offsets.len() * 3
+    > line_index.inner.utf8_offsets_len() * 3
  {
    return vec![TextEdit {
      range: lsp::Range {
@ -278,197 +136,6 @@ pub fn get_edits(a: &str, b: &str, line_index: &LineIndex) -> Vec<TextEdit> {
 mod tests {
  use super::*;

-  #[test]
-  fn test_line_index() {
-    let text = "hello\nworld";
-    let index = LineIndex::new(text);
-    assert_eq!(
-      index.position_utf16(0.into()),
-      lsp::Position {
-        line: 0,
-        character: 0
-      }
-    );
-    assert_eq!(
-      index.position_utf16(1.into()),
-      lsp::Position {
-        line: 0,
-        character: 1
-      }
-    );
-    assert_eq!(
-      index.position_utf16(5.into()),
-      lsp::Position {
-        line: 0,
-        character: 5
-      }
-    );
-    assert_eq!(
-      index.position_utf16(6.into()),
-      lsp::Position {
-        line: 1,
-        character: 0
-      }
-    );
-    assert_eq!(
-      index.position_utf16(7.into()),
-      lsp::Position {
-        line: 1,
-        character: 1
-      }
-    );
-    assert_eq!(
-      index.position_utf16(8.into()),
-      lsp::Position {
-        line: 1,
-        character: 2
-      }
-    );
-    assert_eq!(
-      index.position_utf16(10.into()),
-      lsp::Position {
-        line: 1,
-        character: 4
-      }
-    );
-    assert_eq!(
-      index.position_utf16(11.into()),
-      lsp::Position {
-        line: 1,
-        character: 5
-      }
-    );
-    assert_eq!(
-      index.position_utf16(12.into()),
-      lsp::Position {
-        line: 1,
-        character: 6
-      }
-    );
-
-    let text = "\nhello\nworld";
-    let index = LineIndex::new(text);
-    assert_eq!(
-      index.position_utf16(0.into()),
-      lsp::Position {
-        line: 0,
-        character: 0
-      }
-    );
-    assert_eq!(
-      index.position_utf16(1.into()),
-      lsp::Position {
-        line: 1,
-        character: 0
-      }
-    );
-    assert_eq!(
-      index.position_utf16(2.into()),
-      lsp::Position {
-        line: 1,
-        character: 1
-      }
-    );
-    assert_eq!(
-      index.position_utf16(6.into()),
-      lsp::Position {
-        line: 1,
-        character: 5
-      }
-    );
-    assert_eq!(
-      index.position_utf16(7.into()),
-      lsp::Position {
-        line: 2,
-        character: 0
-      }
-    );
-  }
-
-  #[test]
-  fn test_char_len() {
-    assert_eq!('メ'.len_utf8(), 3);
-    assert_eq!('メ'.len_utf16(), 1);
-    assert_eq!('编'.len_utf8(), 3);
-    assert_eq!('编'.len_utf16(), 1);
-    assert_eq!('🦕'.len_utf8(), 4);
-    assert_eq!('🦕'.len_utf16(), 2);
-  }
-
-  #[test]
-  fn test_empty_index() {
-    let col_index = LineIndex::new(
-      "
-const C: char = 'x';
-",
-    );
-    assert_eq!(col_index.utf16_lines.len(), 0);
-  }
-
-  #[test]
-  fn test_single_char() {
-    let col_index = LineIndex::new(
-      "
-const C: char = 'メ';
-",
-    );
-
-    assert_eq!(col_index.utf16_lines.len(), 1);
-    assert_eq!(col_index.utf16_lines[&1].len(), 1);
-    assert_eq!(
-      col_index.utf16_lines[&1][0],
-      Utf16Char {
-        start: 17.into(),
-        end: 20.into()
-      }
-    );
-
-    // UTF-16 to UTF-8, no changes
-    assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
-
-    // UTF-16 to UTF-8
-    assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21));
-
-    let col_index = LineIndex::new("a𐐏b");
-    assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5));
-  }
-
-  #[test]
-  fn test_string() {
-    let col_index = LineIndex::new(
-      "
-const C: char = \"メ メ\";
-",
-    );
-
-    assert_eq!(col_index.utf16_lines.len(), 1);
-    assert_eq!(col_index.utf16_lines[&1].len(), 2);
-    assert_eq!(
-      col_index.utf16_lines[&1][0],
-      Utf16Char {
-        start: 17.into(),
-        end: 20.into()
-      }
-    );
-    assert_eq!(
-      col_index.utf16_lines[&1][1],
-      Utf16Char {
-        start: 21.into(),
-        end: 24.into()
-      }
-    );
-
-    // UTF-16 to UTF-8
-    assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
-
-    // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
-    assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
-    assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
-    assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24
-
-    assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
-  }
-
  #[test]
  fn test_get_edits() {
    let a = "abcdefg";
--- a/cli/lsp/tsc.rs
+++ b/cli/lsp/tsc.rs
@ -1827,8 +1827,8 @@ impl TextSpan {

  pub fn to_range(&self, line_index: Arc<LineIndex>) -> lsp::Range {
    lsp::Range {
-      start: line_index.position_tsc(self.start.into()),
-      end: line_index.position_tsc(TextSize::from(self.start + self.length)),
+      start: line_index.position_utf16(self.start.into()),
+      end: line_index.position_utf16(TextSize::from(self.start + self.length)),
    }
  }
 }
@ -2276,7 +2276,7 @@ impl InlayHint {
    language_server: &language_server::Inner,
  ) -> lsp::InlayHint {
    lsp::InlayHint {
-      position: line_index.position_tsc(self.position.into()),
+      position: line_index.position_utf16(self.position.into()),
      label: if let Some(display_parts) = &self.display_parts {
        lsp::InlayHintLabel::LabelParts(
          display_parts
@ -2836,8 +2836,8 @@ impl Classifications {
          ts_classification,
        );

-      let start_pos = line_index.position_tsc(offset.into());
-      let end_pos = line_index.position_tsc(TextSize::from(offset + length));
+      let start_pos = line_index.position_utf16(offset.into());
+      let end_pos = line_index.position_utf16(TextSize::from(offset + length));

      for line in start_pos.line..(end_pos.line + 1) {
        let start_character = if line == start_pos.line {
--- a/cli/util/text_encoding.rs
+++ b/cli/util/text_encoding.rs
@ -1,10 +1,15 @@
 // Copyright 2018-2025 the Deno authors. MIT license.

+use std::collections::HashMap;
 use std::ops::Range;

 use base64::prelude::BASE64_STANDARD;
 use base64::Engine;
+use deno_core::error::AnyError;
 use deno_core::ModuleSourceCode;
+use deno_error::JsErrorBox;
+use text_lines::LineAndColumnIndex;
+use text_size::TextSize;

 static SOURCE_MAP_PREFIX: &[u8] =
  b"//# sourceMappingURL=data:application/json;base64,";
@ -85,6 +90,177 @@ fn find_source_map_range(code: &[u8]) -> Option<Range<usize>> {
  }
 }

+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+pub struct Utf16Char {
+  pub start: TextSize,
+  pub end: TextSize,
+}
+
+impl Utf16Char {
+  pub fn len(&self) -> TextSize {
+    self.end - self.start
+  }
+
+  pub fn len_utf16(&self) -> usize {
+    if self.len() == TextSize::from(4) {
+      2
+    } else {
+      1
+    }
+  }
+}
+
+#[derive(Debug, Clone, Default, Eq, PartialEq)]
+pub struct Utf16Map {
+  utf8_offsets: Vec<TextSize>,
+  utf16_lines: HashMap<u32, Vec<Utf16Char>>,
+  utf16_offsets: Vec<TextSize>,
+}
+
+impl Utf16Map {
+  pub fn new(text: &str) -> Utf16Map {
+    let mut utf16_lines = HashMap::new();
+    let mut utf16_chars = Vec::new();
+
+    let mut utf8_offsets = vec![0.into()];
+    let mut utf16_offsets = vec![0.into()];
+    let mut curr_row = 0.into();
+    let mut curr_col = 0.into();
+    let mut curr_offset_u16 = 0.into();
+    let mut line = 0;
+    for c in text.chars() {
+      let c_len = TextSize::of(c);
+      curr_row += c_len;
+      curr_offset_u16 += TextSize::from(c.len_utf16() as u32);
+      if c == '\n' {
+        utf8_offsets.push(curr_row);
+        utf16_offsets.push(curr_offset_u16);
+
+        if !utf16_chars.is_empty() {
+          utf16_lines.insert(line, utf16_chars);
+          utf16_chars = Vec::new();
+        }
+
+        curr_col = 0.into();
+        line += 1;
+        continue;
+      }
+
+      if !c.is_ascii() {
+        utf16_chars.push(Utf16Char {
+          start: curr_col,
+          end: curr_col + c_len,
+        });
+      }
+      curr_col += c_len;
+    }
+
+    // utf8_offsets and utf16_offsets each hold (# of lines + 1) entries
+    utf8_offsets.push(curr_row);
+    utf16_offsets.push(curr_offset_u16);
+
+    if !utf16_chars.is_empty() {
+      utf16_lines.insert(line, utf16_chars);
+    }
+
+    Utf16Map {
+      utf8_offsets,
+      utf16_lines,
+      utf16_offsets,
+    }
+  }
+
+  pub fn text_content_length_utf16(&self) -> TextSize {
+    *self.utf16_offsets.last().unwrap()
+  }
+
+  pub fn utf8_offsets_len(&self) -> usize {
+    self.utf8_offsets.len()
+  }
+
+  pub fn line_length_utf16(&self, line: u32) -> TextSize {
+    self.utf16_offsets[(line + 1) as usize] - self.utf16_offsets[line as usize]
+  }
+
+  pub fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
+    if let Some(utf16_chars) = self.utf16_lines.get(&line) {
+      for c in utf16_chars {
+        if col > u32::from(c.start) {
+          col += u32::from(c.len()) - c.len_utf16() as u32;
+        } else {
+          break;
+        }
+      }
+    }
+
+    col.into()
+  }
+
+  /// Returns the UTF-8 byte offset of a (line, UTF-16 column) position.
+  pub fn offset(&self, line: u32, col: u32) -> Result<TextSize, AnyError> {
+    let col = self.utf16_to_utf8_col(line, col);
+    if let Some(line_offset) = self.utf8_offsets.get(line as usize) {
+      Ok(line_offset + col)
+    } else {
+      Err(JsErrorBox::new("OutOfRange", "The position is out of range.").into())
+    }
+  }
+
+  pub fn offset_utf16(
+    &self,
+    line: u32,
+    col: u32,
+  ) -> Result<TextSize, AnyError> {
+    if let Some(line_offset) = self.utf16_offsets.get(line as usize) {
+      Ok(line_offset + TextSize::from(col))
+    } else {
+      Err(JsErrorBox::new("OutOfRange", "The position is out of range.").into())
+    }
+  }
+
+  /// Returns the line and UTF-16 column for a UTF-16 offset, the unit in
+  /// which TypeScript reports its offsets.
+  pub fn position_utf16(&self, offset: TextSize) -> LineAndColumnIndex {
+    let line = partition_point(&self.utf16_offsets, |&it| it <= offset) - 1;
+    let line_start_offset = self.utf16_offsets[line];
+    let col = offset - line_start_offset;
+
+    LineAndColumnIndex {
+      line_index: line,
+      column_index: col.into(),
+    }
+  }
+}
+
+fn partition_point<T, P>(slice: &[T], mut predicate: P) -> usize
+where
+  P: FnMut(&T) -> bool,
+{
+  let mut left = 0;
+  let mut right = slice.len() - 1;
+
+  while left != right {
+    let mid = left + (right - left) / 2;
+    // SAFETY: assumes a non-empty slice (`slice.len() - 1` above underflows
+    // on an empty one).
+    // When left < right, left <= mid < right, so each step either raises
+    // left to mid + 1 or lowers right to mid.
+    // In both cases left <= right still holds afterwards.
+    // Therefore, as long as left != right,
+    // 0 <= left < right < len is satisfied,
+    // and in that case 0 <= mid < len is satisfied too,
+    // which keeps the unchecked index in bounds.
+    let value = unsafe { slice.get_unchecked(mid) };
+    if predicate(value) {
+      left = mid + 1;
+    } else {
+      right = mid;
+    }
+  }
+
+  left
+}
+
 #[cfg(test)]
 mod tests {
  use std::sync::Arc;
@ -185,4 +361,133 @@ throw new Error(\"Hello world!\");
      }
    }
  }
+
+  #[test]
+  fn test_line_index() {
+    let cases = [
+      (0, (0, 0)),
+      (1, (0, 1)),
+      (5, (0, 5)),
+      (6, (1, 0)),
+      (7, (1, 1)),
+      (8, (1, 2)),
+      (10, (1, 4)),
+      (11, (1, 5)),
+      (12, (1, 6)),
+    ];
+    let text = "hello\nworld";
+    let index = Utf16Map::new(text);
+    for (input, expected) in cases {
+      assert_eq!(
+        index.position_utf16(input.into()),
+        LineAndColumnIndex {
+          line_index: expected.0,
+          column_index: expected.1
+        }
+      );
+    }
+
+    let cases = [
+      (0, (0, 0)),
+      (1, (1, 0)),
+      (2, (1, 1)),
+      (6, (1, 5)),
+      (7, (2, 0)),
+    ];
+    let text = "\nhello\nworld";
+    let index = Utf16Map::new(text);
+    for (input, expected) in cases {
+      assert_eq!(
+        index.position_utf16(input.into()),
+        LineAndColumnIndex {
+          line_index: expected.0,
+          column_index: expected.1
+        }
+      );
+    }
+  }
+
+  #[test]
+  fn test_char_len() {
+    assert_eq!('メ'.len_utf8(), 3);
+    assert_eq!('メ'.len_utf16(), 1);
+    assert_eq!('编'.len_utf8(), 3);
+    assert_eq!('编'.len_utf16(), 1);
+    assert_eq!('🦕'.len_utf8(), 4);
+    assert_eq!('🦕'.len_utf16(), 2);
+  }
+
+  #[test]
+  fn test_empty_index() {
+    let col_index = Utf16Map::new(
+      "
+const C: char = 'x';
+",
+    );
+    assert_eq!(col_index.utf16_lines.len(), 0);
+  }
+
+  #[test]
+  fn test_single_char() {
+    let col_index = Utf16Map::new(
+      "
+const C: char = 'メ';
+",
+    );
+
+    assert_eq!(col_index.utf16_lines.len(), 1);
+    assert_eq!(col_index.utf16_lines[&1].len(), 1);
+    assert_eq!(
+      col_index.utf16_lines[&1][0],
+      Utf16Char {
+        start: 17.into(),
+        end: 20.into()
+      }
+    );
+
+    // UTF-16 to UTF-8, no changes
+    assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
+
+    // UTF-16 to UTF-8
+    assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21));
+
+    let col_index = Utf16Map::new("a𐐏b");
+    assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5));
+  }
+
+  #[test]
+  fn test_string() {
+    let col_index = Utf16Map::new(
+      "
+const C: char = \"メ メ\";
+",
+    );
+
+    assert_eq!(col_index.utf16_lines.len(), 1);
+    assert_eq!(col_index.utf16_lines[&1].len(), 2);
+    assert_eq!(
+      col_index.utf16_lines[&1][0],
+      Utf16Char {
+        start: 17.into(),
+        end: 20.into()
+      }
+    );
+    assert_eq!(
+      col_index.utf16_lines[&1][1],
+      Utf16Char {
+        start: 21.into(),
+        end: 24.into()
+      }
+    );
+
+    // UTF-16 to UTF-8
+    assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
+
+    // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
+    assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
+    assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
+    assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24
+
+    assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
+  }
 }