0
0
Fork 0
mirror of https://github.com/denoland/deno.git synced 2025-02-21 12:53:05 -05:00
denoland-deno/cli/util/text_encoding.rs

493 lines
13 KiB
Rust

// Copyright 2018-2025 the Deno authors. MIT license.
use std::collections::HashMap;
use std::ops::Range;
use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use deno_core::error::AnyError;
use deno_core::ModuleSourceCode;
use deno_error::JsErrorBox;
use text_lines::LineAndColumnIndex;
use text_size::TextSize;
/// Byte prefix that introduces an inline, base64-encoded JSON source map
/// comment. Everything after this prefix on the line is the base64 payload.
static SOURCE_MAP_PREFIX: &[u8] =
b"//# sourceMappingURL=data:application/json;base64,";
/// Extracts and decodes the inline source map found at the end of `code`.
///
/// Returns `None` when `find_source_map_range` finds no source map comment,
/// or when the text following the prefix is not valid standard base64.
pub fn source_map_from_code(code: &[u8]) -> Option<Vec<u8>> {
  let comment = &code[find_source_map_range(code)?];
  // The comment is known to start with the prefix; what follows is the payload.
  let encoded = &comment[SOURCE_MAP_PREFIX.len()..];
  BASE64_STANDARD.decode(encoded).ok()
}
/// Drops the trailing inline source map comment, and everything after it,
/// from `code`.
///
/// When no source map comment is found the input is returned unchanged.
/// String sources are truncated in place. Static byte sources are re-sliced,
/// while boxed and `Arc` byte sources are copied into a new boxed slice.
pub fn code_without_source_map(code: ModuleSourceCode) -> ModuleSourceCode {
  use deno_core::ModuleCodeBytes;
  match code {
    ModuleSourceCode::String(mut text) => {
      if let Some(range) = find_source_map_range(text.as_bytes()) {
        text.truncate(range.start);
      }
      ModuleSourceCode::String(text)
    }
    ModuleSourceCode::Bytes(bytes) => {
      let Some(range) = find_source_map_range(bytes.as_bytes()) else {
        return ModuleSourceCode::Bytes(bytes);
      };
      let keep = range.start;
      // todo(dsherret): should be possible without cloning
      let copy_prefix = |all: &[u8]| -> Box<[u8]> { Box::from(&all[..keep]) };
      let truncated = match bytes {
        ModuleCodeBytes::Static(all) => ModuleCodeBytes::Static(&all[..keep]),
        ModuleCodeBytes::Boxed(all) => ModuleCodeBytes::Boxed(copy_prefix(&all)),
        ModuleCodeBytes::Arc(all) => ModuleCodeBytes::Boxed(copy_prefix(&all)),
      };
      ModuleSourceCode::Bytes(truncated)
    }
  }
}
/// Locates the inline source map comment at the end of `code`.
///
/// Only the last non-blank line is inspected. Returns the byte range of that
/// line (without trailing spaces, tabs or line terminators) when it starts
/// with `SOURCE_MAP_PREFIX`, and `None` otherwise.
fn find_source_map_range(code: &[u8]) -> Option<Range<usize>> {
  /// Returns the byte range of the last line that contains a byte other than
  /// space, tab, `\n` or `\r`, with trailing spaces and tabs excluded.
  /// Returns `None` when `code` contains no such byte.
  fn last_non_blank_line_range(code: &[u8]) -> Option<Range<usize>> {
    let mut hit_non_whitespace = false;
    let mut range_end = code.len();
    for (i, &byte) in code.iter().enumerate().rev() {
      match byte {
        b' ' | b'\t' => {
          // Only trailing blanks shrink the range; blanks inside the line
          // content are kept.
          if !hit_non_whitespace {
            range_end = i;
          }
        }
        b'\n' | b'\r' => {
          if hit_non_whitespace {
            return Some(i + 1..range_end);
          }
          range_end = i;
        }
        _ => hit_non_whitespace = true,
      }
    }
    // No line break precedes the content, so the first line is also the last
    // non-blank line. Previously this case was reported as "no line found",
    // which hid a source map comment sitting on the very first line.
    hit_non_whitespace.then_some(0..range_end)
  }

  let range = last_non_blank_line_range(code)?;
  code[range.clone()]
    .starts_with(SOURCE_MAP_PREFIX)
    .then_some(range)
}
/// Byte span of a single non-ASCII character within its line, as recorded by
/// `Utf16Map::new`.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Utf16Char {
/// UTF-8 byte column (relative to the line start) where the character begins.
pub start: TextSize,
/// UTF-8 byte column just past the character (`start` plus its UTF-8 length).
pub end: TextSize,
}
impl Utf16Char {
pub fn len(&self) -> TextSize {
self.end - self.start
}
pub fn len_utf16(&self) -> usize {
if self.len() == TextSize::from(4) {
2
} else {
1
}
}
}
/// Lookup tables for translating between UTF-8 byte positions and UTF-16
/// code-unit positions of a text, built by `Utf16Map::new`.
#[derive(Debug, Clone, Default, Eq, PartialEq)]
pub struct Utf16Map {
/// UTF-8 byte offset at which each line starts, followed by one final entry
/// holding the total UTF-8 length of the text.
utf8_offsets: Vec<TextSize>,
/// Non-ASCII characters of each line, keyed by line index. Lines made up of
/// ASCII only have no entry.
utf16_lines: HashMap<u32, Vec<Utf16Char>>,
/// UTF-16 code-unit offset at which each line starts, followed by one final
/// entry holding the total UTF-16 length of the text.
utf16_offsets: Vec<TextSize>,
}
impl Utf16Map {
  /// Builds the offset tables for `text` in a single pass over its chars.
  ///
  /// Lines are split on `'\n'`; the newline itself counts towards the line
  /// it terminates.
  pub fn new(text: &str) -> Utf16Map {
    let mut utf8_offsets: Vec<TextSize> = vec![TextSize::from(0)];
    let mut utf16_offsets: Vec<TextSize> = vec![TextSize::from(0)];
    let mut utf16_lines = HashMap::new();
    // Non-ASCII chars collected for the line currently being scanned.
    let mut wide_chars = Vec::new();
    let mut offset_utf8 = TextSize::from(0);
    let mut offset_utf16 = TextSize::from(0);
    let mut col_utf8 = TextSize::from(0);
    let mut line_index: u32 = 0;

    for ch in text.chars() {
      let ch_len = TextSize::of(ch);
      offset_utf8 += ch_len;
      offset_utf16 += TextSize::from(ch.len_utf16() as u32);

      if ch == '\n' {
        // The next line starts right after the newline.
        utf8_offsets.push(offset_utf8);
        utf16_offsets.push(offset_utf16);
        if !wide_chars.is_empty() {
          utf16_lines.insert(line_index, std::mem::take(&mut wide_chars));
        }
        col_utf8 = TextSize::from(0);
        line_index += 1;
      } else {
        if !ch.is_ascii() {
          wide_chars.push(Utf16Char {
            start: col_utf8,
            end: col_utf8 + ch_len,
          });
        }
        col_utf8 += ch_len;
      }
    }

    // Closing entry: both offset tables end up with (# of lines + 1) items.
    utf8_offsets.push(offset_utf8);
    utf16_offsets.push(offset_utf16);
    if !wide_chars.is_empty() {
      utf16_lines.insert(line_index, wide_chars);
    }

    Utf16Map {
      utf8_offsets,
      utf16_lines,
      utf16_offsets,
    }
  }

  /// Total length of the text in UTF-16 code units (the last entry of the
  /// UTF-16 offset table).
  pub fn text_content_length_utf16(&self) -> TextSize {
    let total = self.utf16_offsets.last().unwrap();
    *total
  }

  /// Number of entries in the UTF-8 line offset table.
  pub fn utf8_offsets_len(&self) -> usize {
    self.utf8_offsets.len()
  }

  /// Length of `line` in UTF-16 code units: the difference between the start
  /// offsets of `line + 1` and `line`.
  ///
  /// Panics when `line + 1` is not a valid index into the offset table.
  pub fn line_length_utf16(&self, line: u32) -> TextSize {
    let next_line_start = self.utf16_offsets[(line + 1) as usize];
    let line_start = self.utf16_offsets[line as usize];
    next_line_start - line_start
  }

  /// Converts a UTF-16 column on `line` into the matching UTF-8 byte column.
  ///
  /// Every recorded non-ASCII character that starts before the (running)
  /// column widens it by the difference between that character's UTF-8 and
  /// UTF-16 lengths. Lines without recorded characters return `col` as is.
  pub fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
    for wide in self.utf16_lines.get(&line).into_iter().flatten() {
      if col <= u32::from(wide.start) {
        break;
      }
      col += u32::from(wide.len()) - wide.len_utf16() as u32;
    }
    TextSize::from(col)
  }

  /// Returns the UTF-8 byte offset for a UTF-16 `line`/`col` position.
  ///
  /// Fails with an `OutOfRange` error when `line` has no entry in the UTF-8
  /// offset table. `col` itself is not range-checked.
  pub fn offset(&self, line: u32, col: u32) -> Result<TextSize, AnyError> {
    let utf8_col = self.utf16_to_utf8_col(line, col);
    let Some(&line_start) = self.utf8_offsets.get(line as usize) else {
      return Err(
        JsErrorBox::new("OutOfRange", "The position is out of range.").into(),
      );
    };
    Ok(line_start + utf8_col)
  }

  /// Returns the UTF-16 code-unit offset for a UTF-16 `line`/`col` position.
  ///
  /// Fails with an `OutOfRange` error when `line` has no entry in the UTF-16
  /// offset table. `col` itself is not range-checked.
  pub fn offset_utf16(
    &self,
    line: u32,
    col: u32,
  ) -> Result<TextSize, AnyError> {
    let Some(&line_start) = self.utf16_offsets.get(line as usize) else {
      return Err(
        JsErrorBox::new("OutOfRange", "The position is out of range.").into(),
      );
    };
    Ok(line_start + TextSize::from(col))
  }

  /// Converts a UTF-16 code-unit `offset` (the unit TypeScript reports
  /// offsets in) into a UTF-16 line and column.
  ///
  /// Offsets at or past the end of the text are attributed to the last line.
  pub fn position_utf16(&self, offset: TextSize) -> LineAndColumnIndex {
    let line_index =
      partition_point(&self.utf16_offsets, |&start| start <= offset) - 1;
    let column = offset - self.utf16_offsets[line_index];
    LineAndColumnIndex {
      line_index,
      column_index: column.into(),
    }
  }
}
/// Binary-searches `slice` for the partition point of `predicate`, clamped
/// to the last index.
///
/// `slice` is expected to be partitioned: all elements for which `predicate`
/// returns `true` come before all elements for which it returns `false`.
/// The result is the index of the first element of `slice[..len - 1]` for
/// which `predicate` is `false`, or `len - 1` when there is none. Unlike
/// `<[T]>::partition_point`, the final element is never tested and the
/// result is never `slice.len()` for a non-empty slice.
///
/// Returns `0` for an empty slice.
fn partition_point<T, P>(slice: &[T], mut predicate: P) -> usize
where
  P: FnMut(&T) -> bool,
{
  // An empty slice has no last index. Bail out before `len() - 1` can
  // underflow, which used to feed an out-of-bounds index to an unchecked
  // access in release builds.
  let Some(last) = slice.len().checked_sub(1) else {
    return 0;
  };
  let mut left = 0;
  let mut right = last;
  while left < right {
    // `left <= mid < right <= last`, so `mid` is always in bounds and the
    // search range shrinks on every iteration.
    let mid = left + (right - left) / 2;
    if predicate(&slice[mid]) {
      left = mid + 1;
    } else {
      right = mid;
    }
  }
  left
}
// Unit tests for source map detection/removal and for the UTF-8 <-> UTF-16
// position tables.
#[cfg(test)]
mod tests {
use std::sync::Arc;
use deno_core::ModuleCodeBytes;
use deno_core::ModuleCodeString;
use super::*;
// Decoding of the inline base64 source map from the end of the code.
#[test]
fn test_source_map_from_code() {
let to_string =
|bytes: Vec<u8>| -> String { String::from_utf8(bytes.to_vec()).unwrap() };
// Source map comment is the last line.
assert_eq!(
source_map_from_code(
b"test\n//# sourceMappingURL=data:application/json;base64,dGVzdGluZ3Rlc3Rpbmc="
).map(to_string),
Some("testingtesting".to_string())
);
// Trailing whitespace-only lines after the comment are ignored.
assert_eq!(
source_map_from_code(
b"test\n//# sourceMappingURL=data:application/json;base64,dGVzdGluZ3Rlc3Rpbmc=\n \n"
).map(to_string),
Some("testingtesting".to_string())
);
// The comment is followed by another non-blank line, so it is not used.
assert_eq!(
source_map_from_code(
b"test\n//# sourceMappingURL=data:application/json;base64,dGVzdGluZ3Rlc3Rpbmc=\n test\n"
).map(to_string),
None
);
// The payload is not valid base64.
assert_eq!(
source_map_from_code(
b"\"use strict\";
throw new Error(\"Hello world!\");
//# sourceMappingURL=data:application/json;base64,{"
),
None
);
}
// Removal of the source map comment for every `ModuleSourceCode` form.
#[test]
fn test_source_without_source_map() {
// Inputs without a source map comment come back unchanged.
run_test("", "");
run_test("\n", "\n");
run_test("\r\n", "\r\n");
run_test("a", "a");
run_test("a\n", "a\n");
run_test("a\r\n", "a\r\n");
run_test("a\r\nb", "a\r\nb");
run_test("a\nb\n", "a\nb\n");
run_test("a\r\nb\r\n", "a\r\nb\r\n");
// Inputs ending in a source map comment are cut at the start of that line.
run_test(
"test\n//# sourceMappingURL=data:application/json;base64,test",
"test\n",
);
run_test(
"test\r\n//# sourceMappingURL=data:application/json;base64,test",
"test\r\n",
);
run_test(
"\n//# sourceMappingURL=data:application/json;base64,test",
"\n",
);
// Blank lines after the comment are removed together with it.
run_test(
"test\n//# sourceMappingURL=data:application/json;base64,test\n\n",
"test\n",
);
run_test(
"test\n//# sourceMappingURL=data:application/json;base64,test\n \n ",
"test\n",
);
// Runs `code_without_source_map` on `input` wrapped in each string and
// bytes variant and checks the resulting bytes against `output`.
fn run_test(input: &'static str, output: &'static str) {
let forms = [
ModuleSourceCode::String(ModuleCodeString::from_static(input)),
ModuleSourceCode::String({
let text: Arc<str> = input.into();
text.into()
}),
ModuleSourceCode::String({
let text: String = input.into();
text.into()
}),
ModuleSourceCode::Bytes(ModuleCodeBytes::Static(input.as_bytes())),
ModuleSourceCode::Bytes(ModuleCodeBytes::Boxed(
input.as_bytes().to_vec().into_boxed_slice(),
)),
ModuleSourceCode::Bytes(ModuleCodeBytes::Arc(
input.as_bytes().to_vec().into(),
)),
];
for form in forms {
let result = code_without_source_map(form);
let bytes = result.as_bytes();
assert_eq!(bytes, output.as_bytes());
}
}
}
// UTF-16 offset -> (line, column) lookups on ASCII-only text.
#[test]
fn test_line_index() {
// Each case is (offset, (expected line, expected column)).
let cases = [
(0, (0, 0)),
(1, (0, 1)),
(5, (0, 5)),
(6, (1, 0)),
(7, (1, 1)),
(8, (1, 2)),
(10, (1, 4)),
(11, (1, 5)),
// One past the end of the text still maps onto the last line.
(12, (1, 6)),
];
let text = "hello\nworld";
let index = Utf16Map::new(text);
for (input, expected) in cases {
assert_eq!(
index.position_utf16(input.into()),
LineAndColumnIndex {
line_index: expected.0,
column_index: expected.1
}
);
}
// Same lookups with a leading empty line.
let cases = [
(0, (0, 0)),
(1, (1, 0)),
(2, (1, 1)),
(6, (1, 5)),
(7, (2, 0)),
];
let text = "\nhello\nworld";
let index = Utf16Map::new(text);
for (input, expected) in cases {
assert_eq!(
index.position_utf16(input.into()),
LineAndColumnIndex {
line_index: expected.0,
column_index: expected.1
}
);
}
}
// Sanity check of the UTF-8 / UTF-16 lengths of the chars used below.
#[test]
fn test_char_len() {
assert_eq!('メ'.len_utf8(), 3);
assert_eq!('メ'.len_utf16(), 1);
assert_eq!('编'.len_utf8(), 3);
assert_eq!('编'.len_utf16(), 1);
assert_eq!('🦕'.len_utf8(), 4);
assert_eq!('🦕'.len_utf16(), 2);
}
// ASCII-only text records no non-ASCII chars for any line.
#[test]
fn test_empty_index() {
let col_index = Utf16Map::new(
"
const C: char = 'x';
",
);
assert_eq!(col_index.utf16_lines.len(), 0);
}
// A single non-ASCII char is recorded with its UTF-8 byte span.
#[test]
fn test_single_char() {
let col_index = Utf16Map::new(
"
const C: char = 'メ';
",
);
assert_eq!(col_index.utf16_lines.len(), 1);
assert_eq!(col_index.utf16_lines[&1].len(), 1);
assert_eq!(
col_index.utf16_lines[&1][0],
Utf16Char {
start: 17.into(),
end: 20.into()
}
);
// UTF-16 to UTF-8, no changes
assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
// UTF-16 to UTF-8
assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21));
// A char that takes two UTF-16 code units and four UTF-8 bytes.
let col_index = Utf16Map::new("a𐐏b");
assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5));
}
// Two non-ASCII chars on the same line.
#[test]
fn test_string() {
let col_index = Utf16Map::new(
"
const C: char = \"メ メ\";
",
);
assert_eq!(col_index.utf16_lines.len(), 1);
assert_eq!(col_index.utf16_lines[&1].len(), 2);
assert_eq!(
col_index.utf16_lines[&1][0],
Utf16Char {
start: 17.into(),
end: 20.into()
}
);
assert_eq!(
col_index.utf16_lines[&1][1],
Utf16Char {
start: 21.into(),
end: 24.into()
}
);
// UTF-16 to UTF-8
assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
// メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24
// Line 2 has no recorded chars, so the column is returned unchanged.
assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
}
}