From c1e14347eca8139e7fee45094ff5328c9892f057 Mon Sep 17 00:00:00 2001
From: Yaroslav Bolyukin
Date: Fri, 22 Apr 2022 19:51:09 +0000
Subject: [PATCH] fix: decode non-bmp characters

---
--- a/crates/jrsonnet-parser/src/unescape.rs
+++ b/crates/jrsonnet-parser/src/unescape.rs
@@ -1,3 +1,11 @@
+use std::str::Chars;
+
+fn decode_unicode(chars: &mut Chars) -> Option<u16> {
+    IntoIterator::into_iter([chars.next()?, chars.next()?, chars.next()?, chars.next()?])
+        .map(|c| c.to_digit(16).map(|f| f as u16))
+        .try_fold(0u16, |acc, v| Some((acc << 4) | (v?)))
+}
+
 pub fn unescape(s: &str) -> Option<String> {
     let mut chars = s.chars();
     let mut out = String::with_capacity(s.len());
@@ -14,17 +22,26 @@
             'n' => out.push('\n'),
             'r' => out.push('\r'),
             't' => out.push('\t'),
-            'u' => {
-                let c = IntoIterator::into_iter([
-                    chars.next()?,
-                    chars.next()?,
-                    chars.next()?,
-                    chars.next()?,
-                ])
-                .map(|c| c.to_digit(16))
-                .try_fold(0u32, |acc, v| Some((acc << 8) | (v?)))?;
-                out.push(char::from_u32(c)?)
-            }
+            'u' => match decode_unicode(&mut chars)? {
+                // May only be second byte
+                0xDC00..=0xDFFF => return None,
+                // Surrogate pair
+                n1 @ 0xD800..=0xDBFF => {
+                    if chars.next() != Some('\\') {
+                        return None;
+                    }
+                    if chars.next() != Some('u') {
+                        return None;
+                    }
+                    let n2 = decode_unicode(&mut chars)?;
+                    if !matches!(n2, 0xDC00..=0xDFFF) {
+                        return None;
+                    }
+                    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
+                    out.push(char::from_u32(n)?);
+                }
+                n => out.push(char::from_u32(n as u32)?),
+            },
             'x' => {
                 let c = IntoIterator::into_iter([chars.next()?, chars.next()?])
                     .map(|c| c.to_digit(16))
--
gitstuff