From 93f08ca017e4a7f49a4fe1791c7484fb6c7caf59 Mon Sep 17 00:00:00 2001 From: Yaroslav Bolyukin Date: Sun, 22 Mar 2026 06:24:24 +0000 Subject: [PATCH] refactor: split lexer from rowan parser --- --- a/Cargo.lock +++ b/Cargo.lock @@ -716,6 +716,13 @@ ] [[package]] +name = "jrsonnet-lexer" +version = "0.5.0-pre97" +dependencies = [ + "logos", +] + +[[package]] name = "jrsonnet-macros" version = "0.5.0-pre97" dependencies = [ @@ -744,7 +751,7 @@ "hi-doc", "indoc", "insta", - "logos", + "jrsonnet-lexer", "rowan", "strip-ansi-escapes", "thiserror", --- a/Cargo.toml +++ b/Cargo.toml @@ -81,9 +81,6 @@ itertools = "0.14.0" xshell = "0.2.7" -lsp-server = "0.7.9" -lsp-types = "0.97.0" - regex = "1.12" lru = "0.16.3" --- /dev/null +++ b/crates/jrsonnet-lexer/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "jrsonnet-lexer" +authors.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +version.workspace = true + +[dependencies] +logos.workspace = true + +[lints] +workspace = true --- /dev/null +++ b/crates/jrsonnet-lexer/src/generated/mod.rs @@ -0,0 +1 @@ +pub mod syntax_kinds; --- /dev/null +++ b/crates/jrsonnet-lexer/src/generated/syntax_kinds.rs @@ -0,0 +1,210 @@ +//! This is a generated file, please do not edit manually. Changes can be +//! made in codegeneration that lives in `xtask` top-level dir. + +#![allow( + bad_style, + missing_docs, + unreachable_pub, + clippy::manual_non_exhaustive, + clippy::match_like_matches_macro +)] +#[doc = r" The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`."] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, logos :: Logos)] +#[repr(u16)] +pub enum SyntaxKind { + #[doc(hidden)] + TOMBSTONE, + #[doc(hidden)] + EOF, + #[token("||")] + OR, + #[token("??")] + NULL_COAELSE, + #[token("&&")] + AND, + #[token("|")] + BIT_OR, + #[token("^")] + BIT_XOR, + #[token("&")] + BIT_AND, + #[token("==")] + EQ, + #[token("!=")] + NE, + #[token("<")] + LT, + #[token(">")] + GT, + #[token("<=")] + LE, + #[token(">=")] + GE, + #[token("<<")] + LHS, + #[token(">>")] + RHS, + #[token("+")] + PLUS, + #[token("-")] + MINUS, + #[token("*")] + MUL, + #[token("/")] + DIV, + #[token("%")] + MODULO, + #[token("!")] + NOT, + #[token("~")] + BIT_NOT, + #[token("[")] + L_BRACK, + #[token("]")] + R_BRACK, + #[token("(")] + L_PAREN, + #[token(")")] + R_PAREN, + #[token("{")] + L_BRACE, + #[token("}")] + R_BRACE, + #[token(":")] + COLON, + #[token("::")] + COLONCOLON, + #[token(":::")] + COLONCOLONCOLON, + #[token(";")] + SEMI, + #[token(".")] + DOT, + #[token("...")] + DOTDOTDOT, + #[token(",")] + COMMA, + #[token("$")] + DOLLAR, + #[token("=")] + ASSIGN, + #[token("?")] + QUESTION_MARK, + #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")] + FLOAT, + #[regex("(?:0|[1-9][0-9]*)\\.[^0-9]")] + ERROR_FLOAT_JUNK_AFTER_POINT, + #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][^+\\-0-9]")] + ERROR_FLOAT_JUNK_AFTER_EXPONENT, + #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][+-][^0-9]")] + ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN, + #[regex("\"(?s:[^\"\\\\]|\\\\.)*\"")] + STRING_DOUBLE, + #[regex("\"(?s:[^\"\\\\]|\\\\.)*")] + ERROR_STRING_DOUBLE_UNTERMINATED, + #[regex("'(?s:[^'\\\\]|\\\\.)*'")] + STRING_SINGLE, + #[regex("'(?s:[^'\\\\]|\\\\.)*")] + ERROR_STRING_SINGLE_UNTERMINATED, + #[regex("@\"(?:[^\"]|\"\")*\"")] + STRING_DOUBLE_VERBATIM, + #[regex("@\"(?:[^\"]|\"\")*")] + ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED, + #[regex("@'(?:[^']|'')*'")] + STRING_SINGLE_VERBATIM, + #[regex("@'(?:[^']|'')*")] + ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED, + #[regex("@[^\"'\\s]\\S+")] + ERROR_STRING_VERBATIM_MISSING_QUOTES, + #[regex("\\|\\|\\|", crate::string_block::lex_str_block_test)] + STRING_BLOCK, + ERROR_STRING_BLOCK_UNEXPECTED_END, + ERROR_STRING_BLOCK_MISSING_NEW_LINE, + ERROR_STRING_BLOCK_MISSING_TERMINATION, + ERROR_STRING_BLOCK_MISSING_INDENT, + #[regex("[_a-zA-Z][_a-zA-Z0-9]*")] + IDENT, + #[regex("[ \\t\\n\\r]+")] + WHITESPACE, + #[regex("//[^\\r\\n]*?(\\r\\n|\\n)?")] + SINGLE_LINE_SLASH_COMMENT, + #[regex("#[^\\r\\n]*?(\\r\\n|\\n)?")] + SINGLE_LINE_HASH_COMMENT, + #[regex("/\\*([^*]|\\*[^/])*\\*/")] + MULTI_LINE_COMMENT, + #[regex("/\\*/")] + ERROR_COMMENT_TOO_SHORT, + #[regex("/\\*([^*/]|\\*[^/])+")] + ERROR_COMMENT_UNTERMINATED, + #[token("tailstrict")] + TAILSTRICT_KW, + #[token("local")] + LOCAL_KW, + #[token("importstr")] + IMPORTSTR_KW, + #[token("importbin")] + IMPORTBIN_KW, + #[token("import")] + IMPORT_KW, + #[token("if")] + IF_KW, + #[token("then")] + THEN_KW, + #[token("else")] + ELSE_KW, + #[token("function")] + FUNCTION_KW, + #[token("error")] + ERROR_KW, + #[token("in")] + IN_KW, + META_OBJECT_APPLY, + ERROR_NO_OPERATOR, + #[token("null")] + NULL_KW, + #[token("true")] + TRUE_KW, + #[token("false")] + FALSE_KW, + #[token("self")] + SELF_KW, + #[token("super")] + SUPER_KW, + #[token("for")] + FOR_KW, + #[token("assert")] + ASSERT_KW, + ERROR_MISSING_TOKEN, + ERROR_UNEXPECTED_TOKEN, + ERROR_CUSTOM, + LEXING_ERROR, + __LAST_TOKEN, + #[doc(hidden)] + __LAST, +} +use self::SyntaxKind::*; +impl SyntaxKind { + pub fn is_keyword(self) -> bool { + match self { + OR | NULL_COAELSE | AND | BIT_OR | BIT_XOR | BIT_AND | EQ | NE | LT | GT | LE | GE + | LHS | RHS | PLUS | MINUS | MUL | DIV | MODULO | NOT | BIT_NOT | L_BRACK | R_BRACK + | L_PAREN | R_PAREN | L_BRACE | R_BRACE | COLON | COLONCOLON | COLONCOLONCOLON + | SEMI | DOT | DOTDOTDOT | COMMA | DOLLAR | ASSIGN | QUESTION_MARK | TAILSTRICT_KW + | LOCAL_KW | IMPORTSTR_KW | IMPORTBIN_KW | IMPORT_KW | IF_KW | THEN_KW | ELSE_KW + | FUNCTION_KW | ERROR_KW | IN_KW | NULL_KW | TRUE_KW | FALSE_KW | SELF_KW + | SUPER_KW | FOR_KW | ASSERT_KW => true, + _ => false, + } + } + pub fn from_raw(r: u16) -> Self { + assert!(r < Self::__LAST as u16); + unsafe { std::mem::transmute(r) } + } + pub fn into_raw(self) -> u16 { + self as u16 + } +} +#[macro_export] +macro_rules ! T { [||] => { $ crate :: SyntaxKind :: OR } ; [??] => { $ crate :: SyntaxKind :: NULL_COAELSE } ; [&&] => { $ crate :: SyntaxKind :: AND } ; [|] => { $ crate :: SyntaxKind :: BIT_OR } ; [^] => { $ crate :: SyntaxKind :: BIT_XOR } ; [&] => { $ crate :: SyntaxKind :: BIT_AND } ; [==] => { $ crate :: SyntaxKind :: EQ } ; [!=] => { $ crate :: SyntaxKind :: NE } ; [<] => { $ crate :: SyntaxKind :: LT } ; [>] => { $ crate :: SyntaxKind :: GT } ; [<=] => { $ crate :: SyntaxKind :: LE } ; [>=] => { $ crate :: SyntaxKind :: GE } ; [<<] => { $ crate :: SyntaxKind :: LHS } ; [>>] => { $ crate :: SyntaxKind :: RHS } ; [+] => { $ crate :: SyntaxKind :: PLUS } ; [-] => { $ crate :: SyntaxKind :: MINUS } ; [*] => { $ crate :: SyntaxKind :: MUL } ; [/] => { $ crate :: SyntaxKind :: DIV } ; [%] => { $ crate :: SyntaxKind :: MODULO } ; [!] => { $ crate :: SyntaxKind :: NOT } ; [~] => { $ crate :: SyntaxKind :: BIT_NOT } ; ['['] => { $ crate :: SyntaxKind :: L_BRACK } ; [']'] => { $ crate :: SyntaxKind :: R_BRACK } ; ['('] => { $ crate :: SyntaxKind :: L_PAREN } ; [')'] => { $ crate :: SyntaxKind :: R_PAREN } ; ['{'] => { $ crate :: SyntaxKind :: L_BRACE } ; ['}'] => { $ crate :: SyntaxKind :: R_BRACE } ; [:] => { $ crate :: SyntaxKind :: COLON } ; [::] => { $ crate :: SyntaxKind :: COLONCOLON } ; [:::] => { $ crate :: SyntaxKind :: COLONCOLONCOLON } ; [;] => { $ crate :: SyntaxKind :: SEMI } ; [.] => { $ crate :: SyntaxKind :: DOT } ; [...] => { $ crate :: SyntaxKind :: DOTDOTDOT } ; [,] => { $ crate :: SyntaxKind :: COMMA } ; ['$'] => { $ crate :: SyntaxKind :: DOLLAR } ; [=] => { $ crate :: SyntaxKind :: ASSIGN } ; [?] => { $ crate :: SyntaxKind :: QUESTION_MARK } ; [tailstrict] => { $ crate :: SyntaxKind :: TAILSTRICT_KW } ; [local] => { $ crate :: SyntaxKind :: LOCAL_KW } ; [importstr] => { $ crate :: SyntaxKind :: IMPORTSTR_KW } ; [importbin] => { $ crate :: SyntaxKind :: IMPORTBIN_KW } ; [import] => { $ crate :: SyntaxKind :: IMPORT_KW } ; [if] => { $ crate :: SyntaxKind :: IF_KW } ; [then] => { $ crate :: SyntaxKind :: THEN_KW } ; [else] => { $ crate :: SyntaxKind :: ELSE_KW } ; [function] => { $ crate :: SyntaxKind :: FUNCTION_KW } ; [error] => { $ crate :: SyntaxKind :: ERROR_KW } ; [in] => { $ crate :: SyntaxKind :: IN_KW } ; [null] => { $ crate :: SyntaxKind :: NULL_KW } ; [true] => { $ crate :: SyntaxKind :: TRUE_KW } ; [false] => { $ crate :: SyntaxKind :: FALSE_KW } ; [self] => { $ crate :: SyntaxKind :: SELF_KW } ; [super] => { $ crate :: SyntaxKind :: SUPER_KW } ; [for] => { $ crate :: SyntaxKind :: FOR_KW } ; [assert] => { $ crate :: SyntaxKind :: ASSERT_KW } } +#[allow(unused_imports)] +pub use T; --- /dev/null +++ b/crates/jrsonnet-lexer/src/lex.rs @@ -0,0 +1,78 @@ +use core::ops::Range; + +use logos::Logos; +// use rowan::{TextRange, TextSize}; + +use crate::{ + generated::syntax_kinds::SyntaxKind, + string_block::{lex_str_block, StringBlockError}, + Span, +}; + +pub struct Lexer<'a> { + inner: logos::Lexer<'a, SyntaxKind>, +} + +impl<'a> Lexer<'a> { + pub fn new(input: &'a str) -> Self { + Self { + inner: SyntaxKind::lexer(input), + } + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Lexeme<'a>; + + fn next(&mut self) -> Option { + use SyntaxKind::*; + + let mut kind = self.inner.next()?; + let text = self.inner.slice(); + + if kind == Ok(STRING_BLOCK) { + // We use custom lexer, which skips enough bytes, but not returns error + // Instead we should call lexer again to verify if there is something wrong with string block + let mut lexer = logos::Lexer::::new(text); + // In kinds, string blocks is parsed at least as `|||` + lexer.bump(3); + let res = lex_str_block(&mut lexer); + let next = lexer.next(); + assert!(next.is_none(), "str_block is lexed"); + match res { + Ok(()) => {} + Err(e) => { + kind = Ok(match e { + StringBlockError::UnexpectedEnd => ERROR_STRING_BLOCK_UNEXPECTED_END, + StringBlockError::MissingNewLine => ERROR_STRING_BLOCK_MISSING_NEW_LINE, + StringBlockError::MissingTermination => { + ERROR_STRING_BLOCK_MISSING_TERMINATION + } + StringBlockError::MissingIndent => ERROR_STRING_BLOCK_MISSING_INDENT, + }); + } + } + } + + Some(Self::Item { + kind: kind.unwrap_or(SyntaxKind::LEXING_ERROR), + text, + range: { + let Range { start, end } = self.inner.span(); + + Span(start as u32, end as u32) + }, + }) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct Lexeme<'s> { + pub kind: SyntaxKind, + pub text: &'s str, + pub range: Span, +} + +pub fn lex(input: &str) -> Vec> { + Lexer::new(input).collect() +} --- /dev/null +++ b/crates/jrsonnet-lexer/src/lib.rs @@ -0,0 +1,8 @@ +mod generated; +mod lex; +mod string_block; + +#[derive(Clone, Copy, Debug)] +pub struct Span(pub u32, pub u32); + +pub use lex::{Lexeme, Lexer}; --- /dev/null +++ b/crates/jrsonnet-lexer/src/string_block.rs @@ -0,0 +1,282 @@ +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum StringBlockError { + UnexpectedEnd, + MissingNewLine, + MissingTermination, + MissingIndent, +} + +use logos::Lexer; +use StringBlockError::*; + +use crate::generated::syntax_kinds::SyntaxKind; + +pub(crate) fn lex_str_block_test(lex: &mut Lexer<'_, SyntaxKind>) { + let _ = lex_str_block(lex); +} + +pub(crate) struct Context<'a> { + source: &'a str, + index: usize, +} + +impl<'a> Context<'a> { + fn rest(&self) -> &'a str { + &self.source[self.index..] + } + + fn next(&mut self) -> Option { + if self.index == self.source.len() { + return None; + } + + match self.rest().chars().next() { + None => None, + Some(c) => { + self.index += c.len_utf8(); + Some(c) + } + } + } + + fn peek(&self) -> Option { + if self.index == self.source.len() { + return None; + } + + self.rest().chars().next() + } + + fn eat_if(&mut self, f: impl Fn(char) -> bool) -> usize { + if self.peek().is_some_and(f) { + self.index += 1; + return 1; + } + 0 + } + + fn eat_while(&mut self, f: impl Fn(char) -> bool) -> usize { + if self.index == self.source.len() { + return 0; + } + + let next_char = self.rest().char_indices().find(|(_, c)| !f(*c)); + + match next_char { + None => { + let diff = self.source.len() - self.index; + self.index = self.source.len(); + diff + } + Some((idx, _)) => { + self.index += idx; + idx + } + } + } + + fn skip(&mut self, len: usize) { + self.index = match self.index + len { + n if n > self.source.len() => self.source.len(), + n => n, + }; + } +} + +// Check that b has at least the same whitespace prefix as a and returns the +// amount of this whitespace, otherwise returns 0. If a has no whitespace +// prefix than return 0. +fn check_whitespace(a: &str, b: &str) -> usize { + let a = a.as_bytes(); + let b = b.as_bytes(); + + for i in 0..a.len() { + if a[i] != b' ' && a[i] != b'\t' { + // a has run out of whitespace and b matched up to this point. Return result. + return i; + } + + if i >= b.len() { + // We ran off the edge of b while a still has whitespace. Return 0 as failure. + return 0; + } + + if a[i] != b[i] { + // a has whitespace but b does not. Return 0 as failure. + return 0; + } + } + + // We ran off the end of a and b kept up + a.len() +} + +pub(crate) trait StrBlockLexCtx<'d> { + fn remainder(&self) -> &'d str; + fn eat_error(&mut self, ctx: &Context<'d>); + fn bump_pos(&mut self, s: usize); + fn mark_truncating(&mut self); + fn mark_line(&mut self, line: &'d str); +} + +impl<'d> StrBlockLexCtx<'d> for Lexer<'d, SyntaxKind> { + fn remainder(&self) -> &'d str { + self.remainder() + } + fn eat_error(&mut self, ctx: &Context<'d>) { + let end_index = ctx + .rest() + .find("|||") + .map_or_else(|| ctx.rest().len(), |v| v + 3); + self.bump(ctx.index + end_index); + } + fn bump_pos(&mut self, s: usize) { + self.bump(s); + } + fn mark_truncating(&mut self) { + // Lexer test doesn't collect anything + } + fn mark_line(&mut self, _line: &'d str) { + // Lexer test doesn't collect anything + } +} + +pub fn collect_lexed_str_block(input: &str) -> Result, StringBlockError> { + let mut collect = CollectStrBlock { + truncate: false, + lines: vec![], + input, + offset: 0, + }; + lex_str_block(&mut collect)?; + Ok(collect) +} + +pub struct CollectStrBlock<'s> { + pub truncate: bool, + pub lines: Vec<&'s str>, + input: &'s str, + offset: usize, +} + +impl<'d> StrBlockLexCtx<'d> for CollectStrBlock<'d> { + fn remainder(&self) -> &'d str { + self.input + } + + fn eat_error(&mut self, _ctx: &Context<'d>) { + // Error will be returned, no need to record it here + } + + fn bump_pos(&mut self, s: usize) { + self.offset += s; + } + + fn mark_truncating(&mut self) { + self.truncate = true; + } + + fn mark_line(&mut self, line: &'d str) { + self.lines.push(line); + } +} + +pub(crate) fn lex_str_block<'a>(lex: &mut impl StrBlockLexCtx<'a>) -> Result<(), StringBlockError> { + // debug_assert_eq!(lex.slice(), "|||"); + let mut ctx = Context::<'a> { + source: lex.remainder(), + index: 0, + }; + + if ctx.eat_if(|v| v == '-') != 0 { + lex.mark_truncating(); + } + + // Skip whitespaces + ctx.eat_while(|r| r == ' ' || r == '\t' || r == '\r'); + + // Skip \n + match ctx.next() { + Some('\n') => (), + None => { + lex.eat_error(&ctx); + return Err(UnexpectedEnd); + } + // Text block requires new line after |||. + Some(_) => { + lex.eat_error(&ctx); + return Err(MissingNewLine); + } + } + + // Process leading blank lines before calculating string block indent + while ctx.peek() == Some('\n') { + ctx.next(); + } + + let mut num_whitespace = check_whitespace(ctx.rest(), ctx.rest()); + let str_block_indent = &ctx.rest()[..num_whitespace]; + + if num_whitespace == 0 { + // Text block's first line must start with whitespace + lex.eat_error(&ctx); + return Err(MissingIndent); + } + + loop { + debug_assert_ne!(num_whitespace, 0, "Unexpected value for num_whitespace"); + ctx.skip(num_whitespace); + + let line_start = ctx.index; + let mut line_size = 0; + loop { + match ctx.next() { + None => { + lex.eat_error(&ctx); + return Err(UnexpectedEnd); + } + Some('\n') => { + lex.mark_line(&ctx.source[line_start..line_start + line_size]); + break; + } + Some(c) => { + line_size += c.len_utf8(); + } + } + } + + // Skip any blank lines + while ctx.peek() == Some('\n') { + lex.mark_line(""); + ctx.next(); + } + + // Look at the next line + num_whitespace = check_whitespace(str_block_indent, ctx.rest()); + if num_whitespace == 0 { + // End of the text block + // let mut term_indent = String::with_capacity(num_whitespace); + while let Some(' ' | '\t') = ctx.peek() { + // term_indent.push( + ctx.next().unwrap(); + // ); + } + + if !ctx.rest().starts_with("|||") { + if ctx.rest().is_empty() { + lex.bump_pos(ctx.index); + return Err(UnexpectedEnd); + } + lex.eat_error(&ctx); + return Err(MissingTermination); + } + + // Skip '|||' + ctx.skip(3); + break; + } + } + + lex.bump_pos(ctx.index); + Ok(()) +} --- a/crates/jrsonnet-rowan-parser/Cargo.toml +++ b/crates/jrsonnet-rowan-parser/Cargo.toml @@ -14,7 +14,7 @@ drop_bomb.workspace = true hi-doc.workspace = true indoc.workspace = true -logos.workspace = true +jrsonnet-lexer = { version = "0.5.0-pre97", path = "../jrsonnet-lexer" } rowan.workspace = true thiserror.workspace = true --- a/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs +++ b/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs @@ -8,172 +8,95 @@ clippy::manual_non_exhaustive, clippy::match_like_matches_macro )] -use logos::Logos; #[doc = r" The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`."] -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Logos)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[repr(u16)] pub enum SyntaxKind { #[doc(hidden)] TOMBSTONE, #[doc(hidden)] EOF, - #[token("||")] OR, - #[token("??")] NULL_COAELSE, - #[token("&&")] AND, - #[token("|")] BIT_OR, - #[token("^")] BIT_XOR, - #[token("&")] BIT_AND, - #[token("==")] EQ, - #[token("!=")] NE, - #[token("<")] LT, - #[token(">")] GT, - #[token("<=")] LE, - #[token(">=")] GE, - #[token("<<")] LHS, - #[token(">>")] RHS, - #[token("+")] PLUS, - #[token("-")] MINUS, - #[token("*")] MUL, - #[token("/")] DIV, - #[token("%")] MODULO, - #[token("!")] NOT, - #[token("~")] BIT_NOT, - #[token("[")] L_BRACK, - #[token("]")] R_BRACK, - #[token("(")] L_PAREN, - #[token(")")] R_PAREN, - #[token("{")] L_BRACE, - #[token("}")] R_BRACE, - #[token(":")] COLON, - #[token("::")] COLONCOLON, - #[token(":::")] COLONCOLONCOLON, - #[token(";")] SEMI, - #[token(".")] DOT, - #[token("...")] DOTDOTDOT, - #[token(",")] COMMA, - #[token("$")] DOLLAR, - #[token("=")] ASSIGN, - #[token("?")] QUESTION_MARK, - #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")] FLOAT, - #[regex("(?:0|[1-9][0-9]*)\\.[^0-9]")] ERROR_FLOAT_JUNK_AFTER_POINT, - #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][^+\\-0-9]")] ERROR_FLOAT_JUNK_AFTER_EXPONENT, - #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][+-][^0-9]")] ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN, - #[regex("\"(?s:[^\"\\\\]|\\\\.)*\"")] STRING_DOUBLE, - #[regex("\"(?s:[^\"\\\\]|\\\\.)*")] ERROR_STRING_DOUBLE_UNTERMINATED, - #[regex("'(?s:[^'\\\\]|\\\\.)*'")] STRING_SINGLE, - #[regex("'(?s:[^'\\\\]|\\\\.)*")] ERROR_STRING_SINGLE_UNTERMINATED, - #[regex("@\"(?:[^\"]|\"\")*\"")] STRING_DOUBLE_VERBATIM, - #[regex("@\"(?:[^\"]|\"\")*")] ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED, - #[regex("@'(?:[^']|'')*'")] STRING_SINGLE_VERBATIM, - #[regex("@'(?:[^']|'')*")] ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED, - #[regex("@[^\"'\\s]\\S+")] ERROR_STRING_VERBATIM_MISSING_QUOTES, - #[regex("\\|\\|\\|", crate::string_block::lex_str_block_test)] STRING_BLOCK, ERROR_STRING_BLOCK_UNEXPECTED_END, ERROR_STRING_BLOCK_MISSING_NEW_LINE, ERROR_STRING_BLOCK_MISSING_TERMINATION, ERROR_STRING_BLOCK_MISSING_INDENT, - #[regex("[_a-zA-Z][_a-zA-Z0-9]*")] IDENT, - #[regex("[ \\t\\n\\r]+")] WHITESPACE, - #[regex("//[^\\r\\n]*?(\\r\\n|\\n)?")] SINGLE_LINE_SLASH_COMMENT, - #[regex("#[^\\r\\n]*?(\\r\\n|\\n)?")] SINGLE_LINE_HASH_COMMENT, - #[regex("/\\*([^*]|\\*[^/])*\\*/")] MULTI_LINE_COMMENT, - #[regex("/\\*/")] ERROR_COMMENT_TOO_SHORT, - #[regex("/\\*([^*/]|\\*[^/])+")] ERROR_COMMENT_UNTERMINATED, - #[token("tailstrict")] TAILSTRICT_KW, - #[token("local")] LOCAL_KW, - #[token("importstr")] IMPORTSTR_KW, - #[token("importbin")] IMPORTBIN_KW, - #[token("import")] IMPORT_KW, - #[token("if")] IF_KW, - #[token("then")] THEN_KW, - #[token("else")] ELSE_KW, - #[token("function")] FUNCTION_KW, - #[token("error")] ERROR_KW, - #[token("in")] IN_KW, META_OBJECT_APPLY, ERROR_NO_OPERATOR, - #[token("null")] NULL_KW, - #[token("true")] TRUE_KW, - #[token("false")] FALSE_KW, - #[token("self")] SELF_KW, - #[token("super")] SUPER_KW, - #[token("for")] FOR_KW, - #[token("assert")] ASSERT_KW, ERROR_MISSING_TOKEN, ERROR_UNEXPECTED_TOKEN, --- a/crates/jrsonnet-rowan-parser/src/lex.rs +++ b/crates/jrsonnet-rowan-parser/src/lex.rs @@ -1,81 +1,19 @@ -use core::ops::Range; -use std::convert::TryFrom; - -use logos::Logos; +use jrsonnet_lexer::Lexer; use rowan::{TextRange, TextSize}; -use crate::{ - string_block::{lex_str_block, StringBlockError}, - SyntaxKind, -}; - -pub struct Lexer<'a> { - inner: logos::Lexer<'a, SyntaxKind>, -} - -impl<'a> Lexer<'a> { - pub fn new(input: &'a str) -> Self { - Self { - inner: SyntaxKind::lexer(input), - } - } -} - -impl<'a> Iterator for Lexer<'a> { - type Item = Lexeme<'a>; - - fn next(&mut self) -> Option { - use SyntaxKind::*; - - let mut kind = self.inner.next()?; - let text = self.inner.slice(); - - if kind == Ok(STRING_BLOCK) { - // We use custom lexer, which skips enough bytes, but not returns error - // Instead we should call lexer again to verify if there is something wrong with string block - let mut lexer = logos::Lexer::::new(text); - // In kinds, string blocks is parsed at least as `|||` - lexer.bump(3); - let res = lex_str_block(&mut lexer); - let next = lexer.next(); - assert!(next.is_none(), "str_block is lexed"); - match res { - Ok(()) => {} - Err(e) => { - kind = Ok(match e { - StringBlockError::UnexpectedEnd => ERROR_STRING_BLOCK_UNEXPECTED_END, - StringBlockError::MissingNewLine => ERROR_STRING_BLOCK_MISSING_NEW_LINE, - StringBlockError::MissingTermination => { - ERROR_STRING_BLOCK_MISSING_TERMINATION - } - StringBlockError::MissingIndent => ERROR_STRING_BLOCK_MISSING_INDENT, - }); - } - } - } - - Some(Self::Item { - kind: kind.unwrap_or(SyntaxKind::LEXING_ERROR), - text, - range: { - let Range { start, end } = self.inner.span(); - - TextRange::new( - TextSize::try_from(start).unwrap(), - TextSize::try_from(end).unwrap(), - ) - }, - }) - } -} +use crate::SyntaxKind; #[derive(Clone, Copy, Debug)] -pub struct Lexeme<'i> { +pub struct Lexeme<'s> { pub kind: SyntaxKind, - pub text: &'i str, + pub text: &'s str, pub range: TextRange, } pub fn lex(input: &str) -> Vec> { - Lexer::new(input).collect() + Lexer::new(input).map(|l| Lexeme { + kind: SyntaxKind::from_raw(l.kind.into_raw()), + text: l.text, + range: TextRange::new(TextSize::from(l.range.0), TextSize::from(l.range.1)), + }).collect() } --- a/crates/jrsonnet-rowan-parser/src/lib.rs +++ b/crates/jrsonnet-rowan-parser/src/lib.rs @@ -2,7 +2,6 @@ use event::Sink; use generated::nodes::{SourceFile, Trivia}; -use lex::lex; use parser::{LocatedSyntaxError, Parser}; pub use rowan; @@ -14,14 +13,12 @@ mod marker; mod parser; mod precedence; -mod string_block; mod tests; mod token_set; pub use ast::{AstChildren, AstNode, AstToken}; pub use generated::{nodes, syntax_kinds::SyntaxKind}; pub use language::*; -pub use string_block::{collect_lexed_str_block, CollectStrBlock}; pub use token_set::SyntaxKindSet; use self::{ @@ -30,7 +27,7 @@ }; pub fn parse(input: &str) -> (SourceFile, Vec) { - let lexemes = lex(input); + let lexemes = lex::lex(input); let kinds = lexemes .iter() .map(|l| l.kind) --- a/crates/jrsonnet-rowan-parser/src/string_block.rs +++ /dev/null @@ -1,282 +0,0 @@ -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum StringBlockError { - UnexpectedEnd, - MissingNewLine, - MissingTermination, - MissingIndent, -} - -use logos::Lexer; -use StringBlockError::*; - -use crate::SyntaxKind; - -pub(crate) fn lex_str_block_test(lex: &mut Lexer<'_, SyntaxKind>) { - let _ = lex_str_block(lex); -} - -pub(crate) struct Context<'a> { - source: &'a str, - index: usize, -} - -impl<'a> Context<'a> { - fn rest(&self) -> &'a str { - &self.source[self.index..] - } - - fn next(&mut self) -> Option { - if self.index == self.source.len() { - return None; - } - - match self.rest().chars().next() { - None => None, - Some(c) => { - self.index += c.len_utf8(); - Some(c) - } - } - } - - fn peek(&self) -> Option { - if self.index == self.source.len() { - return None; - } - - self.rest().chars().next() - } - - fn eat_if(&mut self, f: impl Fn(char) -> bool) -> usize { - if self.peek().is_some_and(f) { - self.index += 1; - return 1; - } - 0 - } - - fn eat_while(&mut self, f: impl Fn(char) -> bool) -> usize { - if self.index == self.source.len() { - return 0; - } - - let next_char = self.rest().char_indices().find(|(_, c)| !f(*c)); - - match next_char { - None => { - let diff = self.source.len() - self.index; - self.index = self.source.len(); - diff - } - Some((idx, _)) => { - self.index += idx; - idx - } - } - } - - fn skip(&mut self, len: usize) { - self.index = match self.index + len { - n if n > self.source.len() => self.source.len(), - n => n, - }; - } -} - -// Check that b has at least the same whitespace prefix as a and returns the -// amount of this whitespace, otherwise returns 0. If a has no whitespace -// prefix than return 0. -fn check_whitespace(a: &str, b: &str) -> usize { - let a = a.as_bytes(); - let b = b.as_bytes(); - - for i in 0..a.len() { - if a[i] != b' ' && a[i] != b'\t' { - // a has run out of whitespace and b matched up to this point. Return result. - return i; - } - - if i >= b.len() { - // We ran off the edge of b while a still has whitespace. Return 0 as failure. - return 0; - } - - if a[i] != b[i] { - // a has whitespace but b does not. Return 0 as failure. - return 0; - } - } - - // We ran off the end of a and b kept up - a.len() -} - -pub(crate) trait StrBlockLexCtx<'d> { - fn remainder(&self) -> &'d str; - fn eat_error(&mut self, ctx: &Context<'d>); - fn bump_pos(&mut self, s: usize); - fn mark_truncating(&mut self); - fn mark_line(&mut self, line: &'d str); -} - -impl<'d> StrBlockLexCtx<'d> for Lexer<'d, SyntaxKind> { - fn remainder(&self) -> &'d str { - self.remainder() - } - fn eat_error(&mut self, ctx: &Context<'d>) { - let end_index = ctx - .rest() - .find("|||") - .map_or_else(|| ctx.rest().len(), |v| v + 3); - self.bump(ctx.index + end_index); - } - fn bump_pos(&mut self, s: usize) { - self.bump(s); - } - fn mark_truncating(&mut self) { - // Lexer test doesn't collect anything - } - fn mark_line(&mut self, _line: &'d str) { - // Lexer test doesn't collect anything - } -} - -pub fn collect_lexed_str_block(input: &str) -> Result, StringBlockError> { - let mut collect = CollectStrBlock { - truncate: false, - lines: vec![], - input, - offset: 0, - }; - lex_str_block(&mut collect)?; - Ok(collect) -} - -pub struct CollectStrBlock<'s> { - pub truncate: bool, - pub lines: Vec<&'s str>, - input: &'s str, - offset: usize, -} - -impl<'d> StrBlockLexCtx<'d> for CollectStrBlock<'d> { - fn remainder(&self) -> &'d str { - self.input - } - - fn eat_error(&mut self, _ctx: &Context<'d>) { - // Error will be returned, no need to record it here - } - - fn bump_pos(&mut self, s: usize) { - self.offset += s; - } - - fn mark_truncating(&mut self) { - self.truncate = true; - } - - fn mark_line(&mut self, line: &'d str) { - self.lines.push(line); - } -} - -pub(crate) fn lex_str_block<'a>(lex: &mut impl StrBlockLexCtx<'a>) -> Result<(), StringBlockError> { - // debug_assert_eq!(lex.slice(), "|||"); - let mut ctx = Context::<'a> { - source: lex.remainder(), - index: 0, - }; - - if ctx.eat_if(|v| v == '-') != 0 { - lex.mark_truncating(); - } - - // Skip whitespaces - ctx.eat_while(|r| r == ' ' || r == '\t' || r == '\r'); - - // Skip \n - match ctx.next() { - Some('\n') => (), - None => { - lex.eat_error(&ctx); - return Err(UnexpectedEnd); - } - // Text block requires new line after |||. - Some(_) => { - lex.eat_error(&ctx); - return Err(MissingNewLine); - } - } - - // Process leading blank lines before calculating string block indent - while ctx.peek() == Some('\n') { - ctx.next(); - } - - let mut num_whitespace = check_whitespace(ctx.rest(), ctx.rest()); - let str_block_indent = &ctx.rest()[..num_whitespace]; - - if num_whitespace == 0 { - // Text block's first line must start with whitespace - lex.eat_error(&ctx); - return Err(MissingIndent); - } - - loop { - debug_assert_ne!(num_whitespace, 0, "Unexpected value for num_whitespace"); - ctx.skip(num_whitespace); - - let line_start = ctx.index; - let mut line_size = 0; - loop { - match ctx.next() { - None => { - lex.eat_error(&ctx); - return Err(UnexpectedEnd); - } - Some('\n') => { - lex.mark_line(&ctx.source[line_start..line_start + line_size]); - break; - } - Some(c) => { - line_size += c.len_utf8(); - } - } - } - - // Skip any blank lines - while ctx.peek() == Some('\n') { - lex.mark_line(""); - ctx.next(); - } - - // Look at the next line - num_whitespace = check_whitespace(str_block_indent, ctx.rest()); - if num_whitespace == 0 { - // End of the text block - // let mut term_indent = String::with_capacity(num_whitespace); - while let Some(' ' | '\t') = ctx.peek() { - // term_indent.push( - ctx.next().unwrap(); - // ); - } - - if !ctx.rest().starts_with("|||") { - if ctx.rest().is_empty() { - lex.bump_pos(ctx.index); - return Err(UnexpectedEnd); - } - lex.eat_error(&ctx); - return Err(MissingTermination); - } - - // Skip '|||' - ctx.skip(3); - break; - } - } - - lex.bump_pos(ctx.index); - Ok(()) -} --- a/xtask/src/sourcegen/kinds.rs +++ b/xtask/src/sourcegen/kinds.rs @@ -56,7 +56,7 @@ | Self::Error { name, .. } => name, } } - pub fn expand_kind(&self) -> TokenStream { + pub fn expand_kind(&self, lexer: bool) -> TokenStream { let name = format_ident!("{}", self.name()); let attr = match self { Self::Keyword { code, .. } => quote! {#[token(#code)]}, @@ -75,6 +75,11 @@ } _ => quote! {}, }; + let attr = if lexer { + attr + } else { + quote! {} + }; quote! { #attr #name --- a/xtask/src/sourcegen/mod.rs +++ b/xtask/src/sourcegen/mod.rs @@ -89,7 +89,7 @@ kinds.define_node(&name); } - let syntax_kinds = generate_syntax_kinds(&kinds, &ast)?; + let syntax_kinds = generate_syntax_kinds(&kinds, &ast, false)?; let nodes = generate_nodes(&kinds, &ast)?; ensure_file_contents( @@ -106,12 +106,21 @@ )), &nodes, ); + + let lexer_syntax_kinds = generate_syntax_kinds(&kinds, &ast, true)?; + ensure_file_contents( + &PathBuf::from(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../crates/jrsonnet-lexer/src/generated/syntax_kinds.rs", + )), + &lexer_syntax_kinds, + ); Ok(()) } -fn generate_syntax_kinds(kinds: &KindsSrc, grammar: &AstSrc) -> Result { +fn generate_syntax_kinds(kinds: &KindsSrc, grammar: &AstSrc, lexer: bool) -> Result { let t_macros = kinds.tokens().filter_map(TokenKind::expand_t_macros); - let token_kinds = kinds.tokens().map(TokenKind::expand_kind); + let token_kinds = kinds.tokens().map(|t| t.expand_kind(lexer)); let keywords = kinds .tokens() @@ -119,12 +128,16 @@ .map(TokenKind::name) .map(|n| format_ident!("{n}")); - let nodes = kinds + let mut nodes = kinds .nodes .iter() .map(|name| format_ident!("{}", name)) .collect::>(); + if lexer { + nodes.clear(); + } + let enums = grammar .enums .iter() @@ -134,14 +147,34 @@ .token_enums .iter() .map(|e| format_ident!("{}", to_upper_snake_case(&e.name))), - ); + ) + .collect::>(); + let is_enum = if lexer { + quote! {} + } else { + quote! { + pub fn is_enum(self) -> bool { + match self { + #(#enums)|* => true, + _ => false, + } + } + } + }; + let derive_logos = if lexer { + quote! { + , logos::Logos + } + } else { + quote! {} + }; + let ast = quote! { #![allow(bad_style, missing_docs, unreachable_pub, clippy::manual_non_exhaustive, clippy::match_like_matches_macro)] - use logos::Logos; /// The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`. - #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Logos)] + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug #derive_logos)] #[repr(u16)] pub enum SyntaxKind { #[doc(hidden)] @@ -164,13 +197,9 @@ _ => false, } } - pub fn is_enum(self) -> bool { - match self { - #(#enums)|* => true, - _ => false, - } - } + #is_enum + pub fn from_raw(r: u16) -> Self { assert!(r < Self::__LAST as u16); unsafe { std::mem::transmute(r) } -- gitstuff