From 7c03fc40023daeec3fabfad1a129901ab795d453 Mon Sep 17 00:00:00 2001 From: Yaroslav Bolyukin Date: Mon, 23 Mar 2026 00:17:41 +0000 Subject: [PATCH] feat(lexer): explicit token names --- --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ jrsonnet-evaluator = { path = "./crates/jrsonnet-evaluator", version = "0.5.0-pre97" } jrsonnet-macros = { path = "./crates/jrsonnet-macros", version = "0.5.0-pre97" } jrsonnet-ir = { path = "./crates/jrsonnet-ir", version = "0.5.0-pre97" } -jrsonnet-ir-parser = { path = "./crates/jrsonnet-rowan-parser", version = "0.5.0-pre97" } +jrsonnet-ir-parser = { path = "./crates/jrsonnet-ir-parser", version = "0.5.0-pre97" } jrsonnet-peg-parser = { path = "./crates/jrsonnet-peg-parser", version = "0.5.0-pre97" } jrsonnet-rowan-parser = { path = "./crates/jrsonnet-rowan-parser", version = "0.5.0-pre97" } jrsonnet-interner = { path = "./crates/jrsonnet-interner", version = "0.5.0-pre97" } --- a/crates/jrsonnet-lexer/src/generated/syntax_kinds.rs +++ b/crates/jrsonnet-lexer/src/generated/syntax_kinds.rs @@ -132,6 +132,10 @@ ERROR_COMMENT_TOO_SHORT, #[regex("/\\*([^*/]|\\*[^/])+")] ERROR_COMMENT_UNTERMINATED, + ERROR_NO_OPERATOR, + ERROR_MISSING_TOKEN, + ERROR_UNEXPECTED_TOKEN, + ERROR_CUSTOM, #[token("tailstrict")] TAILSTRICT_KW, #[token("local")] @@ -155,7 +159,6 @@ #[token("in")] IN_KW, META_OBJECT_APPLY, - ERROR_NO_OPERATOR, #[token("null")] NULL_KW, #[token("true")] @@ -170,9 +173,6 @@ FOR_KW, #[token("assert")] ASSERT_KW, - ERROR_MISSING_TOKEN, - ERROR_UNEXPECTED_TOKEN, - ERROR_CUSTOM, LEXING_ERROR, __LAST_TOKEN, #[doc(hidden)] --- a/crates/jrsonnet-lexer/src/string_block.rs +++ b/crates/jrsonnet-lexer/src/string_block.rs @@ -211,6 +211,7 @@ // Process leading blank lines before calculating string block indent while ctx.peek() == Some('\n') { + lex.mark_line(""); ctx.next(); } --- a/crates/jrsonnet-rowan-parser/jsonnet.ungram +++ b/crates/jrsonnet-rowan-parser/jsonnet.ungram @@ -209,7 +209,7 @@ | FieldNameDynamic Visibility = - ':' v1:':'? v2:':'? + ':' ':'? ':'? Literal = 'null' --- a/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs +++ b/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs @@ -76,6 +76,10 @@ MULTI_LINE_COMMENT, ERROR_COMMENT_TOO_SHORT, ERROR_COMMENT_UNTERMINATED, + ERROR_NO_OPERATOR, + ERROR_MISSING_TOKEN, + ERROR_UNEXPECTED_TOKEN, + ERROR_CUSTOM, TAILSTRICT_KW, LOCAL_KW, IMPORTSTR_KW, @@ -88,7 +92,6 @@ ERROR_KW, IN_KW, META_OBJECT_APPLY, - ERROR_NO_OPERATOR, NULL_KW, TRUE_KW, FALSE_KW, @@ -96,9 +99,6 @@ SUPER_KW, FOR_KW, ASSERT_KW, - ERROR_MISSING_TOKEN, - ERROR_UNEXPECTED_TOKEN, - ERROR_CUSTOM, LEXING_ERROR, __LAST_TOKEN, SOURCE_FILE, @@ -199,6 +199,149 @@ _ => false, } } + pub fn error_description(self) -> Option<&'static str> { + match self { + ERROR_FLOAT_JUNK_AFTER_POINT => { + ::core::option::Option::Some("junk after decimal point in number literal") + } + ERROR_FLOAT_JUNK_AFTER_EXPONENT => { + ::core::option::Option::Some("junk after exponent in number literal") + } + ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN => { + ::core::option::Option::Some("junk after exponent sign in number literal") + } + ERROR_STRING_DOUBLE_UNTERMINATED => { + ::core::option::Option::Some("unterminated double-quoted string") + } + ERROR_STRING_SINGLE_UNTERMINATED => { + ::core::option::Option::Some("unterminated single-quoted string") + } + ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED => { + ::core::option::Option::Some("unterminated verbatim double-quoted string") + } + ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED => { + ::core::option::Option::Some("unterminated verbatim single-quoted string") + } + ERROR_STRING_VERBATIM_MISSING_QUOTES => { + ::core::option::Option::Some("verbatim string missing opening quotes") + } + ERROR_STRING_BLOCK_UNEXPECTED_END => { + ::core::option::Option::Some("unexpected end of text block") + } + ERROR_STRING_BLOCK_MISSING_NEW_LINE => { + ::core::option::Option::Some("text block requires new line after |||") + } + ERROR_STRING_BLOCK_MISSING_TERMINATION => { + ::core::option::Option::Some("unterminated text block") + } + ERROR_STRING_BLOCK_MISSING_INDENT => { + ::core::option::Option::Some("text block first line must be indented") + } + ERROR_COMMENT_TOO_SHORT => ::core::option::Option::Some("comment too short"), + ERROR_COMMENT_UNTERMINATED => { + ::core::option::Option::Some("unterminated multi-line comment") + } + ERROR_NO_OPERATOR => ::core::option::Option::Some("expected operator"), + ERROR_MISSING_TOKEN => ::core::option::Option::Some("missing token"), + ERROR_UNEXPECTED_TOKEN => ::core::option::Option::Some("unexpected token"), + ERROR_CUSTOM => ::core::option::Option::Some("error"), + LEXING_ERROR => ::core::option::Option::Some("unexpected character"), + _ => None, + } + } + pub fn display_name(self) -> &'static str { + match self { + OR => "'||'", + NULL_COAELSE => "'??'", + AND => "'&&'", + BIT_OR => "'|'", + BIT_XOR => "'^'", + BIT_AND => "'&'", + EQ => "'=='", + NE => "'!='", + LT => "'<'", + GT => "'>'", + LE => "'<='", + GE => "'>='", + LHS => "'<<'", + RHS => "'>>'", + PLUS => "'+'", + MINUS => "'-'", + MUL => "'*'", + DIV => "'/'", + MODULO => "'%'", + NOT => "'!'", + BIT_NOT => "'~'", + L_BRACK => "'['", + R_BRACK => "']'", + L_PAREN => "'('", + R_PAREN => "')'", + L_BRACE => "'{'", + R_BRACE => "'}'", + COLON => "':'", + SEMI => "';'", + DOT => "'.'", + DOTDOTDOT => "'...'", + COMMA => "','", + DOLLAR => "'$'", + ASSIGN => "'='", + QUESTION_MARK => "'?'", + FLOAT => "number", + ERROR_FLOAT_JUNK_AFTER_POINT => "junk after decimal point in number literal", + ERROR_FLOAT_JUNK_AFTER_EXPONENT => "junk after exponent in number literal", + ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN => "junk after exponent sign in number literal", + STRING_DOUBLE => "string", + ERROR_STRING_DOUBLE_UNTERMINATED => "unterminated double-quoted string", + STRING_SINGLE => "string", + ERROR_STRING_SINGLE_UNTERMINATED => "unterminated single-quoted string", + STRING_DOUBLE_VERBATIM => "string", + ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED => { + "unterminated verbatim double-quoted string" + } + STRING_SINGLE_VERBATIM => "string", + ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED => { + "unterminated verbatim single-quoted string" + } + ERROR_STRING_VERBATIM_MISSING_QUOTES => "verbatim string missing opening quotes", + STRING_BLOCK => "string", + ERROR_STRING_BLOCK_UNEXPECTED_END => "unexpected end of text block", + ERROR_STRING_BLOCK_MISSING_NEW_LINE => "text block requires new line after |||", + ERROR_STRING_BLOCK_MISSING_TERMINATION => "unterminated text block", + ERROR_STRING_BLOCK_MISSING_INDENT => "text block first line must be indented", + IDENT => "identifier", + WHITESPACE => "whitespace", + SINGLE_LINE_SLASH_COMMENT => "comment", + SINGLE_LINE_HASH_COMMENT => "comment", + MULTI_LINE_COMMENT => "comment", + ERROR_COMMENT_TOO_SHORT => "comment too short", + ERROR_COMMENT_UNTERMINATED => "unterminated multi-line comment", + ERROR_NO_OPERATOR => "expected operator", + ERROR_MISSING_TOKEN => "missing token", + ERROR_UNEXPECTED_TOKEN => "unexpected token", + ERROR_CUSTOM => "error", + TAILSTRICT_KW => "'tailstrict'", + LOCAL_KW => "'local'", + IMPORTSTR_KW => "'importstr'", + IMPORTBIN_KW => "'importbin'", + IMPORT_KW => "'import'", + IF_KW => "'if'", + THEN_KW => "'then'", + ELSE_KW => "'else'", + FUNCTION_KW => "'function'", + ERROR_KW => "'error'", + IN_KW => "'in'", + META_OBJECT_APPLY => "meta_object_apply", + NULL_KW => "'null'", + TRUE_KW => "'true'", + FALSE_KW => "'false'", + SELF_KW => "'self'", + SUPER_KW => "'super'", + FOR_KW => "'for'", + ASSERT_KW => "'assert'", + LEXING_ERROR => "unexpected character", + _ => "unknown", + } + } pub fn from_raw(r: u16) -> Self { assert!(r < Self::__LAST as u16); unsafe { std::mem::transmute(r) } --- a/xtask/src/sourcegen/kinds.rs +++ b/xtask/src/sourcegen/kinds.rs @@ -19,6 +19,7 @@ is_lexer_error: bool, regex: Option, priority: Option, + description: String, }, /// Keyword - literal match of token Keyword { @@ -113,6 +114,24 @@ } } + pub fn display_name(&self) -> String { + match self { + Self::Keyword { code, .. } => format!("'{code}'"), + Self::Literal { name, .. } => match name.as_str() { + "FLOAT" => "number".to_owned(), + "IDENT" => "identifier".to_owned(), + "STRING_DOUBLE" | "STRING_SINGLE" | "STRING_DOUBLE_VERBATIM" + | "STRING_SINGLE_VERBATIM" | "STRING_BLOCK" => "string".to_owned(), + "WHITESPACE" => "whitespace".to_owned(), + "SINGLE_LINE_SLASH_COMMENT" | "SINGLE_LINE_HASH_COMMENT" + | "MULTI_LINE_COMMENT" => "comment".to_owned(), + _ => name.to_lowercase(), + }, + Self::Meta { name, .. } => name.to_lowercase(), + Self::Error { description, .. } => description.clone(), + } + } + pub fn method_name(&self) -> Ident { match self { Self::Keyword { name, .. } => { @@ -138,7 +157,7 @@ }); $(define_kinds!($into = $($rest)*))? }}; - ($into:ident = error($name:literal$(, priority = $priority:literal)? $(, lexer = $lexer:literal)?) $(=> $regex:literal)? $(; $($rest:tt)*)?) => {{ + ($into:ident = error($name:literal, $desc:literal $(, priority = $priority:literal)? $(, lexer = $lexer:literal)?) $(=> $regex:literal)? $(; $($rest:tt)*)?) => {{ { let regex = None$(.or(Some($regex.to_owned())))?; let priority = None$(.or(Some($priority)))?; @@ -148,6 +167,7 @@ is_lexer_error: false $(|| $lexer)? || regex.is_some() || priority.is_some(), regex, priority, + description: $desc.to_owned(), }); } $(define_kinds!($into = $($rest)*))? @@ -248,31 +268,35 @@ "=" => "ASSIGN"; "?" => "QUESTION_MARK"; // Literals - lit("FLOAT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?"; - error("FLOAT_JUNK_AFTER_POINT") => r"(?:0|[1-9][0-9]*)\.[^0-9]"; - error("FLOAT_JUNK_AFTER_EXPONENT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][^+\-0-9]"; - error("FLOAT_JUNK_AFTER_EXPONENT_SIGN") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][+-][^0-9]"; + lit("FLOAT") => r"(?:0|[1-9][0-9]*(?:_[0-9]+)*)(?:\.[0-9]+(?:_[0-9]+)*)?(?:[eE][+-]?[0-9]+(?:_[0-9]+)*)?"; + error("FLOAT_JUNK_AFTER_POINT", "junk after decimal point in number literal") => r"(?:0|[1-9][0-9]*(?:_[0-9]+)*)\.[^0-9]"; + error("FLOAT_JUNK_AFTER_EXPONENT", "junk after exponent in number literal") => r"(?:0|[1-9][0-9]*(?:_[0-9]+)*)(?:\.[0-9]+(?:_[0-9]+)*)?[eE][^+\-0-9]"; + error("FLOAT_JUNK_AFTER_EXPONENT_SIGN", "junk after exponent sign in number literal") => r"(?:0|[1-9][0-9]*(?:_[0-9]+)*)(?:\.[0-9]+(?:_[0-9]+)*)?[eE][+-][^0-9]"; lit("STRING_DOUBLE") => "\"(?s:[^\"\\\\]|\\\\.)*\""; - error("STRING_DOUBLE_UNTERMINATED") => "\"(?s:[^\"\\\\]|\\\\.)*"; + error("STRING_DOUBLE_UNTERMINATED", "unterminated double-quoted string") => "\"(?s:[^\"\\\\]|\\\\.)*"; lit("STRING_SINGLE") => "'(?s:[^'\\\\]|\\\\.)*'"; - error("STRING_SINGLE_UNTERMINATED") => "'(?s:[^'\\\\]|\\\\.)*"; + error("STRING_SINGLE_UNTERMINATED", "unterminated single-quoted string") => "'(?s:[^'\\\\]|\\\\.)*"; lit("STRING_DOUBLE_VERBATIM") => "@\"(?:[^\"]|\"\")*\""; - error("STRING_DOUBLE_VERBATIM_UNTERMINATED") => "@\"(?:[^\"]|\"\")*"; + error("STRING_DOUBLE_VERBATIM_UNTERMINATED", "unterminated verbatim double-quoted string") => "@\"(?:[^\"]|\"\")*"; lit("STRING_SINGLE_VERBATIM") => "@'(?:[^']|'')*'"; - error("STRING_SINGLE_VERBATIM_UNTERMINATED") => "@'(?:[^']|'')*"; - error("STRING_VERBATIM_MISSING_QUOTES") => "@[^\"'\\s]\\S+"; + error("STRING_SINGLE_VERBATIM_UNTERMINATED", "unterminated verbatim single-quoted string") => "@'(?:[^']|'')*"; + error("STRING_VERBATIM_MISSING_QUOTES", "verbatim string missing opening quotes") => "@[^\"'\\s]\\S+"; lit("STRING_BLOCK") => r"\|\|\|", "crate::string_block::lex_str_block_test"; - error("STRING_BLOCK_UNEXPECTED_END", lexer = true); - error("STRING_BLOCK_MISSING_NEW_LINE", lexer = true); - error("STRING_BLOCK_MISSING_TERMINATION", lexer = true); - error("STRING_BLOCK_MISSING_INDENT", lexer = true); + error("STRING_BLOCK_UNEXPECTED_END", "unexpected end of text block", lexer = true); + error("STRING_BLOCK_MISSING_NEW_LINE", "text block requires new line after |||", lexer = true); + error("STRING_BLOCK_MISSING_TERMINATION", "unterminated text block", lexer = true); + error("STRING_BLOCK_MISSING_INDENT", "text block first line must be indented", lexer = true); lit("IDENT") => r"[_a-zA-Z][_a-zA-Z0-9]*"; lit("WHITESPACE") => r"[ \t\n\r]+"; lit("SINGLE_LINE_SLASH_COMMENT") => r"//[^\r\n]*?(\r\n|\n)?"; lit("SINGLE_LINE_HASH_COMMENT") => r"#[^\r\n]*?(\r\n|\n)?"; lit("MULTI_LINE_COMMENT") => r"/\*([^*]|\*[^/])*\*/"; - error("COMMENT_TOO_SHORT") => r"/\*/"; - error("COMMENT_UNTERMINATED") => r"/\*([^*/]|\*[^/])+"; + error("COMMENT_TOO_SHORT", "comment too short") => r"/\*/"; + error("COMMENT_UNTERMINATED", "unterminated multi-line comment") => r"/\*([^*/]|\*[^/])+"; + error("NO_OPERATOR", "expected operator"); + error("MISSING_TOKEN", "missing token"); + error("UNEXPECTED_TOKEN", "unexpected token"); + error("CUSTOM", "error"); ]; kinds } --- a/xtask/src/sourcegen/mod.rs +++ b/xtask/src/sourcegen/mod.rs @@ -56,14 +56,7 @@ }); } SpecialName::Error => { - eprintln!("implicit error: {name}"); - kinds.define_token(TokenKind::Error { - grammar_name: token.to_owned(), - name: format!("ERROR_{name}"), - regex: None, - priority: None, - is_lexer_error: true, - }); + panic!("error token ERROR_{name} must be explicitly defined in jsonnet_kinds()"); } } continue; @@ -170,6 +163,24 @@ quote! {} }; + let error_desc_arms = kinds.tokens().filter_map(|t| { + if let TokenKind::Error { + name, description, .. + } = t + { + let ident = format_ident!("{name}"); + Some(quote! { #ident => ::core::option::Option::Some(#description) }) + } else { + None + } + }); + + let display_name_arms = kinds.tokens().map(|t| { + let ident = format_ident!("{}", t.name()); + let display = t.display_name(); + quote! { #ident => #display } + }); + let ast = quote! { #![allow(bad_style, missing_docs, unreachable_pub, clippy::manual_non_exhaustive, clippy::match_like_matches_macro)] @@ -200,6 +211,22 @@ #is_enum + pub fn error_description(self) -> Option<&'static str> { + match self { + #(#error_desc_arms,)* + LEXING_ERROR => ::core::option::Option::Some("unexpected character"), + _ => None, + } + } + + pub fn display_name(self) -> &'static str { + match self { + #(#display_name_arms,)* + LEXING_ERROR => "unexpected character", + _ => "unknown", + } + } + pub fn from_raw(r: u16) -> Self { assert!(r < Self::__LAST as u16); unsafe { std::mem::transmute(r) } -- gitstuff