difftreelog
refactor split lexer from rowan parser
in: master
15 files changed
Cargo.lockdiffbeforeafterboth--- a/Cargo.lock
+++ b/Cargo.lock
@@ -716,6 +716,13 @@
]
[[package]]
+name = "jrsonnet-lexer"
+version = "0.5.0-pre97"
+dependencies = [
+ "logos",
+]
+
+[[package]]
name = "jrsonnet-macros"
version = "0.5.0-pre97"
dependencies = [
@@ -744,7 +751,7 @@
"hi-doc",
"indoc",
"insta",
- "logos",
+ "jrsonnet-lexer",
"rowan",
"strip-ansi-escapes",
"thiserror",
Cargo.tomldiffbeforeafterboth--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,9 +81,6 @@
itertools = "0.14.0"
xshell = "0.2.7"
-lsp-server = "0.7.9"
-lsp-types = "0.97.0"
-
regex = "1.12"
lru = "0.16.3"
crates/jrsonnet-lexer/Cargo.tomldiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "jrsonnet-lexer"
+authors.workspace = true
+edition.workspace = true
+license.workspace = true
+repository.workspace = true
+version.workspace = true
+
+[dependencies]
+logos.workspace = true
+
+[lints]
+workspace = true
crates/jrsonnet-lexer/src/generated/mod.rsdiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/src/generated/mod.rs
@@ -0,0 +1 @@
+pub mod syntax_kinds;
crates/jrsonnet-lexer/src/generated/syntax_kinds.rsdiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/src/generated/syntax_kinds.rs
@@ -0,0 +1,210 @@
+//! This is a generated file, please do not edit manually. Changes can be
+//! made in codegeneration that lives in `xtask` top-level dir.
+
+#![allow(
+ bad_style,
+ missing_docs,
+ unreachable_pub,
+ clippy::manual_non_exhaustive,
+ clippy::match_like_matches_macro
+)]
+#[doc = r" The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`."]
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, logos :: Logos)]
+#[repr(u16)]
+pub enum SyntaxKind {
+ #[doc(hidden)]
+ TOMBSTONE,
+ #[doc(hidden)]
+ EOF,
+ #[token("||")]
+ OR,
+ #[token("??")]
+ NULL_COAELSE,
+ #[token("&&")]
+ AND,
+ #[token("|")]
+ BIT_OR,
+ #[token("^")]
+ BIT_XOR,
+ #[token("&")]
+ BIT_AND,
+ #[token("==")]
+ EQ,
+ #[token("!=")]
+ NE,
+ #[token("<")]
+ LT,
+ #[token(">")]
+ GT,
+ #[token("<=")]
+ LE,
+ #[token(">=")]
+ GE,
+ #[token("<<")]
+ LHS,
+ #[token(">>")]
+ RHS,
+ #[token("+")]
+ PLUS,
+ #[token("-")]
+ MINUS,
+ #[token("*")]
+ MUL,
+ #[token("/")]
+ DIV,
+ #[token("%")]
+ MODULO,
+ #[token("!")]
+ NOT,
+ #[token("~")]
+ BIT_NOT,
+ #[token("[")]
+ L_BRACK,
+ #[token("]")]
+ R_BRACK,
+ #[token("(")]
+ L_PAREN,
+ #[token(")")]
+ R_PAREN,
+ #[token("{")]
+ L_BRACE,
+ #[token("}")]
+ R_BRACE,
+ #[token(":")]
+ COLON,
+ #[token("::")]
+ COLONCOLON,
+ #[token(":::")]
+ COLONCOLONCOLON,
+ #[token(";")]
+ SEMI,
+ #[token(".")]
+ DOT,
+ #[token("...")]
+ DOTDOTDOT,
+ #[token(",")]
+ COMMA,
+ #[token("$")]
+ DOLLAR,
+ #[token("=")]
+ ASSIGN,
+ #[token("?")]
+ QUESTION_MARK,
+ #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")]
+ FLOAT,
+ #[regex("(?:0|[1-9][0-9]*)\\.[^0-9]")]
+ ERROR_FLOAT_JUNK_AFTER_POINT,
+ #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][^+\\-0-9]")]
+ ERROR_FLOAT_JUNK_AFTER_EXPONENT,
+ #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][+-][^0-9]")]
+ ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN,
+ #[regex("\"(?s:[^\"\\\\]|\\\\.)*\"")]
+ STRING_DOUBLE,
+ #[regex("\"(?s:[^\"\\\\]|\\\\.)*")]
+ ERROR_STRING_DOUBLE_UNTERMINATED,
+ #[regex("'(?s:[^'\\\\]|\\\\.)*'")]
+ STRING_SINGLE,
+ #[regex("'(?s:[^'\\\\]|\\\\.)*")]
+ ERROR_STRING_SINGLE_UNTERMINATED,
+ #[regex("@\"(?:[^\"]|\"\")*\"")]
+ STRING_DOUBLE_VERBATIM,
+ #[regex("@\"(?:[^\"]|\"\")*")]
+ ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED,
+ #[regex("@'(?:[^']|'')*'")]
+ STRING_SINGLE_VERBATIM,
+ #[regex("@'(?:[^']|'')*")]
+ ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED,
+ #[regex("@[^\"'\\s]\\S+")]
+ ERROR_STRING_VERBATIM_MISSING_QUOTES,
+ #[regex("\\|\\|\\|", crate::string_block::lex_str_block_test)]
+ STRING_BLOCK,
+ ERROR_STRING_BLOCK_UNEXPECTED_END,
+ ERROR_STRING_BLOCK_MISSING_NEW_LINE,
+ ERROR_STRING_BLOCK_MISSING_TERMINATION,
+ ERROR_STRING_BLOCK_MISSING_INDENT,
+ #[regex("[_a-zA-Z][_a-zA-Z0-9]*")]
+ IDENT,
+ #[regex("[ \\t\\n\\r]+")]
+ WHITESPACE,
+ #[regex("//[^\\r\\n]*?(\\r\\n|\\n)?")]
+ SINGLE_LINE_SLASH_COMMENT,
+ #[regex("#[^\\r\\n]*?(\\r\\n|\\n)?")]
+ SINGLE_LINE_HASH_COMMENT,
+ #[regex("/\\*([^*]|\\*[^/])*\\*/")]
+ MULTI_LINE_COMMENT,
+ #[regex("/\\*/")]
+ ERROR_COMMENT_TOO_SHORT,
+ #[regex("/\\*([^*/]|\\*[^/])+")]
+ ERROR_COMMENT_UNTERMINATED,
+ #[token("tailstrict")]
+ TAILSTRICT_KW,
+ #[token("local")]
+ LOCAL_KW,
+ #[token("importstr")]
+ IMPORTSTR_KW,
+ #[token("importbin")]
+ IMPORTBIN_KW,
+ #[token("import")]
+ IMPORT_KW,
+ #[token("if")]
+ IF_KW,
+ #[token("then")]
+ THEN_KW,
+ #[token("else")]
+ ELSE_KW,
+ #[token("function")]
+ FUNCTION_KW,
+ #[token("error")]
+ ERROR_KW,
+ #[token("in")]
+ IN_KW,
+ META_OBJECT_APPLY,
+ ERROR_NO_OPERATOR,
+ #[token("null")]
+ NULL_KW,
+ #[token("true")]
+ TRUE_KW,
+ #[token("false")]
+ FALSE_KW,
+ #[token("self")]
+ SELF_KW,
+ #[token("super")]
+ SUPER_KW,
+ #[token("for")]
+ FOR_KW,
+ #[token("assert")]
+ ASSERT_KW,
+ ERROR_MISSING_TOKEN,
+ ERROR_UNEXPECTED_TOKEN,
+ ERROR_CUSTOM,
+ LEXING_ERROR,
+ __LAST_TOKEN,
+ #[doc(hidden)]
+ __LAST,
+}
+use self::SyntaxKind::*;
+impl SyntaxKind {
+ pub fn is_keyword(self) -> bool {
+ match self {
+ OR | NULL_COAELSE | AND | BIT_OR | BIT_XOR | BIT_AND | EQ | NE | LT | GT | LE | GE
+ | LHS | RHS | PLUS | MINUS | MUL | DIV | MODULO | NOT | BIT_NOT | L_BRACK | R_BRACK
+ | L_PAREN | R_PAREN | L_BRACE | R_BRACE | COLON | COLONCOLON | COLONCOLONCOLON
+ | SEMI | DOT | DOTDOTDOT | COMMA | DOLLAR | ASSIGN | QUESTION_MARK | TAILSTRICT_KW
+ | LOCAL_KW | IMPORTSTR_KW | IMPORTBIN_KW | IMPORT_KW | IF_KW | THEN_KW | ELSE_KW
+ | FUNCTION_KW | ERROR_KW | IN_KW | NULL_KW | TRUE_KW | FALSE_KW | SELF_KW
+ | SUPER_KW | FOR_KW | ASSERT_KW => true,
+ _ => false,
+ }
+ }
+ pub fn from_raw(r: u16) -> Self {
+ assert!(r < Self::__LAST as u16);
+ unsafe { std::mem::transmute(r) }
+ }
+ pub fn into_raw(self) -> u16 {
+ self as u16
+ }
+}
+#[macro_export]
+macro_rules ! T { [||] => { $ crate :: SyntaxKind :: OR } ; [??] => { $ crate :: SyntaxKind :: NULL_COAELSE } ; [&&] => { $ crate :: SyntaxKind :: AND } ; [|] => { $ crate :: SyntaxKind :: BIT_OR } ; [^] => { $ crate :: SyntaxKind :: BIT_XOR } ; [&] => { $ crate :: SyntaxKind :: BIT_AND } ; [==] => { $ crate :: SyntaxKind :: EQ } ; [!=] => { $ crate :: SyntaxKind :: NE } ; [<] => { $ crate :: SyntaxKind :: LT } ; [>] => { $ crate :: SyntaxKind :: GT } ; [<=] => { $ crate :: SyntaxKind :: LE } ; [>=] => { $ crate :: SyntaxKind :: GE } ; [<<] => { $ crate :: SyntaxKind :: LHS } ; [>>] => { $ crate :: SyntaxKind :: RHS } ; [+] => { $ crate :: SyntaxKind :: PLUS } ; [-] => { $ crate :: SyntaxKind :: MINUS } ; [*] => { $ crate :: SyntaxKind :: MUL } ; [/] => { $ crate :: SyntaxKind :: DIV } ; [%] => { $ crate :: SyntaxKind :: MODULO } ; [!] => { $ crate :: SyntaxKind :: NOT } ; [~] => { $ crate :: SyntaxKind :: BIT_NOT } ; ['['] => { $ crate :: SyntaxKind :: L_BRACK } ; [']'] => { $ crate :: SyntaxKind :: R_BRACK } ; ['('] => { $ crate :: SyntaxKind :: L_PAREN } ; [')'] => { $ crate :: SyntaxKind :: R_PAREN } ; ['{'] => { $ crate :: SyntaxKind :: L_BRACE } ; ['}'] => { $ crate :: SyntaxKind :: R_BRACE } ; [:] => { $ crate :: SyntaxKind :: COLON } ; [::] => { $ crate :: SyntaxKind :: COLONCOLON } ; [:::] => { $ crate :: SyntaxKind :: COLONCOLONCOLON } ; [;] => { $ crate :: SyntaxKind :: SEMI } ; [.] => { $ crate :: SyntaxKind :: DOT } ; [...] => { $ crate :: SyntaxKind :: DOTDOTDOT } ; [,] => { $ crate :: SyntaxKind :: COMMA } ; ['$'] => { $ crate :: SyntaxKind :: DOLLAR } ; [=] => { $ crate :: SyntaxKind :: ASSIGN } ; [?] 
=> { $ crate :: SyntaxKind :: QUESTION_MARK } ; [tailstrict] => { $ crate :: SyntaxKind :: TAILSTRICT_KW } ; [local] => { $ crate :: SyntaxKind :: LOCAL_KW } ; [importstr] => { $ crate :: SyntaxKind :: IMPORTSTR_KW } ; [importbin] => { $ crate :: SyntaxKind :: IMPORTBIN_KW } ; [import] => { $ crate :: SyntaxKind :: IMPORT_KW } ; [if] => { $ crate :: SyntaxKind :: IF_KW } ; [then] => { $ crate :: SyntaxKind :: THEN_KW } ; [else] => { $ crate :: SyntaxKind :: ELSE_KW } ; [function] => { $ crate :: SyntaxKind :: FUNCTION_KW } ; [error] => { $ crate :: SyntaxKind :: ERROR_KW } ; [in] => { $ crate :: SyntaxKind :: IN_KW } ; [null] => { $ crate :: SyntaxKind :: NULL_KW } ; [true] => { $ crate :: SyntaxKind :: TRUE_KW } ; [false] => { $ crate :: SyntaxKind :: FALSE_KW } ; [self] => { $ crate :: SyntaxKind :: SELF_KW } ; [super] => { $ crate :: SyntaxKind :: SUPER_KW } ; [for] => { $ crate :: SyntaxKind :: FOR_KW } ; [assert] => { $ crate :: SyntaxKind :: ASSERT_KW } }
+#[allow(unused_imports)]
+pub use T;
crates/jrsonnet-lexer/src/lex.rsdiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/src/lex.rs
@@ -0,0 +1,78 @@
+use core::ops::Range;
+
+use logos::Logos;
+// use rowan::{TextRange, TextSize};
+
+use crate::{
+ generated::syntax_kinds::SyntaxKind,
+ string_block::{lex_str_block, StringBlockError},
+ Span,
+};
+
+/// Token stream over a Jsonnet source string.
+///
+/// Wraps the `logos` lexer derived for [`SyntaxKind`]; iterating yields [`Lexeme`]s.
+pub struct Lexer<'a> {
+    inner: logos::Lexer<'a, SyntaxKind>,
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a lexer positioned at the start of `input`.
+    pub fn new(input: &'a str) -> Self {
+        let inner = SyntaxKind::lexer(input);
+        Self { inner }
+    }
+}
+
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Lexeme<'a>;
+
+    /// Produces the next lexeme, or `None` once the input is exhausted.
+    ///
+    /// Input that `logos` cannot match is reported as `LEXING_ERROR`. A malformed text
+    /// block is reported with the matching `ERROR_STRING_BLOCK_*` kind instead of
+    /// `STRING_BLOCK`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a token offset does not fit in `u32`, rather than silently wrapping
+    /// the offset.
+    fn next(&mut self) -> Option<Self::Item> {
+        use SyntaxKind::*;
+
+        let mut kind = self.inner.next()?;
+        let text = self.inner.slice();
+
+        if kind == Ok(STRING_BLOCK) {
+            // The logos callback only consumes the bytes of the block and discards the
+            // outcome, so the block lexer is run a second time over the matched text to
+            // find out whether (and how) the block is malformed.
+            let mut lexer = logos::Lexer::<SyntaxKind>::new(text);
+            // The matched text always starts with the opening `|||`.
+            lexer.bump(3);
+            let res = lex_str_block(&mut lexer);
+            let next = lexer.next();
+            assert!(next.is_none(), "str_block is lexed");
+            if let Err(e) = res {
+                kind = Ok(match e {
+                    StringBlockError::UnexpectedEnd => ERROR_STRING_BLOCK_UNEXPECTED_END,
+                    StringBlockError::MissingNewLine => ERROR_STRING_BLOCK_MISSING_NEW_LINE,
+                    StringBlockError::MissingTermination => {
+                        ERROR_STRING_BLOCK_MISSING_TERMINATION
+                    }
+                    StringBlockError::MissingIndent => ERROR_STRING_BLOCK_MISSING_INDENT,
+                });
+            }
+        }
+
+        let Range { start, end } = self.inner.span();
+        // `Span` stores `u32` offsets. A plain `as u32` cast would wrap for inputs larger
+        // than `u32::MAX` bytes and yield ranges pointing at the wrong text, so convert
+        // with a check. The fully qualified path keeps this independent of the prelude.
+        let offset = |pos: usize| -> u32 {
+            core::convert::TryFrom::try_from(pos).expect("source offset does not fit in u32")
+        };
+
+        Some(Lexeme {
+            kind: kind.unwrap_or(SyntaxKind::LEXING_ERROR),
+            text,
+            range: Span(offset(start), offset(end)),
+        })
+    }
+}
+
+/// A single token produced by [`Lexer`].
+#[derive(Debug, Clone, Copy)]
+pub struct Lexeme<'s> {
+    /// Kind assigned by the lexer; error kinds are included.
+    pub kind: SyntaxKind,
+    /// Slice of the input covered by this token.
+    pub text: &'s str,
+    /// Start and end byte offsets of `text` within the input.
+    pub range: Span,
+}
+
+/// Lexes all of `input` eagerly and returns the lexemes in source order.
+pub fn lex(input: &str) -> Vec<Lexeme<'_>> {
+    let lexer = Lexer::new(input);
+    lexer.collect()
+}
crates/jrsonnet-lexer/src/lib.rsdiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/src/lib.rs
@@ -0,0 +1,8 @@
+//! Standalone Jsonnet lexer: token kinds, the lexer and the text block (`|||`) helpers.
+
+mod generated;
+mod lex;
+mod string_block;
+
+/// Half-open byte range `(start, end)` of a lexeme within the lexed input.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct Span(pub u32, pub u32);
+
+// `SyntaxKind` must be nameable at the crate root: the exported `T!` macro expands to
+// `$crate::SyntaxKind::…`, and it is the type of the public `Lexeme::kind` field.
+pub use generated::syntax_kinds::SyntaxKind;
+pub use lex::{lex, Lexeme, Lexer};
+// The text block helpers used to be re-exported by the parser crate; keep them
+// reachable from their new home.
+pub use string_block::{collect_lexed_str_block, CollectStrBlock, StringBlockError};
crates/jrsonnet-lexer/src/string_block.rsdiffbeforeafterboth--- /dev/null
+++ b/crates/jrsonnet-lexer/src/string_block.rs
@@ -0,0 +1,282 @@
+/// Reasons a `|||` text block can fail to lex.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum StringBlockError {
+    /// The input ended before the block was terminated.
+    UnexpectedEnd,
+    /// The opening `|||` (plus optional `-` and blanks) is not followed by a line break.
+    MissingNewLine,
+    /// A line that lacks the block's indentation does not start with `|||`.
+    MissingTermination,
+    /// The first content line of the block does not begin with a space or tab.
+    MissingIndent,
+}
+
+use logos::Lexer;
+use StringBlockError::*;
+
+use crate::generated::syntax_kinds::SyntaxKind;
+
+/// `logos` callback for `STRING_BLOCK`: consumes the text block that follows the
+/// opening `|||`.
+///
+/// The outcome is discarded on purpose; the callback only has to advance the lexer.
+pub(crate) fn lex_str_block_test(lex: &mut Lexer<'_, SyntaxKind>) {
+    let _outcome = lex_str_block(lex);
+}
+
+/// Cursor over the text that follows the opening `|||` of a text block.
+///
+/// `index` is a byte offset into `source`. `rest` slices `source` at `index`, so the
+/// offset has to stay on a `char` boundary.
+pub(crate) struct Context<'a> {
+    source: &'a str,
+    index: usize,
+}
+
+impl<'a> Context<'a> {
+    /// Returns the part of the source that has not been consumed yet.
+    fn rest(&self) -> &'a str {
+        &self.source[self.index..]
+    }
+
+    /// Consumes and returns the next character, or `None` at the end of the input.
+    fn next(&mut self) -> Option<char> {
+        let c = self.peek()?;
+        self.index += c.len_utf8();
+        Some(c)
+    }
+
+    /// Returns the next character without consuming it.
+    fn peek(&self) -> Option<char> {
+        self.rest().chars().next()
+    }
+
+    /// Consumes the next character if it satisfies `f`.
+    ///
+    /// Returns the number of bytes consumed, which is 0 when nothing matched.
+    fn eat_if(&mut self, f: impl Fn(char) -> bool) -> usize {
+        match self.peek() {
+            Some(c) if f(c) => {
+                // Advance by the encoded width of the character. Stepping a single byte
+                // would leave the cursor inside a multi-byte character and make the next
+                // `rest()` call panic.
+                let width = c.len_utf8();
+                self.index += width;
+                width
+            }
+            _ => 0,
+        }
+    }
+
+    /// Consumes characters for as long as they satisfy `f`.
+    ///
+    /// Returns the number of bytes consumed.
+    fn eat_while(&mut self, f: impl Fn(char) -> bool) -> usize {
+        let rest = self.rest();
+        let eaten = rest
+            .char_indices()
+            .find(|&(_, c)| !f(c))
+            .map_or(rest.len(), |(at, _)| at);
+        self.index += eaten;
+        eaten
+    }
+
+    /// Advances the cursor by `len` bytes, stopping at the end of the source.
+    ///
+    /// The caller must make sure the skipped bytes end on a `char` boundary.
+    fn skip(&mut self, len: usize) {
+        // Saturate so that an oversized `len` clamps to the end instead of overflowing.
+        self.index = self.index.saturating_add(len).min(self.source.len());
+    }
+}
+
+/// Measures the leading run of spaces and tabs in `a` and checks that `b` starts with
+/// exactly the same bytes.
+///
+/// Returns the length of that run in bytes when `b` shares it, and 0 otherwise. A
+/// result of 0 therefore also covers the case where `a` has no leading whitespace.
+fn check_whitespace(a: &str, b: &str) -> usize {
+    let indent_len = a
+        .bytes()
+        .take_while(|&byte| matches!(byte, b' ' | b'\t'))
+        .count();
+    let indent = &a.as_bytes()[..indent_len];
+
+    if b.as_bytes().starts_with(indent) {
+        indent_len
+    } else {
+        0
+    }
+}
+
+/// Sink that `lex_str_block` drives while scanning a text block.
+///
+/// Implemented for the `logos` lexer (which only advances the lexer) and for
+/// `CollectStrBlock` (which records the lines of the block).
+pub(crate) trait StrBlockLexCtx<'d> {
+ /// Text to scan; `lex_str_block` reads it once, as the input following the opening `|||`.
+ fn remainder(&self) -> &'d str;
+ /// Called on error paths before the error is returned, with `ctx` positioned where
+ /// scanning stopped. The end-of-input case while looking for the terminator calls
+ /// `bump_pos` instead.
+ fn eat_error(&mut self, ctx: &Context<'d>);
+ /// Reports that `s` bytes of `remainder()` were consumed.
+ fn bump_pos(&mut self, s: usize);
+ /// Called when the opening `|||` is directly followed by `-`.
+ fn mark_truncating(&mut self);
+ /// Called once per content line (block indentation and line break removed) and with
+ /// `""` for every blank line that follows a content line.
+ fn mark_line(&mut self, line: &'d str);
+}
+
+/// Drives the real `logos` lexer: only the consumed length matters, nothing is collected.
+impl<'d> StrBlockLexCtx<'d> for Lexer<'d, SyntaxKind> {
+    fn remainder(&self) -> &'d str {
+        // Inherent methods win over trait methods, so this calls `logos::Lexer::remainder`
+        // and does not recurse.
+        self.remainder()
+    }
+
+    fn eat_error(&mut self, ctx: &Context<'d>) {
+        // Swallow everything up to and including the next `|||`, or the whole rest of
+        // the input when there is none, so the broken block becomes a single token.
+        let unread = ctx.rest();
+        let tail = match unread.find("|||") {
+            Some(at) => at + 3,
+            None => unread.len(),
+        };
+        self.bump(ctx.index + tail);
+    }
+
+    fn bump_pos(&mut self, s: usize) {
+        self.bump(s);
+    }
+
+    fn mark_truncating(&mut self) {
+        // Lexer test doesn't collect anything
+    }
+
+    fn mark_line(&mut self, _line: &'d str) {
+        // Lexer test doesn't collect anything
+    }
+}
+
+/// Scans a text block and collects its content lines.
+///
+/// `input` is scanned the way `lex_str_block` scans it: starting right after the opening
+/// `|||`, with an optional `-` coming first.
+///
+/// # Errors
+///
+/// Returns the [`StringBlockError`] reported by `lex_str_block` when the block is malformed.
+pub fn collect_lexed_str_block(input: &str) -> Result<CollectStrBlock<'_>, StringBlockError> {
+    let mut collected = CollectStrBlock {
+        input,
+        offset: 0,
+        truncate: false,
+        lines: Vec::new(),
+    };
+    lex_str_block(&mut collected).map(|()| collected)
+}
+
+/// Result of scanning a text block with `collect_lexed_str_block`.
+pub struct CollectStrBlock<'s> {
+    /// Set when the opening `|||` is directly followed by `-`.
+    pub truncate: bool,
+    /// Content lines with the block indentation and the terminating `\n` removed, in
+    /// source order. Blank lines that follow a content line are stored as `""`.
+    pub lines: Vec<&'s str>,
+    input: &'s str,
+    offset: usize,
+}
+
+impl<'d> StrBlockLexCtx<'d> for CollectStrBlock<'d> {
+    fn remainder(&self) -> &'d str {
+        // Honor the bytes already reported through `bump_pos`, so this agrees with the
+        // `logos` lexer, whose remainder shrinks as it is bumped. The offset is 0 when
+        // `lex_str_block` reads the remainder, so existing callers see no change.
+        self.input.get(self.offset..).unwrap_or("")
+    }
+
+    fn eat_error(&mut self, _ctx: &Context<'d>) {
+        // Error will be returned, no need to record it here
+    }
+
+    fn bump_pos(&mut self, s: usize) {
+        self.offset += s;
+    }
+
+    fn mark_truncating(&mut self) {
+        self.truncate = true;
+    }
+
+    fn mark_line(&mut self, line: &'d str) {
+        self.lines.push(line);
+    }
+}
+
+/// Scans a text block whose opening `|||` has already been consumed.
+///
+/// Expected layout of `lex.remainder()`: an optional `-`, optional blanks, a line break,
+/// optional empty lines, then content lines that all start with the leading whitespace of
+/// the first content line, and finally a line holding an optionally indented `|||`.
+///
+/// On success the number of consumed bytes is reported through `bump_pos`.
+///
+/// # Errors
+///
+/// - `UnexpectedEnd`: the input ends before the first line break, inside a content line,
+///   or where the terminator is expected.
+/// - `MissingNewLine`: something other than a line break follows the opening.
+/// - `MissingIndent`: the first content line has no leading space or tab.
+/// - `MissingTermination`: a line without the block indentation does not start with `|||`.
+pub(crate) fn lex_str_block<'a>(lex: &mut impl StrBlockLexCtx<'a>) -> Result<(), StringBlockError> {
+ // The opening `|||` is not part of `remainder()`; scanning starts right after it.
+ let mut ctx = Context::<'a> {
+ source: lex.remainder(),
+ index: 0,
+ };
+
+ // `|||-` marks a truncating block
+ if ctx.eat_if(|v| v == '-') != 0 {
+ lex.mark_truncating();
+ }
+
+ // Skip blanks (and a `\r` of a CRLF line ending) after the opening
+ ctx.eat_while(|r| r == ' ' || r == '\t' || r == '\r');
+
+ // Skip \n
+ match ctx.next() {
+ Some('\n') => (),
+ None => {
+ lex.eat_error(&ctx);
+ return Err(UnexpectedEnd);
+ }
+ // Text block requires new line after |||.
+ Some(_) => {
+ lex.eat_error(&ctx);
+ return Err(MissingNewLine);
+ }
+ }
+
+ // Process leading blank lines before calculating string block indent
+ while ctx.peek() == Some('\n') {
+ ctx.next();
+ }
+
+ // Comparing the rest with itself yields the length of its own leading whitespace,
+ // which becomes the indentation every content line must start with.
+ let mut num_whitespace = check_whitespace(ctx.rest(), ctx.rest());
+ let str_block_indent = &ctx.rest()[..num_whitespace];
+
+ if num_whitespace == 0 {
+ // Text block's first line must start with whitespace
+ lex.eat_error(&ctx);
+ return Err(MissingIndent);
+ }
+
+ loop {
+ debug_assert_ne!(num_whitespace, 0, "Unexpected value for num_whitespace");
+ // Drop the block indentation of the current line
+ ctx.skip(num_whitespace);
+
+ let line_start = ctx.index;
+ // Byte length of the line content, excluding the terminating `\n`
+ let mut line_size = 0;
+ loop {
+ match ctx.next() {
+ None => {
+ lex.eat_error(&ctx);
+ return Err(UnexpectedEnd);
+ }
+ Some('\n') => {
+ lex.mark_line(&ctx.source[line_start..line_start + line_size]);
+ break;
+ }
+ Some(c) => {
+ line_size += c.len_utf8();
+ }
+ }
+ }
+
+ // Blank lines inside the block need no indentation; record each as an empty line
+ while ctx.peek() == Some('\n') {
+ lex.mark_line("");
+ ctx.next();
+ }
+
+ // Look at the next line
+ num_whitespace = check_whitespace(str_block_indent, ctx.rest());
+ if num_whitespace == 0 {
+ // End of the text block
+ // The terminator may carry its own (different) indentation; skip it.
+ while let Some(' ' | '\t') = ctx.peek() {
+ ctx.next().unwrap();
+ }
+
+ if !ctx.rest().starts_with("|||") {
+ if ctx.rest().is_empty() {
+ // Nothing is left to skip, so report the consumed length directly
+ lex.bump_pos(ctx.index);
+ return Err(UnexpectedEnd);
+ }
+ lex.eat_error(&ctx);
+ return Err(MissingTermination);
+ }
+
+ // Skip '|||'
+ ctx.skip(3);
+ break;
+ }
+ }
+
+ lex.bump_pos(ctx.index);
+ Ok(())
+}
crates/jrsonnet-rowan-parser/Cargo.tomldiffbeforeafterboth--- a/crates/jrsonnet-rowan-parser/Cargo.toml
+++ b/crates/jrsonnet-rowan-parser/Cargo.toml
@@ -14,7 +14,7 @@
drop_bomb.workspace = true
hi-doc.workspace = true
indoc.workspace = true
-logos.workspace = true
+jrsonnet-lexer = { version = "0.5.0-pre97", path = "../jrsonnet-lexer" }
rowan.workspace = true
thiserror.workspace = true
crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rsdiffbeforeafterboth--- a/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs
+++ b/crates/jrsonnet-rowan-parser/src/generated/syntax_kinds.rs
@@ -8,172 +8,95 @@
clippy::manual_non_exhaustive,
clippy::match_like_matches_macro
)]
-use logos::Logos;
#[doc = r" The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`."]
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Logos)]
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
#[repr(u16)]
pub enum SyntaxKind {
#[doc(hidden)]
TOMBSTONE,
#[doc(hidden)]
EOF,
- #[token("||")]
OR,
- #[token("??")]
NULL_COAELSE,
- #[token("&&")]
AND,
- #[token("|")]
BIT_OR,
- #[token("^")]
BIT_XOR,
- #[token("&")]
BIT_AND,
- #[token("==")]
EQ,
- #[token("!=")]
NE,
- #[token("<")]
LT,
- #[token(">")]
GT,
- #[token("<=")]
LE,
- #[token(">=")]
GE,
- #[token("<<")]
LHS,
- #[token(">>")]
RHS,
- #[token("+")]
PLUS,
- #[token("-")]
MINUS,
- #[token("*")]
MUL,
- #[token("/")]
DIV,
- #[token("%")]
MODULO,
- #[token("!")]
NOT,
- #[token("~")]
BIT_NOT,
- #[token("[")]
L_BRACK,
- #[token("]")]
R_BRACK,
- #[token("(")]
L_PAREN,
- #[token(")")]
R_PAREN,
- #[token("{")]
L_BRACE,
- #[token("}")]
R_BRACE,
- #[token(":")]
COLON,
- #[token("::")]
COLONCOLON,
- #[token(":::")]
COLONCOLONCOLON,
- #[token(";")]
SEMI,
- #[token(".")]
DOT,
- #[token("...")]
DOTDOTDOT,
- #[token(",")]
COMMA,
- #[token("$")]
DOLLAR,
- #[token("=")]
ASSIGN,
- #[token("?")]
QUESTION_MARK,
- #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")]
FLOAT,
- #[regex("(?:0|[1-9][0-9]*)\\.[^0-9]")]
ERROR_FLOAT_JUNK_AFTER_POINT,
- #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][^+\\-0-9]")]
ERROR_FLOAT_JUNK_AFTER_EXPONENT,
- #[regex("(?:0|[1-9][0-9]*)(?:\\.[0-9]+)?[eE][+-][^0-9]")]
ERROR_FLOAT_JUNK_AFTER_EXPONENT_SIGN,
- #[regex("\"(?s:[^\"\\\\]|\\\\.)*\"")]
STRING_DOUBLE,
- #[regex("\"(?s:[^\"\\\\]|\\\\.)*")]
ERROR_STRING_DOUBLE_UNTERMINATED,
- #[regex("'(?s:[^'\\\\]|\\\\.)*'")]
STRING_SINGLE,
- #[regex("'(?s:[^'\\\\]|\\\\.)*")]
ERROR_STRING_SINGLE_UNTERMINATED,
- #[regex("@\"(?:[^\"]|\"\")*\"")]
STRING_DOUBLE_VERBATIM,
- #[regex("@\"(?:[^\"]|\"\")*")]
ERROR_STRING_DOUBLE_VERBATIM_UNTERMINATED,
- #[regex("@'(?:[^']|'')*'")]
STRING_SINGLE_VERBATIM,
- #[regex("@'(?:[^']|'')*")]
ERROR_STRING_SINGLE_VERBATIM_UNTERMINATED,
- #[regex("@[^\"'\\s]\\S+")]
ERROR_STRING_VERBATIM_MISSING_QUOTES,
- #[regex("\\|\\|\\|", crate::string_block::lex_str_block_test)]
STRING_BLOCK,
ERROR_STRING_BLOCK_UNEXPECTED_END,
ERROR_STRING_BLOCK_MISSING_NEW_LINE,
ERROR_STRING_BLOCK_MISSING_TERMINATION,
ERROR_STRING_BLOCK_MISSING_INDENT,
- #[regex("[_a-zA-Z][_a-zA-Z0-9]*")]
IDENT,
- #[regex("[ \\t\\n\\r]+")]
WHITESPACE,
- #[regex("//[^\\r\\n]*?(\\r\\n|\\n)?")]
SINGLE_LINE_SLASH_COMMENT,
- #[regex("#[^\\r\\n]*?(\\r\\n|\\n)?")]
SINGLE_LINE_HASH_COMMENT,
- #[regex("/\\*([^*]|\\*[^/])*\\*/")]
MULTI_LINE_COMMENT,
- #[regex("/\\*/")]
ERROR_COMMENT_TOO_SHORT,
- #[regex("/\\*([^*/]|\\*[^/])+")]
ERROR_COMMENT_UNTERMINATED,
- #[token("tailstrict")]
TAILSTRICT_KW,
- #[token("local")]
LOCAL_KW,
- #[token("importstr")]
IMPORTSTR_KW,
- #[token("importbin")]
IMPORTBIN_KW,
- #[token("import")]
IMPORT_KW,
- #[token("if")]
IF_KW,
- #[token("then")]
THEN_KW,
- #[token("else")]
ELSE_KW,
- #[token("function")]
FUNCTION_KW,
- #[token("error")]
ERROR_KW,
- #[token("in")]
IN_KW,
META_OBJECT_APPLY,
ERROR_NO_OPERATOR,
- #[token("null")]
NULL_KW,
- #[token("true")]
TRUE_KW,
- #[token("false")]
FALSE_KW,
- #[token("self")]
SELF_KW,
- #[token("super")]
SUPER_KW,
- #[token("for")]
FOR_KW,
- #[token("assert")]
ASSERT_KW,
ERROR_MISSING_TOKEN,
ERROR_UNEXPECTED_TOKEN,
crates/jrsonnet-rowan-parser/src/lex.rsdiffbeforeafterboth--- a/crates/jrsonnet-rowan-parser/src/lex.rs
+++ b/crates/jrsonnet-rowan-parser/src/lex.rs
@@ -1,81 +1,19 @@
-use core::ops::Range;
-use std::convert::TryFrom;
-
-use logos::Logos;
+use jrsonnet_lexer::Lexer;
use rowan::{TextRange, TextSize};
-use crate::{
- string_block::{lex_str_block, StringBlockError},
- SyntaxKind,
-};
-
-pub struct Lexer<'a> {
- inner: logos::Lexer<'a, SyntaxKind>,
-}
-
-impl<'a> Lexer<'a> {
- pub fn new(input: &'a str) -> Self {
- Self {
- inner: SyntaxKind::lexer(input),
- }
- }
-}
-
-impl<'a> Iterator for Lexer<'a> {
- type Item = Lexeme<'a>;
-
- fn next(&mut self) -> Option<Self::Item> {
- use SyntaxKind::*;
-
- let mut kind = self.inner.next()?;
- let text = self.inner.slice();
-
- if kind == Ok(STRING_BLOCK) {
- // We use custom lexer, which skips enough bytes, but not returns error
- // Instead we should call lexer again to verify if there is something wrong with string block
- let mut lexer = logos::Lexer::<SyntaxKind>::new(text);
- // In kinds, string blocks is parsed at least as `|||`
- lexer.bump(3);
- let res = lex_str_block(&mut lexer);
- let next = lexer.next();
- assert!(next.is_none(), "str_block is lexed");
- match res {
- Ok(()) => {}
- Err(e) => {
- kind = Ok(match e {
- StringBlockError::UnexpectedEnd => ERROR_STRING_BLOCK_UNEXPECTED_END,
- StringBlockError::MissingNewLine => ERROR_STRING_BLOCK_MISSING_NEW_LINE,
- StringBlockError::MissingTermination => {
- ERROR_STRING_BLOCK_MISSING_TERMINATION
- }
- StringBlockError::MissingIndent => ERROR_STRING_BLOCK_MISSING_INDENT,
- });
- }
- }
- }
-
- Some(Self::Item {
- kind: kind.unwrap_or(SyntaxKind::LEXING_ERROR),
- text,
- range: {
- let Range { start, end } = self.inner.span();
-
- TextRange::new(
- TextSize::try_from(start).unwrap(),
- TextSize::try_from(end).unwrap(),
- )
- },
- })
- }
-}
+use crate::SyntaxKind;
+/// A token as seen by the parser: the lexer crate's lexeme, re-expressed with this
+/// crate's `SyntaxKind` and a rowan `TextRange`.
#[derive(Clone, Copy, Debug)]
-pub struct Lexeme<'i> {
+pub struct Lexeme<'s> {
pub kind: SyntaxKind,
- pub text: &'i str,
+ pub text: &'s str,
pub range: TextRange,
}
+/// Lexes `input` with `jrsonnet_lexer` and converts every lexeme to this crate's types.
pub fn lex(input: &str) -> Vec<Lexeme<'_>> {
- Lexer::new(input).collect()
+ Lexer::new(input).map(|l| Lexeme {
+ // NOTE(review): the round trip through the raw discriminant assumes both generated
+ // `SyntaxKind` enums list their token kinds in the same order — confirm that the
+ // xtask generator guarantees this.
+ kind: SyntaxKind::from_raw(l.kind.into_raw()),
+ text: l.text,
+ range: TextRange::new(TextSize::from(l.range.0), TextSize::from(l.range.1)),
+ }).collect()
}
crates/jrsonnet-rowan-parser/src/lib.rsdiffbeforeafterboth--- a/crates/jrsonnet-rowan-parser/src/lib.rs
+++ b/crates/jrsonnet-rowan-parser/src/lib.rs
@@ -2,7 +2,6 @@
use event::Sink;
use generated::nodes::{SourceFile, Trivia};
-use lex::lex;
use parser::{LocatedSyntaxError, Parser};
pub use rowan;
@@ -14,14 +13,12 @@
mod marker;
mod parser;
mod precedence;
-mod string_block;
mod tests;
mod token_set;
pub use ast::{AstChildren, AstNode, AstToken};
pub use generated::{nodes, syntax_kinds::SyntaxKind};
pub use language::*;
-pub use string_block::{collect_lexed_str_block, CollectStrBlock};
pub use token_set::SyntaxKindSet;
use self::{
@@ -30,7 +27,7 @@
};
pub fn parse(input: &str) -> (SourceFile, Vec<LocatedSyntaxError>) {
- let lexemes = lex(input);
+ let lexemes = lex::lex(input);
let kinds = lexemes
.iter()
.map(|l| l.kind)
crates/jrsonnet-rowan-parser/src/string_block.rsdiffbeforeafterboth--- a/crates/jrsonnet-rowan-parser/src/string_block.rs
+++ /dev/null
@@ -1,282 +0,0 @@
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum StringBlockError {
- UnexpectedEnd,
- MissingNewLine,
- MissingTermination,
- MissingIndent,
-}
-
-use logos::Lexer;
-use StringBlockError::*;
-
-use crate::SyntaxKind;
-
-pub(crate) fn lex_str_block_test(lex: &mut Lexer<'_, SyntaxKind>) {
- let _ = lex_str_block(lex);
-}
-
-pub(crate) struct Context<'a> {
- source: &'a str,
- index: usize,
-}
-
-impl<'a> Context<'a> {
- fn rest(&self) -> &'a str {
- &self.source[self.index..]
- }
-
- fn next(&mut self) -> Option<char> {
- if self.index == self.source.len() {
- return None;
- }
-
- match self.rest().chars().next() {
- None => None,
- Some(c) => {
- self.index += c.len_utf8();
- Some(c)
- }
- }
- }
-
- fn peek(&self) -> Option<char> {
- if self.index == self.source.len() {
- return None;
- }
-
- self.rest().chars().next()
- }
-
- fn eat_if(&mut self, f: impl Fn(char) -> bool) -> usize {
- if self.peek().is_some_and(f) {
- self.index += 1;
- return 1;
- }
- 0
- }
-
- fn eat_while(&mut self, f: impl Fn(char) -> bool) -> usize {
- if self.index == self.source.len() {
- return 0;
- }
-
- let next_char = self.rest().char_indices().find(|(_, c)| !f(*c));
-
- match next_char {
- None => {
- let diff = self.source.len() - self.index;
- self.index = self.source.len();
- diff
- }
- Some((idx, _)) => {
- self.index += idx;
- idx
- }
- }
- }
-
- fn skip(&mut self, len: usize) {
- self.index = match self.index + len {
- n if n > self.source.len() => self.source.len(),
- n => n,
- };
- }
-}
-
-// Check that b has at least the same whitespace prefix as a and returns the
-// amount of this whitespace, otherwise returns 0. If a has no whitespace
-// prefix than return 0.
-fn check_whitespace(a: &str, b: &str) -> usize {
- let a = a.as_bytes();
- let b = b.as_bytes();
-
- for i in 0..a.len() {
- if a[i] != b' ' && a[i] != b'\t' {
- // a has run out of whitespace and b matched up to this point. Return result.
- return i;
- }
-
- if i >= b.len() {
- // We ran off the edge of b while a still has whitespace. Return 0 as failure.
- return 0;
- }
-
- if a[i] != b[i] {
- // a has whitespace but b does not. Return 0 as failure.
- return 0;
- }
- }
-
- // We ran off the end of a and b kept up
- a.len()
-}
-
-pub(crate) trait StrBlockLexCtx<'d> {
- fn remainder(&self) -> &'d str;
- fn eat_error(&mut self, ctx: &Context<'d>);
- fn bump_pos(&mut self, s: usize);
- fn mark_truncating(&mut self);
- fn mark_line(&mut self, line: &'d str);
-}
-
-impl<'d> StrBlockLexCtx<'d> for Lexer<'d, SyntaxKind> {
- fn remainder(&self) -> &'d str {
- self.remainder()
- }
- fn eat_error(&mut self, ctx: &Context<'d>) {
- let end_index = ctx
- .rest()
- .find("|||")
- .map_or_else(|| ctx.rest().len(), |v| v + 3);
- self.bump(ctx.index + end_index);
- }
- fn bump_pos(&mut self, s: usize) {
- self.bump(s);
- }
- fn mark_truncating(&mut self) {
- // Lexer test doesn't collect anything
- }
- fn mark_line(&mut self, _line: &'d str) {
- // Lexer test doesn't collect anything
- }
-}
-
-pub fn collect_lexed_str_block(input: &str) -> Result<CollectStrBlock<'_>, StringBlockError> {
- let mut collect = CollectStrBlock {
- truncate: false,
- lines: vec![],
- input,
- offset: 0,
- };
- lex_str_block(&mut collect)?;
- Ok(collect)
-}
-
-pub struct CollectStrBlock<'s> {
- pub truncate: bool,
- pub lines: Vec<&'s str>,
- input: &'s str,
- offset: usize,
-}
-
-impl<'d> StrBlockLexCtx<'d> for CollectStrBlock<'d> {
- fn remainder(&self) -> &'d str {
- self.input
- }
-
- fn eat_error(&mut self, _ctx: &Context<'d>) {
- // Error will be returned, no need to record it here
- }
-
- fn bump_pos(&mut self, s: usize) {
- self.offset += s;
- }
-
- fn mark_truncating(&mut self) {
- self.truncate = true;
- }
-
- fn mark_line(&mut self, line: &'d str) {
- self.lines.push(line);
- }
-}
-
-pub(crate) fn lex_str_block<'a>(lex: &mut impl StrBlockLexCtx<'a>) -> Result<(), StringBlockError> {
- // debug_assert_eq!(lex.slice(), "|||");
- let mut ctx = Context::<'a> {
- source: lex.remainder(),
- index: 0,
- };
-
- if ctx.eat_if(|v| v == '-') != 0 {
- lex.mark_truncating();
- }
-
- // Skip whitespaces
- ctx.eat_while(|r| r == ' ' || r == '\t' || r == '\r');
-
- // Skip \n
- match ctx.next() {
- Some('\n') => (),
- None => {
- lex.eat_error(&ctx);
- return Err(UnexpectedEnd);
- }
- // Text block requires new line after |||.
- Some(_) => {
- lex.eat_error(&ctx);
- return Err(MissingNewLine);
- }
- }
-
- // Process leading blank lines before calculating string block indent
- while ctx.peek() == Some('\n') {
- ctx.next();
- }
-
- let mut num_whitespace = check_whitespace(ctx.rest(), ctx.rest());
- let str_block_indent = &ctx.rest()[..num_whitespace];
-
- if num_whitespace == 0 {
- // Text block's first line must start with whitespace
- lex.eat_error(&ctx);
- return Err(MissingIndent);
- }
-
- loop {
- debug_assert_ne!(num_whitespace, 0, "Unexpected value for num_whitespace");
- ctx.skip(num_whitespace);
-
- let line_start = ctx.index;
- let mut line_size = 0;
- loop {
- match ctx.next() {
- None => {
- lex.eat_error(&ctx);
- return Err(UnexpectedEnd);
- }
- Some('\n') => {
- lex.mark_line(&ctx.source[line_start..line_start + line_size]);
- break;
- }
- Some(c) => {
- line_size += c.len_utf8();
- }
- }
- }
-
- // Skip any blank lines
- while ctx.peek() == Some('\n') {
- lex.mark_line("");
- ctx.next();
- }
-
- // Look at the next line
- num_whitespace = check_whitespace(str_block_indent, ctx.rest());
- if num_whitespace == 0 {
- // End of the text block
- // let mut term_indent = String::with_capacity(num_whitespace);
- while let Some(' ' | '\t') = ctx.peek() {
- // term_indent.push(
- ctx.next().unwrap();
- // );
- }
-
- if !ctx.rest().starts_with("|||") {
- if ctx.rest().is_empty() {
- lex.bump_pos(ctx.index);
- return Err(UnexpectedEnd);
- }
- lex.eat_error(&ctx);
- return Err(MissingTermination);
- }
-
- // Skip '|||'
- ctx.skip(3);
- break;
- }
- }
-
- lex.bump_pos(ctx.index);
- Ok(())
-}
xtask/src/sourcegen/kinds.rsdiffbeforeafterboth1#[derive(Debug)]2pub struct KindsSrc {3 /// Key - how this token appears in ungrammar4 defined_tokens: IndexMap<String, TokenKind>,5 defined_node_names: HashSet<String>,6 pub nodes: Vec<String>,7}89#[derive(Debug, Clone)]10pub enum TokenKind {11 /// May exist in token tree, but never in source code12 Meta { grammar_name: String, name: String },13 /// Specific parsing/lexing errors may be emitted as this type of kind14 Error {15 grammar_name: String,16 name: String,17 #[allow(dead_code)]18 /// Is this error returned by lexer directly, or from lex.rs19 is_lexer_error: bool,20 regex: Option<String>,21 priority: Option<u32>,22 },23 /// Keyword - literal match of token24 Keyword {25 /// How this keyword appears in grammar/code, should be same as Kinds key26 code: String,27 name: String,28 },29 /// Literal - something defined by user, i.e strings, identifiers, smth30 Literal {31 /// How this keyword appears in grammar, should be same as Kinds key32 grammar_name: String,33 name: String,34 /// Regex for Logos lexer35 regex: String,36 /// Path to custom lexer37 lexer: Option<String>,38 },39}4041impl TokenKind {42 pub fn grammar_name(&self) -> &str {43 match self {44 Self::Keyword { code, .. } => code,45 Self::Literal { grammar_name, .. }46 | Self::Meta { grammar_name, .. }47 | Self::Error { grammar_name, .. } => grammar_name,48 }49 }50 /// How this keyword should appear in kinds enum, screaming snake cased51 pub fn name(&self) -> &str {52 match self {53 Self::Keyword { name, .. }54 | Self::Literal { name, .. }55 | Self::Meta { name, .. }56 | Self::Error { name, .. } => name,57 }58 }59 pub fn expand_kind(&self) -> TokenStream {60 let name = format_ident!("{}", self.name());61 let attr = match self {62 Self::Keyword { code, .. } => quote! {#[token(#code)]},63 Self::Literal { regex, lexer, .. } => {64 let lexer = lexer65 .as_deref()66 .map(TokenStream::from_str)67 .map(|r| r.expect("path is correct"));68 quote! 
{#[regex(#regex, #lexer)]}69 }70 Self::Error {71 regex, priority, ..72 } if regex.is_some() => {73 let priority = priority.map(|p| quote! {, priority = #p});74 quote! {#[regex(#regex #priority)]}75 }76 _ => quote! {},77 };78 quote! {79 #attr80 #name81 }82 }83 pub fn expand_t_macros(&self) -> Option<TokenStream> {84 match self {85 Self::Keyword { code, name } => {86 let code = escape_token_macro(code);87 let name = format_ident!("{name}");88 Some(quote! {89 [#code] => {$crate::SyntaxKind::#name}90 })91 }92 // Meta items should not appear in T![_]93 _ => None,94 }95 }9697 /// How this token should be referenced in code98 /// Keywords are referenced with `T![_]` macro,99 /// and literals are referenced directly by name100 pub fn reference(&self) -> TokenStream {101 if let Self::Keyword { code, .. } = self {102 let code = escape_token_macro(code);103 quote! {T![#code]}104 } else {105 let name = self.name();106 let ident = format_ident!("{name}");107 quote! {#ident}108 }109 }110111 pub fn method_name(&self) -> Ident {112 match self {113 Self::Keyword { name, .. } => {114 format_ident!("{}_token", name.to_lowercase())115 }116 Self::Literal { name, .. } => {117 format_ident!("{}_lit", name.to_lowercase())118 }119 Self::Meta { name, .. } => format_ident!("{}_meta", name.to_lowercase()),120 Self::Error { name, .. } => format_ident!("{}_error", name.to_lowercase()),121 }122 }123}124125#[macro_export]126macro_rules! define_kinds {127 ($into:ident = lit($name:literal) => $regex:literal $(, $lexer:literal)? $(; $($rest:tt)*)?) => {{128 $into.define_token(TokenKind::Literal {129 grammar_name: format!("LIT_{}!", $name),130 name: $name.to_owned(),131 regex: $regex.to_owned(),132 lexer: None $(.or_else(|| Some($lexer.to_string())))?,133 });134 $(define_kinds!($into = $($rest)*))?135 }};136 ($into:ident = error($name:literal$(, priority = $priority:literal)? $(, lexer = $lexer:literal)?) $(=> $regex:literal)? $(; $($rest:tt)*)?) 
=> {{137 {138 let regex = None$(.or(Some($regex.to_owned())))?;139 let priority = None$(.or(Some($priority)))?;140 $into.define_token(TokenKind::Error {141 grammar_name: format!("ERROR_{}!", $name),142 name: format!("ERROR_{}", $name),143 is_lexer_error: false $(|| $lexer)? || regex.is_some() || priority.is_some(),144 regex,145 priority,146 });147 }148 $(define_kinds!($into = $($rest)*))?149 }};150 ($into:ident = $tok:literal => $name:literal $(; $($rest:tt)*)?) => {{151 $into.define_token(TokenKind::Keyword {152 code: format!("{}", $tok),153 name: $name.to_owned(),154 });155 $(define_kinds!($into = $($rest)*))?156 }};157 ($into:ident =) => {{}}158}159use std::{collections::HashSet, str::FromStr};160161use indexmap::IndexMap;162use proc_macro2::{Ident, TokenStream};163use quote::{format_ident, quote};164165use super::escape_token_macro;166167impl KindsSrc {168 pub fn new() -> Self {169 Self {170 defined_tokens: IndexMap::new(),171 defined_node_names: HashSet::new(),172 nodes: Vec::new(),173 }174 }175 pub fn define_token(&mut self, token: TokenKind) {176 assert!(177 self.defined_node_names.insert(token.name().to_owned()),178 "node name already defined: {}",179 token.name()180 );181 assert!(182 self.defined_tokens183 .insert(token.grammar_name().to_owned(), token.clone())184 .is_none(),185 "token already defined: {}",186 token.grammar_name()187 );188 }189 pub fn define_node(&mut self, node: &str) {190 assert!(191 self.defined_node_names.insert(node.to_owned()),192 "node name already defined: {node}"193 );194 self.nodes.push(node.to_string());195 }196 pub fn token(&self, tok: &str) -> Option<&TokenKind> {197 self.defined_tokens.get(tok)198 }199 pub fn is_token(&self, tok: &str) -> bool {200 self.defined_tokens.contains_key(tok)201 }202 pub fn tokens(&self) -> impl Iterator<Item = &TokenKind> {203 self.defined_tokens.iter().map(|(_, v)| v)204 }205}206207pub fn jsonnet_kinds() -> KindsSrc {208 let mut kinds = KindsSrc::new();209 define_kinds![kinds =210 "||" => "OR";211 
"??" => "NULL_COAELSE";212 "&&" => "AND";213 "|" => "BIT_OR";214 "^" => "BIT_XOR";215 "&" => "BIT_AND";216 "==" => "EQ";217 "!=" => "NE";218 "<" => "LT";219 ">" => "GT";220 "<=" => "LE";221 ">=" => "GE";222 "<<" => "LHS";223 ">>" => "RHS";224 "+" => "PLUS";225 "-" => "MINUS";226 "*" => "MUL";227 "/" => "DIV";228 "%" => "MODULO";229 "!" => "NOT";230 "~" => "BIT_NOT";231 "[" => "L_BRACK";232 "]" => "R_BRACK";233 "(" => "L_PAREN";234 ")" => "R_PAREN";235 "{" => "L_BRACE";236 "}" => "R_BRACE";237 ":" => "COLON";238 "::" => "COLONCOLON";239 ":::" => "COLONCOLONCOLON";240 ";" => "SEMI";241 "." => "DOT";242 "..." => "DOTDOTDOT";243 "," => "COMMA";244 "$" => "DOLLAR";245 "=" => "ASSIGN";246 "?" => "QUESTION_MARK";247 // Literals248 lit("FLOAT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?";249 error("FLOAT_JUNK_AFTER_POINT") => r"(?:0|[1-9][0-9]*)\.[^0-9]";250 error("FLOAT_JUNK_AFTER_EXPONENT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][^+\-0-9]";251 error("FLOAT_JUNK_AFTER_EXPONENT_SIGN") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][+-][^0-9]";252 lit("STRING_DOUBLE") => "\"(?s:[^\"\\\\]|\\\\.)*\"";253 error("STRING_DOUBLE_UNTERMINATED") => "\"(?s:[^\"\\\\]|\\\\.)*";254 lit("STRING_SINGLE") => "'(?s:[^'\\\\]|\\\\.)*'";255 error("STRING_SINGLE_UNTERMINATED") => "'(?s:[^'\\\\]|\\\\.)*";256 lit("STRING_DOUBLE_VERBATIM") => "@\"(?:[^\"]|\"\")*\"";257 error("STRING_DOUBLE_VERBATIM_UNTERMINATED") => "@\"(?:[^\"]|\"\")*";258 lit("STRING_SINGLE_VERBATIM") => "@'(?:[^']|'')*'";259 error("STRING_SINGLE_VERBATIM_UNTERMINATED") => "@'(?:[^']|'')*";260 error("STRING_VERBATIM_MISSING_QUOTES") => "@[^\"'\\s]\\S+";261 lit("STRING_BLOCK") => r"\|\|\|", "crate::string_block::lex_str_block_test";262 error("STRING_BLOCK_UNEXPECTED_END", lexer = true);263 error("STRING_BLOCK_MISSING_NEW_LINE", lexer = true);264 error("STRING_BLOCK_MISSING_TERMINATION", lexer = true);265 error("STRING_BLOCK_MISSING_INDENT", lexer = true);266 lit("IDENT") => r"[_a-zA-Z][_a-zA-Z0-9]*";267 lit("WHITESPACE") => 
r"[ \t\n\r]+";268 lit("SINGLE_LINE_SLASH_COMMENT") => r"//[^\r\n]*?(\r\n|\n)?";269 lit("SINGLE_LINE_HASH_COMMENT") => r"#[^\r\n]*?(\r\n|\n)?";270 lit("MULTI_LINE_COMMENT") => r"/\*([^*]|\*[^/])*\*/";271 error("COMMENT_TOO_SHORT") => r"/\*/";272 error("COMMENT_UNTERMINATED") => r"/\*([^*/]|\*[^/])+";273 ];274 kinds275}1#[derive(Debug)]2pub struct KindsSrc {3 /// Key - how this token appears in ungrammar4 defined_tokens: IndexMap<String, TokenKind>,5 defined_node_names: HashSet<String>,6 pub nodes: Vec<String>,7}89#[derive(Debug, Clone)]10pub enum TokenKind {11 /// May exist in token tree, but never in source code12 Meta { grammar_name: String, name: String },13 /// Specific parsing/lexing errors may be emitted as this type of kind14 Error {15 grammar_name: String,16 name: String,17 #[allow(dead_code)]18 /// Is this error returned by lexer directly, or from lex.rs19 is_lexer_error: bool,20 regex: Option<String>,21 priority: Option<u32>,22 },23 /// Keyword - literal match of token24 Keyword {25 /// How this keyword appears in grammar/code, should be same as Kinds key26 code: String,27 name: String,28 },29 /// Literal - something defined by user, i.e strings, identifiers, smth30 Literal {31 /// How this keyword appears in grammar, should be same as Kinds key32 grammar_name: String,33 name: String,34 /// Regex for Logos lexer35 regex: String,36 /// Path to custom lexer37 lexer: Option<String>,38 },39}4041impl TokenKind {42 pub fn grammar_name(&self) -> &str {43 match self {44 Self::Keyword { code, .. } => code,45 Self::Literal { grammar_name, .. }46 | Self::Meta { grammar_name, .. }47 | Self::Error { grammar_name, .. } => grammar_name,48 }49 }50 /// How this keyword should appear in kinds enum, screaming snake cased51 pub fn name(&self) -> &str {52 match self {53 Self::Keyword { name, .. }54 | Self::Literal { name, .. }55 | Self::Meta { name, .. }56 | Self::Error { name, .. 
} => name,57 }58 }59 pub fn expand_kind(&self, lexer: bool) -> TokenStream {60 let name = format_ident!("{}", self.name());61 let attr = match self {62 Self::Keyword { code, .. } => quote! {#[token(#code)]},63 Self::Literal { regex, lexer, .. } => {64 let lexer = lexer65 .as_deref()66 .map(TokenStream::from_str)67 .map(|r| r.expect("path is correct"));68 quote! {#[regex(#regex, #lexer)]}69 }70 Self::Error {71 regex, priority, ..72 } if regex.is_some() => {73 let priority = priority.map(|p| quote! {, priority = #p});74 quote! {#[regex(#regex #priority)]}75 }76 _ => quote! {},77 };78 let attr = if lexer {79 attr80 } else {81 quote! {}82 };83 quote! {84 #attr85 #name86 }87 }88 pub fn expand_t_macros(&self) -> Option<TokenStream> {89 match self {90 Self::Keyword { code, name } => {91 let code = escape_token_macro(code);92 let name = format_ident!("{name}");93 Some(quote! {94 [#code] => {$crate::SyntaxKind::#name}95 })96 }97 // Meta items should not appear in T![_]98 _ => None,99 }100 }101102 /// How this token should be referenced in code103 /// Keywords are referenced with `T![_]` macro,104 /// and literals are referenced directly by name105 pub fn reference(&self) -> TokenStream {106 if let Self::Keyword { code, .. } = self {107 let code = escape_token_macro(code);108 quote! {T![#code]}109 } else {110 let name = self.name();111 let ident = format_ident!("{name}");112 quote! {#ident}113 }114 }115116 pub fn method_name(&self) -> Ident {117 match self {118 Self::Keyword { name, .. } => {119 format_ident!("{}_token", name.to_lowercase())120 }121 Self::Literal { name, .. } => {122 format_ident!("{}_lit", name.to_lowercase())123 }124 Self::Meta { name, .. } => format_ident!("{}_meta", name.to_lowercase()),125 Self::Error { name, .. } => format_ident!("{}_error", name.to_lowercase()),126 }127 }128}129130#[macro_export]131macro_rules! define_kinds {132 ($into:ident = lit($name:literal) => $regex:literal $(, $lexer:literal)? $(; $($rest:tt)*)?) 
=> {{133 $into.define_token(TokenKind::Literal {134 grammar_name: format!("LIT_{}!", $name),135 name: $name.to_owned(),136 regex: $regex.to_owned(),137 lexer: None $(.or_else(|| Some($lexer.to_string())))?,138 });139 $(define_kinds!($into = $($rest)*))?140 }};141 ($into:ident = error($name:literal$(, priority = $priority:literal)? $(, lexer = $lexer:literal)?) $(=> $regex:literal)? $(; $($rest:tt)*)?) => {{142 {143 let regex = None$(.or(Some($regex.to_owned())))?;144 let priority = None$(.or(Some($priority)))?;145 $into.define_token(TokenKind::Error {146 grammar_name: format!("ERROR_{}!", $name),147 name: format!("ERROR_{}", $name),148 is_lexer_error: false $(|| $lexer)? || regex.is_some() || priority.is_some(),149 regex,150 priority,151 });152 }153 $(define_kinds!($into = $($rest)*))?154 }};155 ($into:ident = $tok:literal => $name:literal $(; $($rest:tt)*)?) => {{156 $into.define_token(TokenKind::Keyword {157 code: format!("{}", $tok),158 name: $name.to_owned(),159 });160 $(define_kinds!($into = $($rest)*))?161 }};162 ($into:ident =) => {{}}163}164use std::{collections::HashSet, str::FromStr};165166use indexmap::IndexMap;167use proc_macro2::{Ident, TokenStream};168use quote::{format_ident, quote};169170use super::escape_token_macro;171172impl KindsSrc {173 pub fn new() -> Self {174 Self {175 defined_tokens: IndexMap::new(),176 defined_node_names: HashSet::new(),177 nodes: Vec::new(),178 }179 }180 pub fn define_token(&mut self, token: TokenKind) {181 assert!(182 self.defined_node_names.insert(token.name().to_owned()),183 "node name already defined: {}",184 token.name()185 );186 assert!(187 self.defined_tokens188 .insert(token.grammar_name().to_owned(), token.clone())189 .is_none(),190 "token already defined: {}",191 token.grammar_name()192 );193 }194 pub fn define_node(&mut self, node: &str) {195 assert!(196 self.defined_node_names.insert(node.to_owned()),197 "node name already defined: {node}"198 );199 self.nodes.push(node.to_string());200 }201 pub fn token(&self, 
tok: &str) -> Option<&TokenKind> {202 self.defined_tokens.get(tok)203 }204 pub fn is_token(&self, tok: &str) -> bool {205 self.defined_tokens.contains_key(tok)206 }207 pub fn tokens(&self) -> impl Iterator<Item = &TokenKind> {208 self.defined_tokens.iter().map(|(_, v)| v)209 }210}211212pub fn jsonnet_kinds() -> KindsSrc {213 let mut kinds = KindsSrc::new();214 define_kinds![kinds =215 "||" => "OR";216 "??" => "NULL_COAELSE";217 "&&" => "AND";218 "|" => "BIT_OR";219 "^" => "BIT_XOR";220 "&" => "BIT_AND";221 "==" => "EQ";222 "!=" => "NE";223 "<" => "LT";224 ">" => "GT";225 "<=" => "LE";226 ">=" => "GE";227 "<<" => "LHS";228 ">>" => "RHS";229 "+" => "PLUS";230 "-" => "MINUS";231 "*" => "MUL";232 "/" => "DIV";233 "%" => "MODULO";234 "!" => "NOT";235 "~" => "BIT_NOT";236 "[" => "L_BRACK";237 "]" => "R_BRACK";238 "(" => "L_PAREN";239 ")" => "R_PAREN";240 "{" => "L_BRACE";241 "}" => "R_BRACE";242 ":" => "COLON";243 "::" => "COLONCOLON";244 ":::" => "COLONCOLONCOLON";245 ";" => "SEMI";246 "." => "DOT";247 "..." => "DOTDOTDOT";248 "," => "COMMA";249 "$" => "DOLLAR";250 "=" => "ASSIGN";251 "?" 
=> "QUESTION_MARK";252 // Literals253 lit("FLOAT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?";254 error("FLOAT_JUNK_AFTER_POINT") => r"(?:0|[1-9][0-9]*)\.[^0-9]";255 error("FLOAT_JUNK_AFTER_EXPONENT") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][^+\-0-9]";256 error("FLOAT_JUNK_AFTER_EXPONENT_SIGN") => r"(?:0|[1-9][0-9]*)(?:\.[0-9]+)?[eE][+-][^0-9]";257 lit("STRING_DOUBLE") => "\"(?s:[^\"\\\\]|\\\\.)*\"";258 error("STRING_DOUBLE_UNTERMINATED") => "\"(?s:[^\"\\\\]|\\\\.)*";259 lit("STRING_SINGLE") => "'(?s:[^'\\\\]|\\\\.)*'";260 error("STRING_SINGLE_UNTERMINATED") => "'(?s:[^'\\\\]|\\\\.)*";261 lit("STRING_DOUBLE_VERBATIM") => "@\"(?:[^\"]|\"\")*\"";262 error("STRING_DOUBLE_VERBATIM_UNTERMINATED") => "@\"(?:[^\"]|\"\")*";263 lit("STRING_SINGLE_VERBATIM") => "@'(?:[^']|'')*'";264 error("STRING_SINGLE_VERBATIM_UNTERMINATED") => "@'(?:[^']|'')*";265 error("STRING_VERBATIM_MISSING_QUOTES") => "@[^\"'\\s]\\S+";266 lit("STRING_BLOCK") => r"\|\|\|", "crate::string_block::lex_str_block_test";267 error("STRING_BLOCK_UNEXPECTED_END", lexer = true);268 error("STRING_BLOCK_MISSING_NEW_LINE", lexer = true);269 error("STRING_BLOCK_MISSING_TERMINATION", lexer = true);270 error("STRING_BLOCK_MISSING_INDENT", lexer = true);271 lit("IDENT") => r"[_a-zA-Z][_a-zA-Z0-9]*";272 lit("WHITESPACE") => r"[ \t\n\r]+";273 lit("SINGLE_LINE_SLASH_COMMENT") => r"//[^\r\n]*?(\r\n|\n)?";274 lit("SINGLE_LINE_HASH_COMMENT") => r"#[^\r\n]*?(\r\n|\n)?";275 lit("MULTI_LINE_COMMENT") => r"/\*([^*]|\*[^/])*\*/";276 error("COMMENT_TOO_SHORT") => r"/\*/";277 error("COMMENT_UNTERMINATED") => r"/\*([^*/]|\*[^/])+";278 ];279 kinds280}xtask/src/sourcegen/mod.rsdiffbeforeafterboth--- a/xtask/src/sourcegen/mod.rs
+++ b/xtask/src/sourcegen/mod.rs
@@ -89,7 +89,7 @@
kinds.define_node(&name);
}
- let syntax_kinds = generate_syntax_kinds(&kinds, &ast)?;
+ let syntax_kinds = generate_syntax_kinds(&kinds, &ast, false)?;
let nodes = generate_nodes(&kinds, &ast)?;
ensure_file_contents(
@@ -106,12 +106,21 @@
)),
&nodes,
);
+
+ let lexer_syntax_kinds = generate_syntax_kinds(&kinds, &ast, true)?;
+ ensure_file_contents(
+ &PathBuf::from(concat!(
+ env!("CARGO_MANIFEST_DIR"),
+ "/../crates/jrsonnet-lexer/src/generated/syntax_kinds.rs",
+ )),
+ &lexer_syntax_kinds,
+ );
Ok(())
}
-fn generate_syntax_kinds(kinds: &KindsSrc, grammar: &AstSrc) -> Result<String> {
+fn generate_syntax_kinds(kinds: &KindsSrc, grammar: &AstSrc, lexer: bool) -> Result<String> {
let t_macros = kinds.tokens().filter_map(TokenKind::expand_t_macros);
- let token_kinds = kinds.tokens().map(TokenKind::expand_kind);
+ let token_kinds = kinds.tokens().map(|t| t.expand_kind(lexer));
let keywords = kinds
.tokens()
@@ -119,12 +128,16 @@
.map(TokenKind::name)
.map(|n| format_ident!("{n}"));
- let nodes = kinds
+ let mut nodes = kinds
.nodes
.iter()
.map(|name| format_ident!("{}", name))
.collect::<Vec<_>>();
+ if lexer {
+ nodes.clear();
+ }
+
let enums = grammar
.enums
.iter()
@@ -134,14 +147,34 @@
.token_enums
.iter()
.map(|e| format_ident!("{}", to_upper_snake_case(&e.name))),
- );
+ )
+ .collect::<Vec<_>>();
+ let is_enum = if lexer {
+ quote! {}
+ } else {
+ quote! {
+ pub fn is_enum(self) -> bool {
+ match self {
+ #(#enums)|* => true,
+ _ => false,
+ }
+ }
+ }
+ };
+ let derive_logos = if lexer {
+ quote! {
+ , logos::Logos
+ }
+ } else {
+ quote! {}
+ };
+
let ast = quote! {
#![allow(bad_style, missing_docs, unreachable_pub, clippy::manual_non_exhaustive, clippy::match_like_matches_macro)]
- use logos::Logos;
/// The kind of syntax node, e.g. `IDENT`, `USE_KW`, or `STRUCT`.
- #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Logos)]
+ #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug #derive_logos)]
#[repr(u16)]
pub enum SyntaxKind {
#[doc(hidden)]
@@ -164,13 +197,9 @@
_ => false,
}
}
- pub fn is_enum(self) -> bool {
- match self {
- #(#enums)|* => true,
- _ => false,
- }
- }
+ #is_enum
+
pub fn from_raw(r: u16) -> Self {
assert!(r < Self::__LAST as u16);
unsafe { std::mem::transmute(r) }