diff --git a/lex.js b/lex.js
new file mode 100644
index 0000000..d76f94b
--- /dev/null
+++ b/lex.js
@@ -0,0 +1,248 @@
+const utils = require('./utils'); // assumed home of the Object.prototype.keyName enum helper used below
+
+const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 });
+
+class TokenStream
+{
+    constructor(input)
+    {
+        this.input = input;
+        this.inputLength = input.length;
+
+        // how far in the token stream we are
+        this.index = 0;
+
+        const lexed = lex(this.input);
+        this.tokens = lexed.tokens;
+        this.newlines = lexed.newlines;
+
+        // when we attempt to match a sub-rule, save the location where we roll back to
+        this.savedLocations = [];
+    }
+
+    eof()
+    {
+        return this.index < 0 || this.index >= this.tokens.length;
+    }
+
+    peek(type = undefined, value = undefined)
+    {
+        if (this.index < 0 || this.index >= this.tokens.length) return { error: { msg: "Unexpected EOF", ...this.stringIndexToLinePos(this.inputLength - 1) } };
+
+        const token = this.tokens[this.index];
+
+        if (type)
+        {
+            if (token.type !== type)
+            {
+                return { error: { msg: "Expecting " + TokenType.keyName(type) + ", not " + TokenType.keyName(token.type) + " '" + token.token + "'", ...this.tokenToLinePos(token) } };
+            }
+
+            if (value !== undefined)
+            {
+                if (token.token !== value)
+                {
+                    return { error: { msg: "Expecting " + TokenType.keyName(type) + " '" + value + "', not '" + token.token + "'", ...this.tokenToLinePos(token) } };
+                }
+            }
+        }
+
+        return token;
+    }
+
+    consume(type = undefined, value = undefined)
+    {
+        const token = this.peek(type, value);
+        if (token.error) return token;
+
+        this.index++;
+        return token;
+    }
+
+    beginMatch()
+    {
+        this.savedLocations.push(this.index);
+    }
+
+    doneMatch()
+    {
+        console.assert(this.savedLocations.length > 0, "Unexpected doneMatch()");
+        this.savedLocations.pop();
+    }
+
+    failedMatch()
+    {
+        console.assert(this.savedLocations.length > 0, "Unexpected failedMatch()");
+        this.index = this.savedLocations.pop();
+    }
+
+    stringIndexToLinePos(index)
+    {
+        index = Math.max(0, Math.min(this.inputLength - 1, index));
+
+        let line = this.newlines.length + 1;
+        let previousLinesLength = 0;
+        for (let i = 0; i < this.newlines.length; i++)
+        {
+            if (index <= this.newlines[i])
+            {
+                line = i + 1;
+                break;
+            }
+            // everything up to and including this.newlines[i] belongs to earlier lines
+            previousLinesLength = this.newlines[i] + 1;
+        }
+        return {line:line, pos:index - previousLinesLength + 1};
+    }
+
+    tokenIndexToLinePos(index)
+    {
+        if (index < 0 || index >= this.tokens.length) return this.stringIndexToLinePos(this.inputLength - 1);
+
+        return this.tokenToLinePos(this.tokens[index]);
+    }
+
+    tokenToLinePos(token)
+    {
+        // a token's index is a position in the source string
+        return this.stringIndexToLinePos(token.index);
+    }
+}
+
+/* takes a string, and returns a flat array of tokens, and a list of the locations of newlines in the source
+ *
+ * recognized types:
+ *   Word: as a token type, represents any symbol/word: variable names, or actual keywords
+ *   NumberLit: a literal value for the number type
+ *   StringLit: a literal value for the string type
+ *   Op: an operator (usually a single character of punctuation)
+ */
+function lex(input)
+{
+    let inputLength = input.length;
+
+    let newlines = [];
+
+    let i = 0;
+    while ((i = input.indexOf("\n", i)) != -1)
+    {
+        newlines.push(i);
+        i++;
+    }
+
+    // break the input down into "parts": either words, digit strings, single character operators, or whitespace
+    let parts = [];
+
+    // track where we are in the original string
+    let index = 0;
+    let match = null;
+    while (input.length > 0)
+    {
+        // match our possible sequences of more than one character (words, digits, or whitespace)
+        if ((match = input.match(/^\w+/)) ||
+            (match = input.match(/^\d+/)) ||
+            (match = input.match(/^\s+/)))
+        {
+            const matched = match[0];
+            parts.push({token:matched, index:index});
+
+            // advance the string
+            input = input.substr(matched.length);
+            index += matched.length;
+        }
+        else
+        {
+            // everything else is a single character (punctuation)
+            parts.push({token:input[0], index:index});
+
+            input = input.substr(1);
+            index++;
+        }
+    }
+
+    let tokens = [];
+
+    for (let i = 0; i < parts.length; i++)
+    {
+        const partsLeft = parts.length - i - 1;
+
+        let { token, index } = parts[i];
+
+        if (token.trim().length == 0)
+        {
+            // remove whitespace (whitespace in a string is handled in the block that handles the " token below)
+            continue;
+        }
+
+        if (token.match(/^\d+$/))
+        {
+            // try and get the decimal, if any
+            if (partsLeft >= 2 && parts[i + 1].token == "." && parts[i + 2].token.match(/^\d+$/))
+            {
+                token += "." + parts[i + 2].token;
+                i += 2; // skip the '.' and the fractional digits we just consumed
+            }
+            tokens.push({type:TokenType.NumberLit, token:token, index:index});
+        }
+        else if (token == '"')
+        {
+            // we are starting a string literal
+
+            // don't include the initial "
+            token = "";
+
+            let closed = false;
+            for (let j = i + 1; j < parts.length; j++)
+            {
+                // find an escape character
+                if (parts[j].token == '\\')
+                {
+                    if (j + 1 < parts.length)
+                    {
+                        // the escape character is the next character (all tokens are guaranteed length > 0)
+                        const escaped = parts[j + 1].token[0];
+
+                        // remove the escaped character from the next token, being sure to remove the token
+                        // if it was only that 1 character
+                        parts[j + 1].token = parts[j + 1].token.substr(1);
+                        if (parts[j + 1].token.length == 0) parts.splice(j + 1, 1);
+
+                        // for now, just put the escaped character in literally, we'll deal with escape sequences later
+                        token += escaped;
+                    }
+                }
+                else if (parts[j].token == '"')
+                {
+                    // close the string
+                    i = j;
+                    closed = true;
+                    break;
+                }
+                else
+                {
+                    // build up the string out of all tokens before the closing "
+                    token += parts[j].token;
+                }
+            }
+
+            if (!closed)
+            {
+                throw "Unclosed string";
+            }
+
+            tokens.push({type:TokenType.StringLit, token:token, index:index});
+        }
+        else if (token.match(/\w+/))
+        {
+            // any keyword or identifier
+            tokens.push({type:TokenType.Word, token:token, index:index});
+        }
+        else
+        {
+            // anything left is an operator
+            tokens.push({type:TokenType.Op, token:token, index:index});
+        }
+    }
+
+    // keywords and identifiers are case insensitive
+    return {
+        tokens: tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.trim().toLowerCase() } : t),
+        newlines: newlines,
+    };
+}
+
+module.exports = { TokenStream: TokenStream, TokenType: TokenType };
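A quick sketch of how the new TokenStream is meant to be driven, outside the diff. This assumes Node can resolve ./lex (and that the utils helper exists); the sample expression is made up.

const { TokenStream, TokenType } = require('./lex');

const stream = new TokenStream('price * (1 + taxRate)');

// peek() never advances; consume() does; both report failure as an object with an .error property
console.log(stream.peek());                     // { type: 1, token: 'price', index: 0 }
console.log(stream.consume(TokenType.Word));    // same token, but the stream has now advanced

// speculative matching: save a location, then roll back if the sub-rule fails
stream.beginMatch();
const open = stream.consume(TokenType.Op, "("); // fails: the next token is '*'
if (open.error) stream.failedMatch();           // index rewinds to the saved location
else stream.doneMatch();

The error objects carry msg/line/pos, which is what the parser below threads through instead of throwing strings.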
diff --git a/parse.js b/parse.js
index c81e300..86ab105 100644
--- a/parse.js
+++ b/parse.js
@@ -1,145 +1,6 @@
-const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 });
+const lex = require('./lex');
+const utils = require('./utils');
 
-/* takes a string, and returns a flat array of tokens, and a list of the locations of newlines in the source
- *
- * recognized types:
- *   Word: as a token type, reprsents any symbol/word: variable names, or actual keywords
- *   NumberLit: a literal value for the number type
- *   StringLit: a literal value for the string type
- *   Op: an operator (usually a single character of punctuation)
- */
-function lex(input)
-{
-    let inputLength = input.length;
-
-    let newlines = [];
-
-    let i = 0;
-    while ((i = input.indexOf("\n", i)) != -1)
-    {
-        newlines.push(i);
-        i++;
-    }
-
-    // break the input down into "parts": either words, digit strings, single character operators, or whitespace
-    let parts = [];
-
-    // track where we are in the original string
-    let index = 0;
-    while (input.length > 0)
-    {
-        // match our possible sequences of more than one character (words, digits, or whitespace)
-        if ((match = input.match(/^\w+/)) ||
-            (match = input.match(/^\d+/)) ||
-            (match = input.match(/^\s+/)))
-        {
-            const matched = match[0];
-            parts.push({token:matched, index:index});
-
-            // advance the string
-            input = input.substr(matched.length);
-            index += matched.length;
-        }
-        else
-        {
-            // everything else is a single character (punctuation)
-            parts.push({token:input[0], index:index});
-
-            input = input.substr(1);
-            index++;
-        }
-    }
-
-    let tokens = [];
-
-    for (let i = 0; i < parts.length; i++)
-    {
-        const partsLeft = parts.length - i - 1;
-
-        let { token, index } = parts[i];
-
-        if (token.trim().length == 0)
-        {
-            // remove whitespace (whitespace in a string is handled in the block that handles the " token below)
-            continue;
-        }
-
-        if (token.match(/\d+/))
-        {
-            // try and get the decimal, if any
-            if (partsLeft > 2 && parts[i + 1].token == "." && parts[i + 2].token.match(/\d+/))
-            {
-                token += "." + parts[i + 2].token;
-            }
-            tokens.push({type:TokenType.NumberLit, token:token, index:index});
-        }
-        else if (token == '"')
-        {
-            // we are starting a string literal
-
-            // don't include the initial "
-            token = "";
-
-            let closed = false;
-            for (let j = i + 1; j < parts.length; j++)
-            {
-                // find an escape character
-                if (parts[j].token == '\\')
-                {
-                    if (j + 1 < parts.length)
-                    {
-                        // the escape character is the next character (all tokens are guaranteed length > 0)
-                        const escaped = parts[j + 1].token[0];
-
-                        // remove the escaped character from the next token, being sure to remove the token
-                        // if it was only that 1 character
-                        parts[j + 1].token = parts[j + 1].token.substr(1);
-                        if (parts[j + 1].token.length == 0) parts.splice(j + 1, 1);
-
-                        // for now, just put the escaped character in literally, we'll deal with escape sequences later
-                        token += escaped;
-                    }
-                }
-                else if (parts[j].token == '"')
-                {
-                    // close the string
-                    i = j;
-                    closed = true;
-                    break;
-                }
-                else
-                {
-                    // build up the string out of all tokens before the closing "
-                    token += parts[j].token;
-                }
-            }
-
-            if (!closed)
-            {
-                throw "Unclosed string";
-            }
-
-            tokens.push({type:TokenType.StringLit, token:token, index:index});
-        }
-        else if (token.match(/\w+/))
-        {
-            // any keyword or identifier
-            tokens.push({type:TokenType.Word, token:token, index:index});
-        }
-        else
-        {
-            // anything left is an operator
-            tokens.push({type:TokenType.Op, token:token, index:index});
-        }
-    }
-
-    // keywords and identifiers are case insensitive
-    return {
-        tokens: tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.trim().toLowerCase() } : t),
-        newlines: newlines,
-        inputLength: inputLength,
-    };
-}
 
 /* How powerfully infix operators bind to their left and right arguments
  * If a value is bound more powerfully from one side, it becomes part of the operator
@@ -177,24 +38,6 @@
 	"(": 60,
 });
 
-// helper to grab the first item of an array, and throw a specific string if the array is empty
-Object.defineProperty(Array.prototype, 'shiftThrow', {
-    enumerable: false,
-    value: function(err) {
-        if (this.length == 0) throw err;
-        return this.shift();
-    }
-});
-
-// helper to get the name of an enum from one of its values
-Object.defineProperty(Object.prototype, 'keyName', {
-    enumerable: false,
-    value: function(value)
-    {
-        return Object.keys(this).find(k => this[k] == value) || "INVALID_ENUM_VALUE";
-    }
-});
-
 // all reserved keywords
 const Keywords = Object.freeze([ "if" ]);
 
@@ -214,278 +57,262 @@
 	"false": {node:ASTNode.Literal, value:false},
 });
 
-function stringIndexToLinePos(index, newlines)
-{
-    let line = newlines.length + 1;
-    let previousLinesLength = 0;
-    for (let i = 0; i < newlines.length; i++)
-    {
-        if (index <= newlines[i])
-        {
-            line = i + 1;
-            break;
-        }
-        previousLinesLength += newlines[i] + 1;
-    }
-    return {line:line, pos:index - previousLinesLength + 1};
-}
-
-/* The main entry point for parsing
- *
- * Takes an array of tokens from the lexer, and produces an AST of the program as raw objects
- */
-function parseExpression(tokenInfo)
-{
-    return parseExpressionInternal(tokenInfo, 0);
-}
-
-/* Parses an expression consisting of a set of values, all separated by infix operators,
- * as long none of the operators bind at least as powerfully as 'minBindingPower'
- *
- * This function will be called recursively to parse sub-expression, with the
- * minBindingPower set to whatever the last seen infix operator was. This ensures
- * that we never grab operators that can't be part of the subtree because their precedence
- * should have had them happen earlier
- *
- * The while loop in the middle of this function is the main feature that makes this a pratt parser,
- * and separates it from a standard recursive decent parser where you have to manually match on only
- * certain terms to maintain precedence
- *
- * Produces an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
- * of operators with higher precedence)
- * The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
- */
-function parseExpressionInternal(tokenInfo, minBindingPower)
-{
-    let {tokens, newlines, inputLength} = tokenInfo;
-
-    let eofError = "Expected more input at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines));
-
-    // grab the type of the next token, and its string representation
-    const {type, token, index:tokenIndex} = tokens.shiftThrow(eofError);
-
-    // start parsing the left-hand-side of the next operator at this AST node
-    let lhs = null;
-    if (type == TokenType.Op)
-    {
-        // the value of this part of the expression starts with an operator
-        // it will either be prefix operator, or a bracketed expression
-        if (token == "(")
-        {
-            // parse a subexpression
-            lhs = parseExpressionsInternal(tokens, 0);
-
-            // ensure the next token after the sub expression is a closing paren, and consume it
-            const {token:closeToken, type:closeType, index:closeIndex} = tokens.shiftThrow("Expecting ')', not '" + closeToken + "' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines)));
-            if (closeType != TokenType.Op && closeToken != ")")
-            {
-                throw "Expecting ')', not '" + closeToken + "' at " + JSON.stringify(stringIndexToLinePos(closeIndex, newlines));
-            }
-        }
-        else if ((power = PrefixBindingPower[token]) !== undefined)
-        {
-            // this is a prefix operator of the given power
-            // get the expression it applies to...
-            const rhs = parseExpressionInternal({...tokenInfo, tokens:tokens}, power);
-            // ... and generate an AST node for it
-            // this becomes the new LHS of the operator we are parsing
-            lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
-        }
-        else
-        {
-            throw "Invalid operator '" + token + "' at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines));
-        }
-    }
-    else if (type == TokenType.Word)
-    {
-        if (Keywords.includes(token))
-        {
-            // TODO: handle keywords that can go in expressions
-            throw "Keyword '" + token + "' not expected at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines));
-        }
-        else if (Literals[token] !== undefined)
-        {
-            // this is a hardcoded keyword literal, we already defined how those nodes look
-            lhs = Literals[token];
-        }
-        else
-        {
-            // some unresolved identifier (this is the domain of the runtime/interpreter!)
-            lhs = {node:ASTNode.Identifier, value:token};
-        }
-    }
-    else if (type == TokenType.NumberLit)
-    {
-        lhs = {node:ASTNode.Literal, value:parseFloat(token)};
-    }
-    else if (type == TokenType.StringLit)
-    {
-        lhs = {node:ASTNode.Literal, value:token};
-    }
-    else
-    {
-        throw "Unexpcted token " + TokenType.keyName(type) + " of '" + token + "' at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines));
-    }
-
-    while (tokens.length > 0)
-    {
-        // start grabbing further operators and expression, that don't have higher precedence
-        // (higher precedence operators will be parsed as sub-expressions)
-
-        /* Expecting an operator now
-         * Depending on its precedence, we may not actually get to parse it ourselves,
-         * so don't actually consume the token yet, just peek it (just in case)
-         */
-        const {type:opType, token:opToken, index:opIndex} = tokens[0];
-        if (opType != TokenType.Op)
-        {
-            throw "Instead of " + TokenType.keyName(opType) + " of '" + opToken + "', expected operator at " + JSON.stringify(stringIndexToLinePos(opIndex, newlines));
-        }
-
-        if (PostfixBindingPower[opToken] !== undefined)
-        {
-            // this is a post-fix operator, but it may apply to...
-            if (PostfixBindingPower[opToken] < minBindingPower)
-            {
-                //... some outer expression (we were too low power to bind to lhs)
-                break;
-            }
-
-            //... or our current lhs, in which case we get to consume the token
-            tokens.shift();
-
-            if (opToken == "[")
-            {
-                /* Parsing an index on the current lhs
-                 * We need to parse the index expression. Note we reset the binding power,
-                 * this is because we will always end with ], so we don't need to worry about our current precedence
-                 */
-                let index = parseExpressionInternal({...tokenInfo, tokens:tokens}, 0);
-
-                // parse the closing ]
-                const {token:closeToken, type:closeType, index:closeIndex} = tokens.shiftThrow("Unclosed indexing expression, expecting ']' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines)));
-                if (closeType != TokenType.Op && closeToken != "]")
-                {
-                    throw "Unclosed indexing expression, expecting ']' at " + JSON.stringify(stringIndexToLinePos(closeIndex, newlines));
-                }
-
-                // our new node is the old lhs indexed with the list of argument expressions
-                lhs = {node:ASTNode.Index, lhs:lhs, index:index};
-            }
-            else if (opToken == "(")
-            {
-                /* Parsing a function call
-                 * lhs represents the function we are calling
-                 * We need to parse a list of expression separated by , as the arguments
-                 */
-                let args = [];
-                while (true)
-                {
-                    if (tokens.length == 0) throw "Unclosed argument list, expected ',' or ')' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines));
-
-                    // peek the next op
-                    const {token:nextToken, type:nextType, index:nextIndex} = tokens[0];
-                    if (nextType == TokenType.Op && nextToken == ")")
-                    {
-                        // no more arguments, we should end the call (after consuming the bracket)
-                        tokens.shift();
-                        break;
-                    }
-
-                    if (args.length > 0)
-                    {
-                        // if we have already parsed args, we need to consume a comma
-                        if (nextType != TokenType.Op || nextToken != ",")
-                        {
-                            throw "Unclosed argument list, expected ',' or ')' at " + JSON.stringify(stringIndexToLinePos(nextToken, newlines));
-                        }
-                        tokens.shift();
-                    }
-
-                    // our new node is the old lhs as a function, called with the index expression
-                    args.push(parseExpressionInternal({...tokenInfo, tokens:tokens}, 0));
-                }
-
-                // generate the AST node
-                lhs = {node:ASTNode.Call, func:lhs, args:args};
-            }
-            else
-            {
-                throw "Unexpected postfix operator '" + opToken + "' at " + JSON.stringify(stringIndexToLinePos(opIndex, newlines));
-            }
-
-            // after parsing a postfix operator, we are expecting an operator again
-            continue;
-        }
-
-        // if we didn't get an infix operator, we are done this expression
-        if (InfixBindingPower[opToken] === undefined)
-        {
-            break;
-        }
-
-        [leftBinding, rightBinding] = InfixBindingPower[opToken];
-
-        // if the binding power is too low, we are done parsing the current subexpression, leave
-        // the current operator for our caller...
-        if (leftBinding < minBindingPower) break;
-
-        //... otherwise, this operator is for us, so consume it
-        tokens.shift();
-
-        // the next expression is the right side of our current operator, but only as much as the
-        // right-side binding power of our operator can grab
-        let rhs = parseExpressionInternal({...tokenInfo, tokens:tokens}, rightBinding);
-
-        // our new lhs becomes our applied operator,
-        // leaving us free to match more operators of an appropriate precedence
-        lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
-    }
-
-    return lhs;
-}
-
-// helper to map over an object's keys recursively
-Object.defineProperty(Object.prototype, 'mapKey', {
-    enumerable: false,
-    value: function(keyName, mapFunc)
-    {
-        let ret = {};
-        for (let k in this)
-        {
-            if (k == keyName)
-            {
-                ret[k] = mapFunc(this[k]);
-            }
-            else if (typeof(this[k]) == 'object')
-            {
-                ret[k] = { ...this[k] }.mapKey(keyName, mapFunc);
-            }
-            else
-            {
-                ret[k] = this[k];
-            }
-        }
-        return ret;
-    }
-});
-
-fs = require('fs');
-console.format = v => console.dir(v, { depth: null });
-
-function debugExpr(expr)
-{
-    const tokens = lex(expr);
-    console.log(tokens.mapKey("type", v => TokenType.keyName(v)));
-
-    const ast = parseExpression(tokens);
-    console.format(ast.mapKey("node", v => ASTNode.keyName(v)));
-}
-
-//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
-//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');
-
-debugExpr(fs.readFileSync(process.argv[2], "utf8"));
+class Parser
+{
+    constructor(tokenStream)
+    {
+        this.tokenStream = tokenStream;
+        this.tokenStream.index = 0;
+    }
+
+    expression()
+    {
+        return this.expressionInternal(0);
+    }
+
+    /* Parses an expression consisting of a set of values, all separated by infix operators,
+     * as long as none of the operators bind at least as powerfully as 'minBindingPower'
+     *
+     * This function will be called recursively to parse sub-expressions, with the
+     * minBindingPower set to whatever the last seen infix operator was. This ensures
+     * that we never grab operators that can't be part of the subtree because their precedence
+     * should have had them happen earlier
+     *
+     * The while loop in the middle of this function is the main feature that makes this a Pratt parser,
+     * and separates it from a standard recursive descent parser where you have to manually match on only
+     * certain terms to maintain precedence
+     *
+     * Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of
+     * operators are applications of operators with higher precedence)
+     * The nodes of the AST are marked with their type, and references to the subnodes, plus any relevant information
+     */
+    expressionInternal(minBindingPower)
+    {
+        // grab the type of the next token, and its string representation
+        const {type, token, index:tokenIndex, error} = this.tokenStream.consume();
+        if (error) return this.getError(error);
+
+        // start parsing the left-hand-side of the next operator at this AST node
+        let lhs = null;
+        if (type == lex.TokenType.Op)
+        {
+            // the value of this part of the expression starts with an operator
+            // it will either be a prefix operator, or a bracketed expression
+            if (token == "(")
+            {
+                // parse a subexpression
+                const { ast: expr, error } = this.attemptSubtree(this.expression);
+                if (error) return this.getError(error);
+
+                lhs = expr;
+
+                // ensure the next token after the sub expression is a closing paren, and consume it
+                const { error: closeError } = this.tokenStream.consume(lex.TokenType.Op, ")");
+                if (closeError) return this.getError(closeError);
+            }
+            else if (PrefixBindingPower[token] !== undefined)
+            {
+                // this is a prefix operator of the given power
+                // get the expression it applies to...
+                const { ast: rhs, error } = this.attemptSubtree(this.expressionInternal, PrefixBindingPower[token]);
+                if (error) return this.getError(error);
+
+                // ... and generate an AST node for it
+                // this becomes the new LHS of the operator we are parsing
+                lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
+            }
+            else
+            {
+                return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
+            }
+        }
+        else if (type == lex.TokenType.Word)
+        {
+            if (Keywords.includes(token))
+            {
+                // TODO: handle keywords that can go in expressions
+                return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
+            }
+            else if (Literals[token] !== undefined)
+            {
+                // this is a hardcoded keyword literal, we already defined how those nodes look
+                lhs = Literals[token];
+            }
+            else
+            {
+                // some unresolved identifier (this is the domain of the runtime/interpreter!)
+                lhs = {node:ASTNode.Identifier, value:token};
+            }
+        }
+        else if (type == lex.TokenType.NumberLit)
+        {
+            lhs = {node:ASTNode.Literal, value:parseFloat(token)};
+        }
+        else if (type == lex.TokenType.StringLit)
+        {
+            lhs = {node:ASTNode.Literal, value:token};
+        }
+        else
+        {
+            return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
+        }
+
+        while (!this.tokenStream.eof())
+        {
+            // start grabbing further operators and expressions, that don't have higher precedence
+            // (higher precedence operators will be parsed as sub-expressions)
+
+            /* Expecting an operator now
+             * Depending on its precedence, we may not actually get to parse it ourselves,
+             * so don't actually consume the token yet, just peek it (just in case)
+             */
+            const {type:opType, token:opToken, index:opIndex, error} = this.tokenStream.peek(lex.TokenType.Op);
+            if (error) return this.getError(error);
+
+            if (PostfixBindingPower[opToken] !== undefined)
+            {
+                // this is a post-fix operator, but it may apply to...
+                if (PostfixBindingPower[opToken] < minBindingPower)
+                {
+                    //... some outer expression (we were too low power to bind to lhs)
+                    break;
+                }
+
+                //... or our current lhs, in which case we get to consume the token
+                this.tokenStream.consume();
+
+                if (opToken == "[")
+                {
+                    /* Parsing an index on the current lhs
+                     * We need to parse the index expression. Note we reset the binding power,
+                     * this is because we will always end with ], so we don't need to worry about our current precedence
+                     */
+                    const { ast: index, error } = this.attemptSubtree(this.expression);
+                    if (error) return this.getError(error);
+
+                    // parse the closing ]
+                    const { error: closeError } = this.tokenStream.consume(lex.TokenType.Op, "]");
+                    if (closeError) return this.getError(closeError);
+
+                    // our new node is the old lhs indexed with the index expression
+                    lhs = {node:ASTNode.Index, lhs:lhs, index:index};
+                }
+                else if (opToken == "(")
+                {
+                    /* Parsing a function call
+                     * lhs represents the function we are calling
+                     * We need to parse a list of expressions separated by , as the arguments
+                     */
+                    let args = [];
+                    while (true)
+                    {
+                        if (this.tokenStream.eof()) return this.error("Unclosed argument list, expected ',' or ')'");
+
+                        // peek the next op
+                        const {token:nextToken, type:nextType, index:nextIndex, error} = this.tokenStream.peek();
+                        if (error) return this.getError(error);
+
+                        if (nextType == lex.TokenType.Op && nextToken == ")")
+                        {
+                            // no more arguments, we should end the call (after consuming the bracket)
+                            this.tokenStream.consume();
+                            break;
+                        }
+
+                        if (args.length > 0)
+                        {
+                            // if we have already parsed args, we need to consume a comma
+                            const { error: commaError } = this.tokenStream.consume(lex.TokenType.Op, ",");
+                            if (commaError) return this.getError(commaError);
+                        }
+
+                        const { ast: arg, error: argError } = this.attemptSubtree(this.expression);
+                        if (argError) return this.getError(argError);
+
+                        // collect the argument expression
+                        args.push(arg);
+                    }
+
+                    // generate the AST node: the old lhs as a function, called with the argument expressions
+                    lhs = {node:ASTNode.Call, func:lhs, args:args};
+                }
+                else
+                {
+                    return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringIndexToLinePos(opIndex) });
+                }
+
+                // after parsing a postfix operator, we are expecting an operator again
+                continue;
+            }
+
+            // if we didn't get an infix operator, we are done with this expression
+            if (InfixBindingPower[opToken] === undefined)
+            {
+                break;
+            }
+
+            const [leftBinding, rightBinding] = InfixBindingPower[opToken];
+
+            // if the binding power is too low, we are done parsing the current subexpression, leave
+            // the current operator for our caller...
+            if (leftBinding < minBindingPower) break;
+
+            //... otherwise, this operator is for us, so consume it
+            this.tokenStream.consume();
+
+            // the next expression is the right side of our current operator, but only as much as the
+            // right-side binding power of our operator can grab
+            const { ast: rhs, error: rhsError } = this.attemptSubtree(this.expressionInternal, rightBinding);
+            if (rhsError) return this.getError(rhsError);
+
+            // our new lhs becomes our applied operator,
+            // leaving us free to match more operators of an appropriate precedence
+            lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
+        }
+
+        return { ast: lhs, error: null };
+    }
+
+    error(msg, at)
+    {
+        let loc = this.tokenStream.tokenIndexToLinePos(this.tokenStream.index);
+        if (at)
+        {
+            if (at.index !== undefined)
+            {
+                loc = this.tokenStream.tokenToLinePos(at);
+            }
+            else
+            {
+                loc = this.tokenStream.tokenIndexToLinePos(at);
+            }
+        }
+        return { ast:null, error: { msg: msg, ...loc } };
+    }
+
+    attemptSubtree(parseFunction, ...args)
+    {
+        this.tokenStream.beginMatch();
+        const subtree = parseFunction.apply(this, args);
+
+        if (subtree.error)
+        {
+            this.tokenStream.failedMatch();
+        }
+        else
+        {
+            this.tokenStream.doneMatch();
+        }
+
+        return subtree;
+    }
+
+    getError(error)
+    {
+        return { ast: null, error: error };
+    }
+}
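To see what the binding-power tables buy, here is a minimal sketch of driving the new Parser. It assumes parse.js eventually gains a module.exports line for Parser, which this diff does not show.

const { TokenStream } = require('./lex');
const { Parser } = require('./parse'); // assumes: module.exports = { Parser };

const parser = new Parser(new TokenStream('1 + 2 * 3'));
const { ast, error } = parser.expression();

// '*' binds more powerfully than '+', so the expected shape is roughly:
// { node: InfixOp, op: '+',
//   lhs: { node: Literal, value: 1 },
//   rhs: { node: InfixOp, op: '*',
//          lhs: { node: Literal, value: 2 },
//          rhs: { node: Literal, value: 3 } } }

The recursive expressionInternal call with rightBinding is what makes this work: when parsing the right side of '+', the recursion consumes '2 * 3' wholesale because '*' binds more powerfully, then returns control so '+' can close over the finished subtree.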
TokenType.keyName(type) + " '" + value + "', not '" + token.value + "'", ...this.tokenToLinePos(token) } }; + } + } + } + + return token; + } + + consume(type = undefined, value = undefined) + { + const token = this.peek(type, value); + if (token.error) return error; + + this.index++; + return token; + } + + beginMatch() + { + this.savedLocations.push(this.index); + } + + doneMatch() + { + console.assert(this.savedLocations.length > 0, "Unexpected doneMatch()"); + this.savedLocations.pop(); + } + + failedMatch() + { + console.assert(this.savedLocations.length > 0, "Unexpected failedMatch()"); + this.index = this.savedLocations.pop(); + } + + stringIndexToLinePos(index) + { + index = Math.max(0, Math.min(this.inputLength - 1, index)); + + let line = this.newlines.length + 1; + let previousLinesLength = 0; + for (let i = 0; i < this.newlines.length; i++) + { + if (index <= this.newlines[i]) + { + line = i + 1; + break; + } + previousLinesLength += this.newlines[i] + 1; + } + return {line:line, pos:index - previousLinesLength + 1}; + } + + tokenIndexToLinePos(index) + { + if (index < 0 || index >= this.tokens.length) return this.stringIndexToLinePos(this.inputLength - 1); + + return this.tokenToLinePos(this.tokens[index]); + } + + tokenToLinePos(token) + { + return this.tokenIndexToLinePos(token.index); + } +} + +/* takes a string, and returns a flat array of tokens, and a list of the locations of newlines in the source + * + * recognized types: + * Word: as a token type, reprsents any symbol/word: variable names, or actual keywords + * NumberLit: a literal value for the number type + * StringLit: a literal value for the string type + * Op: an operator (usually a single character of punctuation) + */ +function lex(input) +{ + let inputLength = input.length; + + let newlines = []; + + let i = 0; + while ((i = input.indexOf("\n", i)) != -1) + { + newlines.push(i); + i++; + } + + // break the input down into "parts": either words, digit strings, single character operators, or whitespace + let parts = []; + + // track where we are in the original string + let index = 0; + while (input.length > 0) + { + // match our possible sequences of more than one character (words, digits, or whitespace) + if ((match = input.match(/^\w+/)) || + (match = input.match(/^\d+/)) || + (match = input.match(/^\s+/))) + { + const matched = match[0]; + parts.push({token:matched, index:index}); + + // advance the string + input = input.substr(matched.length); + index += matched.length; + } + else + { + // everything else is a single character (punctuation) + parts.push({token:input[0], index:index}); + + input = input.substr(1); + index++; + } + } + + let tokens = []; + + for (let i = 0; i < parts.length; i++) + { + const partsLeft = parts.length - i - 1; + + let { token, index } = parts[i]; + + if (token.trim().length == 0) + { + // remove whitespace (whitespace in a string is handled in the block that handles the " token below) + continue; + } + + if (token.match(/\d+/)) + { + // try and get the decimal, if any + if (partsLeft > 2 && parts[i + 1].token == "." && parts[i + 2].token.match(/\d+/)) + { + token += "." 
+ parts[i + 2].token; + } + tokens.push({type:TokenType.NumberLit, token:token, index:index}); + } + else if (token == '"') + { + // we are starting a string literal + + // don't include the initial " + token = ""; + + let closed = false; + for (let j = i + 1; j < parts.length; j++) + { + // find an escape character + if (parts[j].token == '\\') + { + if (j + 1 < parts.length) + { + // the escape character is the next character (all tokens are guaranteed length > 0) + const escaped = parts[j + 1].token[0]; + + // remove the escaped character from the next token, being sure to remove the token + // if it was only that 1 character + parts[j + 1].token = parts[j + 1].token.substr(1); + if (parts[j + 1].token.length == 0) parts.splice(j + 1, 1); + + // for now, just put the escaped character in literally, we'll deal with escape sequences later + token += escaped; + } + } + else if (parts[j].token == '"') + { + // close the string + i = j; + closed = true; + break; + } + else + { + // build up the string out of all tokens before the closing " + token += parts[j].token; + } + } + + if (!closed) + { + throw "Unclosed string"; + } + + tokens.push({type:TokenType.StringLit, token:token, index:index}); + } + else if (token.match(/\w+/)) + { + // any keyword or identifier + tokens.push({type:TokenType.Word, token:token, index:index}); + } + else + { + // anything left is an operator + tokens.push({type:TokenType.Op, token:token, index:index}); + } + } + + // keywords and identifiers are case insensitive + return { + tokens: tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.trim().toLowerCase() } : t), + newlines: newlines, + }; +} + +module.exports = { TokenStream: TokenStream, TokenType: TokenType }; diff --git a/parse.js b/parse.js index c81e300..86ab105 100644 --- a/parse.js +++ b/parse.js @@ -1,145 +1,6 @@ -const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 }); +const lex = require('lex'); +const utils = require('utils'); -/* takes a string, and returns a flat array of tokens, and a list of the locations of newlines in the source - * - * recognized types: - * Word: as a token type, reprsents any symbol/word: variable names, or actual keywords - * NumberLit: a literal value for the number type - * StringLit: a literal value for the string type - * Op: an operator (usually a single character of punctuation) - */ -function lex(input) -{ - let inputLength = input.length; - - let newlines = []; - - let i = 0; - while ((i = input.indexOf("\n", i)) != -1) - { - newlines.push(i); - i++; - } - - // break the input down into "parts": either words, digit strings, single character operators, or whitespace - let parts = []; - - // track where we are in the original string - let index = 0; - while (input.length > 0) - { - // match our possible sequences of more than one character (words, digits, or whitespace) - if ((match = input.match(/^\w+/)) || - (match = input.match(/^\d+/)) || - (match = input.match(/^\s+/))) - { - const matched = match[0]; - parts.push({token:matched, index:index}); - - // advance the string - input = input.substr(matched.length); - index += matched.length; - } - else - { - // everything else is a single character (punctuation) - parts.push({token:input[0], index:index}); - - input = input.substr(1); - index++; - } - } - - let tokens = []; - - for (let i = 0; i < parts.length; i++) - { - const partsLeft = parts.length - i - 1; - - let { token, index } = parts[i]; - - if (token.trim().length == 0) - { - // remove whitespace (whitespace in a string is 
handled in the block that handles the " token below) - continue; - } - - if (token.match(/\d+/)) - { - // try and get the decimal, if any - if (partsLeft > 2 && parts[i + 1].token == "." && parts[i + 2].token.match(/\d+/)) - { - token += "." + parts[i + 2].token; - } - tokens.push({type:TokenType.NumberLit, token:token, index:index}); - } - else if (token == '"') - { - // we are starting a string literal - - // don't include the initial " - token = ""; - - let closed = false; - for (let j = i + 1; j < parts.length; j++) - { - // find an escape character - if (parts[j].token == '\\') - { - if (j + 1 < parts.length) - { - // the escape character is the next character (all tokens are guaranteed length > 0) - const escaped = parts[j + 1].token[0]; - - // remove the escaped character from the next token, being sure to remove the token - // if it was only that 1 character - parts[j + 1].token = parts[j + 1].token.substr(1); - if (parts[j + 1].token.length == 0) parts.splice(j + 1, 1); - - // for now, just put the escaped character in literally, we'll deal with escape sequences later - token += escaped; - } - } - else if (parts[j].token == '"') - { - // close the string - i = j; - closed = true; - break; - } - else - { - // build up the string out of all tokens before the closing " - token += parts[j].token; - } - } - - if (!closed) - { - throw "Unclosed string"; - } - - tokens.push({type:TokenType.StringLit, token:token, index:index}); - } - else if (token.match(/\w+/)) - { - // any keyword or identifier - tokens.push({type:TokenType.Word, token:token, index:index}); - } - else - { - // anything left is an operator - tokens.push({type:TokenType.Op, token:token, index:index}); - } - } - - // keywords and identifiers are case insensitive - return { - tokens: tokens.map(t => t.type == TokenType.Word ? 
{ ...t, token:t.token.trim().toLowerCase() } : t), - newlines: newlines, - inputLength: inputLength, - }; -} /* How powerfully infix operators bind to their left and right arguments * If a value is bound more powerfully from one side, it becomes part of the operator @@ -177,24 +38,6 @@ "(": 60, }); -// helper to grab the first item of an array, and throw a specific string if the array is empty -Object.defineProperty(Array.prototype, 'shiftThrow', { - enumerable: false, - value: function(err) { - if (this.length == 0) throw err; - return this.shift(); - } -}); - -// helper to get the name of an enum from one of its valuesObj[k] == value) || "INVALID_ENUM_VALUE"; -Object.defineProperty(Object.prototype, 'keyName', { - enumerable: false, - value: function(value) - { - return Object.keys(this).find(k => this[k] == value) || "INVALID_ENUM_VALUE"; - } -}); - // all reserved keywords const Keywords = Object.freeze([ "if" ]); @@ -214,278 +57,262 @@ "false": {node:ASTNode.Literal, value:false}, }); -function stringIndexToLinePos(index, newlines) +class Parser { - let line = newlines.length + 1; - let previousLinesLength = 0; - for (let i = 0; i < newlines.length; i++) + constructor(tokenStream) { - if (index <= newlines[i]) - { - line = i + 1; - break; - } - previousLinesLength += newlines[i] + 1; + this.tokenStream = tokenStream; + this.tokenStream.index = 0; } - return {line:line, pos:index - previousLinesLength + 1}; -} - -/* The main entry point for parsing - * - * Takes an array of tokens from the lexer, and produces an AST of the program as raw objects - */ -function parseExpression(tokenInfo) -{ - return parseExpressionInternal(tokenInfo, 0); -} -/* Parses an expression consisting of a set of values, all separated by infix operators, - * as long none of the operators bind at least as powerfully as 'minBindingPower' - * - * This function will be called recursively to parse sub-expression, with the - * minBindingPower set to whatever the last seen infix operator was. 
This ensures - * that we never grab operators that can't be part of the subtree because their precedence - * should have had them happen earlier - * - * The while loop in the middle of this function is the main feature that makes this a pratt parser, - * and separates it from a standard recursive decent parser where you have to manually match on only - * certain terms to maintain precedence - * - * Produces an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications - * of operators with higher precedence) - * The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information - */ -function parseExpressionInternal(tokenInfo, minBindingPower) -{ - let {tokens, newlines, inputLength} = tokenInfo; - - let eofError = "Expected more input at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines)); - - // grab the type of the next token, and its string representation - const {type, token, index:tokenIndex} = tokens.shiftThrow(eofError); - - // start parsing the left-hand-side of the next operator at this AST node - let lhs = null; - if (type == TokenType.Op) + expression(tokenStream) { - // the value of this part of the expression starts with an operator - // it will either be prefix operator, or a bracketed expression - if (token == "(") - { - // parse a subexpression - lhs = parseExpressionsInternal(tokens, 0); + return expressionInternal(tokenStream, 0); + } - // ensure the next token after the sub expression is a closing paren, and consume it - const {token:closeToken, type:closeType, index:closeIndex} = tokens.shiftThrow("Expecting ')', not '" + closeToken + "' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines))); - if (closeType != TokenType.Op && closeToken != ")") + /* Parses an expression consisting of a set of values, all separated by infix operators, + * as long none of the operators bind at least as powerfully as 'minBindingPower' + * + * This function will be called recursively to parse sub-expression, with the + * minBindingPower set to whatever the last seen infix operator was. 
This ensures + * that we never grab operators that can't be part of the subtree because their precedence + * should have had them happen earlier + * + * The while loop in the middle of this function is the main feature that makes this a pratt parser, + * and separates it from a standard recursive decent parser where you have to manually match on only + * certain terms to maintain precedence + * + * Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications + * of operators with higher precedence) + * The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information + * + */ + expressionInternal(minBindingPower) + { + // grab the type of the next token, and its string representation + const {type, token, index:tokenIndex, error} = this.tokenStream.consume(); + if (error) return this.getError(error); + + // start parsing the left-hand-side of the next operator at this AST node + let lhs = null; + if (type == lex.TokenType.Op) + { + // the value of this part of the expression starts with an operator + // it will either be prefix operator, or a bracketed expression + if (token == "(") { - throw "Expecting ')', not '" + closeToken + "' at " + JSON.stringify(stringIndexToLinePos(closeIndex, newlines)); + // parse a subexpression + const { expr, error } = this.attemptSubtree(this.expression); + if (error) return this.getError(error); + + lhs = expr; + + // ensure the next token after the sub expression is a closing paren, and consume it + const { error } = tokens.consume(lex.TokenType.Op, ")"); + if (error) return this.getError(error); + } + else if ((power = PrefixBindingPower[token]) !== undefined) + { + // this is a prefix operator of the given power + // get the expression it applies to... + const { rhs, error } = this.attemptSubtree(expressionInternal, power); + if (error) return this.getError(error); + + // ... and generate an AST node for it + // this becomes the new LHS of the operator we are parsing + lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token}; + } + else + { + return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) }); } } - else if ((power = PrefixBindingPower[token]) !== undefined) + else if (type == lex.TokenType.Word) { - // this is a prefix operator of the given power - // get the expression it applies to... - const rhs = parseExpressionInternal({...tokenInfo, tokens:tokens}, power); - // ... and generate an AST node for it - // this becomes the new LHS of the operator we are parsing - lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token}; - } - else - { - throw "Invalid operator '" + token + "' at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines)); - } - } - else if (type == TokenType.Word) - { - if (Keywords.includes(token)) - { - // TODO: handle keywords that can go in expressions - throw "Keyword '" + token + "' not expected at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines)); - } - else if (Literals[token] !== undefined) - { - // this is a hardcoded keyword literal, we already defined how those nodes look - lhs = Literals[token]; - } - else - { - // some unresolved identifier (this is the domain of the runtime/interpreter!) 
- lhs = {node:ASTNode.Identifier, value:token}; - } - } - else if (type == TokenType.NumberLit) - { - lhs = {node:ASTNode.Literal, value:parseFloat(token)}; - } - else if (type == TokenType.StringLit) - { - lhs = {node:ASTNode.Literal, value:token}; - } - else - { - throw "Unexpcted token " + TokenType.keyName(type) + " of '" + token + "' at " + JSON.stringify(stringIndexToLinePos(tokenIndex, newlines)); - } - - while (tokens.length > 0) - { - // start grabbing further operators and expression, that don't have higher precedence - // (higher precedence operators will be parsed as sub-expressions) - - /* Expecting an operator now - * Depending on its precedence, we may not actually get to parse it ourselves, - * so don't actually consume the token yet, just peek it (just in case) - */ - const {type:opType, token:opToken, index:opIndex} = tokens[0]; - if (opType != TokenType.Op) - { - throw "Instead of " + TokenType.keyName(opType) + " of '" + opToken + "', expected operator at " + JSON.stringify(stringIndexToLinePos(opIndex, newlines)); - } - - if (PostfixBindingPower[opToken] !== undefined) - { - // this is a post-fix operator, but it may apply to... - if (PostfixBindingPower[opToken] < minBindingPower) + if (Keywords.includes(token)) { - //... some outer expression (we were too low power to bind to lhs) + // TODO: handle keywords that can go in expressions + return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringIndexToLinePos(tokenIndex) }); + } + else if (Literals[token] !== undefined) + { + // this is a hardcoded keyword literal, we already defined how those nodes look + lhs = Literals[token]; + } + else + { + // some unresolved identifier (this is the domain of the runtime/interpreter!) + lhs = {node:ASTNode.Identifier, value:token}; + } + } + else if (type == TokenType.NumberLit) + { + lhs = {node:ASTNode.Literal, value:parseFloat(token)}; + } + else if (type == TokenType.StringLit) + { + lhs = {node:ASTNode.Literal, value:token}; + } + else + { + return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) }); + } + + while (!this.tokenStream.eof()) + { + // start grabbing further operators and expression, that don't have higher precedence + // (higher precedence operators will be parsed as sub-expressions) + + /* Expecting an operator now + * Depending on its precedence, we may not actually get to parse it ourselves, + * so don't actually consume the token yet, just peek it (just in case) + */ + const {type:opType, token:opToken, index:opIndex, error} = this.tokenStream.peek(lex.TokenType.Op); + if (error) return this.getError(error); + + if (PostfixBindingPower[opToken] !== undefined) + { + // this is a post-fix operator, but it may apply to... + if (PostfixBindingPower[opToken] < minBindingPower) + { + //... some outer expression (we were too low power to bind to lhs) + break; + } + + //... or our current lhs, in which case we get to consume the token + this.tokenStream.consume(); + + if (opToken == "[") + { + /* Parsing an index on the current lhs + * We need to parse the index expression. 
Note we reset the binding power, + * this is because we will always end with ], so we don't need to worry about our current precedence + */ + const { index, error } = this.attemptSubtree(this.expression); + if (error) return this.getError(error); + + // parse the closing ] + const { error } = this.tokenStream.consume(lex.TokenType.Op, "]"); + if (error) return this.getError(error); + + // our new node is the old lhs indexed with the list of argument expressions + lhs = {node:ASTNode.Index, lhs:lhs, index:index}; + } + else if (opToken == "(") + { + /* Parsing a function call + * lhs represents the function we are calling + * We need to parse a list of expression separated by , as the arguments + */ + let args = []; + while (true) + { + if (tokens.length == 0) throw "Unclosed argument list, expected ',' or ')' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines)); + + // peek the next op + const {token:nextToken, type:nextType, index:nextIndex, error} = this.tokenStream.peek(); + if (error) return this.getError(error); + + if (nextType == TokenType.Op && nextToken == ")") + { + // no more arguments, we should end the call (after consuming the bracket) + this.tokenStream.consume(); + break; + } + + if (args.length > 0) + { + const { error } = this.tokenStream.consume(lex.TokenType.Op, ","); + if (error) return this.getError(error); + } + + const { arg, error } = this.attemptSubtree(this.expression); + if (error) return this.getError(error); + + // our new node is the old lhs as a function, called with the index expression + args.push(arg); + } + + // generate the AST node + lhs = {node:ASTNode.Call, func:lhs, args:args}; + } + else + { + return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringIndexToLinePos(opIndex) }); + } + + // after parsing a postfix operator, we are expecting an operator again + continue; + } + + // if we didn't get an infix operator, we are done this expression + if (InfixBindingPower[opToken] === undefined) + { break; } - //... or our current lhs, in which case we get to consume the token - tokens.shift(); + [leftBinding, rightBinding] = InfixBindingPower[opToken]; - if (opToken == "[") - { - /* Parsing an index on the current lhs - * We need to parse the index expression. Note we reset the binding power, - * this is because we will always end with ], so we don't need to worry about our current precedence - */ - let index = parseExpressionInternal({...tokenInfo, tokens:tokens}, 0); + // if the binding power is too low, we are done parsing the current subexpression, leave + // the current operator for our caller... + if (leftBinding < minBindingPower) break; - // parse the closing ] - const {token:closeToken, type:closeType, index:closeIndex} = tokens.shiftThrow("Unclosed indexing expression, expecting ']' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines))); - if (closeType != TokenType.Op && closeToken != "]") - { - throw "Unclosed indexing expression, expecting ']' at " + JSON.stringify(stringIndexToLinePos(closeIndex, newlines)); - } + //... 
+            // if we didn't get an infix operator, we are done this expression
+            if (InfixBindingPower[opToken] === undefined)
+            {
+                break;
+            }
+
+            const [leftBinding, rightBinding] = InfixBindingPower[opToken];
+
+            // if the binding power is too low, we are done parsing the current subexpression, leave
+            // the current operator for our caller...
+            if (leftBinding < minBindingPower) break;
+
+            //... otherwise, this operator is for us, so consume it
+            this.tokenStream.consume();
+
+            // the next expression is the right side of our current operator, but only as much as the
+            // right-side binding power of our operator can grab
+            const { ast: rhs, error: rhsError } = this.attemptSubtree(this.expressionInternal, rightBinding);
+            if (rhsError) return this.getError(rhsError);
+
+            // our new lhs becomes our applied operator,
+            // leaving us free to match more operators of an appropriate precedence
+            lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
+        }
+
+        return { ast: lhs, error: null };
+    }
+
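+    /* Convention: every parse method returns { ast, error } with exactly one side set,
+     * rather than throwing; attemptSubtree() (below, after the removed helpers) relies
+     * on this to decide whether to roll the token stream back.
+     */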
-// helper to map over an object's keys recursively
-Object.defineProperty(Object.prototype, 'mapKey', {
-    enumerable: false,
-    value: function(keyName, mapFunc)
-    {
-        let ret = {};
-        for (let k in this)
-        {
-            if (k == keyName)
-            {
-                ret[k] = mapFunc(this[k]);
-            }
-            else if (typeof(this[k]) == 'object')
-            {
-                ret[k] = { ...this[k] }.mapKey(keyName, mapFunc);
-            }
-            else
-            {
-                ret[k] = this[k];
-            }
-        }
-        return ret;
-    }
-});
-
-fs = require('fs');
-console.format = v => console.dir(v, { depth: null });
-
-function debugExpr(expr)
-{
-    const tokens = lex(expr);
-    console.log(tokens.mapKey("type", v => TokenType.keyName(v)));
-
-    const ast = parseExpression(tokens);
-    console.format(ast.mapKey("node", v => ASTNode.keyName(v)));
-}
-
-//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
-//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');
-
-debugExpr(fs.readFileSync(process.argv[2], "utf8"));
+    error(msg, at)
+    {
+        let loc = this.tokenStream.tokenIndexToLinePos(this.tokenStream.index);
+        if (at)
+        {
+            if (at.index !== undefined)
+            {
+                loc = this.tokenStream.tokenToLinePos(at);
+            }
+            else
+            {
+                loc = this.tokenStream.tokenIndexToLinePos(at);
+            }
+        }
+        return { ast:null, error: { msg: msg, ...loc } };
+    }
+
+    attemptSubtree(parseFunction, ...args)
+    {
+        this.tokenStream.beginMatch();
+        const subtree = parseFunction.apply(this, args);
+
+        if (subtree.error)
+        {
+            this.tokenStream.failedMatch();
+        }
+        else
+        {
+            this.tokenStream.doneMatch();
+        }
+
+        return subtree;
+    }
+
+    getError(error)
+    {
+        return { ast: null, error: error };
+    }
+}
diff --git a/test.js b/test.js
new file mode 100644
index 0000000..fa2123d
--- /dev/null
+++ b/test.js
@@ -0,0 +1,15 @@
+fs = require('fs');
+lex = require('./lex');
+parse = require('./parse');
+
+function debugExpr(expr)
+{
+    const tokens = new lex.TokenStream(expr);
+    console.log(tokens.getPrettyTokenList()); //mapKey("type", v => TokenType.keyName(v));
+
+    const { ast, error } = parse.attemptExpression(tokens);
+    if (error) throw error;
+
+    console.format(parse.getPrettyAST(ast)); //ast.mapKey("node", v => ASTNode.keyName(v));
+}
+
+debugExpr(fs.readFileSync(process.argv[2], "utf8"));
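+
+// usage sketch (file path is illustrative): node test.js examples/expr.txt
+// prints the token list, then either the parsed AST or the parse error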
diff --git a/utils.js b/utils.js
new file mode 100644
index 0000000..69e8290
--- /dev/null
+++ b/utils.js
@@ -0,0 +1,37 @@
+(function() {
+    // helper to get the name of an enum key from one of its values
+    Object.defineProperty(Object.prototype, 'keyName', {
+        enumerable: false,
+        value: function(value)
+        {
+            return Object.keys(this).find(k => this[k] == value) || "INVALID_ENUM_VALUE";
+        }
+    });
+
+    // helper to map over an object's keys recursively
+    Object.defineProperty(Object.prototype, 'mapKey', {
+        enumerable: false,
+        value: function(keyName, mapFunc)
+        {
+            let ret = {};
+            for (let k in this)
+            {
+                if (k == keyName)
+                {
+                    ret[k] = mapFunc(this[k]);
+                }
+                else if (this[k] !== null && typeof(this[k]) == 'object')
+                {
+                    // guard against null above: typeof null is 'object'
+                    ret[k] = { ...this[k] }.mapKey(keyName, mapFunc);
+                }
+                else
+                {
+                    ret[k] = this[k];
+                }
+            }
+            return ret;
+        }
+    });
+
+    console.format = v => console.dir(v, { depth: null });
+})();
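+
+// usage sketch (hypothetical values): with an enum like { Word:1, NumberLit:2, ... },
+//   TokenType.keyName(2)                  -> "NumberLit"
+//   ({a:1, t:2}).mapKey("t", v => v * 10) -> {a:1, t:20}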