// Token categories produced by the lexer.
const TokenType = Object.freeze({ Word: 1, NumberLit: 2, StringLit: 3, Op: 4 });

/* Takes a string, and returns a flat array of tokens.
 *
 * Recognized types:
 *   Word:      any symbol/word: variable names, or actual keywords
 *   NumberLit: a literal value for the number type
 *   StringLit: a literal value for the string type
 *   Op:        an operator (usually a single character of punctuation)
 */
function lex(input) {
  // Break the input down into "parts": runs of word characters, runs of
  // whitespace, or single punctuation characters.
  // (\w already includes \d, so two alternatives cover words and digit runs.)
  const parts = [];
  while (input.length > 0) {
    const match = input.match(/^(?:\w+|\s+)/);
    if (match) {
      parts.push(match[0]);
      input = input.slice(match[0].length);
    } else {
      // everything else is a single character (punctuation)
      parts.push(input[0]);
      input = input.slice(1);
    }
  }

  const tokens = [];
  for (let i = 0; i < parts.length; i++) {
    const partsLeft = parts.length - i - 1; // parts remaining after index i
    let token = parts[i];

    if (token.trim().length === 0) {
      // drop whitespace (whitespace inside a string is handled by the '"' case below)
      continue;
    }

    if (/^\d+$/.test(token)) {
      // Number literal. The part splitter breaks "3.14" into ["3", ".", "14"],
      // so re-join "<digits> . <digits>" into one decimal token here.
      // (anchored regexes: a mixed part like "x2" must stay a Word, not NaN)
      if (partsLeft >= 2 && parts[i + 1] === "." && /^\d+$/.test(parts[i + 2])) {
        token += "." + parts[i + 2];
        i += 2; // consume the "." and the fraction digits
      }
      tokens.push({ type: TokenType.NumberLit, token: token });
    } else if (token === '"') {
      // Starting a string literal; don't include the opening quote.
      token = "";
      let closed = false;
      for (let j = i + 1; j < parts.length; j++) {
        if (parts[j] === "\\") {
          // Escape character: the escaped character is the first character of
          // the next part (all parts are guaranteed length > 0).
          if (j + 1 < parts.length) {
            const escaped = parts[j + 1][0];
            // Remove the escaped character from the next part, dropping the
            // part entirely if it was only that one character.
            parts[j + 1] = parts[j + 1].slice(1);
            if (parts[j + 1].length === 0) parts.splice(j + 1, 1);
            // For now, just put the escaped character in literally; we'll deal
            // with escape sequences (\n, \t, ...) later.
            token += escaped;
          }
        } else if (parts[j] === '"') {
          // Closing quote: resume the outer scan after it.
          i = j;
          closed = true;
          break;
        } else {
          // Build up the string out of all parts before the closing quote.
          token += parts[j];
        }
      }
      if (!closed) {
        throw "Unclosed string";
      }
      tokens.push({ type: TokenType.StringLit, token: token });
    } else if (/^\w+$/.test(token)) {
      // any keyword or identifier
      tokens.push({ type: TokenType.Word, token: token });
    } else {
      // anything left is an operator
      tokens.push({ type: TokenType.Op, token: token });
    }
  }

  // keywords and identifiers are case insensitive
  return tokens.map((t) =>
    t.type === TokenType.Word ? { ...t, token: t.token.trim().toLowerCase() } : t
  );
}

/* How powerfully infix operators bind to their left and right arguments.
 * If a value is bound more powerfully from one side, it becomes part of the
 * operator on that side first.
 *
 * The relative binding power of operators determines their precedence,
 * and the relative binding power of the left and right side of a specific
 * operator determines its associativity:
 * If the *right* side binds more powerfully, the operator becomes *left*
 * associative, as any instances of that operator that appear first in the
 * token stream are able to grab their right arguments first, before further
 * instances of the operator (and vice versa).
 */
const InfixBindingPower = Object.freeze({
  "+": [10, 11],
  "-": [10, 11],
  "*": [20, 21],
  ".": [100, 101],
});

/* How powerfully a prefix operator binds the value to its right.
 * Stacked prefix operators are unambiguous, so this value is only useful in
 * comparison to an infix or postfix operator, and determines which operators
 * this prefix will get "inside of".
 */
const PrefixBindingPower = Object.freeze({
  "-": 40,
});

/* How powerfully a postfix operator binds the value to its left.
 * (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
  "[": 50,
  "(": 60,
});

// Helper to grab the first item of an array, throwing the given value if the
// array is empty. (NOTE: this codebase deliberately throws plain strings.)
Object.defineProperty(Array.prototype, "shiftThrow", {
  enumerable: false,
  value: function (err) {
    if (this.length === 0) throw err;
    return this.shift();
  },
});

// Helper to get the name of an enum key from one of its values.
Object.defineProperty(Object.prototype, "keyName", {
  enumerable: false,
  value: function (value) {
    return Object.keys(this).find((k) => this[k] === value) || "INVALID_ENUM_VALUE";
  },
});

// all reserved keywords
const Keywords = Object.freeze(["if"]);

// the types of AST nodes
const ASTNode = Object.freeze({
  Literal: 1,    // value literal, could be string, boolean, number, etc.
  PrefixOp: 2,   // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
  InfixOp: 3,    // two sub-expressions with an operator between them
  Identifier: 4, // some unresolved identifier, i.e. variable_name_here
  Index: 5,      // some sub-expression being indexed by another expression, i.e. sub_expr[3]
  Call: 6,       // some sub-expression representing a function being called with a list of expressions
});

// all reserved words that represent literals
const Literals = Object.freeze({
  "true": { node: ASTNode.Literal, value: true },
  "false": { node: ASTNode.Literal, value: false },
});

/* The main entry point for parsing.
 *
 * Takes an array of tokens from the lexer (consumed destructively), and
 * produces an AST of the program as raw objects.
 */
function parseExpression(tokens) {
  return parseExpressionInternal(tokens, 0);
}

/* Parses an expression consisting of a set of values, all separated by infix
 * operators, as long as none of the operators bind at least as powerfully as
 * 'minBindingPower'.
 *
 * This function will be called recursively to parse sub-expressions, with
 * minBindingPower set to whatever the last seen infix operator was. This
 * ensures that we never grab operators that can't be part of the subtree
 * because their precedence should have had them happen earlier.
 *
 * The while loop in the middle of this function is the main feature that makes
 * this a Pratt parser, and separates it from a standard recursive descent
 * parser where you have to manually match on only certain terms to maintain
 * precedence.
 *
 * Produces an AST with operator precedence guaranteed to be respected
 * (sub-terms of operators are applications of operators with higher
 * precedence). The nodes of the AST are marked with their type, references to
 * the subnodes, plus any relevant information.
 */
function parseExpressionInternal(tokens, minBindingPower) {
  // grab the type of the next token, and its string representation
  const { type, token } = tokens.shiftThrow("No more input");

  // start parsing the left-hand-side of the next operator at this AST node
  let lhs = null;
  if (type === TokenType.Op) {
    // The value of this part of the expression starts with an operator:
    // it is either a prefix operator, or a bracketed expression.
    if (token === "(") {
      // Parse a subexpression. Binding power resets to 0 because the closing
      // ")" delimits it regardless of our current precedence.
      lhs = parseExpressionInternal(tokens, 0);
      // ensure the next token after the subexpression is a closing paren, and consume it
      const { token: closeToken, type: closeType } = tokens.shiftThrow("No more input");
      if (closeType !== TokenType.Op || closeToken !== ")") {
        throw "Unclosed parenthesis";
      }
    } else {
      const power = PrefixBindingPower[token];
      if (power === undefined) {
        throw "Bad operator " + token;
      }
      // This is a prefix operator of the given power: get the expression it
      // applies to, and generate an AST node that becomes our new LHS.
      const rhs = parseExpressionInternal(tokens, power);
      lhs = { node: ASTNode.PrefixOp, rhs: rhs, op: token };
    }
  } else if (type === TokenType.Word) {
    if (Keywords.includes(token)) {
      // TODO: handle keywords that can go in expressions
      throw "Keyword " + token + " not expected here";
    } else if (Literals[token] !== undefined) {
      // this is a hardcoded keyword literal; we already defined how those nodes look
      lhs = Literals[token];
    } else {
      // some unresolved identifier (this is the domain of the runtime/interpreter!)
      lhs = { node: ASTNode.Identifier, value: token };
    }
  } else if (type === TokenType.NumberLit) {
    lhs = { node: ASTNode.Literal, value: parseFloat(token) };
  } else if (type === TokenType.StringLit) {
    lhs = { node: ASTNode.Literal, value: token };
  } else {
    throw "Unexpected token " + TokenType.keyName(type) + " of " + token;
  }

  // Start grabbing further operators and expressions that don't have higher
  // precedence (higher precedence operators will be parsed as sub-expressions).
  while (tokens.length > 0) {
    /* Expecting an operator now.
     * Depending on its precedence, we may not actually get to parse it
     * ourselves, so don't consume the token yet — just peek it.
     */
    const { type: opType, token: opToken } = tokens[0];
    if (opType !== TokenType.Op) {
      throw "Operator expected here, instead of " + TokenType.keyName(opType) + " of " + opToken;
    }

    if (PostfixBindingPower[opToken] !== undefined) {
      // this is a postfix operator, but it may apply to...
      if (PostfixBindingPower[opToken] < minBindingPower) {
        // ...some outer expression (we were too low power to bind to lhs)
        break;
      }
      // ...or our current lhs, in which case we get to consume the token
      tokens.shift();
      if (opToken === "[") {
        /* Parsing an index on the current lhs.
         * We reset the binding power because the expression always ends with
         * "]", so our current precedence doesn't constrain it.
         */
        const index = parseExpressionInternal(tokens, 0);
        // parse the closing ]
        const { token: closeToken, type: closeType } = tokens.shiftThrow("No more input");
        if (closeType !== TokenType.Op || closeToken !== "]") {
          throw "Unclosed indexing expression";
        }
        // our new node is the old lhs indexed with the index expression
        lhs = { node: ASTNode.Index, lhs: lhs, index: index };
      } else if (opToken === "(") {
        /* Parsing a function call.
         * lhs represents the function we are calling; we need to parse a list
         * of expressions separated by "," as the arguments.
         */
        const args = [];
        while (true) {
          if (tokens.length === 0) throw "No more input";
          // peek the next token
          const { token: nextToken, type: nextType } = tokens[0];
          if (nextType === TokenType.Op && nextToken === ")") {
            // no more arguments; end the call (after consuming the bracket)
            tokens.shift();
            break;
          }
          if (args.length > 0) {
            // if we have already parsed args, we need to consume a comma
            if (nextType !== TokenType.Op || nextToken !== ",") {
              throw "Expected , or )";
            }
            tokens.shift();
          }
          args.push(parseExpressionInternal(tokens, 0));
        }
        // our new node is the old lhs as a function, called with the argument expressions
        lhs = { node: ASTNode.Call, func: lhs, args: args };
      } else {
        throw "Unexpected postfix operator " + opToken;
      }
      // after parsing a postfix operator, we are expecting an operator again
      continue;
    }

    // if we didn't get an infix operator, we are done this expression
    if (InfixBindingPower[opToken] === undefined) {
      break;
    }
    const [leftBinding, rightBinding] = InfixBindingPower[opToken];
    // If the binding power is too low, we are done parsing the current
    // subexpression; leave the current operator for our caller...
    if (leftBinding < minBindingPower) break;
    // ...otherwise, this operator is for us, so consume it.
    tokens.shift();

    // The next expression is the right side of our current operator, but only
    // as much as the right-side binding power of our operator can grab.
    const rhs = parseExpressionInternal(tokens, rightBinding);

    // Our new lhs becomes our applied operator, leaving us free to match more
    // operators of an appropriate precedence.
    lhs = { node: ASTNode.InfixOp, lhs: lhs, rhs: rhs, op: opToken };
  }
  return lhs;
}

// Helper to map over an object's (or array's) values for a given key,
// recursively. Used below to pretty-print enums in debug output.
// Arrays are preserved as arrays, and null values are passed through
// (spreading them would silently turn them into plain objects).
Object.defineProperty(Object.prototype, "mapKey", {
  enumerable: false,
  value: function (keyName, mapFunc) {
    if (Array.isArray(this)) {
      return this.map((v) =>
        v !== null && typeof v === "object" ? v.mapKey(keyName, mapFunc) : v
      );
    }
    const ret = {};
    for (const k in this) {
      const v = this[k];
      if (k === keyName) {
        ret[k] = mapFunc(v);
      } else if (v !== null && typeof v === "object") {
        ret[k] = v.mapKey(keyName, mapFunc);
      } else {
        ret[k] = v;
      }
    }
    return ret;
  },
});

// console.dir with unlimited depth, for dumping whole ASTs
console.format = (v) => console.dir(v, { depth: null });

// Lex, parse, and pretty-print an expression with enum values resolved to names.
function debugExpr(expr) {
  const tokens = lex(expr);
  console.log(tokens.mapKey("type", (v) => TokenType.keyName(v)));
  const ast = parseExpression(tokens);
  console.format(ast.mapKey("node", (v) => ASTNode.keyName(v)));
}

//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');

// CLI entry point: only run when a file path was actually provided.
if (process.argv[2] !== undefined) {
  const fs = require("fs");
  debugExpr(fs.readFileSync(process.argv[2], "utf8"));
}