const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 });

/* takes a string, and returns a flat array of tokens
 *
 * recognized types:
 *   Word:      represents any symbol/word: variable names, or actual keywords
 *   NumberLit: a literal value for the number type
 *   StringLit: a literal value for the string type
 *   Op:        an operator (usually a single character of punctuation)
 */
function lex(input)
{
    // break the input down into "parts": either \w+ runs (words or digit runs),
    // whitespace runs, or single punctuation characters
    // (\w already matches digits, so a separate ^\d+ match would be dead code)
    let parts = [];
    while (input.length > 0)
    {
        const match = input.match(/^\w+/) || input.match(/^\s+/);
        if (match)
        {
            parts.push(match[0]);
            input = input.slice(match[0].length);
        }
        else
        {
            // everything else is a single character (punctuation)
            parts.push(input[0]);
            input = input.slice(1);
        }
    }

    let tokens = [];

    for (let i = 0; i < parts.length; i++)
    {
        const partsLeft = parts.length - i - 1;
        let token = parts[i];

        if (token.trim().length == 0)
        {
            // drop whitespace (whitespace inside a string is handled in the
            // block that handles the " token below)
            continue;
        }

        if (token.match(/^\d+$/))
        {
            // try and join the decimal fraction, if any
            // (needs >= 2 parts left; was `> 2`, which missed e.g. "3.5" at end of input)
            if (partsLeft >= 2 && parts[i + 1] == "." && parts[i + 2].match(/^\d+$/))
            {
                token += "." + parts[i + 2];
                // consume the "." and the fraction so they are not tokenized again
                i += 2;
            }
            tokens.push({type:TokenType.NumberLit, token:token});
        }
        else if (token == '"')
        {
            // we are starting a string literal; don't include the initial "
            token = "";
            let closed = false;
            for (let j = i + 1; j < parts.length; j++)
            {
                if (parts[j] == '\\')
                {
                    if (j + 1 < parts.length)
                    {
                        // the escaped character is the first character of the
                        // next part (all parts are guaranteed length > 0)
                        const escaped = parts[j + 1][0];

                        // remove the escaped character from the next part, dropping
                        // the part entirely if it was only that 1 character
                        parts[j + 1] = parts[j + 1].slice(1);
                        if (parts[j + 1].length == 0) parts.splice(j + 1, 1);

                        // for now, just put the escaped character in literally;
                        // escape sequences (\n etc.) can be handled later
                        token += escaped;
                    }
                }
                else if (parts[j] == '"')
                {
                    // close the string
                    i = j;
                    closed = true;
                    break;
                }
                else
                {
                    // build up the string out of all parts before the closing "
                    token += parts[j];
                }
            }

            // NOTE(review): this guard was elided diff context, recovered from the
            // `closed` flag and the visible throw — verify against the original file
            if (!closed)
            {
                throw "Unclosed string";
            }

            tokens.push({type:TokenType.StringLit, token:token});
        }
        else if (token.match(/^\w+$/))
        {
            // any keyword or identifier
            tokens.push({type:TokenType.Word, token:token});
        }
        else
        {
            // anything left is an operator
            tokens.push({type:TokenType.Op, token:token});
        }
    }

    // keywords and identifiers are case insensitive
    return tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.trim().toLowerCase() } : t);
}

/* How powerfully infix operators bind to their left and right arguments
 * If a value is bound more powerfully from one side, it becomes part of the operator
 * on that side first
 *
 * The relative binding power of operators determines their precedence,
 * and the relative binding power of the left and right side of a specific operator
 * determines its associativity:
 * If the *right* side binds more powerfully, the operator becomes *left* associative,
 * as any instances of that operator that appear first in the token stream are able
 * to grab their right arguments first, before further instances of the operator
 * (and vice versa)
 */
const InfixBindingPower = Object.freeze({
    "+": [10, 11],
    "-": [10, 11],
    "*": [20, 21],
    ".": [100, 101],
});

/* How powerfully a prefix operator binds the value to its right
 * Stacked prefix operators are unambiguous, so this value is only
 * useful in comparison to an infix or postfix operator, and determines
 * which operators this prefix will get "inside of"
 */
const PrefixBindingPower = Object.freeze({
    "-": 40,
});

/* How powerfully a postfix operator binds the value to its left
 * (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
    "[": 50,
    "(": 60,
});

// helper to grab the first item of an array, and throw a specific string if the array is empty
Object.defineProperty(Array.prototype, 'shiftThrow', {
    enumerable: false,
    value: function(err)
    {
        // NOTE(review): body was elided diff context — recovered from the
        // documented contract and the call sites; verify against the original file
        if (this.length == 0) throw err;
        return this.shift();
    }
});

// helper to get the name of an enum key from one of its values
Object.defineProperty(Object.prototype, 'keyName', {
    enumerable: false,
    value: function(value)
    {
        return Object.keys(this).find(k => this[k] == value) || "INVALID_ENUM_VALUE";
    }
});

// all reserved keywords
const Keywords = Object.freeze([ "if" ]);

// the types of AST nodes
const ASTNode = Object.freeze({
    Literal:1,    // value literal, could be string, boolean, number, etc.
    PrefixOp:2,   // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
    InfixOp:3,    // two sub-expressions with an operator between them
    Identifier:4, // some unresolved identifier, i.e variable_name_here
    Index:5,      // some sub-expression being indexed by another expression, i.e. sub_expr[3]
    Call:6,       // some sub-expression representing a function being called with a list of expressions
});

// all reserved words that represent literals
const Literals = Object.freeze({
    "true": {node:ASTNode.Literal, value:true},
    "false": {node:ASTNode.Literal, value:false},
});

/* The main entry point for parsing
 *
 * Takes an array of tokens from the lexer, and produces an AST of the program as raw objects
 */
function parseExpression(tokens)
{
    return parseExpressionInternal(tokens, 0);
}

/* Parses an expression consisting of a set of values, all separated by infix operators,
 * as long as none of the operators bind at least as powerfully as 'minBindingPower'
 *
 * This function will be called recursively to parse sub-expressions, with the
 * minBindingPower set to whatever the last seen infix operator was. This ensures
 * that we never grab operators that can't be part of the subtree because their precedence
 * should have had them happen earlier
 *
 * The while loop in the middle of this function is the main feature that makes this a Pratt parser,
 * and separates it from a standard recursive descent parser where you have to manually match on only
 * certain terms to maintain precedence
 *
 * Produces an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
 * of operators with higher precedence)
 * The nodes of the AST are marked with their type, and references to the subnodes, plus any relevant information
 */
function parseExpressionInternal(tokens, minBindingPower)
{
    // grab the type of the next token, and its string representation
    const {type, token} = tokens.shiftThrow("No more input");

    // start parsing the left-hand-side of the next operator at this AST node
    let lhs = null;
    if (type == TokenType.Op)
    {
        // the value of this part of the expression starts with an operator
        // it will either be a prefix operator, or a bracketed expression
        if (token == "(")
        {
            // parse a subexpression
            // (fixed: was a call to the non-existent parseExpressionsInternal)
            lhs = parseExpressionInternal(tokens, 0);

            // ensure the next token after the sub expression is a closing paren, and consume it
            // (fixed: was `&&`, which accepted any operator as a closer)
            const {token:closeToken, type:closeType} = tokens.shiftThrow("No more input");
            if (closeType != TokenType.Op || closeToken != ")")
            {
                throw "Unclosed parenthesis";
            }
        }
        else
        {
            const power = PrefixBindingPower[token];
            if (power === undefined)
            {
                throw "Bad operator " + token;
            }
            // this is a prefix operator of the given power
            // get the expression it applies to, and generate an AST node for it;
            // this becomes the new LHS of the operator we are parsing
            const rhs = parseExpressionInternal(tokens, power);
            lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
        }
    }
    else if (type == TokenType.Word)
    {
        if (Keywords.includes(token))
        {
            // TODO: handle keywords that can go in expressions
            throw "Keyword " + token + " not expected here";
        }
        else if (Literals[token] !== undefined)
        {
            // this is a hardcoded keyword literal, we already defined how those nodes look
            lhs = Literals[token];
        }
        else
        {
            // some unresolved identifier (this is the domain of the runtime/interpreter!)
            lhs = {node:ASTNode.Identifier, value:token};
        }
    }
    else if (type == TokenType.NumberLit)
    {
        lhs = {node:ASTNode.Literal, value:parseFloat(token)};
    }
    else if (type == TokenType.StringLit)
    {
        lhs = {node:ASTNode.Literal, value:token};
    }
    else
    {
        throw "Unexpected token " + TokenType.keyName(type) + " of " + token;
    }

    while (tokens.length > 0)
    {
        // start grabbing further operators and expressions, that don't have higher precedence
        // (higher precedence operators will be parsed as sub-expressions)

        /* Expecting an operator now
         * Depending on its precedence, we may not actually get to parse it ourselves,
         * so don't actually consume the token yet, just peek it (just in case)
         */
        const {type:opType, token:opToken} = tokens[0];
        if (opType != TokenType.Op)
        {
            throw "Operator expected here, instead of " + TokenType.keyName(opType) + " of " + opToken;
        }

        if (PostfixBindingPower[opToken] !== undefined)
        {
            // this is a post-fix operator, but it may apply to...
            if (PostfixBindingPower[opToken] < minBindingPower)
            {
                //... some outer expression (we were too low power to bind to lhs)
                break;
            }
            //... or our current lhs, in which case we get to consume the token
            tokens.shift();

            if (opToken == "[")
            {
                /* Parsing an index on the current lhs
                 * We need to parse the index expression. Note we reset the binding power,
                 * this is because we will always end with ], so we don't need to worry about our current precedence
                 */
                let index = parseExpressionInternal(tokens, 0);

                // parse the closing ] (fixed: was `&&`, which accepted any operator as a closer)
                const {token:closeToken, type:closeType} = tokens.shiftThrow("No more input");
                if (closeType != TokenType.Op || closeToken != "]")
                {
                    throw "Unclosed indexing expression";
                }

                // our new node is the old lhs indexed by the index expression
                lhs = {node:ASTNode.Index, lhs:lhs, index:index};
            }
            else if (opToken == "(")
            {
                /* Parsing a function call
                 * lhs represents the function we are calling
                 * We need to parse a list of expressions separated by , as the arguments
                 */
                let args = [];
                while (true)
                {
                    if (tokens.length == 0) throw "No more input";

                    // peek the next op
                    const {token:nextToken, type:nextType} = tokens[0];
                    if (nextType == TokenType.Op && nextToken == ")")
                    {
                        // no more arguments, we should end the call (after consuming the bracket)
                        tokens.shift();
                        break;
                    }

                    if (args.length > 0)
                    {
                        // if we have already parsed args, we need to consume a comma
                        if (nextType != TokenType.Op || nextToken != ",")
                        {
                            throw "Expected , or )";
                        }
                        tokens.shift();
                    }

                    args.push(parseExpressionInternal(tokens, 0));
                }

                // our new node is the old lhs as a function, called with the argument list
                lhs = {node:ASTNode.Call, func:lhs, args:args};
            }
            else
            {
                throw "Unexpected postfix operator " + opToken;
            }

            // after parsing a postfix operator, we are expecting an operator again
            continue;
        }

        // if we didn't get an infix operator, we are done this expression
        if (InfixBindingPower[opToken] === undefined)
        {
            break;
        }
        const [leftBinding, rightBinding] = InfixBindingPower[opToken];

        // if the binding power is too low, we are done parsing the current subexpression, leave
        // the current operator for our caller...
        if (leftBinding < minBindingPower) break;

        //... otherwise, this operator is for us, so consume it
        tokens.shift();

        // the next expression is the right side of our current operator, but only as much as the
        // right-side binding power of our operator can grab
        let rhs = parseExpressionInternal(tokens, rightBinding);

        // our new lhs becomes our applied operator,
        // leaving us free to match more operators of an appropriate precedence
        lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
    }

    return lhs;
}

// helper to map over an object's values under a given key, recursively
// (fixed: arrays are preserved as arrays — spreading them into an object turned
// them into {0:..., 1:...} — and null no longer becomes {})
Object.defineProperty(Object.prototype, 'mapKey', {
    enumerable: false,
    value: function(keyName, mapFunc)
    {
        let ret = Array.isArray(this) ? [] : {};
        for (let k in this)
        {
            const v = this[k];
            if (k == keyName)
            {
                ret[k] = mapFunc(v);
            }
            else if (typeof v == 'object' && v !== null)
            {
                // recurse into sub-objects (arrays recurse too, and come back as arrays)
                ret[k] = v.mapKey(keyName, mapFunc);
            }
            else
            {
                ret[k] = v;
            }
        }
        return ret;
    }
});

console.format = v => console.dir(v, { depth: null });

// lex and parse the given expression, pretty-printing the token stream and the AST
// with enum values replaced by their names
function debugExpr(expr)
{
    const tokens = lex(expr);
    console.log(tokens.mapKey("type", v => TokenType.keyName(v)));

    const ast = parseExpression(tokens);
    console.format(ast.mapKey("node", v => ASTNode.keyName(v)));
}

//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');

// only run the CLI entry point when a file argument was actually provided;
// previously a bare run crashed inside readFileSync with an unhelpful TypeError
if (process.argv.length > 2)
{
    const fs = require('fs');
    debugExpr(fs.readFileSync(process.argv[2], "utf8"));
}