const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 });
/* takes a string, and returns a flat array of tokens
 *
 * recognized types:
 * Word: as a token type, represents any symbol/word: variable names, or actual keywords
 * NumberLit: a literal value for the number type
 * StringLit: a literal value for the string type
 * Op: an operator (usually a single character of punctuation)
 */
function lex(input)
{
    // break the input down into "parts": \w runs (words and digit strings),
    // whitespace runs, or single character operators
    const parts = [];
    while (input.length > 0)
    {
        // match our possible sequences of more than one character
        // (note: \w already covers digit runs, so no separate \d+ case is needed)
        const match = input.match(/^\w+/) || input.match(/^\s+/);
        if (match)
        {
            const matched = match[0];
            parts.push(matched);
            input = input.slice(matched.length);
        }
        else
        {
            // everything else is a single character (punctuation)
            parts.push(input[0]);
            input = input.slice(1);
        }
    }
    const tokens = [];
    for (let i = 0; i < parts.length; i++)
    {
        const partsLeft = parts.length - i - 1;
        let token = parts[i];
        if (token.trim().length == 0)
        {
            // remove whitespace (whitespace in a string is handled in the block that handles the " token below)
            continue;
        }
        // anchored: only all-digit parts are numbers (an unanchored /\d+/ would
        // misclassify words like "abc3" as number literals)
        if (token.match(/^\d+$/))
        {
            // try and get the decimal, if any: we need at least 2 more parts, a "." and the fraction digits
            if (partsLeft >= 2 && parts[i + 1] == "." && parts[i + 2].match(/^\d+$/))
            {
                token += "." + parts[i + 2];
                // consume the "." and the fraction so they are not re-lexed as separate tokens
                i += 2;
            }
            tokens.push({type:TokenType.NumberLit, token:token});
        }
        else if (token == '"')
        {
            // we are starting a string literal
            // don't include the initial "
            token = "";
            let closed = false;
            for (let j = i + 1; j < parts.length; j++)
            {
                // find an escape character
                if (parts[j] == '\\')
                {
                    if (j + 1 < parts.length)
                    {
                        // the escape character is the next character (all tokens are guaranteed length > 0)
                        const escaped = parts[j + 1][0];
                        // remove the escaped character from the next token, being sure to remove the token
                        // if it was only that 1 character
                        parts[j + 1] = parts[j + 1].slice(1);
                        if (parts[j + 1].length == 0) parts.splice(j + 1, 1);
                        // for now, just put the escaped character in literally, we'll deal with escape sequences later
                        token += escaped;
                    }
                }
                else if (parts[j] == '"')
                {
                    // close the string, and resume lexing after it
                    i = j;
                    closed = true;
                    break;
                }
                else
                {
                    // build up the string out of all tokens before the closing "
                    token += parts[j];
                }
            }
            if (!closed)
            {
                throw "Unclosed string";
            }
            tokens.push({type:TokenType.StringLit, token:token});
        }
        else if (token.match(/^\w+$/))
        {
            // any keyword or identifier
            tokens.push({type:TokenType.Word, token:token});
        }
        else
        {
            // anything left is an operator
            tokens.push({type:TokenType.Op, token:token});
        }
    }
    // keywords and identifiers are case insensitive
    return tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.toLowerCase() } : t);
}
/* How powerfully infix operators bind to their left and right arguments
 * If a value is bound more powerfully from one side, it becomes part of the operator
 * on that side first
 *
 * The relative binding power of operators determines their precedence,
 * and the relative binding power of the left and right side of a specific operator
 * determines its associativity:
 * If the *right* side binds more powerfully, the operator becomes *left* associative,
 * as any instances of that operator that appear first in the token stream are able
 * to grab their right arguments first, before further instances of the operator
 * (and vice versa)
 */
const InfixBindingPower = Object.freeze({
"+": [10, 11], // additive: lowest precedence, left associative (right side binds tighter)
"-": [10, 11],
"*": [20, 21], // multiplicative: binds tighter than additive
".": [100, 101], // member access: binds tightest of all the infix operators
});
/* How powerfully a prefix operator binds the value to its right
 * Stacked prefix operators are unambiguous, so this value is only
 * useful in comparison to an infix or postfix operator, and determines
 * which operators this prefix will get "inside of"
 */
const PrefixBindingPower = Object.freeze({
"-": 40, // unary negation: tighter than infix arithmetic, looser than indexing/calls
});
/* How powerfully a postfix operator binds the value to its left
 * (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
"[": 50, // indexing, i.e. expr[index]
"(": 60, // function call, i.e. expr(args)
});
// helper: like shift(), but throws the supplied error value when the array is empty
Object.defineProperty(Array.prototype, 'shiftThrow', {
    enumerable: false,
    value: function(err)
    {
        if (this.length > 0)
        {
            return this.shift();
        }
        throw err;
    }
});
// helper to get the name of an enum key from one of its values
Object.defineProperty(Object.prototype, 'keyName', {
    enumerable: false,
    value: function(value)
    {
        // scan our own keys for the first one whose value matches (loosely - enum values are numbers)
        let found = undefined;
        for (const key of Object.keys(this))
        {
            if (this[key] == value)
            {
                found = key;
                break;
            }
        }
        return found || "INVALID_ENUM_VALUE";
    }
});
// all reserved keywords (words that may never be used as identifiers)
const Keywords = Object.freeze([ "if" ]);
// the types of AST nodes
const ASTNode = Object.freeze({
Literal:1, // value literal, could be string, boolean, number, etc.
PrefixOp:2, // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
InfixOp:3, // two sub-expressions with an operator between them
Identifier:4, // some unresolved identifier, i.e variable_name_here
Index:5, // some sub-expression being indexed by another expression, i.e. sub_expr[3]
Call:6, // some sub-expression representing a function being called with a list of expressions
});
// all reserved words that represent literals, mapped to their ready-made AST nodes
const Literals = Object.freeze({
"true": {node:ASTNode.Literal, value:true},
"false": {node:ASTNode.Literal, value:false},
});
/* The main entry point for parsing
 *
 * Takes the lexer's token array and builds an AST for the program out of plain objects
 */
function parseExpression(tokens)
{
    // a minimum binding power of 0 lets the parser consume every operator it encounters
    const initialBindingPower = 0;
    return parseExpressionInternal(tokens, initialBindingPower);
}
/* Parses an expression consisting of a set of values, all separated by infix operators,
* as long as all of the operators bind at least as powerfully as 'minBindingPower'
*
* This function will be called recursively to parse sub-expression, with the
* minBindingPower set to whatever the last seen infix operator was. This ensures
* that we never grab operators that can't be part of the subtree because their precedence
* should have had them happen earlier
*
* The while loop in the middle of this function is the main feature that makes this a pratt parser,
* and separates it from a standard recursive descent parser where you have to manually match on only
* certain terms to maintain precedence
*
* Produces an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
* of operators with higher precedence)
* The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
*/
function parseExpressionInternal(tokens, minBindingPower)
{
    // grab the type of the next token, and its string representation
    const {type, token} = tokens.shiftThrow("No more input");
    // start parsing the left-hand-side of the next operator at this AST node
    let lhs = null;
    if (type == TokenType.Op)
    {
        // the value of this part of the expression starts with an operator
        // it will either be a prefix operator, or a bracketed expression
        if (token == "(")
        {
            // parse a subexpression, resetting the binding power (the parens delimit it)
            // BUGFIX: this previously called the misspelled 'parseExpressionsInternal'
            lhs = parseExpressionInternal(tokens, 0);
            // ensure the next token after the sub expression is a closing paren, and consume it
            // BUGFIX: was '&&', which let any non-')' operator close the parenthesis
            const {token:closeToken, type:closeType} = tokens.shiftThrow("No more input");
            if (closeType != TokenType.Op || closeToken != ")")
            {
                throw "Unclosed parenthesis";
            }
        }
        else if (PrefixBindingPower[token] !== undefined)
        {
            // this is a prefix operator of the given power ('power' was an implicit global before)
            const power = PrefixBindingPower[token];
            // get the expression it applies to...
            const rhs = parseExpressionInternal(tokens, power);
            // ... and generate an AST node for it
            // this becomes the new LHS of the operator we are parsing
            lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
        }
        else
        {
            throw "Bad operator " + token;
        }
    }
    else if (type == TokenType.Word)
    {
        if (Keywords.includes(token))
        {
            // TODO: handle keywords that can go in expressions
            throw "Keyword " + token + " not expected here";
        }
        else if (Literals[token] !== undefined)
        {
            // this is a hardcoded keyword literal, we already defined how those nodes look
            lhs = Literals[token];
        }
        else
        {
            // some unresolved identifier (this is the domain of the runtime/interpreter!)
            lhs = {node:ASTNode.Identifier, value:token};
        }
    }
    else if (type == TokenType.NumberLit)
    {
        lhs = {node:ASTNode.Literal, value:parseFloat(token)};
    }
    else if (type == TokenType.StringLit)
    {
        lhs = {node:ASTNode.Literal, value:token};
    }
    else
    {
        throw "Unexpected token " + TokenType.keyName(type) + " of " + token;
    }
    while (tokens.length > 0)
    {
        // start grabbing further operators and expressions, that don't have higher precedence
        // (higher precedence operators will be parsed as sub-expressions)
        /* Expecting an operator now
         * Depending on its precedence, we may not actually get to parse it ourselves,
         * so don't actually consume the token yet, just peek it (just in case)
         */
        const {type:opType, token:opToken} = tokens[0];
        if (opType != TokenType.Op)
        {
            throw "Operator expected here, instead of " + TokenType.keyName(opType) + " of " + opToken;
        }
        if (PostfixBindingPower[opToken] !== undefined)
        {
            // this is a post-fix operator, but it may apply to...
            if (PostfixBindingPower[opToken] < minBindingPower)
            {
                //... some outer expression (we were too low power to bind to lhs)
                break;
            }
            //... or our current lhs, in which case we get to consume the token
            tokens.shift();
            if (opToken == "[")
            {
                /* Parsing an index on the current lhs
                 * We need to parse the index expression. Note we reset the binding power,
                 * this is because we will always end with ], so we don't need to worry about our current precedence
                 */
                const index = parseExpressionInternal(tokens, 0);
                // parse the closing ]
                // BUGFIX: was '&&', which accepted malformed closers like ')'
                const {token:closeToken, type:closeType} = tokens.shiftThrow("No more input");
                if (closeType != TokenType.Op || closeToken != "]")
                {
                    throw "Unclosed indexing expression";
                }
                // our new node is the old lhs indexed with the index expression
                lhs = {node:ASTNode.Index, lhs:lhs, index:index};
            }
            else if (opToken == "(")
            {
                /* Parsing a function call
                 * lhs represents the function we are calling
                 * We need to parse a list of expressions separated by , as the arguments
                 */
                const args = [];
                while (true)
                {
                    if (tokens.length == 0) throw "No more input";
                    // peek the next op
                    const {token:nextToken, type:nextType} = tokens[0];
                    if (nextType == TokenType.Op && nextToken == ")")
                    {
                        // no more arguments, we should end the call (after consuming the bracket)
                        tokens.shift();
                        break;
                    }
                    if (args.length > 0)
                    {
                        // if we have already parsed args, we need to consume a comma
                        if (nextType != TokenType.Op || nextToken != ",")
                        {
                            throw "Expected , or )";
                        }
                        tokens.shift();
                    }
                    // each argument is a full expression (binding power resets: , and ) delimit it)
                    args.push(parseExpressionInternal(tokens, 0));
                }
                // generate the AST node: the old lhs as a function, called with the argument expressions
                lhs = {node:ASTNode.Call, func:lhs, args:args};
            }
            else
            {
                throw "Unexpected postfix operator " + opToken;
            }
            // after parsing a postfix operator, we are expecting an operator again
            continue;
        }
        // if we didn't get an infix operator, we are done this expression
        if (InfixBindingPower[opToken] === undefined)
        {
            break;
        }
        // declared const: these were implicit globals before
        const [leftBinding, rightBinding] = InfixBindingPower[opToken];
        // if the binding power is too low, we are done parsing the current subexpression, leave
        // the current operator for our caller...
        if (leftBinding < minBindingPower) break;
        //... otherwise, this operator is for us, so consume it
        tokens.shift();
        // the next expression is the right side of our current operator, but only as much as the
        // right-side binding power of our operator can grab
        const rhs = parseExpressionInternal(tokens, rightBinding);
        // our new lhs becomes our applied operator,
        // leaving us free to match more operators of an appropriate precedence
        lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
    }
    return lhs;
}
// helper to map over all values of a given key in an object, recursively
// (used below to replace enum values with their readable names for debug output)
Object.defineProperty(Object.prototype, 'mapKey', {
    enumerable: false,
    value: function(keyName, mapFunc)
    {
        // BUGFIX: arrays used to be spread into plain objects ({ ...arr }) by the
        // object branch below, losing their array-ness in the output (e.g. Call
        // node 'args'); recurse into elements while keeping the array intact
        if (Array.isArray(this))
        {
            return this.map(v => (typeof(v) == 'object' && v !== null) ? v.mapKey(keyName, mapFunc) : v);
        }
        let ret = {};
        for (let k in this)
        {
            if (k == keyName)
            {
                // this is the key we are mapping: transform its value
                ret[k] = mapFunc(this[k]);
            }
            else if (typeof(this[k]) == 'object' && this[k] !== null)
            {
                // recurse into nested objects/arrays
                // (null is typeof 'object' too - copy it through unchanged instead of turning it into {})
                ret[k] = this[k].mapKey(keyName, mapFunc);
            }
            else
            {
                // plain value: copy through unchanged
                ret[k] = this[k];
            }
        }
        return ret;
    }
});
// node filesystem module, used to read the input program below
// (declared const: 'fs' was previously an implicit global)
const fs = require('fs');
// print a value without truncating deeply nested objects (console.log stops at a fixed depth)
console.format = v => console.dir(v, { depth: null });
// lex, parse, and pretty-print an expression, for debugging the pipeline end to end
function debugExpr(expr)
{
    // tokenize the expression and dump the token stream with readable type names
    const tokens = lex(expr);
    const printableTokens = tokens.mapKey("type", t => TokenType.keyName(t));
    console.log(printableTokens);
    // then parse it and dump the resulting AST with readable node names
    const printableAst = parseExpression(tokens).mapKey("node", n => ASTNode.keyName(n));
    console.format(printableAst);
}
// sample expressions kept around for quick manual testing:
//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');
// require an input file argument, failing with a usage message instead of an
// unhelpful readFileSync exception on undefined
if (process.argv.length < 3)
{
    console.error("usage: node " + process.argv[1] + " <source-file>");
    process.exit(1);
}
debugExpr(fs.readFileSync(process.argv[2], "utf8"));