const lex = require('lex');
const utils = require('utils');
/* How powerfully infix operators bind to their left and right arguments
* If a value is bound more powerfully from one side, it becomes part of the operator
* on that side first
*
* The relative binding power of operators determines their precedence,
* and the relative binding power of the left and right side of a specific operator
* determines its associativity:
* If the *right* side binds more powerfully, the operator becomes *left* associative,
* as any instance of that operator that appear first in the token stream are able
* to grab their right arguments first, before further instances of the operator
* (and vice versa)
*/
const InfixBindingPower = Object.freeze({
"+": [10, 11],
"-": [10, 11],
"*": [20, 21],
".": [100, 101],
});
/* How powerfully a prefix operator binds the value to its right
* Stacked prefix operators are unambiguous, so this value is only
* useful in comparison to an infix or postfix operator, and determines
* which operators this prefix will get "inside of"
*/
const PrefixBindingPower = Object.freeze({
"-": 40,
});
/* How powerfully a prefix operator binds the value to its right
* (works exactly the same as prefix binding power, but for operators on the right)
*/
const PostfixBindingPower = Object.freeze({
"[": 50,
"(": 60,
});
// all reserved keywords
const Keywords = Object.freeze([ "if" ]);
// the types of AST nodes
const ASTNode = Object.freeze({
Literal:1, // value literal, could be string, boolean, number, etc.
PrefixOp:2, // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
InfixOp:3, // two sub-expressions with an operator between them
Identifier:4, // some unresolved identifier, i.e variable_name_here
Index:5, // some sub-expression being indexed by another expression, i.e. sub_expr[3]
Call:6, // some sub-expression representing a function being called with a list of expressions
});
// all reserved words that represent literals
const Literals = Object.freeze({
"true": {node:ASTNode.Literal, value:true},
"false": {node:ASTNode.Literal, value:false},
});
class Parser
{
constructor(tokenStream)
{
this.tokenStream = tokenStream;
this.tokenStream.index = 0;
}
expression(tokenStream)
{
return expressionInternal(tokenStream, 0);
}
/* Parses an expression consisting of a set of values, all separated by infix operators,
* as long none of the operators bind at least as powerfully as 'minBindingPower'
*
* This function will be called recursively to parse sub-expression, with the
* minBindingPower set to whatever the last seen infix operator was. This ensures
* that we never grab operators that can't be part of the subtree because their precedence
* should have had them happen earlier
*
* The while loop in the middle of this function is the main feature that makes this a pratt parser,
* and separates it from a standard recursive decent parser where you have to manually match on only
* certain terms to maintain precedence
*
* Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
* of operators with higher precedence)
* The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
*
*/
expressionInternal(minBindingPower)
{
// grab the type of the next token, and its string representation
const {type, token, index:tokenIndex, error} = this.tokenStream.consume();
if (error) return this.getError(error);
// start parsing the left-hand-side of the next operator at this AST node
let lhs = null;
if (type == lex.TokenType.Op)
{
// the value of this part of the expression starts with an operator
// it will either be prefix operator, or a bracketed expression
if (token == "(")
{
// parse a subexpression
const { expr, error } = this.attemptSubtree(this.expression);
if (error) return this.getError(error);
lhs = expr;
// ensure the next token after the sub expression is a closing paren, and consume it
const { error } = tokens.consume(lex.TokenType.Op, ")");
if (error) return this.getError(error);
}
else if ((power = PrefixBindingPower[token]) !== undefined)
{
// this is a prefix operator of the given power
// get the expression it applies to...
const { rhs, error } = this.attemptSubtree(expressionInternal, power);
if (error) return this.getError(error);
// ... and generate an AST node for it
// this becomes the new LHS of the operator we are parsing
lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
}
else
{
return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
}
}
else if (type == lex.TokenType.Word)
{
if (Keywords.includes(token))
{
// TODO: handle keywords that can go in expressions
return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
}
else if (Literals[token] !== undefined)
{
// this is a hardcoded keyword literal, we already defined how those nodes look
lhs = Literals[token];
}
else
{
// some unresolved identifier (this is the domain of the runtime/interpreter!)
lhs = {node:ASTNode.Identifier, value:token};
}
}
else if (type == TokenType.NumberLit)
{
lhs = {node:ASTNode.Literal, value:parseFloat(token)};
}
else if (type == TokenType.StringLit)
{
lhs = {node:ASTNode.Literal, value:token};
}
else
{
return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
}
while (!this.tokenStream.eof())
{
// start grabbing further operators and expression, that don't have higher precedence
// (higher precedence operators will be parsed as sub-expressions)
/* Expecting an operator now
* Depending on its precedence, we may not actually get to parse it ourselves,
* so don't actually consume the token yet, just peek it (just in case)
*/
const {type:opType, token:opToken, index:opIndex, error} = this.tokenStream.peek(lex.TokenType.Op);
if (error) return this.getError(error);
if (PostfixBindingPower[opToken] !== undefined)
{
// this is a post-fix operator, but it may apply to...
if (PostfixBindingPower[opToken] < minBindingPower)
{
//... some outer expression (we were too low power to bind to lhs)
break;
}
//... or our current lhs, in which case we get to consume the token
this.tokenStream.consume();
if (opToken == "[")
{
/* Parsing an index on the current lhs
* We need to parse the index expression. Note we reset the binding power,
* this is because we will always end with ], so we don't need to worry about our current precedence
*/
const { index, error } = this.attemptSubtree(this.expression);
if (error) return this.getError(error);
// parse the closing ]
const { error } = this.tokenStream.consume(lex.TokenType.Op, "]");
if (error) return this.getError(error);
// our new node is the old lhs indexed with the list of argument expressions
lhs = {node:ASTNode.Index, lhs:lhs, index:index};
}
else if (opToken == "(")
{
/* Parsing a function call
* lhs represents the function we are calling
* We need to parse a list of expression separated by , as the arguments
*/
let args = [];
while (true)
{
if (tokens.length == 0) throw "Unclosed argument list, expected ',' or ')' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines));
// peek the next op
const {token:nextToken, type:nextType, index:nextIndex, error} = this.tokenStream.peek();
if (error) return this.getError(error);
if (nextType == TokenType.Op && nextToken == ")")
{
// no more arguments, we should end the call (after consuming the bracket)
this.tokenStream.consume();
break;
}
if (args.length > 0)
{
const { error } = this.tokenStream.consume(lex.TokenType.Op, ",");
if (error) return this.getError(error);
}
const { arg, error } = this.attemptSubtree(this.expression);
if (error) return this.getError(error);
// our new node is the old lhs as a function, called with the index expression
args.push(arg);
}
// generate the AST node
lhs = {node:ASTNode.Call, func:lhs, args:args};
}
else
{
return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringIndexToLinePos(opIndex) });
}
// after parsing a postfix operator, we are expecting an operator again
continue;
}
// if we didn't get an infix operator, we are done this expression
if (InfixBindingPower[opToken] === undefined)
{
break;
}
[leftBinding, rightBinding] = InfixBindingPower[opToken];
// if the binding power is too low, we are done parsing the current subexpression, leave
// the current operator for our caller...
if (leftBinding < minBindingPower) break;
//... otherwise, this operator is for us, so consume it
this.tokenStream.consume();
// the next expression is the right side of our current operator, but only as much as the
// right-side binding power of our operator can grab
const { rhs, error } = this.attemptSubtree(this.expressionInternal, rightBinding);
if (error) return this.getError(error);
// our new lhs becomes our applied operator,
// leaving us free to match more operators of an appropriate precedence
lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
}
return { ast: lhs, error: null };
}
error(msg, at)
{
let loc = this.tokenStream.tokenIndexToLinePos(this.tokenStream.index);
if (at)
{
if (at.index)
{
loc = this.tokenStream.tokenToLinePos(at);
}
else
{
loc = this.tokenStream.tokenIndexToLinePos(at);
}
}
return { ast:null, error: { msg: msg, ...loc } };
}
attemptSubtree(parseFunction, ...args)
{
this.tokenStream.attemptMatch();
const subtree = parseFunction.apply(this, [tokenStream, ...args]);
if (subtree.error)
{
this.tokenStream.failedMatch();
}
else
{
this.tokenSTream.doneMatch();
}
return subtree;
}
getError(error)
{
return { ast: null, error: error };
}
}