parsetest/parse.js at master

Fork: 0
mark / parsetest
Find file
Newer
Older
parsetest / parse.js
Mark on 24 Sep 2020 27 KB Wait and while statements
Raw Blame History
const lex = require('./lex.js');
const utils = require('./utils.js');


/* How powerfully infix operators bind to their left and right arguments
 * If a value is bound more powerfully from one side, it becomes part of the operator
 *  on that side first
 *
 * The relative binding power of operators determines their precedence,
 *  and the relative binding power of the left and right side of a specific operator
 *  determines its associativity:
 *      If the *right* side binds more powerfully, the operator becomes *left* associative,
 *      as any instance of that operator that appear first in the token stream are able
 *      to grab their right arguments first, before further instances of the operator
 *      (and vice versa)
 */
const InfixBindingPower = Object.freeze({
    "+": [10, 11],
    "-": [10, 11],
    "*": [20, 21],
    "/": [20, 21],
    ".": [100, 101],
    "?": [6, 5],
    "=": [2, 1],
    "..": [4, 3],
});

/* How powerfully a prefix operator binds the value to its right
 * Stacked prefix operators are unambiguous, so this value is only
 *  useful in comparison to an infix or postfix operator, and determines
 *  which operators this prefix will get "inside of"
 */
const PrefixBindingPower = Object.freeze({
    "-": 40,
    "+": 40,
});

/* How powerfully a prefix operator binds the value to its right
 *  (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
    "[": 50,
    "(": 60,
});

let InfixOpTrie = {};
let PrefixOpTrie = {};
let PostfixOpTrie = {};
/* turns a flat list of variable character length operators
 * into a trie (tree of operators, where each node
 * is indexed by the next character in the string)
 */
function buildOpTrie(allOps, opTrie)
{
    for (let op in allOps)
    {
        let currentOpTrie = opTrie;
        for (let i = 0; i < op.length; i++)
        {
            if (currentOpTrie[op[i]] === undefined)
            {
                currentOpTrie[op[i]] = { more: {} };
            }
            if (i == op.length - 1)
            {
                currentOpTrie[op[i]].op = allOps[op];
            }
            currentOpTrie = currentOpTrie[op[i]].more;
        }
    }
}

// all reserved keywords
const Keywords = Object.freeze([ "if", "else", "on", "wait", "for", "map", "in", "while", ]);

// the types of AST nodes
const ASTNode = Object.freeze({
    Literal:1,          // value literal, could be string, boolean, number, etc.
    PrefixOp:2,         // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
    InfixOp:3,          // two sub-expressions with an operator between them
    Identifier:4,       // some unresolved identifier, i.e variable_name_here
    Index:5,            // some sub-expression being indexed by another expression, i.e. sub_expr[3]
    Call:6,             // some sub-expression representing a function being called with a list of expressions
    Ternary:7,          // a ternary expression, with a condition expr, true expr, and false expr
    ExprStatement:8,    // a full statement that is just an expression
    Statements:9,       // a continuous text block of statements
    Block:10,           // a set of statements in a { } block
    If:11,              // if statement with condition, trueCase, and optional falseCase
    For:12,             // a for statement with loop variable, range, and loop body
    Map:13,             // like a for statement, but is actually an expression where you can 'return' in the body and generate an array
    On:14,              // handle a named event on a given target with given params, returning given values
    While:15,           // while loop with condition
    Wait:16,            // statement that pauses exeuction until the async call is done
});

const StatementsWithSemicolon = Object.freeze([
    ASTNode.ExprStatement, ASTNode.Wait,
]);

// all reserved words that represent literals
const Literals = Object.freeze({
    "true": {node:ASTNode.Literal, value:true},
    "false": {node:ASTNode.Literal, value:false},
});

class Parser
{
    constructor(tokenStream, mustMatchBeforeConsiderExpression = 4)
    {
        this.tokenStream = tokenStream;
        this.tokenStream.index = 0;
        this.mustMatchBeforeConsiderExpression = mustMatchBeforeConsiderExpression;
    }

    block()
    {
        let { error:openError } = this.tokenStream.consume(lex.TokenType.Op, "{");
        if (openError) return this.getError(openError);

        let { ast:statements, error, numMatched } = this.withBacktrack(this.statements);
        if (error) return this.getError(error);

        let { error:closeError } = this.tokenStream.consume(lex.TokenType.Op, "}");
        if (closeError) return this.getError(closeError);

        return { ast: { node: ASTNode.Block, statements: statements.statements } };
    }

    statements()
    {
        let statements = [];

        let error, mostErrorMatched = 0;
        while (!error)
        {
            if (!this.tokenStream.peek(lex.TokenType.Op, "}").error || this.tokenStream.eof())
            {
                break;
            }

            let stmt, errorMatched;

            ({ ast:stmt, error, numMatched:errorMatched } = this.withBacktrack(this.statement));

            if (error)
            {
                if (error.expectingMatch) return { error: error };
                break;
            }
            else
            {
                statements.push(stmt);
            }
        }

        if (error) return this.getError({ msg: "Expecting statement", ...this.tokenStream.currentLinePos() });

        return { ast: { node: ASTNode.Statements, statements: statements } };
    }

    statement()
    {
        let { ast:stmt, error } = this.withBacktrack(this.statementInternal);
        if (error)
        {
            return this.getError(error);
        }
        else if (StatementsWithSemicolon.includes(stmt.node))
        {
            if (error = this.tokenStream.consume(lex.TokenType.Op, ';').error)
            {
                return this.getError(error, true);
            }
        }

        return { ast:stmt };
    }

    exprInBrackets(expectingMatch = true)
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Op, "(").error;
        if (error) return this.getError(error, expectingMatch);

        let expr;
        ({ ast:expr, error } = this.withBacktrack(this.expression));
        if (error) return this.getError(error, expectingMatch);

        error = this.tokenStream.consume(lex.TokenType.Op, ")").error;
        if (error) return this.getError(error, expectingMatch);

        return { ast:expr };
    }

    whileStmt()
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Word, "while").error;
        if (error) return this.getError(error);

        let condition;
        ({ ast:condition, error } = this.exprInBrackets());
        if (error) return this.getError(error, true);

        let loop;
        ({ ast:loop, error } = this.withBacktrack(this.block));
        if (error) return this.getError(error, true);

        return { ast: { node: ASTNode.While, condition: condition, loop: loop } };
    }

    whileStmt()
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Word, "wait").error;
        if (error) return this.getError(error);

        let expr;
        ({ ast:expr, error } = this.expression());
        if (error) return this.getError(error, true);

        return { ast: { node: ASTNode.Wait, expr: expr } };
    }

    onStmt()
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Word, "on").error;
        if (error) return this.getError(error);

        let target;
        ({ ast:target, error } = this.expression());
        if (error) return this.getError(error, true);

        let eventName;
        ({ token:eventName, error } = this.tokenStream.consume(lex.TokenType.Word));
        if (error) return this.getError(error, true);

        let eventParams;
        ({ args:eventParams, error } = this.funcParams(true, true));
        if (error)
        {
            eventParams = [];
            if (error.expectingMatch)
            {
                return this.getError(error);
            }
        }

        let eventValues;
        ({ args:eventValues, error } = this.funcParams(false, false));
        if (error)
        {
            eventValues = [];
            if (error.expectingMatch)
            {
                return this.getError(error);
            }
        }

        let body;
        ({ ast:body, error } = this.withBacktrack(this.block));
        if (error) return this.getError(error, true);

        return { ast: { node: ASTNode.On, target: target, eventName: eventName, eventParams: eventParams, eventValues: eventValues, body: body } };
    }

    forStmt()
    {
        let error = this.tokenStream.consume(lex.TokenType.Word, "for").error;
        if (error) return this.getError(error);

        let { ast:forBody, error:bodyError } = this.forBody();
        if (bodyError) return this.getError(bodyError, true);

        return { ast: { node: ASTNode.For, variable: forBody.variable, range: forBody.range, loop: forBody.loop } };
    }

    forBody()
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Op, "(").error;
        if (error) return this.getError(error, true);

        let variable;
        ({ token:variable, error } = this.tokenStream.consume(lex.TokenType.Word));
        if (error) return this.getError(error, true);

        error = this.tokenStream.consume(lex.TokenType.Word, "in").error;
        if (error) return this.getError(error, true);

        let range;
        ({ ast:range, error } = this.withBacktrack(this.expression));
        if (error) return this.getError(error, true);

        error = this.tokenStream.consume(lex.TokenType.Op, ")").error;
        if (error) return this.getError(error, true);

        let loop;
        ({ ast:loop, error } = this.withBacktrack(this.block));
        if (error) return this.getError(error, true);

        return { ast: { variable: variable, range: range, loop: loop } };
    }

    ifStmt()
    {
        let error;

        error = this.tokenStream.consume(lex.TokenType.Word, "if").error;
        if (error) return this.getError(error);

        let condition;
        ({ ast:condition, error } = this.exprInBrackets());
        if (error) return this.getError(error, true);

        let trueCase;
        ({ ast:trueCase, error } = this.withBacktrack(this.block));
        if (error) return this.getError(error, true);

        let ret = { ast: { node: ASTNode.If, condition: condition, trueCase: trueCase } };

        if (!this.tokenStream.consume(lex.TokenType.Word, "else").error)
        {
            let falseCase;
            ({ ast:falseCase, error } = this.withBacktrack(this.block));
            if (error) return this.getError(error, true);

            ret.ast.falseCase = falseCase;
        }

        return ret;
    }

    statementInternal()
    {
        return this.choice(
            [
                { parse: this.expression, handle: ast => ({ node: ASTNode.ExprStatement, expr: ast }) },
                { parse: this.ifStmt, handle: ast => ast },
                { parse: this.forStmt, handle: ast => ast },
                { parse: this.onStmt, handle: ast => ast },
                { parse: this.whileStmt, handle: ast => ast },
                { parse: this.waitStmt, handle: ast => ast },
            ],
            "Expecting statement");
        let error, bestError, numMatched, bestNumMatched = 0;
    }

    funcParams(consumeOpeningBracket, hasClosingBracket)
    {
        let error;
        if (consumeOpeningBracket)
        {
            if (error = this.tokenStream.consume(lex.TokenType.Op, "(").error)
            {
                if (error) return this.getError(error);
            }
        }

        /* Parsing a function call
         * lhs represents the function we are calling
         * We need to parse a list of expression separated by , as the arguments
         */
        let args = [];
        while (true)
        {
            // peek the next op
            let { token:nextToken, type:nextType, pos:nextPos, error } = this.tokenStream.peek();
            if (error) return this.getError(error, true);

            if (hasClosingBracket && nextType == lex.TokenType.Op && nextToken == ")")
            {
                // no more arguments, we should end the call (after consuming the bracket)
                this.tokenStream.consume();
                break;
            }

            if (args.length > 0)
            {
                if (error = this.tokenStream.consume(lex.TokenType.Op, ",").error)
                {
                    if (!hasClosingBracket) break;
                    return this.getError(error, true);
                }
            }

            let arg;
            // avoid re-declaring error... annoying
            ({ ast:arg, error } = this.withBacktrack(this.expression));
            if (error) return this.getError(error);

            // our new node is the old lhs as a function, called with the index expression
            args.push(arg);
        }

        return { args: args };
    }

    expression()
    {
        let result = this.withBacktrack(this.expressionInternal, 0);
        if (result.error) return this.getError(result.error, result.error.expectingMatch || result.numMatched > this.mustMatchBeforeConsiderExpression);
        return result;
    }

    /* Parses an expression consisting of a set of values, all separated by infix operators,
     *  as long none of the operators bind at least as powerfully as 'minBindingPower'
     *
     * This function will be called recursively to parse sub-expression, with the
     *  minBindingPower set to whatever the last seen infix operator was. This ensures
     *  that we never grab operators that can't be part of the subtree because their precedence
     *  should have had them happen earlier
     *
     * The while loop in the middle of this function is the main feature that makes this a pratt parser,
     *  and separates it from a standard recursive decent parser where you have to manually match on only 
     *  certain terms to maintain precedence
     *
     * Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
     *  of operators with higher precedence)
     *  The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
     * 
     */
    expressionInternal(minBindingPower)
    {
        // grab the type of the next token, and its string representation
        let { type, token, pos, error } = this.tokenStream.consume();
        if (error) return this.getError(error);

        // start parsing the left-hand-side of the next operator at this AST node
        let lhs = null;
        if (type == lex.TokenType.Op)
        {
            // the binding power of the op we will read
            let power;

            // the value of this part of the expression starts with an operator
            // it will either be prefix operator, or a bracketed expression
            if (token == "(")
            {
                // parse a subexpression
                let { ast:expr, error } = this.withBacktrack(this.expression);
                if (error) return this.getError(error);

                lhs = expr;

                // ensure the next token after the sub expression is a closing paren, and consume it
                if (error = this.tokenStream.consume(lex.TokenType.Op, ")").error)
                {
                    return this.getError(error);
                }
            }
            else
            {
                const largestOp = this.matchLargestOp(PrefixOpTrie, token, true);
                if (largestOp)
                {
                    token = largestOp.op;
                    let power = largestOp.value;

                    // this is a prefix operator of the given power
                    // get the expression it applies to...
                    let { ast:rhs, error } = this.withBacktrack(this.expressionInternal, power);
                    if (error) return this.getError(error);

                    // ... and generate an AST node for it
                    // this becomes the new LHS of the operator we are parsing
                    lhs = { node:ASTNode.PrefixOp, rhs:rhs, op:token };
                }
                else
                {
                    return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringPosToLinePos(pos) });
                }
            }
        }
        else if (type == lex.TokenType.Word)
        {
            if (token == "map")
            {
                let { ast:mapBody, error } = this.forBody();
                if (error) return this.getError(error, true);

                lhs = { node:ASTNode.Map, variable: mapBody.variable, range: mapBody.range, loop: mapBody.loop };
            }
            else if (Keywords.includes(token))
            {
                // TODO: handle keywords that can go in expressions
                return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringPosToLinePos(pos) });
            }
            else if (Literals[token] !== undefined)
            {
                // this is a hardcoded keyword literal, we already defined how those nodes look
                lhs = Literals[token];
            }
            else
            {
                // some unresolved identifier (this is the domain of the runtime/interpreter!)
                lhs = { node:ASTNode.Identifier, value:token };
            }
        }
        else if (type == lex.TokenType.NumberLit)
        {
            lhs = {node:ASTNode.Literal, value:parseFloat(token)};
        }
        else if (type == lex.TokenType.StringLit)
        {
            lhs = {node:ASTNode.Literal, value:token};
        }
        else
        {
            return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringPosToLinePos(pos) });
        }

        while (!this.tokenStream.eof())
        {
            // start grabbing further operators and expression, that don't have higher precedence
            // (higher precedence operators will be parsed as sub-expressions)
            
            /* Expecting an operator now
             * Depending on its precedence, we may not actually get to parse it ourselves,
             *  so don't actually consume the token yet, just peek it (just in case)
             */
            let { type:opType, token:opToken, pos:opPos, error } = this.tokenStream.peek(lex.TokenType.Op);
            if (error) return { ast: lhs };

            let largestOp = this.matchLargestOp(PostfixOpTrie, opToken);
            if (largestOp)
            {
                opToken = largestOp.op;
                let power = largestOp.value

                // this is a post-fix operator, but it may apply to...
                if (power < minBindingPower)
                {
                    //... some outer expression (we were too low power to bind to lhs)
                    break;
                }

                //... or our current lhs, in which case we consume the bracket
                this.tokenStream.consumeCount(opToken.length);

                if (opToken == "[")
                {
                    /* Parsing an index on the current lhs
                     * We need to parse the index expression. Note we reset the binding power,
                     *  this is because we will always end with ], so we don't need to worry about our current precedence
                     */
                    let { ast:index, error } = this.withBacktrack(this.expression);
                    if (error) return this.getError(error);

                    // parse the closing ]
                    if (error = this.tokenStream.consume(lex.TokenType.Op, "]").error)
                    {
                        return this.getError(error);
                    }

                    // our new node is the old lhs indexed with the list of argument expressions
                    lhs = { node:ASTNode.Index, lhs:lhs, index:index };
                }
                else if (opToken == "(")
                {
                    // funcParams consumes the first ( token for us
                    // false = don't consume opening bracket (we already did)
                    // true = need closing bracket
                    let { args, error } = this.funcParams(false, true);
                    if (error) return this.getError(error, true);

                    // generate the AST node
                    lhs = {node:ASTNode.Call, func:lhs, args:args};
                }
                else
                {
                    return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringPosToLinePos(opPos) });
                }

                // after parsing a postfix operator, we are expecting an operator again
                continue;
            }

            largestOp = this.matchLargestOp(InfixOpTrie, opToken, false);
            // if we didn't get an infix operator, we are done this expression
            if (!largestOp)
            {
                break;
            }

            opToken = largestOp.op;
            let [ leftBindingPower, rightBindingPower ] = largestOp.value;

            // if the binding power is too low, we are done parsing the current subexpression, leave
            // the current operator for our caller...
            if (leftBindingPower < minBindingPower) break;

            //... otherwise, this operator is for us, so consume it
            this.tokenStream.consumeCount(opToken.length);

            if (opToken == "?")
            {
                /* Parsing a ternary expression
                 * We need to parse the 'true' condition, then a ':', then the 'false' condition
                 * No expression has  ':' naturally in it (except for us manually parsing it here)
                 *  so this expression will automatically end at the next ':', and we don't need
                 *  to worry about precedence
                 */
                let { ast:trueExpr, error } = this.withBacktrack(this.expression);
                if (error) return this.getError(error);

                // parse the ':'
                if (error = this.tokenStream.consume(lex.TokenType.Op, ":").error)
                {
                    return this.getError(error);
                }

                {
                    let { ast:falseExpr, error } = this.withBacktrack(this.expression);
                    if (error) return this.getError(error);

                    // our new node is the old lhs indexed with the list of argument expressions
                    lhs = { node:ASTNode.Ternary, condition:lhs, trueCase:trueExpr, falseCase:falseExpr };
                }
            }
            else
            {
                // the next expression is the right side of our current operator, but only as much as the
                // right-side binding power of our operator can grab
                let { ast:rhs, error } = this.withBacktrack(this.expressionInternal, rightBindingPower);
                if (error) return this.getError(error);

                // our new lhs becomes our applied operator,
                // leaving us free to match more operators of an appropriate precedence
                lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
            }
        }

        return { ast: lhs };
    }

    matchLargestOp(opTrie, op, opAlreadyConsumed)
    {
        const ret = this.matchLargestOpInternal(opTrie, op);
        if (ret && ret.op.length > 1 && opAlreadyConsumed)
        {
            this.tokenStream.consumeCount(ret.op.length - 1);
        }
        return ret;
    }

    matchLargestOpInternal(opTrie, op)
    {
        const lastOp = op[op.length - 1];
        if (opTrie[lastOp] !== undefined)
        {
            let bestOp = null, bestOpLength = 0;
            for (let nextOp in opTrie[lastOp].more)
            {
                if (!this.tokenStream.peekAt(op.length, lex.TokenType.Op, nextOp).error)
                {
                    let betterOp = this.matchLargestOpInternal(opTrie[op].more, op + nextOp);
                    if (betterOp && betterOp.op.length > bestOpLength)
                    {
                        bestOp = betterOp;
                        bestOpLength = bestOp.op.length;
                    }
                }
            }
            if (bestOp)
            {
                return bestOp;
            }
            if (opTrie[lastOp].op)
            {
                return { op: op, value: opTrie[lastOp].op };
            }
        }
        return null;
    }

    /* attempt to match the input strema using this.parseFunction,
     * but if it fails, rollback the input stream to where it is now
     *
     * the return value is either the matched ast or an error, decorated with
     * the most number of tokens we ever managed to match before erroring
     */
    withBacktrack(parseFunction, ...args)
    {
        this.tokenStream.startMatch();
        const subtree = parseFunction.apply(this, args);

        let numMatched;
        if (subtree.error)
        {
            numMatched = this.tokenStream.failMatch();
        }
        else
        {
            numMatched = this.tokenStream.finishMatch();
        }

        return { numMatched: numMatched, ...subtree};
    }

    getError(error, expectingMatch = undefined)
    {
        if (expectingMatch === undefined)
        {
            if (error.expectingMatch !== undefined)
            {
                expectingMatch = error.expectingMatch;
            }
            else
            {
                expectingMatch = false;
            }
        }
        return { error: { ...error, expectingMatch: expectingMatch } };
    }

    choice(choices, genericError)
    {
        for (let choice of choices)
        {
            if (choice.parse && choice.handle)
            {
                let { ast, error, numMatched } = this.withBacktrack(choice.parse);
                if (!error) return { ast: choice.handle(ast) };
                if (error.expectingMatch) return this.getError(error);
            }
        }
        
        return this.getError({ msg: genericError, ...this.tokenStream.currentLinePos() });
    }
}

function getPrettyAST(ast)
{
    return ast.mapKey("node", v => ASTNode.keyName(v));
}

(function() {
    buildOpTrie(InfixBindingPower, InfixOpTrie);
    buildOpTrie(PrefixBindingPower, PrefixOpTrie);
    buildOpTrie(PostfixBindingPower, PostfixOpTrie);
})();

module.exports = { Parser: Parser, getPrettyAST: getPrettyAST };