parsetest/parse.js at 8dc02ffabf39e2ef9fbe4395ae5d8a605e679db5

Fork: 0
mark / parsetest
Find file
Newer
Older
parsetest / parse.js
Mark on 24 Sep 2020 17 KB Blocks of statements
Raw Blame History
const lex = require('./lex.js');
const utils = require('./utils.js');


/* How powerfully infix operators bind to their left and right arguments
 * If a value is bound more powerfully from one side, it becomes part of the operator
 *  on that side first
 *
 * The relative binding power of operators determines their precedence,
 *  and the relative binding power of the left and right side of a specific operator
 *  determines its associativity:
 *      If the *right* side binds more powerfully, the operator becomes *left* associative,
 *      as any instance of that operator that appear first in the token stream are able
 *      to grab their right arguments first, before further instances of the operator
 *      (and vice versa)
 */
const InfixBindingPower = Object.freeze({
    "+": [10, 11],
    "-": [10, 11],
    "*": [20, 21],
    "/": [20, 21],
    ".": [100, 101],
    "?": [6, 5],
    "=": [2, 1],
});

/* How powerfully a prefix operator binds the value to its right
 * Stacked prefix operators are unambiguous, so this value is only
 *  useful in comparison to an infix or postfix operator, and determines
 *  which operators this prefix will get "inside of"
 */
const PrefixBindingPower = Object.freeze({
    "-": 40,
    "+": 40,
});

/* How powerfully a prefix operator binds the value to its right
 *  (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
    "[": 50,
    "(": 60,
});

// all reserved keywords
const Keywords = Object.freeze([ "if" ]);

// the types of AST nodes
const ASTNode = Object.freeze({
    Literal:1,          // value literal, could be string, boolean, number, etc.
    PrefixOp:2,         // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
    InfixOp:3,          // two sub-expressions with an operator between them
    Identifier:4,       // some unresolved identifier, i.e variable_name_here
    Index:5,            // some sub-expression being indexed by another expression, i.e. sub_expr[3]
    Call:6,             // some sub-expression representing a function being called with a list of expressions
    Ternary:7,          // a ternary expression, with a condition expr, true expr, and false expr
    ExprStatement:8,    // a full statement that is just an expression
    Statements:9,       // a continuous text block of statements
    Block:10,           // a set of statements in a { } block
});

// all reserved words that represent literals
const Literals = Object.freeze({
    "true": {node:ASTNode.Literal, value:true},
    "false": {node:ASTNode.Literal, value:false},
});

class Parser
{
    constructor(tokenStream, mustConsumeBeforeUsingError = 4)
    {
        this.tokenStream = tokenStream;
        this.tokenStream.index = 0;
        this.mustConsumeBeforeUsingError = mustConsumeBeforeUsingError;
    }

    block()
    {
        let { error:openError } = this.tokenStream.consume(lex.TokenType.Op, "{");
        if (openError) return this.getError(openError);

        let { ast:statements, error, numMatched } = this.attemptSubtree(this.statements);
        if (error && numMatched > this.mustConsumeBeforeUsingError) return this.getError(this.error);

        let { error:closeError } = this.tokenStream.consume(lex.TokenType.Op, "}");
        if (closeError) return this.getError(closeError);

        return { ast: { node: ASTNode.Block, statements: statements }, error: null };
    }

    statements()
    {
        let statements = [];

        let error, mostErrorMatched = 0;
        while (!error)
        {
            let stmt, errorMatched;

            ({ ast:stmt, error, numMatched:errorMatched } = this.attemptSubtree(this.statement));

            if (error)
            {
                if (errorMatched > mostErrorMatched) mostErrorMatched = errorMatched;
            }
            else
            {
                statements.push(stmt);
            }
        }

        if (mostErrorMatched > this.mustConsumeBeforeUsingError) return { ast: null, error: error };

        return { ast: { node: ASTNode.Statements, statements: statements }, error: null };
    }

    statement()
    {
        let { ast:stmt, error } = this.attemptSubtree(this.statementInternal);
        if (error)
        {
            return this.getError(error);
        }
        else
        {
            if (error = this.tokenStream.consume(lex.TokenType.Op, ';').error)
            {
                return this.getError(error);
            }
        }

        return { ast:stmt, error: null };
    }

    statementInternal()
    {
        return this.choice(
            [
                { parse: this.expression, handle: ast => ({ node: ASTNode.ExprStatement, expr: ast }) }
            ],
            "Expecting expression");
        let error, bestError, numMatched, bestNumMatched = 0;
    }

    expression()
    {
        return this.expressionInternal(this.tokenStream, 0);
    }

    /* Parses an expression consisting of a set of values, all separated by infix operators,
     *  as long none of the operators bind at least as powerfully as 'minBindingPower'
     *
     * This function will be called recursively to parse sub-expression, with the
     *  minBindingPower set to whatever the last seen infix operator was. This ensures
     *  that we never grab operators that can't be part of the subtree because their precedence
     *  should have had them happen earlier
     *
     * The while loop in the middle of this function is the main feature that makes this a pratt parser,
     *  and separates it from a standard recursive decent parser where you have to manually match on only 
     *  certain terms to maintain precedence
     *
     * Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
     *  of operators with higher precedence)
     *  The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
     * 
     */
    expressionInternal(minBindingPower)
    {
        // grab the type of the next token, and its string representation
        let { type, token, pos, error } = this.tokenStream.consume();
        if (error) return this.getError(error);

        // start parsing the left-hand-side of the next operator at this AST node
        let lhs = null;
        if (type == lex.TokenType.Op)
        {
            // the binding power of the op we will read
            let power;

            // the value of this part of the expression starts with an operator
            // it will either be prefix operator, or a bracketed expression
            if (token == "(")
            {
                // parse a subexpression
                let { ast:expr, error } = this.attemptSubtree(this.expression);
                if (error) return this.getError(error);

                lhs = expr;

                // ensure the next token after the sub expression is a closing paren, and consume it
                if (error = this.tokenStream.consume(lex.TokenType.Op, ")").error)
                {
                    return this.getError(error);
                }
            }
            else if ((power = PrefixBindingPower[token]) !== undefined)
            {
                // this is a prefix operator of the given power
                // get the expression it applies to...
                let { ast:rhs, error } = this.attemptSubtree(this.expressionInternal, power);
                if (error) return this.getError(error);

                // ... and generate an AST node for it
                // this becomes the new LHS of the operator we are parsing
                lhs = { node:ASTNode.PrefixOp, rhs:rhs, op:token };
            }
            else
            {
                return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringPosToLinePos(pos) });
            }
        }
        else if (type == lex.TokenType.Word)
        {
            if (Keywords.includes(token))
            {
                // TODO: handle keywords that can go in expressions
                return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringPosToLinePos(pos) });
            }
            else if (Literals[token] !== undefined)
            {
                // this is a hardcoded keyword literal, we already defined how those nodes look
                lhs = Literals[token];
            }
            else
            {
                // some unresolved identifier (this is the domain of the runtime/interpreter!)
                lhs = { node:ASTNode.Identifier, value:token };
            }
        }
        else if (type == lex.TokenType.NumberLit)
        {
            lhs = {node:ASTNode.Literal, value:parseFloat(token)};
        }
        else if (type == lex.TokenType.StringLit)
        {
            lhs = {node:ASTNode.Literal, value:token};
        }
        else
        {
            return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringPosToLinePos(pos) });
        }

        while (!this.tokenStream.eof())
        {
            // start grabbing further operators and expression, that don't have higher precedence
            // (higher precedence operators will be parsed as sub-expressions)
            
            /* Expecting an operator now
             * Depending on its precedence, we may not actually get to parse it ourselves,
             *  so don't actually consume the token yet, just peek it (just in case)
             */
            let { type:opType, token:opToken, pos:opPos, error } = this.tokenStream.peek(lex.TokenType.Op);
            if (error) return this.getError(error);

            if (PostfixBindingPower[opToken] !== undefined)
            {
                // this is a post-fix operator, but it may apply to...
                if (PostfixBindingPower[opToken] < minBindingPower)
                {
                    //... some outer expression (we were too low power to bind to lhs)
                    break;
                }

                //... or our current lhs, in which case we get to consume the token
                this.tokenStream.consume();

                if (opToken == "[")
                {
                    /* Parsing an index on the current lhs
                     * We need to parse the index expression. Note we reset the binding power,
                     *  this is because we will always end with ], so we don't need to worry about our current precedence
                     */
                    let { ast:index, error } = this.attemptSubtree(this.expression);
                    if (error) return this.getError(error);

                    // parse the closing ]
                    if (error = this.tokenStream.consume(lex.TokenType.Op, "]").error)
                    {
                        return this.getError(error);
                    }

                    // our new node is the old lhs indexed with the list of argument expressions
                    lhs = { node:ASTNode.Index, lhs:lhs, index:index };
                }
                else if (opToken == "(")
                {
                    /* Parsing a function call
                     * lhs represents the function we are calling
                     * We need to parse a list of expression separated by , as the arguments
                     */
                    let args = [];
                    while (true)
                    {
                        // peek the next op
                        let { token:nextToken, type:nextType, pos:nextPos, error } = this.tokenStream.peek();
                        if (error) return this.getError(error);

                        if (nextType == lex.TokenType.Op && nextToken == ")")
                        {
                            // no more arguments, we should end the call (after consuming the bracket)
                            this.tokenStream.consume();
                            break;
                        }

                        if (args.length > 0)
                        {
                            if (error = this.tokenStream.consume(lex.TokenType.Op, ",").error)
                            {
                                return this.getError(error);
                            }
                        }

                        {
                            // avoid re-declaring error... annoying
                            let { ast:arg, error } = this.attemptSubtree(this.expression);
                            if (error) return this.getError(error);

                            // our new node is the old lhs as a function, called with the index expression
                            args.push(arg);
                        }
                    }

                    // generate the AST node
                    lhs = {node:ASTNode.Call, func:lhs, args:args};
                }
                else
                {
                    return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringPosToLinePos(opPos) });
                }

                // after parsing a postfix operator, we are expecting an operator again
                continue;
            }

            // if we didn't get an infix operator, we are done this expression
            if (InfixBindingPower[opToken] === undefined)
            {
                break;
            }

            let [ leftBindingPower, rightBindingPower ] = InfixBindingPower[opToken];

            // if the binding power is too low, we are done parsing the current subexpression, leave
            // the current operator for our caller...
            if (leftBindingPower < minBindingPower) break;

            //... otherwise, this operator is for us, so consume it
            this.tokenStream.consume();

            if (opToken == "?")
            {
                /* Parsing a ternary expression
                 * We need to parse the 'true' condition, then a ':', then the 'false' condition
                 * No expression has  ':' naturally in it (except for us manually parsing it here)
                 *  so this expression will automatically end at the next ':', and we don't need
                 *  to worry about precedence
                 */
                let { ast:trueExpr, error } = this.attemptSubtree(this.expression);
                if (error) return this.getError(error);

                // parse the ':'
                if (error = this.tokenStream.consume(lex.TokenType.Op, ":").error)
                {
                    return this.getError(error);
                }

                {
                    let { ast:falseExpr, error } = this.attemptSubtree(this.expression);
                    if (error) return this.getError(error);

                    // our new node is the old lhs indexed with the list of argument expressions
                    lhs = { node:ASTNode.Ternary, condition:lhs, trueCase:trueExpr, falseCase:falseExpr };
                }
            }
            else
            {
                // the next expression is the right side of our current operator, but only as much as the
                // right-side binding power of our operator can grab
                let { ast:rhs, error } = this.attemptSubtree(this.expressionInternal, rightBindingPower);
                if (error) return this.getError(error);

                // our new lhs becomes our applied operator,
                // leaving us free to match more operators of an appropriate precedence
                lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
            }
        }

        return { ast: lhs, error: null };
    }

    attemptSubtree(parseFunction, ...args)
    {
        this.tokenStream.startMatch();
        const subtree = parseFunction.apply(this, args);

        let numMatched;
        if (subtree.error)
        {
            numMatched = this.tokenStream.failMatch();
        }
        else
        {
            numMatched = this.tokenStream.finishMatch();
        }

        return { numMatched: numMatched, ...subtree};
    }

    getError(error)
    {
        return { ast: null, error: error };
    }

    choice(choices, genericError)
    {
        let bestError, bestErrorNumMatched = 0;

        for (let choice of choices)
        {
            if (choice.parse && choice.handle)
            {
                let { ast, error, numMatched } = this.attemptSubtree(choice.parse);
                if (!error) return { ast: choice.handle(ast), error: null };
                if (numMatched > bestErrorNumMatched)
                {
                    bestError = error;
                    bestErrorNumMatched = numMatched;
                }
            }
        }
        
        if (bestError && bestErrorNumMatched > this.mustConsumeBeforeUsingError)
        {
            return { ast: null, error: bestError };
        }
        return { ast: null, error: { msg: genericError, ...this.tokenStream.currentLinePos() } };
    }
}

function getPrettyAST(ast)
{
    return ast.mapKey("node", v => ASTNode.keyName(v));
}

module.exports = { Parser: Parser, getPrettyAST: getPrettyAST };