Newer
Older
parsetest / parse.js
@Mark Mark on 23 Sep 2020 12 KB Make it professional
const lex = require('lex');
const utils = require('utils');


/* How powerfully infix operators bind to their left and right arguments
 * If a value is bound more powerfully from one side, it becomes part of the operator
 *  on that side first
 *
 * The relative binding power of operators determines their precedence,
 *  and the relative binding power of the left and right side of a specific operator
 *  determines its associativity:
 *      If the *right* side binds more powerfully, the operator becomes *left* associative,
 *      as any instance of that operator that appear first in the token stream are able
 *      to grab their right arguments first, before further instances of the operator
 *      (and vice versa)
 */
const InfixBindingPower = Object.freeze({
    "+": [10, 11],
    "-": [10, 11],
    "*": [20, 21],
    ".": [100, 101],
});

/* How powerfully a prefix operator binds the value to its right
 * Stacked prefix operators are unambiguous, so this value is only
 *  useful in comparison to an infix or postfix operator, and determines
 *  which operators this prefix will get "inside of"
 */
const PrefixBindingPower = Object.freeze({
    "-": 40,
});

/* How powerfully a prefix operator binds the value to its right
 *  (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
    "[": 50,
    "(": 60,
});

// all reserved keywords
const Keywords = Object.freeze([ "if" ]);

// the types of AST nodes
const ASTNode = Object.freeze({
    Literal:1,          // value literal, could be string, boolean, number, etc.
    PrefixOp:2,         // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
    InfixOp:3,          // two sub-expressions with an operator between them
    Identifier:4,       // some unresolved identifier, i.e variable_name_here
    Index:5,            // some sub-expression being indexed by another expression, i.e. sub_expr[3]
    Call:6,             // some sub-expression representing a function being called with a list of expressions
});

// all reserved words that represent literals
const Literals = Object.freeze({
    "true": {node:ASTNode.Literal, value:true},
    "false": {node:ASTNode.Literal, value:false},
});

class Parser
{
    constructor(tokenStream)
    {
        this.tokenStream = tokenStream;
        this.tokenStream.index = 0;
    }


    expression(tokenStream)
    {
        return expressionInternal(tokenStream, 0);
    }

    /* Parses an expression consisting of a set of values, all separated by infix operators,
     *  as long none of the operators bind at least as powerfully as 'minBindingPower'
     *
     * This function will be called recursively to parse sub-expression, with the
     *  minBindingPower set to whatever the last seen infix operator was. This ensures
     *  that we never grab operators that can't be part of the subtree because their precedence
     *  should have had them happen earlier
     *
     * The while loop in the middle of this function is the main feature that makes this a pratt parser,
     *  and separates it from a standard recursive decent parser where you have to manually match on only 
     *  certain terms to maintain precedence
     *
     * Produces an error, or an AST with operator precedence guaranteed to be respected (sub-terms of operators are applications
     *  of operators with higher precedence)
     *  The nodes of the AST are marked with their type, and references to the subnodes, plus any relevent information
     * 
     */
    expressionInternal(minBindingPower)
    {
        // grab the type of the next token, and its string representation
        const {type, token, index:tokenIndex, error} = this.tokenStream.consume();
        if (error) return this.getError(error);

        // start parsing the left-hand-side of the next operator at this AST node
        let lhs = null;
        if (type == lex.TokenType.Op)
        {
            // the value of this part of the expression starts with an operator
            // it will either be prefix operator, or a bracketed expression
            if (token == "(")
            {
                // parse a subexpression
                const { expr, error } = this.attemptSubtree(this.expression);
                if (error) return this.getError(error);

                lhs = expr;

                // ensure the next token after the sub expression is a closing paren, and consume it
                const { error } = tokens.consume(lex.TokenType.Op, ")");
                if (error) return this.getError(error);
            }
            else if ((power = PrefixBindingPower[token]) !== undefined)
            {
                // this is a prefix operator of the given power
                // get the expression it applies to...
                const { rhs, error } = this.attemptSubtree(expressionInternal, power);
                if (error) return this.getError(error);

                // ... and generate an AST node for it
                // this becomes the new LHS of the operator we are parsing
                lhs = {node:ASTNode.PrefixOp, rhs:rhs, op:token};
            }
            else
            {
                return this.getError({ msg: "Invalid Op '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
            }
        }
        else if (type == lex.TokenType.Word)
        {
            if (Keywords.includes(token))
            {
                // TODO: handle keywords that can go in expressions
                return this.getError({ msg: "Keyword '" + token + "' not expected", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
            }
            else if (Literals[token] !== undefined)
            {
                // this is a hardcoded keyword literal, we already defined how those nodes look
                lhs = Literals[token];
            }
            else
            {
                // some unresolved identifier (this is the domain of the runtime/interpreter!)
                lhs = {node:ASTNode.Identifier, value:token};
            }
        }
        else if (type == TokenType.NumberLit)
        {
            lhs = {node:ASTNode.Literal, value:parseFloat(token)};
        }
        else if (type == TokenType.StringLit)
        {
            lhs = {node:ASTNode.Literal, value:token};
        }
        else
        {
            return this.getError({ msg: "Unexpected " + lex.TokenType.keyName(type) + " of '" + token + "'", ...this.tokenStream.stringIndexToLinePos(tokenIndex) });
        }

        while (!this.tokenStream.eof())
        {
            // start grabbing further operators and expression, that don't have higher precedence
            // (higher precedence operators will be parsed as sub-expressions)
            
            /* Expecting an operator now
             * Depending on its precedence, we may not actually get to parse it ourselves,
             *  so don't actually consume the token yet, just peek it (just in case)
             */
            const {type:opType, token:opToken, index:opIndex, error} = this.tokenStream.peek(lex.TokenType.Op);
            if (error) return this.getError(error);

            if (PostfixBindingPower[opToken] !== undefined)
            {
                // this is a post-fix operator, but it may apply to...
                if (PostfixBindingPower[opToken] < minBindingPower)
                {
                    //... some outer expression (we were too low power to bind to lhs)
                    break;
                }

                //... or our current lhs, in which case we get to consume the token
                this.tokenStream.consume();

                if (opToken == "[")
                {
                    /* Parsing an index on the current lhs
                     * We need to parse the index expression. Note we reset the binding power,
                     *  this is because we will always end with ], so we don't need to worry about our current precedence
                     */
                    const { index, error } = this.attemptSubtree(this.expression);
                    if (error) return this.getError(error);

                    // parse the closing ]
                    const { error  } = this.tokenStream.consume(lex.TokenType.Op, "]");
                    if (error) return this.getError(error);

                    // our new node is the old lhs indexed with the list of argument expressions
                    lhs = {node:ASTNode.Index, lhs:lhs, index:index};
                }
                else if (opToken == "(")
                {
                    /* Parsing a function call
                     * lhs represents the function we are calling
                     * We need to parse a list of expression separated by , as the arguments
                     */
                    let args = [];
                    while (true)
                    {
                        if (tokens.length == 0) throw "Unclosed argument list, expected ',' or ')' at " + JSON.stringify(stringIndexToLinePos(inputLength - 1, newlines));

                        // peek the next op
                        const {token:nextToken, type:nextType, index:nextIndex, error} = this.tokenStream.peek();
                        if (error) return this.getError(error);

                        if (nextType == TokenType.Op && nextToken == ")")
                        {
                            // no more arguments, we should end the call (after consuming the bracket)
                            this.tokenStream.consume();
                            break;
                        }

                        if (args.length > 0)
                        {
                            const { error } = this.tokenStream.consume(lex.TokenType.Op, ",");
                            if (error) return this.getError(error);
                        }

                        const { arg, error } = this.attemptSubtree(this.expression);
                        if (error) return this.getError(error);

                        // our new node is the old lhs as a function, called with the index expression
                        args.push(arg);
                    }

                    // generate the AST node
                    lhs = {node:ASTNode.Call, func:lhs, args:args};
                }
                else
                {
                    return this.getError({ msg: "Unexpected postfix Op '" + opToken + "'", ...this.tokenStream.stringIndexToLinePos(opIndex) });
                }

                // after parsing a postfix operator, we are expecting an operator again
                continue;
            }

            // if we didn't get an infix operator, we are done this expression
            if (InfixBindingPower[opToken] === undefined)
            {
                break;
            }

            [leftBinding, rightBinding] = InfixBindingPower[opToken];

            // if the binding power is too low, we are done parsing the current subexpression, leave
            // the current operator for our caller...
            if (leftBinding < minBindingPower) break;

            //... otherwise, this operator is for us, so consume it
            this.tokenStream.consume();

            // the next expression is the right side of our current operator, but only as much as the
            // right-side binding power of our operator can grab
            const { rhs, error } = this.attemptSubtree(this.expressionInternal, rightBinding);
            if (error) return this.getError(error);

            // our new lhs becomes our applied operator,
            // leaving us free to match more operators of an appropriate precedence
            lhs = {node:ASTNode.InfixOp, lhs:lhs, rhs:rhs, op:opToken};
        }

        return { ast: lhs, error: null };
    }

    error(msg, at)
    {
        let loc = this.tokenStream.tokenIndexToLinePos(this.tokenStream.index);
        if (at)
        {
            if (at.index)
            {
                loc = this.tokenStream.tokenToLinePos(at);
            }
            else
            {
                loc = this.tokenStream.tokenIndexToLinePos(at);
            }
        }
        return { ast:null, error: { msg: msg, ...loc } };
    }


    attemptSubtree(parseFunction, ...args)
    {
        this.tokenStream.attemptMatch();
        const subtree = parseFunction.apply(this, [tokenStream, ...args]);

        if (subtree.error)
        {
            this.tokenStream.failedMatch();
        }
        else
        {
            this.tokenSTream.doneMatch();
        }

        return subtree;
    }

    getError(error)
    {
        return { ast: null, error: error };
    }

}