// Token categories produced by the lexer.
const TokenType = Object.freeze({ Word: 1, NumberLit: 2, StringLit: 3, Op: 4 });

/* Takes a string, and returns a flat array of tokens.
 *
 * Recognized types:
 *   Word:      any symbol/word: variable names, or actual keywords
 *   NumberLit: a literal value for the number type
 *   StringLit: a literal value for the string type
 *   Op:        an operator (usually a single character of punctuation)
 */
function lex(input) {
  // Break the input down into "parts": runs of word characters, runs of
  // whitespace, or single punctuation characters.
  // (\w already includes \d, so two alternatives cover words and digit runs.)
  const parts = [];
  while (input.length > 0) {
    const match = input.match(/^(?:\w+|\s+)/);
    if (match) {
      parts.push(match[0]);
      input = input.slice(match[0].length);
    } else {
      // everything else is a single character (punctuation)
      parts.push(input[0]);
      input = input.slice(1);
    }
  }

  const tokens = [];
  for (let i = 0; i < parts.length; i++) {
    const partsLeft = parts.length - i - 1; // parts remaining after index i
    let token = parts[i];

    if (token.trim().length === 0) {
      // drop whitespace (whitespace inside a string is handled by the '"' case below)
      continue;
    }

    if (/^\d+$/.test(token)) {
      // Number literal. The part splitter breaks "3.14" into ["3", ".", "14"],
      // so re-join "<digits> . <digits>" into one decimal token here.
      // (anchored regexes: a mixed part like "x2" must stay a Word, not NaN)
      if (partsLeft >= 2 && parts[i + 1] === "." && /^\d+$/.test(parts[i + 2])) {
        token += "." + parts[i + 2];
        i += 2; // consume the "." and the fraction digits
      }
      tokens.push({ type: TokenType.NumberLit, token: token });
    } else if (token === '"') {
      // Starting a string literal; don't include the opening quote.
      token = "";
      let closed = false;
      for (let j = i + 1; j < parts.length; j++) {
        if (parts[j] === "\\") {
          // Escape character: the escaped character is the first character of
          // the next part (all parts are guaranteed length > 0).
          if (j + 1 < parts.length) {
            const escaped = parts[j + 1][0];
            // Remove the escaped character from the next part, dropping the
            // part entirely if it was only that one character.
            parts[j + 1] = parts[j + 1].slice(1);
            if (parts[j + 1].length === 0) parts.splice(j + 1, 1);
            // For now, just put the escaped character in literally; we'll deal
            // with escape sequences (\n, \t, ...) later.
            token += escaped;
          }
        } else if (parts[j] === '"') {
          // Closing quote: resume the outer scan after it.
          i = j;
          closed = true;
          break;
        } else {
          // Build up the string out of all parts before the closing quote.
          token += parts[j];
        }
      }
      if (!closed) {
        throw "Unclosed string";
      }
      tokens.push({ type: TokenType.StringLit, token: token });
    } else if (/^\w+$/.test(token)) {
      // any keyword or identifier
      tokens.push({ type: TokenType.Word, token: token });
    } else {
      // anything left is an operator
      tokens.push({ type: TokenType.Op, token: token });
    }
  }

  // keywords and identifiers are case insensitive
  return tokens.map((t) =>
    t.type === TokenType.Word ? { ...t, token: t.token.trim().toLowerCase() } : t
  );
}

/* How powerfully infix operators bind to their left and right arguments.
 * If a value is bound more powerfully from one side, it becomes part of the
 * operator on that side first.
 *
 * The relative binding power of operators determines their precedence,
 * and the relative binding power of the left and right side of a specific
 * operator determines its associativity:
 * If the *right* side binds more powerfully, the operator becomes *left*
 * associative, as any instances of that operator that appear first in the
 * token stream are able to grab their right arguments first, before further
 * instances of the operator (and vice versa).
 */
const InfixBindingPower = Object.freeze({
  "+": [10, 11],
  "-": [10, 11],
  "*": [20, 21],
  ".": [100, 101],
});

/* How powerfully a prefix operator binds the value to its right.
 * Stacked prefix operators are unambiguous, so this value is only useful in
 * comparison to an infix or postfix operator, and determines which operators
 * this prefix will get "inside of".
 */
const PrefixBindingPower = Object.freeze({
  "-": 40,
});

/* How powerfully a postfix operator binds the value to its left.
 * (works exactly the same as prefix binding power, but for operators on the right)
 */
const PostfixBindingPower = Object.freeze({
  "[": 50,
  "(": 60,
});

// Helper to grab the first item of an array, throwing the given value if the
// array is empty. (NOTE: this codebase deliberately throws plain strings.)
Object.defineProperty(Array.prototype, "shiftThrow", {
  enumerable: false,
  value: function (err) {
    if (this.length === 0) throw err;
    return this.shift();
  },
});

// Helper to get the name of an enum key from one of its values.
Object.defineProperty(Object.prototype, "keyName", {
  enumerable: false,
  value: function (value) {
    return Object.keys(this).find((k) => this[k] === value) || "INVALID_ENUM_VALUE";
  },
});

// all reserved keywords
const Keywords = Object.freeze(["if"]);

// the types of AST nodes
const ASTNode = Object.freeze({
  Literal: 1,    // value literal, could be string, boolean, number, etc.
  PrefixOp: 2,   // prefix operator applied to sub-expression, i.e. -3 (3 is the sub-expression)
  InfixOp: 3,    // two sub-expressions with an operator between them
  Identifier: 4, // some unresolved identifier, i.e. variable_name_here
  Index: 5,      // some sub-expression being indexed by another expression, i.e. sub_expr[3]
  Call: 6,       // some sub-expression representing a function being called with a list of expressions
});

// all reserved words that represent literals
const Literals = Object.freeze({
  "true": { node: ASTNode.Literal, value: true },
  "false": { node: ASTNode.Literal, value: false },
});

/* The main entry point for parsing.
 *
 * Takes an array of tokens from the lexer (consumed destructively), and
 * produces an AST of the program as raw objects.
 */
function parseExpression(tokens) {
  return parseExpressionInternal(tokens, 0);
}

/* Parses an expression consisting of a set of values, all separated by infix
 * operators, as long as none of the operators bind at least as powerfully as
 * 'minBindingPower'.
 *
 * This function will be called recursively to parse sub-expressions, with
 * minBindingPower set to whatever the last seen infix operator was. This
 * ensures that we never grab operators that can't be part of the subtree
 * because their precedence should have had them happen earlier.
 *
 * The while loop in the middle of this function is the main feature that makes
 * this a Pratt parser, and separates it from a standard recursive descent
 * parser where you have to manually match on only certain terms to maintain
 * precedence.
 *
 * Produces an AST with operator precedence guaranteed to be respected
 * (sub-terms of operators are applications of operators with higher
 * precedence). The nodes of the AST are marked with their type, references to
 * the subnodes, plus any relevant information.
 */
function parseExpressionInternal(tokens, minBindingPower) {
  // grab the type of the next token, and its string representation
  const { type, token } = tokens.shiftThrow("No more input");

  // start parsing the left-hand-side of the next operator at this AST node
  let lhs = null;
  if (type === TokenType.Op) {
    // The value of this part of the expression starts with an operator:
    // it is either a prefix operator, or a bracketed expression.
    if (token === "(") {
      // Parse a subexpression. Binding power resets to 0 because the closing
      // ")" delimits it regardless of our current precedence.
      lhs = parseExpressionInternal(tokens, 0);
      // ensure the next token after the subexpression is a closing paren, and consume it
      const { token: closeToken, type: closeType } = tokens.shiftThrow("No more input");
      if (closeType !== TokenType.Op || closeToken !== ")") {
        throw "Unclosed parenthesis";
      }
    } else {
      const power = PrefixBindingPower[token];
      if (power === undefined) {
        throw "Bad operator " + token;
      }
      // This is a prefix operator of the given power: get the expression it
      // applies to, and generate an AST node that becomes our new LHS.
      const rhs = parseExpressionInternal(tokens, power);
      lhs = { node: ASTNode.PrefixOp, rhs: rhs, op: token };
    }
  } else if (type === TokenType.Word) {
    if (Keywords.includes(token)) {
      // TODO: handle keywords that can go in expressions
      throw "Keyword " + token + " not expected here";
    } else if (Literals[token] !== undefined) {
      // this is a hardcoded keyword literal; we already defined how those nodes look
      lhs = Literals[token];
    } else {
      // some unresolved identifier (this is the domain of the runtime/interpreter!)
      lhs = { node: ASTNode.Identifier, value: token };
    }
  } else if (type === TokenType.NumberLit) {
    lhs = { node: ASTNode.Literal, value: parseFloat(token) };
  } else if (type === TokenType.StringLit) {
    lhs = { node: ASTNode.Literal, value: token };
  } else {
    throw "Unexpected token " + TokenType.keyName(type) + " of " + token;
  }

  // Start grabbing further operators and expressions that don't have higher
  // precedence (higher precedence operators will be parsed as sub-expressions).
  while (tokens.length > 0) {
    /* Expecting an operator now.
     * Depending on its precedence, we may not actually get to parse it
     * ourselves, so don't consume the token yet — just peek it.
     */
    const { type: opType, token: opToken } = tokens[0];
    if (opType !== TokenType.Op) {
      throw "Operator expected here, instead of " + TokenType.keyName(opType) + " of " + opToken;
    }

    if (PostfixBindingPower[opToken] !== undefined) {
      // this is a postfix operator, but it may apply to...
      if (PostfixBindingPower[opToken] < minBindingPower) {
        // ...some outer expression (we were too low power to bind to lhs)
        break;
      }
      // ...or our current lhs, in which case we get to consume the token
      tokens.shift();
      if (opToken === "[") {
        /* Parsing an index on the current lhs.
         * We reset the binding power because the expression always ends with
         * "]", so our current precedence doesn't constrain it.
         */
        const index = parseExpressionInternal(tokens, 0);
        // parse the closing ]
        const { token: closeToken, type: closeType } = tokens.shiftThrow("No more input");
        if (closeType !== TokenType.Op || closeToken !== "]") {
          throw "Unclosed indexing expression";
        }
        // our new node is the old lhs indexed with the index expression
        lhs = { node: ASTNode.Index, lhs: lhs, index: index };
      } else if (opToken === "(") {
        /* Parsing a function call.
         * lhs represents the function we are calling; we need to parse a list
         * of expressions separated by "," as the arguments.
         */
        const args = [];
        while (true) {
          if (tokens.length === 0) throw "No more input";
          // peek the next token
          const { token: nextToken, type: nextType } = tokens[0];
          if (nextType === TokenType.Op && nextToken === ")") {
            // no more arguments; end the call (after consuming the bracket)
            tokens.shift();
            break;
          }
          if (args.length > 0) {
            // if we have already parsed args, we need to consume a comma
            if (nextType !== TokenType.Op || nextToken !== ",") {
              throw "Expected , or )";
            }
            tokens.shift();
          }
          args.push(parseExpressionInternal(tokens, 0));
        }
        // our new node is the old lhs as a function, called with the argument expressions
        lhs = { node: ASTNode.Call, func: lhs, args: args };
      } else {
        throw "Unexpected postfix operator " + opToken;
      }
      // after parsing a postfix operator, we are expecting an operator again
      continue;
    }

    // if we didn't get an infix operator, we are done this expression
    if (InfixBindingPower[opToken] === undefined) {
      break;
    }
    const [leftBinding, rightBinding] = InfixBindingPower[opToken];
    // If the binding power is too low, we are done parsing the current
    // subexpression; leave the current operator for our caller...
    if (leftBinding < minBindingPower) break;
    // ...otherwise, this operator is for us, so consume it.
    tokens.shift();

    // The next expression is the right side of our current operator, but only
    // as much as the right-side binding power of our operator can grab.
    const rhs = parseExpressionInternal(tokens, rightBinding);

    // Our new lhs becomes our applied operator, leaving us free to match more
    // operators of an appropriate precedence.
    lhs = { node: ASTNode.InfixOp, lhs: lhs, rhs: rhs, op: opToken };
  }
  return lhs;
}

// Helper to map over an object's (or array's) values for a given key,
// recursively. Used below to pretty-print enums in debug output.
// Arrays are preserved as arrays, and null values are passed through
// (spreading them would silently turn them into plain objects).
Object.defineProperty(Object.prototype, "mapKey", {
  enumerable: false,
  value: function (keyName, mapFunc) {
    if (Array.isArray(this)) {
      return this.map((v) =>
        v !== null && typeof v === "object" ? v.mapKey(keyName, mapFunc) : v
      );
    }
    const ret = {};
    for (const k in this) {
      const v = this[k];
      if (k === keyName) {
        ret[k] = mapFunc(v);
      } else if (v !== null && typeof v === "object") {
        ret[k] = v.mapKey(keyName, mapFunc);
      } else {
        ret[k] = v;
      }
    }
    return ret;
  },
});

// console.dir with unlimited depth, for dumping whole ASTs
console.format = (v) => console.dir(v, { depth: null });

// Lex, parse, and pretty-print an expression with enum values resolved to names.
function debugExpr(expr) {
  const tokens = lex(expr);
  console.log(tokens.mapKey("type", (v) => TokenType.keyName(v)));
  const ast = parseExpression(tokens);
  console.format(ast.mapKey("node", (v) => ASTNode.keyName(v)));
}

//debugExpr("test.BLAH (3+\"4x\\\"4\\\"\")");
//debugExpr('3+-3+4*5-2[blah."\\"b 3 "]');

// CLI entry point: only run when a file path was actually provided.
if (process.argv[2] !== undefined) {
  const fs = require("fs");
  debugExpr(fs.readFileSync(process.argv[2], "utf8"));
}