parsetest/lex.js at master

Fork: 0
mark / parsetest
Find file
Newer
Older
parsetest / lex.js
Mark on 24 Sep 2020 9 KB Fix some bugs with parsing larger tokens, and add a for and map statement, and better error printing
Raw Blame History
require('./utils.js');

const TokenType = Object.freeze({ Word:1, NumberLit:2, StringLit:3, Op:4 });

class TokenStream
{
    constructor(input)
    {
        this.input = input;
        this.inputLength = input.length;

        // how far in the token stream we are
        this.index = 0;

        const lexed = lex(this.input);
        this.tokens = lexed.tokens;
        this.newlines = lexed.newlines;

        // when we attempt to match a sub-rule, save the location where we roll back to,
        // as well as the max number of tokens we ever matched
        this.savedLocations = [];
    }

    getPrettyTokenList()
    {
        return this.tokens.mapKey("type", v => TokenType.keyName(v));
    }

    eof()
    {
        return this.index < 0 || this.index >= this.tokens.length;
    }

    peek(type = undefined, tokenValue = undefined)
    {
        return this.peekAt(0, type, tokenValue);
    }

    peekAt(forwardAmount, type = undefined, tokenValue = undefined)
    {
        if (this.index + forwardAmount < 0 || this.index + forwardAmount >= this.tokens.length)
        {
            let msg = "Unexpected EOF";
            if (type !== undefined)
            {
                msg += ", expected " + TokenType.keyName(type);
                if (tokenValue !== undefined)
                {
                    msg += " of '" + tokenValue + "'";
                }
            }
            return { error: { msg: msg, ...this.stringPosToLinePos(this.inputLength - 1) } };
        }

        const token = this.tokens[this.index + forwardAmount];

        if (type)
        {
            if (token.type !== type)
            {
                if (tokenValue !== undefined)
                {
                    return { error: { msg: "Expecting " + TokenType.keyName(type) + " '" + tokenValue + "', not " + TokenType.keyName(token.type) + " '" + token.token + "'", ...this.tokenToLinePos(token) } };
                }
                return { error: { msg: "Expecting " + TokenType.keyName(type) + ", not " + TokenType.keyName(token.type) + " '" + token.token + "'", ...this.tokenToLinePos(token) } };
            }

            if (tokenValue !== undefined)
            {
                if (token.token !== tokenValue)
                {
                    return { error: { msg: "Expecting " + TokenType.keyName(type) + " '" + tokenValue + "', not '" + token.token + "'", ...this.tokenToLinePos(token) } };
                }
            }
        }

        return token;
    }

    consume(type = undefined, tokenValue = undefined)
    {
        const token = this.peek(type, tokenValue);
        if (token.error) return token;

        for (let saved of this.savedLocations)
        {
            saved.matched++;
        }

        this.index++;
        return token;
    }

    consumeCount(amount)
    {
        let { error } = this.peekAt(amount - 1);
        if (error) return { error: error };

        for (let saved of this.savedLocations)
        {
            saved.matched += amount;
        }

        this.index += amount;

        return { error: null };
    }

    startMatch()
    {
        this.savedLocations.push({ index: this.index, matched: 0 });
    }

    // returns the number of tokens in the match
    finishMatch()
    {
        console.assert(this.savedLocations.length > 0, "Unexpected finishMatch()");
        return this.savedLocations.pop().matched;
    }

    // returns the number of tokens we matched before we failed
    failMatch()
    {
        console.assert(this.savedLocations.length > 0, "Unexpected failMatch()");

        const saved = this.savedLocations.pop();
        this.index = saved.index;

        return saved.matched;
    }

    stringPosToLinePos(pos)
    {
        pos = Math.max(0, Math.min(this.inputLength - 1, pos));

        let line = this.newlines.length + 1;
        let previousLinesLength = 0;
        for (let i = 0; i < this.newlines.length; i++)
        {
            if (pos <= this.newlines[i])
            {
                line = i + 1;
                break;
            }
            previousLinesLength = this.newlines[i] + 1;
        }
        return {inputPos:pos, line:line, pos:pos - previousLinesLength + 1};
    }

    tokenIndexToLinePos(index)
    {
        if (index < 0 || index >= this.tokens.length) return this.stringPosToLinePos(this.inputLength - 1);

        return this.tokenToLinePos(this.tokens[index]);
    }

    tokenToLinePos(token)
    {
        return this.stringPosToLinePos(token.pos);
    }

    currentLinePos()
    {
        return this.tokenIndexToLinePos(this.index);
    }
}

/* takes a string, and returns a flat array of tokens, and a list of the locations of newlines in the source
 *
 * recognized types:
 *  Word: as a token type, reprsents any symbol/word: variable names, or actual keywords
 *  NumberLit: a literal value for the number type
 *  StringLit: a literal value for the string type
 *  Op: an operator (usually a single character of punctuation)
 */
function lex(input)
{
    let inputLength = input.length;

    let newlines = [];

    let i = 0;
    while ((i = input.indexOf("\n", i)) != -1)
    {
        newlines.push(i);
        i++;
    }

    // break the input down into "parts": either words, digit strings, single character operators, or whitespace
    let parts = [];

    // track where we are in the original string
    let pos = 0;
    while (input.length > 0)
    {
        // match our possible sequences of more than one character (words, digits, or whitespace)
        if ((match = input.match(/^\w+/)) ||
            (match = input.match(/^\d+/)) ||
            (match = input.match(/^\s+/)))
        {
            const matched = match[0];
            parts.push({token:matched, pos:pos});

            // advance the string
            input = input.substr(matched.length);
            pos += matched.length;
        }
        else
        {
            // everything else is a single character (punctuation)
            parts.push({token:input[0], pos:pos});
            
            input = input.substr(1);
            pos++;
        }
    }

    let tokens = [];

    for (let i = 0; i < parts.length; i++)
    {
        const partsLeft = parts.length - i - 1;

        let { token, pos } = parts[i];
        
        if (token.trim().length == 0)
        {
            // remove whitespace (whitespace in a string is handled in the block that handles the " token below)
            continue;
        }

        if (token.match(/\d+/))
        {
            // try and get the decimal, if any
            if (partsLeft > 2 && parts[i + 1].token == "." && parts[i + 2].token.match(/\d+/))
            {
                token += "." + parts[i + 2].token;
            }
            tokens.push({type:TokenType.NumberLit, token:token, pos:pos});
        }       
        else if (token == '"')
        {
            // we are starting a string literal
            
            // don't include the initial "
            token = "";

            let closed = false;
            for (let j = i + 1; j < parts.length; j++)
            {
                // find an escape character
                if (parts[j].token == '\\')
                {
                    if (j + 1 < parts.length)
                    {
                        // the escape character is the next character (all tokens are guaranteed length > 0)
                        const escaped = parts[j + 1].token[0];

                        // remove the escaped character from the next token, being sure to remove the token
                        // if it was only that 1 character
                        parts[j + 1].token = parts[j + 1].token.substr(1);
                        if (parts[j + 1].token.length == 0) parts.splice(j + 1, 1);

                        // for now, just put the escaped character in literally, we'll deal with escape sequences later
                        token += escaped;
                    }
                }
                else if (parts[j].token == '"')
                {
                    // close the string
                    i = j;
                    closed = true;
                    break;
                }
                else
                {
                    // build up the string out of all tokens before the closing "
                    token += parts[j].token;
                }
            }
            
            if (!closed)
            {
                throw "Unclosed string";
            }
            
            tokens.push({type:TokenType.StringLit, token:token, pos:pos});
        }
        else if (token.match(/\w+/))
        {
            // any keyword or identifier
            tokens.push({type:TokenType.Word, token:token, pos:pos});
        }
        else
        {
            // anything left is an operator
            tokens.push({type:TokenType.Op, token:token, pos:pos});
        }
    }
    
    // keywords and identifiers are case insensitive
    return {
        tokens: tokens.map(t => t.type == TokenType.Word ? { ...t, token:t.token.trim().toLowerCase() } : t),
        newlines: newlines,
    };
}

module.exports = { TokenStream: TokenStream, TokenType: TokenType };