core/lexer.js

            
            /*global require:false*/
/*global define:false*/

/**
 * @module lexer
 * @private
 */
/**
 * ## Reference
 * TeX source code:
 * {@link http://tug.org/texlive/devsrc/Build/source/texk/web2c/tex.web|tex.web}
 * 
 * For a list of standard TeX macros, see:
 * {@link ftp://tug.ctan.org/pub/tex-archive/systems/knuth/dist/lib/plain.tex|plain.tex}
*/


define(['mathlive/core/grapheme-splitter'], function(GraphemeSplitter) {


/**
 * 
 * A token can be of type:
 *  - `literal`: the value is the character this token represents. This can be 
 * a combination of Unicode codepoints, for example for emojis.
 *  - `^` and `_`: superscript and subscript commands.
 *  - `command`: a command such as `\sin`
 *  - `{` and `}`: begin and end group (used for arguments of commands and for grouping)
 *  - `#`: parameter
 *
 *  - `esc`: start of a special command. Followed by `commandliteral` tokens.
 *  - `backslash`: start of a special command. Followed by `commandliteral` tokens.
 *  - `commandliteral`: `a-zA-Z` for special commands (esc sequence, etc...)
 *  - `placeholder`: a placeholder value meant to be replaced by some actual value
 *  - `space`: one or more space characters (including tab, etc...)
 * 
 *  See: [TeX:289](http://tug.org/texlive/devsrc/Build/source/texk/web2c/tex.web)
 * @property {string} value
 * @property {string} type
 * @class Token
 * @global
 * @private
 */
function Token(type, value) {
    this.type = type;
    this.value = value;
    console.assert(!(type === 'literal' && value === '}'));
}
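
// Illustrative sketch (not part of the module): the Token instances the lexer
// produces for a couple of small inputs, per the type list above.
//
//     tokenize('x^2')
//     // → [ Token('literal', 'x'), Token('^'), Token('literal', '2') ]
//
//     tokenize('\\frac{a}{b}')
//     // → [ Token('command', 'frac'), Token('{'), Token('literal', 'a'),
//     //     Token('}'), Token('{'), Token('literal', 'b'), Token('}') ]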

/**
 * @param {string} s 
 * @class Lexer
 * @global
 * @private
 */
function Lexer(s) {
    this.s = GraphemeSplitter.splitGraphemes(s);
    this.pos = 0;
}
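
// Usage sketch (illustrative): driving the lexer one grapheme at a time with
// end()/peek()/get().
//
//     const lex = new Lexer('\\pi r^2');
//     while (!lex.end()) {
//         console.log(lex.get());    // '\', 'p', 'i', ' ', 'r', '^', '2'
//     }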

/**
 * @return {boolean} True if we reached the end of the stream
 * @method Lexer#end
 * @private
 */
Lexer.prototype.end = function() {
    return this.pos >= this.s.length;
}

/** 
 * Return the next char and advance 
 * @return {string}
 * @method Lexer#get
 */
Lexer.prototype.get = function() {
    return this.pos < this.s.length ? this.s[this.pos++] : null;
}

/**
 * Return the next char, but do not advance
 * @return {string}
 * @method Lexer#peek
 * @private
 */
Lexer.prototype.peek = function() {
    return this.pos < this.s.length ? this.s[this.pos] : null;
}

/**
 * Return the next substring matching regEx and advance.
 * @param {RegExp} regEx
 * @return {?string}
 * @method Lexer#scan
 * @private
 */
Lexer.prototype.scan = function(regEx) {
    let result;
    // this.s can be either a string (if it's made up only of ASCII chars)
    // or an array of graphemes (if it's more complicated).
    if (typeof this.s === 'string') {
        result = regEx.exec(this.s.slice(this.pos)); 
    } else {
        result = regEx.exec(this.s.slice(this.pos).join('')); 
    }
    if (result) {
        this.pos += result[0].length;
        return result[0];
    }
    return null;
}
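
// Illustrative sketch: scan() is how makeToken() reads a command name after a
// backslash. Note that the regex should be anchored with '^', since it is
// matched against the remainder of the stream.
//
//     const lex = new Lexer('alpha+1');
//     lex.scan(/^[a-zA-Z*]+/);    // → 'alpha' (pos advances by 5)
//     lex.scan(/^[a-zA-Z*]+/);    // → null ('+' doesn't match; pos unchanged)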


/**
 * Return true if next char is white space. Does not advance.
 * Note that browsers are inconsistent in their definitions of the 
 * `\s` metacharacter, so use an explicit string match instead.
 * 
 * - Chrome:      `[ \t\n\v\f\r\u00A0]`
 * - Firefox:     `[ \t\n\v\f\r\u00A0\u2028\u2029]`
 * - IE:          `[ \t\n\v\f\r]`
 * 
 * See [Stackoverflow](http://stackoverflow.com/questions/6073637/)
 * @method Lexer#isWhiteSpace
 * @private
 */
Lexer.prototype.isWhiteSpace = function() {
    return ' \f\n\r\t\v\u00A0\u2028\u2029'.indexOf(this.s[this.pos]) !== -1;
    /*
        - \t \u0009: tab (CHARACTER TABULATION)
        - \n \u000A: linefeed (LINE FEED)
        - \v \u000B: vertical tab (LINE TABULATION)
        - \f \u000C: form feed (FORM FEED)
        - \r \u000D: carriage return
        - \u00A0: NON-BREAKING SPACE
        - \u2028: LINE SEPARATOR
        - \u2029: PARAGRAPH SEPARATOR

        Could be considered:
        - \u1680 OGHAM SPACE MARK
        - \u2000-\u200a spacing
        - \u202f NARROW NO-BREAK SPACE
        - \u205F MEDIUM MATHEMATICAL SPACE
        - \u3000 IDEOGRAPHIC SPACE
        - \uFEFF ZERO WIDTH NO-BREAK SPACE
    */
}
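
// Illustration (based on the list above): LINE SEPARATOR is treated as white
// space, while IDEOGRAPHIC SPACE is not, since it is only listed as a candidate.
//
//     new Lexer('\u2028x').isWhiteSpace();   // → true
//     new Lexer('\u3000x').isWhiteSpace();   // → false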


/**
 * Advance until the next non-whitespace char.
 * @return {number} The number of chars skipped
 * @method Lexer#skipWhiteSpace
 * @private
 */
Lexer.prototype.skipWhiteSpace = function() {
    const savedPos = this.pos;
    while (!this.end() && this.isWhiteSpace()) {
        this.get();
    }
    return this.pos - savedPos;
}



/**
 * Return a single token, or null, created from the lexer.
 * 
 * @return {?Token}
 * @method Lexer#makeToken
 * @private
 */
Lexer.prototype.makeToken = function() {
    // If we've reached the end, exit
    if (this.end()) return null;

    // Skip white space
    if (this.skipWhiteSpace() > 0) return new Token('space');

    let result = null;

    // Is it a command?
    if (this.peek() === '\\') {
        this.get();  // Skip the initial \
        if (!this.end()) {
            // A command is either a string of letters and asterisks...
            let command = this.scan(/^[a-zA-Z*]+/);
            if (!command) {
                // ... or a single non-letter character
                command = this.get();
            }
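            // For example, '\frac' scans to the command 'frac', while '\;'
            // (a single non-letter) yields the command ';'.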
            // There are a few special commands that are handled here...
            if (command === 'bgroup') {
                // Begin group, synonym for opening brace
                result = new Token('{');
            } else if (command === 'egroup') {
                // End group, synonym for closing brace
                result = new Token('}');
            } else {
                result = new Token('command', command);
            }
        }

    // Is it a group start/end?
    } else if (this.peek() === '{' || this.peek() === '}') {
        result = new Token(this.get());

    } else if (this.peek() === '#') {
        // This could be either a param token, or a literal # (used for 
        // colorspecs, for example). A param token is a '#' followed by
        // - a digit 0-9 followed by a non-alpha, non-digit 
        // - or '?'. 
        // Otherwise, it's a literal '#'.
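        // For example, '#1}' yields a param token with value 1 (digit
        // followed by a non-alphanumeric char), while '#a0c0ff' yields
        // a literal '#' (followed by a letter).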
        this.get();
        if (!this.end()) {
            let isParam = false;
            let next = this.peek();
            if (/[0-9?]/.test(next)) {
                // Could be a param
                isParam = true;
                // Need to look ahead to the following char
                if (this.pos + 1 < this.s.length) {
                    const after = this.s[this.pos + 1];
                    isParam = /[^0-9A-Za-z]/.test(after);
                }
            }
            if (isParam) {
                result = new Token('#');
                next = this.get();
                if (next >= '0' && next <= '9') {
                    result.value = parseInt(next);
                } else {
                    result.value = '?';
                }
            } else {
                result = new Token('literal', '#');
            }
        }


        // result = new Token(this.get());
        // if (!this.end()) {
        //     const next = this.get();
        //     if (next >= '0' && next <= '9') {
        //         result.value = parseInt(next);
        //     } else {
        //         result.value = next;
        //     }
        // }

    } else if (this.peek() === '^') {
        result = new Token(this.get());

    } else if (this.peek() === '_') {
        result = new Token(this.get());

    } else if (this.peek() === '~') {
        // Tilde (~): non-breaking space in TeX; treated here as a 'space' command
        this.get();
        result = new Token('command', 'space');

    // Is it ESCAPE
    } else if (this.peek() === '\u001b') {   // ESCAPE character
        result = new Token('esc', this.get());

    // Is it a mode switch? 
    } else if (this.peek() === '$') {
        this.get();
        if (this.peek() === '$') {
            // $$ 
            this.get();
            result = new Token('$$');
        } else {
            // $
            result = new Token('$');
        }
    } else {
        result = new Token('literal', this.get());
    }

    return result;
}
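
// Illustrative sketch: a few of the special cases handled above, one token
// per call.
//
//     new Lexer('\\bgroup').makeToken();   // → Token('{')
//     new Lexer('~').makeToken();          // → Token('command', 'space')
//     new Lexer('$$x$$').makeToken();      // → Token('$$')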


/**
 * Create Tokens from a stream of LaTeX
 * 
 * @param {string} s - A string of LaTeX. It can include comments (with the `%`
 * marker) and multiple lines.
 * @return {Token[]}
 * @memberof module:lexer
 * @private
 */
function tokenize(s) {
    const result = [];
    const lines = s.toString().split(/\r?\n/);
    let stream = '';
    for (const line of lines) {
        // Remove everything after a % (comment marker)
        // (but \% should be preserved...)
        // @todo there's probably a better way of doing this using s.split(regex)
        let previousChar = '';
        for (let i = 0; i < line.length; i++) {
            const c = line.charAt(i);
            if (c === '%' && previousChar !== '\\') {
                break;
            }
            stream += c;
            previousChar = c;
        }
    }

    const lex = new Lexer(stream);
    while (!lex.end()) {
        const token = lex.makeToken();
        if (token) result.push(token);
    }

    return result;
}
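
// Usage sketch (illustrative): comments are stripped per line, but an
// escaped '\%' is preserved.
//
//     tokenize('5\\% % of the total')
//     // → [ Token('literal', '5'), Token('command', '%'), Token('space') ]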





return {
    tokenize
}




})