Compiling Regular Expressions

2023.2.13: geez this took months to write. Not really though; lots of stuff happened in the last couple of months.

Next step: compiling EBNFs into recursive descent parsers.

Basically the idea is that you can compile a regular expression to a VM program. Multiple regular expressions can be combined with | and thus is also covered.

I did this because I wanted to make my own lex/yacc... I can't fucking believe that I did a basic version (regexp expressed in JSON & compile to DFA and then JavaScript) and completly forgot about it.

The code is bad though so I'm gonna rewrite the thing anyway.

The link here explained it a lot better than mine...

The regular expression language & the VM

The basic regular expression is:

Regexp ::= EMPTY
         | CHARACTER
         | Regexp "*"           (Kleene star)
         | Regexp Regexp        (concatenation)
         | Regexp "|" Regexp    (union)

The expression A? can be replaced with A|\empty, A+ can be replaced with AA*, A{n,} can be replaced with "A repeated n times and ends with A*", A{,n} can be replaced with "A? repeated n times", A{n,m} can be replaced with "A repeated n times then A? repeated m-n times".

The VM has 4 instructions:

CHAR c: Matches the character c from the input. If match success, go to the next instruction; or else, the whole match of this thread (explained later) fail.
MATCH: Stop this thread because this signifies a match is found.
JMP x: Jump to another instruction. (I'm gonna make this jump an indirect jump for reasons.)
SPLIT x, y: Split the current thread and continues at both x and y.

And the regular expression language described above can be compiled like this:

EMPTY           (nothing)

CHARACTER       CHAR c (where c is the character)

e*              _1: SPLIT _2, _3
                _2: (code for e)
                _3:

e_1 e_2         (code for e_1)
                (code for e_2)
                
e_1 | e_2           SPLIT _1, _2
                _1: (code for e_1)
                    JMP _3
                _2: (code for e_2)
                _3: 

e?              _1: SPLIT _2, _3
                _2: (code for e)
                    JMP _1
                _3: 

e+              _1: (code for e)
                    SPLIT _1, _2
                _2:

The end of the VM program will have a MATCH instruction (of course).

So a simple interpreter for the VM would be:

enum VMInstrType { Char = 1, Match, Jump, Split, }
type VMInstr =
    { type: VMInstrType.Char, ch: string }
    | { type: VMInstrType.Match }
    | { type: VMInstrType.Jump, offset: number }
    | { type: VMInstrType.Split, t1: number, t2: number };
type VMThread = {
    pc: number,
    strp: number,
};
function runVm(prog: VMInstr[], str: string) {
    let threadpool: VMThread[] = [{pc:0, strp: 0}];
    function newThread(pc: number, strp: number) { threadpool.push({pc, strp}); }
    function pushThread(x: VMThread) { threadpool.push(x); }
    function takeThread() { return threadpool.shift(); }
    let fail: boolean = true;
    vmloop: while (threadpool.length > 0) {
        let th = takeThread()!;
        if (th.pc >= prog.length) {
            // NOTE: normally this should be an error because the program
            // must end with a MATCH thus during the execution it's either
            // (1) MATCH got executed or (2) CHAR executed and failed.
            break vmloop;
        }
        let i = prog[th.pc];
        switch (i.type) {
            case VMInstrType.Char: {
                if (str[th.strp] === i.ch) {
                    th.strp++;
                    th.pc++;
                    pushThread(th);
                }
                break;
            }
            case VMInstrType.Match: {
                fail = false;
                break vmloop;
            }
            case VMInstrType.Jump: {
                th.pc += i.offset;
                pushThread(th);
                break;
            }
            case VMInstrType.Split: {
                let oldpc = th.pc;
                th.pc += i.t1;
                pushThread(th);
                newThread(oldpc+i.t2, th.strp);
                break;
            }
        }
    }
    return !fail;
}

This works:

// a+b+a*
let program = [
    {type:VMInstrType.Char, ch:'a'},
    {type:VMInstrType.Split, t1:-1, t2:1},
    {type:VMInstrType.Char, ch:'b'},
    {type:VMInstrType.Split, t1:-1, t2:1},
    {type:VMInstrType.Split, t1:1, t2:2},
    {type:VMInstrType.Char, ch:'a'},
    {type:VMInstrType.Match}
];
console.log(runVm(program, 'aaabbbaaaaa'));
console.log(runVm(program, 'aaabbb'));
console.log(runVm(program, 'aaa'));
console.log(runVm(program, 'ccc'));
// Output:
// true
// true
// false
// false

We can have a compiler right now:

enum MyRegexpType {
    Empty = 1,
    Character,
    Star,
    Plus,
    Optional,
    Union,
    Concat,
}
type MyRegexp =
    { type: MyRegexpType.Empty }
    | { type: MyRegexpType.Character, ch: string }
    | { type: MyRegexpType.Star, e: MyRegexp }
    | { type: MyRegexpType.Plus, e: MyRegexp }
    | { type: MyRegexpType.Optional, e: MyRegexp }
    | { type: MyRegexpType.Union, e: MyRegexp[] }
    | { type: MyRegexpType.Concat, e: MyRegexp[] };

function _compileRegexp(x: MyRegexp): VMInstr[] {
    let res: VMInstr[] = [];
    switch (x.type) {
        case MyRegexpType.Empty: {
            break;
        }
        case MyRegexpType.Character: {
            res.push({type: VMInstrType.Char, ch: x.ch});
            break;
        }
        case MyRegexpType.Star: {
            let _e = _compileRegexp(x.e);
            res.push({type: VMInstrType.Split, t1:1, t2:_e.length+1});
            res = res.concat(_e);
            break;
        }
        case MyRegexpType.Plus: {
            let _e = _compileRegexp(x.e);
            res = res.concat(_e);
            res.push({type: VMInstrType.Split, t1:-_e.length, t2:1});
            break;
        }
        case MyRegexpType.Optional: {
            let _e = _compileRegexp(x.e);
            res.push({type: VMInstrType.Split, t1:1, t2:1+_e.length});
            res = res.concat(_e);
            res.push({type: VMInstrType.Jump, offset:-1-_e.length});
            break;
        }
        case MyRegexpType.Union: {
            if (x.e.length <= 0) { break; }
            // NOTE: we treat a union length of 1 as the regexp itself;
            // the regexp (|E) is expressed with Union(Empty,E).
            if (x.e.length <= 1) {
                res = res.concat(_compileRegexp(x.e[0]));
                break;
            }
            // this part is kinda hard to explain. first consider the case (e0|e1).
            // the output would be:
            //     SPLIT +1, +1+len(e0)
            //     (e0)
            //     JMP +1+len(e1)
            //     (e1)
            // now consider the case (e0|e1|e2):
            //     SPLIT +1, +1+len(e0)
            //     (e0)
            //     JMP +1+A
            //         SPLIT +1, +1+len(e1)
            //         (e1)
            //         JMP +1+B
            //         (e2)
            // B is obviously len(e2); but A should be 1+len(e1)+1+len(e2). one can
            // similarly consider the case (e0|e1|e2|e3), in which the JMP instruc-
            // -tions would be:
            //     JMP +1+1+len(e1)+1+1+len(e2)+1+len(e3)
            //     JMP +1+1+len(e2)+1+len(e3)
            //     JMP +1+len(e3)
            // so for a union with n subexpr, the offset for the ith (starting from 0)
            // JMP would be:
            //     +1 + (n-i-2)*2 + Sum{j=i+1 -> n-1, len(e_j)}
            // e.g. the JMP offsets of a union with 3 subexpr would be:
            //     +1 + 2 + len(e1)+len(e2)
            //     +1 + len(e2)
            // the JMP offsets of a union with 4 subexpr would be:
            //     +1 + 4 + len(e1)+len(e2)+len(e3)
            //     +1 + 2 + len(e2)+len(e3)
            //     +1 + len(e3)
            // the offset for the ith (starting from 0) SPLIT is obviously:
            //     +1, +2+len(e_i)
            // so we need a rolling sum of x.e.
            let rollingSumBase = 0;
            let rollingSum: number[] = [];
            let len: number[] = [];
            let buf: VMInstr[][] = [];
            for (let i = 0; i < x.e.length; i++) {
                let r = _compileRegexp(x.e[i]);
                buf.push(r);
                len[i] = r.length;
                rollingSum[i] = rollingSumBase + r.length;
                rollingSumBase = rollingSum[i];
            }
            for (let i = 0; i < x.e.length-1; i++) {
                res.push({type: VMInstrType.Split, t1:1, t2:2+len[i]});
                res = res.concat(buf[i]);
                res.push({
                    type: VMInstrType.Jump,
                    offset: 1+(x.e.length-i-2)*2+rollingSum[rollingSum.length-1]-rollingSum[i]
                });
            }
            // now we add the last part.
            res = res.concat(buf[x.e.length-1]);
            break;
        }
        case MyRegexpType.Concat: {
            x.e.forEach((v) => {
                let subprogram = _compileRegexp(v);
                res = res.concat(subprogram);
            });
            break;
        }

    }
    return res;
}
function compileRegexp(x: MyRegexp): VMInstr[] {
    return _compileRegexp(x).concat([{type: VMInstrType.Match}]);
}

which works as expected:

let z1 = compileRegexp({type: MyRegexpType.Character, ch: 'a'});
let z2 = compileRegexp({type: MyRegexpType.Union, e:[
    {type: MyRegexpType.Character, ch: 'a'},
    {type: MyRegexpType.Character, ch: 'b'},
]});
let z3 = compileRegexp({type: MyRegexpType.Union, e:[
    {type: MyRegexpType.Character, ch: 'a'},
    {type: MyRegexpType.Character, ch: 'b'},
    {type: MyRegexpType.Character, ch: 'c'},
]});
console.log(z1);
console.log(z2);
console.log(z3);
console.log('z1 a', runVm(z1, 'a'));
console.log('z2 a', runVm(z2, 'a'));
console.log('z3 a', runVm(z3, 'a'));
console.log('z1 b', runVm(z1, 'b'));
console.log('z2 b', runVm(z2, 'b'));
console.log('z3 b', runVm(z3, 'b'));
console.log('z1 c', runVm(z1, 'c'));
console.log('z2 c', runVm(z2, 'c'));
console.log('z3 c', runVm(z3, 'c'));

// [ { type: 1, ch: 'a' }, { type: 2 } ]
// [
//   { type: 4, t1: 1, t2: 3 },
//   { type: 1, ch: 'a' },
//   { type: 3, offset: 2 },
//   { type: 1, ch: 'b' },
//   { type: 2 }
// ]
// [
//   { type: 4, t1: 1, t2: 3 },
//   { type: 1, ch: 'a' },
//   { type: 3, offset: 5 },
//   { type: 4, t1: 1, t2: 3 },
//   { type: 1, ch: 'b' },
//   { type: 3, offset: 2 },
//   { type: 1, ch: 'c' },
//   { type: 2 }
// ]
// z1 a true
// z2 a true
// z3 a true
// z1 b false
// z2 b true
// z3 b true
// z1 c false
// z2 c false
// z3 c true

"Lock-step" VM

The VM listed on the original article is not efficient because itself being depth-first search will check some parts of the input string multiple times before eventually succeeding or failing (Consider matching the regexp (abc1|abc2|abc3|abc4) with abc4; the abc part will be checked multiple times before reaching the succeeding branch; of course the actuall execution order of branches can be changed but it's not a good idea to expect you can rely on this level of details.). The one I listed above is actually breadth-first with JavaScript standard library & engine GC behind my back; but I do intent to rewrite in C so this is worth checking out.

The basic observation for lock-step VM is that unions (in regexp) spawn threads and there's no "going back" on input inside single threads, i.e. input is strictly consumed by each thread in a single-pass left to right manner. "Lock-step" means every thread proceeds on the same position (of the input). The thread pool size won't need to be bigger than the length of the compiled regexp because if two threads are at the same PC they're definitely going down the same route (there's no extra state like in "conventional" VM, it's a DFA) which is effectively equivalent to one thread.

function runVm(prog: VMInstr[], str: string) {
    let threadpool: (VMThread|undefined)[] = new Array(prog.length);
    function pushThread(pc: number) { threadpool[pc] = {pc: pc}; }
    function takeThread(pc: number) {
        let th = threadpool[pc];
        threadpool[pc] = undefined;
        return th;
    }

    pushThread(0);
    for (let i = 0; i < str.length; i++) {
        for (let j = 0; j < threadpool.length; j++){
            let thread = takeThread(j);
            if (thread) {
                let pc = thread.pc;
                let instr = prog[pc];
                switch (instr.type) {
                    case VMInstrType.Char: {
                        if (instr.ch != str[i]) { break; }
                        pushThread(pc+1);
                        break;
                    }
                    case VMInstrType.Match: {
                        return true;
                    }
                    case VMInstrType.Jump: {
                        pushThread(pc+instr.offset);
                        break;
                    }
                    case VMInstrType.Split: {
                        pushThread(pc+instr.t1);
                        pushThread(pc+instr.t2);
                        break;
                    }
                }
            }
        }
    }
    return false;
}

Greedy and non-greedy matches

The original article says:

Having at most n threads on a list also bounds the amount of time spent processing each character. Assuming an efficient O(1) implementation of addthread (NOTE: pushThread in the code above), the worst case time for processing a single character is only O(n), so the time for the entire string is O(n m). This is far better than the essentially unbounded time requires by backtracking. (It also eliminates the infinite loops mentioned above.)

The code above has an O(1) pushThread but if we were to add non-greedy match there's a problem. See how non-greedy matches compile in that article comparing with greedy match:

e*              _1: SPLIT _2, _3
                _2: (code for e)
                _3:
                
e?              _1: SPLIT _2, _3
                _2: (code for e)
                    JMP _1
                _3: 

e+              _1: (code for e)
                    SPLIT _1, _2
                _2:

e*?             _1: SPLIT _3, _2
                _2: (code for e)
                _3:
                
e??             _1: SPLIT _3, _2
                _2: (code for e)
                    JMP _1
                _3: 

e+?             _1: (code for e)
                    SPLIT _2, _1
                _2:

The two offsets of the SPLIT instruction is reversed; this means the code I've listed above cannot properly support this because it iterates threadpool solely by PC. To keep addThread at O(1) we need to rewrite this:

function runVm(prog: VMInstr[], str: string) {
    let threadpool: (VMThread)[] = [];
    let threadpc: boolean[] = new Array(prog.length);
    function pushThread(pc: number) {
        if (!threadpc[pc]) {
            threadpc[pc] = true;
            threadpool.push({pc: pc});
        }
    }
    function takeThread() {
        let th = threadpool.pop();
        if (th) {
            threadpc[th.pc] = false;
        }
        return th;
    }

    pushThread(0);
    for (let i = 0; i < str.length; i++) {
        while (threadpool.length > 0) {
            let thread = takeThread();
            if (thread) {
                let pc = thread.pc;
                let instr = prog[pc];
                switch (instr.type) {
                    case VMInstrType.Char: {
                        if (instr.ch != str[i]) { break; }
                        pushThread(pc+1);
                        break;
                    }
                    case VMInstrType.Match: {
                        return true;
                    }
                    case VMInstrType.Jump: {
                        pushThread(pc+instr.offset);
                        break;
                    }
                    case VMInstrType.Split: {
                        pushThread(pc+instr.t1);
                        pushThread(pc+instr.t2);
                        break;
                    }
                }
            }
        }
    }
    return false;
}

We can update the compiler now:

// type MyRegexp
    // ...
    // NOTE: greedy by default. only when greedy is explicitly false the match is non-greedy.
    | { type: MyRegexpType.Star, e: MyRegexp, greedy?: boolean }
    | { type: MyRegexpType.Plus, e: MyRegexp, greedy?: boolean }
    | { type: MyRegexpType.Optional, e: MyRegexp, greedy?: boolean }
    // ...

// function _compileRegexp
    // ...
        case MyRegexpType.Star: {
            let _e = _compileRegexp(x.e);
            let t1 = x.greedy === false? _e.length+1 : 1;
            let t2 = x.greedy === false? 1 : _e.length+1;
            res.push({type: VMInstrType.Split, t1, t2});
            res = res.concat(_e);
            break;
        }
        case MyRegexpType.Plus: {
            let _e = _compileRegexp(x.e);
            let t1 = x.greedy === false? 1 : -_e.length+1;
            let t2 = x.greedy === false? _e.length+1 : 1;
            res = res.concat(_e);
            res.push({type: VMInstrType.Split, t1, t2});
            break;
        }
        case MyRegexpType.Optional: {
            let _e = _compileRegexp(x.e);
            let t1 = x.greedy === false? _e.length+1 : 1;
            let t2 = x.greedy === false? 1 : _e.length+1;
            res.push({type: VMInstrType.Split, t1, t2});
            res = res.concat(_e);
            res.push({type: VMInstrType.Jump, offset:-1-_e.length});
            break;
        }
    // ...

Technically making a lex replacement doesn't need to support submatch tracking so I'm gonna stop here. I might revisit this when I need to make my own regex engine...