The 2006 edition of the ICFP programming contest, one of the most enjoyable to date, introduced the Universal Machine used by a fictional society to program around 200 BC. Many people have tried their had at implementing the UM in a variety of languages. This table lists several C, C++ and Haskell implementations.

Even though it is clearly hard to beat C speed-wise here, high-level, functional programming languages like Haskell or OCaml can come quite close in spite of the very low-level nature of the problem. I'm getting 75% of C's performance (1m21s vs. 1m for the "SANDmark" benchmark on a 3GHz AMD64) in OCaml with straightforward code that performs array bound checking (i.e., unlike the other implementations, malicious machine images cannot take over the process). This makes it faster than the best performing C++ implementation on this table, and comparable to other C implementations; obviously, this says more about the C++ implementations than about the language itself, but it shows that they're all in the same league. (BTW, Haskell has improved a lot since GHC 6.5: the "ugly, fast" Fast.hs is only 2.25 times slower than edwardk.c with 6.8.2, and a bit worse with 6.10.3: 2m18s = 2.3X; um6.hs, less harmful to the eye, is 5.5X slower than edwardk.c with 6.10.3, though --- virtually the same as GHC 6.5. I had to change a few lines of code as some APIs have changed since.)

Here's the OCaml code:

let empty = [||]
let regs = Array.make 8 0
let platters = ref (Array.make 16 empty)
let free_platters = Intstack.make 1 15

(* we deliberately use Array.get to check bounds; use unsafe_ for speed *)
let get a n : int = Array.get a n
let get_platter n = Array.get !platters (n)

let load_prog_platter a = !platters.(0) <- a; a

let free_platter n = !platters.(n) <- empty; Intstack.push free_platters n

let rec make_platter size =
  try
    let id = Intstack.pop free_platters in
      !platters.(id) <- Array.make size 0;
      id
  with Intstack.Underflow ->
    let len = Array.length !platters in
    let platters' = Array.make (len lsl 1) empty in
      Array.blit !platters 0 platters' 0 len;
      platters := platters';
      Intstack.push_interval free_platters len len;
      make_platter size

let op ins = ins lsr 28
let rA ins = (ins lsr 6) land 7
let rB ins = (ins lsr 3) land 7
let rC ins = ins land 7

DEFINE RegA = regs.(rA ins)
DEFINE RegB = regs.(rB ins)
DEFINE RegC = regs.(rC ins)
DEFINE Next = exec prog (pc + 1)

let rec exec prog pc =
  let ins = get prog pc in
    match op ins with
        0 -> if RegC <> 0 then RegA <- RegB; Next
      | 1 -> RegA <- get (get_platter RegB) RegC; Next
      | 2 -> Array.set (get_platter RegA) RegB RegC; Next
      | 3 -> RegA <- (RegB + RegC) land 0xFFFFFFFF; Next
      | 4 -> RegA <- (RegB * RegC) land 0xFFFFFFFF; Next
      | 5 -> RegA <- (RegB land 0xFFFFFFFF) / (RegC land 0xFFFFFFFF); Next
      | 6 -> RegA <- lnot (RegB land RegC); Next
      | 7 -> print_endline "\nHALT"; exit 0
      | 8 -> RegB <- make_platter RegC; Next
      | 9 -> free_platter RegC; Next
      | 10 -> Printf.printf "%c%!" (Char.chr RegC); Next
      | 11 -> RegC <- Char.code (input_char stdin); Next
      | 12 -> if RegB = 0 then exec prog RegC
              else exec (load_prog_platter (Array.copy (get_platter RegB))) RegC
      | 13 -> regs.((ins lsr 25) land 7) <- (ins land 0x01ffffff); Next
      | _ -> failwith "Invalid instruction"

let read_i32 io =
  try
    let b3 = IO.read_byte io in
    let b2 = IO.read_byte io in
    let b1 = IO.read_byte io in
    let b0 = IO.read_byte io in
      Some (b0 + (b1 lsl 8) + (b2 lsl 16) + (b3 lsl 24))
  with IO.No_more_input -> None

let rec read_and_exec io prog = match read_i32 io with
    Some ins -> DynArray.add prog ins; read_and_exec io prog
  | None -> exec (load_prog_platter (DynArray.to_array prog)) 0

let () = read_and_exec (IO.input_channel stdin) (DynArray.make 128)

(Update: changed regs.(rX ins) <- ... to RegX <- ... in the above code; thanks to psykotic for noticing it.)

For convenience, I use camlp4's macro system and extlib, so this should be compiled with

ocamlfind ocamlopt -o um -nodynlink -inline 100 -pp "camlp4orf -unsafe" intstack.cmx um.ml -linkpkg

Unlike most UMs around, the above one is meant to run on a 64-bit system, so it has to assign identifiers to the allocated arrays ("platters"), as pointers don't fit in UM's 32-bit words. OTOH, this means I get to use OCaml's native 63-bit arithmetic instead of Int32's (which could be a bit faster but wouldn't be as convenient, as I would have to use pa_do's delimited overloading).

Achieving 75% of C's speed here is satisfying because the task leaves few to no opportunities to gain an advantage over C: it is so low-level that you cannot benefit from better algorithms or data structures, efficient code generation (complex pattern matching) or inter-module inlining. In fact, the only thing my OCaml program should be better at here is allocation, as it takes only 5 instructions to allocate small arrays, compared to several hundred (or possibly thousands) in C's malloc. Both the C version and the OCaml one admit further optimizations: in the latter, some Obj.magic tricks would allow to decrease the GC load, and the former would benefit from GCC's extensions (computed goto).

Finally, this is the trivial integer stack used by the above code to manage platter identifiers:

exception Underflow

type t = { mutable data : int array; mutable cnt : int }

let make start len = { data = Array.init len ((+) start); cnt = len }

let pop t =
  if t.cnt >= 1 then begin
    let n = t.data.(t.cnt - 1) in
      t.cnt <- t.cnt - 1;
      n
  end else raise Underflow

let resize t =
  let len = Array.length t.data in
  let data' = Array.make (len * 2) 0 in
    Array.blit t.data 0 data' 0 len;
    t.data <- data'

let push t n =
  if t.cnt + 1 > Array.length t.data then resize t;
  t.data.(t.cnt) <- n;
  t.cnt <- t.cnt + 1

let push_interval t start len =
  if t.cnt + len > Array.length t.data then resize t;
  for i = 0 to len - 1 do
    t.data.(t.cnt + i) <- start + i;
  done;
  t.cnt <- t.cnt + len

Comments

  1. I guess I'm a bit of an OCaml newbie, but what does "DEFINE" below mean?:

    DEFINE RegA = regs.(rA ins)

    I haven't seen that in OCaml... but again, I'm a bit of a newbie.

    Phrank, 22 July 2009 at 00:59#
  2. As the author of edwardk.c, I'd like to say that I wouldn't be so quick to claim that you "cannot benefit from better algorithms or data structures".

    You can blow the doors off of my straw man example just by jitting.

    The machine is actually fairly well behaved and doesn't resort to self-modifying code within an array, so I was able to get much better results by just calling out to GNU Lightning and could have probably done a lot better still with a better jitting strategy.

    Edward Kmett, 22 July 2009 at 05:28#
  3. Phrank: it's a directive defined by the Camlp4MacroParser camlp4 syntax extension; this is why you need the -pp "camlp4orf -unsafe" option to compile. It is documented here.

    Edward: I was referring to changes like using persistent structures instead of arrays for the program representation (so as to make the JMP instruction faster when loading a new platter, at the cost of making set/get slower --- of course, there's no way this could perform better), not removing the interpretative overhead by JITting, which is a sort of meta-optimization, if you see what I mean. Compiling with the OCaml LLVM bindings would be an interesting exercise, and the result would certainly trounce all the non-JITted VMs.

    mfp, 22 July 2009 at 08:07#