Building a softcore

in ocaml for prototyping

Chapter 9: Building a WebAssembly Softcore for FPGA

Now we turn from software to hardware. Building a WebAssembly processor in hardware might seem like an unusual choice, but it offers several compelling advantages:

  1. Native WebAssembly execution without translation overhead

  2. Predictable performance with cycle-accurate execution

  3. Enhanced security through hardware isolation

  4. Custom instruction extensions for domain-specific workloads

We'll design our softcore using HardCaml, a hardware description language embedded in OCaml that provides excellent abstraction capabilities.

Design Overview

Our WebAssembly softcore will implement a stack-based processor with the following components:

  1. Instruction Fetch Unit - Fetches instructions from memory

  2. Decode Unit - Decodes WebAssembly instructions

  3. Operand Stack - Hardware implementation of the WebAssembly stack

  4. Execution Units - ALU, memory interface, control flow

  5. Local Storage - Fast storage for local variables

  6. Memory Interface - Linear memory access

  7. Call Stack - Hardware call stack for function calls

Let's start with the basic types and interfaces:

(* wasm_types.ml *)
open Base
open Hardcaml

module Value_type = struct
  type t =
    | I32
    | I64
    | F32
    | F64
  [@@deriving sexp_of]
  
  let to_bits = function
    | I32 -> Bits.of_int ~width:2 0
    | I64 -> Bits.of_int ~width:2 1
    | F32 -> Bits.of_int ~width:2 2
    | F64 -> Bits.of_int ~width:2 3
end

module Wasm_value = struct
  (* We'll represent all values as 64-bit for simplicity *)
  type t = {
    data : Signal.t; (* 64 bits *)
    typ : Signal.t;  (* 2 bits for value type *)
    valid : Signal.t; (* 1 bit *)
  }

  let create ~data ~typ ~valid = { data; typ; valid }
  
  let width = 64 + 2 + 1 (* data + type + valid *)
  
  let pack t = 
    Signal.concat_msb [ t.valid; t.typ; t.data ]
    
  let unpack s =
    let data = Signal.select s ~high:63 ~low:0 in
    let typ = Signal.select s ~high:65 ~low:64 in
    let valid = Signal.bit s 66 in
    { data; typ; valid }
end

(* Opcode definitions *)
module Opcode = struct
  type t =
    | Unreachable
    | Nop
    | Block
    | Loop
    | If
    | Else
    | End
    | Br
    | Br_if
    | Return
    | Call
    | Drop
    | Select
    | Local_get
    | Local_set
    | Local_tee
    | I32_load
    | I32_store
    | I32_const
    | I32_add
    | I32_sub
    | I32_mul
    | I32_div_s
    | I32_eq
    | I32_ne
    | I32_lt_s
    (* ... more opcodes ... *)
  [@@deriving sexp_of, enumerate]

  let to_bits opcode =
    (* Map opcodes to their WebAssembly binary values *)
    let code = match opcode with
      | Unreachable -> 0x00
      | Nop -> 0x01
      | Block -> 0x02
      | Loop -> 0x03
      | If -> 0x04
      | Else -> 0x05
      | End -> 0x0B
      | Br -> 0x0C
      | Br_if -> 0x0D
      | Return -> 0x0F
      | Call -> 0x10
      | Drop -> 0x1A
      | Select -> 0x1B
      | Local_get -> 0x20
      | Local_set -> 0x21
      | Local_tee -> 0x22
      | I32_load -> 0x28
      | I32_store -> 0x36
      | I32_const -> 0x41
      | I32_add -> 0x6A
      | I32_sub -> 0x6B
      | I32_mul -> 0x6C
      | I32_div_s -> 0x6D
      | I32_eq -> 0x46
      | I32_ne -> 0x47
      | I32_lt_s -> 0x48
    in
    Bits.of_int ~width:8 code
end

The Stack Machine Core

The heart of our processor is the stack machine. WebAssembly's stack-based execution maps naturally to hardware:

(* stack_machine.ml *)
open Base
open Hardcaml
open Signal.Std
open Wasm_types

module Stack_config = struct
  let depth = 256  (* Stack depth *)
  let width = Wasm_value.width
end

(* Hardware stack implementation *)
module Stack = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      push : 'a;
      pop : 'a;
      data_in : 'a [@bits Stack_config.width];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      data_out : 'a [@bits Stack_config.width];
      empty : 'a;
      full : 'a;
      depth : 'a [@bits 8]; (* Current stack depth *)
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Stack pointer *)
    let sp_next = Variable.wire ~default:(zero 8) in
    let sp = reg spec sp_next.value in
    
    (* Stack memory - using Block RAM *)
    let stack_ram = Ram.create
      ~collision_mode:Read_before_write
      ~size:Stack_config.depth
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = sp;
          write_enable = i.push;
          write_data = i.data_in }
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = mux2 i.pop (sp -:. 1) sp }
      |]
    in
    
    let data_out = stack_ram.(0) in
    
    (* Stack pointer logic *)
    let sp_inc = sp +:. 1 in
    let sp_dec = sp -:. 1 in
    let sp_next_val = 
      mux2 i.push sp_inc @@
      mux2 i.pop sp_dec sp in
    Variable.assign sp_next sp_next_val;
    
    (* Status flags *)
    let empty = sp ==:. 0 in
    let full = sp ==:. (Stack_config.depth - 1) in
    
    { O.data_out; empty; full; depth = sp }
end

(* Main processor core *)
module Processor = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      enable : 'a;
      
      (* Instruction memory interface *)
      inst_data : 'a [@bits 32];
      inst_ready : 'a;
      
      (* Data memory interface *)
      mem_read_data : 'a [@bits 32];
      mem_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* Instruction fetch *)
      inst_addr : 'a [@bits 32];
      inst_req : 'a;
      
      (* Data memory *)
      mem_addr : 'a [@bits 32];
      mem_write_data : 'a [@bits 32];
      mem_write_enable : 'a;
      mem_read_enable : 'a;
      
      (* Status *)
      halted : 'a;
      error : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Program counter *)
    let pc_next = Variable.wire ~default:(zero 32) in
    let pc = reg spec pc_next.value in
    
    (* Current instruction *)
    let current_inst = reg spec ~enable:i.inst_ready i.inst_data in
    let opcode = select current_inst ~high:7 ~low:0 in
    
    (* Operand stack *)
    let stack_push = Variable.wire ~default:gnd in
    let stack_pop = Variable.wire ~default:gnd in
    let stack_data_in = Variable.wire ~default:(zero Stack_config.width) in
    
    let stack = Stack.create scope {
      Stack.I.clock = i.clock;
      clear = i.clear;
      push = stack_push.value;
      pop = stack_pop.value;
      data_in = stack_data_in.value;
    } in
    
    (* Instruction decode *)
    let decode_result = Instruction_decode.create scope {
      Instruction_decode.I.opcode;
      instruction = current_inst;
    } in
    
    (* Execution units *)
    let alu_result = Alu.create scope {
      Alu.I.operation = decode_result.alu_op;
      operand_a = Wasm_value.unpack stack.data_out;
      operand_b = Wasm_value.unpack stack.data_out; (* This needs proper dual-port stack *)
      enable = decode_result.alu_enable;
    } in
    
    (* State machine for instruction execution *)
    let module State = struct
      type t = 
        | Fetch
        | Decode  
        | Execute
        | Memory_wait
        | Stack_op
      [@@deriving sexp_of, enumerate, compare]
    end in
    
    let state = State_machine.create (module State) spec ~enable:i.enable in
    
    (* Control logic *)
    let fetch_next_inst = Variable.wire ~default:gnd in
    let execute_alu = Variable.wire ~default:gnd in
    let memory_op = Variable.wire ~default:gnd in
    
    always [
      switch (state.current) [
        State.Fetch, [
          when_ i.inst_ready [
            state.set_next State.Decode;
            fetch_next_inst <-- vdd;
          ];
        ];
        
        State.Decode, [
          state.set_next State.Execute;
        ];
        
        State.Execute, [
          switch opcode [
            (* Arithmetic operations *)
            Bits.of_int ~width:8 0x6A, [ (* i32.add *)
              execute_alu <-- vdd;
              stack_pop <-- vdd;
              stack_push <-- vdd;
              stack_data_in <-- Wasm_value.pack alu_result.result;
              state.set_next State.Fetch;
            ];
            
            (* Constants *)
            Bits.of_int ~width:8 0x41, [ (* i32.const *)
              let const_val = select current_inst ~high:31 ~low:8 in
              let wasm_val = Wasm_value.create 
                ~data:(uresize const_val 64) 
                ~typ:(Value_type.to_bits I32)
                ~valid:vdd in
              stack_push <-- vdd;
              stack_data_in <-- Wasm_value.pack wasm_val;
              state.set_next State.Fetch;
            ];
            
            (* Memory operations *)
            Bits.of_int ~width:8 0x28, [ (* i32.load *)
              memory_op <-- vdd;
              state.set_next State.Memory_wait;
            ];
            
            (* Control flow *)
            Bits.of_int ~width:8 0x0F, [ (* return *)
              (* Return logic - simplified *)
              state.set_next State.Fetch;
            ];
            
            (* Default case *)
            (default, [
              (* Unknown instruction - error *)
            ]);
          ];
        ];
        
        State.Memory_wait, [
          when_ i.mem_ready [
            stack_push <-- vdd;
            let mem_val = Wasm_value.create
              ~data:(uresize i.mem_read_data 64)
              ~typ:(Value_type.to_bits I32)
              ~valid:vdd in
            stack_data_in <-- Wasm_value.pack mem_val;
            state.set_next State.Fetch;
          ];
        ];
      ];
    ];
    
    (* Update PC *)
    Variable.assign pc_next (mux2 fetch_next_inst (pc +:. 4) pc);
    
    (* Memory interface *)
    let stack_top = Wasm_value.unpack stack.data_out in
    let mem_addr = resize_unsigned stack_top.data 32 in
    let mem_write_data = resize_unsigned stack_top.data 32 in
    
    { O.
      inst_addr = pc;
      inst_req = state.current ==: State.Fetch;
      mem_addr;
      mem_write_data;
      mem_write_enable = memory_op.value &: decode_result.is_store;
      mem_read_enable = memory_op.value &: decode_result.is_load;
      halted = gnd; (* TODO: Implement halt logic *)
      error = gnd;  (* TODO: Implement error detection *)
    }
end

Instruction Decode Unit

The instruction decoder translates WebAssembly opcodes into control signals:

(* instruction_decode.ml *)
module Instruction_decode = struct
  module I = struct
    type 'a t = {
      opcode : 'a [@bits 8];
      instruction : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* ALU control *)
      alu_op : 'a [@bits 4];
      alu_enable : 'a;
      
      (* Memory control *)
      is_load : 'a;
      is_store : 'a;
      mem_size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word *)
      
      (* Stack control *)
      stack_push : 'a;
      stack_pop : 'a;
      pop_count : 'a [@bits 3]; (* How many values to pop *)
      push_count : 'a [@bits 3]; (* How many values to push *)
      
      (* Control flow *)
      is_branch : 'a;
      is_call : 'a;
      is_return : 'a;
      
      (* Local variable access *)
      is_local_get : 'a;
      is_local_set : 'a;
      local_index : 'a [@bits 16];
      
      (* Immediate values *)
      has_immediate : 'a;
      immediate : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    
    (* Decode table - this could be implemented as a ROM *)
    let decode_table opcode =
      let zero_output = {
        O.alu_op = zero 4;
        alu_enable = gnd;
        is_load = gnd;
        is_store = gnd;
        mem_size = zero 2;
        stack_push = gnd;
        stack_pop = gnd;
        pop_count = zero 3;
        push_count = zero 3;
        is_branch = gnd;
        is_call = gnd;
        is_return = gnd;
        is_local_get = gnd;
        is_local_set = gnd;
        local_index = zero 16;
        has_immediate = gnd;
        immediate = zero 32;
      } in
      
      mux (uresize opcode 8) [
        (* 0x00: unreachable *)
        { zero_output with (* trap *) };
        
        (* 0x01: nop *)
        zero_output;
        
        (* ... skip to interesting opcodes ... *)
        
        (* 0x20: local.get *)
        { zero_output with 
          is_local_get = vdd;
          stack_push = vdd;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8; (* local index *)
        };
        
        (* 0x21: local.set *)
        { zero_output with
          is_local_set = vdd;
          stack_pop = vdd;
          pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8;
        };
        
        (* 0x28: i32.load *)
        { zero_output with
          is_load = vdd;
          mem_size = Bits.of_int ~width:2 2 |> of_bit_string; (* word *)
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
        
        (* 0x36: i32.store *)
        { zero_output with
          is_store = vdd;
          mem_size = Bits.of_int ~width:2 2 |> of_bit_string;
          stack_pop = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string; (* address + value *)
        };
        
        (* 0x41: i32.const *)
        { zero_output with
          stack_push = vdd;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8; (* constant value *)
        };
        
        (* 0x6A: i32.add *)
        { zero_output with
          alu_op = Bits.of_int ~width:4 0 |> of_bit_string; (* ADD *)
          alu_enable = vdd;
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
        
        (* 0x6B: i32.sub *)
        { zero_output with
          alu_op = Bits.of_int ~width:4 1 |> of_bit_string; (* SUB *)
          alu_enable = vdd;
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
      ] ~default:zero_output
    in
    
    decode_table i.opcode
end

ALU Implementation

The arithmetic logic unit handles all computational operations:

(* alu.ml *)
module Alu = struct
  module I = struct
    type 'a t = {
      operation : 'a [@bits 4];
      operand_a : Wasm_value.t;
      operand_b : Wasm_value.t;
      enable : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      result : Wasm_value.t;
      overflow : 'a;
      divide_by_zero : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    
    (* ALU operations *)
    let module Op = struct
      let add = of_bit_string (Bits.of_int ~width:4 0)
      let sub = of_bit_string (Bits.of_int ~width:4 1)

      let mul = of_bit_string (Bits.of_int ~width:4 2)
      let div_s = of_bit_string (Bits.of_int ~width:4 3)
      let div_u = of_bit_string (Bits.of_int ~width:4 4)
      let rem_s = of_bit_string (Bits.of_int ~width:4 5)
      let rem_u = of_bit_string (Bits.of_int ~width:4 6)
      let and_ = of_bit_string (Bits.of_int ~width:4 7)
      let or_ = of_bit_string (Bits.of_int ~width:4 8)
      let xor = of_bit_string (Bits.of_int ~width:4 9)
      let shl = of_bit_string (Bits.of_int ~width:4 10)
      let shr_s = of_bit_string (Bits.of_int ~width:4 11)
      let shr_u = of_bit_string (Bits.of_int ~width:4 12)
      let eq = of_bit_string (Bits.of_int ~width:4 13)
      let ne = of_bit_string (Bits.of_int ~width:4 14)
      let lt_s = of_bit_string (Bits.of_int ~width:4 15)
    end in
    
    (* Extract operands based on type *)
    let a_i32 = select i.operand_a.data ~high:31 ~low:0 in
    let b_i32 = select i.operand_b.data ~high:31 ~low:0 in
    let a_i64 = i.operand_a.data in
    let b_i64 = i.operand_b.data in
    
    (* Arithmetic operations *)
    let add_result_i32 = a_i32 +: b_i32 in
    let sub_result_i32 = a_i32 -: b_i32 in
    let mul_result_i32 = a_i32 *: b_i32 in
    
    (* Division requires special handling for divide-by-zero *)
    let div_by_zero_i32 = b_i32 ==:. 0 in
    let div_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /+ b_i32) in
    let div_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /: b_i32) in
    
    (* Remainder operations *)
    let rem_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %+ b_i32) in
    let rem_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %: b_i32) in
    
    (* Bitwise operations *)
    let and_result_i32 = a_i32 &: b_i32 in
    let or_result_i32 = a_i32 |: b_i32 in
    let xor_result_i32 = a_i32 ^: b_i32 in
    
    (* Shift operations (b is shift amount, only lower 5 bits used for i32) *)
    let shift_amount = select b_i32 ~high:4 ~low:0 in
    let shl_result_i32 = a_i32 <<: (uresize shift_amount 6) in
    let shr_s_result_i32 = a_i32 >>+ (uresize shift_amount 6) in
    let shr_u_result_i32 = a_i32 >>: (uresize shift_amount 6) in
    
    (* Comparison operations (return 1 for true, 0 for false) *)
    let eq_result_i32 = mux2 (a_i32 ==: b_i32) (one 32) (zero 32) in
    let ne_result_i32 = mux2 (a_i32 <>: b_i32) (one 32) (zero 32) in
    let lt_s_result_i32 = mux2 (a_i32 <+ b_i32) (one 32) (zero 32) in
    
    (* Similar operations for i64 *)
    let add_result_i64 = a_i64 +: b_i64 in
    let sub_result_i64 = a_i64 -: b_i64 in
    (* ... more i64 operations ... *)
    
    (* Result selection based on operation and operand types *)
    let result_data = 
      let i32_result = mux (uresize i.operation 4) [
        add_result_i32;      (* ADD *)
        sub_result_i32;      (* SUB *)
        mul_result_i32;      (* MUL *)
        div_s_result_i32;    (* DIV_S *)
        div_u_result_i32;    (* DIV_U *)
        rem_s_result_i32;    (* REM_S *)
        rem_u_result_i32;    (* REM_U *)
        and_result_i32;      (* AND *)
        or_result_i32;       (* OR *)
        xor_result_i32;      (* XOR *)
        shl_result_i32;      (* SHL *)
        shr_s_result_i32;    (* SHR_S *)
        shr_u_result_i32;    (* SHR_U *)
        eq_result_i32;       (* EQ *)
        ne_result_i32;       (* NE *)
        lt_s_result_i32;     (* LT_S *)
      ] in
      
      (* For now, assume all operations are on i32 *)
      (* In a full implementation, we'd switch on operand type *)
      uresize i32_result 64
    in
    
    let result_type = i.operand_a.typ in (* Result type same as operands for most ops *)
    let result_valid = i.enable &: i.operand_a.valid &: i.operand_b.valid in
    
    let result = Wasm_value.create 
      ~data:result_data 
      ~typ:result_type 
      ~valid:result_valid in
    
    (* Overflow detection *)
    let overflow = 
      (* Simple overflow detection for addition/subtraction *)
      let add_overflow = 
        let sign_a = msb a_i32 in
        let sign_b = msb b_i32 in
        let sign_result = msb add_result_i32 in
        (sign_a &: sign_b &: ~:sign_result) |: (~:sign_a &: ~:sign_b &: sign_result)
      in
      
      let sub_overflow =
        let sign_a = msb a_i32 in
        let sign_b = msb b_i32 in
        let sign_result = msb sub_result_i32 in
        (sign_a &: ~:sign_b &: ~:sign_result) |: (~:sign_a &: sign_b &: sign_result)
      in
      
      mux2 (i.operation ==: Op.add) add_overflow @@
      mux2 (i.operation ==: Op.sub) sub_overflow gnd
    in
    
    { O.result; overflow; divide_by_zero = div_by_zero_i32 }
end

Local Variable Storage

WebAssembly functions have local variables that need fast access. We'll implement this with a dedicated memory:

(* locals.ml *)
module Local_storage = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Read port *)
      read_addr : 'a [@bits 16];
      read_enable : 'a;
      
      (* Write port *)
      write_addr : 'a [@bits 16];
      write_data : 'a [@bits Wasm_value.width];
      write_enable : 'a;
      
      (* Function call setup *)
      func_entry : 'a;
      param_count : 'a [@bits 8];
      local_count : 'a [@bits 8];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      read_data : 'a [@bits Wasm_value.width];
      read_valid : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Local variable memory - dual port RAM *)
    let local_ram = Ram.create
      ~collision_mode:Read_before_write
      ~size:65536 (* 64K locals max *)
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = i.write_addr;
          write_enable = i.write_enable;
          write_data = i.write_data }
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = i.read_addr }
      |]
    in
    
    let read_data = local_ram.(0) in
    let read_valid = reg spec i.read_enable in
    
    (* TODO: Add function frame management for locals *)
    
    { O.read_data; read_valid }
end

Memory Subsystem

The linear memory interface handles load and store operations:

(* memory_interface.ml *)
module Memory_interface = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Processor interface *)
      addr : 'a [@bits 32];
      write_data : 'a [@bits 32];
      read_enable : 'a;
      write_enable : 'a;
      size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word, 3=double *)
      
      (* External memory interface *)
      ext_read_data : 'a [@bits 32];
      ext_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      read_data : 'a [@bits 64];
      ready : 'a;
      
      (* External memory interface *)
      ext_addr : 'a [@bits 32];
      ext_write_data : 'a [@bits 32];
      ext_read_enable : 'a;
      ext_write_enable : 'a;
      ext_byte_enable : 'a [@bits 4];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Address alignment and bounds checking *)
    let aligned_addr = i.addr &: (~~:. 0x3) in (* Word align *)
    let addr_valid = 
      (* Check for valid alignment based on access size *)
      mux i.size [
        vdd;                           (* byte - no alignment needed *)
        ~:(bit i.addr 0);             (* half - must be 2-byte aligned *)
        ~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x3))) |> reduce ~f:(|:); (* word - 4-byte aligned *)
        ~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x7))) |> reduce ~f:(|:); (* double - 8-byte aligned *)
      ]
    in
    
    (* Byte enable generation *)
    let byte_enable = 
      mux i.size [
        of_bit_string (Bits.of_int ~width:4 0x1); (* byte *)
        of_bit_string (Bits.of_int ~width:4 0x3); (* half *)
        of_bit_string (Bits.of_int ~width:4 0xF); (* word *)
        of_bit_string (Bits.of_int ~width:4 0xF); (* double - handled as two word accesses *)
      ]
    in
    
    (* State machine for memory operations *)
    let module Mem_state = struct
      type t = Idle | Read | Write | Wait [@@deriving sexp_of, enumerate]
    end in
    
    let mem_state = State_machine.create (module Mem_state) spec in
    
    (* Memory operation logic *)
    let start_read = i.read_enable &: addr_valid in
    let start_write = i.write_enable &: addr_valid in
    
    always [
      switch (mem_state.current) [
        Mem_state.Idle, [
          when_ start_read [ mem_state.set_next Mem_state.Read ];
          when_ start_write [ mem_state.set_next Mem_state.Write ];
        ];
        
        Mem_state.Read, [
          when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
        ];
        
        Mem_state.Write, [
          when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
        ];
      ];
    ];
    
    (* Output logic *)
    let ext_read_enable = mem_state.current ==: Mem_state.Read in
    let ext_write_enable = mem_state.current ==: Mem_state.Write in
    
    let read_data = 
      (* Sign/zero extend based on access size *)
      mux i.size [
        sresize i.ext_read_data 64;  (* byte - sign extend *)
        sresize i.ext_read_data 64;  (* half - sign extend *)
        uresize i.ext_read_data 64;  (* word - zero extend *)
        uresize i.ext_read_data 64;  (* double - TODO: implement proper 64-bit reads *)
      ]
    in
    
    { O.
      read_data;
      ready = mem_state.current ==: Mem_state.Idle;
      ext_addr = aligned_addr;
      ext_write_data = i.write_data;
      ext_read_enable;
      ext_write_enable;
      ext_byte_enable = byte_enable;
    }
end

Control Flow and Function Calls

WebAssembly's structured control flow requires careful hardware implementation:

(* control_flow.ml *)
module Control_flow = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Branch control *)
      branch_enable : 'a;
      branch_target : 'a [@bits 32];
      branch_condition : 'a;
      is_conditional : 'a;
      
      (* Call/return *)
      call_enable : 'a;
      call_target : 'a [@bits 32];
      return_enable : 'a;
      
      (* Block structure *)
      block_enter : 'a;
      block_exit : 'a;
      loop_enter : 'a;
      
      (* Current PC *)
      current_pc : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      next_pc : 'a [@bits 32];
      pc_update : 'a;
      
      (* Stack management *)
      call_stack_push : 'a;
      call_stack_pop : 'a;
      call_stack_data : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Call stack for return addresses *)
    let call_stack_sp_next = Variable.wire ~default:(zero 8) in
    let call_stack_sp = reg spec call_stack_sp_next.value in
    
    let call_stack = Ram.create
      ~collision_mode:Read_before_write
      ~size:256 (* 256 call levels *)
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = call_stack_sp;
          write_enable = i.call_enable;
          write_data = i.current_pc +:. 4 } (* Return address *)
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = call_stack_sp -:. 1 }
      |]
    in
    
    let return_addr = call_stack.(0) in
    
    (* Control flow decision *)
    let next_pc = 
      (* Priority: call > return > branch > sequential *)
      mux2 i.call_enable i.call_target @@
      mux2 i.return_enable return_addr @@
      mux2 (i.branch_enable &: (i.branch_condition |: ~:i.is_conditional)) i.branch_target @@
      (i.current_pc +:. 4) (* Sequential *)
    in
    
    let pc_update = 
      i.call_enable |: i.return_enable |: 
      (i.branch_enable &: (i.branch_condition |: ~:i.is_conditional))
    in
    
    (* Update call stack pointer *)
    let sp_next = 
      mux2 i.call_enable (call_stack_sp +:. 1) @@
      mux2 i.return_enable (call_stack_sp -:. 1) @@
      call_stack_sp
    in
    Variable.assign call_stack_sp_next sp_next;
    
    { O.
      next_pc;
      pc_update;
      call_stack_push = i.call_enable;
      call_stack_pop = i.return_enable;
      call_stack_data = i.current_pc +:. 4;
    }
end

Top-Level Integration

Finally, let's integrate everything into a complete WebAssembly processor:

(* wasm_processor.ml *)
module Wasm_processor = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      enable : 'a;
      
      (* Instruction memory *)
      inst_data : 'a [@bits 32];
      inst_ready : 'a;
      
      (* Data memory *)
      mem_read_data : 'a [@bits 32];
      mem_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* Instruction fetch *)
      inst_addr : 'a [@bits 32];
      inst_req : 'a;
      
      (* Data memory *)
      mem_addr : 'a [@bits 32];
      mem_write_data : 'a [@bits 32];
      mem_write_enable : 'a;
      mem_read_enable : 'a;
      mem_byte_enable : 'a [@bits 4];
      
      (* Status *)
      halted : 'a;
      error : 'a;
      
      (* Debug *)
      pc : 'a [@bits 32];
      stack_depth : 'a [@bits 8];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Create all the submodules *)
    let processor = Processor.create scope {
      Processor.I.
      clock = i.clock;
      clear = i.clear;
      enable = i.enable;
      inst_data = i.inst_data;
      inst_ready = i.inst_ready;
      mem_read_data = i.mem_read_data;
      mem_ready = i.mem_ready;
    } in
    
    { O.
      inst_addr = processor.inst_addr;
      inst_req = processor.inst_req;
      mem_addr = processor.mem_addr;
      mem_write_data = processor.mem_write_data;
      mem_write_enable = processor.mem_write_enable;
      mem_read_enable = processor.mem_read_enable;
      mem_byte_enable = ones 4; (* TODO: Connect proper byte enables *)
      halted = processor.halted;
      error = processor.error;
      pc = processor.inst_addr;
      stack_depth = zero 8; (* TODO: Connect actual stack depth *)
    }
end

Synthesis and Testing

To synthesize this design for the Arty A7 FPGA, you would:

  1. Generate Verilog: Use HardCaml's Verilog backend to produce synthesizable RTL

  2. Create constraints: Define pin assignments and timing constraints for the Arty A7

  3. Add memory controllers: Interface with the DDR3 RAM and other peripherals

  4. Build bootloader: Create firmware to load WebAssembly modules

  5. Implement debugging: Add UART interface for debugging and module loading

Here's a basic testbench structure:

(* test_wasm_processor.ml *)
open Base
open Hardcaml
open Hardcaml_waveterm

let create_test_program () =
  (* Simple WebAssembly program: adds two constants *)
  [|
    0x41000005l; (* i32.const 5 *)
    0x41000003l; (* i32.const 3 *)
    0x6A00000Bl; (* i32.add + end *)
  |]

let test_processor () =
  let module Sim = Cyclesim.Make(Wasm_processor) in
  
  let test_program = create_test_program () in
  let program_size = Array.length test_program in
  
  let sim = Sim.create ~config:Cyclesim.Config.trace_all 
    (Wasm_processor.create (Scope.create ~auto_label_hierarchical_ports:true ())) in
    
  let inputs = Cyclesim.inputs sim in
  let outputs = Cyclesim.outputs sim in
  
  (* Initialize *)
  inputs.clear := Bits.vdd;
  inputs.enable := Bits.gnd;
  Cyclesim.cycle sim;
  
  inputs.clear := Bits.gnd;
  inputs.enable := Bits.vdd;
  
  (* Simulate instruction fetch and execution *)
  for cycle = 0 to 100 do
    (* Provide instruction data when requested *)
    if Bits.to_bool !(outputs.inst_req) then (
      let pc = Bits.to_int !(outputs.inst_addr) / 4 in
      if pc < program_size then
        inputs.inst_data := Bits.of_int ~width:32 (Int32.to_int_exn test_program.(pc))
      else
        inputs.inst_data := Bits.zero 32;
      inputs.inst_ready := Bits.vdd;
    ) else (
      inputs.inst_ready := Bits.gnd;
    );
    
    (* Handle memory requests *)
    inputs.mem_ready := !(outputs.mem_read_enable) |: !(outputs.mem_write_enable);
    inputs.mem_read_data := Bits.zero 32;
    
    Cyclesim.cycle sim;
    
    (* Print status *)
    Printf.printf "Cycle %d: PC=%s Stack_depth=%s\n"
      cycle
      (Bits.to_string !(outputs.pc))
      (Bits.to_string !(outputs.stack_depth));
  done;
  
  (* Generate waveforms *)
  let waves = Hardcaml_waveterm.Waveform.create sim in
  Hardcaml_waveterm.Waveform.print ~display_width:120 ~display_height:30 waves

This WebAssembly softcore provides several advantages:

  1. Deterministic Performance: Every instruction has a known execution time

  2. Native WebAssembly: No translation overhead

  3. Security: Hardware-enforced sandboxing

  4. Customization: Can add domain-specific instructions

  5. Energy Efficiency: Optimized for the specific workload

The design can be extended with:

  • Floating-point units for F32/F64 operations

  • Multi-threading support

  • Custom instruction extensions

  • Hardware garbage collection support

  • Advanced branch prediction

Last updated