# Building a softcore

### Chapter 9: Building a WebAssembly Softcore for FPGA

Now we turn from software to hardware. Building a WebAssembly processor in hardware might seem like an unusual choice, but it offers several compelling advantages:

1. **Native WebAssembly execution** without translation overhead
2. **Predictable performance** with cycle-accurate execution
3. **Enhanced security** through hardware isolation
4. **Custom instruction extensions** for domain-specific workloads

We'll design our softcore using HardCaml, a hardware description language embedded in OCaml that provides excellent abstraction capabilities.

#### Design Overview

Our WebAssembly softcore will implement a stack-based processor with the following components:

1. **Instruction Fetch Unit** - Fetches instructions from memory
2. **Decode Unit** - Decodes WebAssembly instructions
3. **Operand Stack** - Hardware implementation of the WebAssembly stack
4. **Execution Units** - ALU, memory interface, control flow
5. **Local Storage** - Fast storage for local variables
6. **Memory Interface** - Linear memory access
7. **Call Stack** - Hardware call stack for function calls

Let's start with the basic types and interfaces:

```ocaml
(* wasm_types.ml *)
open Base
open Hardcaml

module Value_type = struct
  type t =
    | I32
    | I64
    | F32
    | F64
  [@@deriving sexp_of]
  
  let to_bits = function
    | I32 -> Bits.of_int ~width:2 0
    | I64 -> Bits.of_int ~width:2 1
    | F32 -> Bits.of_int ~width:2 2
    | F64 -> Bits.of_int ~width:2 3
end

module Wasm_value = struct
  (* We'll represent all values as 64-bit for simplicity *)
  type t = {
    data : Signal.t; (* 64 bits *)
    typ : Signal.t;  (* 2 bits for value type *)
    valid : Signal.t; (* 1 bit *)
  }

  let create ~data ~typ ~valid = { data; typ; valid }
  
  let width = 64 + 2 + 1 (* data + type + valid *)
  
  let pack t = 
    Signal.concat_msb [ t.valid; t.typ; t.data ]
    
  let unpack s =
    let data = Signal.select s ~high:63 ~low:0 in
    let typ = Signal.select s ~high:65 ~low:64 in
    let valid = Signal.bit s 66 in
    { data; typ; valid }
end

(* Opcode definitions *)
module Opcode = struct
  type t =
    | Unreachable
    | Nop
    | Block
    | Loop
    | If
    | Else
    | End
    | Br
    | Br_if
    | Return
    | Call
    | Drop
    | Select
    | Local_get
    | Local_set
    | Local_tee
    | I32_load
    | I32_store
    | I32_const
    | I32_add
    | I32_sub
    | I32_mul
    | I32_div_s
    | I32_eq
    | I32_ne
    | I32_lt_s
    (* ... more opcodes ... *)
  [@@deriving sexp_of, enumerate]

  let to_bits opcode =
    (* Map opcodes to their WebAssembly binary values *)
    let code = match opcode with
      | Unreachable -> 0x00
      | Nop -> 0x01
      | Block -> 0x02
      | Loop -> 0x03
      | If -> 0x04
      | Else -> 0x05
      | End -> 0x0B
      | Br -> 0x0C
      | Br_if -> 0x0D
      | Return -> 0x0F
      | Call -> 0x10
      | Drop -> 0x1A
      | Select -> 0x1B
      | Local_get -> 0x20
      | Local_set -> 0x21
      | Local_tee -> 0x22
      | I32_load -> 0x28
      | I32_store -> 0x36
      | I32_const -> 0x41
      | I32_add -> 0x6A
      | I32_sub -> 0x6B
      | I32_mul -> 0x6C
      | I32_div_s -> 0x6D
      | I32_eq -> 0x46
      | I32_ne -> 0x47
      | I32_lt_s -> 0x48
    in
    Bits.of_int ~width:8 code
end
```

#### The Stack Machine Core

The heart of our processor is the stack machine. WebAssembly's stack-based execution maps naturally to hardware:

```ocaml
(* stack_machine.ml *)
open Base
open Hardcaml
open Signal.Std
open Wasm_types

module Stack_config = struct
  let depth = 256  (* Stack depth *)
  let width = Wasm_value.width
end

(* Hardware stack implementation *)
module Stack = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      push : 'a;
      pop : 'a;
      data_in : 'a [@bits Stack_config.width];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      data_out : 'a [@bits Stack_config.width];
      empty : 'a;
      full : 'a;
      depth : 'a [@bits 8]; (* Current stack depth *)
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Stack pointer *)
    let sp_next = Variable.wire ~default:(zero 8) in
    let sp = reg spec sp_next.value in
    
    (* Stack memory - using Block RAM *)
    let stack_ram = Ram.create
      ~collision_mode:Read_before_write
      ~size:Stack_config.depth
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = sp;
          write_enable = i.push;
          write_data = i.data_in }
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = mux2 i.pop (sp -:. 1) sp }
      |]
    in
    
    let data_out = stack_ram.(0) in
    
    (* Stack pointer logic *)
    let sp_inc = sp +:. 1 in
    let sp_dec = sp -:. 1 in
    let sp_next_val = 
      mux2 i.push sp_inc @@
      mux2 i.pop sp_dec sp in
    Variable.assign sp_next sp_next_val;
    
    (* Status flags *)
    let empty = sp ==:. 0 in
    let full = sp ==:. (Stack_config.depth - 1) in
    
    { O.data_out; empty; full; depth = sp }
end

(* Main processor core *)
module Processor = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      enable : 'a;
      
      (* Instruction memory interface *)
      inst_data : 'a [@bits 32];
      inst_ready : 'a;
      
      (* Data memory interface *)
      mem_read_data : 'a [@bits 32];
      mem_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* Instruction fetch *)
      inst_addr : 'a [@bits 32];
      inst_req : 'a;
      
      (* Data memory *)
      mem_addr : 'a [@bits 32];
      mem_write_data : 'a [@bits 32];
      mem_write_enable : 'a;
      mem_read_enable : 'a;
      
      (* Status *)
      halted : 'a;
      error : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Program counter *)
    let pc_next = Variable.wire ~default:(zero 32) in
    let pc = reg spec pc_next.value in
    
    (* Current instruction *)
    let current_inst = reg spec ~enable:i.inst_ready i.inst_data in
    let opcode = select current_inst ~high:7 ~low:0 in
    
    (* Operand stack *)
    let stack_push = Variable.wire ~default:gnd in
    let stack_pop = Variable.wire ~default:gnd in
    let stack_data_in = Variable.wire ~default:(zero Stack_config.width) in
    
    let stack = Stack.create scope {
      Stack.I.clock = i.clock;
      clear = i.clear;
      push = stack_push.value;
      pop = stack_pop.value;
      data_in = stack_data_in.value;
    } in
    
    (* Instruction decode *)
    let decode_result = Instruction_decode.create scope {
      Instruction_decode.I.opcode;
      instruction = current_inst;
    } in
    
    (* Execution units *)
    let alu_result = Alu.create scope {
      Alu.I.operation = decode_result.alu_op;
      operand_a = Wasm_value.unpack stack.data_out;
      operand_b = Wasm_value.unpack stack.data_out; (* This needs proper dual-port stack *)
      enable = decode_result.alu_enable;
    } in
    
    (* State machine for instruction execution *)
    let module State = struct
      type t = 
        | Fetch
        | Decode  
        | Execute
        | Memory_wait
        | Stack_op
      [@@deriving sexp_of, enumerate, compare]
    end in
    
    let state = State_machine.create (module State) spec ~enable:i.enable in
    
    (* Control logic *)
    let fetch_next_inst = Variable.wire ~default:gnd in
    let execute_alu = Variable.wire ~default:gnd in
    let memory_op = Variable.wire ~default:gnd in
    
    always [
      switch (state.current) [
        State.Fetch, [
          when_ i.inst_ready [
            state.set_next State.Decode;
            fetch_next_inst <-- vdd;
          ];
        ];
        
        State.Decode, [
          state.set_next State.Execute;
        ];
        
        State.Execute, [
          switch opcode [
            (* Arithmetic operations *)
            Bits.of_int ~width:8 0x6A, [ (* i32.add *)
              execute_alu <-- vdd;
              stack_pop <-- vdd;
              stack_push <-- vdd;
              stack_data_in <-- Wasm_value.pack alu_result.result;
              state.set_next State.Fetch;
            ];
            
            (* Constants *)
            Bits.of_int ~width:8 0x41, [ (* i32.const *)
              let const_val = select current_inst ~high:31 ~low:8 in
              let wasm_val = Wasm_value.create 
                ~data:(uresize const_val 64) 
                ~typ:(Value_type.to_bits I32)
                ~valid:vdd in
              stack_push <-- vdd;
              stack_data_in <-- Wasm_value.pack wasm_val;
              state.set_next State.Fetch;
            ];
            
            (* Memory operations *)
            Bits.of_int ~width:8 0x28, [ (* i32.load *)
              memory_op <-- vdd;
              state.set_next State.Memory_wait;
            ];
            
            (* Control flow *)
            Bits.of_int ~width:8 0x0F, [ (* return *)
              (* Return logic - simplified *)
              state.set_next State.Fetch;
            ];
            
            (* Default case *)
            (default, [
              (* Unknown instruction - error *)
            ]);
          ];
        ];
        
        State.Memory_wait, [
          when_ i.mem_ready [
            stack_push <-- vdd;
            let mem_val = Wasm_value.create
              ~data:(uresize i.mem_read_data 64)
              ~typ:(Value_type.to_bits I32)
              ~valid:vdd in
            stack_data_in <-- Wasm_value.pack mem_val;
            state.set_next State.Fetch;
          ];
        ];
      ];
    ];
    
    (* Update PC *)
    Variable.assign pc_next (mux2 fetch_next_inst (pc +:. 4) pc);
    
    (* Memory interface *)
    let stack_top = Wasm_value.unpack stack.data_out in
    let mem_addr = resize_unsigned stack_top.data 32 in
    let mem_write_data = resize_unsigned stack_top.data 32 in
    
    { O.
      inst_addr = pc;
      inst_req = state.current ==: State.Fetch;
      mem_addr;
      mem_write_data;
      mem_write_enable = memory_op.value &: decode_result.is_store;
      mem_read_enable = memory_op.value &: decode_result.is_load;
      halted = gnd; (* TODO: Implement halt logic *)
      error = gnd;  (* TODO: Implement error detection *)
    }
end
```

#### Instruction Decode Unit

The instruction decoder translates WebAssembly opcodes into control signals:

```ocaml
(* instruction_decode.ml *)
module Instruction_decode = struct
  module I = struct
    type 'a t = {
      opcode : 'a [@bits 8];
      instruction : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* ALU control *)
      alu_op : 'a [@bits 4];
      alu_enable : 'a;
      
      (* Memory control *)
      is_load : 'a;
      is_store : 'a;
      mem_size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word *)
      
      (* Stack control *)
      stack_push : 'a;
      stack_pop : 'a;
      pop_count : 'a [@bits 3]; (* How many values to pop *)
      push_count : 'a [@bits 3]; (* How many values to push *)
      
      (* Control flow *)
      is_branch : 'a;
      is_call : 'a;
      is_return : 'a;
      
      (* Local variable access *)
      is_local_get : 'a;
      is_local_set : 'a;
      local_index : 'a [@bits 16];
      
      (* Immediate values *)
      has_immediate : 'a;
      immediate : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    
    (* Decode table - this could be implemented as a ROM *)
    let decode_table opcode =
      let zero_output = {
        O.alu_op = zero 4;
        alu_enable = gnd;
        is_load = gnd;
        is_store = gnd;
        mem_size = zero 2;
        stack_push = gnd;
        stack_pop = gnd;
        pop_count = zero 3;
        push_count = zero 3;
        is_branch = gnd;
        is_call = gnd;
        is_return = gnd;
        is_local_get = gnd;
        is_local_set = gnd;
        local_index = zero 16;
        has_immediate = gnd;
        immediate = zero 32;
      } in
      
      mux (uresize opcode 8) [
        (* 0x00: unreachable *)
        { zero_output with (* trap *) };
        
        (* 0x01: nop *)
        zero_output;
        
        (* ... skip to interesting opcodes ... *)
        
        (* 0x20: local.get *)
        { zero_output with 
          is_local_get = vdd;
          stack_push = vdd;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8; (* local index *)
        };
        
        (* 0x21: local.set *)
        { zero_output with
          is_local_set = vdd;
          stack_pop = vdd;
          pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8;
        };
        
        (* 0x28: i32.load *)
        { zero_output with
          is_load = vdd;
          mem_size = Bits.of_int ~width:2 2 |> of_bit_string; (* word *)
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
        
        (* 0x36: i32.store *)
        { zero_output with
          is_store = vdd;
          mem_size = Bits.of_int ~width:2 2 |> of_bit_string;
          stack_pop = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string; (* address + value *)
        };
        
        (* 0x41: i32.const *)
        { zero_output with
          stack_push = vdd;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
          has_immediate = vdd;
          immediate = select i.instruction ~high:31 ~low:8; (* constant value *)
        };
        
        (* 0x6A: i32.add *)
        { zero_output with
          alu_op = Bits.of_int ~width:4 0 |> of_bit_string; (* ADD *)
          alu_enable = vdd;
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
        
        (* 0x6B: i32.sub *)
        { zero_output with
          alu_op = Bits.of_int ~width:4 1 |> of_bit_string; (* SUB *)
          alu_enable = vdd;
          stack_pop = vdd;
          stack_push = vdd;
          pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
          push_count = Bits.of_int ~width:3 1 |> of_bit_string;
        };
      ] ~default:zero_output
    in
    
    decode_table i.opcode
end
```

#### ALU Implementation

The arithmetic logic unit handles all computational operations:

```ocaml
(* alu.ml *)
module Alu = struct
  module I = struct
    type 'a t = {
      operation : 'a [@bits 4];
      operand_a : Wasm_value.t;
      operand_b : Wasm_value.t;
      enable : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      result : Wasm_value.t;
      overflow : 'a;
      divide_by_zero : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    
    (* ALU operations *)
    let module Op = struct
      let add = of_bit_string (Bits.of_int ~width:4 0)
      let sub = of_bit_string (Bits.of_int ~width:4 1)
```

```ocaml
      let mul = of_bit_string (Bits.of_int ~width:4 2)
      let div_s = of_bit_string (Bits.of_int ~width:4 3)
      let div_u = of_bit_string (Bits.of_int ~width:4 4)
      let rem_s = of_bit_string (Bits.of_int ~width:4 5)
      let rem_u = of_bit_string (Bits.of_int ~width:4 6)
      let and_ = of_bit_string (Bits.of_int ~width:4 7)
      let or_ = of_bit_string (Bits.of_int ~width:4 8)
      let xor = of_bit_string (Bits.of_int ~width:4 9)
      let shl = of_bit_string (Bits.of_int ~width:4 10)
      let shr_s = of_bit_string (Bits.of_int ~width:4 11)
      let shr_u = of_bit_string (Bits.of_int ~width:4 12)
      let eq = of_bit_string (Bits.of_int ~width:4 13)
      let ne = of_bit_string (Bits.of_int ~width:4 14)
      let lt_s = of_bit_string (Bits.of_int ~width:4 15)
    end in
    
    (* Extract operands based on type *)
    let a_i32 = select i.operand_a.data ~high:31 ~low:0 in
    let b_i32 = select i.operand_b.data ~high:31 ~low:0 in
    let a_i64 = i.operand_a.data in
    let b_i64 = i.operand_b.data in
    
    (* Arithmetic operations *)
    let add_result_i32 = a_i32 +: b_i32 in
    let sub_result_i32 = a_i32 -: b_i32 in
    let mul_result_i32 = a_i32 *: b_i32 in
    
    (* Division requires special handling for divide-by-zero *)
    let div_by_zero_i32 = b_i32 ==:. 0 in
    let div_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /+ b_i32) in
    let div_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /: b_i32) in
    
    (* Remainder operations *)
    let rem_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %+ b_i32) in
    let rem_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %: b_i32) in
    
    (* Bitwise operations *)
    let and_result_i32 = a_i32 &: b_i32 in
    let or_result_i32 = a_i32 |: b_i32 in
    let xor_result_i32 = a_i32 ^: b_i32 in
    
    (* Shift operations (b is shift amount, only lower 5 bits used for i32) *)
    let shift_amount = select b_i32 ~high:4 ~low:0 in
    let shl_result_i32 = a_i32 <<: (uresize shift_amount 6) in
    let shr_s_result_i32 = a_i32 >>+ (uresize shift_amount 6) in
    let shr_u_result_i32 = a_i32 >>: (uresize shift_amount 6) in
    
    (* Comparison operations (return 1 for true, 0 for false) *)
    let eq_result_i32 = mux2 (a_i32 ==: b_i32) (one 32) (zero 32) in
    let ne_result_i32 = mux2 (a_i32 <>: b_i32) (one 32) (zero 32) in
    let lt_s_result_i32 = mux2 (a_i32 <+ b_i32) (one 32) (zero 32) in
    
    (* Similar operations for i64 *)
    let add_result_i64 = a_i64 +: b_i64 in
    let sub_result_i64 = a_i64 -: b_i64 in
    (* ... more i64 operations ... *)
    
    (* Result selection based on operation and operand types *)
    let result_data = 
      let i32_result = mux (uresize i.operation 4) [
        add_result_i32;      (* ADD *)
        sub_result_i32;      (* SUB *)
        mul_result_i32;      (* MUL *)
        div_s_result_i32;    (* DIV_S *)
        div_u_result_i32;    (* DIV_U *)
        rem_s_result_i32;    (* REM_S *)
        rem_u_result_i32;    (* REM_U *)
        and_result_i32;      (* AND *)
        or_result_i32;       (* OR *)
        xor_result_i32;      (* XOR *)
        shl_result_i32;      (* SHL *)
        shr_s_result_i32;    (* SHR_S *)
        shr_u_result_i32;    (* SHR_U *)
        eq_result_i32;       (* EQ *)
        ne_result_i32;       (* NE *)
        lt_s_result_i32;     (* LT_S *)
      ] in
      
      (* For now, assume all operations are on i32 *)
      (* In a full implementation, we'd switch on operand type *)
      uresize i32_result 64
    in
    
    let result_type = i.operand_a.typ in (* Result type same as operands for most ops *)
    let result_valid = i.enable &: i.operand_a.valid &: i.operand_b.valid in
    
    let result = Wasm_value.create 
      ~data:result_data 
      ~typ:result_type 
      ~valid:result_valid in
    
    (* Overflow detection *)
    let overflow = 
      (* Simple overflow detection for addition/subtraction *)
      let add_overflow = 
        let sign_a = msb a_i32 in
        let sign_b = msb b_i32 in
        let sign_result = msb add_result_i32 in
        (sign_a &: sign_b &: ~:sign_result) |: (~:sign_a &: ~:sign_b &: sign_result)
      in
      
      let sub_overflow =
        let sign_a = msb a_i32 in
        let sign_b = msb b_i32 in
        let sign_result = msb sub_result_i32 in
        (sign_a &: ~:sign_b &: ~:sign_result) |: (~:sign_a &: sign_b &: sign_result)
      in
      
      mux2 (i.operation ==: Op.add) add_overflow @@
      mux2 (i.operation ==: Op.sub) sub_overflow gnd
    in
    
    { O.result; overflow; divide_by_zero = div_by_zero_i32 }
end
```

#### Local Variable Storage

WebAssembly functions have local variables that need fast access. We'll implement this with a dedicated memory:

```ocaml
(* locals.ml *)
module Local_storage = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Read port *)
      read_addr : 'a [@bits 16];
      read_enable : 'a;
      
      (* Write port *)
      write_addr : 'a [@bits 16];
      write_data : 'a [@bits Wasm_value.width];
      write_enable : 'a;
      
      (* Function call setup *)
      func_entry : 'a;
      param_count : 'a [@bits 8];
      local_count : 'a [@bits 8];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      read_data : 'a [@bits Wasm_value.width];
      read_valid : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Local variable memory - dual port RAM *)
    let local_ram = Ram.create
      ~collision_mode:Read_before_write
      ~size:65536 (* 64K locals max *)
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = i.write_addr;
          write_enable = i.write_enable;
          write_data = i.write_data }
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = i.read_addr }
      |]
    in
    
    let read_data = local_ram.(0) in
    let read_valid = reg spec i.read_enable in
    
    (* TODO: Add function frame management for locals *)
    
    { O.read_data; read_valid }
end
```

#### Memory Subsystem

The linear memory interface handles load and store operations:

```ocaml
(* memory_interface.ml *)
module Memory_interface = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Processor interface *)
      addr : 'a [@bits 32];
      write_data : 'a [@bits 32];
      read_enable : 'a;
      write_enable : 'a;
      size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word, 3=double *)
      
      (* External memory interface *)
      ext_read_data : 'a [@bits 32];
      ext_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      read_data : 'a [@bits 64];
      ready : 'a;
      
      (* External memory interface *)
      ext_addr : 'a [@bits 32];
      ext_write_data : 'a [@bits 32];
      ext_read_enable : 'a;
      ext_write_enable : 'a;
      ext_byte_enable : 'a [@bits 4];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Address alignment and bounds checking *)
    let aligned_addr = i.addr &: (~~:. 0x3) in (* Word align *)
    let addr_valid = 
      (* Check for valid alignment based on access size *)
      mux i.size [
        vdd;                           (* byte - no alignment needed *)
        ~:(bit i.addr 0);             (* half - must be 2-byte aligned *)
        ~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x3))) |> reduce ~f:(|:); (* word - 4-byte aligned *)
        ~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x7))) |> reduce ~f:(|:); (* double - 8-byte aligned *)
      ]
    in
    
    (* Byte enable generation *)
    let byte_enable = 
      mux i.size [
        of_bit_string (Bits.of_int ~width:4 0x1); (* byte *)
        of_bit_string (Bits.of_int ~width:4 0x3); (* half *)
        of_bit_string (Bits.of_int ~width:4 0xF); (* word *)
        of_bit_string (Bits.of_int ~width:4 0xF); (* double - handled as two word accesses *)
      ]
    in
    
    (* State machine for memory operations *)
    let module Mem_state = struct
      type t = Idle | Read | Write | Wait [@@deriving sexp_of, enumerate]
    end in
    
    let mem_state = State_machine.create (module Mem_state) spec in
    
    (* Memory operation logic *)
    let start_read = i.read_enable &: addr_valid in
    let start_write = i.write_enable &: addr_valid in
    
    always [
      switch (mem_state.current) [
        Mem_state.Idle, [
          when_ start_read [ mem_state.set_next Mem_state.Read ];
          when_ start_write [ mem_state.set_next Mem_state.Write ];
        ];
        
        Mem_state.Read, [
          when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
        ];
        
        Mem_state.Write, [
          when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
        ];
      ];
    ];
    
    (* Output logic *)
    let ext_read_enable = mem_state.current ==: Mem_state.Read in
    let ext_write_enable = mem_state.current ==: Mem_state.Write in
    
    let read_data = 
      (* Sign/zero extend based on access size *)
      mux i.size [
        sresize i.ext_read_data 64;  (* byte - sign extend *)
        sresize i.ext_read_data 64;  (* half - sign extend *)
        uresize i.ext_read_data 64;  (* word - zero extend *)
        uresize i.ext_read_data 64;  (* double - TODO: implement proper 64-bit reads *)
      ]
    in
    
    { O.
      read_data;
      ready = mem_state.current ==: Mem_state.Idle;
      ext_addr = aligned_addr;
      ext_write_data = i.write_data;
      ext_read_enable;
      ext_write_enable;
      ext_byte_enable = byte_enable;
    }
end
```

#### Control Flow and Function Calls

WebAssembly's structured control flow requires careful hardware implementation:

```ocaml
(* control_flow.ml *)
module Control_flow = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      
      (* Branch control *)
      branch_enable : 'a;
      branch_target : 'a [@bits 32];
      branch_condition : 'a;
      is_conditional : 'a;
      
      (* Call/return *)
      call_enable : 'a;
      call_target : 'a [@bits 32];
      return_enable : 'a;
      
      (* Block structure *)
      block_enter : 'a;
      block_exit : 'a;
      loop_enter : 'a;
      
      (* Current PC *)
      current_pc : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      next_pc : 'a [@bits 32];
      pc_update : 'a;
      
      (* Stack management *)
      call_stack_push : 'a;
      call_stack_pop : 'a;
      call_stack_data : 'a [@bits 32];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Call stack for return addresses *)
    let call_stack_sp_next = Variable.wire ~default:(zero 8) in
    let call_stack_sp = reg spec call_stack_sp_next.value in
    
    let call_stack = Ram.create
      ~collision_mode:Read_before_write
      ~size:256 (* 256 call levels *)
      ~write_ports:[|
        { write_clock = i.clock;
          write_address = call_stack_sp;
          write_enable = i.call_enable;
          write_data = i.current_pc +:. 4 } (* Return address *)
      |]
      ~read_ports:[|
        { read_clock = i.clock;
          read_address = call_stack_sp -:. 1 }
      |]
    in
    
    let return_addr = call_stack.(0) in
    
    (* Control flow decision *)
    let next_pc = 
      (* Priority: call > return > branch > sequential *)
      mux2 i.call_enable i.call_target @@
      mux2 i.return_enable return_addr @@
      mux2 (i.branch_enable &: (i.branch_condition |: ~:i.is_conditional)) i.branch_target @@
      (i.current_pc +:. 4) (* Sequential *)
    in
    
    let pc_update = 
      i.call_enable |: i.return_enable |: 
      (i.branch_enable &: (i.branch_condition |: ~:i.is_conditional))
    in
    
    (* Update call stack pointer *)
    let sp_next = 
      mux2 i.call_enable (call_stack_sp +:. 1) @@
      mux2 i.return_enable (call_stack_sp -:. 1) @@
      call_stack_sp
    in
    Variable.assign call_stack_sp_next sp_next;
    
    { O.
      next_pc;
      pc_update;
      call_stack_push = i.call_enable;
      call_stack_pop = i.return_enable;
      call_stack_data = i.current_pc +:. 4;
    }
end
```

#### Top-Level Integration

Finally, let's integrate everything into a complete WebAssembly processor:

```ocaml
(* wasm_processor.ml *)
module Wasm_processor = struct
  module I = struct
    type 'a t = {
      clock : 'a;
      clear : 'a;
      enable : 'a;
      
      (* Instruction memory *)
      inst_data : 'a [@bits 32];
      inst_ready : 'a;
      
      (* Data memory *)
      mem_read_data : 'a [@bits 32];
      mem_ready : 'a;
    } [@@deriving sexp_of, hardcaml]
  end

  module O = struct
    type 'a t = {
      (* Instruction fetch *)
      inst_addr : 'a [@bits 32];
      inst_req : 'a;
      
      (* Data memory *)
      mem_addr : 'a [@bits 32];
      mem_write_data : 'a [@bits 32];
      mem_write_enable : 'a;
      mem_read_enable : 'a;
      mem_byte_enable : 'a [@bits 4];
      
      (* Status *)
      halted : 'a;
      error : 'a;
      
      (* Debug *)
      pc : 'a [@bits 32];
      stack_depth : 'a [@bits 8];
    } [@@deriving sexp_of, hardcaml]
  end

  let create scope (i : _ I.t) =
    let open Signal in
    let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
    
    (* Create all the submodules *)
    let processor = Processor.create scope {
      Processor.I.
      clock = i.clock;
      clear = i.clear;
      enable = i.enable;
      inst_data = i.inst_data;
      inst_ready = i.inst_ready;
      mem_read_data = i.mem_read_data;
      mem_ready = i.mem_ready;
    } in
    
    { O.
      inst_addr = processor.inst_addr;
      inst_req = processor.inst_req;
      mem_addr = processor.mem_addr;
      mem_write_data = processor.mem_write_data;
      mem_write_enable = processor.mem_write_enable;
      mem_read_enable = processor.mem_read_enable;
      mem_byte_enable = ones 4; (* TODO: Connect proper byte enables *)
      halted = processor.halted;
      error = processor.error;
      pc = processor.inst_addr;
      stack_depth = zero 8; (* TODO: Connect actual stack depth *)
    }
end
```

#### Synthesis and Testing

To synthesize this design for the Arty A7 FPGA, you would:

1. **Generate Verilog**: Use HardCaml's Verilog backend to produce synthesizable RTL
2. **Create constraints**: Define pin assignments and timing constraints for the Arty A7
3. **Add memory controllers**: Interface with the DDR3 RAM and other peripherals
4. **Build bootloader**: Create firmware to load WebAssembly modules
5. **Implement debugging**: Add UART interface for debugging and module loading

Here's a basic testbench structure:

```ocaml
(* test_wasm_processor.ml *)
open Base
open Hardcaml
open Hardcaml_waveterm

let create_test_program () =
  (* Simple WebAssembly program: adds two constants *)
  [|
    0x41000005l; (* i32.const 5 *)
    0x41000003l; (* i32.const 3 *)
    0x6A00000Bl; (* i32.add + end *)
  |]

let test_processor () =
  let module Sim = Cyclesim.Make(Wasm_processor) in
  
  let test_program = create_test_program () in
  let program_size = Array.length test_program in
  
  let sim = Sim.create ~config:Cyclesim.Config.trace_all 
    (Wasm_processor.create (Scope.create ~auto_label_hierarchical_ports:true ())) in
    
  let inputs = Cyclesim.inputs sim in
  let outputs = Cyclesim.outputs sim in
  
  (* Initialize *)
  inputs.clear := Bits.vdd;
  inputs.enable := Bits.gnd;
  Cyclesim.cycle sim;
  
  inputs.clear := Bits.gnd;
  inputs.enable := Bits.vdd;
  
  (* Simulate instruction fetch and execution *)
  for cycle = 0 to 100 do
    (* Provide instruction data when requested *)
    if Bits.to_bool !(outputs.inst_req) then (
      let pc = Bits.to_int !(outputs.inst_addr) / 4 in
      if pc < program_size then
        inputs.inst_data := Bits.of_int ~width:32 (Int32.to_int_exn test_program.(pc))
      else
        inputs.inst_data := Bits.zero 32;
      inputs.inst_ready := Bits.vdd;
    ) else (
      inputs.inst_ready := Bits.gnd;
    );
    
    (* Handle memory requests *)
    inputs.mem_ready := !(outputs.mem_read_enable) |: !(outputs.mem_write_enable);
    inputs.mem_read_data := Bits.zero 32;
    
    Cyclesim.cycle sim;
    
    (* Print status *)
    Printf.printf "Cycle %d: PC=%s Stack_depth=%s\n"
      cycle
      (Bits.to_string !(outputs.pc))
      (Bits.to_string !(outputs.stack_depth));
  done;
  
  (* Generate waveforms *)
  let waves = Hardcaml_waveterm.Waveform.create sim in
  Hardcaml_waveterm.Waveform.print ~display_width:120 ~display_height:30 waves
```

This WebAssembly softcore provides several advantages:

1. **Deterministic Performance**: Every instruction has a known execution time
2. **Native WebAssembly**: No translation overhead
3. **Security**: Hardware-enforced sandboxing
4. **Customization**: Can add domain-specific instructions
5. **Energy Efficiency**: Optimized for the specific workload

The design can be extended with:

* Floating-point units for F32/F64 operations
* Multi-threading support
* Custom instruction extensions
* Hardware garbage collection support
* Advanced branch prediction


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://ocamlstreet.gitbook.io/hardcaml-wiki/wasm/building-a-softcore.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
