Building a softcore
in ocaml for prototyping
Chapter 9: Building a WebAssembly Softcore for FPGA
Now we turn from software to hardware. Building a WebAssembly processor in hardware might seem like an unusual choice, but it offers several compelling advantages:
Native WebAssembly execution without translation overhead
Predictable performance with cycle-accurate execution
Enhanced security through hardware isolation
Custom instruction extensions for domain-specific workloads
We'll design our softcore using HardCaml, a hardware description language embedded in OCaml that provides excellent abstraction capabilities.
Design Overview
Our WebAssembly softcore will implement a stack-based processor with the following components:
Instruction Fetch Unit - Fetches instructions from memory
Decode Unit - Decodes WebAssembly instructions
Operand Stack - Hardware implementation of the WebAssembly stack
Execution Units - ALU, memory interface, control flow
Local Storage - Fast storage for local variables
Memory Interface - Linear memory access
Call Stack - Hardware call stack for function calls
Let's start with the basic types and interfaces:
(* wasm_types.ml *)
open Base
open Hardcaml
module Value_type = struct
type t =
| I32
| I64
| F32
| F64
[@@deriving sexp_of]
let to_bits = function
| I32 -> Bits.of_int ~width:2 0
| I64 -> Bits.of_int ~width:2 1
| F32 -> Bits.of_int ~width:2 2
| F64 -> Bits.of_int ~width:2 3
end
module Wasm_value = struct
(* We'll represent all values as 64-bit for simplicity *)
type t = {
data : Signal.t; (* 64 bits *)
typ : Signal.t; (* 2 bits for value type *)
valid : Signal.t; (* 1 bit *)
}
let create ~data ~typ ~valid = { data; typ; valid }
let width = 64 + 2 + 1 (* data + type + valid *)
let pack t =
Signal.concat_msb [ t.valid; t.typ; t.data ]
let unpack s =
let data = Signal.select s ~high:63 ~low:0 in
let typ = Signal.select s ~high:65 ~low:64 in
let valid = Signal.bit s 66 in
{ data; typ; valid }
end
(* Opcode definitions *)
module Opcode = struct
type t =
| Unreachable
| Nop
| Block
| Loop
| If
| Else
| End
| Br
| Br_if
| Return
| Call
| Drop
| Select
| Local_get
| Local_set
| Local_tee
| I32_load
| I32_store
| I32_const
| I32_add
| I32_sub
| I32_mul
| I32_div_s
| I32_eq
| I32_ne
| I32_lt_s
(* ... more opcodes ... *)
[@@deriving sexp_of, enumerate]
let to_bits opcode =
(* Map opcodes to their WebAssembly binary values *)
let code = match opcode with
| Unreachable -> 0x00
| Nop -> 0x01
| Block -> 0x02
| Loop -> 0x03
| If -> 0x04
| Else -> 0x05
| End -> 0x0B
| Br -> 0x0C
| Br_if -> 0x0D
| Return -> 0x0F
| Call -> 0x10
| Drop -> 0x1A
| Select -> 0x1B
| Local_get -> 0x20
| Local_set -> 0x21
| Local_tee -> 0x22
| I32_load -> 0x28
| I32_store -> 0x36
| I32_const -> 0x41
| I32_add -> 0x6A
| I32_sub -> 0x6B
| I32_mul -> 0x6C
| I32_div_s -> 0x6D
| I32_eq -> 0x46
| I32_ne -> 0x47
| I32_lt_s -> 0x48
in
Bits.of_int ~width:8 code
end
The Stack Machine Core
The heart of our processor is the stack machine. WebAssembly's stack-based execution maps naturally to hardware:
(* stack_machine.ml *)
open Base
open Hardcaml
open Signal.Std
open Wasm_types
module Stack_config = struct
let depth = 256 (* Stack depth *)
let width = Wasm_value.width
end
(* Hardware stack implementation *)
module Stack = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
push : 'a;
pop : 'a;
data_in : 'a [@bits Stack_config.width];
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
data_out : 'a [@bits Stack_config.width];
empty : 'a;
full : 'a;
depth : 'a [@bits 8]; (* Current stack depth *)
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Stack pointer *)
let sp_next = Variable.wire ~default:(zero 8) in
let sp = reg spec sp_next.value in
(* Stack memory - using Block RAM *)
let stack_ram = Ram.create
~collision_mode:Read_before_write
~size:Stack_config.depth
~write_ports:[|
{ write_clock = i.clock;
write_address = sp;
write_enable = i.push;
write_data = i.data_in }
|]
~read_ports:[|
{ read_clock = i.clock;
read_address = mux2 i.pop (sp -:. 1) sp }
|]
in
let data_out = stack_ram.(0) in
(* Stack pointer logic *)
let sp_inc = sp +:. 1 in
let sp_dec = sp -:. 1 in
let sp_next_val =
mux2 i.push sp_inc @@
mux2 i.pop sp_dec sp in
Variable.assign sp_next sp_next_val;
(* Status flags *)
let empty = sp ==:. 0 in
let full = sp ==:. (Stack_config.depth - 1) in
{ O.data_out; empty; full; depth = sp }
end
(* Main processor core *)
module Processor = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
enable : 'a;
(* Instruction memory interface *)
inst_data : 'a [@bits 32];
inst_ready : 'a;
(* Data memory interface *)
mem_read_data : 'a [@bits 32];
mem_ready : 'a;
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
(* Instruction fetch *)
inst_addr : 'a [@bits 32];
inst_req : 'a;
(* Data memory *)
mem_addr : 'a [@bits 32];
mem_write_data : 'a [@bits 32];
mem_write_enable : 'a;
mem_read_enable : 'a;
(* Status *)
halted : 'a;
error : 'a;
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Program counter *)
let pc_next = Variable.wire ~default:(zero 32) in
let pc = reg spec pc_next.value in
(* Current instruction *)
let current_inst = reg spec ~enable:i.inst_ready i.inst_data in
let opcode = select current_inst ~high:7 ~low:0 in
(* Operand stack *)
let stack_push = Variable.wire ~default:gnd in
let stack_pop = Variable.wire ~default:gnd in
let stack_data_in = Variable.wire ~default:(zero Stack_config.width) in
let stack = Stack.create scope {
Stack.I.clock = i.clock;
clear = i.clear;
push = stack_push.value;
pop = stack_pop.value;
data_in = stack_data_in.value;
} in
(* Instruction decode *)
let decode_result = Instruction_decode.create scope {
Instruction_decode.I.opcode;
instruction = current_inst;
} in
(* Execution units *)
let alu_result = Alu.create scope {
Alu.I.operation = decode_result.alu_op;
operand_a = Wasm_value.unpack stack.data_out;
operand_b = Wasm_value.unpack stack.data_out; (* This needs proper dual-port stack *)
enable = decode_result.alu_enable;
} in
(* State machine for instruction execution *)
let module State = struct
type t =
| Fetch
| Decode
| Execute
| Memory_wait
| Stack_op
[@@deriving sexp_of, enumerate, compare]
end in
let state = State_machine.create (module State) spec ~enable:i.enable in
(* Control logic *)
let fetch_next_inst = Variable.wire ~default:gnd in
let execute_alu = Variable.wire ~default:gnd in
let memory_op = Variable.wire ~default:gnd in
always [
switch (state.current) [
State.Fetch, [
when_ i.inst_ready [
state.set_next State.Decode;
fetch_next_inst <-- vdd;
];
];
State.Decode, [
state.set_next State.Execute;
];
State.Execute, [
switch opcode [
(* Arithmetic operations *)
Bits.of_int ~width:8 0x6A, [ (* i32.add *)
execute_alu <-- vdd;
stack_pop <-- vdd;
stack_push <-- vdd;
stack_data_in <-- Wasm_value.pack alu_result.result;
state.set_next State.Fetch;
];
(* Constants *)
Bits.of_int ~width:8 0x41, [ (* i32.const *)
let const_val = select current_inst ~high:31 ~low:8 in
let wasm_val = Wasm_value.create
~data:(uresize const_val 64)
~typ:(Value_type.to_bits I32)
~valid:vdd in
stack_push <-- vdd;
stack_data_in <-- Wasm_value.pack wasm_val;
state.set_next State.Fetch;
];
(* Memory operations *)
Bits.of_int ~width:8 0x28, [ (* i32.load *)
memory_op <-- vdd;
state.set_next State.Memory_wait;
];
(* Control flow *)
Bits.of_int ~width:8 0x0F, [ (* return *)
(* Return logic - simplified *)
state.set_next State.Fetch;
];
(* Default case *)
(default, [
(* Unknown instruction - error *)
]);
];
];
State.Memory_wait, [
when_ i.mem_ready [
stack_push <-- vdd;
let mem_val = Wasm_value.create
~data:(uresize i.mem_read_data 64)
~typ:(Value_type.to_bits I32)
~valid:vdd in
stack_data_in <-- Wasm_value.pack mem_val;
state.set_next State.Fetch;
];
];
];
];
(* Update PC *)
Variable.assign pc_next (mux2 fetch_next_inst (pc +:. 4) pc);
(* Memory interface *)
let stack_top = Wasm_value.unpack stack.data_out in
let mem_addr = resize_unsigned stack_top.data 32 in
let mem_write_data = resize_unsigned stack_top.data 32 in
{ O.
inst_addr = pc;
inst_req = state.current ==: State.Fetch;
mem_addr;
mem_write_data;
mem_write_enable = memory_op.value &: decode_result.is_store;
mem_read_enable = memory_op.value &: decode_result.is_load;
halted = gnd; (* TODO: Implement halt logic *)
error = gnd; (* TODO: Implement error detection *)
}
end
Instruction Decode Unit
The instruction decoder translates WebAssembly opcodes into control signals:
(* instruction_decode.ml *)
module Instruction_decode = struct
module I = struct
type 'a t = {
opcode : 'a [@bits 8];
instruction : 'a [@bits 32];
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
(* ALU control *)
alu_op : 'a [@bits 4];
alu_enable : 'a;
(* Memory control *)
is_load : 'a;
is_store : 'a;
mem_size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word *)
(* Stack control *)
stack_push : 'a;
stack_pop : 'a;
pop_count : 'a [@bits 3]; (* How many values to pop *)
push_count : 'a [@bits 3]; (* How many values to push *)
(* Control flow *)
is_branch : 'a;
is_call : 'a;
is_return : 'a;
(* Local variable access *)
is_local_get : 'a;
is_local_set : 'a;
local_index : 'a [@bits 16];
(* Immediate values *)
has_immediate : 'a;
immediate : 'a [@bits 32];
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
(* Decode table - this could be implemented as a ROM *)
let decode_table opcode =
let zero_output = {
O.alu_op = zero 4;
alu_enable = gnd;
is_load = gnd;
is_store = gnd;
mem_size = zero 2;
stack_push = gnd;
stack_pop = gnd;
pop_count = zero 3;
push_count = zero 3;
is_branch = gnd;
is_call = gnd;
is_return = gnd;
is_local_get = gnd;
is_local_set = gnd;
local_index = zero 16;
has_immediate = gnd;
immediate = zero 32;
} in
mux (uresize opcode 8) [
(* 0x00: unreachable *)
{ zero_output with (* trap *) };
(* 0x01: nop *)
zero_output;
(* ... skip to interesting opcodes ... *)
(* 0x20: local.get *)
{ zero_output with
is_local_get = vdd;
stack_push = vdd;
push_count = Bits.of_int ~width:3 1 |> of_bit_string;
has_immediate = vdd;
immediate = select i.instruction ~high:31 ~low:8; (* local index *)
};
(* 0x21: local.set *)
{ zero_output with
is_local_set = vdd;
stack_pop = vdd;
pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
has_immediate = vdd;
immediate = select i.instruction ~high:31 ~low:8;
};
(* 0x28: i32.load *)
{ zero_output with
is_load = vdd;
mem_size = Bits.of_int ~width:2 2 |> of_bit_string; (* word *)
stack_pop = vdd;
stack_push = vdd;
pop_count = Bits.of_int ~width:3 1 |> of_bit_string;
push_count = Bits.of_int ~width:3 1 |> of_bit_string;
};
(* 0x36: i32.store *)
{ zero_output with
is_store = vdd;
mem_size = Bits.of_int ~width:2 2 |> of_bit_string;
stack_pop = vdd;
pop_count = Bits.of_int ~width:3 2 |> of_bit_string; (* address + value *)
};
(* 0x41: i32.const *)
{ zero_output with
stack_push = vdd;
push_count = Bits.of_int ~width:3 1 |> of_bit_string;
has_immediate = vdd;
immediate = select i.instruction ~high:31 ~low:8; (* constant value *)
};
(* 0x6A: i32.add *)
{ zero_output with
alu_op = Bits.of_int ~width:4 0 |> of_bit_string; (* ADD *)
alu_enable = vdd;
stack_pop = vdd;
stack_push = vdd;
pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
push_count = Bits.of_int ~width:3 1 |> of_bit_string;
};
(* 0x6B: i32.sub *)
{ zero_output with
alu_op = Bits.of_int ~width:4 1 |> of_bit_string; (* SUB *)
alu_enable = vdd;
stack_pop = vdd;
stack_push = vdd;
pop_count = Bits.of_int ~width:3 2 |> of_bit_string;
push_count = Bits.of_int ~width:3 1 |> of_bit_string;
};
] ~default:zero_output
in
decode_table i.opcode
end
ALU Implementation
The arithmetic logic unit handles all computational operations:
(* alu.ml *)
module Alu = struct
module I = struct
type 'a t = {
operation : 'a [@bits 4];
operand_a : Wasm_value.t;
operand_b : Wasm_value.t;
enable : 'a;
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
result : Wasm_value.t;
overflow : 'a;
divide_by_zero : 'a;
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
(* ALU operations *)
let module Op = struct
let add = of_bit_string (Bits.of_int ~width:4 0)
let sub = of_bit_string (Bits.of_int ~width:4 1)
let mul = of_bit_string (Bits.of_int ~width:4 2)
let div_s = of_bit_string (Bits.of_int ~width:4 3)
let div_u = of_bit_string (Bits.of_int ~width:4 4)
let rem_s = of_bit_string (Bits.of_int ~width:4 5)
let rem_u = of_bit_string (Bits.of_int ~width:4 6)
let and_ = of_bit_string (Bits.of_int ~width:4 7)
let or_ = of_bit_string (Bits.of_int ~width:4 8)
let xor = of_bit_string (Bits.of_int ~width:4 9)
let shl = of_bit_string (Bits.of_int ~width:4 10)
let shr_s = of_bit_string (Bits.of_int ~width:4 11)
let shr_u = of_bit_string (Bits.of_int ~width:4 12)
let eq = of_bit_string (Bits.of_int ~width:4 13)
let ne = of_bit_string (Bits.of_int ~width:4 14)
let lt_s = of_bit_string (Bits.of_int ~width:4 15)
end in
(* Extract operands based on type *)
let a_i32 = select i.operand_a.data ~high:31 ~low:0 in
let b_i32 = select i.operand_b.data ~high:31 ~low:0 in
let a_i64 = i.operand_a.data in
let b_i64 = i.operand_b.data in
(* Arithmetic operations *)
let add_result_i32 = a_i32 +: b_i32 in
let sub_result_i32 = a_i32 -: b_i32 in
let mul_result_i32 = a_i32 *: b_i32 in
(* Division requires special handling for divide-by-zero *)
let div_by_zero_i32 = b_i32 ==:. 0 in
let div_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /+ b_i32) in
let div_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 /: b_i32) in
(* Remainder operations *)
let rem_s_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %+ b_i32) in
let rem_u_result_i32 = mux2 div_by_zero_i32 (zero 32) (a_i32 %: b_i32) in
(* Bitwise operations *)
let and_result_i32 = a_i32 &: b_i32 in
let or_result_i32 = a_i32 |: b_i32 in
let xor_result_i32 = a_i32 ^: b_i32 in
(* Shift operations (b is shift amount, only lower 5 bits used for i32) *)
let shift_amount = select b_i32 ~high:4 ~low:0 in
let shl_result_i32 = a_i32 <<: (uresize shift_amount 6) in
let shr_s_result_i32 = a_i32 >>+ (uresize shift_amount 6) in
let shr_u_result_i32 = a_i32 >>: (uresize shift_amount 6) in
(* Comparison operations (return 1 for true, 0 for false) *)
let eq_result_i32 = mux2 (a_i32 ==: b_i32) (one 32) (zero 32) in
let ne_result_i32 = mux2 (a_i32 <>: b_i32) (one 32) (zero 32) in
let lt_s_result_i32 = mux2 (a_i32 <+ b_i32) (one 32) (zero 32) in
(* Similar operations for i64 *)
let add_result_i64 = a_i64 +: b_i64 in
let sub_result_i64 = a_i64 -: b_i64 in
(* ... more i64 operations ... *)
(* Result selection based on operation and operand types *)
let result_data =
let i32_result = mux (uresize i.operation 4) [
add_result_i32; (* ADD *)
sub_result_i32; (* SUB *)
mul_result_i32; (* MUL *)
div_s_result_i32; (* DIV_S *)
div_u_result_i32; (* DIV_U *)
rem_s_result_i32; (* REM_S *)
rem_u_result_i32; (* REM_U *)
and_result_i32; (* AND *)
or_result_i32; (* OR *)
xor_result_i32; (* XOR *)
shl_result_i32; (* SHL *)
shr_s_result_i32; (* SHR_S *)
shr_u_result_i32; (* SHR_U *)
eq_result_i32; (* EQ *)
ne_result_i32; (* NE *)
lt_s_result_i32; (* LT_S *)
] in
(* For now, assume all operations are on i32 *)
(* In a full implementation, we'd switch on operand type *)
uresize i32_result 64
in
let result_type = i.operand_a.typ in (* Result type same as operands for most ops *)
let result_valid = i.enable &: i.operand_a.valid &: i.operand_b.valid in
let result = Wasm_value.create
~data:result_data
~typ:result_type
~valid:result_valid in
(* Overflow detection *)
let overflow =
(* Simple overflow detection for addition/subtraction *)
let add_overflow =
let sign_a = msb a_i32 in
let sign_b = msb b_i32 in
let sign_result = msb add_result_i32 in
(sign_a &: sign_b &: ~:sign_result) |: (~:sign_a &: ~:sign_b &: sign_result)
in
let sub_overflow =
let sign_a = msb a_i32 in
let sign_b = msb b_i32 in
let sign_result = msb sub_result_i32 in
(sign_a &: ~:sign_b &: ~:sign_result) |: (~:sign_a &: sign_b &: sign_result)
in
mux2 (i.operation ==: Op.add) add_overflow @@
mux2 (i.operation ==: Op.sub) sub_overflow gnd
in
{ O.result; overflow; divide_by_zero = div_by_zero_i32 }
end
Local Variable Storage
WebAssembly functions have local variables that need fast access. We'll implement this with a dedicated memory:
(* locals.ml *)
module Local_storage = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
(* Read port *)
read_addr : 'a [@bits 16];
read_enable : 'a;
(* Write port *)
write_addr : 'a [@bits 16];
write_data : 'a [@bits Wasm_value.width];
write_enable : 'a;
(* Function call setup *)
func_entry : 'a;
param_count : 'a [@bits 8];
local_count : 'a [@bits 8];
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
read_data : 'a [@bits Wasm_value.width];
read_valid : 'a;
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Local variable memory - dual port RAM *)
let local_ram = Ram.create
~collision_mode:Read_before_write
~size:65536 (* 64K locals max *)
~write_ports:[|
{ write_clock = i.clock;
write_address = i.write_addr;
write_enable = i.write_enable;
write_data = i.write_data }
|]
~read_ports:[|
{ read_clock = i.clock;
read_address = i.read_addr }
|]
in
let read_data = local_ram.(0) in
let read_valid = reg spec i.read_enable in
(* TODO: Add function frame management for locals *)
{ O.read_data; read_valid }
end
Memory Subsystem
The linear memory interface handles load and store operations:
(* memory_interface.ml *)
module Memory_interface = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
(* Processor interface *)
addr : 'a [@bits 32];
write_data : 'a [@bits 32];
read_enable : 'a;
write_enable : 'a;
size : 'a [@bits 2]; (* 0=byte, 1=half, 2=word, 3=double *)
(* External memory interface *)
ext_read_data : 'a [@bits 32];
ext_ready : 'a;
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
read_data : 'a [@bits 64];
ready : 'a;
(* External memory interface *)
ext_addr : 'a [@bits 32];
ext_write_data : 'a [@bits 32];
ext_read_enable : 'a;
ext_write_enable : 'a;
ext_byte_enable : 'a [@bits 4];
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Address alignment and bounds checking *)
let aligned_addr = i.addr &: (~~:. 0x3) in (* Word align *)
let addr_valid =
(* Check for valid alignment based on access size *)
mux i.size [
vdd; (* byte - no alignment needed *)
~:(bit i.addr 0); (* half - must be 2-byte aligned *)
~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x3))) |> reduce ~f:(|:); (* word - 4-byte aligned *)
~:(i.addr &: (of_bit_string (Bits.of_int ~width:32 0x7))) |> reduce ~f:(|:); (* double - 8-byte aligned *)
]
in
(* Byte enable generation *)
let byte_enable =
mux i.size [
of_bit_string (Bits.of_int ~width:4 0x1); (* byte *)
of_bit_string (Bits.of_int ~width:4 0x3); (* half *)
of_bit_string (Bits.of_int ~width:4 0xF); (* word *)
of_bit_string (Bits.of_int ~width:4 0xF); (* double - handled as two word accesses *)
]
in
(* State machine for memory operations *)
let module Mem_state = struct
type t = Idle | Read | Write | Wait [@@deriving sexp_of, enumerate]
end in
let mem_state = State_machine.create (module Mem_state) spec in
(* Memory operation logic *)
let start_read = i.read_enable &: addr_valid in
let start_write = i.write_enable &: addr_valid in
always [
switch (mem_state.current) [
Mem_state.Idle, [
when_ start_read [ mem_state.set_next Mem_state.Read ];
when_ start_write [ mem_state.set_next Mem_state.Write ];
];
Mem_state.Read, [
when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
];
Mem_state.Write, [
when_ i.ext_ready [ mem_state.set_next Mem_state.Idle ];
];
];
];
(* Output logic *)
let ext_read_enable = mem_state.current ==: Mem_state.Read in
let ext_write_enable = mem_state.current ==: Mem_state.Write in
let read_data =
(* Sign/zero extend based on access size *)
mux i.size [
sresize i.ext_read_data 64; (* byte - sign extend *)
sresize i.ext_read_data 64; (* half - sign extend *)
uresize i.ext_read_data 64; (* word - zero extend *)
uresize i.ext_read_data 64; (* double - TODO: implement proper 64-bit reads *)
]
in
{ O.
read_data;
ready = mem_state.current ==: Mem_state.Idle;
ext_addr = aligned_addr;
ext_write_data = i.write_data;
ext_read_enable;
ext_write_enable;
ext_byte_enable = byte_enable;
}
end
Control Flow and Function Calls
WebAssembly's structured control flow requires careful hardware implementation:
(* control_flow.ml *)
module Control_flow = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
(* Branch control *)
branch_enable : 'a;
branch_target : 'a [@bits 32];
branch_condition : 'a;
is_conditional : 'a;
(* Call/return *)
call_enable : 'a;
call_target : 'a [@bits 32];
return_enable : 'a;
(* Block structure *)
block_enter : 'a;
block_exit : 'a;
loop_enter : 'a;
(* Current PC *)
current_pc : 'a [@bits 32];
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
next_pc : 'a [@bits 32];
pc_update : 'a;
(* Stack management *)
call_stack_push : 'a;
call_stack_pop : 'a;
call_stack_data : 'a [@bits 32];
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Call stack for return addresses *)
let call_stack_sp_next = Variable.wire ~default:(zero 8) in
let call_stack_sp = reg spec call_stack_sp_next.value in
let call_stack = Ram.create
~collision_mode:Read_before_write
~size:256 (* 256 call levels *)
~write_ports:[|
{ write_clock = i.clock;
write_address = call_stack_sp;
write_enable = i.call_enable;
write_data = i.current_pc +:. 4 } (* Return address *)
|]
~read_ports:[|
{ read_clock = i.clock;
read_address = call_stack_sp -:. 1 }
|]
in
let return_addr = call_stack.(0) in
(* Control flow decision *)
let next_pc =
(* Priority: call > return > branch > sequential *)
mux2 i.call_enable i.call_target @@
mux2 i.return_enable return_addr @@
mux2 (i.branch_enable &: (i.branch_condition |: ~:i.is_conditional)) i.branch_target @@
(i.current_pc +:. 4) (* Sequential *)
in
let pc_update =
i.call_enable |: i.return_enable |:
(i.branch_enable &: (i.branch_condition |: ~:i.is_conditional))
in
(* Update call stack pointer *)
let sp_next =
mux2 i.call_enable (call_stack_sp +:. 1) @@
mux2 i.return_enable (call_stack_sp -:. 1) @@
call_stack_sp
in
Variable.assign call_stack_sp_next sp_next;
{ O.
next_pc;
pc_update;
call_stack_push = i.call_enable;
call_stack_pop = i.return_enable;
call_stack_data = i.current_pc +:. 4;
}
end
Top-Level Integration
Finally, let's integrate everything into a complete WebAssembly processor:
(* wasm_processor.ml *)
module Wasm_processor = struct
module I = struct
type 'a t = {
clock : 'a;
clear : 'a;
enable : 'a;
(* Instruction memory *)
inst_data : 'a [@bits 32];
inst_ready : 'a;
(* Data memory *)
mem_read_data : 'a [@bits 32];
mem_ready : 'a;
} [@@deriving sexp_of, hardcaml]
end
module O = struct
type 'a t = {
(* Instruction fetch *)
inst_addr : 'a [@bits 32];
inst_req : 'a;
(* Data memory *)
mem_addr : 'a [@bits 32];
mem_write_data : 'a [@bits 32];
mem_write_enable : 'a;
mem_read_enable : 'a;
mem_byte_enable : 'a [@bits 4];
(* Status *)
halted : 'a;
error : 'a;
(* Debug *)
pc : 'a [@bits 32];
stack_depth : 'a [@bits 8];
} [@@deriving sexp_of, hardcaml]
end
let create scope (i : _ I.t) =
let open Signal in
let spec = Reg_spec.create ~clock:i.clock ~clear:i.clear () in
(* Create all the submodules *)
let processor = Processor.create scope {
Processor.I.
clock = i.clock;
clear = i.clear;
enable = i.enable;
inst_data = i.inst_data;
inst_ready = i.inst_ready;
mem_read_data = i.mem_read_data;
mem_ready = i.mem_ready;
} in
{ O.
inst_addr = processor.inst_addr;
inst_req = processor.inst_req;
mem_addr = processor.mem_addr;
mem_write_data = processor.mem_write_data;
mem_write_enable = processor.mem_write_enable;
mem_read_enable = processor.mem_read_enable;
mem_byte_enable = ones 4; (* TODO: Connect proper byte enables *)
halted = processor.halted;
error = processor.error;
pc = processor.inst_addr;
stack_depth = zero 8; (* TODO: Connect actual stack depth *)
}
end
Synthesis and Testing
To synthesize this design for the Arty A7 FPGA, you would:
Generate Verilog: Use HardCaml's Verilog backend to produce synthesizable RTL
Create constraints: Define pin assignments and timing constraints for the Arty A7
Add memory controllers: Interface with the DDR3 RAM and other peripherals
Build bootloader: Create firmware to load WebAssembly modules
Implement debugging: Add UART interface for debugging and module loading
Here's a basic testbench structure:
(* test_wasm_processor.ml *)
open Base
open Hardcaml
open Hardcaml_waveterm
let create_test_program () =
(* Simple WebAssembly program: adds two constants *)
[|
0x41000005l; (* i32.const 5 *)
0x41000003l; (* i32.const 3 *)
0x6A00000Bl; (* i32.add + end *)
|]
let test_processor () =
let module Sim = Cyclesim.Make(Wasm_processor) in
let test_program = create_test_program () in
let program_size = Array.length test_program in
let sim = Sim.create ~config:Cyclesim.Config.trace_all
(Wasm_processor.create (Scope.create ~auto_label_hierarchical_ports:true ())) in
let inputs = Cyclesim.inputs sim in
let outputs = Cyclesim.outputs sim in
(* Initialize *)
inputs.clear := Bits.vdd;
inputs.enable := Bits.gnd;
Cyclesim.cycle sim;
inputs.clear := Bits.gnd;
inputs.enable := Bits.vdd;
(* Simulate instruction fetch and execution *)
for cycle = 0 to 100 do
(* Provide instruction data when requested *)
if Bits.to_bool !(outputs.inst_req) then (
let pc = Bits.to_int !(outputs.inst_addr) / 4 in
if pc < program_size then
inputs.inst_data := Bits.of_int ~width:32 (Int32.to_int_exn test_program.(pc))
else
inputs.inst_data := Bits.zero 32;
inputs.inst_ready := Bits.vdd;
) else (
inputs.inst_ready := Bits.gnd;
);
(* Handle memory requests *)
inputs.mem_ready := !(outputs.mem_read_enable) |: !(outputs.mem_write_enable);
inputs.mem_read_data := Bits.zero 32;
Cyclesim.cycle sim;
(* Print status *)
Printf.printf "Cycle %d: PC=%s Stack_depth=%s\n"
cycle
(Bits.to_string !(outputs.pc))
(Bits.to_string !(outputs.stack_depth));
done;
(* Generate waveforms *)
let waves = Hardcaml_waveterm.Waveform.create sim in
Hardcaml_waveterm.Waveform.print ~display_width:120 ~display_height:30 waves
This WebAssembly softcore provides several advantages:
Deterministic Performance: Every instruction has a known execution time
Native WebAssembly: No translation overhead
Security: Hardware-enforced sandboxing
Customization: Can add domain-specific instructions
Energy Efficiency: Optimized for the specific workload
The design can be extended with:
Floating-point units for F32/F64 operations
Multi-threading support
Custom instruction extensions
Hardware garbage collection support
Advanced branch prediction
Last updated