diff --git a/extra/remu/src/thread.rs b/extra/remu/src/thread.rs index 1278351165..5f493a1bef 100644 --- a/extra/remu/src/thread.rs +++ b/extra/remu/src/thread.rs @@ -1127,13 +1127,14 @@ impl<'a> Thread<'a> { self.vec_reg.write64(vdst, ret) } } - 306 | 309 | 313 | 596 | 584 | 585 | 588 => { + 306 | 309 | 310 | 313 | 596 | 584 | 585 | 588 => { let (s0, s1, s2) = (self.val(src.0), self.val(src.1), self.val(src.2)); let s0 = f16::from_bits(s0).negate(0, neg).absolute(0, abs); let s1 = f16::from_bits(s1).negate(1, neg).absolute(1, abs); let s2 = f16::from_bits(s2).negate(1, neg).absolute(1, abs); let ret = match op { 309 => s0 * s1, + 310 => f16::mul_add(s0, s1, f16::from_bits(self.vec_reg[vdst] as u16)), 306 => s0 + s1, 584 => f16::mul_add(s0, s1, s2), 585 => f16::min(f16::min(s0, s1), s2), diff --git a/extra/remu/test/hwtest.py b/extra/remu/test/hwtest.py index 88fc6b41dd..52c019787a 100644 --- a/extra/remu/test/hwtest.py +++ b/extra/remu/test/hwtest.py @@ -1,6 +1,6 @@ import numpy as np import unittest -import subprocess +import subprocess, struct from typing import cast from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice from tinygrad import Tensor, dtypes, Device @@ -83,7 +83,7 @@ amdhsa.version: + "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}" return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret)) -def get_output(s:str, n_threads:int): +def get_output(s:str, n_threads:int=1): assert n_threads <= 32 code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s, "s_waitcnt 0", @@ -94,6 +94,8 @@ def get_output(s:str, n_threads:int): prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True) return test.numpy() +def f16_to_bits(x:float) -> int: return struct.unpack('