Files
tinygrad/test/amd/hw/test_vop2.py
George Hotz 359b1582d6 amd: EMU DPP support (#15719)
* EMU DPP support from GPT 5.4

* cleanups

* simple

* nope

* fix
2026-04-14 14:58:41 +08:00

995 lines
37 KiB
Python

"""Tests for VOP2 instructions - two operand vector operations.
Includes: v_add_f32, v_mul_f32, v_and_b32, v_or_b32, v_xor_b32,
v_lshrrev_b32, v_lshlrev_b32, v_fmac_f32, v_fmaak_f32, v_fmamk_f32,
v_add_nc_u32, v_cndmask_b32, v_add_f16, v_mul_f16
"""
import unittest
from test.amd.hw.helpers import *
class TestBasicArithmetic(unittest.TestCase):
"""Tests for basic arithmetic VOP2 instructions."""
def test_v_add_f32(self):
"""V_ADD_F32 adds two floats."""
instructions = [
v_mov_b32_e32(v[0], 1.0),
v_mov_b32_e32(v[1], 2.0),
v_add_f32_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 3.0, places=5)
def test_v_mul_f32(self):
"""V_MUL_F32 multiplies two floats."""
instructions = [
v_mov_b32_e32(v[0], 2.0),
v_mov_b32_e32(v[1], 4.0),
v_mul_f32_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 8.0, places=5)
def test_v_add_f32_dpp_row_shl(self):
"""V_ADD_F32 DPP row_shl swizzles src0 before the add."""
instructions = [
v_cvt_f32_u32_e32(v[0], v[255]),
v_add_f32_e32(v[1], DPP, v[0], vsrc0=v[0], dpp=0x101, row_mask=0xf, bank_mask=0xf, bc=1),
]
st = run_program(instructions, n_lanes=16)
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 1.0, places=5)
self.assertAlmostEqual(i2f(st.vgpr[1][1]), 3.0, places=5)
self.assertAlmostEqual(i2f(st.vgpr[14][1]), 29.0, places=5)
def test_v_fmac_f32(self):
"""V_FMAC_F32: d = d + a*b using inline constants."""
instructions = [
v_mov_b32_e32(v[0], 2.0),
v_mov_b32_e32(v[1], 4.0),
v_mov_b32_e32(v[2], 1.0),
v_fmac_f32_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
def test_v_fmaak_f32(self):
"""V_FMAAK_F32: d = a * b + K using inline constants."""
instructions = [
v_mov_b32_e32(v[0], 2.0),
v_mov_b32_e32(v[1], 4.0),
v_fmaak_f32_e32(v[2], v[0], v[1], literal=0x3f800000),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
def test_v_fmamk_f32_basic(self):
"""V_FMAMK_F32: d = a * K + b."""
instructions = [
v_mov_b32_e32(v[0], 2.0),
v_mov_b32_e32(v[1], 1.0),
v_fmamk_f32_e32(v[2], v[0], v[1], literal=0x40800000),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 9.0, places=5)
def test_v_fmamk_f32_small_constant(self):
"""V_FMAMK_F32 with small constant."""
instructions = [
v_mov_b32_e32(v[0], 4.0),
v_mov_b32_e32(v[1], 1.0),
v_fmamk_f32_e32(v[2], v[0], v[1], literal=f2i(0.5)),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][2]), 3.0, places=5)
class TestBitManipulation(unittest.TestCase):
"""Tests for bit manipulation VOP2 instructions."""
def test_v_and_b32(self):
"""V_AND_B32 bitwise and."""
instructions = [
s_mov_b32(s[0], 0xff),
s_mov_b32(s[1], 0x0f),
v_mov_b32_e32(v[0], s[0]),
v_and_b32_e32(v[1], s[1], v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0x0f)
def test_v_and_b32_quadrant(self):
"""V_AND_B32 for quadrant extraction (n & 3)."""
instructions = [
s_mov_b32(s[0], 15915),
v_mov_b32_e32(v[0], s[0]),
v_and_b32_e32(v[1], 3, v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 15915 & 3)
def test_v_lshrrev_b32(self):
"""V_LSHRREV_B32 logical shift right."""
instructions = [
s_mov_b32(s[0], 0xff00),
v_mov_b32_e32(v[0], s[0]),
v_lshrrev_b32_e32(v[1], 8, v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0xff)
def test_v_lshlrev_b32(self):
"""V_LSHLREV_B32 logical shift left."""
instructions = [
s_mov_b32(s[0], 0xff),
v_mov_b32_e32(v[0], s[0]),
v_lshlrev_b32_e32(v[1], 8, v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0xff00)
def test_v_xor_b32(self):
"""V_XOR_B32 bitwise xor (used in sin for sign)."""
instructions = [
s_mov_b32(s[0], 0x80000000),
s_mov_b32(s[1], f2i(1.0)),
v_mov_b32_e32(v[0], s[1]),
v_xor_b32_e32(v[1], s[0], v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][1]), -1.0, places=5)
def test_v_xor_b32_sign_flip(self):
"""V_XOR_B32 for sign flip pattern."""
instructions = [
s_mov_b32(s[0], 0x80000000),
v_mov_b32_e32(v[0], -2.0),
v_xor_b32_e32(v[1], s[0], v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertAlmostEqual(i2f(st.vgpr[0][1]), 2.0, places=5)
class TestSpecialValues(unittest.TestCase):
"""Tests for special float values - inf, nan, zero handling."""
def test_v_mul_f32_zero_times_inf(self):
"""V_MUL_F32: 0 * inf = NaN."""
import math
instructions = [
v_mov_b32_e32(v[0], 0),
s_mov_b32(s[0], 0x7f800000),
v_mov_b32_e32(v[1], s[0]),
v_mul_f32_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
def test_v_add_f32_inf_minus_inf(self):
"""V_ADD_F32: inf + (-inf) = NaN."""
import math
instructions = [
s_mov_b32(s[0], 0x7f800000),
s_mov_b32(s[1], 0xff800000),
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_add_f32_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
self.assertTrue(math.isnan(i2f(st.vgpr[0][2])))
class TestF16Ops(unittest.TestCase):
"""Tests for 16-bit VOP2 operations."""
def test_v_add_f16_basic(self):
"""V_ADD_F16 adds two f16 values."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_add_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4200, f"Expected 0x4200 (f16 3.0), got 0x{result:04x}")
def test_v_add_f16_negative(self):
"""V_ADD_F16 with negative values."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0xc000), # f16 -2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_add_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0xbc00, f"Expected 0xbc00 (f16 -1.0), got 0x{result:04x}")
def test_v_mul_f16_basic(self):
"""V_MUL_F16 multiplies two f16 values."""
instructions = [
s_mov_b32(s[0], 0x4000), # f16 2.0
s_mov_b32(s[1], 0x4200), # f16 3.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_mul_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4600, f"Expected 0x4600 (f16 6.0), got 0x{result:04x}")
def test_v_mul_f16_by_zero(self):
"""V_MUL_F16 by zero."""
instructions = [
s_mov_b32(s[0], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], 0),
v_mul_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x0000, f"Expected 0x0000 (f16 0.0), got 0x{result:04x}")
def test_v_fmac_f16_basic(self):
"""V_FMAC_F16: d = d + a*b."""
instructions = [
s_mov_b32(s[0], 0x4000), # f16 2.0
s_mov_b32(s[1], 0x4200), # f16 3.0
s_mov_b32(s[2], 0x3c00), # f16 1.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]),
v_fmac_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
# 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
self.assertEqual(result, 0x4700, f"Expected 0x4700 (f16 7.0), got 0x{result:04x}")
def test_v_max_f16_basic(self):
"""V_MAX_F16 returns the maximum of two f16 values."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_max_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4000, f"Expected 0x4000 (f16 2.0), got 0x{result:04x}")
def test_v_min_f16_basic(self):
"""V_MIN_F16 returns the minimum of two f16 values."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_min_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x3c00, f"Expected 0x3c00 (f16 1.0), got 0x{result:04x}")
def test_v_fmaak_f16_basic(self):
"""V_FMAAK_F16: d = a * b + K."""
instructions = [
s_mov_b32(s[0], 0x4000), # f16 2.0
s_mov_b32(s[1], 0x4200), # f16 3.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_fmaak_f16_e32(v[2], v[0], v[1], literal=0x3c00), # + f16 1.0
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
# 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
self.assertEqual(result, 0x4700, f"Expected 0x4700 (f16 7.0), got 0x{result:04x}")
class TestHiHalfOps(unittest.TestCase):
"""Tests for VOP2 16-bit operations with hi-half operands."""
def test_v_add_f16_src0_hi_fold(self):
"""V_ADD_F16 with src0 hi-half fold (same register, different halves)."""
instructions = [
s_mov_b32(s[0], 0x40003c00), # lo=f16(1.0), hi=f16(2.0)
v_mov_b32_e32(v[0], s[0]),
VOP3(VOP3Op.V_ADD_F16, vdst=v[1], src0=v[0], src1=v[0], opsel=0b0001),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xffff
self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
def test_v_add_f16_src0_hi_different_reg(self):
"""V_ADD_F16 with src0 hi-half from different register."""
instructions = [
s_mov_b32(s[0], 0x40000000), # hi=f16(2.0), lo=0
s_mov_b32(s[1], 0x00003c00), # hi=0, lo=f16(1.0)
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
VOP3(VOP3Op.V_ADD_F16, vdst=v[2], src0=v[0], src1=v[1], opsel=0b0001),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
def test_v_mul_f16_src0_hi(self):
"""V_MUL_F16 with src0 from high half."""
instructions = [
s_mov_b32(s[0], 0x40000000), # hi=f16(2.0), lo=0
s_mov_b32(s[1], 0x00004200), # hi=0, lo=f16(3.0)
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
VOP3(VOP3Op.V_MUL_F16, vdst=v[2], src0=v[0], src1=v[1], opsel=0b0001),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
def test_v_mul_f16_hi_half(self):
"""V_MUL_F16 reading from high half."""
instructions = [
s_mov_b32(s[0], 0x40003c00), # lo=1.0, hi=2.0
v_mov_b32_e32(v[0], s[0]),
VOP3(VOP3Op.V_MUL_F16, vdst=v[1], src0=v[0], src1=v[0], opsel=0b0011),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xffff
self.assertEqual(result, 0x4400, f"Expected f16(4.0)=0x4400, got 0x{result:04x}")
def test_v_fma_f16_hi_dest(self):
"""V_FMA_F16 writing to high half with opsel.
Uses V_FMA_F16 (not V_FMAC_F16) because it has explicit src2 operand
which makes opsel handling clearer.
"""
instructions = [
s_mov_b32(s[0], 0x3c000000), # hi=f16(1.0), lo=0
s_mov_b32(s[1], 0x4000), # f16(2.0) in lo
s_mov_b32(s[2], 0x4200), # f16(3.0) in lo
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]),
# V_FMA_F16: dst = src0 * src1 + src2
# opsel=0b1100: bit2=src2 hi, bit3=dst hi
# So: v[0].hi = v[1].lo * v[2].lo + v[0].hi = 2.0 * 3.0 + 1.0 = 7.0
VOP3(VOP3Op.V_FMA_F16, vdst=v[0], src0=v[1], src1=v[2], src2=v[0], opsel=0b1100),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][0] >> 16) & 0xffff
# 2.0 * 3.0 + 1.0 = 7.0, f16 7.0 = 0x4700
self.assertEqual(hi, 0x4700, f"Expected f16(7.0)=0x4700 in hi, got 0x{hi:04x}")
def test_v_add_f16_multilane(self):
"""V_ADD_F16 with multiple lanes."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_add_f16_e32(v[2], v[0], v[1]),
]
st = run_program(instructions, n_lanes=4)
for lane in range(4):
result = st.vgpr[lane][2] & 0xffff
self.assertEqual(result, 0x4200, f"Lane {lane}: expected 0x4200, got 0x{result:04x}")
class TestVop2F16HiHalf(unittest.TestCase):
"""Regression tests for VOP2 f16 hi-half operand handling.
These test the bugs where:
1. VOP2 vsrc1 >= 384 (v[128]+) wasn't extracting hi 16 bits
2. VOP2 vdst >= 384 (v[128]+) wasn't preserving lo 16 bits
"""
def test_v_add_f16_e32_vsrc1_hi_half(self):
"""V_ADD_F16_E32 with vsrc1 from hi-half (v[128]+).
When vsrc1 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
of v[vsrc1-128]. The emulator must extract bits [31:16] from the actual VGPR.
Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
"""
instructions = [
# v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
s_mov_b32(s[0], 0x40003c00),
v_mov_b32_e32(v[0], s[0]),
# v_add_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0])
# In VOP2 encoding, vsrc1=384 means v[128], which maps to v[0].hi
# v[1] = v[0].lo + v[0].hi = 1.0 + 2.0 = 3.0
VOP2(VOP2Op.V_ADD_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xffff
# 1.0 + 2.0 = 3.0, f16 3.0 = 0x4200
self.assertEqual(result, 0x4200, f"Expected f16(3.0)=0x4200, got 0x{result:04x}")
def test_v_mul_f16_e32_vsrc1_hi_half(self):
"""V_MUL_F16_E32 with vsrc1 from hi-half.
Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
"""
instructions = [
# v[0] = 0x4200_4000: hi=f16(3.0), lo=f16(2.0)
s_mov_b32(s[0], 0x42004000),
v_mov_b32_e32(v[0], s[0]),
# v_mul_f16_e32 v[1], v[0], v[128] (vsrc1=v[128] reads hi of v[0])
# v[1] = v[0].lo * v[0].hi = 2.0 * 3.0 = 6.0
VOP2(VOP2Op.V_MUL_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xffff
# 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
def test_v_add_f16_e32_vdst_hi_half(self):
"""V_ADD_F16_E32 writing to hi-half destination (v[128]+).
When vdst >= 384 (representing v[128]+), the hardware writes to bits [31:16]
of v[vdst-128] while preserving bits [15:0]. The emulator must merge the result.
Regression test for: VOP2 f16 vdst hi-half write bug.
"""
instructions = [
# v[0] = 0x0000_BEEF: lo has marker value
s_mov_b32(s[0], 0x0000BEEF),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(1.0), v[2] = f16(2.0)
s_mov_b32(s[1], 0x3c00),
s_mov_b32(s[2], 0x4000),
v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]),
# v_add_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0])
# v[0].hi = 1.0 + 2.0 = 3.0, v[0].lo preserved = 0xBEEF
VOP2(VOP2Op.V_ADD_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][0] >> 16) & 0xffff
lo = st.vgpr[0][0] & 0xffff
# hi = 3.0 = 0x4200, lo preserved = 0xBEEF
self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
self.assertEqual(lo, 0xBEEF, f"Expected lo preserved=0xBEEF, got 0x{lo:04x}")
def test_v_mul_f16_e32_vdst_hi_half(self):
"""V_MUL_F16_E32 writing to hi-half destination.
Regression test for: VOP2 f16 vdst hi-half write bug.
"""
instructions = [
# v[0] = 0x0000_DEAD: lo has marker value
s_mov_b32(s[0], 0x0000DEAD),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(2.0), v[2] = f16(4.0)
s_mov_b32(s[1], 0x4000),
s_mov_b32(s[2], 0x4400),
v_mov_b32_e32(v[1], s[1]),
v_mov_b32_e32(v[2], s[2]),
# v_mul_f16_e32 v[128], v[1], v[2] (vdst=v[128] writes hi of v[0])
# v[0].hi = 2.0 * 4.0 = 8.0, v[0].lo preserved = 0xDEAD
VOP2(VOP2Op.V_MUL_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][0] >> 16) & 0xffff
lo = st.vgpr[0][0] & 0xffff
# hi = 8.0 = 0x4800, lo preserved = 0xDEAD
self.assertEqual(hi, 0x4800, f"Expected hi=f16(8.0)=0x4800, got 0x{hi:04x}")
self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")
def test_v_add_f16_e32_both_hi_half(self):
"""V_ADD_F16_E32 with both vsrc1 and vdst as hi-half (different underlying regs).
Tests the combination of both fixes: reading vsrc1 from hi-half AND
writing result to hi-half destination, using different underlying VGPRs.
Regression test for: VOP2 f16 hi-half bugs (combined).
"""
instructions = [
# v[0] = 0x4000_xxxx: hi=f16(2.0) for vsrc1
s_mov_b32(s[0], 0x40000000),
v_mov_b32_e32(v[0], s[0]),
# v[1] = 0x0000_3c00: lo=f16(1.0) for src0
s_mov_b32(s[1], 0x00003c00),
v_mov_b32_e32(v[1], s[1]),
# v[2] = 0x0000_CAFE: lo=marker for vdst preservation
s_mov_b32(s[2], 0x0000CAFE),
v_mov_b32_e32(v[2], s[2]),
# v_add_f16_e32 v[130], v[1], v[128]
# src0 = v[1].lo = 1.0
# vsrc1 = v[128] reads v[0].hi = 2.0
# result = 1.0 + 2.0 = 3.0
# vdst = v[130] writes to v[2].hi, preserving v[2].lo
VOP2(VOP2Op.V_ADD_F16, vdst=v[130], src0=v[1], vsrc1=v[128]),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][2] >> 16) & 0xffff
lo = st.vgpr[0][2] & 0xffff
# hi = 3.0 = 0x4200, lo preserved = 0xCAFE
self.assertEqual(hi, 0x4200, f"Expected hi=f16(3.0)=0x4200, got 0x{hi:04x}")
self.assertEqual(lo, 0xCAFE, f"Expected lo preserved=0xCAFE, got 0x{lo:04x}")
def test_v_fmac_f16_e32_vsrc1_hi_half(self):
"""V_FMAC_F16_E32 with vsrc1 from hi-half.
V_FMAC_F16: vdst = vdst + src0 * vsrc1
Regression test for: VOP2 f16 vsrc1 hi-half extraction bug.
"""
instructions = [
# v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
s_mov_b32(s[0], 0x40003c00),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(3.0) = 0x4200
s_mov_b32(s[1], 0x4200),
v_mov_b32_e32(v[1], s[1]),
# v_fmac_f16_e32 v[1], v[0], v[128]
# vdst = v[1] = 3.0 + v[0].lo * v[0].hi = 3.0 + 1.0 * 2.0 = 5.0
VOP2(VOP2Op.V_FMAC_F16, vdst=v[1], src0=v[0], vsrc1=v[128]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xffff
# 3.0 + 1.0 * 2.0 = 5.0, f16 5.0 = 0x4500
self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}")
def test_v_fmac_f16_e32_vdst_hi_half(self):
"""V_FMAC_F16_E32 writing to hi-half destination.
V_FMAC_F16: vdst.h = vdst.h + src0 * vsrc1
When vdst is v[128]+, the accumulator D0 must also read from the hi-half.
This tests the bug where D0 was read from lo-half instead of hi-half.
Regression test for: VOP2 FMAC hi-half D0 accumulator read bug.
"""
instructions = [
# v[0] = 0x3800_DEAD: hi=f16(0.5), lo=marker (0xDEAD)
s_mov_b32(s[0], 0x3800DEAD),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(2.0) = 0x4000
s_mov_b32(s[1], 0x4000),
v_mov_b32_e32(v[1], s[1]),
# v[2] = f16(3.0) = 0x4200
s_mov_b32(s[2], 0x4200),
v_mov_b32_e32(v[2], s[2]),
# v_fmac_f16_e32 v[128], v[1], v[2]
# vdst = v[128] means v[0].hi
# D0 = v[0].hi = 0.5
# result = D0 + src0 * vsrc1 = 0.5 + 2.0 * 3.0 = 6.5
# v[0].hi = 6.5, v[0].lo preserved = 0xDEAD
VOP2(VOP2Op.V_FMAC_F16, vdst=v[128], src0=v[1], vsrc1=v[2]),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][0] >> 16) & 0xffff
lo = st.vgpr[0][0] & 0xffff
# hi = 6.5 = 0x4680, lo preserved = 0xDEAD
self.assertEqual(hi, 0x4680, f"Expected hi=f16(6.5)=0x4680, got 0x{hi:04x}")
self.assertEqual(lo, 0xDEAD, f"Expected lo preserved=0xDEAD, got 0x{lo:04x}")
def test_v_mul_f16_e32_src0_hi_half(self):
"""V_MUL_F16_E32 with src0 from hi-half (src0 >= v[128]).
When src0 >= 384 (representing v[128]+), the hardware reads from the hi 16 bits
of v[src0-128]. The emulator must extract bits [31:16] from the actual VGPR.
Regression test for: VOP2 f16 src0 hi-half extraction bug.
"""
instructions = [
# v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
s_mov_b32(s[0], 0x40003c00),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(3.0) = 0x4200
s_mov_b32(s[1], 0x4200),
v_mov_b32_e32(v[1], s[1]),
# v_mul_f16_e32 v[2], v[128], v[1]
# src0 = v[128] reads from v[0].hi = 2.0
# result = 2.0 * 3.0 = 6.0
VOP2(VOP2Op.V_MUL_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
# 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
def test_v_add_f16_e32_src0_hi_half(self):
"""V_ADD_F16_E32 with src0 from hi-half (src0 >= v[128]).
Regression test for: VOP2 f16 src0 hi-half extraction bug.
"""
instructions = [
# v[0] = 0x4000_3c00: hi=f16(2.0), lo=f16(1.0)
s_mov_b32(s[0], 0x40003c00),
v_mov_b32_e32(v[0], s[0]),
# v[1] = f16(5.0) = 0x4500
s_mov_b32(s[1], 0x4500),
v_mov_b32_e32(v[1], s[1]),
# v_add_f16_e32 v[2], v[128], v[1]
# src0 = v[128] reads from v[0].hi = 2.0
# result = 2.0 + 5.0 = 7.0
VOP2(VOP2Op.V_ADD_F16, vdst=v[2], src0=v[128], vsrc1=v[1]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
# 2.0 + 5.0 = 7.0, f16 7.0 = 0x4700
self.assertEqual(result, 0x4700, f"Expected f16(7.0)=0x4700, got 0x{result:04x}")
class TestF16InlineConstants(unittest.TestCase):
"""Regression tests for VOP2 F16 inline float constants.
For 16-bit VOP2 operations (v_add_f16, v_mul_f16, etc.), inline float constants
like 1.0, 2.0 must use F16 encoding (0x3c00, 0x4000) not F32 encoding (0x3f800000).
The emulator's rsrc() function needs bits=16 to select F16_INLINE constants.
Regression test for: VOP2 16-bit inline constant using F32 instead of F16.
"""
def test_v_add_f16_inline_constant_1_0(self):
"""V_ADD_F16_E32 with inline constant 1.0 should use F16 encoding."""
instructions = [
s_mov_b32(s[0], 0x3c00), # f16 1.0
v_mov_b32_e32(v[0], s[0]),
# v_add_f16_e32 v[1], 1.0, v[0] -- 1.0 must be F16 0x3c00, not F32 0x3f800000
v_add_f16_e32(v[1], 1.0, v[0]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xFFFF
# 1.0 + 1.0 = 2.0, f16 2.0 = 0x4000
self.assertEqual(result, 0x4000, f"Expected f16(2.0)=0x4000, got 0x{result:04x}")
def test_v_add_f16_inline_constant_2_0(self):
"""V_ADD_F16_E32 with inline constant 2.0."""
instructions = [
s_mov_b32(s[0], 0x4200), # f16 3.0
v_mov_b32_e32(v[0], s[0]),
v_add_f16_e32(v[1], 2.0, v[0]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xFFFF
# 2.0 + 3.0 = 5.0, f16 5.0 = 0x4500
self.assertEqual(result, 0x4500, f"Expected f16(5.0)=0x4500, got 0x{result:04x}")
def test_v_mul_f16_inline_constant(self):
"""V_MUL_F16_E32 with inline constant 2.0."""
instructions = [
s_mov_b32(s[0], 0x4200), # f16 3.0
v_mov_b32_e32(v[0], s[0]),
v_mul_f16_e32(v[1], 2.0, v[0]),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][1] & 0xFFFF
# 2.0 * 3.0 = 6.0, f16 6.0 = 0x4600
self.assertEqual(result, 0x4600, f"Expected f16(6.0)=0x4600, got 0x{result:04x}")
class TestCndmask(unittest.TestCase):
"""Tests for V_CNDMASK_B32 and V_CNDMASK_B16."""
def test_v_cndmask_b16_select_src0(self):
"""V_CNDMASK_B16 selects src0 when VCC bit is 0."""
instructions = [
s_mov_b32(VCC_LO, 0), # VCC = 0
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_cndmask_b16(v[2], v[0], v[1], VCC),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x3c00, f"Expected src0=0x3c00, got 0x{result:04x}")
def test_v_cndmask_b16_select_src1(self):
"""V_CNDMASK_B16 selects src1 when VCC bit is 1."""
instructions = [
s_mov_b32(VCC_LO, 1), # VCC = 1
s_mov_b32(s[0], 0x3c00), # f16 1.0
s_mov_b32(s[1], 0x4000), # f16 2.0
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_cndmask_b16(v[2], v[0], v[1], VCC),
]
st = run_program(instructions, n_lanes=1)
result = st.vgpr[0][2] & 0xffff
self.assertEqual(result, 0x4000, f"Expected src1=0x4000, got 0x{result:04x}")
def test_v_cndmask_b16_write_hi(self):
"""V_CNDMASK_B16 can write to high 16 bits with opsel."""
instructions = [
s_mov_b32(s[0], 0x3c003800), # src0: hi=1.0, lo=0.5
v_mov_b32_e32(v[0], s[0]),
s_mov_b32(s[1], 0x4000c000), # src1: hi=2.0, lo=-2.0
v_mov_b32_e32(v[1], s[1]),
s_mov_b32(s[2], 0xDEAD0000), # v2 initial: hi=0xDEAD, lo=0
v_mov_b32_e32(v[2], s[2]),
s_mov_b32(VCC_LO, 0), # vcc = 0, select src0
# opsel=0b1011: bit0=src0 hi, bit1=src1 hi, bit3=dst hi
VOP3(VOP3Op.V_CNDMASK_B16, vdst=v[2], src0=v[0], src1=v[1], src2=SrcEnum.VCC_LO, opsel=0b1011),
]
st = run_program(instructions, n_lanes=1)
hi = (st.vgpr[0][2] >> 16) & 0xffff
lo = st.vgpr[0][2] & 0xffff
# vcc=0 selects src0.h = 1.0 = 0x3c00, writes to hi
self.assertEqual(hi, 0x3c00, f"Expected hi=0x3c00 (1.0), got 0x{hi:04x}")
self.assertEqual(lo, 0x0000, f"Expected lo preserved as 0, got 0x{lo:04x}")
class TestSpecialFloatValues(unittest.TestCase):
"""Tests for special float value handling in VOP2 instructions."""
def test_neg_zero_add(self):
"""-0.0 + 0.0 = +0.0 (IEEE 754)."""
neg_zero = 0x80000000
instructions = [
s_mov_b32(s[0], neg_zero),
v_mov_b32_e32(v[0], s[0]),
v_add_f32_e32(v[1], 0.0, v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0x00000000, "Should be +0.0")
def test_neg_zero_mul(self):
"""-0.0 * -1.0 = +0.0."""
neg_zero = 0x80000000
instructions = [
s_mov_b32(s[0], neg_zero),
v_mov_b32_e32(v[0], s[0]),
v_mul_f32_e32(v[1], -1.0, v[0]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0x00000000, "Should be +0.0")
def test_inf_minus_inf(self):
"""+inf - inf = NaN."""
import math
pos_inf = 0x7f800000
neg_inf = 0xff800000
instructions = [
s_mov_b32(s[0], pos_inf),
s_mov_b32(s[1], neg_inf),
v_mov_b32_e32(v[0], s[0]),
v_mov_b32_e32(v[1], s[1]),
v_sub_f32_e32(v[2], v[0], v[1]), # inf - (-inf) = inf
v_add_f32_e32(v[3], v[0], v[1]), # inf + (-inf) = NaN
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], pos_inf, "inf - (-inf) = inf")
self.assertTrue(math.isnan(i2f(st.vgpr[0][3])), "inf + (-inf) = NaN")
def test_denormal_f32_mul_ftz(self):
"""Denormal * normal - RDNA3 flushes denormals to zero (FTZ mode)."""
smallest_denorm = 0x00000001 # Smallest positive denormal
instructions = [
s_mov_b32(s[0], smallest_denorm),
v_mov_b32_e32(v[0], s[0]),
v_mul_f32_e32(v[1], 2.0, v[0]), # Denormal input gets flushed to 0
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][1], 0x00000000)
class TestCarryOps(unittest.TestCase):
"""Tests for VOP2 carry instructions (v_add_co_ci_u32, v_sub_co_ci_u32, v_subrev_co_ci_u32)."""
def test_v_subrev_co_ci_u32_no_borrow(self):
"""V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=0."""
instructions = [
s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in)
v_mov_b32_e32(v[0], 5), # S0 = 5
v_mov_b32_e32(v[1], 10), # S1 = 10
v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 5)
self.assertEqual(st.vcc, 0) # No borrow out
def test_v_subrev_co_ci_u32_with_borrow(self):
"""V_SUBREV_CO_CI_U32: D0 = S1 - S0 - VCC_IN, when VCC_IN=1."""
instructions = [
s_mov_b32(VCC_LO, 1), # VCC = 1 (borrow in)
v_mov_b32_e32(v[0], 5), # S0 = 5
v_mov_b32_e32(v[1], 10), # S1 = 10
v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 1 = 4
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 4)
self.assertEqual(st.vcc, 0) # No borrow out
def test_v_subrev_co_ci_u32_generates_borrow(self):
"""V_SUBREV_CO_CI_U32: generates borrow when S0 + VCC_IN > S1."""
instructions = [
s_mov_b32(VCC_LO, 0), # VCC = 0
v_mov_b32_e32(v[0], 10), # S0 = 10
v_mov_b32_e32(v[1], 5), # S1 = 5
v_subrev_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 - 10 - 0 = -5 (underflow)
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0xFFFFFFFB) # -5 as unsigned
self.assertEqual(st.vcc, 1) # Borrow out
def test_v_add_co_ci_u32_no_carry(self):
"""V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=0."""
instructions = [
s_mov_b32(VCC_LO, 0), # VCC = 0 (no carry in)
v_mov_b32_e32(v[0], 5), # S0 = 5
v_mov_b32_e32(v[1], 10), # S1 = 10
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 0 = 15
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 15)
self.assertEqual(st.vcc, 0) # No carry out
def test_v_add_co_ci_u32_with_carry(self):
"""V_ADD_CO_CI_U32: D0 = S0 + S1 + VCC_IN, when VCC_IN=1."""
instructions = [
s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in)
v_mov_b32_e32(v[0], 5), # S0 = 5
v_mov_b32_e32(v[1], 10), # S1 = 10
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 5 + 10 + 1 = 16
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 16)
self.assertEqual(st.vcc, 0) # No carry out
def test_v_add_co_ci_u32_generates_carry(self):
"""V_ADD_CO_CI_U32: generates carry when overflow occurs."""
instructions = [
s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in)
s_mov_b32(s[0], 0xFFFFFFFF), # max u32
v_mov_b32_e32(v[0], s[0]), # S0 = 0xFFFFFFFF
v_mov_b32_e32(v[1], 0), # S1 = 0
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 0xFFFFFFFF + 0 + 1 = 0 (overflow)
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 0) # Overflowed to 0
self.assertEqual(st.vcc, 1) # Carry out
def test_v_add_co_ci_u32_clears_carry(self):
"""V_ADD_CO_CI_U32: VCC must be updated even when no carry is generated.
This tests the case where VCC=1 going in (carry-in consumed) but the addition
does not overflow, so VCC must be cleared to 0.
Regression test for: VCC not being written by v_add_co_ci_u32_e32.
"""
instructions = [
s_mov_b32(VCC_LO, 1), # VCC = 1 (carry in)
v_mov_b32_e32(v[0], 1), # S0 = 1
v_mov_b32_e32(v[1], 1), # S1 = 1
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 1 + 1 + 1 = 3 (no overflow)
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 3) # 1 + 1 + 1 = 3
self.assertEqual(st.vcc, 0) # No carry out - VCC must be cleared
def test_v_add_co_ci_u32_multilane_clears_vcc(self):
"""V_ADD_CO_CI_U32 with multiple lanes: VCC bits must be updated per-lane.
When VCC has multiple bits set (one per active lane), and the addition doesn't
overflow for any lane, all VCC bits must be cleared.
Regression test for: VCC not being written by v_add_co_ci_u32_e32 in multi-lane case.
"""
instructions = [
s_mov_b32(VCC_LO, 0b11), # VCC = 0b11 (lanes 0,1 have carry-in)
v_mov_b32_e32(v[0], 1), # S0 = 1 for all lanes
v_mov_b32_e32(v[1], 1), # S1 = 1 for all lanes
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 1 + 1 + 1 = 3 (no overflow)
]
st = run_program(instructions, n_lanes=2)
self.assertEqual(st.vgpr[0][2], 3) # lane 0: 1 + 1 + 1 = 3
self.assertEqual(st.vgpr[1][2], 3) # lane 1: 1 + 1 + 1 = 3
self.assertEqual(st.vcc, 0) # No carry out for any lane - all VCC bits must be cleared
def test_v_add_co_ci_u32_preserves_inactive_vcc_bits(self):
"""V_ADD_CO_CI_U32: VCC carry-out overwrites entire VCC register.
VOP2 carry instructions write ALL VCC bits based on carry-out, clearing
bits for lanes that don't overflow regardless of EXEC mask.
Note: This differs from VOPC which only writes active lane bits.
"""
instructions = [
s_mov_b32(VCC_LO, 0x00010000), # VCC bit 16 set
v_mov_b32_e32(v[0], 1), # S0 = 1
v_mov_b32_e32(v[1], 1), # S1 = 1
v_add_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 1 + 1 + 0 = 2 (no carry)
]
st = run_program(instructions, n_lanes=4)
self.assertEqual(st.vgpr[0][2], 2) # lane 0: 1 + 1 + 0 = 2
# VCC should be completely cleared (all lanes have no carry-out)
self.assertEqual(st.vcc, 0)
def test_v_add_co_ci_u32_all_lanes_same_result(self):
"""V_ADD_CO_CI_U32: all active lanes should produce the same result.
When the same constant inputs are used across all lanes, each lane should
compute the same result and write to its own VGPR slot.
Regression test for: VGPR writes not happening for all lanes.
"""
instructions = [
s_mov_b32(VCC_LO, 0), # No carry-in
v_mov_b32_e32(v[0], 3), # inline constant 3
v_mov_b32_e32(v[1], 5), # value 5
v_add_co_ci_u32_e32(v[1], 3, v[1]), # v[1] = 3 + v[1] + 0 = 3 + 5 = 8
]
st = run_program(instructions, n_lanes=4)
# All 4 lanes should have v[1] = 8
for lane in range(4):
self.assertEqual(st.vgpr[lane][1], 8, f"lane {lane} should have v[1]=8")
def test_v_sub_co_ci_u32_no_borrow(self):
"""V_SUB_CO_CI_U32: D0 = S0 - S1 - VCC_IN, when VCC_IN=0."""
instructions = [
s_mov_b32(VCC_LO, 0), # VCC = 0 (no borrow in)
v_mov_b32_e32(v[0], 10), # S0 = 10
v_mov_b32_e32(v[1], 5), # S1 = 5
v_sub_co_ci_u32_e32(v[2], v[0], v[1]), # D0 = 10 - 5 - 0 = 5
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][2], 5)
self.assertEqual(st.vcc, 0) # No borrow out
def test_v_sub_co_ci_u32_vop3sd_separate_carry_regs(self):
"""VOP3SD V_SUB_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).
This tests the VOP3SD encoding where src2 specifies the carry-in register
independently from sdst (carry-out). The bug was reading carry-in from sdst
instead of src2.
Computation: D0 = S0 - S1 - carry_in = 0 - 0 - 1 = -1 = 0xFFFFFFFF
"""
instructions = [
s_mov_b32(s[6], 1), # carry-in = 1 (in s[6])
s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10])
# VOP3SD: v_sub_co_ci_u32(vdst, sdst, src0, src1, src2)
# src2 is carry-in (s[6]=1), sdst is carry-out (s[10])
v_sub_co_ci_u32(v[0], s[10], 0, 0, s[6]), # D0 = 0 - 0 - 1 = -1
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0xFFFFFFFF) # -1 as unsigned
self.assertEqual(st.sgpr[10], 1) # Borrow out to s[10]
def test_v_add_co_ci_u32_vop3sd_separate_carry_regs(self):
"""VOP3SD V_ADD_CO_CI_U32: carry-in from src2, carry-out to sdst (separate registers).
This tests the VOP3SD encoding where src2 specifies the carry-in register
independently from sdst (carry-out).
Computation: D0 = S0 + S1 + carry_in = 5 + 10 + 1 = 16
"""
instructions = [
s_mov_b32(s[6], 1), # carry-in = 1 (in s[6])
s_mov_b32(s[10], 0), # carry-out dest = 0 initially (in s[10])
# VOP3SD: v_add_co_ci_u32(vdst, sdst, src0, src1, src2)
v_add_co_ci_u32(v[0], s[10], 5, 10, s[6]), # D0 = 5 + 10 + 1 = 16
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 16)
self.assertEqual(st.sgpr[10], 0) # No carry out
def test_v_add_co_ci_u32_vop3sd_null_sdst(self):
"""VOP3SD V_ADD_CO_CI_U32 with sdst=NULL: carry output is discarded.
When sdst=NULL (register 124), the carry-out should NOT be written anywhere.
We verify this by checking that VCC (which we set to a sentinel value) is unchanged.
"""
instructions = [
s_mov_b32(VCC_LO, 0xDEADBEEF), # Sentinel value in VCC
s_mov_b32(s[6], 0), # carry-in = 0
# VOP3SD with NULL sdst: carry-out should be discarded
# Uses 0xFFFFFFFF + 1 + 0 = 0 with carry-out=1, but carry should not be written
v_add_co_ci_u32(v[0], NULL, 0xFFFFFFFF, 1, s[6]),
]
st = run_program(instructions, n_lanes=1)
self.assertEqual(st.vgpr[0][0], 0) # 0xFFFFFFFF + 1 + 0 = 0 (overflow)
self.assertEqual(st.vcc, 0xDEADBEEF) # VCC unchanged - carry was discarded
if __name__ == '__main__':
unittest.main()