diff --git a/tinygrad/renderer/isa/__init__.py b/tinygrad/renderer/isa/__init__.py index 7a74d7eefd..cd8b402a2d 100644 --- a/tinygrad/renderer/isa/__init__.py +++ b/tinygrad/renderer/isa/__init__.py @@ -45,7 +45,6 @@ class ISARenderer(Renderer): post_regalloc_matcher: PatternMatcher def is_two_address(self, x:UOp) -> bool: return False - def should_rematerialize(self, x:UOp) -> bool: return False def copy(self, x:UOp, reg:Register) -> UOp: raise NotImplementedError("arch specific") def spill(self, disp:UOp, x:UOp) -> UOp: raise NotImplementedError("arch specific") def fill(self, disp:UOp, x:UOp, reg:Register) -> UOp: raise NotImplementedError("arch specific") diff --git a/tinygrad/renderer/isa/x86.py b/tinygrad/renderer/isa/x86.py index 639a713bd7..647f703cfd 100644 --- a/tinygrad/renderer/isa/x86.py +++ b/tinygrad/renderer/isa/x86.py @@ -232,7 +232,7 @@ reg_strs = {"rax": {4:"eax", 2:"ax", 1:"al"}, "rcx": {4:"ecx", 2:"cx", 1:"cl"}, # if the load is used multiple times we don't fold def is_foldable_load(ctx:IselContext, x:UOp, s:UOp) -> bool: return s.op is Ops.LOAD and len(ctx.uses[s]) == x.src.count(s) == 1 def base(x:UOp, i:int) -> UOp: return s.src[0] if (s:=x.src[i]).op is Ops.GEP else s -def lane(x:UOp, i:int) -> int: return x.src[i].arg[0] if x.src[i].op is Ops.GEP else 0 +def lane(x:UOp, i:int) -> int: return s.arg[0] if (s:=x.src[i]).op is Ops.GEP else 0 def to_int(dt:DType): return {dtypes.float16: dtypes.int16, dtypes.float32: dtypes.int32, dtypes.float64: dtypes.int64}[dt] def def_reg(dt:DType, reg:Register|None=None) -> UOp: return UOp(Ops.INS, arg=X86Ops.DEFINE_REG, dtype=dt, tag=None if reg is None else (reg,)) def imm(dt:DType, v:int) -> UOp: return UOp(Ops.INS, arg=X86Ops.IMM, dtype=dt, tag=truncate[dt](v)) @@ -357,10 +357,9 @@ dt_128bit = tuple(dt.vec(l) for dt in dts for l in [16,8,4,2,1] if l*dt.itemsize isel_matcher = PatternMatcher([ # **** Op -> Op **** - # TODO: this breaks stuff # float gep(0) is a noop as it just moves the 0th element from one xmm register to another - # this is done here to not interfere with shuffles / gep store fusion - #(UPat(dtype=dtypes.floats).gep(0, name="x"), lambda x: x.replace(op=Ops.NOOP, arg=None)), + # this is done here to not interfere with shuffles + (UPat(dtype=dtypes.floats).gep(0, name="x"), lambda x: x.replace(op=Ops.NOOP, arg=None)), # range is lowered to acc, cmp, jmp after regalloc (UPat(Ops.RANGE, src=(UPat.cvar("c"),), allow_any_len=True, name="x"), lambda c,x: x.replace(src=(imm(c.dtype, c.arg),) + x.src[1:])), (UPat(Ops.RANGE, name="x"), lambda ctx,x: x.replace(tag=(ctx.vreg(WGPR),)) if not isinstance(x.tag, tuple) else None), @@ -543,7 +542,7 @@ isel_matcher = PatternMatcher([ (UPat(Ops.INDEX, name="x"), lambda x: x.ins(X86Ops.LEA, src=fold_address(x))), # TODO: fuse stores, very few cases -- store cmp becomes setcc, store gep int becomes vpextr, store bitcast to int becomes vmovd/q # copy, load, store - # NOTE: copy here violates the spec, it only happens in register allocation when a reg to reg move needs to be inserted + # NOTE: copy here violates the spec, it only happens post register allocation when a reg to reg move needs to be inserted (UPat(Ops.COPY, dt_128bit, name="x"), lambda x: x.ins(X86Ops.VMOVUPS)), (UPat(Ops.COPY, dt_64bit, name="x"), lambda x: x.ins(X86Ops.VMOVSD)), (UPat(Ops.COPY, dt_32bit+dt_16bit, name="x"), lambda x: x.ins(X86Ops.VMOVSS)),