Child pages
  • AMD GPU

Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migrated to Confluence 5.3

...

  • program counter (shared with threads within a SIMD)
  • loop state: constant, index
  • stack: loop nesting, predicates
  • general purpose registers
    • thread private
    • clause temporary
    • shared: SIMD global
  • constant registers
  • special registers: previous scalar, previous vector
  • predicate state

Example

Code Block
borderStylesolid
titleVecAdd.cl
borderStylesolid
/*
 *  Sums two vectors
 */

__kernel void 
vectorAddition(__global float * output,
	       __global float * input0,
	       __global float * input1,
	       const    uint    width)
{
    /* get the block id */
    int bx = get_group_id(0);
    /* get the local id within the block */
    int tx = get_local_id(0); 
    /* who am I? */
    int idx = bx * get_local_size(0) + tx;
    /* write the sum to the output */
    if (idx >= 0 && idx < width)
	output[idx] = input0[idx] + input1[idx];
}
Code Block
borderStylesolid
titlevecAdd.isa
borderStylesolid
ShaderType = 3
TargetChip = m
;SC Dep components
NumClauseTemps = 4

; --------  Disassembly --------------------
00 ALU_PUSH_BEFORE: ADDR(32) CNT(28) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 
      0  x: AND_INT     T1.x,  R0.x,  (0x0000007F, 1.779649050e-43f).x      
         w: LSHR        T0.w,  R0.x,  (0x00000007, 9.809089250e-45f).y      
         t: RCP_UINT    T0.z,  KC0[2].x      
      1  t: MULLO_UINT  T0.y,  KC0[2].x,  PS0      
      2  w: SUB_INT     ____,  0.0f,  PS1      
         t: MULHI_UINT  T0.x,  KC0[2].x,  T0.z      
      3  z: CNDE_INT    R123.z,  PS2,  PV2.w,  T0.y      
      4  t: MULHI_UINT  ____,  PV3.z,  T0.z      
      5  x: SUB_INT     ____,  T0.z,  PS4      
         w: ADD_INT     ____,  T0.z,  PS4      
      6  z: CNDE_INT    R123.z,  T0.x,  PV5.w,  PV5.x      
      7  t: MULHI_UINT  ____,  PV6.z,  T0.w      
      8  t: MULLO_UINT  ____,  PS7,  KC0[2].x      
      9  z: SETGE_UINT  T0.z,  T0.w,  PS8      
         w: SUB_INT     T0.w,  T0.w,  PS8      
     10  x: SETGE_UINT  ____,  PV9.w,  KC0[2].x      
         y: SUB_INT     T0.y,  PV9.w,  KC0[2].x      
     11  w: AND_INT     ____,  T0.z,  PV10.x      
     12  z: CNDE_INT    T1.z,  PV11.w,  T0.w,  T0.y      
     13  y: ADD_INT     ____,  KC0[2].x,  PV12.z      
     14  x: CNDE_INT    R123.x,  T0.z,  PV13.y,  T1.z      
     15  w: CNDE_INT    R123.w,  KC0[2].x,  -1,  PV14.x      
     16  z: ADD_INT     ____,  PV15.w,  KC0[7].x      
     17  t: MULLO_INT   ____,  KC0[1].x,  PV16.z      
     18  x: ADD_INT     R1.x,  T1.x,  PS17      
     19  x: SETGE_UINT  R0.x,  PV18.x,  KC1[3].x      
     20  x: PREDE_INT   ____,  R0.x,  0.0f      UPDATE_EXEC_MASK UPDATE_PRED 
01 JUMP  POP_CNT(1) ADDR(9) 
02 ALU_PUSH_BEFORE: ADDR(60) CNT(2) 
     21  x: SETGT_INT   R0.x,  R1.x,  -1      
     22  x: PREDNE_INT  ____,  R0.x,  0.0f      UPDATE_EXEC_MASK UPDATE_PRED 
03 JUMP  POP_CNT(2) ADDR(9) 
04 ALU: ADDR(62) CNT(8) KCACHE0(CB1:0-15) 
     23  y: LSHL        T0.y,  R1.x,  (0x00000002, 2.802596929e-45f).x      
     24  z: ADD_INT     ____,  KC0[1].x,  PV23.y      
         w: ADD_INT     ____,  KC0[2].x,  PV23.y      
     25  x: LSHR        R0.x,  PV24.w,  (0x00000002, 2.802596929e-45f).x      
         y: ADD_INT     R0.y,  KC0[0].x,  T0.y      
         t: LSHR        R1.x,  PV24.z,  (0x00000002, 2.802596929e-45f).x      
05 TEX: ADDR(80) CNT(2) 
     26  RD_SCATTER R0.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 
     27  RD_SCATTER R1.x___, DWORD_PTR[0+R1.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 
06 ALU: ADDR(70) CNT(3) 
     28  x: ADD         R0.x,  R0.x,  R1.x      
         t: LSHR        R1.x,  R0.y,  (0x00000002, 2.802596929e-45f).x      
07 MEM_EXPORT_WRITE_IND: DWORD_PTR[0+R1.x].x___, R0, ELEM_SIZE(3)  VPM 
08 POP (2) ADDR(9) 
09 NOP NO_BARRIER 
END_OF_PROGRAM

; ----------------- CS Data ------------------------
; Input Semantic Mappings
;    No input mappings

GprPoolSize = 0
CodeLen                 = 672;Bytes
PGM_END_CF              = 0; words(64 bit)
PGM_END_ALU             = 0; words(64 bit)
PGM_END_FETCH           = 0; words(64 bit)
MaxScratchRegsNeeded    = 0
;AluPacking              = 0.0
;AluClauses              = 0
;PowerThrottleRate       = 0.0
; texResourceUsage[0]     = 0x00000000
; texResourceUsage[1]     = 0x00000000
; texResourceUsage[2]     = 0x00000000
; texResourceUsage[3]     = 0x00000000
; fetch4ResourceUsage[0]  = 0x00000000
; fetch4ResourceUsage[1]  = 0x00000000
; fetch4ResourceUsage[2]  = 0x00000000
; fetch4ResourceUsage[3]  = 0x00000000
; texSamplerUsage         = 0x00000000
; constBufUsage           = 0x00000000
ResourcesAffectAlphaOutput[0]  = 0x00000000
ResourcesAffectAlphaOutput[1]  = 0x00000000
ResourcesAffectAlphaOutput[2]  = 0x00000000
ResourcesAffectAlphaOutput[3]  = 0x00000000

;SQ_PGM_RESOURCES        = 0x30000102
SQ_PGM_RESOURCES:NUM_GPRS     = 2
SQ_PGM_RESOURCES:STACK_SIZE           = 1
SQ_PGM_RESOURCES:FETCH_CACHE_LINES    = 0
SQ_PGM_RESOURCES:PRIME_CACHE_ENABLE   = 1
CsSetupMode = Fast
NumThreadPerGroup = 128
NumWavefrontPerSIMD = 4
IsMaxNumWavePerSIMD = No
; SetBufferForNumGroup = false

...