2_nchw_to_nhwc_param_0+168]; mov.u32 %r8, %ctaid.x; div.u32 %r9, %r8, %r7; mul.lo.s32 %r10, %r9, %r7; sub.s32 %r11, %r8, %r10; shl.b32 %r2, %r11, 4; shl.b32 %r3, %r9, 4; mov.u32 %r12, %tid.y; shl.b32 %r4, %r12, 1; add.s32 %r13, %r2, %r4; mov.u32 %r14, %tid.x; shl.b32 %r5, %r14, 1; add.s32 %r15, %r3, %r5; cvt.u64.u32 %rd2, %r13; ld.param.u64 %rd7, [ConvertTensor_2x2_nchw_to_nhwc_param_0+88]; setp.le.u64 %p1, %rd7, %rd2; cvt.u64.u32 %rd3, %r15; ld.param.u64 %rd8, [ConvertTensor_2x2_nchw_to_nhwc_param_0+160]; setp.le.u64 %p2, %rd8, %rd3; mov.u16 %rs21, 0; or.pred %p3, %p1, %p2; mov.u16 %rs22, %rs21; mov.u16 %rs23, %rs21; mov.u16 %rs24, %rs21; @%p3 bra $L__BB0_2; cvt.u32.u64 %r16, %rd2; ld.param.u64 %rd9, [%rd1+8]; cvta.to.global.u64 %rd10, %rd9; ld.param.u64 %rd11, [%rd1+48]; cvt.u64.u32 %rd12, %r1; mul.lo.s64 %rd13, %rd11, %rd12; ld.param.u64 %rd14, [%rd1+56]; mul.lo.s64 %rd15, %rd14, %rd2; add.s64 %rd16, %rd13, %rd3; add.s64 %rd17, %rd16, %rd15; shl.b64 %rd18, %rd17, 1; and.b64 %rd19, %rd18, -4; add.s64 %rd20, %rd10, %rd19; ld.global.v2.u16 {%rs23, %rs24}, [%rd20]; add.s32 %r17, %r16, 1; cvt.u64.u32 %rd21, %r17; mul.lo.s64 %rd22, %rd14, %rd21; add.s64 %rd23, %rd16, %rd22; shl.b64 %rd24, %rd23, 1; and.b64 %rd25, %rd24, -4; add.s64 %rd26, %rd10, %rd25; ld.global.v2.u16 {%rs21, %rs22}, [%rd26]; $L__BB0_2: shl.b32 %r18, %r4, 5; mov.u32 %r19, _ZZ30ConvertTensor_2x2_nchw_to_nhwcE6values; add.s32 %r20, %r19, %r18; shl.b32 %r21, %r5, 1; add.s32 %r22, %r20, %r21; st.shared.u16 [%r22], %rs23; st.shared.u16 [%r22+32], %rs21; st.shared.u16 [%r22+2], %rs24; st.shared.u16 [%r22+34], %rs22; bar.sync 0; shl.b32 %r23, %r5, 5; add.s32 %r24, %r19, %r23; shl.b32 %r25, %r4, 1; add.s32 %r6, %r24, %r25; add.s32 %r26, %r2, %r5; cvt.u64.u32 %rd4, %r26; ld.param.u64 %rd27, [%rd1+88]; setp.le.u64 %p4, %rd27, %rd4; add.s32 %r27, %r3, %r4; cvt.u64.u32 %rd5, %r27; ld.param.u64 %rd28, [%rd1+160]; setp.le.u64 %p5, %rd28, %rd5; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_4; cvt.u32.u64 %r28, %rd5; ld.param.u64 %rd29, [%rd1]; ld.param.u64 %rd30, [%rd1+112]; cvt.u64.u32 %rd31, %r1; mul.lo.s64 %rd32, %rd30, %rd31; ld.param.u64 %rd33, [%rd1+136]; mul.lo.s64 %rd34, %rd33, %rd5; add.s64 %rd35, %rd32, %rd4; add.s64 %rd36, %rd35, %rd34; shl.b64 %rd37, %rd36, 1; and.b64 %rd38, %rd37, -4; cvta.to.global.u64 %rd39, %rd29; add.s64 %rd40, %rd39, %rd38; ld.shared.u16 %rs17, [%r6+32]; ld.shared.u16 %rs18, [%r6]; ld.shared.u16 %rs19, [%r6+34]; ld.shared.u16 %rs20, [%r6+2]; st.global.v2.u16 [%rd40], {%rs18, %rs17}; add.s32 %r29, %r28, 1; cvt.u64.u32 %rd41, %r29; mul.lo.s64 %rd42, %rd33, %rd41; add.s64 %rd43, %rd35, %rd42; shl.b64 %rd44, %rd43, 1; and.b64 %rd45, %rd44, -4; add.s64 %rd46, %rd39, %rd45; st.global.v2.u16 [%rd46], {%rs20, %rs19}; $L__BB0_4: ret; }