pthwise_conv_fp16_nhwc_c8_kernel_param_0[152] ) { .reg .pred %p<63>; .reg .b16 %rs<76>; .reg .f32 %f<402>; .reg .b32 %r<197>; .reg .b64 %rd<49>; mov.b64 %rd8, depthwise_conv_fp16_nhwc_c8_kernel_param_0; mov.u64 %rd1, %rd8; mov.u32 %r71, %ntid.x; mov.u32 %r72, %ctaid.x; mov.u32 %r73, %tid.x; mad.lo.s32 %r74, %r72, %r71, %r73; shl.b32 %r1, %r74, 3; mov.u32 %r75, %ntid.y; mov.u32 %r76, %ctaid.y; mov.u32 %r77, %tid.y; mad.lo.s32 %r2, %r76, %r75, %r77; mov.u32 %r78, %ntid.z; mov.u32 %r79, %ctaid.z; mov.u32 %r80, %tid.z; mad.lo.s32 %r3, %r79, %r78, %r80; ld.param.v2.u32 {%r81, %r82}, [depthwise_conv_fp16_nhwc_c8_kernel_param_0+40]; setp.ge.s32 %p2, %r2, %r82; setp.ge.s32 %p3, %r3, %r81; or.pred %p4, %p2, %p3; ld.param.u32 %r4, [depthwise_conv_fp16_nhwc_c8_kernel_param_0+36]; setp.ge.s32 %p5, %r1, %r4; or.pred %p6, %p4, %p5; @%p6 bra $L__BB0_47; ld.param.u32 %r5, [%rd1+32]; setp.lt.s32 %p7, %r5, 1; @%p7 bra $L__BB0_47; ld.param.v4.u8 {%rs3, %rs4, %rs5, %rs6}, [%rd1+144]; mov.u32 %r190, 0; ld.param.v2.u32 {%r86, %r87}, [%rd1+56]; ld.param.u64 %rd2, [%rd1+24]; setp.ne.s16 %p8, %rs5, 0; setp.eq.s16 %p9, %rs4, 0; or.pred %p1, %p8, %p9; ld.param.u32 %r88, [%rd1+132]; add.s32 %r8, %r88, %r3; ld.param.u32 %r89, [%rd1+120]; add.s32 %r9, %r89, %r2; ld.param.v2.u32 {%r90, %r91}, [%rd1+104]; mul.lo.s32 %r92, %r91, %r2; sub.s32 %r12, %r92, %r89; ld.param.u32 %r93, [%rd1+104]; mul.lo.s32 %r94, %r93, %r3; sub.s32 %r13, %r94, %r88; ld.param.u8 %rs2, [%rd1+145]; ld.param.u32 %r14, [%rd1+56]; ld.param.v2.u32 {%r95, %r96}, [%rd1+112]; ld.param.u32 %r17, [%rd1+52]; ld.param.v2.u32 {%r97, %r98}, [%rd1+48]; ld.param.u64 %rd9, [%rd1+8]; cvta.to.global.u64 %rd3, %rd9; ld.param.u64 %rd10, [%rd1+16]; cvta.to.global.u64 %rd4, %rd10; ld.param.u32 %r20, [%rd1+80]; ld.param.v2.u32 {%r99, %r100}, [%rd1+88]; ld.param.v2.u32 {%r101, %r102}, [%rd1+96]; ld.param.u32 %r24, [%rd1+60]; ld.param.v2.f32 {%f154, %f155}, [%rd1+136]; cvta.to.global.u64 %rd11, %rd2; ld.param.u32 %r25, [%rd1+112]; ld.param.u32 %r26, [%rd1+48]; ld.param.u64 %rd12, [%rd1]; cvta.to.global.u64 %rd5, %rd12; ld.param.u32 %r27, [%rd1+64]; ld.param.v2.u32 {%r104, %r105}, [%rd1+72]; mul.lo.s32 %r28, %r104, %r3; mul.lo.s32 %r29, %r105, %r2; mul.lo.s32 %r30, %r101, %r1; mul.wide.s32 %rd13, %r1, 2; add.s64 %rd6, %rd11, %rd13; mul.wide.s32 %rd7, %r101, 2; $L__BB0_3: setp.eq.s16 %p10, %rs3, 0; mov.f32 %f380, 0f00000000; mov.f32 %f381, 0f00000000; mov.f32 %f382, 0f00000000; mov.f32 %f383, 0f00000000; mov.f32 %f384, 0f00000000; mov.f32 %f385, 0f00000000; mov.f32 %f386, 0f00000000; mov.f32 %f387, 0f00000000; @%p10 bra $L__BB0_23; @%p1 bra $L__BB0_13; bra.uni $L__BB0_5; $L__BB0_13: setp.lt.s32 %p20, %r86, 1; @%p20 bra $L__BB0_31; mov.u32 %r193, 0; mov.f32 %f380, 0f00000000; mov.f32 %f381, %f380; mov.f32 %f382, %f380; mov.f32 %f383, %f380; mov.f32 %f384, %f380; mov.f32 %f385, %f380; mov.f32 %f386, %f380; mov.f32 %f387, %f380; $L__BB0_15: setp.lt.s32 %p21, %r87, 1; @%p21 bra $L__BB0_22; setp.eq.s16 %p22, %rs2, 0; not.b32 %r124, %r193; add.s32 %r125, %r86, %r124; selp.b32 %r126, %r125, %r193, %p22; mul.lo.s32 %r127, %r126, %r25; sub.s32 %r53, %r8, %r127; rem.s32 %r54, %r53, %r90; mov.u32 %r194, 0; $L__BB0_17: setp.eq.s16 %p61, %rs2, 0; not.b32 %r128, %r194; add.s32 %r129, %r87, %r128; selp.b32 %r130, %r129, %r194, %p61; mul.lo.s32 %r131, %r130, %r96; sub.s32 %r56, %r9, %r131; setp.ne.s32 %p24, %r54, 0; @%p24 bra $L__BB0_21; div.s32 %r57, %r53, %r90; rem.s32 %r132, %r56, %r91; setp.ne.s32 %p25, %r132, 0; @%p25 bra $L__BB0_21; div.s32 %r58, %r56, %r91; setp.le.s32 %p26, %r98, %r58; or.b32 %r133, %r58, %r57; setp.lt.s32 %p27, %r133, 0; or.pred %p28, %p27, %p26; setp.le.s32 %p29, %r97, %r57; or.pred %p30, %p28, %p29; @%p30 bra $L__BB0_21; mul.lo.s32 %r186, %r102, %r193; mad.lo.s32 %r185, %r20, %r190, %r1; mad.lo.s32 %r134, %r99, %r57, %r185; mad.lo.s32 %r135, %r100, %r58, %r134; mul.wide.s32 %rd25, %r135, 2; add.s64 %rd26, %rd3, %rd25; ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd26]; add.s32 %r144, %r30, %r194; add.s32 %r145, %r144, %r186; mul.wide.s32 %rd27, %r145, 2; add.s64 %rd28, %rd4, %rd27; ld.global.u16 %rs26, [%rd28]; // begin inline asm { cvt.f32.f16 %f204, %rs26;} // end inline asm mov.b32 {%rs27, %rs29}, %r136; // begin inline asm { cvt.f32.f16 %f205, %rs27;} // end inline asm fma.rn.ftz.f32 %f387, %f204, %f205, %f387; add.s64 %rd29, %rd28, %rd7; ld.global.u16 %rs28, [%rd29]; // begin inline asm { cvt.f32.f16 %f206, %rs28;} // end inline asm // begin inline asm { cvt.f32.f16 %f207, %rs29;} // end inline asm fma.rn.ftz.f32 %f386, %f206, %f207, %f386; add.s64 %rd30, %rd29, %rd7; ld.global.u16 %rs30, [%rd30]; // begin inline asm { cvt.f32.f16 %f208, %rs30;} // end inline asm mov.b32 {%rs31, %rs33}, %r137; // begin inline asm { cvt.f32.f16 %f209, %rs31;} // end inline asm fma.rn.ftz.f32 %f385, %f208, %f209, %f385; add.s64 %rd31, %rd30, %rd7; ld.global.u16 %rs32, [%rd31]; // begin inline asm { cvt.f32.f16 %f210, %rs32;} // end inline asm // begin inline asm { cvt.f32.f16 %f211, %rs33;} // end inline asm fma.rn.ftz.f32 %f384, %f210, %f211, %f384; add.s64 %rd32, %rd31, %rd7; ld.global.u16 %rs34, [%rd32]; // begin inline asm { cvt.f32.f16 %f212, %rs34;} // end inline asm mov.b32 {%rs35, %rs37}, %r138; // begin inline asm { cvt.f32.f16 %f213, %rs35;} // end inline asm fma.rn.ftz.f32 %f383, %f212, %f213, %f383; add.s64 %rd33, %rd32, %rd7; ld.global.u16 %rs36, [%rd33]; // begin inline asm { cvt.f32.f16 %f214, %rs36;} // end inline asm // begin inline asm { cvt.f32.f16 %f215, %rs37;} // end inline asm fma.rn.ftz.f32 %f382, %f214, %f215, %f382; add.s64 %rd34, %rd33, %rd7; ld.global.u16 %rs38, [%rd34]; // begin inline asm { cvt.f32.f16 %f216, %rs38;} // end inline asm mov.b32 {%rs39, %rs41}, %r139; // begin inline asm { cvt.f32.f16 %f217, %rs39;} // end inline asm fma.rn.ftz.f32 %f381, %f216, %f217, %f381; add.s64 %rd35, %rd34, %rd7; ld.global.u16 %rs40, [%rd35]; // begin inline asm { cvt.f32.f16 %f218, %rs40;} // end inline asm // begin inline asm { cvt.f32.f16 %f219, %rs41;} // end inline asm fma.rn.ftz.f32 %f380, %f218, %f219, %f380; $L__BB0_21: add.s32 %r194, %r194, 1; setp.lt.s32 %p31, %r194, %r87; @%p31 bra $L__BB0_17; $L__BB0_22: add.s32 %r193, %r193, 1; setp.lt.s32 %p32, %r193, %r86; @%p32 bra $L__BB0_15; bra.uni $L__BB0_31; $L__BB0_23: setp.lt.s32 %p33, %r86, 1; @%p33 bra $L__BB0_31; mov.u32 %r195, 0; mov.f32 %f380, 0f00000000; mov.f32 %f381, %f380; mov.f32 %f382, %f380; mov.f32 %f383, %f380; mov.f32 %f384, %f380; mov.f32 %f385, %f380; mov.f32 %f386, %f380; mov.f32 %f387, %f380; $L__BB0_25: setp.lt.s32 %p34, %r87, 1; @%p34 bra $L__BB0_30; mad.lo.s32 %r187, %r20, %r190, %r1; setp.eq.s16 %p35, %rs2, 0; not.b32 %r148, %r195; add.s32 %r149, %r14, %r148; mul.lo.s32 %r150, %r149, %r95; mul.lo.s32 %r151, %r95, %r195; selp.b32 %r152, %r150, %r151, %p35; add.s32 %r64, %r152, %r13; mov.u32 %r196, 0; $L__BB0_27: setp.eq.s16 %p62, %rs2, 0; not.b32 %r153, %r196; add.s32 %r154, %r87, %r153; selp.b32 %r155, %r154, %r196, %p62; mad.lo.s32 %r67, %r96, %r155, %r12; setp.le.s32 %p37, %r17, %r67; or.b32 %r156, %r67, %r64; setp.lt.s32 %p38, %r156, 0; or.pred %p39, %p37, %p38; setp.le.s32 %p40, %r97, %r64; or.pred %p41, %p40, %p39; @%p41 bra $L__BB0_29; mul.lo.s32 %r189, %r102, %r195; mad.lo.s32 %r188, %r99, %r64, %r187; mad.lo.s32 %r157, %r100, %r67, %r188; mul.wide.s32 %rd36, %r157, 2; add.s64 %rd37, %rd3, %rd36; ld.global.v4.u32 {%r158, %r159, %r160, %r161}, [%rd37]; add.s32 %r166, %r30, %r196; add.s32 %r167, %r166, %r189; mul.wide.s32 %rd38, %r167, 2; add.s64 %rd39, %rd4, %rd38; ld.global.u16 %rs44, [%rd39]; // begin inline asm { cvt.f32.f16 %f236, %rs44;} // end inline asm mov.b32 {%rs45, %rs47}, %r158; // begin inline asm { cvt.f32.f16 %f237, %rs45;} // end inline asm fma.rn.ftz.f32 %f387, %f236, %f237, %f387; add.s64 %rd40, %rd39, %rd7; ld.global.u16 %rs46, [%rd40]; // begin inline asm { cvt.f32.f16 %f238, %rs46;} // end inline asm // begin inline asm { cvt.f32.f16 %f239, %rs47;} // end inline asm fma.rn.ftz.f32 %f386, %f238, %f239, %f386; add.s64 %rd41, %rd40, %rd7; ld.global.u16 %rs48, [%rd41]; // begin inline asm { cvt.f32.f16 %f240, %rs48;} // end inline asm mov.b32 {%rs49, %rs51}, %r159; // begin inline asm { cvt.f32.f16 %f241, %rs49;} // end inline asm fma.rn.ftz.f32 %f385, %f240, %f241, %f385; add.s64 %rd42, %rd41, %rd7; ld.global.u16 %rs50, [%rd42]; // begin inline asm { cvt.f32.f16 %f242, %rs50;} // end inline asm // begin inline asm { cvt.f32.f16 %f243, %rs51;} // end inline asm fma.rn.ftz.f32 %f384, %f242, %f243, %f384; add.s64 %rd43, %rd42, %rd7; ld.global.u16 %rs52, [%rd43]; // begin inline asm { cvt.f32.f16 %f244, %rs52;} // end inline asm mov.b32 {%rs53, %rs55}, %r160; // begin inline asm { cvt.f32.f16 %f245, %rs53;} // end inline asm fma.rn.ftz.f32 %f383, %f244, %f245, %f383; add.s64 %rd44, %rd43, %rd7; ld.global.u16 %rs54, [%rd44]; // begin inline asm { cvt.f32.f16 %f246, %rs54;} // end inline asm // begin inline asm { cvt.f32.f16 %f247, %rs55;} // end inline asm fma.rn.ftz.f32 %f382, %f246, %f247, %f382; add.s64 %rd45, %rd44, %rd7; ld.global.u16 %rs56, [%rd45]; // begin inline asm { cvt.f32.f16 %f248, %rs56;} // end inline asm mov.b32 {%rs57, %rs59}, %r161; // begin inline asm { cvt.f32.f16 %f249, %rs57;} // end inline asm fma.rn.ftz.f32 %f381, %f248, %f249, %f381; add.s64 %rd46, %rd45, %rd7; ld.global.u16 %rs58, [%rd46]; // begin inline asm { cvt.f32.f16 %f250, %rs58;} // end inline asm // begin inline asm { cvt.f32.f16 %f251, %rs59;} // end inline asm fma.rn.ftz.f32 %f380, %f250, %f251, %f380; $L__BB0_29: add.s32 %r196, %r196, 1; setp.lt.s32 %p42, %r196, %r24; @%p42 bra $L__BB0_27; $L__BB0_30: add.s32 %r195, %r195, 1; setp.lt.s32 %p43, %r195, %r86; @%p43 bra $L__BB0_25; bra.uni $L__BB0_31; $L__BB0_5: rem.s32 %r39, %r9, %r91; rem.s32 %r191, %r8, %r90; setp.ge.s32 %p11, %r191, %r86; @%p11 bra $L__BB0_31; mov.f32 %f380, 0f00000000; mov.f32 %f381, %f380; mov.f32 %f382, %f380; mov.f32 %f383, %f380; mov.f32 %f384, %f380; mov.f32 %f385, %f380; mov.f32 %f386, %f380; mov.f32 %f387, %f380; $L__BB0_7: setp.ge.s32 %p12, %r39, %r24; @%p12 bra $L__BB0_12; mad.lo.s32 %r182, %r20, %r190, %r1; sub.s32 %r108, %r8, %r191; div.s32 %r43, %r108, %r90; mov.u32 %r192, %r39; $L__BB0_9: sub.s32 %r109, %r9, %r192; div.s32 %r47, %r109, %r91; setp.le.s32 %p13, %r98, %r47; or.b32 %r110, %r47, %r43; setp.lt.s32 %p14, %r110, 0; or.pred %p15, %p14, %p13; setp.le.s32 %p16, %r26, %r43; or.pred %p17, %p15, %p16; @%p17 bra $L__BB0_11; mul.lo.s32 %r184, %r102, %r191; mad.lo.s32 %r183, %r99, %r43, %r182; mad.lo.s32 %r111, %r100, %r47, %r183; mul.wide.s32 %rd14, %r111, 2; add.s64 %rd15, %rd3, %rd14; ld.global.v4.u32 {%r112, %r113, %r114, %r115}, [%rd15]; add.s32 %r120, %r30, %r192; add.s32 %r121, %r120, %r184; mul.wide.s32 %rd16, %r121, 2; add.s64 %rd17, %rd4, %rd16; ld.global.u16 %rs8, [%rd17]; // begin inline asm { cvt.f32.f16 %f172, %rs8;} // end inline asm mov.b32 {%rs9, %rs11}, %r112; // begin inline asm { cvt.f32.f16 %f173, %rs9;} // end inline asm fma.rn.ftz.f32 %f387, %f172, %f173, %f387; add.s64 %rd18, %rd17, %rd7; ld.global.u16 %rs10, [%rd18]; // begin inline asm { cvt.f32.f16 %f174, %rs10;} // end inline asm // begin inline asm { cvt.f32.f16 %f175, %rs11;} // end inline asm fma.rn.ftz.f32 %f386, %f174, %f175, %f386; add.s64 %rd19, %rd18, %rd7; ld.global.u16 %rs12, [%rd19]; // begin inline asm { cvt.f32.f16 %f176, %rs12;} // end inline asm mov.b32 {%rs13, %rs15}, %r113; // begin inline asm { cvt.f32.f16 %f177, %rs13;} // end inline asm fma.rn.ftz.f32 %f385, %f176, %f177, %f385; add.s64 %rd20, %rd19, %rd7; ld.global.u16 %rs14, [%rd20]; // begin inline asm { cvt.f32.f16 %f178, %rs14;} // end inline asm // begin inline asm { cvt.f32.f16 %f179, %rs15;} // end inline asm fma.rn.ftz.f32 %f384, %f178, %f179, %f384; add.s64 %rd21, %rd20, %rd7; ld.global.u16 %rs16, [%rd21]; // begin inline asm { cvt.f32.f16 %f180, %rs16;} // end inline asm mov.b32 {%rs17, %rs19}, %r114; // begin inline asm { cvt.f32.f16 %f181, %rs17;} // end inline asm fma.rn.ftz.f32 %f383, %f180, %f181, %f383; add.s64 %rd22, %rd21, %rd7; ld.global.u16 %rs18, [%rd22]; // begin inline asm { cvt.f32.f16 %f182, %rs18;} // end inline asm // begin inline asm { cvt.f32.f16 %f183, %rs19;} // end inline asm fma.rn.ftz.f32 %f382, %f182, %f183, %f382; add.s64 %rd23, %rd22, %rd7; ld.global.u16 %rs20, [%rd23]; // begin inline asm { cvt.f32.f16 %f184, %rs20;} // end inline asm mov.b32 {%rs21, %rs23}, %r115; // begin inline asm { cvt.f32.f16 %f185, %rs21;} // end inline asm fma.rn.ftz.f32 %f381, %f184, %f185, %f381; add.s64 %rd24, %rd23, %rd7; ld.global.u16 %rs22, [%rd24]; // begin inline asm { cvt.f32.f16 %f186, %rs22;} // end inline asm // begin inline asm { cvt.f32.f16 %f187, %rs23;} // end inline asm fma.rn.ftz.f32 %f380, %f186, %f187, %f380; $L__BB0_11: add.s32 %r192, %r192, %r91; setp.lt.s32 %p18, %r192, %r87; @%p18 bra $L__BB0_9; $L__BB0_12: add.s32 %r191, %r191, %r90; setp.lt.s32 %p19, %r191, %r86; @%p19 bra $L__BB0_7; $L__BB0_31: setp.eq.s64 %p44, %rd2, 0; @%p44 bra $L__BB0_46; add.s32 %r175, %r1, 1; ld.global.u16 %rs60, [%rd6]; // begin inline asm { cvt.f32.f16 %f252, %rs60;} // end inline asm setp.ge.s32 %p45, %r175, %r4; @%p45 bra $L__BB0_34; ld.global.u16 %rs61, [%rd6+2]; // begin inline asm { cvt.f32.f16 %f253, %rs61;} // end inline asm add.ftz.f32 %f386, %f253, %f386; $L__BB0_34: add.s32 %r176, %r1, 2; setp.ge.s32 %p46, %r176, %r4; @%p46 bra $L__BB0_36; ld.global.u16 %rs62, [%rd6+4]; // begin inline asm { cvt.f32.f16 %f254, %rs62;} // end inline asm add.ftz.f32 %f385, %f254, %f385; $L__BB0_36: add.s32 %r177, %r1, 3; setp.ge.s32 %p47, %r177, %r4; @%p47 bra $L__BB0_38; ld.global.u16 %rs63, [%rd6+6]; // begin inline asm { cvt.f32.f16 %f255, %rs63;} // end inline asm add.ftz.f32 %f384, %f255, %f384; $L__BB0_38: add.s32 %r178, %r1, 4; setp.ge.s32 %p48, %r178, %r4; @%p48 bra $L__BB0_40; ld.global.u16 %rs64, [%rd6+8]; // begin inline asm { cvt.f32.f16 %f256, %rs64;} // end inline asm add.ftz.f32 %f383, %f256, %f383; $L__BB0_40: add.s32 %r179, %r1, 5; setp.ge.s32 %p49, %r179, %r4; @%p49 bra $L__BB0_42; ld.global.u16 %rs65, [%rd6+10]; // begin inline asm { cvt.f32.f16 %f257, %rs65;} // end inline asm add.ftz.f32 %f382, %f257, %f382; $L__BB0_42: add.s32 %r180, %r1, 6; setp.ge.s32 %p50, %r180, %r4; @%p50 bra $L__BB0_44; ld.global.u16 %rs66, [%rd6+12]; // begin inline asm { cvt.f32.f16 %f258, %rs66;} // end inline asm add.ftz.f32 %f381, %f258, %f381; $L__BB0_44: add.s32 %r181, %r1, 7; add.ftz.f32 %f387, %f252, %f387; setp.ge.s32 %p51, %r181, %r4; @%p51 bra $L__BB0_46; ld.global.u16 %rs67, [%rd6+14]; // begin inline asm { cvt.f32.f16 %f259, %rs67;} // end inline asm add.ftz.f32 %f380, %f259, %f380; $L__BB0_46: mul.ftz.f32 %f268, %f155, %f387; setp.gtu.ftz.f32 %p52, %f387, %f154; selp.f32 %f260, %f387, %f268, %p52; mul.ftz.f32 %f269, %f155, %f386; setp.gtu.ftz.f32 %p53, %f386, %f154; selp.f32 %f261, %f386, %f269, %p53; mul.ftz.f32 %f270, %f155, %f385; setp.gtu.ftz.f32 %p54, %f385, %f154; selp.f32 %f262, %f385, %f270, %p54; mul.ftz.f32 %f271, %f155, %f384; setp.gtu.ftz.f32 %p55, %f384, %f154; selp.f32 %f263, %f384, %f271, %p55; mul.ftz.f32 %f272, %f155, %f383; setp.gtu.ftz.f32 %p56, %f383, %f154; selp.f32 %f264, %f383, %f272, %p56; mul.ftz.f32 %f273, %f155, %f382; setp.gtu.ftz.f32 %p57, %f382, %f154; selp.f32 %f265, %f382, %f273, %p57; mul.ftz.f32 %f274, %f155, %f381; setp.gtu.ftz.f32 %p58, %f381, %f154; selp.f32 %f266, %f381, %f274, %p58; mul.ftz.f32 %f275, %f155, %f380; setp.gtu.ftz.f32 %p59, %f380, %f154; selp.f32 %f267, %f380, %f275, %p59; mad.lo.s32 %r168, %r27, %r190, %r1; add.s32 %r169, %r168, %r28; add.s32 %r170, %r169, %r29; mul.wide.s32 %rd47, %r170, 2; add.s64 %rd48, %rd5, %rd47; // begin inline asm { cvt.rn.f16.f32 %rs69, %f261;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs68, %f260;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs71, %f263;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs70, %f262;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs73, %f265;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs72, %f264;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs75, %f267;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs74, %f266;} // end inline asm mov.b32 %r171, {%rs74, %rs75}; mov.b32 %r172, {%rs72, %rs73}; mov.b32 %r173, {%rs70, %rs71}; mov.b32 %r174, {%rs68, %rs69}; st.global.v4.u32 [%rd48], {%r174, %r173, %r172, %r171}; add.s32 %r190, %r190, 1; setp.lt.s32 %p60, %r190, %r5; @%p60 bra $L__BB0_3; $L__BB0_47: ret; }