st6system6detail10sequential3seqE[1];
// thrust::placeholders objects pulled in by the gemv.cu translation unit
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders3_10E[1];

// dequant_gemv_group128_batch6(DequantGemvKernelParams): int4 dequantize + GEMV, batch 6.
// Each thread unpacks packed 4-bit values, subtracts the zero point, scales by a per-group
// fp16 scale, and accumulates six f32 partial dot products.
.visible .entry _Z28dequant_gemv_group128_batch623DequantGemvKernelParams(
    .param .align 8 .b8 _Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<22>;
    .reg .b16 %rs<693>;
    .reg .f32 %f<1178>;
    .reg .b32 %r<557>;
    .reg .b64 %rd<67>;
    // demoted variable
    // (temp_storage local to gemv_int4<4, 128, 6>: 3840 bytes of shared memory)
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3840];

    // unpack the 80-byte DequantGemvKernelParams struct
    ld.param.v2.u32 {%r51, %r52}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r53, %r54}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f59, %f60}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs23, %rs24, %rs25, %rs26}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd24, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd23, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd22, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd21, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+16];

    // per-thread element index: ((tid.y * 32) + tid.x) * 4, checked against the bound at +56
    mov.u32 %r556, %tid.y;
    shl.b32 %r55, %r556, 5;
    mov.u32 %r56, %tid.x;
    add.s32 %r555, %r55, %r56;
    shl.b32 %r554, %r555, 2;
    setp.ge.u32 %p1, %r554, %r53;

    // six f32 accumulators (batch size 6) initialized to zero
    mov.f32 %f1160, 0f00000000;
    mov.f32 %f1161, %f1160;
    mov.f32 %f1162, %f1160;
    mov.f32 %f1163, %f1160;
    mov.f32 %f1164, %f1160;
    mov.f32 %f1165, %f1160;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd2, %rd21;
    mov.u32 %r57, %ctaid.x;
    mul.lo.s32 %r6, %r54, %r57;
    shl.b16 %rs2, %rs23, 3;
    cvta.to.global.u64 %rd3, %rd22;
    cvta.to.global.u64 %rd4, %rd24;

$L__BB0_2:
    // load four u32 words (32 packed 4-bit values) of quantized data for this thread
    mad.lo.s32 %r59, %r53, %r57, %r554;
    mul.wide.u32 %rd25, %r59, 4;
    add.s64 %rd26, %rd3, %rd25;
    ld.global.v4.u32 {%r60, %r61, %r62, %r63}, [%rd26];

    // load an fp16 value for this group and widen it to f32 (the scale factor used below)
    shr.u32 %r65, %r56, 2;
    shl.b32 %r66, %r556, 3;
    add.s32 %r14, %r66, %r65;
    add.s32 %r15, %r14, %r6;
    mul.wide.s32 %rd27, %r15, 2;
    add.s64 %rd28, %rd4, %rd27;
    ld.global.u16 %rs31, [%rd28];
    // begin inline asm
    { cvt.f32.f16 %f67, %rs31;}
    // end inline asm

    setp.eq.s64 %p2, %rd23, 0;
    mov.u16 %rs692, %rs2;    // default zero point (%rs23 << 3) when the pointer at +32 is null
    @%p2 bra $L__BB0_4;

    // zero-point path: extract the packed 4-bit zero point for this group
    shr.u32 %r67, %r15, 31;
    add.s32 %r68, %r15, %r67;
    shr.s32 %r69, %r68, 1;
    cvt.s64.s32 %rd29, %r69;
    cvta.to.global.u64 %rd30, %rd23;
    add.s64 %rd31, %rd30, %rd29;
    ld.global.u8 %r70, [%rd31];
    shl.b32 %r71, %r14, 2;
    and.b32 %r72, %r71, 4;
    shr.u32 %r73, %r70, %r72;
    cvt.u16.u32 %rs32, %r73;
    and.b16 %rs692, %rs32, 15;

$L__BB0_4:
    shl.b32 %r16, %r555, 5;
    setp.ge.s32 %p3, %r16, %r51;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs23, 0;
    shr.u16
%rs34, %rs692, 3; and.b16 %rs35, %rs34, 1; setp.eq.b16 %p5, %rs35, 1; and.pred %p6, %p4, %p5; selp.b16 %rs36, -16, 0, %p6; or.b16 %rs37, %rs36, %rs692; cvt.s16.s8 %rs38, %rs37; cvt.rn.f32.s16 %f8, %rs38; mul.wide.s32 %rd32, %r16, 2; add.s64 %rd5, %rd2, %rd32; ld.global.v4.u32 {%r74, %r75, %r76, %r77}, [%rd5]; mul.wide.s32 %rd33, %r51, 2; add.s64 %rd34, %rd5, %rd33; ld.global.v4.u32 {%r78, %r79, %r80, %r81}, [%rd34]; add.s32 %r82, %r16, %r51; add.s32 %r83, %r82, %r51; mul.wide.s32 %rd35, %r83, 2; add.s64 %rd6, %rd2, %rd35; ld.global.v4.u32 {%r84, %r85, %r86, %r87}, [%rd6]; add.s64 %rd36, %rd6, %rd33; ld.global.v4.u32 {%r88, %r89, %r90, %r91}, [%rd36]; add.s64 %rd37, %rd36, %rd33; ld.global.v4.u32 {%r92, %r93, %r94, %r95}, [%rd37]; add.s64 %rd38, %rd37, %rd33; ld.global.v4.u32 {%r96, %r97, %r98, %r99}, [%rd38]; add.s32 %r100, %r82, 8; mul.wide.s32 %rd39, %r100, 2; add.s64 %rd7, %rd2, %rd39; add.s32 %r101, %r100, %r51; add.s32 %r102, %r101, %r51; mul.wide.s32 %rd40, %r102, 2; add.s64 %rd8, %rd2, %rd40; add.s32 %r103, %r102, %r51; mul.wide.s32 %rd41, %r103, 2; add.s64 %rd9, %rd2, %rd41; add.s32 %r104, %r103, %r51; mul.wide.s32 %rd42, %r104, 2; add.s64 %rd10, %rd2, %rd42; shr.u32 %r105, %r61, 20; cvt.u16.u32 %rs39, %r105; and.b16 %rs5, %rs39, 15; shr.u32 %r106, %r62, 24; cvt.u16.u32 %rs40, %r106; and.b16 %rs6, %rs40, 15; cvt.u16.u32 %rs41, %r63; shr.u16 %rs42, %rs41, 4; and.b16 %rs7, %rs42, 15; shr.u16 %rs43, %rs41, 8; and.b16 %rs8, %rs43, 15; shr.u16 %rs9, %rs41, 12; shr.u32 %r107, %r63, 16; cvt.u16.u32 %rs44, %r107; and.b16 %rs10, %rs44, 15; shr.u32 %r108, %r63, 20; cvt.u16.u32 %rs45, %r108; and.b16 %rs11, %rs45, 15; shr.u32 %r109, %r63, 24; cvt.u16.u32 %rs12, %r109; and.b16 %rs13, %rs12, 15; shr.u32 %r110, %r63, 28; cvt.u16.u32 %rs14, %r110; @%p4 bra $L__BB0_7; cvt.u16.u32 %rs238, %r60; and.b16 %rs239, %rs238, 240; and.b16 %rs240, %rs238, 15; cvt.rn.f32.s16 %f260, %rs240; sub.ftz.f32 %f261, %f260, %f8; mul.ftz.f32 %f262, %f67, %f261; mov.b32 {%rs46, %rs52}, %r74; // begin inline asm { cvt.f32.f16 %f68, %rs46;} // end inline asm fma.rn.ftz.f32 %f263, %f262, %f68, %f1165; mov.b32 {%rs47, %rs53}, %r78; // begin inline asm { cvt.f32.f16 %f69, %rs47;} // end inline asm fma.rn.ftz.f32 %f264, %f262, %f69, %f1164; mov.b32 {%rs48, %rs54}, %r84; // begin inline asm { cvt.f32.f16 %f70, %rs48;} // end inline asm fma.rn.ftz.f32 %f265, %f262, %f70, %f1163; mov.b32 {%rs49, %rs55}, %r88; // begin inline asm { cvt.f32.f16 %f71, %rs49;} // end inline asm fma.rn.ftz.f32 %f266, %f262, %f71, %f1162; mov.b32 {%rs50, %rs56}, %r92; // begin inline asm { cvt.f32.f16 %f72, %rs50;} // end inline asm fma.rn.ftz.f32 %f267, %f262, %f72, %f1161; mov.b32 {%rs51, %rs57}, %r96; // begin inline asm { cvt.f32.f16 %f73, %rs51;} // end inline asm fma.rn.ftz.f32 %f268, %f262, %f73, %f1160; shr.u16 %rs241, %rs239, 4; cvt.rn.f32.s16 %f269, %rs241; sub.ftz.f32 %f270, %f269, %f8; mul.ftz.f32 %f271, %f67, %f270; // begin inline asm { cvt.f32.f16 %f74, %rs52;} // end inline asm fma.rn.ftz.f32 %f272, %f271, %f74, %f263; // begin inline asm { cvt.f32.f16 %f75, %rs53;} // end inline asm fma.rn.ftz.f32 %f273, %f271, %f75, %f264; // begin inline asm { cvt.f32.f16 %f76, %rs54;} // end inline asm fma.rn.ftz.f32 %f274, %f271, %f76, %f265; // begin inline asm { cvt.f32.f16 %f77, %rs55;} // end inline asm fma.rn.ftz.f32 %f275, %f271, %f77, %f266; // begin inline asm { cvt.f32.f16 %f78, %rs56;} // end inline asm fma.rn.ftz.f32 %f276, %f271, %f78, %f267; // begin inline asm { cvt.f32.f16 %f79, %rs57;} // end inline asm fma.rn.ftz.f32 %f277, %f271, 
%f79, %f268; shr.u16 %rs242, %rs238, 8; and.b16 %rs243, %rs242, 15; cvt.rn.f32.s16 %f278, %rs243; sub.ftz.f32 %f279, %f278, %f8; mul.ftz.f32 %f280, %f67, %f279; mov.b32 {%rs58, %rs64}, %r75; // begin inline asm { cvt.f32.f16 %f80, %rs58;} // end inline asm fma.rn.ftz.f32 %f281, %f280, %f80, %f272; mov.b32 {%rs59, %rs65}, %r79; // begin inline asm { cvt.f32.f16 %f81, %rs59;} // end inline asm fma.rn.ftz.f32 %f282, %f280, %f81, %f273; mov.b32 {%rs60, %rs66}, %r85; // begin inline asm { cvt.f32.f16 %f82, %rs60;} // end inline asm fma.rn.ftz.f32 %f283, %f280, %f82, %f274; mov.b32 {%rs61, %rs67}, %r89; // begin inline asm { cvt.f32.f16 %f83, %rs61;} // end inline asm fma.rn.ftz.f32 %f284, %f280, %f83, %f275; mov.b32 {%rs62, %rs68}, %r93; // begin inline asm { cvt.f32.f16 %f84, %rs62;} // end inline asm fma.rn.ftz.f32 %f285, %f280, %f84, %f276; mov.b32 {%rs63, %rs69}, %r97; // begin inline asm { cvt.f32.f16 %f85, %rs63;} // end inline asm fma.rn.ftz.f32 %f286, %f280, %f85, %f277; shr.u16 %rs244, %rs238, 12; cvt.rn.f32.s16 %f287, %rs244; sub.ftz.f32 %f288, %f287, %f8; mul.ftz.f32 %f289, %f67, %f288; // begin inline asm { cvt.f32.f16 %f86, %rs64;} // end inline asm fma.rn.ftz.f32 %f290, %f289, %f86, %f281; // begin inline asm { cvt.f32.f16 %f87, %rs65;} // end inline asm fma.rn.ftz.f32 %f291, %f289, %f87, %f282; // begin inline asm { cvt.f32.f16 %f88, %rs66;} // end inline asm fma.rn.ftz.f32 %f292, %f289, %f88, %f283; // begin inline asm { cvt.f32.f16 %f89, %rs67;} // end inline asm fma.rn.ftz.f32 %f293, %f289, %f89, %f284; // begin inline asm { cvt.f32.f16 %f90, %rs68;} // end inline asm fma.rn.ftz.f32 %f294, %f289, %f90, %f285; // begin inline asm { cvt.f32.f16 %f91, %rs69;} // end inline asm fma.rn.ftz.f32 %f295, %f289, %f91, %f286; shr.u32 %r111, %r60, 16; cvt.u16.u32 %rs245, %r111; and.b16 %rs246, %rs245, 15; cvt.rn.f32.s16 %f296, %rs246; sub.ftz.f32 %f297, %f296, %f8; mul.ftz.f32 %f298, %f67, %f297; mov.b32 {%rs70, %rs76}, %r76; // begin inline asm { cvt.f32.f16 %f92, %rs70;} // end inline asm fma.rn.ftz.f32 %f299, %f298, %f92, %f290; mov.b32 {%rs71, %rs77}, %r80; // begin inline asm { cvt.f32.f16 %f93, %rs71;} // end inline asm fma.rn.ftz.f32 %f300, %f298, %f93, %f291; mov.b32 {%rs72, %rs78}, %r86; // begin inline asm { cvt.f32.f16 %f94, %rs72;} // end inline asm fma.rn.ftz.f32 %f301, %f298, %f94, %f292; mov.b32 {%rs73, %rs79}, %r90; // begin inline asm { cvt.f32.f16 %f95, %rs73;} // end inline asm fma.rn.ftz.f32 %f302, %f298, %f95, %f293; mov.b32 {%rs74, %rs80}, %r94; // begin inline asm { cvt.f32.f16 %f96, %rs74;} // end inline asm fma.rn.ftz.f32 %f303, %f298, %f96, %f294; mov.b32 {%rs75, %rs81}, %r98; // begin inline asm { cvt.f32.f16 %f97, %rs75;} // end inline asm fma.rn.ftz.f32 %f304, %f298, %f97, %f295; shr.u32 %r112, %r60, 20; cvt.u16.u32 %rs247, %r112; and.b16 %rs248, %rs247, 15; cvt.rn.f32.s16 %f305, %rs248; sub.ftz.f32 %f306, %f305, %f8; mul.ftz.f32 %f307, %f67, %f306; // begin inline asm { cvt.f32.f16 %f98, %rs76;} // end inline asm fma.rn.ftz.f32 %f308, %f307, %f98, %f299; // begin inline asm { cvt.f32.f16 %f99, %rs77;} // end inline asm fma.rn.ftz.f32 %f309, %f307, %f99, %f300; // begin inline asm { cvt.f32.f16 %f100, %rs78;} // end inline asm fma.rn.ftz.f32 %f310, %f307, %f100, %f301; // begin inline asm { cvt.f32.f16 %f101, %rs79;} // end inline asm fma.rn.ftz.f32 %f311, %f307, %f101, %f302; // begin inline asm { cvt.f32.f16 %f102, %rs80;} // end inline asm fma.rn.ftz.f32 %f312, %f307, %f102, %f303; // begin inline asm { cvt.f32.f16 %f103, %rs81;} // end inline asm 
fma.rn.ftz.f32 %f313, %f307, %f103, %f304; shr.u32 %r113, %r60, 24; cvt.u16.u32 %rs249, %r113; and.b16 %rs250, %rs249, 15; cvt.rn.f32.s16 %f314, %rs250; sub.ftz.f32 %f315, %f314, %f8; mul.ftz.f32 %f316, %f67, %f315; mov.b32 {%rs82, %rs88}, %r77; // begin inline asm { cvt.f32.f16 %f104, %rs82;} // end inline asm fma.rn.ftz.f32 %f317, %f316, %f104, %f308; mov.b32 {%rs83, %rs89}, %r81; // begin inline asm { cvt.f32.f16 %f105, %rs83;} // end inline asm fma.rn.ftz.f32 %f318, %f316, %f105, %f309; mov.b32 {%rs84, %rs90}, %r87; // begin inline asm { cvt.f32.f16 %f106, %rs84;} // end inline asm fma.rn.ftz.f32 %f319, %f316, %f106, %f310; mov.b32 {%rs85, %rs91}, %r91; // begin inline asm { cvt.f32.f16 %f107, %rs85;} // end inline asm fma.rn.ftz.f32 %f320, %f316, %f107, %f311; mov.b32 {%rs86, %rs92}, %r95; // begin inline asm { cvt.f32.f16 %f108, %rs86;} // end inline asm fma.rn.ftz.f32 %f321, %f316, %f108, %f312; mov.b32 {%rs87, %rs93}, %r99; // begin inline asm { cvt.f32.f16 %f109, %rs87;} // end inline asm fma.rn.ftz.f32 %f322, %f316, %f109, %f313; shr.u32 %r114, %r60, 28; cvt.u16.u32 %rs251, %r114; cvt.rn.f32.s16 %f323, %rs251; sub.ftz.f32 %f324, %f323, %f8; mul.ftz.f32 %f325, %f67, %f324; // begin inline asm { cvt.f32.f16 %f110, %rs88;} // end inline asm fma.rn.ftz.f32 %f326, %f325, %f110, %f317; // begin inline asm { cvt.f32.f16 %f111, %rs89;} // end inline asm fma.rn.ftz.f32 %f327, %f325, %f111, %f318; // begin inline asm { cvt.f32.f16 %f112, %rs90;} // end inline asm fma.rn.ftz.f32 %f328, %f325, %f112, %f319; // begin inline asm { cvt.f32.f16 %f113, %rs91;} // end inline asm fma.rn.ftz.f32 %f329, %f325, %f113, %f320; // begin inline asm { cvt.f32.f16 %f114, %rs92;} // end inline asm fma.rn.ftz.f32 %f330, %f325, %f114, %f321; // begin inline asm { cvt.f32.f16 %f115, %rs93;} // end inline asm fma.rn.ftz.f32 %f331, %f325, %f115, %f322; ld.global.v4.u32 {%r115, %r116, %r117, %r118}, [%rd5+16]; ld.global.v4.u32 {%r123, %r124, %r125, %r126}, [%rd7]; ld.global.v4.u32 {%r131, %r132, %r133, %r134}, [%rd6+16]; ld.global.v4.u32 {%r139, %r140, %r141, %r142}, [%rd8]; ld.global.v4.u32 {%r147, %r148, %r149, %r150}, [%rd9]; ld.global.v4.u32 {%r155, %r156, %r157, %r158}, [%rd10]; cvt.u16.u32 %rs252, %r61; and.b16 %rs253, %rs252, 240; and.b16 %rs254, %rs252, 15; cvt.rn.f32.s16 %f332, %rs254; sub.ftz.f32 %f333, %f332, %f8; mul.ftz.f32 %f334, %f67, %f333; mov.b32 {%rs94, %rs100}, %r115; // begin inline asm { cvt.f32.f16 %f116, %rs94;} // end inline asm fma.rn.ftz.f32 %f335, %f334, %f116, %f326; mov.b32 {%rs95, %rs101}, %r123; // begin inline asm { cvt.f32.f16 %f117, %rs95;} // end inline asm fma.rn.ftz.f32 %f336, %f334, %f117, %f327; mov.b32 {%rs96, %rs102}, %r131; // begin inline asm { cvt.f32.f16 %f118, %rs96;} // end inline asm fma.rn.ftz.f32 %f337, %f334, %f118, %f328; mov.b32 {%rs97, %rs103}, %r139; // begin inline asm { cvt.f32.f16 %f119, %rs97;} // end inline asm fma.rn.ftz.f32 %f338, %f334, %f119, %f329; mov.b32 {%rs98, %rs104}, %r147; // begin inline asm { cvt.f32.f16 %f120, %rs98;} // end inline asm fma.rn.ftz.f32 %f339, %f334, %f120, %f330; mov.b32 {%rs99, %rs105}, %r155; // begin inline asm { cvt.f32.f16 %f121, %rs99;} // end inline asm fma.rn.ftz.f32 %f340, %f334, %f121, %f331; shr.u16 %rs255, %rs253, 4; cvt.rn.f32.s16 %f341, %rs255; sub.ftz.f32 %f342, %f341, %f8; mul.ftz.f32 %f343, %f67, %f342; // begin inline asm { cvt.f32.f16 %f122, %rs100;} // end inline asm fma.rn.ftz.f32 %f344, %f343, %f122, %f335; // begin inline asm { cvt.f32.f16 %f123, %rs101;} // end inline asm fma.rn.ftz.f32 %f345, %f343, 
%f123, %f336; // begin inline asm { cvt.f32.f16 %f124, %rs102;} // end inline asm fma.rn.ftz.f32 %f346, %f343, %f124, %f337; // begin inline asm { cvt.f32.f16 %f125, %rs103;} // end inline asm fma.rn.ftz.f32 %f347, %f343, %f125, %f338; // begin inline asm { cvt.f32.f16 %f126, %rs104;} // end inline asm fma.rn.ftz.f32 %f348, %f343, %f126, %f339; // begin inline asm { cvt.f32.f16 %f127, %rs105;} // end inline asm fma.rn.ftz.f32 %f349, %f343, %f127, %f340; shr.u16 %rs256, %rs252, 8; and.b16 %rs257, %rs256, 15; cvt.rn.f32.s16 %f350, %rs257; sub.ftz.f32 %f351, %f350, %f8; mul.ftz.f32 %f352, %f67, %f351; mov.b32 {%rs106, %rs112}, %r116; // begin inline asm { cvt.f32.f16 %f128, %rs106;} // end inline asm fma.rn.ftz.f32 %f353, %f352, %f128, %f344; mov.b32 {%rs107, %rs113}, %r124; // begin inline asm { cvt.f32.f16 %f129, %rs107;} // end inline asm fma.rn.ftz.f32 %f354, %f352, %f129, %f345; mov.b32 {%rs108, %rs114}, %r132; // begin inline asm { cvt.f32.f16 %f130, %rs108;} // end inline asm fma.rn.ftz.f32 %f355, %f352, %f130, %f346; mov.b32 {%rs109, %rs115}, %r140; // begin inline asm { cvt.f32.f16 %f131, %rs109;} // end inline asm fma.rn.ftz.f32 %f356, %f352, %f131, %f347; mov.b32 {%rs110, %rs116}, %r148; // begin inline asm { cvt.f32.f16 %f132, %rs110;} // end inline asm fma.rn.ftz.f32 %f357, %f352, %f132, %f348; mov.b32 {%rs111, %rs117}, %r156; // begin inline asm { cvt.f32.f16 %f133, %rs111;} // end inline asm fma.rn.ftz.f32 %f358, %f352, %f133, %f349; shr.u16 %rs258, %rs252, 12; cvt.rn.f32.s16 %f359, %rs258; sub.ftz.f32 %f360, %f359, %f8; mul.ftz.f32 %f361, %f67, %f360; // begin inline asm { cvt.f32.f16 %f134, %rs112;} // end inline asm fma.rn.ftz.f32 %f362, %f361, %f134, %f353; // begin inline asm { cvt.f32.f16 %f135, %rs113;} // end inline asm fma.rn.ftz.f32 %f363, %f361, %f135, %f354; // begin inline asm { cvt.f32.f16 %f136, %rs114;} // end inline asm fma.rn.ftz.f32 %f364, %f361, %f136, %f355; // begin inline asm { cvt.f32.f16 %f137, %rs115;} // end inline asm fma.rn.ftz.f32 %f365, %f361, %f137, %f356; // begin inline asm { cvt.f32.f16 %f138, %rs116;} // end inline asm fma.rn.ftz.f32 %f366, %f361, %f138, %f357; // begin inline asm { cvt.f32.f16 %f139, %rs117;} // end inline asm fma.rn.ftz.f32 %f367, %f361, %f139, %f358; shr.u32 %r163, %r61, 16; cvt.u16.u32 %rs259, %r163; and.b16 %rs260, %rs259, 15; cvt.rn.f32.s16 %f368, %rs260; sub.ftz.f32 %f369, %f368, %f8; mul.ftz.f32 %f370, %f67, %f369; mov.b32 {%rs118, %rs124}, %r117; // begin inline asm { cvt.f32.f16 %f140, %rs118;} // end inline asm fma.rn.ftz.f32 %f371, %f370, %f140, %f362; mov.b32 {%rs119, %rs125}, %r125; // begin inline asm { cvt.f32.f16 %f141, %rs119;} // end inline asm fma.rn.ftz.f32 %f372, %f370, %f141, %f363; mov.b32 {%rs120, %rs126}, %r133; // begin inline asm { cvt.f32.f16 %f142, %rs120;} // end inline asm fma.rn.ftz.f32 %f373, %f370, %f142, %f364; mov.b32 {%rs121, %rs127}, %r141; // begin inline asm { cvt.f32.f16 %f143, %rs121;} // end inline asm fma.rn.ftz.f32 %f374, %f370, %f143, %f365; mov.b32 {%rs122, %rs128}, %r149; // begin inline asm { cvt.f32.f16 %f144, %rs122;} // end inline asm fma.rn.ftz.f32 %f375, %f370, %f144, %f366; mov.b32 {%rs123, %rs129}, %r157; // begin inline asm { cvt.f32.f16 %f145, %rs123;} // end inline asm fma.rn.ftz.f32 %f376, %f370, %f145, %f367; cvt.rn.f32.s16 %f377, %rs5; sub.ftz.f32 %f378, %f377, %f8; mul.ftz.f32 %f379, %f67, %f378; // begin inline asm { cvt.f32.f16 %f146, %rs124;} // end inline asm fma.rn.ftz.f32 %f380, %f379, %f146, %f371; // begin inline asm { cvt.f32.f16 %f147, %rs125;} // end 
inline asm fma.rn.ftz.f32 %f381, %f379, %f147, %f372; // begin inline asm { cvt.f32.f16 %f148, %rs126;} // end inline asm fma.rn.ftz.f32 %f382, %f379, %f148, %f373; // begin inline asm { cvt.f32.f16 %f149, %rs127;} // end inline asm fma.rn.ftz.f32 %f383, %f379, %f149, %f374; // begin inline asm { cvt.f32.f16 %f150, %rs128;} // end inline asm fma.rn.ftz.f32 %f384, %f379, %f150, %f375; // begin inline asm { cvt.f32.f16 %f151, %rs129;} // end inline asm fma.rn.ftz.f32 %f385, %f379, %f151, %f376; shr.u32 %r164, %r61, 24; cvt.u16.u32 %rs261, %r164; and.b16 %rs262, %rs261, 15; cvt.rn.f32.s16 %f386, %rs262; sub.ftz.f32 %f387, %f386, %f8; mul.ftz.f32 %f388, %f67, %f387; mov.b32 {%rs130, %rs136}, %r118; // begin inline asm { cvt.f32.f16 %f152, %rs130;} // end inline asm fma.rn.ftz.f32 %f389, %f388, %f152, %f380; mov.b32 {%rs131, %rs137}, %r126; // begin inline asm { cvt.f32.f16 %f153, %rs131;} // end inline asm fma.rn.ftz.f32 %f390, %f388, %f153, %f381; mov.b32 {%rs132, %rs138}, %r134; // begin inline asm { cvt.f32.f16 %f154, %rs132;} // end inline asm fma.rn.ftz.f32 %f391, %f388, %f154, %f382; mov.b32 {%rs133, %rs139}, %r142; // begin inline asm { cvt.f32.f16 %f155, %rs133;} // end inline asm fma.rn.ftz.f32 %f392, %f388, %f155, %f383; mov.b32 {%rs134, %rs140}, %r150; // begin inline asm { cvt.f32.f16 %f156, %rs134;} // end inline asm fma.rn.ftz.f32 %f393, %f388, %f156, %f384; mov.b32 {%rs135, %rs141}, %r158; // begin inline asm { cvt.f32.f16 %f157, %rs135;} // end inline asm fma.rn.ftz.f32 %f394, %f388, %f157, %f385; shr.u32 %r165, %r61, 28; cvt.u16.u32 %rs263, %r165; cvt.rn.f32.s16 %f395, %rs263; sub.ftz.f32 %f396, %f395, %f8; mul.ftz.f32 %f397, %f67, %f396; // begin inline asm { cvt.f32.f16 %f158, %rs136;} // end inline asm fma.rn.ftz.f32 %f398, %f397, %f158, %f389; // begin inline asm { cvt.f32.f16 %f159, %rs137;} // end inline asm fma.rn.ftz.f32 %f399, %f397, %f159, %f390; // begin inline asm { cvt.f32.f16 %f160, %rs138;} // end inline asm fma.rn.ftz.f32 %f400, %f397, %f160, %f391; // begin inline asm { cvt.f32.f16 %f161, %rs139;} // end inline asm fma.rn.ftz.f32 %f401, %f397, %f161, %f392; // begin inline asm { cvt.f32.f16 %f162, %rs140;} // end inline asm fma.rn.ftz.f32 %f402, %f397, %f162, %f393; // begin inline asm { cvt.f32.f16 %f163, %rs141;} // end inline asm fma.rn.ftz.f32 %f403, %f397, %f163, %f394; ld.global.v4.u32 {%r166, %r167, %r168, %r169}, [%rd5+32]; ld.global.v4.u32 {%r174, %r175, %r176, %r177}, [%rd7+16]; ld.global.v4.u32 {%r182, %r183, %r184, %r185}, [%rd6+32]; ld.global.v4.u32 {%r190, %r191, %r192, %r193}, [%rd8+16]; ld.global.v4.u32 {%r198, %r199, %r200, %r201}, [%rd9+16]; ld.global.v4.u32 {%r206, %r207, %r208, %r209}, [%rd10+16]; cvt.u16.u32 %rs264, %r62; and.b16 %rs265, %rs264, 240; and.b16 %rs266, %rs264, 15; cvt.rn.f32.s16 %f404, %rs266; sub.ftz.f32 %f405, %f404, %f8; mul.ftz.f32 %f406, %f67, %f405; mov.b32 {%rs142, %rs148}, %r166; // begin inline asm { cvt.f32.f16 %f164, %rs142;} // end inline asm fma.rn.ftz.f32 %f407, %f406, %f164, %f398; mov.b32 {%rs143, %rs149}, %r174; // begin inline asm { cvt.f32.f16 %f165, %rs143;} // end inline asm fma.rn.ftz.f32 %f408, %f406, %f165, %f399; mov.b32 {%rs144, %rs150}, %r182; // begin inline asm { cvt.f32.f16 %f166, %rs144;} // end inline asm fma.rn.ftz.f32 %f409, %f406, %f166, %f400; mov.b32 {%rs145, %rs151}, %r190; // begin inline asm { cvt.f32.f16 %f167, %rs145;} // end inline asm fma.rn.ftz.f32 %f410, %f406, %f167, %f401; mov.b32 {%rs146, %rs152}, %r198; // begin inline asm { cvt.f32.f16 %f168, %rs146;} // end inline asm 
fma.rn.ftz.f32 %f411, %f406, %f168, %f402; mov.b32 {%rs147, %rs153}, %r206; // begin inline asm { cvt.f32.f16 %f169, %rs147;} // end inline asm fma.rn.ftz.f32 %f412, %f406, %f169, %f403; shr.u16 %rs267, %rs265, 4; cvt.rn.f32.s16 %f413, %rs267; sub.ftz.f32 %f414, %f413, %f8; mul.ftz.f32 %f415, %f67, %f414; // begin inline asm { cvt.f32.f16 %f170, %rs148;} // end inline asm fma.rn.ftz.f32 %f416, %f415, %f170, %f407; // begin inline asm { cvt.f32.f16 %f171, %rs149;} // end inline asm fma.rn.ftz.f32 %f417, %f415, %f171, %f408; // begin inline asm { cvt.f32.f16 %f172, %rs150;} // end inline asm fma.rn.ftz.f32 %f418, %f415, %f172, %f409; // begin inline asm { cvt.f32.f16 %f173, %rs151;} // end inline asm fma.rn.ftz.f32 %f419, %f415, %f173, %f410; // begin inline asm { cvt.f32.f16 %f174, %rs152;} // end inline asm fma.rn.ftz.f32 %f420, %f415, %f174, %f411; // begin inline asm { cvt.f32.f16 %f175, %rs153;} // end inline asm fma.rn.ftz.f32 %f421, %f415, %f175, %f412; shr.u16 %rs268, %rs264, 8; and.b16 %rs269, %rs268, 15; cvt.rn.f32.s16 %f422, %rs269; sub.ftz.f32 %f423, %f422, %f8; mul.ftz.f32 %f424, %f67, %f423; mov.b32 {%rs154, %rs160}, %r167; // begin inline asm { cvt.f32.f16 %f176, %rs154;} // end inline asm fma.rn.ftz.f32 %f425, %f424, %f176, %f416; mov.b32 {%rs155, %rs161}, %r175; // begin inline asm { cvt.f32.f16 %f177, %rs155;} // end inline asm fma.rn.ftz.f32 %f426, %f424, %f177, %f417; mov.b32 {%rs156, %rs162}, %r183; // begin inline asm { cvt.f32.f16 %f178, %rs156;} // end inline asm fma.rn.ftz.f32 %f427, %f424, %f178, %f418; mov.b32 {%rs157, %rs163}, %r191; // begin inline asm { cvt.f32.f16 %f179, %rs157;} // end inline asm fma.rn.ftz.f32 %f428, %f424, %f179, %f419; mov.b32 {%rs158, %rs164}, %r199; // begin inline asm { cvt.f32.f16 %f180, %rs158;} // end inline asm fma.rn.ftz.f32 %f429, %f424, %f180, %f420; mov.b32 {%rs159, %rs165}, %r207; // begin inline asm { cvt.f32.f16 %f181, %rs159;} // end inline asm fma.rn.ftz.f32 %f430, %f424, %f181, %f421; shr.u16 %rs270, %rs264, 12; cvt.rn.f32.s16 %f431, %rs270; sub.ftz.f32 %f432, %f431, %f8; mul.ftz.f32 %f433, %f67, %f432; // begin inline asm { cvt.f32.f16 %f182, %rs160;} // end inline asm fma.rn.ftz.f32 %f434, %f433, %f182, %f425; // begin inline asm { cvt.f32.f16 %f183, %rs161;} // end inline asm fma.rn.ftz.f32 %f435, %f433, %f183, %f426; // begin inline asm { cvt.f32.f16 %f184, %rs162;} // end inline asm fma.rn.ftz.f32 %f436, %f433, %f184, %f427; // begin inline asm { cvt.f32.f16 %f185, %rs163;} // end inline asm fma.rn.ftz.f32 %f437, %f433, %f185, %f428; // begin inline asm { cvt.f32.f16 %f186, %rs164;} // end inline asm fma.rn.ftz.f32 %f438, %f433, %f186, %f429; // begin inline asm { cvt.f32.f16 %f187, %rs165;} // end inline asm fma.rn.ftz.f32 %f439, %f433, %f187, %f430; shr.u32 %r214, %r62, 16; cvt.u16.u32 %rs271, %r214; and.b16 %rs272, %rs271, 15; cvt.rn.f32.s16 %f440, %rs272; sub.ftz.f32 %f441, %f440, %f8; mul.ftz.f32 %f442, %f67, %f441; mov.b32 {%rs166, %rs172}, %r168; // begin inline asm { cvt.f32.f16 %f188, %rs166;} // end inline asm fma.rn.ftz.f32 %f443, %f442, %f188, %f434; mov.b32 {%rs167, %rs173}, %r176; // begin inline asm { cvt.f32.f16 %f189, %rs167;} // end inline asm fma.rn.ftz.f32 %f444, %f442, %f189, %f435; mov.b32 {%rs168, %rs174}, %r184; // begin inline asm { cvt.f32.f16 %f190, %rs168;} // end inline asm fma.rn.ftz.f32 %f445, %f442, %f190, %f436; mov.b32 {%rs169, %rs175}, %r192; // begin inline asm { cvt.f32.f16 %f191, %rs169;} // end inline asm fma.rn.ftz.f32 %f446, %f442, %f191, %f437; mov.b32 {%rs170, %rs176}, %r200; 
// begin inline asm { cvt.f32.f16 %f192, %rs170;} // end inline asm fma.rn.ftz.f32 %f447, %f442, %f192, %f438; mov.b32 {%rs171, %rs177}, %r208; // begin inline asm { cvt.f32.f16 %f193, %rs171;} // end inline asm fma.rn.ftz.f32 %f448, %f442, %f193, %f439; shr.u32 %r215, %r62, 20; cvt.u16.u32 %rs273, %r215; and.b16 %rs274, %rs273, 15; cvt.rn.f32.s16 %f449, %rs274; sub.ftz.f32 %f450, %f449, %f8; mul.ftz.f32 %f451, %f67, %f450; // begin inline asm { cvt.f32.f16 %f194, %rs172;} // end inline asm fma.rn.ftz.f32 %f452, %f451, %f194, %f443; // begin inline asm { cvt.f32.f16 %f195, %rs173;} // end inline asm fma.rn.ftz.f32 %f453, %f451, %f195, %f444; // begin inline asm { cvt.f32.f16 %f196, %rs174;} // end inline asm fma.rn.ftz.f32 %f454, %f451, %f196, %f445; // begin inline asm { cvt.f32.f16 %f197, %rs175;} // end inline asm fma.rn.ftz.f32 %f455, %f451, %f197, %f446; // begin inline asm { cvt.f32.f16 %f198, %rs176;} // end inline asm fma.rn.ftz.f32 %f456, %f451, %f198, %f447; // begin inline asm { cvt.f32.f16 %f199, %rs177;} // end inline asm fma.rn.ftz.f32 %f457, %f451, %f199, %f448; cvt.rn.f32.s16 %f458, %rs6; sub.ftz.f32 %f459, %f458, %f8; mul.ftz.f32 %f460, %f67, %f459; mov.b32 {%rs178, %rs184}, %r169; // begin inline asm { cvt.f32.f16 %f200, %rs178;} // end inline asm fma.rn.ftz.f32 %f461, %f460, %f200, %f452; mov.b32 {%rs179, %rs185}, %r177; // begin inline asm { cvt.f32.f16 %f201, %rs179;} // end inline asm fma.rn.ftz.f32 %f462, %f460, %f201, %f453; mov.b32 {%rs180, %rs186}, %r185; // begin inline asm { cvt.f32.f16 %f202, %rs180;} // end inline asm fma.rn.ftz.f32 %f463, %f460, %f202, %f454; mov.b32 {%rs181, %rs187}, %r193; // begin inline asm { cvt.f32.f16 %f203, %rs181;} // end inline asm fma.rn.ftz.f32 %f464, %f460, %f203, %f455; mov.b32 {%rs182, %rs188}, %r201; // begin inline asm { cvt.f32.f16 %f204, %rs182;} // end inline asm fma.rn.ftz.f32 %f465, %f460, %f204, %f456; mov.b32 {%rs183, %rs189}, %r209; // begin inline asm { cvt.f32.f16 %f205, %rs183;} // end inline asm fma.rn.ftz.f32 %f466, %f460, %f205, %f457; shr.u32 %r216, %r62, 28; cvt.u16.u32 %rs275, %r216; cvt.rn.f32.s16 %f467, %rs275; sub.ftz.f32 %f468, %f467, %f8; mul.ftz.f32 %f469, %f67, %f468; // begin inline asm { cvt.f32.f16 %f206, %rs184;} // end inline asm fma.rn.ftz.f32 %f470, %f469, %f206, %f461; // begin inline asm { cvt.f32.f16 %f207, %rs185;} // end inline asm fma.rn.ftz.f32 %f471, %f469, %f207, %f462; // begin inline asm { cvt.f32.f16 %f208, %rs186;} // end inline asm fma.rn.ftz.f32 %f472, %f469, %f208, %f463; // begin inline asm { cvt.f32.f16 %f209, %rs187;} // end inline asm fma.rn.ftz.f32 %f473, %f469, %f209, %f464; // begin inline asm { cvt.f32.f16 %f210, %rs188;} // end inline asm fma.rn.ftz.f32 %f474, %f469, %f210, %f465; // begin inline asm { cvt.f32.f16 %f211, %rs189;} // end inline asm fma.rn.ftz.f32 %f475, %f469, %f211, %f466; ld.global.v4.u32 {%r217, %r218, %r219, %r220}, [%rd5+48]; ld.global.v4.u32 {%r225, %r226, %r227, %r228}, [%rd7+32]; ld.global.v4.u32 {%r233, %r234, %r235, %r236}, [%rd6+48]; ld.global.v4.u32 {%r241, %r242, %r243, %r244}, [%rd8+32]; ld.global.v4.u32 {%r249, %r250, %r251, %r252}, [%rd9+32]; ld.global.v4.u32 {%r257, %r258, %r259, %r260}, [%rd10+32]; and.b16 %rs277, %rs41, 15; cvt.rn.f32.s16 %f476, %rs277; sub.ftz.f32 %f477, %f476, %f8; mul.ftz.f32 %f478, %f67, %f477; mov.b32 {%rs190, %rs196}, %r217; // begin inline asm { cvt.f32.f16 %f212, %rs190;} // end inline asm fma.rn.ftz.f32 %f479, %f478, %f212, %f470; mov.b32 {%rs191, %rs197}, %r225; // begin inline asm { cvt.f32.f16 %f213, 
%rs191;} // end inline asm fma.rn.ftz.f32 %f480, %f478, %f213, %f471; mov.b32 {%rs192, %rs198}, %r233; // begin inline asm { cvt.f32.f16 %f214, %rs192;} // end inline asm fma.rn.ftz.f32 %f481, %f478, %f214, %f472; mov.b32 {%rs193, %rs199}, %r241; // begin inline asm { cvt.f32.f16 %f215, %rs193;} // end inline asm fma.rn.ftz.f32 %f482, %f478, %f215, %f473; mov.b32 {%rs194, %rs200}, %r249; // begin inline asm { cvt.f32.f16 %f216, %rs194;} // end inline asm fma.rn.ftz.f32 %f483, %f478, %f216, %f474; mov.b32 {%rs195, %rs201}, %r257; // begin inline asm { cvt.f32.f16 %f217, %rs195;} // end inline asm fma.rn.ftz.f32 %f484, %f478, %f217, %f475; cvt.rn.f32.s16 %f485, %rs7; sub.ftz.f32 %f486, %f485, %f8; mul.ftz.f32 %f487, %f67, %f486; // begin inline asm { cvt.f32.f16 %f218, %rs196;} // end inline asm fma.rn.ftz.f32 %f488, %f487, %f218, %f479; // begin inline asm { cvt.f32.f16 %f219, %rs197;} // end inline asm fma.rn.ftz.f32 %f489, %f487, %f219, %f480; // begin inline asm { cvt.f32.f16 %f220, %rs198;} // end inline asm fma.rn.ftz.f32 %f490, %f487, %f220, %f481; // begin inline asm { cvt.f32.f16 %f221, %rs199;} // end inline asm fma.rn.ftz.f32 %f491, %f487, %f221, %f482; // begin inline asm { cvt.f32.f16 %f222, %rs200;} // end inline asm fma.rn.ftz.f32 %f492, %f487, %f222, %f483; // begin inline asm { cvt.f32.f16 %f223, %rs201;} // end inline asm fma.rn.ftz.f32 %f493, %f487, %f223, %f484; cvt.rn.f32.s16 %f494, %rs8; sub.ftz.f32 %f495, %f494, %f8; mul.ftz.f32 %f496, %f67, %f495; mov.b32 {%rs202, %rs208}, %r218; // begin inline asm { cvt.f32.f16 %f224, %rs202;} // end inline asm fma.rn.ftz.f32 %f497, %f496, %f224, %f488; mov.b32 {%rs203, %rs209}, %r226; // begin inline asm { cvt.f32.f16 %f225, %rs203;} // end inline asm fma.rn.ftz.f32 %f498, %f496, %f225, %f489; mov.b32 {%rs204, %rs210}, %r234; // begin inline asm { cvt.f32.f16 %f226, %rs204;} // end inline asm fma.rn.ftz.f32 %f499, %f496, %f226, %f490; mov.b32 {%rs205, %rs211}, %r242; // begin inline asm { cvt.f32.f16 %f227, %rs205;} // end inline asm fma.rn.ftz.f32 %f500, %f496, %f227, %f491; mov.b32 {%rs206, %rs212}, %r250; // begin inline asm { cvt.f32.f16 %f228, %rs206;} // end inline asm fma.rn.ftz.f32 %f501, %f496, %f228, %f492; mov.b32 {%rs207, %rs213}, %r258; // begin inline asm { cvt.f32.f16 %f229, %rs207;} // end inline asm fma.rn.ftz.f32 %f502, %f496, %f229, %f493; cvt.rn.f32.s16 %f503, %rs9; sub.ftz.f32 %f504, %f503, %f8; mul.ftz.f32 %f505, %f67, %f504; // begin inline asm { cvt.f32.f16 %f230, %rs208;} // end inline asm fma.rn.ftz.f32 %f506, %f505, %f230, %f497; // begin inline asm { cvt.f32.f16 %f231, %rs209;} // end inline asm fma.rn.ftz.f32 %f507, %f505, %f231, %f498; // begin inline asm { cvt.f32.f16 %f232, %rs210;} // end inline asm fma.rn.ftz.f32 %f508, %f505, %f232, %f499; // begin inline asm { cvt.f32.f16 %f233, %rs211;} // end inline asm fma.rn.ftz.f32 %f509, %f505, %f233, %f500; // begin inline asm { cvt.f32.f16 %f234, %rs212;} // end inline asm fma.rn.ftz.f32 %f510, %f505, %f234, %f501; // begin inline asm { cvt.f32.f16 %f235, %rs213;} // end inline asm fma.rn.ftz.f32 %f511, %f505, %f235, %f502; cvt.rn.f32.s16 %f512, %rs10; sub.ftz.f32 %f513, %f512, %f8; mul.ftz.f32 %f514, %f67, %f513; mov.b32 {%rs214, %rs220}, %r219; // begin inline asm { cvt.f32.f16 %f236, %rs214;} // end inline asm fma.rn.ftz.f32 %f515, %f514, %f236, %f506; mov.b32 {%rs215, %rs221}, %r227; // begin inline asm { cvt.f32.f16 %f237, %rs215;} // end inline asm fma.rn.ftz.f32 %f516, %f514, %f237, %f507; mov.b32 {%rs216, %rs222}, %r235; // begin inline asm { 
cvt.f32.f16 %f238, %rs216;} // end inline asm fma.rn.ftz.f32 %f517, %f514, %f238, %f508; mov.b32 {%rs217, %rs223}, %r243; // begin inline asm { cvt.f32.f16 %f239, %rs217;} // end inline asm fma.rn.ftz.f32 %f518, %f514, %f239, %f509; mov.b32 {%rs218, %rs224}, %r251; // begin inline asm { cvt.f32.f16 %f240, %rs218;} // end inline asm fma.rn.ftz.f32 %f519, %f514, %f240, %f510; mov.b32 {%rs219, %rs225}, %r259; // begin inline asm { cvt.f32.f16 %f241, %rs219;} // end inline asm fma.rn.ftz.f32 %f520, %f514, %f241, %f511; cvt.rn.f32.s16 %f521, %rs11; sub.ftz.f32 %f522, %f521, %f8; mul.ftz.f32 %f523, %f67, %f522; // begin inline asm { cvt.f32.f16 %f242, %rs220;} // end inline asm fma.rn.ftz.f32 %f524, %f523, %f242, %f515; // begin inline asm { cvt.f32.f16 %f243, %rs221;} // end inline asm fma.rn.ftz.f32 %f525, %f523, %f243, %f516; // begin inline asm { cvt.f32.f16 %f244, %rs222;} // end inline asm fma.rn.ftz.f32 %f526, %f523, %f244, %f517; // begin inline asm { cvt.f32.f16 %f245, %rs223;} // end inline asm fma.rn.ftz.f32 %f527, %f523, %f245, %f518; // begin inline asm { cvt.f32.f16 %f246, %rs224;} // end inline asm fma.rn.ftz.f32 %f528, %f523, %f246, %f519; // begin inline asm { cvt.f32.f16 %f247, %rs225;} // end inline asm fma.rn.ftz.f32 %f529, %f523, %f247, %f520; cvt.rn.f32.s16 %f530, %rs13; sub.ftz.f32 %f531, %f530, %f8; mul.ftz.f32 %f532, %f67, %f531; mov.b32 {%rs226, %rs232}, %r220; // begin inline asm { cvt.f32.f16 %f248, %rs226;} // end inline asm fma.rn.ftz.f32 %f533, %f532, %f248, %f524; mov.b32 {%rs227, %rs233}, %r228; // begin inline asm { cvt.f32.f16 %f249, %rs227;} // end inline asm fma.rn.ftz.f32 %f534, %f532, %f249, %f525; mov.b32 {%rs228, %rs234}, %r236; // begin inline asm { cvt.f32.f16 %f250, %rs228;} // end inline asm fma.rn.ftz.f32 %f535, %f532, %f250, %f526; mov.b32 {%rs229, %rs235}, %r244; // begin inline asm { cvt.f32.f16 %f251, %rs229;} // end inline asm fma.rn.ftz.f32 %f536, %f532, %f251, %f527; mov.b32 {%rs230, %rs236}, %r252; // begin inline asm { cvt.f32.f16 %f252, %rs230;} // end inline asm fma.rn.ftz.f32 %f537, %f532, %f252, %f528; mov.b32 {%rs231, %rs237}, %r260; // begin inline asm { cvt.f32.f16 %f253, %rs231;} // end inline asm fma.rn.ftz.f32 %f538, %f532, %f253, %f529; cvt.rn.f32.s16 %f539, %rs14; sub.ftz.f32 %f540, %f539, %f8; mul.ftz.f32 %f541, %f67, %f540; // begin inline asm { cvt.f32.f16 %f254, %rs232;} // end inline asm fma.rn.ftz.f32 %f1165, %f541, %f254, %f533; // begin inline asm { cvt.f32.f16 %f255, %rs233;} // end inline asm fma.rn.ftz.f32 %f1164, %f541, %f255, %f534; // begin inline asm { cvt.f32.f16 %f256, %rs234;} // end inline asm fma.rn.ftz.f32 %f1163, %f541, %f256, %f535; // begin inline asm { cvt.f32.f16 %f257, %rs235;} // end inline asm fma.rn.ftz.f32 %f1162, %f541, %f257, %f536; // begin inline asm { cvt.f32.f16 %f258, %rs236;} // end inline asm fma.rn.ftz.f32 %f1161, %f541, %f258, %f537; // begin inline asm { cvt.f32.f16 %f259, %rs237;} // end inline asm fma.rn.ftz.f32 %f1160, %f541, %f259, %f538; bra.uni $L__BB0_8; $L__BB0_7: cvt.u16.u32 %rs470, %r60; shl.b16 %rs471, %rs470, 4; cvt.s16.s8 %rs472, %rs471; shr.s16 %rs473, %rs472, 7; and.b16 %rs474, %rs473, -16; and.b16 %rs475, %rs470, 15; or.b16 %rs476, %rs474, %rs475; cvt.rn.f32.s16 %f734, %rs476; sub.ftz.f32 %f735, %f734, %f8; mul.ftz.f32 %f736, %f67, %f735; mov.b32 {%rs278, %rs284}, %r74; // begin inline asm { cvt.f32.f16 %f542, %rs278;} // end inline asm fma.rn.ftz.f32 %f737, %f736, %f542, %f1165; mov.b32 {%rs279, %rs285}, %r78; // begin inline asm { cvt.f32.f16 %f543, %rs279;} // end 
inline asm fma.rn.ftz.f32 %f738, %f736, %f543, %f1164; mov.b32 {%rs280, %rs286}, %r84; // begin inline asm { cvt.f32.f16 %f544, %rs280;} // end inline asm fma.rn.ftz.f32 %f739, %f736, %f544, %f1163; mov.b32 {%rs281, %rs287}, %r88; // begin inline asm { cvt.f32.f16 %f545, %rs281;} // end inline asm fma.rn.ftz.f32 %f740, %f736, %f545, %f1162; mov.b32 {%rs282, %rs288}, %r92; // begin inline asm { cvt.f32.f16 %f546, %rs282;} // end inline asm fma.rn.ftz.f32 %f741, %f736, %f546, %f1161; mov.b32 {%rs283, %rs289}, %r96; // begin inline asm { cvt.f32.f16 %f547, %rs283;} // end inline asm fma.rn.ftz.f32 %f742, %f736, %f547, %f1160; shr.u32 %r265, %r60, 4; cvt.u16.u32 %rs477, %r265; shl.b16 %rs478, %rs477, 4; cvt.s16.s8 %rs479, %rs478; shr.s16 %rs480, %rs479, 7; and.b16 %rs481, %rs480, -16; and.b16 %rs482, %rs477, 15; or.b16 %rs483, %rs481, %rs482; cvt.rn.f32.s16 %f743, %rs483; sub.ftz.f32 %f744, %f743, %f8; mul.ftz.f32 %f745, %f67, %f744; // begin inline asm { cvt.f32.f16 %f548, %rs284;} // end inline asm fma.rn.ftz.f32 %f746, %f745, %f548, %f737; // begin inline asm { cvt.f32.f16 %f549, %rs285;} // end inline asm fma.rn.ftz.f32 %f747, %f745, %f549, %f738; // begin inline asm { cvt.f32.f16 %f550, %rs286;} // end inline asm fma.rn.ftz.f32 %f748, %f745, %f550, %f739; // begin inline asm { cvt.f32.f16 %f551, %rs287;} // end inline asm fma.rn.ftz.f32 %f749, %f745, %f551, %f740; // begin inline asm { cvt.f32.f16 %f552, %rs288;} // end inline asm fma.rn.ftz.f32 %f750, %f745, %f552, %f741; // begin inline asm { cvt.f32.f16 %f553, %rs289;} // end inline asm fma.rn.ftz.f32 %f751, %f745, %f553, %f742; shr.u32 %r266, %r60, 8; cvt.u16.u32 %rs484, %r266; shl.b16 %rs485, %rs484, 4; cvt.s16.s8 %rs486, %rs485; shr.s16 %rs487, %rs486, 7; and.b16 %rs488, %rs487, -16; and.b16 %rs489, %rs484, 15; or.b16 %rs490, %rs488, %rs489; cvt.rn.f32.s16 %f752, %rs490; sub.ftz.f32 %f753, %f752, %f8; mul.ftz.f32 %f754, %f67, %f753; mov.b32 {%rs290, %rs296}, %r75; // begin inline asm { cvt.f32.f16 %f554, %rs290;} // end inline asm fma.rn.ftz.f32 %f755, %f754, %f554, %f746; mov.b32 {%rs291, %rs297}, %r79; // begin inline asm { cvt.f32.f16 %f555, %rs291;} // end inline asm fma.rn.ftz.f32 %f756, %f754, %f555, %f747; mov.b32 {%rs292, %rs298}, %r85; // begin inline asm { cvt.f32.f16 %f556, %rs292;} // end inline asm fma.rn.ftz.f32 %f757, %f754, %f556, %f748; mov.b32 {%rs293, %rs299}, %r89; // begin inline asm { cvt.f32.f16 %f557, %rs293;} // end inline asm fma.rn.ftz.f32 %f758, %f754, %f557, %f749; mov.b32 {%rs294, %rs300}, %r93; // begin inline asm { cvt.f32.f16 %f558, %rs294;} // end inline asm fma.rn.ftz.f32 %f759, %f754, %f558, %f750; mov.b32 {%rs295, %rs301}, %r97; // begin inline asm { cvt.f32.f16 %f559, %rs295;} // end inline asm fma.rn.ftz.f32 %f760, %f754, %f559, %f751; shr.u32 %r267, %r60, 12; cvt.u16.u32 %rs491, %r267; shl.b16 %rs492, %rs491, 4; cvt.s16.s8 %rs493, %rs492; shr.s16 %rs494, %rs493, 7; and.b16 %rs495, %rs494, -16; and.b16 %rs496, %rs491, 15; or.b16 %rs497, %rs495, %rs496; cvt.rn.f32.s16 %f761, %rs497; sub.ftz.f32 %f762, %f761, %f8; mul.ftz.f32 %f763, %f67, %f762; // begin inline asm { cvt.f32.f16 %f560, %rs296;} // end inline asm fma.rn.ftz.f32 %f764, %f763, %f560, %f755; // begin inline asm { cvt.f32.f16 %f561, %rs297;} // end inline asm fma.rn.ftz.f32 %f765, %f763, %f561, %f756; // begin inline asm { cvt.f32.f16 %f562, %rs298;} // end inline asm fma.rn.ftz.f32 %f766, %f763, %f562, %f757; // begin inline asm { cvt.f32.f16 %f563, %rs299;} // end inline asm fma.rn.ftz.f32 %f767, %f763, %f563, %f758; // begin inline 
asm { cvt.f32.f16 %f564, %rs300;} // end inline asm fma.rn.ftz.f32 %f768, %f763, %f564, %f759; // begin inline asm { cvt.f32.f16 %f565, %rs301;} // end inline asm fma.rn.ftz.f32 %f769, %f763, %f565, %f760; shr.u32 %r268, %r60, 16; cvt.u16.u32 %rs498, %r268; shl.b16 %rs499, %rs498, 4; cvt.s16.s8 %rs500, %rs499; shr.s16 %rs501, %rs500, 7; and.b16 %rs502, %rs501, -16; and.b16 %rs503, %rs498, 15; or.b16 %rs504, %rs502, %rs503; cvt.rn.f32.s16 %f770, %rs504; sub.ftz.f32 %f771, %f770, %f8; mul.ftz.f32 %f772, %f67, %f771; mov.b32 {%rs302, %rs308}, %r76; // begin inline asm { cvt.f32.f16 %f566, %rs302;} // end inline asm fma.rn.ftz.f32 %f773, %f772, %f566, %f764; mov.b32 {%rs303, %rs309}, %r80; // begin inline asm { cvt.f32.f16 %f567, %rs303;} // end inline asm fma.rn.ftz.f32 %f774, %f772, %f567, %f765; mov.b32 {%rs304, %rs310}, %r86; // begin inline asm { cvt.f32.f16 %f568, %rs304;} // end inline asm fma.rn.ftz.f32 %f775, %f772, %f568, %f766; mov.b32 {%rs305, %rs311}, %r90; // begin inline asm { cvt.f32.f16 %f569, %rs305;} // end inline asm fma.rn.ftz.f32 %f776, %f772, %f569, %f767; mov.b32 {%rs306, %rs312}, %r94; // begin inline asm { cvt.f32.f16 %f570, %rs306;} // end inline asm fma.rn.ftz.f32 %f777, %f772, %f570, %f768; mov.b32 {%rs307, %rs313}, %r98; // begin inline asm { cvt.f32.f16 %f571, %rs307;} // end inline asm fma.rn.ftz.f32 %f778, %f772, %f571, %f769; shr.u32 %r269, %r60, 20; cvt.u16.u32 %rs505, %r269; shl.b16 %rs506, %rs505, 4; cvt.s16.s8 %rs507, %rs506; shr.s16 %rs508, %rs507, 7; and.b16 %rs509, %rs508, -16; and.b16 %rs510, %rs505, 15; or.b16 %rs511, %rs509, %rs510; cvt.rn.f32.s16 %f779, %rs511; sub.ftz.f32 %f780, %f779, %f8; mul.ftz.f32 %f781, %f67, %f780; // begin inline asm { cvt.f32.f16 %f572, %rs308;} // end inline asm fma.rn.ftz.f32 %f782, %f781, %f572, %f773; // begin inline asm { cvt.f32.f16 %f573, %rs309;} // end inline asm fma.rn.ftz.f32 %f783, %f781, %f573, %f774; // begin inline asm { cvt.f32.f16 %f574, %rs310;} // end inline asm fma.rn.ftz.f32 %f784, %f781, %f574, %f775; // begin inline asm { cvt.f32.f16 %f575, %rs311;} // end inline asm fma.rn.ftz.f32 %f785, %f781, %f575, %f776; // begin inline asm { cvt.f32.f16 %f576, %rs312;} // end inline asm fma.rn.ftz.f32 %f786, %f781, %f576, %f777; // begin inline asm { cvt.f32.f16 %f577, %rs313;} // end inline asm fma.rn.ftz.f32 %f787, %f781, %f577, %f778; shr.u32 %r270, %r60, 24; cvt.u16.u32 %rs512, %r270; shl.b16 %rs513, %rs512, 4; cvt.s16.s8 %rs514, %rs513; shr.s16 %rs515, %rs514, 7; and.b16 %rs516, %rs515, -16; and.b16 %rs517, %rs512, 15; or.b16 %rs518, %rs516, %rs517; cvt.rn.f32.s16 %f788, %rs518; sub.ftz.f32 %f789, %f788, %f8; mul.ftz.f32 %f790, %f67, %f789; mov.b32 {%rs314, %rs320}, %r77; // begin inline asm { cvt.f32.f16 %f578, %rs314;} // end inline asm fma.rn.ftz.f32 %f791, %f790, %f578, %f782; mov.b32 {%rs315, %rs321}, %r81; // begin inline asm { cvt.f32.f16 %f579, %rs315;} // end inline asm fma.rn.ftz.f32 %f792, %f790, %f579, %f783; mov.b32 {%rs316, %rs322}, %r87; // begin inline asm { cvt.f32.f16 %f580, %rs316;} // end inline asm fma.rn.ftz.f32 %f793, %f790, %f580, %f784; mov.b32 {%rs317, %rs323}, %r91; // begin inline asm { cvt.f32.f16 %f581, %rs317;} // end inline asm fma.rn.ftz.f32 %f794, %f790, %f581, %f785; mov.b32 {%rs318, %rs324}, %r95; // begin inline asm { cvt.f32.f16 %f582, %rs318;} // end inline asm fma.rn.ftz.f32 %f795, %f790, %f582, %f786; mov.b32 {%rs319, %rs325}, %r99; // begin inline asm { cvt.f32.f16 %f583, %rs319;} // end inline asm fma.rn.ftz.f32 %f796, %f790, %f583, %f787; shr.u32 %r271, %r60, 28; 
cvt.u16.u32 %rs519, %r271; shl.b16 %rs520, %rs519, 4; cvt.s16.s8 %rs521, %rs520; shr.s16 %rs522, %rs521, 7; and.b16 %rs523, %rs522, -16; or.b16 %rs524, %rs523, %rs519; cvt.rn.f32.s16 %f797, %rs524; sub.ftz.f32 %f798, %f797, %f8; mul.ftz.f32 %f799, %f67, %f798; // begin inline asm { cvt.f32.f16 %f584, %rs320;} // end inline asm fma.rn.ftz.f32 %f800, %f799, %f584, %f791; // begin inline asm { cvt.f32.f16 %f585, %rs321;} // end inline asm fma.rn.ftz.f32 %f801, %f799, %f585, %f792; // begin inline asm { cvt.f32.f16 %f586, %rs322;} // end inline asm fma.rn.ftz.f32 %f802, %f799, %f586, %f793; // begin inline asm { cvt.f32.f16 %f587, %rs323;} // end inline asm fma.rn.ftz.f32 %f803, %f799, %f587, %f794; // begin inline asm { cvt.f32.f16 %f588, %rs324;} // end inline asm fma.rn.ftz.f32 %f804, %f799, %f588, %f795; // begin inline asm { cvt.f32.f16 %f589, %rs325;} // end inline asm fma.rn.ftz.f32 %f805, %f799, %f589, %f796; ld.global.v4.u32 {%r272, %r273, %r274, %r275}, [%rd5+16]; ld.global.v4.u32 {%r280, %r281, %r282, %r283}, [%rd7]; ld.global.v4.u32 {%r288, %r289, %r290, %r291}, [%rd6+16]; ld.global.v4.u32 {%r296, %r297, %r298, %r299}, [%rd8]; ld.global.v4.u32 {%r304, %r305, %r306, %r307}, [%rd9]; ld.global.v4.u32 {%r312, %r313, %r314, %r315}, [%rd10]; cvt.u16.u32 %rs525, %r61; shl.b16 %rs526, %rs525, 4; cvt.s16.s8 %rs527, %rs526; shr.s16 %rs528, %rs527, 7; and.b16 %rs529, %rs528, -16; and.b16 %rs530, %rs525, 15; or.b16 %rs531, %rs529, %rs530; cvt.rn.f32.s16 %f806, %rs531; sub.ftz.f32 %f807, %f806, %f8; mul.ftz.f32 %f808, %f67, %f807; mov.b32 {%rs326, %rs332}, %r272; // begin inline asm { cvt.f32.f16 %f590, %rs326;} // end inline asm fma.rn.ftz.f32 %f809, %f808, %f590, %f800; mov.b32 {%rs327, %rs333}, %r280; // begin inline asm { cvt.f32.f16 %f591, %rs327;} // end inline asm fma.rn.ftz.f32 %f810, %f808, %f591, %f801; mov.b32 {%rs328, %rs334}, %r288; // begin inline asm { cvt.f32.f16 %f592, %rs328;} // end inline asm fma.rn.ftz.f32 %f811, %f808, %f592, %f802; mov.b32 {%rs329, %rs335}, %r296; // begin inline asm { cvt.f32.f16 %f593, %rs329;} // end inline asm fma.rn.ftz.f32 %f812, %f808, %f593, %f803; mov.b32 {%rs330, %rs336}, %r304; // begin inline asm { cvt.f32.f16 %f594, %rs330;} // end inline asm fma.rn.ftz.f32 %f813, %f808, %f594, %f804; mov.b32 {%rs331, %rs337}, %r312; // begin inline asm { cvt.f32.f16 %f595, %rs331;} // end inline asm fma.rn.ftz.f32 %f814, %f808, %f595, %f805; shr.u32 %r320, %r61, 4; cvt.u16.u32 %rs532, %r320; shl.b16 %rs533, %rs532, 4; cvt.s16.s8 %rs534, %rs533; shr.s16 %rs535, %rs534, 7; and.b16 %rs536, %rs535, -16; and.b16 %rs537, %rs532, 15; or.b16 %rs538, %rs536, %rs537; cvt.rn.f32.s16 %f815, %rs538; sub.ftz.f32 %f816, %f815, %f8; mul.ftz.f32 %f817, %f67, %f816; // begin inline asm { cvt.f32.f16 %f596, %rs332;} // end inline asm fma.rn.ftz.f32 %f818, %f817, %f596, %f809; // begin inline asm { cvt.f32.f16 %f597, %rs333;} // end inline asm fma.rn.ftz.f32 %f819, %f817, %f597, %f810; // begin inline asm { cvt.f32.f16 %f598, %rs334;} // end inline asm fma.rn.ftz.f32 %f820, %f817, %f598, %f811; // begin inline asm { cvt.f32.f16 %f599, %rs335;} // end inline asm fma.rn.ftz.f32 %f821, %f817, %f599, %f812; // begin inline asm { cvt.f32.f16 %f600, %rs336;} // end inline asm fma.rn.ftz.f32 %f822, %f817, %f600, %f813; // begin inline asm { cvt.f32.f16 %f601, %rs337;} // end inline asm fma.rn.ftz.f32 %f823, %f817, %f601, %f814; shr.u32 %r321, %r61, 8; cvt.u16.u32 %rs539, %r321; shl.b16 %rs540, %rs539, 4; cvt.s16.s8 %rs541, %rs540; shr.s16 %rs542, %rs541, 7; and.b16 %rs543, %rs542, 
-16; and.b16 %rs544, %rs539, 15; or.b16 %rs545, %rs543, %rs544; cvt.rn.f32.s16 %f824, %rs545; sub.ftz.f32 %f825, %f824, %f8; mul.ftz.f32 %f826, %f67, %f825; mov.b32 {%rs338, %rs344}, %r273; // begin inline asm { cvt.f32.f16 %f602, %rs338;} // end inline asm fma.rn.ftz.f32 %f827, %f826, %f602, %f818; mov.b32 {%rs339, %rs345}, %r281; // begin inline asm { cvt.f32.f16 %f603, %rs339;} // end inline asm fma.rn.ftz.f32 %f828, %f826, %f603, %f819; mov.b32 {%rs340, %rs346}, %r289; // begin inline asm { cvt.f32.f16 %f604, %rs340;} // end inline asm fma.rn.ftz.f32 %f829, %f826, %f604, %f820; mov.b32 {%rs341, %rs347}, %r297; // begin inline asm { cvt.f32.f16 %f605, %rs341;} // end inline asm fma.rn.ftz.f32 %f830, %f826, %f605, %f821; mov.b32 {%rs342, %rs348}, %r305; // begin inline asm { cvt.f32.f16 %f606, %rs342;} // end inline asm fma.rn.ftz.f32 %f831, %f826, %f606, %f822; mov.b32 {%rs343, %rs349}, %r313; // begin inline asm { cvt.f32.f16 %f607, %rs343;} // end inline asm fma.rn.ftz.f32 %f832, %f826, %f607, %f823; shr.u32 %r322, %r61, 12; cvt.u16.u32 %rs546, %r322; shl.b16 %rs547, %rs546, 4; cvt.s16.s8 %rs548, %rs547; shr.s16 %rs549, %rs548, 7; and.b16 %rs550, %rs549, -16; and.b16 %rs551, %rs546, 15; or.b16 %rs552, %rs550, %rs551; cvt.rn.f32.s16 %f833, %rs552; sub.ftz.f32 %f834, %f833, %f8; mul.ftz.f32 %f835, %f67, %f834; // begin inline asm { cvt.f32.f16 %f608, %rs344;} // end inline asm fma.rn.ftz.f32 %f836, %f835, %f608, %f827; // begin inline asm { cvt.f32.f16 %f609, %rs345;} // end inline asm fma.rn.ftz.f32 %f837, %f835, %f609, %f828; // begin inline asm { cvt.f32.f16 %f610, %rs346;} // end inline asm fma.rn.ftz.f32 %f838, %f835, %f610, %f829; // begin inline asm { cvt.f32.f16 %f611, %rs347;} // end inline asm fma.rn.ftz.f32 %f839, %f835, %f611, %f830; // begin inline asm { cvt.f32.f16 %f612, %rs348;} // end inline asm fma.rn.ftz.f32 %f840, %f835, %f612, %f831; // begin inline asm { cvt.f32.f16 %f613, %rs349;} // end inline asm fma.rn.ftz.f32 %f841, %f835, %f613, %f832; shr.u32 %r323, %r61, 16; cvt.u16.u32 %rs553, %r323; shl.b16 %rs554, %rs553, 4; cvt.s16.s8 %rs555, %rs554; shr.s16 %rs556, %rs555, 7; and.b16 %rs557, %rs556, -16; and.b16 %rs558, %rs553, 15; or.b16 %rs559, %rs557, %rs558; cvt.rn.f32.s16 %f842, %rs559; sub.ftz.f32 %f843, %f842, %f8; mul.ftz.f32 %f844, %f67, %f843; mov.b32 {%rs350, %rs356}, %r274; // begin inline asm { cvt.f32.f16 %f614, %rs350;} // end inline asm fma.rn.ftz.f32 %f845, %f844, %f614, %f836; mov.b32 {%rs351, %rs357}, %r282; // begin inline asm { cvt.f32.f16 %f615, %rs351;} // end inline asm fma.rn.ftz.f32 %f846, %f844, %f615, %f837; mov.b32 {%rs352, %rs358}, %r290; // begin inline asm { cvt.f32.f16 %f616, %rs352;} // end inline asm fma.rn.ftz.f32 %f847, %f844, %f616, %f838; mov.b32 {%rs353, %rs359}, %r298; // begin inline asm { cvt.f32.f16 %f617, %rs353;} // end inline asm fma.rn.ftz.f32 %f848, %f844, %f617, %f839; mov.b32 {%rs354, %rs360}, %r306; // begin inline asm { cvt.f32.f16 %f618, %rs354;} // end inline asm fma.rn.ftz.f32 %f849, %f844, %f618, %f840; mov.b32 {%rs355, %rs361}, %r314; // begin inline asm { cvt.f32.f16 %f619, %rs355;} // end inline asm fma.rn.ftz.f32 %f850, %f844, %f619, %f841; shl.b16 %rs561, %rs39, 4; cvt.s16.s8 %rs562, %rs561; shr.s16 %rs563, %rs562, 7; and.b16 %rs564, %rs563, -16; or.b16 %rs565, %rs564, %rs5; cvt.rn.f32.s16 %f851, %rs565; sub.ftz.f32 %f852, %f851, %f8; mul.ftz.f32 %f853, %f67, %f852; // begin inline asm { cvt.f32.f16 %f620, %rs356;} // end inline asm fma.rn.ftz.f32 %f854, %f853, %f620, %f845; // begin inline asm { cvt.f32.f16 
%f621, %rs357;} // end inline asm fma.rn.ftz.f32 %f855, %f853, %f621, %f846; // begin inline asm { cvt.f32.f16 %f622, %rs358;} // end inline asm fma.rn.ftz.f32 %f856, %f853, %f622, %f847; // begin inline asm { cvt.f32.f16 %f623, %rs359;} // end inline asm fma.rn.ftz.f32 %f857, %f853, %f623, %f848; // begin inline asm { cvt.f32.f16 %f624, %rs360;} // end inline asm fma.rn.ftz.f32 %f858, %f853, %f624, %f849; // begin inline asm { cvt.f32.f16 %f625, %rs361;} // end inline asm fma.rn.ftz.f32 %f859, %f853, %f625, %f850; shr.u32 %r325, %r61, 24; cvt.u16.u32 %rs566, %r325; shl.b16 %rs567, %rs566, 4; cvt.s16.s8 %rs568, %rs567; shr.s16 %rs569, %rs568, 7; and.b16 %rs570, %rs569, -16; and.b16 %rs571, %rs566, 15; or.b16 %rs572, %rs570, %rs571; cvt.rn.f32.s16 %f860, %rs572; sub.ftz.f32 %f861, %f860, %f8; mul.ftz.f32 %f862, %f67, %f861; mov.b32 {%rs362, %rs368}, %r275; // begin inline asm { cvt.f32.f16 %f626, %rs362;} // end inline asm fma.rn.ftz.f32 %f863, %f862, %f626, %f854; mov.b32 {%rs363, %rs369}, %r283; // begin inline asm { cvt.f32.f16 %f627, %rs363;} // end inline asm fma.rn.ftz.f32 %f864, %f862, %f627, %f855; mov.b32 {%rs364, %rs370}, %r291; // begin inline asm { cvt.f32.f16 %f628, %rs364;} // end inline asm fma.rn.ftz.f32 %f865, %f862, %f628, %f856; mov.b32 {%rs365, %rs371}, %r299; // begin inline asm { cvt.f32.f16 %f629, %rs365;} // end inline asm fma.rn.ftz.f32 %f866, %f862, %f629, %f857; mov.b32 {%rs366, %rs372}, %r307; // begin inline asm { cvt.f32.f16 %f630, %rs366;} // end inline asm fma.rn.ftz.f32 %f867, %f862, %f630, %f858; mov.b32 {%rs367, %rs373}, %r315; // begin inline asm { cvt.f32.f16 %f631, %rs367;} // end inline asm fma.rn.ftz.f32 %f868, %f862, %f631, %f859; shr.u32 %r326, %r61, 28; cvt.u16.u32 %rs573, %r326; shl.b16 %rs574, %rs573, 4; cvt.s16.s8 %rs575, %rs574; shr.s16 %rs576, %rs575, 7; and.b16 %rs577, %rs576, -16; or.b16 %rs578, %rs577, %rs573; cvt.rn.f32.s16 %f869, %rs578; sub.ftz.f32 %f870, %f869, %f8; mul.ftz.f32 %f871, %f67, %f870; // begin inline asm { cvt.f32.f16 %f632, %rs368;} // end inline asm fma.rn.ftz.f32 %f872, %f871, %f632, %f863; // begin inline asm { cvt.f32.f16 %f633, %rs369;} // end inline asm fma.rn.ftz.f32 %f873, %f871, %f633, %f864; // begin inline asm { cvt.f32.f16 %f634, %rs370;} // end inline asm fma.rn.ftz.f32 %f874, %f871, %f634, %f865; // begin inline asm { cvt.f32.f16 %f635, %rs371;} // end inline asm fma.rn.ftz.f32 %f875, %f871, %f635, %f866; // begin inline asm { cvt.f32.f16 %f636, %rs372;} // end inline asm fma.rn.ftz.f32 %f876, %f871, %f636, %f867; // begin inline asm { cvt.f32.f16 %f637, %rs373;} // end inline asm fma.rn.ftz.f32 %f877, %f871, %f637, %f868; ld.global.v4.u32 {%r327, %r328, %r329, %r330}, [%rd5+32]; ld.global.v4.u32 {%r335, %r336, %r337, %r338}, [%rd7+16]; ld.global.v4.u32 {%r343, %r344, %r345, %r346}, [%rd6+32]; ld.global.v4.u32 {%r351, %r352, %r353, %r354}, [%rd8+16]; ld.global.v4.u32 {%r359, %r360, %r361, %r362}, [%rd9+16]; ld.global.v4.u32 {%r367, %r368, %r369, %r370}, [%rd10+16]; cvt.u16.u32 %rs579, %r62; shl.b16 %rs580, %rs579, 4; cvt.s16.s8 %rs581, %rs580; shr.s16 %rs582, %rs581, 7; and.b16 %rs583, %rs582, -16; and.b16 %rs584, %rs579, 15; or.b16 %rs585, %rs583, %rs584; cvt.rn.f32.s16 %f878, %rs585; sub.ftz.f32 %f879, %f878, %f8; mul.ftz.f32 %f880, %f67, %f879; mov.b32 {%rs374, %rs380}, %r327; // begin inline asm { cvt.f32.f16 %f638, %rs374;} // end inline asm fma.rn.ftz.f32 %f881, %f880, %f638, %f872; mov.b32 {%rs375, %rs381}, %r335; // begin inline asm { cvt.f32.f16 %f639, %rs375;} // end inline asm fma.rn.ftz.f32 %f882, 
%f880, %f639, %f873; mov.b32 {%rs376, %rs382}, %r343; // begin inline asm { cvt.f32.f16 %f640, %rs376;} // end inline asm fma.rn.ftz.f32 %f883, %f880, %f640, %f874; mov.b32 {%rs377, %rs383}, %r351; // begin inline asm { cvt.f32.f16 %f641, %rs377;} // end inline asm fma.rn.ftz.f32 %f884, %f880, %f641, %f875; mov.b32 {%rs378, %rs384}, %r359; // begin inline asm { cvt.f32.f16 %f642, %rs378;} // end inline asm fma.rn.ftz.f32 %f885, %f880, %f642, %f876; mov.b32 {%rs379, %rs385}, %r367; // begin inline asm { cvt.f32.f16 %f643, %rs379;} // end inline asm fma.rn.ftz.f32 %f886, %f880, %f643, %f877; shr.u32 %r375, %r62, 4; cvt.u16.u32 %rs586, %r375; shl.b16 %rs587, %rs586, 4; cvt.s16.s8 %rs588, %rs587; shr.s16 %rs589, %rs588, 7; and.b16 %rs590, %rs589, -16; and.b16 %rs591, %rs586, 15; or.b16 %rs592, %rs590, %rs591; cvt.rn.f32.s16 %f887, %rs592; sub.ftz.f32 %f888, %f887, %f8; mul.ftz.f32 %f889, %f67, %f888; // begin inline asm { cvt.f32.f16 %f644, %rs380;} // end inline asm fma.rn.ftz.f32 %f890, %f889, %f644, %f881; // begin inline asm { cvt.f32.f16 %f645, %rs381;} // end inline asm fma.rn.ftz.f32 %f891, %f889, %f645, %f882; // begin inline asm { cvt.f32.f16 %f646, %rs382;} // end inline asm fma.rn.ftz.f32 %f892, %f889, %f646, %f883; // begin inline asm { cvt.f32.f16 %f647, %rs383;} // end inline asm fma.rn.ftz.f32 %f893, %f889, %f647, %f884; // begin inline asm { cvt.f32.f16 %f648, %rs384;} // end inline asm fma.rn.ftz.f32 %f894, %f889, %f648, %f885; // begin inline asm { cvt.f32.f16 %f649, %rs385;} // end inline asm fma.rn.ftz.f32 %f895, %f889, %f649, %f886; shr.u32 %r376, %r62, 8; cvt.u16.u32 %rs593, %r376; shl.b16 %rs594, %rs593, 4; cvt.s16.s8 %rs595, %rs594; shr.s16 %rs596, %rs595, 7; and.b16 %rs597, %rs596, -16; and.b16 %rs598, %rs593, 15; or.b16 %rs599, %rs597, %rs598; cvt.rn.f32.s16 %f896, %rs599; sub.ftz.f32 %f897, %f896, %f8; mul.ftz.f32 %f898, %f67, %f897; mov.b32 {%rs386, %rs392}, %r328; // begin inline asm { cvt.f32.f16 %f650, %rs386;} // end inline asm fma.rn.ftz.f32 %f899, %f898, %f650, %f890; mov.b32 {%rs387, %rs393}, %r336; // begin inline asm { cvt.f32.f16 %f651, %rs387;} // end inline asm fma.rn.ftz.f32 %f900, %f898, %f651, %f891; mov.b32 {%rs388, %rs394}, %r344; // begin inline asm { cvt.f32.f16 %f652, %rs388;} // end inline asm fma.rn.ftz.f32 %f901, %f898, %f652, %f892; mov.b32 {%rs389, %rs395}, %r352; // begin inline asm { cvt.f32.f16 %f653, %rs389;} // end inline asm fma.rn.ftz.f32 %f902, %f898, %f653, %f893; mov.b32 {%rs390, %rs396}, %r360; // begin inline asm { cvt.f32.f16 %f654, %rs390;} // end inline asm fma.rn.ftz.f32 %f903, %f898, %f654, %f894; mov.b32 {%rs391, %rs397}, %r368; // begin inline asm { cvt.f32.f16 %f655, %rs391;} // end inline asm fma.rn.ftz.f32 %f904, %f898, %f655, %f895; shr.u32 %r377, %r62, 12; cvt.u16.u32 %rs600, %r377; shl.b16 %rs601, %rs600, 4; cvt.s16.s8 %rs602, %rs601; shr.s16 %rs603, %rs602, 7; and.b16 %rs604, %rs603, -16; and.b16 %rs605, %rs600, 15; or.b16 %rs606, %rs604, %rs605; cvt.rn.f32.s16 %f905, %rs606; sub.ftz.f32 %f906, %f905, %f8; mul.ftz.f32 %f907, %f67, %f906; // begin inline asm { cvt.f32.f16 %f656, %rs392;} // end inline asm fma.rn.ftz.f32 %f908, %f907, %f656, %f899; // begin inline asm { cvt.f32.f16 %f657, %rs393;} // end inline asm fma.rn.ftz.f32 %f909, %f907, %f657, %f900; // begin inline asm { cvt.f32.f16 %f658, %rs394;} // end inline asm fma.rn.ftz.f32 %f910, %f907, %f658, %f901; // begin inline asm { cvt.f32.f16 %f659, %rs395;} // end inline asm fma.rn.ftz.f32 %f911, %f907, %f659, %f902; // begin inline asm { cvt.f32.f16 %f660, 
%rs396;} // end inline asm fma.rn.ftz.f32 %f912, %f907, %f660, %f903; // begin inline asm { cvt.f32.f16 %f661, %rs397;} // end inline asm fma.rn.ftz.f32 %f913, %f907, %f661, %f904; shr.u32 %r378, %r62, 16; cvt.u16.u32 %rs607, %r378; shl.b16 %rs608, %rs607, 4; cvt.s16.s8 %rs609, %rs608; shr.s16 %rs610, %rs609, 7; and.b16 %rs611, %rs610, -16; and.b16 %rs612, %rs607, 15; or.b16 %rs613, %rs611, %rs612; cvt.rn.f32.s16 %f914, %rs613; sub.ftz.f32 %f915, %f914, %f8; mul.ftz.f32 %f916, %f67, %f915; mov.b32 {%rs398, %rs404}, %r329; // begin inline asm { cvt.f32.f16 %f662, %rs398;} // end inline asm fma.rn.ftz.f32 %f917, %f916, %f662, %f908; mov.b32 {%rs399, %rs405}, %r337; // begin inline asm { cvt.f32.f16 %f663, %rs399;} // end inline asm fma.rn.ftz.f32 %f918, %f916, %f663, %f909; mov.b32 {%rs400, %rs406}, %r345; // begin inline asm { cvt.f32.f16 %f664, %rs400;} // end inline asm fma.rn.ftz.f32 %f919, %f916, %f664, %f910; mov.b32 {%rs401, %rs407}, %r353; // begin inline asm { cvt.f32.f16 %f665, %rs401;} // end inline asm fma.rn.ftz.f32 %f920, %f916, %f665, %f911; mov.b32 {%rs402, %rs408}, %r361; // begin inline asm { cvt.f32.f16 %f666, %rs402;} // end inline asm fma.rn.ftz.f32 %f921, %f916, %f666, %f912; mov.b32 {%rs403, %rs409}, %r369; // begin inline asm { cvt.f32.f16 %f667, %rs403;} // end inline asm fma.rn.ftz.f32 %f922, %f916, %f667, %f913; shr.u32 %r379, %r62, 20; cvt.u16.u32 %rs614, %r379; shl.b16 %rs615, %rs614, 4; cvt.s16.s8 %rs616, %rs615; shr.s16 %rs617, %rs616, 7; and.b16 %rs618, %rs617, -16; and.b16 %rs619, %rs614, 15; or.b16 %rs620, %rs618, %rs619; cvt.rn.f32.s16 %f923, %rs620; sub.ftz.f32 %f924, %f923, %f8; mul.ftz.f32 %f925, %f67, %f924; // begin inline asm { cvt.f32.f16 %f668, %rs404;} // end inline asm fma.rn.ftz.f32 %f926, %f925, %f668, %f917; // begin inline asm { cvt.f32.f16 %f669, %rs405;} // end inline asm fma.rn.ftz.f32 %f927, %f925, %f669, %f918; // begin inline asm { cvt.f32.f16 %f670, %rs406;} // end inline asm fma.rn.ftz.f32 %f928, %f925, %f670, %f919; // begin inline asm { cvt.f32.f16 %f671, %rs407;} // end inline asm fma.rn.ftz.f32 %f929, %f925, %f671, %f920; // begin inline asm { cvt.f32.f16 %f672, %rs408;} // end inline asm fma.rn.ftz.f32 %f930, %f925, %f672, %f921; // begin inline asm { cvt.f32.f16 %f673, %rs409;} // end inline asm fma.rn.ftz.f32 %f931, %f925, %f673, %f922; shl.b16 %rs622, %rs40, 4; cvt.s16.s8 %rs623, %rs622; shr.s16 %rs624, %rs623, 7; and.b16 %rs625, %rs624, -16; or.b16 %rs626, %rs625, %rs6; cvt.rn.f32.s16 %f932, %rs626; sub.ftz.f32 %f933, %f932, %f8; mul.ftz.f32 %f934, %f67, %f933; mov.b32 {%rs410, %rs416}, %r330; // begin inline asm { cvt.f32.f16 %f674, %rs410;} // end inline asm fma.rn.ftz.f32 %f935, %f934, %f674, %f926; mov.b32 {%rs411, %rs417}, %r338; // begin inline asm { cvt.f32.f16 %f675, %rs411;} // end inline asm fma.rn.ftz.f32 %f936, %f934, %f675, %f927; mov.b32 {%rs412, %rs418}, %r346; // begin inline asm { cvt.f32.f16 %f676, %rs412;} // end inline asm fma.rn.ftz.f32 %f937, %f934, %f676, %f928; mov.b32 {%rs413, %rs419}, %r354; // begin inline asm { cvt.f32.f16 %f677, %rs413;} // end inline asm fma.rn.ftz.f32 %f938, %f934, %f677, %f929; mov.b32 {%rs414, %rs420}, %r362; // begin inline asm { cvt.f32.f16 %f678, %rs414;} // end inline asm fma.rn.ftz.f32 %f939, %f934, %f678, %f930; mov.b32 {%rs415, %rs421}, %r370; // begin inline asm { cvt.f32.f16 %f679, %rs415;} // end inline asm fma.rn.ftz.f32 %f940, %f934, %f679, %f931; shr.u32 %r381, %r62, 28; cvt.u16.u32 %rs627, %r381; shl.b16 %rs628, %rs627, 4; cvt.s16.s8 %rs629, %rs628; shr.s16 
%rs630, %rs629, 7; and.b16 %rs631, %rs630, -16; or.b16 %rs632, %rs631, %rs627; cvt.rn.f32.s16 %f941, %rs632; sub.ftz.f32 %f942, %f941, %f8; mul.ftz.f32 %f943, %f67, %f942; // begin inline asm { cvt.f32.f16 %f680, %rs416;} // end inline asm fma.rn.ftz.f32 %f944, %f943, %f680, %f935; // begin inline asm { cvt.f32.f16 %f681, %rs417;} // end inline asm fma.rn.ftz.f32 %f945, %f943, %f681, %f936; // begin inline asm { cvt.f32.f16 %f682, %rs418;} // end inline asm fma.rn.ftz.f32 %f946, %f943, %f682, %f937; // begin inline asm { cvt.f32.f16 %f683, %rs419;} // end inline asm fma.rn.ftz.f32 %f947, %f943, %f683, %f938; // begin inline asm { cvt.f32.f16 %f684, %rs420;} // end inline asm fma.rn.ftz.f32 %f948, %f943, %f684, %f939; // begin inline asm { cvt.f32.f16 %f685, %rs421;} // end inline asm fma.rn.ftz.f32 %f949, %f943, %f685, %f940; ld.global.v4.u32 {%r382, %r383, %r384, %r385}, [%rd5+48]; ld.global.v4.u32 {%r390, %r391, %r392, %r393}, [%rd7+32]; ld.global.v4.u32 {%r398, %r399, %r400, %r401}, [%rd6+48]; ld.global.v4.u32 {%r406, %r407, %r408, %r409}, [%rd8+32]; ld.global.v4.u32 {%r414, %r415, %r416, %r417}, [%rd9+32]; ld.global.v4.u32 {%r422, %r423, %r424, %r425}, [%rd10+32]; shl.b16 %rs634, %rs41, 4; cvt.s16.s8 %rs635, %rs634; shr.s16 %rs636, %rs635, 7; and.b16 %rs637, %rs636, -16; and.b16 %rs638, %rs41, 15; or.b16 %rs639, %rs637, %rs638; cvt.rn.f32.s16 %f950, %rs639; sub.ftz.f32 %f951, %f950, %f8; mul.ftz.f32 %f952, %f67, %f951; mov.b32 {%rs422, %rs428}, %r382; // begin inline asm { cvt.f32.f16 %f686, %rs422;} // end inline asm fma.rn.ftz.f32 %f953, %f952, %f686, %f944; mov.b32 {%rs423, %rs429}, %r390; // begin inline asm { cvt.f32.f16 %f687, %rs423;} // end inline asm fma.rn.ftz.f32 %f954, %f952, %f687, %f945; mov.b32 {%rs424, %rs430}, %r398; // begin inline asm { cvt.f32.f16 %f688, %rs424;} // end inline asm fma.rn.ftz.f32 %f955, %f952, %f688, %f946; mov.b32 {%rs425, %rs431}, %r406; // begin inline asm { cvt.f32.f16 %f689, %rs425;} // end inline asm fma.rn.ftz.f32 %f956, %f952, %f689, %f947; mov.b32 {%rs426, %rs432}, %r414; // begin inline asm { cvt.f32.f16 %f690, %rs426;} // end inline asm fma.rn.ftz.f32 %f957, %f952, %f690, %f948; mov.b32 {%rs427, %rs433}, %r422; // begin inline asm { cvt.f32.f16 %f691, %rs427;} // end inline asm fma.rn.ftz.f32 %f958, %f952, %f691, %f949; cvt.s16.s8 %rs640, %rs41; shr.s16 %rs641, %rs640, 7; and.b16 %rs642, %rs641, -16; or.b16 %rs643, %rs642, %rs7; cvt.rn.f32.s16 %f959, %rs643; sub.ftz.f32 %f960, %f959, %f8; mul.ftz.f32 %f961, %f67, %f960; // begin inline asm { cvt.f32.f16 %f692, %rs428;} // end inline asm fma.rn.ftz.f32 %f962, %f961, %f692, %f953; // begin inline asm { cvt.f32.f16 %f693, %rs429;} // end inline asm fma.rn.ftz.f32 %f963, %f961, %f693, %f954; // begin inline asm { cvt.f32.f16 %f694, %rs430;} // end inline asm fma.rn.ftz.f32 %f964, %f961, %f694, %f955; // begin inline asm { cvt.f32.f16 %f695, %rs431;} // end inline asm fma.rn.ftz.f32 %f965, %f961, %f695, %f956; // begin inline asm { cvt.f32.f16 %f696, %rs432;} // end inline asm fma.rn.ftz.f32 %f966, %f961, %f696, %f957; // begin inline asm { cvt.f32.f16 %f697, %rs433;} // end inline asm fma.rn.ftz.f32 %f967, %f961, %f697, %f958; cvt.s16.s8 %rs645, %rs42; shr.s16 %rs646, %rs645, 7; and.b16 %rs647, %rs646, -16; or.b16 %rs648, %rs647, %rs8; cvt.rn.f32.s16 %f968, %rs648; sub.ftz.f32 %f969, %f968, %f8; mul.ftz.f32 %f970, %f67, %f969; mov.b32 {%rs434, %rs440}, %r383; // begin inline asm { cvt.f32.f16 %f698, %rs434;} // end inline asm fma.rn.ftz.f32 %f971, %f970, %f698, %f962; mov.b32 {%rs435, %rs441}, 
%r391; // begin inline asm { cvt.f32.f16 %f699, %rs435;} // end inline asm fma.rn.ftz.f32 %f972, %f970, %f699, %f963; mov.b32 {%rs436, %rs442}, %r399; // begin inline asm { cvt.f32.f16 %f700, %rs436;} // end inline asm fma.rn.ftz.f32 %f973, %f970, %f700, %f964; mov.b32 {%rs437, %rs443}, %r407; // begin inline asm { cvt.f32.f16 %f701, %rs437;} // end inline asm fma.rn.ftz.f32 %f974, %f970, %f701, %f965; mov.b32 {%rs438, %rs444}, %r415; // begin inline asm { cvt.f32.f16 %f702, %rs438;} // end inline asm fma.rn.ftz.f32 %f975, %f970, %f702, %f966; mov.b32 {%rs439, %rs445}, %r423; // begin inline asm { cvt.f32.f16 %f703, %rs439;} // end inline asm fma.rn.ftz.f32 %f976, %f970, %f703, %f967; shr.s16 %rs649, %rs41, 15; and.b16 %rs650, %rs649, -16; or.b16 %rs651, %rs650, %rs9; cvt.rn.f32.s16 %f977, %rs651; sub.ftz.f32 %f978, %f977, %f8; mul.ftz.f32 %f979, %f67, %f978; // begin inline asm { cvt.f32.f16 %f704, %rs440;} // end inline asm fma.rn.ftz.f32 %f980, %f979, %f704, %f971; // begin inline asm { cvt.f32.f16 %f705, %rs441;} // end inline asm fma.rn.ftz.f32 %f981, %f979, %f705, %f972; // begin inline asm { cvt.f32.f16 %f706, %rs442;} // end inline asm fma.rn.ftz.f32 %f982, %f979, %f706, %f973; // begin inline asm { cvt.f32.f16 %f707, %rs443;} // end inline asm fma.rn.ftz.f32 %f983, %f979, %f707, %f974; // begin inline asm { cvt.f32.f16 %f708, %rs444;} // end inline asm fma.rn.ftz.f32 %f984, %f979, %f708, %f975; // begin inline asm { cvt.f32.f16 %f709, %rs445;} // end inline asm fma.rn.ftz.f32 %f985, %f979, %f709, %f976; shl.b16 %rs653, %rs44, 4; cvt.s16.s8 %rs654, %rs653; shr.s16 %rs655, %rs654, 7; and.b16 %rs656, %rs655, -16; or.b16 %rs657, %rs656, %rs10; cvt.rn.f32.s16 %f986, %rs657; sub.ftz.f32 %f987, %f986, %f8; mul.ftz.f32 %f988, %f67, %f987; mov.b32 {%rs446, %rs452}, %r384; // begin inline asm { cvt.f32.f16 %f710, %rs446;} // end inline asm fma.rn.ftz.f32 %f989, %f988, %f710, %f980; mov.b32 {%rs447, %rs453}, %r392; // begin inline asm { cvt.f32.f16 %f711, %rs447;} // end inline asm fma.rn.ftz.f32 %f990, %f988, %f711, %f981; mov.b32 {%rs448, %rs454}, %r400; // begin inline asm { cvt.f32.f16 %f712, %rs448;} // end inline asm fma.rn.ftz.f32 %f991, %f988, %f712, %f982; mov.b32 {%rs449, %rs455}, %r408; // begin inline asm { cvt.f32.f16 %f713, %rs449;} // end inline asm fma.rn.ftz.f32 %f992, %f988, %f713, %f983; mov.b32 {%rs450, %rs456}, %r416; // begin inline asm { cvt.f32.f16 %f714, %rs450;} // end inline asm fma.rn.ftz.f32 %f993, %f988, %f714, %f984; mov.b32 {%rs451, %rs457}, %r424; // begin inline asm { cvt.f32.f16 %f715, %rs451;} // end inline asm fma.rn.ftz.f32 %f994, %f988, %f715, %f985; shl.b16 %rs659, %rs45, 4; cvt.s16.s8 %rs660, %rs659; shr.s16 %rs661, %rs660, 7; and.b16 %rs662, %rs661, -16; or.b16 %rs663, %rs662, %rs11; cvt.rn.f32.s16 %f995, %rs663; sub.ftz.f32 %f996, %f995, %f8; mul.ftz.f32 %f997, %f67, %f996; // begin inline asm { cvt.f32.f16 %f716, %rs452;} // end inline asm fma.rn.ftz.f32 %f998, %f997, %f716, %f989; // begin inline asm { cvt.f32.f16 %f717, %rs453;} // end inline asm fma.rn.ftz.f32 %f999, %f997, %f717, %f990; // begin inline asm { cvt.f32.f16 %f718, %rs454;} // end inline asm fma.rn.ftz.f32 %f1000, %f997, %f718, %f991; // begin inline asm { cvt.f32.f16 %f719, %rs455;} // end inline asm fma.rn.ftz.f32 %f1001, %f997, %f719, %f992; // begin inline asm { cvt.f32.f16 %f720, %rs456;} // end inline asm fma.rn.ftz.f32 %f1002, %f997, %f720, %f993; // begin inline asm { cvt.f32.f16 %f721, %rs457;} // end inline asm fma.rn.ftz.f32 %f1003, %f997, %f721, %f994; shl.b16 %rs664, 
%rs12, 4; cvt.s16.s8 %rs665, %rs664; shr.s16 %rs666, %rs665, 7; and.b16 %rs667, %rs666, -16; or.b16 %rs668, %rs667, %rs13; cvt.rn.f32.s16 %f1004, %rs668; sub.ftz.f32 %f1005, %f1004, %f8; mul.ftz.f32 %f1006, %f67, %f1005; mov.b32 {%rs458, %rs464}, %r385; // begin inline asm { cvt.f32.f16 %f722, %rs458;} // end inline asm fma.rn.ftz.f32 %f1007, %f1006, %f722, %f998; mov.b32 {%rs459, %rs465}, %r393; // begin inline asm { cvt.f32.f16 %f723, %rs459;} // end inline asm fma.rn.ftz.f32 %f1008, %f1006, %f723, %f999; mov.b32 {%rs460, %rs466}, %r401; // begin inline asm { cvt.f32.f16 %f724, %rs460;} // end inline asm fma.rn.ftz.f32 %f1009, %f1006, %f724, %f1000; mov.b32 {%rs461, %rs467}, %r409; // begin inline asm { cvt.f32.f16 %f725, %rs461;} // end inline asm fma.rn.ftz.f32 %f1010, %f1006, %f725, %f1001; mov.b32 {%rs462, %rs468}, %r417; // begin inline asm { cvt.f32.f16 %f726, %rs462;} // end inline asm fma.rn.ftz.f32 %f1011, %f1006, %f726, %f1002; mov.b32 {%rs463, %rs469}, %r425; // begin inline asm { cvt.f32.f16 %f727, %rs463;} // end inline asm fma.rn.ftz.f32 %f1012, %f1006, %f727, %f1003; shl.b16 %rs669, %rs14, 4; cvt.s16.s8 %rs670, %rs669; shr.s16 %rs671, %rs670, 7; and.b16 %rs672, %rs671, -16; or.b16 %rs673, %rs672, %rs14; cvt.rn.f32.s16 %f1013, %rs673; sub.ftz.f32 %f1014, %f1013, %f8; mul.ftz.f32 %f1015, %f67, %f1014; // begin inline asm { cvt.f32.f16 %f728, %rs464;} // end inline asm fma.rn.ftz.f32 %f1165, %f1015, %f728, %f1007; // begin inline asm { cvt.f32.f16 %f729, %rs465;} // end inline asm fma.rn.ftz.f32 %f1164, %f1015, %f729, %f1008; // begin inline asm { cvt.f32.f16 %f730, %rs466;} // end inline asm fma.rn.ftz.f32 %f1163, %f1015, %f730, %f1009; // begin inline asm { cvt.f32.f16 %f731, %rs467;} // end inline asm fma.rn.ftz.f32 %f1162, %f1015, %f731, %f1010; // begin inline asm { cvt.f32.f16 %f732, %rs468;} // end inline asm fma.rn.ftz.f32 %f1161, %f1015, %f732, %f1011; // begin inline asm { cvt.f32.f16 %f733, %rs469;} // end inline asm fma.rn.ftz.f32 %f1160, %f1015, %f733, %f1012; $L__BB0_8: add.s32 %r556, %r556, 4; shl.b32 %r432, %r556, 5; add.s32 %r555, %r432, %r56; shl.b32 %r554, %r555, 2; setp.lt.u32 %p7, %r554, %r53; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r549, %tid.y; mov.u32 %r548, %tid.x; shl.b32 %r547, %r549, 5; add.s32 %r546, %r547, %r548; shl.b32 %r434, %r546, 2; mov.u32 %r435, _ZZ9gemv_int4ILi4ELi128ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r436, %r435, %r434; setp.lt.u32 %p8, %r546, 32; @%p8 bra $L__BB0_11; add.s32 %r540, %r436, -112; st.shared.f32 [%r540], %f1165; $L__BB0_11: mov.u32 %r553, %tid.y; mov.u32 %r552, %tid.x; shl.b32 %r551, %r553, 5; add.s32 %r550, %r551, %r552; setp.gt.u32 %p9, %r550, 31; bar.sync 0; mad.lo.s32 %r46, %r550, 12, %r435; @%p9 bra $L__BB0_13; mov.u32 %r451, 16; ld.shared.f32 %f1031, [%r46+16]; add.ftz.f32 %f1032, %f1165, %f1031; ld.shared.f32 %f1033, [%r46+20]; add.ftz.f32 %f1034, %f1032, %f1033; ld.shared.f32 %f1035, [%r46+24]; add.ftz.f32 %f1018, %f1034, %f1035; mov.u32 %r439, 1; mov.u32 %r452, 31; mov.u32 %r453, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1018, %r439, %r452, %r453; @p add.f32 r0, r0, %f1018; mov.f32 %f1016, r0;} // end inline asm mov.u32 %r442, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1016, %r442, %r452, %r453; @p add.f32 r0, r0, %f1016; mov.f32 %f1019, r0;} // end inline asm mov.u32 %r445, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1019, %r445, %r452, %r453; @p add.f32 r0, r0, %f1019; 
mov.f32 %f1022, r0;} // end inline asm mov.u32 %r448, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1022, %r448, %r452, %r453; @p add.f32 r0, r0, %f1022; mov.f32 %f1025, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1025, %r451, %r452, %r453; @p add.f32 r0, r0, %f1025; mov.f32 %f1165, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r541, %r436, -112; st.shared.f32 [%r541+640], %f1164; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f1051, [%r46+656]; add.ftz.f32 %f1052, %f1164, %f1051; ld.shared.f32 %f1053, [%r46+660]; add.ftz.f32 %f1054, %f1052, %f1053; ld.shared.f32 %f1055, [%r46+664]; add.ftz.f32 %f1038, %f1054, %f1055; mov.u32 %r455, 1; mov.u32 %r468, 31; mov.u32 %r469, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1038, %r455, %r468, %r469; @p add.f32 r0, r0, %f1038; mov.f32 %f1036, r0;} // end inline asm mov.u32 %r458, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1036, %r458, %r468, %r469; @p add.f32 r0, r0, %f1036; mov.f32 %f1039, r0;} // end inline asm mov.u32 %r461, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1039, %r461, %r468, %r469; @p add.f32 r0, r0, %f1039; mov.f32 %f1042, r0;} // end inline asm mov.u32 %r464, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1042, %r464, %r468, %r469; @p add.f32 r0, r0, %f1042; mov.f32 %f1045, r0;} // end inline asm mov.u32 %r467, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1045, %r467, %r468, %r469; @p add.f32 r0, r0, %f1045; mov.f32 %f1164, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r542, %r436, -112; st.shared.f32 [%r542+1280], %f1163; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f1071, [%r46+1296]; add.ftz.f32 %f1072, %f1163, %f1071; ld.shared.f32 %f1073, [%r46+1300]; add.ftz.f32 %f1074, %f1072, %f1073; ld.shared.f32 %f1075, [%r46+1304]; add.ftz.f32 %f1058, %f1074, %f1075; mov.u32 %r471, 1; mov.u32 %r484, 31; mov.u32 %r485, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1058, %r471, %r484, %r485; @p add.f32 r0, r0, %f1058; mov.f32 %f1056, r0;} // end inline asm mov.u32 %r474, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1056, %r474, %r484, %r485; @p add.f32 r0, r0, %f1056; mov.f32 %f1059, r0;} // end inline asm mov.u32 %r477, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1059, %r477, %r484, %r485; @p add.f32 r0, r0, %f1059; mov.f32 %f1062, r0;} // end inline asm mov.u32 %r480, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1062, %r480, %r484, %r485; @p add.f32 r0, r0, %f1062; mov.f32 %f1065, r0;} // end inline asm mov.u32 %r483, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1065, %r483, %r484, %r485; @p add.f32 r0, r0, %f1065; mov.f32 %f1163, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r543, %r436, -112; st.shared.f32 [%r543+1920], %f1162; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f1091, [%r46+1936]; add.ftz.f32 %f1092, %f1162, %f1091; ld.shared.f32 %f1093, [%r46+1940]; add.ftz.f32 %f1094, %f1092, %f1093; ld.shared.f32 %f1095, [%r46+1944]; add.ftz.f32 %f1078, %f1094, %f1095; mov.u32 %r487, 1; mov.u32 %r500, 31; mov.u32 %r501, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1078, 
%r487, %r500, %r501; @p add.f32 r0, r0, %f1078; mov.f32 %f1076, r0;} // end inline asm mov.u32 %r490, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1076, %r490, %r500, %r501; @p add.f32 r0, r0, %f1076; mov.f32 %f1079, r0;} // end inline asm mov.u32 %r493, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1079, %r493, %r500, %r501; @p add.f32 r0, r0, %f1079; mov.f32 %f1082, r0;} // end inline asm mov.u32 %r496, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1082, %r496, %r500, %r501; @p add.f32 r0, r0, %f1082; mov.f32 %f1085, r0;} // end inline asm mov.u32 %r499, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1085, %r499, %r500, %r501; @p add.f32 r0, r0, %f1085; mov.f32 %f1162, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r544, %r436, -112; st.shared.f32 [%r544+2560], %f1161; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f1111, [%r46+2576]; add.ftz.f32 %f1112, %f1161, %f1111; ld.shared.f32 %f1113, [%r46+2580]; add.ftz.f32 %f1114, %f1112, %f1113; ld.shared.f32 %f1115, [%r46+2584]; add.ftz.f32 %f1098, %f1114, %f1115; mov.u32 %r503, 1; mov.u32 %r516, 31; mov.u32 %r517, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1098, %r503, %r516, %r517; @p add.f32 r0, r0, %f1098; mov.f32 %f1096, r0;} // end inline asm mov.u32 %r506, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1096, %r506, %r516, %r517; @p add.f32 r0, r0, %f1096; mov.f32 %f1099, r0;} // end inline asm mov.u32 %r509, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1099, %r509, %r516, %r517; @p add.f32 r0, r0, %f1099; mov.f32 %f1102, r0;} // end inline asm mov.u32 %r512, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1102, %r512, %r516, %r517; @p add.f32 r0, r0, %f1102; mov.f32 %f1105, r0;} // end inline asm mov.u32 %r515, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1105, %r515, %r516, %r517; @p add.f32 r0, r0, %f1105; mov.f32 %f1161, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r545, %r436, -112; st.shared.f32 [%r545+3200], %f1160; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f1131, [%r46+3216]; add.ftz.f32 %f1132, %f1160, %f1131; ld.shared.f32 %f1133, [%r46+3220]; add.ftz.f32 %f1134, %f1132, %f1133; ld.shared.f32 %f1135, [%r46+3224]; add.ftz.f32 %f1118, %f1134, %f1135; mov.u32 %r519, 1; mov.u32 %r532, 31; mov.u32 %r533, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1118, %r519, %r532, %r533; @p add.f32 r0, r0, %f1118; mov.f32 %f1116, r0;} // end inline asm mov.u32 %r522, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1116, %r522, %r532, %r533; @p add.f32 r0, r0, %f1116; mov.f32 %f1119, r0;} // end inline asm mov.u32 %r525, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1119, %r525, %r532, %r533; @p add.f32 r0, r0, %f1119; mov.f32 %f1122, r0;} // end inline asm mov.u32 %r528, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1122, %r528, %r532, %r533; @p add.f32 r0, r0, %f1122; mov.f32 %f1125, r0;} // end inline asm mov.u32 %r531, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1125, %r531, %r532, %r533; @p add.f32 r0, r0, %f1125; mov.f32 %f1160, r0;} // end inline asm $L__BB0_33: mov.u32 %r534, %tid.y; or.b32 
%r536, %r56, %r534; setp.ne.s32 %p20, %r536, 0; @%p20 bra $L__BB0_37; ld.param.u64 %rd65, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd64, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd43, %rd64; setp.eq.s64 %p21, %rd65, 0; mul.ftz.f32 %f45, %f59, %f1165; mov.u32 %r537, %ctaid.x; cvt.s64.s32 %rd12, %r537; mul.wide.s32 %rd44, %r537, 2; add.s64 %rd13, %rd43, %rd44; mul.ftz.f32 %f46, %f59, %f1164; add.s32 %r538, %r52, %r537; cvt.s64.s32 %rd14, %r52; mul.wide.s32 %rd45, %r52, 2; add.s64 %rd15, %rd13, %rd45; mul.ftz.f32 %f47, %f59, %f1163; add.s32 %r539, %r538, %r52; cvt.s64.s32 %rd16, %r539; mul.wide.s32 %rd46, %r539, 2; add.s64 %rd18, %rd43, %rd46; mul.ftz.f32 %f48, %f59, %f1162; mul.ftz.f32 %f49, %f59, %f1161; mul.ftz.f32 %f50, %f59, %f1160; @%p21 bra $L__BB0_36; ld.param.u64 %rd66, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd47, %rd66; shl.b64 %rd48, %rd12, 1; add.s64 %rd49, %rd47, %rd48; ld.global.u16 %rs674, [%rd49]; // begin inline asm { cvt.f32.f16 %f1136, %rs674;} // end inline asm fma.rn.ftz.f32 %f1137, %f60, %f1136, %f45; // begin inline asm { cvt.rn.f16.f32 %rs675, %f1137;} // end inline asm st.global.u16 [%rd13], %rs675; shl.b64 %rd50, %rd14, 1; add.s64 %rd51, %rd49, %rd50; ld.global.u16 %rs676, [%rd51]; // begin inline asm { cvt.f32.f16 %f1138, %rs676;} // end inline asm fma.rn.ftz.f32 %f1139, %f60, %f1138, %f46; // begin inline asm { cvt.rn.f16.f32 %rs677, %f1139;} // end inline asm st.global.u16 [%rd15], %rs677; shl.b64 %rd52, %rd16, 1; add.s64 %rd53, %rd47, %rd52; ld.global.u16 %rs678, [%rd53]; // begin inline asm { cvt.f32.f16 %f1140, %rs678;} // end inline asm fma.rn.ftz.f32 %f1141, %f60, %f1140, %f47; // begin inline asm { cvt.rn.f16.f32 %rs679, %f1141;} // end inline asm st.global.u16 [%rd18], %rs679; add.s64 %rd54, %rd53, %rd50; ld.global.u16 %rs680, [%rd54]; // begin inline asm { cvt.f32.f16 %f1142, %rs680;} // end inline asm fma.rn.ftz.f32 %f1143, %f60, %f1142, %f48; // begin inline asm { cvt.rn.f16.f32 %rs681, %f1143;} // end inline asm add.s64 %rd55, %rd18, %rd50; st.global.u16 [%rd55], %rs681; add.s64 %rd56, %rd54, %rd50; ld.global.u16 %rs682, [%rd56]; // begin inline asm { cvt.f32.f16 %f1144, %rs682;} // end inline asm fma.rn.ftz.f32 %f1145, %f60, %f1144, %f49; // begin inline asm { cvt.rn.f16.f32 %rs683, %f1145;} // end inline asm add.s64 %rd57, %rd55, %rd50; st.global.u16 [%rd57], %rs683; add.s64 %rd58, %rd56, %rd50; ld.global.u16 %rs684, [%rd58]; // begin inline asm { cvt.f32.f16 %f1146, %rs684;} // end inline asm fma.rn.ftz.f32 %f1147, %f60, %f1146, %f50; // begin inline asm { cvt.rn.f16.f32 %rs685, %f1147;} // end inline asm add.s64 %rd59, %rd57, %rd50; st.global.u16 [%rd59], %rs685; bra.uni $L__BB0_37; $L__BB0_36: // begin inline asm { cvt.rn.f16.f32 %rs686, %f45;} // end inline asm st.global.u16 [%rd13], %rs686; // begin inline asm { cvt.rn.f16.f32 %rs687, %f46;} // end inline asm st.global.u16 [%rd15], %rs687; // begin inline asm { cvt.rn.f16.f32 %rs688, %f47;} // end inline asm st.global.u16 [%rd18], %rs688; // begin inline asm { cvt.rn.f16.f32 %rs689, %f48;} // end inline asm shl.b64 %rd60, %rd14, 1; add.s64 %rd61, %rd18, %rd60; st.global.u16 [%rd61], %rs689; // begin inline asm { cvt.rn.f16.f32 %rs690, %f49;} // end inline asm add.s64 %rd62, %rd61, %rd60; st.global.u16 [%rd62], %rs690; // begin inline asm { cvt.rn.f16.f32 %rs691, %f50;} // end inline asm add.s64 %rd63, %rd62, %rd60; st.global.u16 [%rd63], %rs691; 
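	// The bias path (the beta*bias FMA sequence that ends with bra.uni above) and the
	// bias-free alpha*acc path at $L__BB0_36 both reach the common return at $L__BB0_37.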
$L__BB0_37:
	ret;

}
	// .globl	_ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{

	ret;

}
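//
// ---- Reader's note (appended; not part of the compiler output) -------------
// The PTX above is the lowered form of a 4-bit, group-size-128, batch-6
// dequantizing GEMV (gemv_int4<4,128,6> in the mangled shared-memory name).
// The original gemv.cu is not included in this dump, so the CUDA C++ below is
// only a sketch of the technique the assembly implements, not the author's
// source: unpack eight int4 weights per 32-bit word, subtract a per-group zero
// point (packed 4-bit table, or a default of 8 for unsigned / 0 for signed
// nibbles), scale by a per-group fp16 scale, FMA against six activation rows,
// reduce the partial sums across the warp with __shfl_down_sync, and store
// y = alpha*acc (+ beta*bias) as fp16.  All parameter names, memory layouts,
// the one-warp-per-column mapping, and the assumption that K is a multiple of
// 256 are illustrative; the real kernel uses a 32x4 thread block and a
// 3840-byte shared-memory stage (temp_storage) ahead of the shuffles.
//

#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical sketch -- one warp per output column.
__global__ void dequant_gemv_sketch(
    __half*         __restrict__ y,       // [6][M]  outputs (fp16)
    const __half*   __restrict__ x,       // [6][K]  activations (fp16)
    const __half*   __restrict__ bias,    // [6][M]  bias / residual input, may be nullptr
    const uint32_t* __restrict__ qweight, // [M][K/8]   eight 4-bit weights per word
    const uint8_t*  __restrict__ qzeros,  // [M][K/128] packed 4-bit zero points, may be nullptr
    const __half*   __restrict__ scales,  // [M][K/128] per-group fp16 scales
    int M, int K, float alpha, float beta, bool is_signed)
{
    constexpr int kGroup = 128;   // group128 in the specialization above
    constexpr int kBatch = 6;     // batch6
    const int col  = blockIdx.x;  // output column handled by this warp
    const int lane = threadIdx.x; // 0..31

    float acc[kBatch] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

    // Each lane walks the K dimension eight weights (one packed word) at a time.
    for (int k0 = lane * 8; k0 < K; k0 += 32 * 8) {
        const uint32_t packed = qweight[col * (K / 8) + k0 / 8];
        const int gidx        = col * (K / kGroup) + k0 / kGroup;
        const float scale     = __half2float(scales[gidx]);

        // Zero point: packed 4-bit table if present, otherwise the default the
        // PTX builds as (flag << 3), i.e. 8 for unsigned nibbles, 0 for signed.
        int z;
        if (qzeros) {
            const uint8_t zb = qzeros[gidx / 2];
            z = (gidx & 1) ? (zb >> 4) : (zb & 0xF);
        } else {
            z = is_signed ? 0 : 8;
        }
        if (is_signed && (z & 8)) z -= 16;  // signed mode sign-extends the zero point too

        #pragma unroll
        for (int j = 0; j < 8; ++j) {
            int q = (packed >> (4 * j)) & 0xF;
            if (is_signed && (q & 8)) q -= 16;     // the shl.b16 / cvt.s16.s8 / shr.s16 trick
            const float w = scale * float(q - z);  // dequantized weight
            #pragma unroll
            for (int b = 0; b < kBatch; ++b)       // one FMA per batch row
                acc[b] = fmaf(w, __half2float(x[b * K + k0 + j]), acc[b]);
        }
    }

    // Warp-level tree reduction: the role played by the shfl.sync.down.b32 sequences.
    #pragma unroll
    for (int b = 0; b < kBatch; ++b)
        for (int off = 16; off > 0; off >>= 1)
            acc[b] += __shfl_down_sync(0xFFFFFFFFu, acc[b], off);

    // Epilogue: y = alpha * acc (+ beta * bias), stored as fp16 -- the two paths
    // that converge at $L__BB0_37 above.
    if (lane == 0) {
        for (int b = 0; b < kBatch; ++b) {
            float out = alpha * acc[b];
            if (bias) out = fmaf(beta, __half2float(bias[b * M + col]), out);
            y[b * M + col] = __float2half(out);
        }
    }
}

// Illustrative launch (one block per output column, one warp per block):
//   dequant_gemv_sketch<<<M, 32>>>(y, x, bias, qweight, qzeros, scales, M, K, alpha, beta, false);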