Compiler // // Compiler Build ID: CL-31678015 // Cuda compilation tools, release 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_60 .address_size 64 // .globl _Z27dequant_gemv_group64_batch823DequantGemvKernelParams // _ZZ9gemv_int4ILi4ELi64ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust6system6detail10sequential3seqE[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_1E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_2E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders3_10E[1]; .visible .entry _Z27dequant_gemv_group64_batch823DequantGemvKernelParams( .param .align 8 .b8 _Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<26>; .reg .b16 %rs<429>; .reg .f32 %f<896>; .reg .b32 %r<415>; .reg .b64 %rd<74>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[5120]; ld.param.v2.u32 {%r56, %r57}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r58, %r59}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f77, %f78}, 
[_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs23, %rs24, %rs25, %rs26}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd26, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd25, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd24, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd23, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+16]; mov.u32 %r414, %tid.y; shl.b32 %r60, %r414, 5; mov.u32 %r61, %tid.x; add.s32 %r413, %r60, %r61; shl.b32 %r412, %r413, 1; setp.ge.u32 %p1, %r412, %r58; mov.f32 %f872, 0f00000000; mov.f32 %f873, %f872; mov.f32 %f874, %f872; mov.f32 %f875, %f872; mov.f32 %f876, %f872; mov.f32 %f877, %f872; mov.f32 %f878, %f872; mov.f32 %f879, %f872; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd2, %rd23; mov.u32 %r62, %ctaid.x; mul.lo.s32 %r6, %r59, %r62; shl.b16 %rs2, %rs23, 3; cvta.to.global.u64 %rd3, %rd24; cvta.to.global.u64 %rd4, %rd26; $L__BB0_2: mad.lo.s32 %r64, %r58, %r62, %r412; mul.wide.u32 %rd27, %r64, 4; add.s64 %rd28, %rd3, %rd27; ld.global.v2.u32 {%r65, %r66}, [%rd28]; shr.u32 %r68, %r61, 2; shl.b32 %r69, %r414, 3; add.s32 %r12, %r69, %r68; add.s32 %r13, %r12, %r6; mul.wide.s32 %rd29, %r13, 2; add.s64 %rd30, %rd4, %rd29; ld.global.u16 %rs31, [%rd30]; // begin inline asm { cvt.f32.f16 %f87, %rs31;} // end inline asm setp.eq.s64 %p2, %rd25, 0; mov.u16 %rs428, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r70, %r13, 31; add.s32 %r71, %r13, %r70; shr.s32 %r72, %r71, 1; cvt.s64.s32 %rd31, %r72; cvta.to.global.u64 %rd32, %rd25; add.s64 %rd33, %rd32, %rd31; ld.global.u8 %r73, [%rd33]; shl.b32 %r74, %r12, 2; and.b32 %r75, %r74, 4; shr.u32 %r76, %r73, %r75; cvt.u16.u32 %rs32, %r76; and.b16 %rs428, %rs32, 15; $L__BB0_4: shl.b32 %r14, %r413, 4; setp.ge.s32 %p3, %r14, %r56; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs23, 0; shr.u16 %rs34, %rs428, 3; and.b16 
%rs35, %rs34, 1; setp.eq.b16 %p5, %rs35, 1; and.pred %p6, %p4, %p5; selp.b16 %rs36, -16, 0, %p6; or.b16 %rs37, %rs36, %rs428; cvt.s16.s8 %rs38, %rs37; cvt.rn.f32.s16 %f10, %rs38; mul.wide.s32 %rd34, %r14, 2; add.s64 %rd5, %rd2, %rd34; ld.global.v4.u32 {%r77, %r78, %r79, %r80}, [%rd5]; mul.wide.s32 %rd35, %r56, 2; add.s64 %rd6, %rd5, %rd35; ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd6]; add.s32 %r85, %r14, %r56; add.s32 %r86, %r85, %r56; mul.wide.s32 %rd36, %r86, 2; add.s64 %rd37, %rd2, %rd36; ld.global.v4.u32 {%r87, %r88, %r89, %r90}, [%rd37]; add.s64 %rd38, %rd37, %rd35; ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd38]; add.s64 %rd39, %rd38, %rd35; ld.global.v4.u32 {%r95, %r96, %r97, %r98}, [%rd39]; add.s64 %rd40, %rd39, %rd35; ld.global.v4.u32 {%r99, %r100, %r101, %r102}, [%rd40]; add.s64 %rd41, %rd40, %rd35; ld.global.v4.u32 {%r103, %r104, %r105, %r106}, [%rd41]; add.s64 %rd42, %rd41, %rd35; ld.global.v4.u32 {%r107, %r108, %r109, %r110}, [%rd42]; cvt.u16.u32 %rs39, %r65; shr.u16 %rs40, %rs39, 4; and.b16 %rs5, %rs40, 15; shr.u16 %rs6, %rs39, 12; shr.u32 %r111, %r65, 16; cvt.u16.u32 %rs41, %r111; and.b16 %rs7, %rs41, 15; shr.u32 %r112, %r65, 24; cvt.u16.u32 %rs42, %r112; and.b16 %rs8, %rs42, 15; cvt.u16.u32 %rs43, %r66; shr.u16 %rs44, %rs43, 4; and.b16 %rs9, %rs44, 15; shr.u16 %rs45, %rs43, 8; and.b16 %rs10, %rs45, 15; shr.u16 %rs11, %rs43, 12; shr.u32 %r113, %r66, 16; cvt.u16.u32 %rs46, %r113; and.b16 %rs12, %rs46, 15; shr.u32 %r114, %r66, 20; cvt.u16.u32 %rs47, %r114; and.b16 %rs13, %rs47, 15; shr.u32 %r115, %r66, 24; cvt.u16.u32 %rs48, %r115; and.b16 %rs14, %rs48, 15; add.s64 %rd43, %rd6, %rd35; add.s64 %rd7, %rd43, 16; add.s64 %rd8, %rd7, %rd35; add.s64 %rd9, %rd8, %rd35; add.s64 %rd10, %rd9, %rd35; add.s64 %rd11, %rd10, %rd35; add.s64 %rd12, %rd11, %rd35; @%p4 bra $L__BB0_7; and.b16 %rs178, %rs39, 15; cvt.rn.f32.s16 %f216, %rs178; sub.ftz.f32 %f217, %f216, %f10; mul.ftz.f32 %f218, %f87, %f217; mov.b32 {%rs49, %rs57}, %r77; // begin inline asm { 
cvt.f32.f16 %f88, %rs49;} // end inline asm fma.rn.ftz.f32 %f219, %f218, %f88, %f879; mov.b32 {%rs50, %rs58}, %r81; // begin inline asm { cvt.f32.f16 %f89, %rs50;} // end inline asm fma.rn.ftz.f32 %f220, %f218, %f89, %f878; mov.b32 {%rs51, %rs59}, %r87; // begin inline asm { cvt.f32.f16 %f90, %rs51;} // end inline asm fma.rn.ftz.f32 %f221, %f218, %f90, %f877; mov.b32 {%rs52, %rs60}, %r91; // begin inline asm { cvt.f32.f16 %f91, %rs52;} // end inline asm fma.rn.ftz.f32 %f222, %f218, %f91, %f876; mov.b32 {%rs53, %rs61}, %r95; // begin inline asm { cvt.f32.f16 %f92, %rs53;} // end inline asm fma.rn.ftz.f32 %f223, %f218, %f92, %f875; mov.b32 {%rs54, %rs62}, %r99; // begin inline asm { cvt.f32.f16 %f93, %rs54;} // end inline asm fma.rn.ftz.f32 %f224, %f218, %f93, %f874; mov.b32 {%rs55, %rs63}, %r103; // begin inline asm { cvt.f32.f16 %f94, %rs55;} // end inline asm fma.rn.ftz.f32 %f225, %f218, %f94, %f873; mov.b32 {%rs56, %rs64}, %r107; // begin inline asm { cvt.f32.f16 %f95, %rs56;} // end inline asm fma.rn.ftz.f32 %f226, %f218, %f95, %f872; cvt.rn.f32.s16 %f227, %rs5; sub.ftz.f32 %f228, %f227, %f10; mul.ftz.f32 %f229, %f87, %f228; // begin inline asm { cvt.f32.f16 %f96, %rs57;} // end inline asm fma.rn.ftz.f32 %f230, %f229, %f96, %f219; // begin inline asm { cvt.f32.f16 %f97, %rs58;} // end inline asm fma.rn.ftz.f32 %f231, %f229, %f97, %f220; // begin inline asm { cvt.f32.f16 %f98, %rs59;} // end inline asm fma.rn.ftz.f32 %f232, %f229, %f98, %f221; // begin inline asm { cvt.f32.f16 %f99, %rs60;} // end inline asm fma.rn.ftz.f32 %f233, %f229, %f99, %f222; // begin inline asm { cvt.f32.f16 %f100, %rs61;} // end inline asm fma.rn.ftz.f32 %f234, %f229, %f100, %f223; // begin inline asm { cvt.f32.f16 %f101, %rs62;} // end inline asm fma.rn.ftz.f32 %f235, %f229, %f101, %f224; // begin inline asm { cvt.f32.f16 %f102, %rs63;} // end inline asm fma.rn.ftz.f32 %f236, %f229, %f102, %f225; // begin inline asm { cvt.f32.f16 %f103, %rs64;} // end inline asm fma.rn.ftz.f32 %f237, 
%f229, %f103, %f226; shr.u16 %rs179, %rs39, 8; and.b16 %rs180, %rs179, 15; cvt.rn.f32.s16 %f238, %rs180; sub.ftz.f32 %f239, %f238, %f10; mul.ftz.f32 %f240, %f87, %f239; mov.b32 {%rs65, %rs73}, %r78; // begin inline asm { cvt.f32.f16 %f104, %rs65;} // end inline asm fma.rn.ftz.f32 %f241, %f240, %f104, %f230; mov.b32 {%rs66, %rs74}, %r82; // begin inline asm { cvt.f32.f16 %f105, %rs66;} // end inline asm fma.rn.ftz.f32 %f242, %f240, %f105, %f231; mov.b32 {%rs67, %rs75}, %r88; // begin inline asm { cvt.f32.f16 %f106, %rs67;} // end inline asm fma.rn.ftz.f32 %f243, %f240, %f106, %f232; mov.b32 {%rs68, %rs76}, %r92; // begin inline asm { cvt.f32.f16 %f107, %rs68;} // end inline asm fma.rn.ftz.f32 %f244, %f240, %f107, %f233; mov.b32 {%rs69, %rs77}, %r96; // begin inline asm { cvt.f32.f16 %f108, %rs69;} // end inline asm fma.rn.ftz.f32 %f245, %f240, %f108, %f234; mov.b32 {%rs70, %rs78}, %r100; // begin inline asm { cvt.f32.f16 %f109, %rs70;} // end inline asm fma.rn.ftz.f32 %f246, %f240, %f109, %f235; mov.b32 {%rs71, %rs79}, %r104; // begin inline asm { cvt.f32.f16 %f110, %rs71;} // end inline asm fma.rn.ftz.f32 %f247, %f240, %f110, %f236; mov.b32 {%rs72, %rs80}, %r108; // begin inline asm { cvt.f32.f16 %f111, %rs72;} // end inline asm fma.rn.ftz.f32 %f248, %f240, %f111, %f237; cvt.rn.f32.s16 %f249, %rs6; sub.ftz.f32 %f250, %f249, %f10; mul.ftz.f32 %f251, %f87, %f250; // begin inline asm { cvt.f32.f16 %f112, %rs73;} // end inline asm fma.rn.ftz.f32 %f252, %f251, %f112, %f241; // begin inline asm { cvt.f32.f16 %f113, %rs74;} // end inline asm fma.rn.ftz.f32 %f253, %f251, %f113, %f242; // begin inline asm { cvt.f32.f16 %f114, %rs75;} // end inline asm fma.rn.ftz.f32 %f254, %f251, %f114, %f243; // begin inline asm { cvt.f32.f16 %f115, %rs76;} // end inline asm fma.rn.ftz.f32 %f255, %f251, %f115, %f244; // begin inline asm { cvt.f32.f16 %f116, %rs77;} // end inline asm fma.rn.ftz.f32 %f256, %f251, %f116, %f245; // begin inline asm { cvt.f32.f16 %f117, %rs78;} // end inline 
asm fma.rn.ftz.f32 %f257, %f251, %f117, %f246; // begin inline asm { cvt.f32.f16 %f118, %rs79;} // end inline asm fma.rn.ftz.f32 %f258, %f251, %f118, %f247; // begin inline asm { cvt.f32.f16 %f119, %rs80;} // end inline asm fma.rn.ftz.f32 %f259, %f251, %f119, %f248; cvt.rn.f32.s16 %f260, %rs7; sub.ftz.f32 %f261, %f260, %f10; mul.ftz.f32 %f262, %f87, %f261; mov.b32 {%rs81, %rs89}, %r79; // begin inline asm { cvt.f32.f16 %f120, %rs81;} // end inline asm fma.rn.ftz.f32 %f263, %f262, %f120, %f252; mov.b32 {%rs82, %rs90}, %r83; // begin inline asm { cvt.f32.f16 %f121, %rs82;} // end inline asm fma.rn.ftz.f32 %f264, %f262, %f121, %f253; mov.b32 {%rs83, %rs91}, %r89; // begin inline asm { cvt.f32.f16 %f122, %rs83;} // end inline asm fma.rn.ftz.f32 %f265, %f262, %f122, %f254; mov.b32 {%rs84, %rs92}, %r93; // begin inline asm { cvt.f32.f16 %f123, %rs84;} // end inline asm fma.rn.ftz.f32 %f266, %f262, %f123, %f255; mov.b32 {%rs85, %rs93}, %r97; // begin inline asm { cvt.f32.f16 %f124, %rs85;} // end inline asm fma.rn.ftz.f32 %f267, %f262, %f124, %f256; mov.b32 {%rs86, %rs94}, %r101; // begin inline asm { cvt.f32.f16 %f125, %rs86;} // end inline asm fma.rn.ftz.f32 %f268, %f262, %f125, %f257; mov.b32 {%rs87, %rs95}, %r105; // begin inline asm { cvt.f32.f16 %f126, %rs87;} // end inline asm fma.rn.ftz.f32 %f269, %f262, %f126, %f258; mov.b32 {%rs88, %rs96}, %r109; // begin inline asm { cvt.f32.f16 %f127, %rs88;} // end inline asm fma.rn.ftz.f32 %f270, %f262, %f127, %f259; shr.u32 %r116, %r65, 20; cvt.u16.u32 %rs181, %r116; and.b16 %rs182, %rs181, 15; cvt.rn.f32.s16 %f271, %rs182; sub.ftz.f32 %f272, %f271, %f10; mul.ftz.f32 %f273, %f87, %f272; // begin inline asm { cvt.f32.f16 %f128, %rs89;} // end inline asm fma.rn.ftz.f32 %f274, %f273, %f128, %f263; // begin inline asm { cvt.f32.f16 %f129, %rs90;} // end inline asm fma.rn.ftz.f32 %f275, %f273, %f129, %f264; // begin inline asm { cvt.f32.f16 %f130, %rs91;} // end inline asm fma.rn.ftz.f32 %f276, %f273, %f130, %f265; // begin 
inline asm { cvt.f32.f16 %f131, %rs92;} // end inline asm fma.rn.ftz.f32 %f277, %f273, %f131, %f266; // begin inline asm { cvt.f32.f16 %f132, %rs93;} // end inline asm fma.rn.ftz.f32 %f278, %f273, %f132, %f267; // begin inline asm { cvt.f32.f16 %f133, %rs94;} // end inline asm fma.rn.ftz.f32 %f279, %f273, %f133, %f268; // begin inline asm { cvt.f32.f16 %f134, %rs95;} // end inline asm fma.rn.ftz.f32 %f280, %f273, %f134, %f269; // begin inline asm { cvt.f32.f16 %f135, %rs96;} // end inline asm fma.rn.ftz.f32 %f281, %f273, %f135, %f270; cvt.rn.f32.s16 %f282, %rs8; sub.ftz.f32 %f283, %f282, %f10; mul.ftz.f32 %f284, %f87, %f283; mov.b32 {%rs97, %rs105}, %r80; // begin inline asm { cvt.f32.f16 %f136, %rs97;} // end inline asm fma.rn.ftz.f32 %f285, %f284, %f136, %f274; mov.b32 {%rs98, %rs106}, %r84; // begin inline asm { cvt.f32.f16 %f137, %rs98;} // end inline asm fma.rn.ftz.f32 %f286, %f284, %f137, %f275; mov.b32 {%rs99, %rs107}, %r90; // begin inline asm { cvt.f32.f16 %f138, %rs99;} // end inline asm fma.rn.ftz.f32 %f287, %f284, %f138, %f276; mov.b32 {%rs100, %rs108}, %r94; // begin inline asm { cvt.f32.f16 %f139, %rs100;} // end inline asm fma.rn.ftz.f32 %f288, %f284, %f139, %f277; mov.b32 {%rs101, %rs109}, %r98; // begin inline asm { cvt.f32.f16 %f140, %rs101;} // end inline asm fma.rn.ftz.f32 %f289, %f284, %f140, %f278; mov.b32 {%rs102, %rs110}, %r102; // begin inline asm { cvt.f32.f16 %f141, %rs102;} // end inline asm fma.rn.ftz.f32 %f290, %f284, %f141, %f279; mov.b32 {%rs103, %rs111}, %r106; // begin inline asm { cvt.f32.f16 %f142, %rs103;} // end inline asm fma.rn.ftz.f32 %f291, %f284, %f142, %f280; mov.b32 {%rs104, %rs112}, %r110; // begin inline asm { cvt.f32.f16 %f143, %rs104;} // end inline asm fma.rn.ftz.f32 %f292, %f284, %f143, %f281; shr.u32 %r117, %r65, 28; cvt.u16.u32 %rs183, %r117; cvt.rn.f32.s16 %f293, %rs183; sub.ftz.f32 %f294, %f293, %f10; mul.ftz.f32 %f295, %f87, %f294; // begin inline asm { cvt.f32.f16 %f144, %rs105;} // end inline asm 
fma.rn.ftz.f32 %f296, %f295, %f144, %f285; // begin inline asm { cvt.f32.f16 %f145, %rs106;} // end inline asm fma.rn.ftz.f32 %f297, %f295, %f145, %f286; // begin inline asm { cvt.f32.f16 %f146, %rs107;} // end inline asm fma.rn.ftz.f32 %f298, %f295, %f146, %f287; // begin inline asm { cvt.f32.f16 %f147, %rs108;} // end inline asm fma.rn.ftz.f32 %f299, %f295, %f147, %f288; // begin inline asm { cvt.f32.f16 %f148, %rs109;} // end inline asm fma.rn.ftz.f32 %f300, %f295, %f148, %f289; // begin inline asm { cvt.f32.f16 %f149, %rs110;} // end inline asm fma.rn.ftz.f32 %f301, %f295, %f149, %f290; // begin inline asm { cvt.f32.f16 %f150, %rs111;} // end inline asm fma.rn.ftz.f32 %f302, %f295, %f150, %f291; // begin inline asm { cvt.f32.f16 %f151, %rs112;} // end inline asm fma.rn.ftz.f32 %f303, %f295, %f151, %f292; ld.global.v4.u32 {%r118, %r119, %r120, %r121}, [%rd5+16]; ld.global.v4.u32 {%r126, %r127, %r128, %r129}, [%rd6+16]; ld.global.v4.u32 {%r134, %r135, %r136, %r137}, [%rd7]; ld.global.v4.u32 {%r142, %r143, %r144, %r145}, [%rd8]; ld.global.v4.u32 {%r150, %r151, %r152, %r153}, [%rd9]; ld.global.v4.u32 {%r158, %r159, %r160, %r161}, [%rd10]; ld.global.v4.u32 {%r166, %r167, %r168, %r169}, [%rd11]; ld.global.v4.u32 {%r174, %r175, %r176, %r177}, [%rd12]; and.b16 %rs185, %rs43, 15; cvt.rn.f32.s16 %f304, %rs185; sub.ftz.f32 %f305, %f304, %f10; mul.ftz.f32 %f306, %f87, %f305; mov.b32 {%rs113, %rs121}, %r118; // begin inline asm { cvt.f32.f16 %f152, %rs113;} // end inline asm fma.rn.ftz.f32 %f307, %f306, %f152, %f296; mov.b32 {%rs114, %rs122}, %r126; // begin inline asm { cvt.f32.f16 %f153, %rs114;} // end inline asm fma.rn.ftz.f32 %f308, %f306, %f153, %f297; mov.b32 {%rs115, %rs123}, %r134; // begin inline asm { cvt.f32.f16 %f154, %rs115;} // end inline asm fma.rn.ftz.f32 %f309, %f306, %f154, %f298; mov.b32 {%rs116, %rs124}, %r142; // begin inline asm { cvt.f32.f16 %f155, %rs116;} // end inline asm fma.rn.ftz.f32 %f310, %f306, %f155, %f299; mov.b32 {%rs117, %rs125}, %r150; 
// begin inline asm { cvt.f32.f16 %f156, %rs117;} // end inline asm fma.rn.ftz.f32 %f311, %f306, %f156, %f300; mov.b32 {%rs118, %rs126}, %r158; // begin inline asm { cvt.f32.f16 %f157, %rs118;} // end inline asm fma.rn.ftz.f32 %f312, %f306, %f157, %f301; mov.b32 {%rs119, %rs127}, %r166; // begin inline asm { cvt.f32.f16 %f158, %rs119;} // end inline asm fma.rn.ftz.f32 %f313, %f306, %f158, %f302; mov.b32 {%rs120, %rs128}, %r174; // begin inline asm { cvt.f32.f16 %f159, %rs120;} // end inline asm fma.rn.ftz.f32 %f314, %f306, %f159, %f303; cvt.rn.f32.s16 %f315, %rs9; sub.ftz.f32 %f316, %f315, %f10; mul.ftz.f32 %f317, %f87, %f316; // begin inline asm { cvt.f32.f16 %f160, %rs121;} // end inline asm fma.rn.ftz.f32 %f318, %f317, %f160, %f307; // begin inline asm { cvt.f32.f16 %f161, %rs122;} // end inline asm fma.rn.ftz.f32 %f319, %f317, %f161, %f308; // begin inline asm { cvt.f32.f16 %f162, %rs123;} // end inline asm fma.rn.ftz.f32 %f320, %f317, %f162, %f309; // begin inline asm { cvt.f32.f16 %f163, %rs124;} // end inline asm fma.rn.ftz.f32 %f321, %f317, %f163, %f310; // begin inline asm { cvt.f32.f16 %f164, %rs125;} // end inline asm fma.rn.ftz.f32 %f322, %f317, %f164, %f311; // begin inline asm { cvt.f32.f16 %f165, %rs126;} // end inline asm fma.rn.ftz.f32 %f323, %f317, %f165, %f312; // begin inline asm { cvt.f32.f16 %f166, %rs127;} // end inline asm fma.rn.ftz.f32 %f324, %f317, %f166, %f313; // begin inline asm { cvt.f32.f16 %f167, %rs128;} // end inline asm fma.rn.ftz.f32 %f325, %f317, %f167, %f314; cvt.rn.f32.s16 %f326, %rs10; sub.ftz.f32 %f327, %f326, %f10; mul.ftz.f32 %f328, %f87, %f327; mov.b32 {%rs129, %rs137}, %r119; // begin inline asm { cvt.f32.f16 %f168, %rs129;} // end inline asm fma.rn.ftz.f32 %f329, %f328, %f168, %f318; mov.b32 {%rs130, %rs138}, %r127; // begin inline asm { cvt.f32.f16 %f169, %rs130;} // end inline asm fma.rn.ftz.f32 %f330, %f328, %f169, %f319; mov.b32 {%rs131, %rs139}, %r135; // begin inline asm { cvt.f32.f16 %f170, %rs131;} // end 
inline asm fma.rn.ftz.f32 %f331, %f328, %f170, %f320; mov.b32 {%rs132, %rs140}, %r143; // begin inline asm { cvt.f32.f16 %f171, %rs132;} // end inline asm fma.rn.ftz.f32 %f332, %f328, %f171, %f321; mov.b32 {%rs133, %rs141}, %r151; // begin inline asm { cvt.f32.f16 %f172, %rs133;} // end inline asm fma.rn.ftz.f32 %f333, %f328, %f172, %f322; mov.b32 {%rs134, %rs142}, %r159; // begin inline asm { cvt.f32.f16 %f173, %rs134;} // end inline asm fma.rn.ftz.f32 %f334, %f328, %f173, %f323; mov.b32 {%rs135, %rs143}, %r167; // begin inline asm { cvt.f32.f16 %f174, %rs135;} // end inline asm fma.rn.ftz.f32 %f335, %f328, %f174, %f324; mov.b32 {%rs136, %rs144}, %r175; // begin inline asm { cvt.f32.f16 %f175, %rs136;} // end inline asm fma.rn.ftz.f32 %f336, %f328, %f175, %f325; cvt.rn.f32.s16 %f337, %rs11; sub.ftz.f32 %f338, %f337, %f10; mul.ftz.f32 %f339, %f87, %f338; // begin inline asm { cvt.f32.f16 %f176, %rs137;} // end inline asm fma.rn.ftz.f32 %f340, %f339, %f176, %f329; // begin inline asm { cvt.f32.f16 %f177, %rs138;} // end inline asm fma.rn.ftz.f32 %f341, %f339, %f177, %f330; // begin inline asm { cvt.f32.f16 %f178, %rs139;} // end inline asm fma.rn.ftz.f32 %f342, %f339, %f178, %f331; // begin inline asm { cvt.f32.f16 %f179, %rs140;} // end inline asm fma.rn.ftz.f32 %f343, %f339, %f179, %f332; // begin inline asm { cvt.f32.f16 %f180, %rs141;} // end inline asm fma.rn.ftz.f32 %f344, %f339, %f180, %f333; // begin inline asm { cvt.f32.f16 %f181, %rs142;} // end inline asm fma.rn.ftz.f32 %f345, %f339, %f181, %f334; // begin inline asm { cvt.f32.f16 %f182, %rs143;} // end inline asm fma.rn.ftz.f32 %f346, %f339, %f182, %f335; // begin inline asm { cvt.f32.f16 %f183, %rs144;} // end inline asm fma.rn.ftz.f32 %f347, %f339, %f183, %f336; cvt.rn.f32.s16 %f348, %rs12; sub.ftz.f32 %f349, %f348, %f10; mul.ftz.f32 %f350, %f87, %f349; mov.b32 {%rs145, %rs153}, %r120; // begin inline asm { cvt.f32.f16 %f184, %rs145;} // end inline asm fma.rn.ftz.f32 %f351, %f350, %f184, %f340; mov.b32 
{%rs146, %rs154}, %r128; // begin inline asm { cvt.f32.f16 %f185, %rs146;} // end inline asm fma.rn.ftz.f32 %f352, %f350, %f185, %f341; mov.b32 {%rs147, %rs155}, %r136; // begin inline asm { cvt.f32.f16 %f186, %rs147;} // end inline asm fma.rn.ftz.f32 %f353, %f350, %f186, %f342; mov.b32 {%rs148, %rs156}, %r144; // begin inline asm { cvt.f32.f16 %f187, %rs148;} // end inline asm fma.rn.ftz.f32 %f354, %f350, %f187, %f343; mov.b32 {%rs149, %rs157}, %r152; // begin inline asm { cvt.f32.f16 %f188, %rs149;} // end inline asm fma.rn.ftz.f32 %f355, %f350, %f188, %f344; mov.b32 {%rs150, %rs158}, %r160; // begin inline asm { cvt.f32.f16 %f189, %rs150;} // end inline asm fma.rn.ftz.f32 %f356, %f350, %f189, %f345; mov.b32 {%rs151, %rs159}, %r168; // begin inline asm { cvt.f32.f16 %f190, %rs151;} // end inline asm fma.rn.ftz.f32 %f357, %f350, %f190, %f346; mov.b32 {%rs152, %rs160}, %r176; // begin inline asm { cvt.f32.f16 %f191, %rs152;} // end inline asm fma.rn.ftz.f32 %f358, %f350, %f191, %f347; cvt.rn.f32.s16 %f359, %rs13; sub.ftz.f32 %f360, %f359, %f10; mul.ftz.f32 %f361, %f87, %f360; // begin inline asm { cvt.f32.f16 %f192, %rs153;} // end inline asm fma.rn.ftz.f32 %f362, %f361, %f192, %f351; // begin inline asm { cvt.f32.f16 %f193, %rs154;} // end inline asm fma.rn.ftz.f32 %f363, %f361, %f193, %f352; // begin inline asm { cvt.f32.f16 %f194, %rs155;} // end inline asm fma.rn.ftz.f32 %f364, %f361, %f194, %f353; // begin inline asm { cvt.f32.f16 %f195, %rs156;} // end inline asm fma.rn.ftz.f32 %f365, %f361, %f195, %f354; // begin inline asm { cvt.f32.f16 %f196, %rs157;} // end inline asm fma.rn.ftz.f32 %f366, %f361, %f196, %f355; // begin inline asm { cvt.f32.f16 %f197, %rs158;} // end inline asm fma.rn.ftz.f32 %f367, %f361, %f197, %f356; // begin inline asm { cvt.f32.f16 %f198, %rs159;} // end inline asm fma.rn.ftz.f32 %f368, %f361, %f198, %f357; // begin inline asm { cvt.f32.f16 %f199, %rs160;} // end inline asm fma.rn.ftz.f32 %f369, %f361, %f199, %f358; cvt.rn.f32.s16 
%f370, %rs14; sub.ftz.f32 %f371, %f370, %f10; mul.ftz.f32 %f372, %f87, %f371; mov.b32 {%rs161, %rs169}, %r121; // begin inline asm { cvt.f32.f16 %f200, %rs161;} // end inline asm fma.rn.ftz.f32 %f373, %f372, %f200, %f362; mov.b32 {%rs162, %rs170}, %r129; // begin inline asm { cvt.f32.f16 %f201, %rs162;} // end inline asm fma.rn.ftz.f32 %f374, %f372, %f201, %f363; mov.b32 {%rs163, %rs171}, %r137; // begin inline asm { cvt.f32.f16 %f202, %rs163;} // end inline asm fma.rn.ftz.f32 %f375, %f372, %f202, %f364; mov.b32 {%rs164, %rs172}, %r145; // begin inline asm { cvt.f32.f16 %f203, %rs164;} // end inline asm fma.rn.ftz.f32 %f376, %f372, %f203, %f365; mov.b32 {%rs165, %rs173}, %r153; // begin inline asm { cvt.f32.f16 %f204, %rs165;} // end inline asm fma.rn.ftz.f32 %f377, %f372, %f204, %f366; mov.b32 {%rs166, %rs174}, %r161; // begin inline asm { cvt.f32.f16 %f205, %rs166;} // end inline asm fma.rn.ftz.f32 %f378, %f372, %f205, %f367; mov.b32 {%rs167, %rs175}, %r169; // begin inline asm { cvt.f32.f16 %f206, %rs167;} // end inline asm fma.rn.ftz.f32 %f379, %f372, %f206, %f368; mov.b32 {%rs168, %rs176}, %r177; // begin inline asm { cvt.f32.f16 %f207, %rs168;} // end inline asm fma.rn.ftz.f32 %f380, %f372, %f207, %f369; shr.u32 %r182, %r66, 28; cvt.u16.u32 %rs186, %r182; cvt.rn.f32.s16 %f381, %rs186; sub.ftz.f32 %f382, %f381, %f10; mul.ftz.f32 %f383, %f87, %f382; // begin inline asm { cvt.f32.f16 %f208, %rs169;} // end inline asm fma.rn.ftz.f32 %f879, %f383, %f208, %f373; // begin inline asm { cvt.f32.f16 %f209, %rs170;} // end inline asm fma.rn.ftz.f32 %f878, %f383, %f209, %f374; // begin inline asm { cvt.f32.f16 %f210, %rs171;} // end inline asm fma.rn.ftz.f32 %f877, %f383, %f210, %f375; // begin inline asm { cvt.f32.f16 %f211, %rs172;} // end inline asm fma.rn.ftz.f32 %f876, %f383, %f211, %f376; // begin inline asm { cvt.f32.f16 %f212, %rs173;} // end inline asm fma.rn.ftz.f32 %f875, %f383, %f212, %f377; // begin inline asm { cvt.f32.f16 %f213, %rs174;} // end inline asm 
fma.rn.ftz.f32 %f874, %f383, %f213, %f378; // begin inline asm { cvt.f32.f16 %f214, %rs175;} // end inline asm fma.rn.ftz.f32 %f873, %f383, %f214, %f379; // begin inline asm { cvt.f32.f16 %f215, %rs176;} // end inline asm fma.rn.ftz.f32 %f872, %f383, %f215, %f380; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs316, %rs39, 4; cvt.s16.s8 %rs317, %rs316; shr.s16 %rs318, %rs317, 7; and.b16 %rs319, %rs318, -16; and.b16 %rs320, %rs39, 15; or.b16 %rs321, %rs319, %rs320; cvt.rn.f32.s16 %f512, %rs321; sub.ftz.f32 %f513, %f512, %f10; mul.ftz.f32 %f514, %f87, %f513; mov.b32 {%rs187, %rs195}, %r77; // begin inline asm { cvt.f32.f16 %f384, %rs187;} // end inline asm fma.rn.ftz.f32 %f515, %f514, %f384, %f879; mov.b32 {%rs188, %rs196}, %r81; // begin inline asm { cvt.f32.f16 %f385, %rs188;} // end inline asm fma.rn.ftz.f32 %f516, %f514, %f385, %f878; mov.b32 {%rs189, %rs197}, %r87; // begin inline asm { cvt.f32.f16 %f386, %rs189;} // end inline asm fma.rn.ftz.f32 %f517, %f514, %f386, %f877; mov.b32 {%rs190, %rs198}, %r91; // begin inline asm { cvt.f32.f16 %f387, %rs190;} // end inline asm fma.rn.ftz.f32 %f518, %f514, %f387, %f876; mov.b32 {%rs191, %rs199}, %r95; // begin inline asm { cvt.f32.f16 %f388, %rs191;} // end inline asm fma.rn.ftz.f32 %f519, %f514, %f388, %f875; mov.b32 {%rs192, %rs200}, %r99; // begin inline asm { cvt.f32.f16 %f389, %rs192;} // end inline asm fma.rn.ftz.f32 %f520, %f514, %f389, %f874; mov.b32 {%rs193, %rs201}, %r103; // begin inline asm { cvt.f32.f16 %f390, %rs193;} // end inline asm fma.rn.ftz.f32 %f521, %f514, %f390, %f873; mov.b32 {%rs194, %rs202}, %r107; // begin inline asm { cvt.f32.f16 %f391, %rs194;} // end inline asm fma.rn.ftz.f32 %f522, %f514, %f391, %f872; cvt.s16.s8 %rs322, %rs39; shr.s16 %rs323, %rs322, 7; and.b16 %rs324, %rs323, -16; or.b16 %rs325, %rs324, %rs5; cvt.rn.f32.s16 %f523, %rs325; sub.ftz.f32 %f524, %f523, %f10; mul.ftz.f32 %f525, %f87, %f524; // begin inline asm { cvt.f32.f16 %f392, %rs195;} // end inline asm fma.rn.ftz.f32 %f526, 
%f525, %f392, %f515; // begin inline asm { cvt.f32.f16 %f393, %rs196;} // end inline asm fma.rn.ftz.f32 %f527, %f525, %f393, %f516; // begin inline asm { cvt.f32.f16 %f394, %rs197;} // end inline asm fma.rn.ftz.f32 %f528, %f525, %f394, %f517; // begin inline asm { cvt.f32.f16 %f395, %rs198;} // end inline asm fma.rn.ftz.f32 %f529, %f525, %f395, %f518; // begin inline asm { cvt.f32.f16 %f396, %rs199;} // end inline asm fma.rn.ftz.f32 %f530, %f525, %f396, %f519; // begin inline asm { cvt.f32.f16 %f397, %rs200;} // end inline asm fma.rn.ftz.f32 %f531, %f525, %f397, %f520; // begin inline asm { cvt.f32.f16 %f398, %rs201;} // end inline asm fma.rn.ftz.f32 %f532, %f525, %f398, %f521; // begin inline asm { cvt.f32.f16 %f399, %rs202;} // end inline asm fma.rn.ftz.f32 %f533, %f525, %f399, %f522; shr.u32 %r183, %r65, 8; cvt.u16.u32 %rs326, %r183; shl.b16 %rs327, %rs326, 4; cvt.s16.s8 %rs328, %rs327; shr.s16 %rs329, %rs328, 7; and.b16 %rs330, %rs329, -16; and.b16 %rs331, %rs326, 15; or.b16 %rs332, %rs330, %rs331; cvt.rn.f32.s16 %f534, %rs332; sub.ftz.f32 %f535, %f534, %f10; mul.ftz.f32 %f536, %f87, %f535; mov.b32 {%rs203, %rs211}, %r78; // begin inline asm { cvt.f32.f16 %f400, %rs203;} // end inline asm fma.rn.ftz.f32 %f537, %f536, %f400, %f526; mov.b32 {%rs204, %rs212}, %r82; // begin inline asm { cvt.f32.f16 %f401, %rs204;} // end inline asm fma.rn.ftz.f32 %f538, %f536, %f401, %f527; mov.b32 {%rs205, %rs213}, %r88; // begin inline asm { cvt.f32.f16 %f402, %rs205;} // end inline asm fma.rn.ftz.f32 %f539, %f536, %f402, %f528; mov.b32 {%rs206, %rs214}, %r92; // begin inline asm { cvt.f32.f16 %f403, %rs206;} // end inline asm fma.rn.ftz.f32 %f540, %f536, %f403, %f529; mov.b32 {%rs207, %rs215}, %r96; // begin inline asm { cvt.f32.f16 %f404, %rs207;} // end inline asm fma.rn.ftz.f32 %f541, %f536, %f404, %f530; mov.b32 {%rs208, %rs216}, %r100; // begin inline asm { cvt.f32.f16 %f405, %rs208;} // end inline asm fma.rn.ftz.f32 %f542, %f536, %f405, %f531; mov.b32 {%rs209, %rs217}, 
%r104; // begin inline asm { cvt.f32.f16 %f406, %rs209;} // end inline asm fma.rn.ftz.f32 %f543, %f536, %f406, %f532; mov.b32 {%rs210, %rs218}, %r108; // begin inline asm { cvt.f32.f16 %f407, %rs210;} // end inline asm fma.rn.ftz.f32 %f544, %f536, %f407, %f533; shr.s16 %rs333, %rs39, 15; and.b16 %rs334, %rs333, -16; or.b16 %rs335, %rs334, %rs6; cvt.rn.f32.s16 %f545, %rs335; sub.ftz.f32 %f546, %f545, %f10; mul.ftz.f32 %f547, %f87, %f546; // begin inline asm { cvt.f32.f16 %f408, %rs211;} // end inline asm fma.rn.ftz.f32 %f548, %f547, %f408, %f537; // begin inline asm { cvt.f32.f16 %f409, %rs212;} // end inline asm fma.rn.ftz.f32 %f549, %f547, %f409, %f538; // begin inline asm { cvt.f32.f16 %f410, %rs213;} // end inline asm fma.rn.ftz.f32 %f550, %f547, %f410, %f539; // begin inline asm { cvt.f32.f16 %f411, %rs214;} // end inline asm fma.rn.ftz.f32 %f551, %f547, %f411, %f540; // begin inline asm { cvt.f32.f16 %f412, %rs215;} // end inline asm fma.rn.ftz.f32 %f552, %f547, %f412, %f541; // begin inline asm { cvt.f32.f16 %f413, %rs216;} // end inline asm fma.rn.ftz.f32 %f553, %f547, %f413, %f542; // begin inline asm { cvt.f32.f16 %f414, %rs217;} // end inline asm fma.rn.ftz.f32 %f554, %f547, %f414, %f543; // begin inline asm { cvt.f32.f16 %f415, %rs218;} // end inline asm fma.rn.ftz.f32 %f555, %f547, %f415, %f544; shl.b16 %rs337, %rs41, 4; cvt.s16.s8 %rs338, %rs337; shr.s16 %rs339, %rs338, 7; and.b16 %rs340, %rs339, -16; or.b16 %rs341, %rs340, %rs7; cvt.rn.f32.s16 %f556, %rs341; sub.ftz.f32 %f557, %f556, %f10; mul.ftz.f32 %f558, %f87, %f557; mov.b32 {%rs219, %rs227}, %r79; // begin inline asm { cvt.f32.f16 %f416, %rs219;} // end inline asm fma.rn.ftz.f32 %f559, %f558, %f416, %f548; mov.b32 {%rs220, %rs228}, %r83; // begin inline asm { cvt.f32.f16 %f417, %rs220;} // end inline asm fma.rn.ftz.f32 %f560, %f558, %f417, %f549; mov.b32 {%rs221, %rs229}, %r89; // begin inline asm { cvt.f32.f16 %f418, %rs221;} // end inline asm fma.rn.ftz.f32 %f561, %f558, %f418, %f550; mov.b32 
{%rs222, %rs230}, %r93; // begin inline asm { cvt.f32.f16 %f419, %rs222;} // end inline asm fma.rn.ftz.f32 %f562, %f558, %f419, %f551; mov.b32 {%rs223, %rs231}, %r97; // begin inline asm { cvt.f32.f16 %f420, %rs223;} // end inline asm fma.rn.ftz.f32 %f563, %f558, %f420, %f552; mov.b32 {%rs224, %rs232}, %r101; // begin inline asm { cvt.f32.f16 %f421, %rs224;} // end inline asm fma.rn.ftz.f32 %f564, %f558, %f421, %f553; mov.b32 {%rs225, %rs233}, %r105; // begin inline asm { cvt.f32.f16 %f422, %rs225;} // end inline asm fma.rn.ftz.f32 %f565, %f558, %f422, %f554; mov.b32 {%rs226, %rs234}, %r109; // begin inline asm { cvt.f32.f16 %f423, %rs226;} // end inline asm fma.rn.ftz.f32 %f566, %f558, %f423, %f555; shr.u32 %r185, %r65, 20; cvt.u16.u32 %rs342, %r185; shl.b16 %rs343, %rs342, 4; cvt.s16.s8 %rs344, %rs343; shr.s16 %rs345, %rs344, 7; and.b16 %rs346, %rs345, -16; and.b16 %rs347, %rs342, 15; or.b16 %rs348, %rs346, %rs347; cvt.rn.f32.s16 %f567, %rs348; sub.ftz.f32 %f568, %f567, %f10; mul.ftz.f32 %f569, %f87, %f568; // begin inline asm { cvt.f32.f16 %f424, %rs227;} // end inline asm fma.rn.ftz.f32 %f570, %f569, %f424, %f559; // begin inline asm { cvt.f32.f16 %f425, %rs228;} // end inline asm fma.rn.ftz.f32 %f571, %f569, %f425, %f560; // begin inline asm { cvt.f32.f16 %f426, %rs229;} // end inline asm fma.rn.ftz.f32 %f572, %f569, %f426, %f561; // begin inline asm { cvt.f32.f16 %f427, %rs230;} // end inline asm fma.rn.ftz.f32 %f573, %f569, %f427, %f562; // begin inline asm { cvt.f32.f16 %f428, %rs231;} // end inline asm fma.rn.ftz.f32 %f574, %f569, %f428, %f563; // begin inline asm { cvt.f32.f16 %f429, %rs232;} // end inline asm fma.rn.ftz.f32 %f575, %f569, %f429, %f564; // begin inline asm { cvt.f32.f16 %f430, %rs233;} // end inline asm fma.rn.ftz.f32 %f576, %f569, %f430, %f565; // begin inline asm { cvt.f32.f16 %f431, %rs234;} // end inline asm fma.rn.ftz.f32 %f577, %f569, %f431, %f566; shl.b16 %rs350, %rs42, 4; cvt.s16.s8 %rs351, %rs350; shr.s16 %rs352, %rs351, 7; 
and.b16 %rs353, %rs352, -16; or.b16 %rs354, %rs353, %rs8; cvt.rn.f32.s16 %f578, %rs354; sub.ftz.f32 %f579, %f578, %f10; mul.ftz.f32 %f580, %f87, %f579; mov.b32 {%rs235, %rs243}, %r80; // begin inline asm { cvt.f32.f16 %f432, %rs235;} // end inline asm fma.rn.ftz.f32 %f581, %f580, %f432, %f570; mov.b32 {%rs236, %rs244}, %r84; // begin inline asm { cvt.f32.f16 %f433, %rs236;} // end inline asm fma.rn.ftz.f32 %f582, %f580, %f433, %f571; mov.b32 {%rs237, %rs245}, %r90; // begin inline asm { cvt.f32.f16 %f434, %rs237;} // end inline asm fma.rn.ftz.f32 %f583, %f580, %f434, %f572; mov.b32 {%rs238, %rs246}, %r94; // begin inline asm { cvt.f32.f16 %f435, %rs238;} // end inline asm fma.rn.ftz.f32 %f584, %f580, %f435, %f573; mov.b32 {%rs239, %rs247}, %r98; // begin inline asm { cvt.f32.f16 %f436, %rs239;} // end inline asm fma.rn.ftz.f32 %f585, %f580, %f436, %f574; mov.b32 {%rs240, %rs248}, %r102; // begin inline asm { cvt.f32.f16 %f437, %rs240;} // end inline asm fma.rn.ftz.f32 %f586, %f580, %f437, %f575; mov.b32 {%rs241, %rs249}, %r106; // begin inline asm { cvt.f32.f16 %f438, %rs241;} // end inline asm fma.rn.ftz.f32 %f587, %f580, %f438, %f576; mov.b32 {%rs242, %rs250}, %r110; // begin inline asm { cvt.f32.f16 %f439, %rs242;} // end inline asm fma.rn.ftz.f32 %f588, %f580, %f439, %f577; shr.u32 %r187, %r65, 28; cvt.u16.u32 %rs355, %r187; shl.b16 %rs356, %rs355, 4; cvt.s16.s8 %rs357, %rs356; shr.s16 %rs358, %rs357, 7; and.b16 %rs359, %rs358, -16; or.b16 %rs360, %rs359, %rs355; cvt.rn.f32.s16 %f589, %rs360; sub.ftz.f32 %f590, %f589, %f10; mul.ftz.f32 %f591, %f87, %f590; // begin inline asm { cvt.f32.f16 %f440, %rs243;} // end inline asm fma.rn.ftz.f32 %f592, %f591, %f440, %f581; // begin inline asm { cvt.f32.f16 %f441, %rs244;} // end inline asm fma.rn.ftz.f32 %f593, %f591, %f441, %f582; // begin inline asm { cvt.f32.f16 %f442, %rs245;} // end inline asm fma.rn.ftz.f32 %f594, %f591, %f442, %f583; // begin inline asm { cvt.f32.f16 %f443, %rs246;} // end inline asm 
fma.rn.ftz.f32 %f595, %f591, %f443, %f584; // begin inline asm { cvt.f32.f16 %f444, %rs247;} // end inline asm fma.rn.ftz.f32 %f596, %f591, %f444, %f585; // begin inline asm { cvt.f32.f16 %f445, %rs248;} // end inline asm fma.rn.ftz.f32 %f597, %f591, %f445, %f586; // begin inline asm { cvt.f32.f16 %f446, %rs249;} // end inline asm fma.rn.ftz.f32 %f598, %f591, %f446, %f587; // begin inline asm { cvt.f32.f16 %f447, %rs250;} // end inline asm fma.rn.ftz.f32 %f599, %f591, %f447, %f588; ld.global.v4.u32 {%r188, %r189, %r190, %r191}, [%rd5+16]; ld.global.v4.u32 {%r196, %r197, %r198, %r199}, [%rd6+16]; ld.global.v4.u32 {%r204, %r205, %r206, %r207}, [%rd7]; ld.global.v4.u32 {%r212, %r213, %r214, %r215}, [%rd8]; ld.global.v4.u32 {%r220, %r221, %r222, %r223}, [%rd9]; ld.global.v4.u32 {%r228, %r229, %r230, %r231}, [%rd10]; ld.global.v4.u32 {%r236, %r237, %r238, %r239}, [%rd11]; ld.global.v4.u32 {%r244, %r245, %r246, %r247}, [%rd12]; shl.b16 %rs362, %rs43, 4; cvt.s16.s8 %rs363, %rs362; shr.s16 %rs364, %rs363, 7; and.b16 %rs365, %rs364, -16; and.b16 %rs366, %rs43, 15; or.b16 %rs367, %rs365, %rs366; cvt.rn.f32.s16 %f600, %rs367; sub.ftz.f32 %f601, %f600, %f10; mul.ftz.f32 %f602, %f87, %f601; mov.b32 {%rs251, %rs259}, %r188; // begin inline asm { cvt.f32.f16 %f448, %rs251;} // end inline asm fma.rn.ftz.f32 %f603, %f602, %f448, %f592; mov.b32 {%rs252, %rs260}, %r196; // begin inline asm { cvt.f32.f16 %f449, %rs252;} // end inline asm fma.rn.ftz.f32 %f604, %f602, %f449, %f593; mov.b32 {%rs253, %rs261}, %r204; // begin inline asm { cvt.f32.f16 %f450, %rs253;} // end inline asm fma.rn.ftz.f32 %f605, %f602, %f450, %f594; mov.b32 {%rs254, %rs262}, %r212; // begin inline asm { cvt.f32.f16 %f451, %rs254;} // end inline asm fma.rn.ftz.f32 %f606, %f602, %f451, %f595; mov.b32 {%rs255, %rs263}, %r220; // begin inline asm { cvt.f32.f16 %f452, %rs255;} // end inline asm fma.rn.ftz.f32 %f607, %f602, %f452, %f596; mov.b32 {%rs256, %rs264}, %r228; // begin inline asm { cvt.f32.f16 %f453, %rs256;} 
// end inline asm fma.rn.ftz.f32 %f608, %f602, %f453, %f597; mov.b32 {%rs257, %rs265}, %r236; // begin inline asm { cvt.f32.f16 %f454, %rs257;} // end inline asm fma.rn.ftz.f32 %f609, %f602, %f454, %f598; mov.b32 {%rs258, %rs266}, %r244; // begin inline asm { cvt.f32.f16 %f455, %rs258;} // end inline asm fma.rn.ftz.f32 %f610, %f602, %f455, %f599; cvt.s16.s8 %rs368, %rs43; shr.s16 %rs369, %rs368, 7; and.b16 %rs370, %rs369, -16; or.b16 %rs371, %rs370, %rs9; cvt.rn.f32.s16 %f611, %rs371; sub.ftz.f32 %f612, %f611, %f10; mul.ftz.f32 %f613, %f87, %f612; // begin inline asm { cvt.f32.f16 %f456, %rs259;} // end inline asm fma.rn.ftz.f32 %f614, %f613, %f456, %f603; // begin inline asm { cvt.f32.f16 %f457, %rs260;} // end inline asm fma.rn.ftz.f32 %f615, %f613, %f457, %f604; // begin inline asm { cvt.f32.f16 %f458, %rs261;} // end inline asm fma.rn.ftz.f32 %f616, %f613, %f458, %f605; // begin inline asm { cvt.f32.f16 %f459, %rs262;} // end inline asm fma.rn.ftz.f32 %f617, %f613, %f459, %f606; // begin inline asm { cvt.f32.f16 %f460, %rs263;} // end inline asm fma.rn.ftz.f32 %f618, %f613, %f460, %f607; // begin inline asm { cvt.f32.f16 %f461, %rs264;} // end inline asm fma.rn.ftz.f32 %f619, %f613, %f461, %f608; // begin inline asm { cvt.f32.f16 %f462, %rs265;} // end inline asm fma.rn.ftz.f32 %f620, %f613, %f462, %f609; // begin inline asm { cvt.f32.f16 %f463, %rs266;} // end inline asm fma.rn.ftz.f32 %f621, %f613, %f463, %f610; cvt.s16.s8 %rs373, %rs44; shr.s16 %rs374, %rs373, 7; and.b16 %rs375, %rs374, -16; or.b16 %rs376, %rs375, %rs10; cvt.rn.f32.s16 %f622, %rs376; sub.ftz.f32 %f623, %f622, %f10; mul.ftz.f32 %f624, %f87, %f623; mov.b32 {%rs267, %rs275}, %r189; // begin inline asm { cvt.f32.f16 %f464, %rs267;} // end inline asm fma.rn.ftz.f32 %f625, %f624, %f464, %f614; mov.b32 {%rs268, %rs276}, %r197; // begin inline asm { cvt.f32.f16 %f465, %rs268;} // end inline asm fma.rn.ftz.f32 %f626, %f624, %f465, %f615; mov.b32 {%rs269, %rs277}, %r205; // begin inline asm { 
cvt.f32.f16 %f466, %rs269;} // end inline asm fma.rn.ftz.f32 %f627, %f624, %f466, %f616; mov.b32 {%rs270, %rs278}, %r213; // begin inline asm { cvt.f32.f16 %f467, %rs270;} // end inline asm fma.rn.ftz.f32 %f628, %f624, %f467, %f617; mov.b32 {%rs271, %rs279}, %r221; // begin inline asm { cvt.f32.f16 %f468, %rs271;} // end inline asm fma.rn.ftz.f32 %f629, %f624, %f468, %f618; mov.b32 {%rs272, %rs280}, %r229; // begin inline asm { cvt.f32.f16 %f469, %rs272;} // end inline asm fma.rn.ftz.f32 %f630, %f624, %f469, %f619; mov.b32 {%rs273, %rs281}, %r237; // begin inline asm { cvt.f32.f16 %f470, %rs273;} // end inline asm fma.rn.ftz.f32 %f631, %f624, %f470, %f620; mov.b32 {%rs274, %rs282}, %r245; // begin inline asm { cvt.f32.f16 %f471, %rs274;} // end inline asm fma.rn.ftz.f32 %f632, %f624, %f471, %f621; shr.s16 %rs377, %rs43, 15; and.b16 %rs378, %rs377, -16; or.b16 %rs379, %rs378, %rs11; cvt.rn.f32.s16 %f633, %rs379; sub.ftz.f32 %f634, %f633, %f10; mul.ftz.f32 %f635, %f87, %f634; // begin inline asm { cvt.f32.f16 %f472, %rs275;} // end inline asm fma.rn.ftz.f32 %f636, %f635, %f472, %f625; // begin inline asm { cvt.f32.f16 %f473, %rs276;} // end inline asm fma.rn.ftz.f32 %f637, %f635, %f473, %f626; // begin inline asm { cvt.f32.f16 %f474, %rs277;} // end inline asm fma.rn.ftz.f32 %f638, %f635, %f474, %f627; // begin inline asm { cvt.f32.f16 %f475, %rs278;} // end inline asm fma.rn.ftz.f32 %f639, %f635, %f475, %f628; // begin inline asm { cvt.f32.f16 %f476, %rs279;} // end inline asm fma.rn.ftz.f32 %f640, %f635, %f476, %f629; // begin inline asm { cvt.f32.f16 %f477, %rs280;} // end inline asm fma.rn.ftz.f32 %f641, %f635, %f477, %f630; // begin inline asm { cvt.f32.f16 %f478, %rs281;} // end inline asm fma.rn.ftz.f32 %f642, %f635, %f478, %f631; // begin inline asm { cvt.f32.f16 %f479, %rs282;} // end inline asm fma.rn.ftz.f32 %f643, %f635, %f479, %f632; shl.b16 %rs381, %rs46, 4; cvt.s16.s8 %rs382, %rs381; shr.s16 %rs383, %rs382, 7; and.b16 %rs384, %rs383, -16; or.b16 
%rs385, %rs384, %rs12; cvt.rn.f32.s16 %f644, %rs385; sub.ftz.f32 %f645, %f644, %f10; mul.ftz.f32 %f646, %f87, %f645; mov.b32 {%rs283, %rs291}, %r190; // begin inline asm { cvt.f32.f16 %f480, %rs283;} // end inline asm fma.rn.ftz.f32 %f647, %f646, %f480, %f636; mov.b32 {%rs284, %rs292}, %r198; // begin inline asm { cvt.f32.f16 %f481, %rs284;} // end inline asm fma.rn.ftz.f32 %f648, %f646, %f481, %f637; mov.b32 {%rs285, %rs293}, %r206; // begin inline asm { cvt.f32.f16 %f482, %rs285;} // end inline asm fma.rn.ftz.f32 %f649, %f646, %f482, %f638; mov.b32 {%rs286, %rs294}, %r214; // begin inline asm { cvt.f32.f16 %f483, %rs286;} // end inline asm fma.rn.ftz.f32 %f650, %f646, %f483, %f639; mov.b32 {%rs287, %rs295}, %r222; // begin inline asm { cvt.f32.f16 %f484, %rs287;} // end inline asm fma.rn.ftz.f32 %f651, %f646, %f484, %f640; mov.b32 {%rs288, %rs296}, %r230; // begin inline asm { cvt.f32.f16 %f485, %rs288;} // end inline asm fma.rn.ftz.f32 %f652, %f646, %f485, %f641; mov.b32 {%rs289, %rs297}, %r238; // begin inline asm { cvt.f32.f16 %f486, %rs289;} // end inline asm fma.rn.ftz.f32 %f653, %f646, %f486, %f642; mov.b32 {%rs290, %rs298}, %r246; // begin inline asm { cvt.f32.f16 %f487, %rs290;} // end inline asm fma.rn.ftz.f32 %f654, %f646, %f487, %f643; shl.b16 %rs387, %rs47, 4; cvt.s16.s8 %rs388, %rs387; shr.s16 %rs389, %rs388, 7; and.b16 %rs390, %rs389, -16; or.b16 %rs391, %rs390, %rs13; cvt.rn.f32.s16 %f655, %rs391; sub.ftz.f32 %f656, %f655, %f10; mul.ftz.f32 %f657, %f87, %f656; // begin inline asm { cvt.f32.f16 %f488, %rs291;} // end inline asm fma.rn.ftz.f32 %f658, %f657, %f488, %f647; // begin inline asm { cvt.f32.f16 %f489, %rs292;} // end inline asm fma.rn.ftz.f32 %f659, %f657, %f489, %f648; // begin inline asm { cvt.f32.f16 %f490, %rs293;} // end inline asm fma.rn.ftz.f32 %f660, %f657, %f490, %f649; // begin inline asm { cvt.f32.f16 %f491, %rs294;} // end inline asm fma.rn.ftz.f32 %f661, %f657, %f491, %f650; // begin inline asm { cvt.f32.f16 %f492, %rs295;} // 
end inline asm fma.rn.ftz.f32 %f662, %f657, %f492, %f651; // begin inline asm { cvt.f32.f16 %f493, %rs296;} // end inline asm fma.rn.ftz.f32 %f663, %f657, %f493, %f652; // begin inline asm { cvt.f32.f16 %f494, %rs297;} // end inline asm fma.rn.ftz.f32 %f664, %f657, %f494, %f653; // begin inline asm { cvt.f32.f16 %f495, %rs298;} // end inline asm fma.rn.ftz.f32 %f665, %f657, %f495, %f654; shl.b16 %rs393, %rs48, 4; cvt.s16.s8 %rs394, %rs393; shr.s16 %rs395, %rs394, 7; and.b16 %rs396, %rs395, -16; or.b16 %rs397, %rs396, %rs14; cvt.rn.f32.s16 %f666, %rs397; sub.ftz.f32 %f667, %f666, %f10; mul.ftz.f32 %f668, %f87, %f667; mov.b32 {%rs299, %rs307}, %r191; // begin inline asm { cvt.f32.f16 %f496, %rs299;} // end inline asm fma.rn.ftz.f32 %f669, %f668, %f496, %f658; mov.b32 {%rs300, %rs308}, %r199; // begin inline asm { cvt.f32.f16 %f497, %rs300;} // end inline asm fma.rn.ftz.f32 %f670, %f668, %f497, %f659; mov.b32 {%rs301, %rs309}, %r207; // begin inline asm { cvt.f32.f16 %f498, %rs301;} // end inline asm fma.rn.ftz.f32 %f671, %f668, %f498, %f660; mov.b32 {%rs302, %rs310}, %r215; // begin inline asm { cvt.f32.f16 %f499, %rs302;} // end inline asm fma.rn.ftz.f32 %f672, %f668, %f499, %f661; mov.b32 {%rs303, %rs311}, %r223; // begin inline asm { cvt.f32.f16 %f500, %rs303;} // end inline asm fma.rn.ftz.f32 %f673, %f668, %f500, %f662; mov.b32 {%rs304, %rs312}, %r231; // begin inline asm { cvt.f32.f16 %f501, %rs304;} // end inline asm fma.rn.ftz.f32 %f674, %f668, %f501, %f663; mov.b32 {%rs305, %rs313}, %r239; // begin inline asm { cvt.f32.f16 %f502, %rs305;} // end inline asm fma.rn.ftz.f32 %f675, %f668, %f502, %f664; mov.b32 {%rs306, %rs314}, %r247; // begin inline asm { cvt.f32.f16 %f503, %rs306;} // end inline asm fma.rn.ftz.f32 %f676, %f668, %f503, %f665; shr.u32 %r255, %r66, 28; cvt.u16.u32 %rs398, %r255; shl.b16 %rs399, %rs398, 4; cvt.s16.s8 %rs400, %rs399; shr.s16 %rs401, %rs400, 7; and.b16 %rs402, %rs401, -16; or.b16 %rs403, %rs402, %rs398; cvt.rn.f32.s16 %f677, %rs403; 
sub.ftz.f32 %f678, %f677, %f10; mul.ftz.f32 %f679, %f87, %f678; // begin inline asm { cvt.f32.f16 %f504, %rs307;} // end inline asm fma.rn.ftz.f32 %f879, %f679, %f504, %f669; // begin inline asm { cvt.f32.f16 %f505, %rs308;} // end inline asm fma.rn.ftz.f32 %f878, %f679, %f505, %f670; // begin inline asm { cvt.f32.f16 %f506, %rs309;} // end inline asm fma.rn.ftz.f32 %f877, %f679, %f506, %f671; // begin inline asm { cvt.f32.f16 %f507, %rs310;} // end inline asm fma.rn.ftz.f32 %f876, %f679, %f507, %f672; // begin inline asm { cvt.f32.f16 %f508, %rs311;} // end inline asm fma.rn.ftz.f32 %f875, %f679, %f508, %f673; // begin inline asm { cvt.f32.f16 %f509, %rs312;} // end inline asm fma.rn.ftz.f32 %f874, %f679, %f509, %f674; // begin inline asm { cvt.f32.f16 %f510, %rs313;} // end inline asm fma.rn.ftz.f32 %f873, %f679, %f510, %f675; // begin inline asm { cvt.f32.f16 %f511, %rs314;} // end inline asm fma.rn.ftz.f32 %f872, %f679, %f511, %f676; $L__BB0_8: add.s32 %r414, %r414, 4; shl.b32 %r256, %r414, 5; add.s32 %r413, %r256, %r61; shl.b32 %r412, %r413, 1; setp.lt.u32 %p7, %r412, %r58; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r407, %tid.y; mov.u32 %r406, %tid.x; shl.b32 %r405, %r407, 5; add.s32 %r404, %r405, %r406; shl.b32 %r258, %r404, 2; mov.u32 %r259, _ZZ9gemv_int4ILi4ELi64ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r260, %r259, %r258; setp.lt.u32 %p8, %r404, 32; @%p8 bra $L__BB0_11; add.s32 %r396, %r260, -112; st.shared.f32 [%r396], %f879; $L__BB0_11: mov.u32 %r411, %tid.y; mov.u32 %r410, %tid.x; shl.b32 %r409, %r411, 5; add.s32 %r408, %r409, %r410; setp.gt.u32 %p9, %r408, 31; bar.sync 0; mad.lo.s32 %r51, %r408, 12, %r259; @%p9 bra $L__BB0_13; mov.u32 %r275, 16; ld.shared.f32 %f695, [%r51+16]; add.ftz.f32 %f696, %f879, %f695; ld.shared.f32 %f697, [%r51+20]; add.ftz.f32 %f698, %f696, %f697; ld.shared.f32 %f699, [%r51+24]; add.ftz.f32 %f682, %f698, %f699; mov.u32 %r263, 1; mov.u32 %r276, 31; mov.u32 %r277, -1; // begin inline asm { .reg .f32 r0; .reg 
.pred p; shfl.sync.down.b32 r0|p, %f682, %r263, %r276, %r277; @p add.f32 r0, r0, %f682; mov.f32 %f680, r0;} // end inline asm mov.u32 %r266, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f680, %r266, %r276, %r277; @p add.f32 r0, r0, %f680; mov.f32 %f683, r0;} // end inline asm mov.u32 %r269, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f683, %r269, %r276, %r277; @p add.f32 r0, r0, %f683; mov.f32 %f686, r0;} // end inline asm mov.u32 %r272, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f686, %r272, %r276, %r277; @p add.f32 r0, r0, %f686; mov.f32 %f689, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f689, %r275, %r276, %r277; @p add.f32 r0, r0, %f689; mov.f32 %f879, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r397, %r260, -112; st.shared.f32 [%r397+640], %f878; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f715, [%r51+656]; add.ftz.f32 %f716, %f878, %f715; ld.shared.f32 %f717, [%r51+660]; add.ftz.f32 %f718, %f716, %f717; ld.shared.f32 %f719, [%r51+664]; add.ftz.f32 %f702, %f718, %f719; mov.u32 %r279, 1; mov.u32 %r292, 31; mov.u32 %r293, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f702, %r279, %r292, %r293; @p add.f32 r0, r0, %f702; mov.f32 %f700, r0;} // end inline asm mov.u32 %r282, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f700, %r282, %r292, %r293; @p add.f32 r0, r0, %f700; mov.f32 %f703, r0;} // end inline asm mov.u32 %r285, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f703, %r285, %r292, %r293; @p add.f32 r0, r0, %f703; mov.f32 %f706, r0;} // end inline asm mov.u32 %r288, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f706, %r288, %r292, %r293; @p add.f32 r0, r0, %f706; mov.f32 %f709, r0;} // end inline asm mov.u32 %r291, 16; // begin inline 
asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f709, %r291, %r292, %r293; @p add.f32 r0, r0, %f709; mov.f32 %f878, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r398, %r260, -112; st.shared.f32 [%r398+1280], %f877; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f735, [%r51+1296]; add.ftz.f32 %f736, %f877, %f735; ld.shared.f32 %f737, [%r51+1300]; add.ftz.f32 %f738, %f736, %f737; ld.shared.f32 %f739, [%r51+1304]; add.ftz.f32 %f722, %f738, %f739; mov.u32 %r295, 1; mov.u32 %r308, 31; mov.u32 %r309, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f722, %r295, %r308, %r309; @p add.f32 r0, r0, %f722; mov.f32 %f720, r0;} // end inline asm mov.u32 %r298, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f720, %r298, %r308, %r309; @p add.f32 r0, r0, %f720; mov.f32 %f723, r0;} // end inline asm mov.u32 %r301, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f723, %r301, %r308, %r309; @p add.f32 r0, r0, %f723; mov.f32 %f726, r0;} // end inline asm mov.u32 %r304, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f726, %r304, %r308, %r309; @p add.f32 r0, r0, %f726; mov.f32 %f729, r0;} // end inline asm mov.u32 %r307, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f729, %r307, %r308, %r309; @p add.f32 r0, r0, %f729; mov.f32 %f877, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r399, %r260, -112; st.shared.f32 [%r399+1920], %f876; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f755, [%r51+1936]; add.ftz.f32 %f756, %f876, %f755; ld.shared.f32 %f757, [%r51+1940]; add.ftz.f32 %f758, %f756, %f757; ld.shared.f32 %f759, [%r51+1944]; add.ftz.f32 %f742, %f758, %f759; mov.u32 %r311, 1; mov.u32 %r324, 31; mov.u32 %r325, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f742, %r311, %r324, %r325; @p add.f32 r0, r0, %f742; mov.f32 %f740, 
r0;} // end inline asm mov.u32 %r314, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f740, %r314, %r324, %r325; @p add.f32 r0, r0, %f740; mov.f32 %f743, r0;} // end inline asm mov.u32 %r317, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f743, %r317, %r324, %r325; @p add.f32 r0, r0, %f743; mov.f32 %f746, r0;} // end inline asm mov.u32 %r320, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f746, %r320, %r324, %r325; @p add.f32 r0, r0, %f746; mov.f32 %f749, r0;} // end inline asm mov.u32 %r323, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f749, %r323, %r324, %r325; @p add.f32 r0, r0, %f749; mov.f32 %f876, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r400, %r260, -112; st.shared.f32 [%r400+2560], %f875; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f775, [%r51+2576]; add.ftz.f32 %f776, %f875, %f775; ld.shared.f32 %f777, [%r51+2580]; add.ftz.f32 %f778, %f776, %f777; ld.shared.f32 %f779, [%r51+2584]; add.ftz.f32 %f762, %f778, %f779; mov.u32 %r327, 1; mov.u32 %r340, 31; mov.u32 %r341, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f762, %r327, %r340, %r341; @p add.f32 r0, r0, %f762; mov.f32 %f760, r0;} // end inline asm mov.u32 %r330, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f760, %r330, %r340, %r341; @p add.f32 r0, r0, %f760; mov.f32 %f763, r0;} // end inline asm mov.u32 %r333, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f763, %r333, %r340, %r341; @p add.f32 r0, r0, %f763; mov.f32 %f766, r0;} // end inline asm mov.u32 %r336, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f766, %r336, %r340, %r341; @p add.f32 r0, r0, %f766; mov.f32 %f769, r0;} // end inline asm mov.u32 %r339, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f769, %r339, %r340, 
%r341; @p add.f32 r0, r0, %f769; mov.f32 %f875, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r401, %r260, -112; st.shared.f32 [%r401+3200], %f874; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f795, [%r51+3216]; add.ftz.f32 %f796, %f874, %f795; ld.shared.f32 %f797, [%r51+3220]; add.ftz.f32 %f798, %f796, %f797; ld.shared.f32 %f799, [%r51+3224]; add.ftz.f32 %f782, %f798, %f799; mov.u32 %r343, 1; mov.u32 %r356, 31; mov.u32 %r357, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f782, %r343, %r356, %r357; @p add.f32 r0, r0, %f782; mov.f32 %f780, r0;} // end inline asm mov.u32 %r346, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f780, %r346, %r356, %r357; @p add.f32 r0, r0, %f780; mov.f32 %f783, r0;} // end inline asm mov.u32 %r349, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f783, %r349, %r356, %r357; @p add.f32 r0, r0, %f783; mov.f32 %f786, r0;} // end inline asm mov.u32 %r352, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f786, %r352, %r356, %r357; @p add.f32 r0, r0, %f786; mov.f32 %f789, r0;} // end inline asm mov.u32 %r355, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f789, %r355, %r356, %r357; @p add.f32 r0, r0, %f789; mov.f32 %f874, r0;} // end inline asm $L__BB0_33: @%p8 bra $L__BB0_35; add.s32 %r402, %r260, -112; st.shared.f32 [%r402+3840], %f873; $L__BB0_35: bar.sync 0; @%p9 bra $L__BB0_37; ld.shared.f32 %f815, [%r51+3856]; add.ftz.f32 %f816, %f873, %f815; ld.shared.f32 %f817, [%r51+3860]; add.ftz.f32 %f818, %f816, %f817; ld.shared.f32 %f819, [%r51+3864]; add.ftz.f32 %f802, %f818, %f819; mov.u32 %r359, 1; mov.u32 %r372, 31; mov.u32 %r373, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f802, %r359, %r372, %r373; @p add.f32 r0, r0, %f802; mov.f32 %f800, r0;} // end inline asm mov.u32 %r362, 2; // begin inline asm { .reg .f32 r0; 
.reg .pred p; shfl.sync.down.b32 r0|p, %f800, %r362, %r372, %r373; @p add.f32 r0, r0, %f800; mov.f32 %f803, r0;} // end inline asm mov.u32 %r365, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f803, %r365, %r372, %r373; @p add.f32 r0, r0, %f803; mov.f32 %f806, r0;} // end inline asm mov.u32 %r368, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f806, %r368, %r372, %r373; @p add.f32 r0, r0, %f806; mov.f32 %f809, r0;} // end inline asm mov.u32 %r371, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f809, %r371, %r372, %r373; @p add.f32 r0, r0, %f809; mov.f32 %f873, r0;} // end inline asm $L__BB0_37: @%p8 bra $L__BB0_39; add.s32 %r403, %r260, -112; st.shared.f32 [%r403+4480], %f872; $L__BB0_39: bar.sync 0; @%p9 bra $L__BB0_41; ld.shared.f32 %f835, [%r51+4496]; add.ftz.f32 %f836, %f872, %f835; ld.shared.f32 %f837, [%r51+4500]; add.ftz.f32 %f838, %f836, %f837; ld.shared.f32 %f839, [%r51+4504]; add.ftz.f32 %f822, %f838, %f839; mov.u32 %r375, 1; mov.u32 %r388, 31; mov.u32 %r389, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f822, %r375, %r388, %r389; @p add.f32 r0, r0, %f822; mov.f32 %f820, r0;} // end inline asm mov.u32 %r378, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f820, %r378, %r388, %r389; @p add.f32 r0, r0, %f820; mov.f32 %f823, r0;} // end inline asm mov.u32 %r381, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f823, %r381, %r388, %r389; @p add.f32 r0, r0, %f823; mov.f32 %f826, r0;} // end inline asm mov.u32 %r384, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f826, %r384, %r388, %r389; @p add.f32 r0, r0, %f826; mov.f32 %f829, r0;} // end inline asm mov.u32 %r387, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f829, %r387, %r388, %r389; @p add.f32 r0, r0, %f829; mov.f32 %f872, r0;} // end inline asm 
$L__BB0_41: mov.u32 %r390, %tid.y; or.b32 %r392, %r61, %r390; setp.ne.s32 %p24, %r392, 0; @%p24 bra $L__BB0_45; ld.param.u64 %rd72, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd71, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd44, %rd71; setp.eq.s64 %p25, %rd72, 0; mul.ftz.f32 %f59, %f77, %f879; mov.u32 %r393, %ctaid.x; cvt.s64.s32 %rd14, %r393; mul.wide.s32 %rd45, %r393, 2; add.s64 %rd15, %rd44, %rd45; mul.ftz.f32 %f60, %f77, %f878; add.s32 %r394, %r57, %r393; cvt.s64.s32 %rd16, %r57; mul.wide.s32 %rd46, %r57, 2; add.s64 %rd17, %rd15, %rd46; mul.ftz.f32 %f61, %f77, %f877; add.s32 %r395, %r394, %r57; cvt.s64.s32 %rd18, %r395; mul.wide.s32 %rd47, %r395, 2; add.s64 %rd20, %rd44, %rd47; mul.ftz.f32 %f62, %f77, %f876; mul.ftz.f32 %f63, %f77, %f875; mul.ftz.f32 %f64, %f77, %f874; mul.ftz.f32 %f65, %f77, %f873; mul.ftz.f32 %f66, %f77, %f872; @%p25 bra $L__BB0_44; ld.param.u64 %rd73, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd48, %rd73; shl.b64 %rd49, %rd14, 1; add.s64 %rd50, %rd48, %rd49; ld.global.u16 %rs404, [%rd50]; // begin inline asm { cvt.f32.f16 %f840, %rs404;} // end inline asm fma.rn.ftz.f32 %f841, %f78, %f840, %f59; // begin inline asm { cvt.rn.f16.f32 %rs405, %f841;} // end inline asm st.global.u16 [%rd15], %rs405; shl.b64 %rd51, %rd16, 1; add.s64 %rd52, %rd50, %rd51; ld.global.u16 %rs406, [%rd52]; // begin inline asm { cvt.f32.f16 %f842, %rs406;} // end inline asm fma.rn.ftz.f32 %f843, %f78, %f842, %f60; // begin inline asm { cvt.rn.f16.f32 %rs407, %f843;} // end inline asm st.global.u16 [%rd17], %rs407; shl.b64 %rd53, %rd18, 1; add.s64 %rd54, %rd48, %rd53; ld.global.u16 %rs408, [%rd54]; // begin inline asm { cvt.f32.f16 %f844, %rs408;} // end inline asm fma.rn.ftz.f32 %f845, %f78, %f844, %f61; // begin inline asm { cvt.rn.f16.f32 %rs409, %f845;} // end inline asm st.global.u16 [%rd20], %rs409; add.s64 %rd55, %rd54, %rd51; 
ld.global.u16 %rs410, [%rd55]; // begin inline asm { cvt.f32.f16 %f846, %rs410;} // end inline asm fma.rn.ftz.f32 %f847, %f78, %f846, %f62; // begin inline asm { cvt.rn.f16.f32 %rs411, %f847;} // end inline asm add.s64 %rd56, %rd20, %rd51; st.global.u16 [%rd56], %rs411; add.s64 %rd57, %rd55, %rd51; ld.global.u16 %rs412, [%rd57]; // begin inline asm { cvt.f32.f16 %f848, %rs412;} // end inline asm fma.rn.ftz.f32 %f849, %f78, %f848, %f63; // begin inline asm { cvt.rn.f16.f32 %rs413, %f849;} // end inline asm add.s64 %rd58, %rd56, %rd51; st.global.u16 [%rd58], %rs413; add.s64 %rd59, %rd57, %rd51; ld.global.u16 %rs414, [%rd59]; // begin inline asm { cvt.f32.f16 %f850, %rs414;} // end inline asm fma.rn.ftz.f32 %f851, %f78, %f850, %f64; // begin inline asm { cvt.rn.f16.f32 %rs415, %f851;} // end inline asm add.s64 %rd60, %rd58, %rd51; st.global.u16 [%rd60], %rs415; add.s64 %rd61, %rd59, %rd51; ld.global.u16 %rs416, [%rd61]; // begin inline asm { cvt.f32.f16 %f852, %rs416;} // end inline asm fma.rn.ftz.f32 %f853, %f78, %f852, %f65; // begin inline asm { cvt.rn.f16.f32 %rs417, %f853;} // end inline asm add.s64 %rd62, %rd60, %rd51; st.global.u16 [%rd62], %rs417; add.s64 %rd63, %rd61, %rd51; ld.global.u16 %rs418, [%rd63]; // begin inline asm { cvt.f32.f16 %f854, %rs418;} // end inline asm fma.rn.ftz.f32 %f855, %f78, %f854, %f66; // begin inline asm { cvt.rn.f16.f32 %rs419, %f855;} // end inline asm add.s64 %rd64, %rd62, %rd51; st.global.u16 [%rd64], %rs419; bra.uni $L__BB0_45; $L__BB0_44: // begin inline asm { cvt.rn.f16.f32 %rs420, %f59;} // end inline asm st.global.u16 [%rd15], %rs420; // begin inline asm { cvt.rn.f16.f32 %rs421, %f60;} // end inline asm st.global.u16 [%rd17], %rs421; // begin inline asm { cvt.rn.f16.f32 %rs422, %f61;} // end inline asm st.global.u16 [%rd20], %rs422; // begin inline asm { cvt.rn.f16.f32 %rs423, %f62;} // end inline asm shl.b64 %rd65, %rd16, 1; add.s64 %rd66, %rd20, %rd65; st.global.u16 [%rd66], %rs423; // begin inline asm { cvt.rn.f16.f32 
%rs424, %f63;}  // end inline asm (operand tail of the cvt.rn.f16.f32 opened just before)
// ---------------------------------------------------------------------------
// Epilogue tail of _Z27dequant_gemv_group64_batch8... (kernel body begins
// earlier in the file; this is the path taken when no bias/residual pointer
// was supplied).  The remaining scaled reduction results %f63..%f66 are
// rounded f32 -> f16 and stored to global memory; the output pointer is
// advanced by %rd65 bytes between stores.
// NOTE(review): %rd65 is computed above as 2 * %rd16 (shl.b64 on L30), i.e. a
// halfword stride between consecutive outputs -- confirm the exact row/batch
// meaning against the original CUDA source.
// ---------------------------------------------------------------------------
add.s64 %rd67, %rd66, %rd65;            // advance output pointer by one stride
st.global.u16 [%rd67], %rs424;          // store half result
// begin inline asm
{ cvt.rn.f16.f32 %rs425, %f64;}         // round-to-nearest f32 -> f16
// end inline asm
add.s64 %rd68, %rd67, %rd65;
st.global.u16 [%rd68], %rs425;          // store half result
// begin inline asm
{ cvt.rn.f16.f32 %rs426, %f65;}
// end inline asm
add.s64 %rd69, %rd68, %rd65;
st.global.u16 [%rd69], %rs426;          // store half result
// begin inline asm
{ cvt.rn.f16.f32 %rs427, %f66;}
// end inline asm
add.s64 %rd70, %rd69, %rd65;
st.global.u16 [%rd70], %rs427;          // store last half result
$L__BB0_45:                             // common exit reached by all control paths
ret;
}                                       // end of dequant_gemv_group64_batch8 kernel

// .globl _ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void> -- intentionally empty no-op kernel emitted by the
// CUB library (commonly launched only to force lazy CUDA context/stream
// initialization); it must remain a bare ret.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}