// _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group64_batch423DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<18>;
    .reg .b16 %rs<312>;
    .reg .f32 %f<500>;
    .reg .b32 %r<245>;
    .reg .b64 %rd<57>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[2560];

    ld.param.v2.u32 {%r28, %r29}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r30, %r31}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f41, %f42}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs75, %rs76, %rs77, %rs78}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd25, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd24, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd23, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd22, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r244, %tid.y;
    shl.b32 %r32, %r244, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r243, %r32, %r3;
    shl.b32 %r242, %r243, 1;
    setp.ge.u32 %p1, %r242, %r30;
    mov.f32 %f488, 0f00000000;
    mov.f32 %f489, %f488;
    mov.f32 %f490, %f488;
    mov.f32 %f491, %f488;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd1, %rd22;
    mul.lo.s32 %r8, %r30, %r1;
    shr.u32 %r9, %r3, 2;
    mul.lo.s32 %r10, %r31, %r1;
    shl.b16 %rs2, %rs75, 3;
    mul.wide.s32 %rd3, %r28, 2;
    cvta.to.global.u64 %rd4, %rd24;
    cvta.to.global.u64 %rd5, %rd23;
    cvta.to.global.u64 %rd6, %rd25;

$L__BB0_2:
    add.s32 %r33, %r242, %r8;
    mul.wide.u32 %rd26, %r33, 4;
    add.s64 %rd27, %rd5, %rd26;
    ld.global.v2.u32 {%r34, %r35}, [%rd27];
    shl.b32 %r36, %r244, 3;
    add.s32 %r16, %r36, %r9;
    add.s32 %r17, %r16, %r10;
    mul.wide.s32 %rd28, %r17, 2;
    add.s64 %rd29, %rd6, %rd28;
    ld.global.u16 %rs83, [%rd29];
    // begin inline asm
    { cvt.f32.f16 %f47, %rs83;}
    // end inline asm
    setp.eq.s64 %p2, %rd24, 0;
    mov.u16 %rs311, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r37, %r17, 31;
    add.s32 %r38, %r17, %r37;
    shr.s32 %r39, %r38, 1;
    cvt.s64.s32 %rd30, %r39;
    add.s64 %rd31, %rd4, %rd30;
    ld.global.u8 %r40, [%rd31];
    shl.b32 %r41, %r16, 2;
    and.b32 %r42, %r41, 4;
    shr.u32 %r43, %r40, %r42;
    cvt.u16.u32 %rs84, %r43;
    and.b16 %rs311, %rs84, 15;

$L__BB0_4:
    shl.b32 %r18, %r243, 4;
    setp.ge.s32 %p3, %r18, %r28;
    @%p3 bra $L__BB0_8;
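// [annotation, not compiler output] Loop body below: %r34/%r35 hold two packed
// 32-bit words (sixteen 4-bit weights, 16 columns per thread per iteration),
// %f47 the group's fp16 scale widened to f32, and %rs311 the 4-bit zero point
// (defaulting to the flag byte << 3 when the zero-point pointer is null).
// Each weight w is dequantized as %f47 * (w - zero) and multiplied against
// four fp16 activation rows, accumulating into %f488-%f491, one running sum
// per output of the batch-4 GEMV.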
    setp.eq.s16 %p4, %rs75, 0;
    shr.u16 %rs86, %rs311, 3;
    and.b16 %rs87, %rs86, 1;
    setp.eq.b16 %p5, %rs87, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs88, -16, 0, %p6;
    or.b16 %rs89, %rs88, %rs311;
    cvt.s16.s8 %rs90, %rs89;
    cvt.rn.f32.s16 %f6, %rs90;
    mul.wide.s32 %rd32, %r18, 2;
    add.s64 %rd7, %rd1, %rd32;
    ld.global.v4.u32 {%r44, %r45, %r46, %r47}, [%rd7];
    add.s64 %rd8, %rd7, %rd3;
    ld.global.v4.u32 {%r52, %r53, %r54, %r55}, [%rd8];
    add.s32 %r60, %r18, %r28;
    add.s32 %r61, %r60, %r28;
    mul.wide.s32 %rd34, %r61, 2;
    add.s64 %rd35, %rd1, %rd34;
    ld.global.v4.u32 {%r62, %r63, %r64, %r65}, [%rd35];
    add.s32 %r70, %r61, %r28;
    mul.wide.s32 %rd36, %r70, 2;
    add.s64 %rd37, %rd1, %rd36;
    ld.global.v4.u32 {%r71, %r72, %r73, %r74}, [%rd37];
    cvt.u16.u32 %rs5, %r34;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs13}, %r44;
    mov.b32 {%rs8, %rs14}, %r52;
    mov.b32 {%rs9, %rs15}, %r62;
    mov.b32 {%rs10, %rs16}, %r71;
    shr.u32 %r79, %r34, 4;
    cvt.u16.u32 %rs11, %r79;
    and.b16 %rs12, %rs11, 15;
    shr.u32 %r80, %r34, 8;
    cvt.u16.u32 %rs17, %r80;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs25}, %r45;
    mov.b32 {%rs20, %rs26}, %r53;
    mov.b32 {%rs21, %rs27}, %r63;
    mov.b32 {%rs22, %rs28}, %r72;
    shr.u32 %r81, %r34, 12;
    cvt.u16.u32 %rs23, %r81;
    and.b16 %rs24, %rs23, 15;
    shr.u32 %r82, %r34, 16;
    cvt.u16.u32 %rs29, %r82;
    and.b16 %rs30, %rs29, 15;
    mov.b32 {%rs31, %rs37}, %r46;
    mov.b32 {%rs32, %rs38}, %r54;
    mov.b32 {%rs33, %rs39}, %r64;
    mov.b32 {%rs34, %rs40}, %r73;
    shr.u32 %r83, %r34, 20;
    cvt.u16.u32 %rs35, %r83;
    and.b16 %rs36, %rs35, 15;
    shr.u32 %r84, %r34, 24;
    cvt.u16.u32 %rs41, %r84;
    and.b16 %rs42, %rs41, 15;
    mov.b32 {%rs43, %rs48}, %r47;
    mov.b32 {%rs44, %rs49}, %r55;
    mov.b32 {%rs45, %rs50}, %r65;
    mov.b32 {%rs46, %rs51}, %r74;
    shr.u32 %r85, %r34, 28;
    cvt.u16.u32 %rs47, %r85;
    cvt.u16.u32 %rs52, %r35;
    and.b16 %rs53, %rs52, 15;
    shr.u32 %r86, %r35, 4;
    cvt.u16.u32 %rs54, %r86;
    and.b16 %rs55, %rs54, 15;
    shr.u32 %r87, %r35, 8;
    cvt.u16.u32 %rs56, %r87;
    and.b16 %rs57, %rs56, 15;
    shr.u32 %r88, %r35, 12;
    cvt.u16.u32 %rs58, %r88;
    and.b16 %rs59, %rs58, 15;
    shr.u32 %r89, %r35, 16;
    cvt.u16.u32 %rs60, %r89;
    and.b16 %rs61, %rs60, 15;
    shr.u32 %r90, %r35, 20;
    cvt.u16.u32 %rs62, %r90;
    and.b16 %rs63, %rs62, 15;
    shr.u32 %r91, %r35, 24;
    cvt.u16.u32 %rs64, %r91;
    and.b16 %rs65, %rs64, 15;
    shr.u32 %r92, %r35, 28;
    cvt.u16.u32 %rs66, %r92;
    add.s64 %rd39, %rd8, %rd3;
    add.s64 %rd9, %rd39, 16;
    add.s64 %rd10, %rd9, %rd3;
    @%p4 bra $L__BB0_7;
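// [annotation] Fall-through path (flag byte at param+72 nonzero): nibbles are
// taken as unsigned 0..15. For each of the eight nibbles of %r34, the code
// forms %f47 * (nibble - zero) once and fuses it into four FMA chains, one
// per batch row (accumulators %f488-%f491).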
    cvt.rn.f32.s16 %f112, %rs6;
    sub.ftz.f32 %f113, %f112, %f6;
    mul.ftz.f32 %f114, %f47, %f113;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f115, %f114, %f48, %f491;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f116, %f114, %f49, %f490;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f117, %f114, %f50, %f489;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f118, %f114, %f51, %f488;
    cvt.rn.f32.s16 %f119, %rs12;
    sub.ftz.f32 %f120, %f119, %f6;
    mul.ftz.f32 %f121, %f47, %f120;
    // begin inline asm
    { cvt.f32.f16 %f52, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f122, %f121, %f52, %f115;
    // begin inline asm
    { cvt.f32.f16 %f53, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f123, %f121, %f53, %f116;
    // begin inline asm
    { cvt.f32.f16 %f54, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f124, %f121, %f54, %f117;
    // begin inline asm
    { cvt.f32.f16 %f55, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f121, %f55, %f118;
    cvt.rn.f32.s16 %f126, %rs18;
    sub.ftz.f32 %f127, %f126, %f6;
    mul.ftz.f32 %f128, %f47, %f127;
    // begin inline asm
    { cvt.f32.f16 %f56, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f128, %f56, %f122;
    // begin inline asm
    { cvt.f32.f16 %f57, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f130, %f128, %f57, %f123;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f131, %f128, %f58, %f124;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f132, %f128, %f59, %f125;
    cvt.rn.f32.s16 %f133, %rs24;
    sub.ftz.f32 %f134, %f133, %f6;
    mul.ftz.f32 %f135, %f47, %f134;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f136, %f135, %f60, %f129;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f135, %f61, %f130;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f138, %f135, %f62, %f131;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f135, %f63, %f132;
    cvt.rn.f32.s16 %f140, %rs30;
    sub.ftz.f32 %f141, %f140, %f6;
    mul.ftz.f32 %f142, %f47, %f141;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f143, %f142, %f64, %f136;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f144, %f142, %f65, %f137;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f145, %f142, %f66, %f138;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f146, %f142, %f67, %f139;
    cvt.rn.f32.s16 %f147, %rs36;
    sub.ftz.f32 %f148, %f147, %f6;
    mul.ftz.f32 %f149, %f47, %f148;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f150, %f149, %f68, %f143;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f151, %f149, %f69, %f144;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f152, %f149, %f70, %f145;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f149, %f71, %f146;
    cvt.rn.f32.s16 %f154, %rs42;
    sub.ftz.f32 %f155, %f154, %f6;
    mul.ftz.f32 %f156, %f47, %f155;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f157, %f156, %f72, %f150;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f158, %f156, %f73, %f151;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f159, %f156, %f74, %f152;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f156, %f75, %f153;
    cvt.rn.f32.s16 %f161, %rs47;
    sub.ftz.f32 %f162, %f161, %f6;
    mul.ftz.f32 %f163, %f47, %f162;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f163, %f76, %f157;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f165, %f163, %f77, %f158;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f166, %f163, %f78, %f159;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f167, %f163, %f79, %f160;
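// [annotation] Second packed word: the four 16-byte loads below fetch the next
// eight fp16 values of each batch row, and the eight nibbles of %r35 are
// accumulated the same way (columns 8-15 of this thread's tile).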
    ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd7+16];
    ld.global.v4.u32 {%r101, %r102, %r103, %r104}, [%rd8+16];
    ld.global.v4.u32 {%r109, %r110, %r111, %r112}, [%rd9];
    ld.global.v4.u32 {%r117, %r118, %r119, %r120}, [%rd10];
    cvt.rn.f32.s16 %f168, %rs53;
    sub.ftz.f32 %f169, %f168, %f6;
    mul.ftz.f32 %f170, %f47, %f169;
    mov.b32 {%rs123, %rs127}, %r93;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs123;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f170, %f80, %f164;
    mov.b32 {%rs124, %rs128}, %r101;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs124;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f170, %f81, %f165;
    mov.b32 {%rs125, %rs129}, %r109;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs125;}
    // end inline asm
    fma.rn.ftz.f32 %f173, %f170, %f82, %f166;
    mov.b32 {%rs126, %rs130}, %r117;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs126;}
    // end inline asm
    fma.rn.ftz.f32 %f174, %f170, %f83, %f167;
    cvt.rn.f32.s16 %f175, %rs55;
    sub.ftz.f32 %f176, %f175, %f6;
    mul.ftz.f32 %f177, %f47, %f176;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f178, %f177, %f84, %f171;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f177, %f85, %f172;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f177, %f86, %f173;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f177, %f87, %f174;
    cvt.rn.f32.s16 %f182, %rs57;
    sub.ftz.f32 %f183, %f182, %f6;
    mul.ftz.f32 %f184, %f47, %f183;
    mov.b32 {%rs131, %rs135}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs131;}
    // end inline asm
    fma.rn.ftz.f32 %f185, %f184, %f88, %f178;
    mov.b32 {%rs132, %rs136}, %r102;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs132;}
    // end inline asm
    fma.rn.ftz.f32 %f186, %f184, %f89, %f179;
    mov.b32 {%rs133, %rs137}, %r110;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs133;}
    // end inline asm
    fma.rn.ftz.f32 %f187, %f184, %f90, %f180;
    mov.b32 {%rs134, %rs138}, %r118;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs134;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f184, %f91, %f181;
    cvt.rn.f32.s16 %f189, %rs59;
    sub.ftz.f32 %f190, %f189, %f6;
    mul.ftz.f32 %f191, %f47, %f190;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs135;}
    // end inline asm
    fma.rn.ftz.f32 %f192, %f191, %f92, %f185;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs136;}
    // end inline asm
    fma.rn.ftz.f32 %f193, %f191, %f93, %f186;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs137;}
    // end inline asm
    fma.rn.ftz.f32 %f194, %f191, %f94, %f187;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs138;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f191, %f95, %f188;
    cvt.rn.f32.s16 %f196, %rs61;
    sub.ftz.f32 %f197, %f196, %f6;
    mul.ftz.f32 %f198, %f47, %f197;
    mov.b32 {%rs139, %rs143}, %r95;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f199, %f198, %f96, %f192;
    mov.b32 {%rs140, %rs144}, %r103;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f200, %f198, %f97, %f193;
    mov.b32 {%rs141, %rs145}, %r111;
    // begin inline asm
    { cvt.f32.f16 %f98, %rs141;}
    // end inline asm
    fma.rn.ftz.f32 %f201, %f198, %f98, %f194;
    mov.b32 {%rs142, %rs146}, %r119;
    // begin inline asm
    { cvt.f32.f16 %f99, %rs142;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f198, %f99, %f195;
    cvt.rn.f32.s16 %f203, %rs63;
    sub.ftz.f32 %f204, %f203, %f6;
    mul.ftz.f32 %f205, %f47, %f204;
    // begin inline asm
    { cvt.f32.f16 %f100, %rs143;}
    // end inline asm
    fma.rn.ftz.f32 %f206, %f205, %f100, %f199;
    // begin inline asm
    { cvt.f32.f16 %f101, %rs144;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f205, %f101, %f200;
    // begin inline asm
    { cvt.f32.f16 %f102, %rs145;}
    // end inline asm
    fma.rn.ftz.f32 %f208, %f205, %f102, %f201;
    // begin inline asm
    { cvt.f32.f16 %f103, %rs146;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f205, %f103, %f202;
    cvt.rn.f32.s16 %f210, %rs65;
    sub.ftz.f32 %f211, %f210, %f6;
    mul.ftz.f32 %f212, %f47, %f211;
    mov.b32 {%rs147, %rs151}, %r96;
    // begin inline asm
    { cvt.f32.f16 %f104, %rs147;}
    // end inline asm
    fma.rn.ftz.f32 %f213, %f212, %f104, %f206;
    mov.b32 {%rs148, %rs152}, %r104;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs148;}
    // end inline asm
    fma.rn.ftz.f32 %f214, %f212, %f105, %f207;
    mov.b32 {%rs149, %rs153}, %r112;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs149;}
    // end inline asm
    fma.rn.ftz.f32 %f215, %f212, %f106, %f208;
    mov.b32 {%rs150, %rs154}, %r120;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs150;}
    // end inline asm
    fma.rn.ftz.f32 %f216, %f212, %f107, %f209;
    cvt.rn.f32.s16 %f217, %rs66;
    sub.ftz.f32 %f218, %f217, %f6;
    mul.ftz.f32 %f219, %f47, %f218;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs151;}
    // end inline asm
    fma.rn.ftz.f32 %f491, %f219, %f108, %f213;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs152;}
    // end inline asm
    fma.rn.ftz.f32 %f490, %f219, %f109, %f214;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs153;}
    // end inline asm
    fma.rn.ftz.f32 %f489, %f219, %f110, %f215;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs154;}
    // end inline asm
    fma.rn.ftz.f32 %f488, %f219, %f111, %f216;
    bra.uni $L__BB0_8;
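// [annotation] $L__BB0_7 below is the signed variant (flag byte == 0): each
// nibble is sign-extended to -8..7 by shifting it into bits 4-7, sign-extending
// the byte (cvt.s16.s8), propagating the sign with an arithmetic shift, and
// OR-ing the resulting 0xFFF0 mask back onto the nibble. The dequantize/FMA
// structure is otherwise identical to the unsigned path above.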
$L__BB0_7:
    shl.b16 %rs219, %rs5, 4;
    cvt.s16.s8 %rs220, %rs219;
    shr.s16 %rs221, %rs220, 7;
    and.b16 %rs222, %rs221, -16;
    or.b16 %rs223, %rs222, %rs6;
    cvt.rn.f32.s16 %f284, %rs223;
    sub.ftz.f32 %f285, %f284, %f6;
    mul.ftz.f32 %f286, %f47, %f285;
    // begin inline asm
    { cvt.f32.f16 %f220, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f287, %f286, %f220, %f491;
    // begin inline asm
    { cvt.f32.f16 %f221, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f288, %f286, %f221, %f490;
    // begin inline asm
    { cvt.f32.f16 %f222, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f289, %f286, %f222, %f489;
    // begin inline asm
    { cvt.f32.f16 %f223, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f290, %f286, %f223, %f488;
    shl.b16 %rs224, %rs11, 4;
    cvt.s16.s8 %rs225, %rs224;
    shr.s16 %rs226, %rs225, 7;
    and.b16 %rs227, %rs226, -16;
    or.b16 %rs228, %rs227, %rs12;
    cvt.rn.f32.s16 %f291, %rs228;
    sub.ftz.f32 %f292, %f291, %f6;
    mul.ftz.f32 %f293, %f47, %f292;
    // begin inline asm
    { cvt.f32.f16 %f224, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f294, %f293, %f224, %f287;
    // begin inline asm
    { cvt.f32.f16 %f225, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f295, %f293, %f225, %f288;
    // begin inline asm
    { cvt.f32.f16 %f226, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f296, %f293, %f226, %f289;
    // begin inline asm
    { cvt.f32.f16 %f227, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f297, %f293, %f227, %f290;
    shl.b16 %rs229, %rs17, 4;
    cvt.s16.s8 %rs230, %rs229;
    shr.s16 %rs231, %rs230, 7;
    and.b16 %rs232, %rs231, -16;
    or.b16 %rs233, %rs232, %rs18;
    cvt.rn.f32.s16 %f298, %rs233;
    sub.ftz.f32 %f299, %f298, %f6;
    mul.ftz.f32 %f300, %f47, %f299;
    // begin inline asm
    { cvt.f32.f16 %f228, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f300, %f228, %f294;
    // begin inline asm
    { cvt.f32.f16 %f229, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f302, %f300, %f229, %f295;
    // begin inline asm
    { cvt.f32.f16 %f230, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f303, %f300, %f230, %f296;
    // begin inline asm
    { cvt.f32.f16 %f231, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f304, %f300, %f231, %f297;
    shl.b16 %rs234, %rs23, 4;
    cvt.s16.s8 %rs235, %rs234;
    shr.s16 %rs236, %rs235, 7;
    and.b16 %rs237, %rs236, -16;
    or.b16 %rs238, %rs237, %rs24;
    cvt.rn.f32.s16 %f305, %rs238;
    sub.ftz.f32 %f306, %f305, %f6;
    mul.ftz.f32 %f307, %f47, %f306;
    // begin inline asm
    { cvt.f32.f16 %f232, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f308, %f307, %f232, %f301;
    // begin inline asm
    { cvt.f32.f16 %f233, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f309, %f307, %f233, %f302;
    // begin inline asm
    { cvt.f32.f16 %f234, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f310, %f307, %f234, %f303;
    // begin inline asm
    { cvt.f32.f16 %f235, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f311, %f307, %f235, %f304;
    shl.b16 %rs239, %rs29, 4;
    cvt.s16.s8 %rs240, %rs239;
    shr.s16 %rs241, %rs240, 7;
    and.b16 %rs242, %rs241, -16;
    or.b16 %rs243, %rs242, %rs30;
    cvt.rn.f32.s16 %f312, %rs243;
    sub.ftz.f32 %f313, %f312, %f6;
    mul.ftz.f32 %f314, %f47, %f313;
    // begin inline asm
    { cvt.f32.f16 %f236, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f315, %f314, %f236, %f308;
    // begin inline asm
    { cvt.f32.f16 %f237, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f316, %f314, %f237, %f309;
    // begin inline asm
    { cvt.f32.f16 %f238, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f317, %f314, %f238, %f310;
    // begin inline asm
    { cvt.f32.f16 %f239, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f314, %f239, %f311;
    shl.b16 %rs244, %rs35, 4;
    cvt.s16.s8 %rs245, %rs244;
    shr.s16 %rs246, %rs245, 7;
    and.b16 %rs247, %rs246, -16;
    or.b16 %rs248, %rs247, %rs36;
    cvt.rn.f32.s16 %f319, %rs248;
    sub.ftz.f32 %f320, %f319, %f6;
    mul.ftz.f32 %f321, %f47, %f320;
    // begin inline asm
    { cvt.f32.f16 %f240, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f322, %f321, %f240, %f315;
    // begin inline asm
    { cvt.f32.f16 %f241, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f323, %f321, %f241, %f316;
    // begin inline asm
    { cvt.f32.f16 %f242, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f324, %f321, %f242, %f317;
    // begin inline asm
    { cvt.f32.f16 %f243, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f325, %f321, %f243, %f318;
    shl.b16 %rs249, %rs41, 4;
    cvt.s16.s8 %rs250, %rs249;
    shr.s16 %rs251, %rs250, 7;
    and.b16 %rs252, %rs251, -16;
    or.b16 %rs253, %rs252, %rs42;
    cvt.rn.f32.s16 %f326, %rs253;
    sub.ftz.f32 %f327, %f326, %f6;
    mul.ftz.f32 %f328, %f47, %f327;
    // begin inline asm
    { cvt.f32.f16 %f244, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f329, %f328, %f244, %f322;
    // begin inline asm
    { cvt.f32.f16 %f245, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f330, %f328, %f245, %f323;
    // begin inline asm
    { cvt.f32.f16 %f246, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f331, %f328, %f246, %f324;
    // begin inline asm
    { cvt.f32.f16 %f247, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f332, %f328, %f247, %f325;
    shl.b16 %rs254, %rs47, 4;
    cvt.s16.s8 %rs255, %rs254;
    shr.s16 %rs256, %rs255, 7;
    and.b16 %rs257, %rs256, -16;
    or.b16 %rs258, %rs257, %rs47;
    cvt.rn.f32.s16 %f333, %rs258;
    sub.ftz.f32 %f334, %f333, %f6;
    mul.ftz.f32 %f335, %f47, %f334;
    // begin inline asm
    { cvt.f32.f16 %f248, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f336, %f335, %f248, %f329;
    // begin inline asm
    { cvt.f32.f16 %f249, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f337, %f335, %f249, %f330;
    // begin inline asm
    { cvt.f32.f16 %f250, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f338, %f335, %f250, %f331;
    // begin inline asm
    { cvt.f32.f16 %f251, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f339, %f335, %f251, %f332;
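// [annotation] Second packed word of the signed path (columns 8-15). Note the
// last group above ORs the mask onto %rs47 directly; that appears safe because
// %rs47 came from a 28-bit right shift and has only its low 4 bits set.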
    ld.global.v4.u32 {%r125, %r126, %r127, %r128}, [%rd7+16];
    ld.global.v4.u32 {%r133, %r134, %r135, %r136}, [%rd8+16];
    ld.global.v4.u32 {%r141, %r142, %r143, %r144}, [%rd9];
    ld.global.v4.u32 {%r149, %r150, %r151, %r152}, [%rd10];
    shl.b16 %rs259, %rs52, 4;
    cvt.s16.s8 %rs260, %rs259;
    shr.s16 %rs261, %rs260, 7;
    and.b16 %rs262, %rs261, -16;
    or.b16 %rs263, %rs262, %rs53;
    cvt.rn.f32.s16 %f340, %rs263;
    sub.ftz.f32 %f341, %f340, %f6;
    mul.ftz.f32 %f342, %f47, %f341;
    mov.b32 {%rs187, %rs191}, %r125;
    // begin inline asm
    { cvt.f32.f16 %f252, %rs187;}
    // end inline asm
    fma.rn.ftz.f32 %f343, %f342, %f252, %f336;
    mov.b32 {%rs188, %rs192}, %r133;
    // begin inline asm
    { cvt.f32.f16 %f253, %rs188;}
    // end inline asm
    fma.rn.ftz.f32 %f344, %f342, %f253, %f337;
    mov.b32 {%rs189, %rs193}, %r141;
    // begin inline asm
    { cvt.f32.f16 %f254, %rs189;}
    // end inline asm
    fma.rn.ftz.f32 %f345, %f342, %f254, %f338;
    mov.b32 {%rs190, %rs194}, %r149;
    // begin inline asm
    { cvt.f32.f16 %f255, %rs190;}
    // end inline asm
    fma.rn.ftz.f32 %f346, %f342, %f255, %f339;
    shl.b16 %rs264, %rs54, 4;
    cvt.s16.s8 %rs265, %rs264;
    shr.s16 %rs266, %rs265, 7;
    and.b16 %rs267, %rs266, -16;
    or.b16 %rs268, %rs267, %rs55;
    cvt.rn.f32.s16 %f347, %rs268;
    sub.ftz.f32 %f348, %f347, %f6;
    mul.ftz.f32 %f349, %f47, %f348;
    // begin inline asm
    { cvt.f32.f16 %f256, %rs191;}
    // end inline asm
    fma.rn.ftz.f32 %f350, %f349, %f256, %f343;
    // begin inline asm
    { cvt.f32.f16 %f257, %rs192;}
    // end inline asm
    fma.rn.ftz.f32 %f351, %f349, %f257, %f344;
    // begin inline asm
    { cvt.f32.f16 %f258, %rs193;}
    // end inline asm
    fma.rn.ftz.f32 %f352, %f349, %f258, %f345;
    // begin inline asm
    { cvt.f32.f16 %f259, %rs194;}
    // end inline asm
    fma.rn.ftz.f32 %f353, %f349, %f259, %f346;
    shl.b16 %rs269, %rs56, 4;
    cvt.s16.s8 %rs270, %rs269;
    shr.s16 %rs271, %rs270, 7;
    and.b16 %rs272, %rs271, -16;
    or.b16 %rs273, %rs272, %rs57;
    cvt.rn.f32.s16 %f354, %rs273;
    sub.ftz.f32 %f355, %f354, %f6;
    mul.ftz.f32 %f356, %f47, %f355;
    mov.b32 {%rs195, %rs199}, %r126;
    // begin inline asm
    { cvt.f32.f16 %f260, %rs195;}
    // end inline asm
    fma.rn.ftz.f32 %f357, %f356, %f260, %f350;
    mov.b32 {%rs196, %rs200}, %r134;
    // begin inline asm
    { cvt.f32.f16 %f261, %rs196;}
    // end inline asm
    fma.rn.ftz.f32 %f358, %f356, %f261, %f351;
    mov.b32 {%rs197, %rs201}, %r142;
    // begin inline asm
    { cvt.f32.f16 %f262, %rs197;}
    // end inline asm
    fma.rn.ftz.f32 %f359, %f356, %f262, %f352;
    mov.b32 {%rs198, %rs202}, %r150;
    // begin inline asm
    { cvt.f32.f16 %f263, %rs198;}
    // end inline asm
    fma.rn.ftz.f32 %f360, %f356, %f263, %f353;
    shl.b16 %rs274, %rs58, 4;
    cvt.s16.s8 %rs275, %rs274;
    shr.s16 %rs276, %rs275, 7;
    and.b16 %rs277, %rs276, -16;
    or.b16 %rs278, %rs277, %rs59;
    cvt.rn.f32.s16 %f361, %rs278;
    sub.ftz.f32 %f362, %f361, %f6;
    mul.ftz.f32 %f363, %f47, %f362;
    // begin inline asm
    { cvt.f32.f16 %f264, %rs199;}
    // end inline asm
    fma.rn.ftz.f32 %f364, %f363, %f264, %f357;
    // begin inline asm
    { cvt.f32.f16 %f265, %rs200;}
    // end inline asm
    fma.rn.ftz.f32 %f365, %f363, %f265, %f358;
    // begin inline asm
    { cvt.f32.f16 %f266, %rs201;}
    // end inline asm
    fma.rn.ftz.f32 %f366, %f363, %f266, %f359;
    // begin inline asm
    { cvt.f32.f16 %f267, %rs202;}
    // end inline asm
    fma.rn.ftz.f32 %f367, %f363, %f267, %f360;
    shl.b16 %rs279, %rs60, 4;
    cvt.s16.s8 %rs280, %rs279;
    shr.s16 %rs281, %rs280, 7;
    and.b16 %rs282, %rs281, -16;
    or.b16 %rs283, %rs282, %rs61;
    cvt.rn.f32.s16 %f368, %rs283;
    sub.ftz.f32 %f369, %f368, %f6;
    mul.ftz.f32 %f370, %f47, %f369;
    mov.b32 {%rs203, %rs207}, %r127;
    // begin inline asm
    { cvt.f32.f16 %f268, %rs203;}
    // end inline asm
    fma.rn.ftz.f32 %f371, %f370, %f268, %f364;
    mov.b32 {%rs204, %rs208}, %r135;
    // begin inline asm
    { cvt.f32.f16 %f269, %rs204;}
    // end inline asm
    fma.rn.ftz.f32 %f372, %f370, %f269, %f365;
    mov.b32 {%rs205, %rs209}, %r143;
    // begin inline asm
    { cvt.f32.f16 %f270, %rs205;}
    // end inline asm
    fma.rn.ftz.f32 %f373, %f370, %f270, %f366;
    mov.b32 {%rs206, %rs210}, %r151;
    // begin inline asm
    { cvt.f32.f16 %f271, %rs206;}
    // end inline asm
    fma.rn.ftz.f32 %f374, %f370, %f271, %f367;
    shl.b16 %rs284, %rs62, 4;
    cvt.s16.s8 %rs285, %rs284;
    shr.s16 %rs286, %rs285, 7;
    and.b16 %rs287, %rs286, -16;
    or.b16 %rs288, %rs287, %rs63;
    cvt.rn.f32.s16 %f375, %rs288;
    sub.ftz.f32 %f376, %f375, %f6;
    mul.ftz.f32 %f377, %f47, %f376;
    // begin inline asm
    { cvt.f32.f16 %f272, %rs207;}
    // end inline asm
    fma.rn.ftz.f32 %f378, %f377, %f272, %f371;
    // begin inline asm
    { cvt.f32.f16 %f273, %rs208;}
    // end inline asm
    fma.rn.ftz.f32 %f379, %f377, %f273, %f372;
    // begin inline asm
    { cvt.f32.f16 %f274, %rs209;}
    // end inline asm
    fma.rn.ftz.f32 %f380, %f377, %f274, %f373;
    // begin inline asm
    { cvt.f32.f16 %f275, %rs210;}
    // end inline asm
    fma.rn.ftz.f32 %f381, %f377, %f275, %f374;
    shl.b16 %rs289, %rs64, 4;
    cvt.s16.s8 %rs290, %rs289;
    shr.s16 %rs291, %rs290, 7;
    and.b16 %rs292, %rs291, -16;
    or.b16 %rs293, %rs292, %rs65;
    cvt.rn.f32.s16 %f382, %rs293;
    sub.ftz.f32 %f383, %f382, %f6;
    mul.ftz.f32 %f384, %f47, %f383;
    mov.b32 {%rs211, %rs215}, %r128;
    // begin inline asm
    { cvt.f32.f16 %f276, %rs211;}
    // end inline asm
    fma.rn.ftz.f32 %f385, %f384, %f276, %f378;
    mov.b32 {%rs212, %rs216}, %r136;
    // begin inline asm
    { cvt.f32.f16 %f277, %rs212;}
    // end inline asm
    fma.rn.ftz.f32 %f386, %f384, %f277, %f379;
    mov.b32 {%rs213, %rs217}, %r144;
    // begin inline asm
    { cvt.f32.f16 %f278, %rs213;}
    // end inline asm
    fma.rn.ftz.f32 %f387, %f384, %f278, %f380;
    mov.b32 {%rs214, %rs218}, %r152;
    // begin inline asm
    { cvt.f32.f16 %f279, %rs214;}
    // end inline asm
    fma.rn.ftz.f32 %f388, %f384, %f279, %f381;
    shl.b16 %rs294, %rs66, 4;
    cvt.s16.s8 %rs295, %rs294;
    shr.s16 %rs296, %rs295, 7;
    and.b16 %rs297, %rs296, -16;
    or.b16 %rs298, %rs297, %rs66;
    cvt.rn.f32.s16 %f389, %rs298;
    sub.ftz.f32 %f390, %f389, %f6;
    mul.ftz.f32 %f391, %f47, %f390;
    // begin inline asm
    { cvt.f32.f16 %f280, %rs215;}
    // end inline asm
    fma.rn.ftz.f32 %f491, %f391, %f280, %f385;
    // begin inline asm
    { cvt.f32.f16 %f281, %rs216;}
    // end inline asm
    fma.rn.ftz.f32 %f490, %f391, %f281, %f386;
    // begin inline asm
    { cvt.f32.f16 %f282, %rs217;}
    // end inline asm
    fma.rn.ftz.f32 %f489, %f391, %f282, %f387;
    // begin inline asm
    { cvt.f32.f16 %f283, %rs218;}
    // end inline asm
    fma.rn.ftz.f32 %f488, %f391, %f283, %f388;
$L__BB0_8:
    add.s32 %r244, %r244, 4;
    shl.b32 %r157, %r244, 5;
    add.s32 %r243, %r157, %r3;
    shl.b32 %r242, %r243, 1;
    setp.lt.u32 %p7, %r242, %r30;
    @%p7 bra $L__BB0_2;
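// [annotation] Block-level reduction. Each of the four accumulators is reduced
// in turn through the 2560-byte shared buffer (four 640-byte slices): threads
// with linear id >= 32 stage their partial sums, then after bar.sync the first
// warp adds three staged partials to its own and folds the warp with five
// shfl.sync.down steps (offsets 1, 2, 4, 8, 16) so lane 0 holds the total.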
$L__BB0_9:
    mov.u32 %r238, %tid.y;
    shl.b32 %r237, %r238, 5;
    add.s32 %r236, %r237, %r3;
    shl.b32 %r158, %r236, 2;
    mov.u32 %r159, _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r160, %r159, %r158;
    setp.lt.u32 %p8, %r236, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r230, %r160, -112;
    st.shared.f32 [%r230], %f491;

$L__BB0_11:
    mov.u32 %r241, %tid.y;
    shl.b32 %r240, %r241, 5;
    add.s32 %r239, %r240, %r3;
    setp.gt.u32 %p9, %r239, 31;
    bar.sync 0;
    mad.lo.s32 %r23, %r239, 12, %r159;
    @%p9 bra $L__BB0_13;

    mov.u32 %r175, 16;
    ld.shared.f32 %f407, [%r23+16];
    add.ftz.f32 %f408, %f491, %f407;
    ld.shared.f32 %f409, [%r23+20];
    add.ftz.f32 %f410, %f408, %f409;
    ld.shared.f32 %f411, [%r23+24];
    add.ftz.f32 %f394, %f410, %f411;
    mov.u32 %r163, 1;
    mov.u32 %r176, 31;
    mov.u32 %r177, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f394, %r163, %r176, %r177; @p add.f32 r0, r0, %f394; mov.f32 %f392, r0;}
    // end inline asm
    mov.u32 %r166, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f392, %r166, %r176, %r177; @p add.f32 r0, r0, %f392; mov.f32 %f395, r0;}
    // end inline asm
    mov.u32 %r169, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f395, %r169, %r176, %r177; @p add.f32 r0, r0, %f395; mov.f32 %f398, r0;}
    // end inline asm
    mov.u32 %r172, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f398, %r172, %r176, %r177; @p add.f32 r0, r0, %f398; mov.f32 %f401, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f401, %r175, %r176, %r177; @p add.f32 r0, r0, %f401; mov.f32 %f491, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r231, %r160, -112;
    st.shared.f32 [%r231+640], %f490;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f427, [%r23+656];
    add.ftz.f32 %f428, %f490, %f427;
    ld.shared.f32 %f429, [%r23+660];
    add.ftz.f32 %f430, %f428, %f429;
    ld.shared.f32 %f431, [%r23+664];
    add.ftz.f32 %f414, %f430, %f431;
    mov.u32 %r179, 1;
    mov.u32 %r192, 31;
    mov.u32 %r193, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f414, %r179, %r192, %r193; @p add.f32 r0, r0, %f414; mov.f32 %f412, r0;}
    // end inline asm
    mov.u32 %r182, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f412, %r182, %r192, %r193; @p add.f32 r0, r0, %f412; mov.f32 %f415, r0;}
    // end inline asm
    mov.u32 %r185, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f415, %r185, %r192, %r193; @p add.f32 r0, r0, %f415; mov.f32 %f418, r0;}
    // end inline asm
    mov.u32 %r188, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f418, %r188, %r192, %r193; @p add.f32 r0, r0, %f418; mov.f32 %f421, r0;}
    // end inline asm
    mov.u32 %r191, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f421, %r191, %r192, %r193; @p add.f32 r0, r0, %f421; mov.f32 %f490, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r232, %r160, -112;
    st.shared.f32 [%r232+1280], %f489;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f447, [%r23+1296];
    add.ftz.f32 %f448, %f489, %f447;
    ld.shared.f32 %f449, [%r23+1300];
    add.ftz.f32 %f450, %f448, %f449;
    ld.shared.f32 %f451, [%r23+1304];
    add.ftz.f32 %f434, %f450, %f451;
    mov.u32 %r195, 1;
    mov.u32 %r208, 31;
    mov.u32 %r209, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f434, %r195, %r208, %r209; @p add.f32 r0, r0, %f434; mov.f32 %f432, r0;}
    // end inline asm
    mov.u32 %r198, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f432, %r198, %r208, %r209; @p add.f32 r0, r0, %f432; mov.f32 %f435, r0;}
    // end inline asm
    mov.u32 %r201, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f435, %r201, %r208, %r209; @p add.f32 r0, r0, %f435; mov.f32 %f438, r0;}
    // end inline asm
    mov.u32 %r204, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f438, %r204, %r208, %r209; @p add.f32 r0, r0, %f438; mov.f32 %f441, r0;}
    // end inline asm
    mov.u32 %r207, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f441, %r207, %r208, %r209; @p add.f32 r0, r0, %f441; mov.f32 %f489, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r233, %r160, -112;
    st.shared.f32 [%r233+1920], %f488;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f467, [%r23+1936];
    add.ftz.f32 %f468, %f488, %f467;
    ld.shared.f32 %f469, [%r23+1940];
    add.ftz.f32 %f470, %f468, %f469;
    ld.shared.f32 %f471, [%r23+1944];
    add.ftz.f32 %f454, %f470, %f471;
    mov.u32 %r211, 1;
    mov.u32 %r224, 31;
    mov.u32 %r225, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f454, %r211, %r224, %r225; @p add.f32 r0, r0, %f454; mov.f32 %f452, r0;}
    // end inline asm
    mov.u32 %r214, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f452, %r214, %r224, %r225; @p add.f32 r0, r0, %f452; mov.f32 %f455, r0;}
    // end inline asm
    mov.u32 %r217, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f455, %r217, %r224, %r225; @p add.f32 r0, r0, %f455; mov.f32 %f458, r0;}
    // end inline asm
    mov.u32 %r220, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f458, %r220, %r224, %r225; @p add.f32 r0, r0, %f458; mov.f32 %f461, r0;}
    // end inline asm
    mov.u32 %r223, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f461, %r223, %r224, %r225; @p add.f32 r0, r0, %f461; mov.f32 %f488, r0;}
    // end inline asm
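// [annotation] Epilogue: thread (0,0) scales each reduced sum by %f41 and
// writes four fp16 results, one per batch row (row stride %r29). When the
// pointer at param+8 is non-null, the fp16 value it points to is read, scaled
// by %f42, and added first (out = %f41 * acc + %f42 * in, plausibly an
// alpha/beta or fused residual-add epilogue); otherwise the scaled sums are
// stored directly.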
$L__BB0_25:
    mov.u32 %r234, %tid.y;
    or.b32 %r226, %r3, %r234;
    setp.ne.s32 %p16, %r226, 0;
    @%p16 bra $L__BB0_29;

    ld.param.u64 %rd55, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd54, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0];
    mov.u32 %r235, %ctaid.x;
    cvta.to.global.u64 %rd40, %rd54;
    setp.eq.s64 %p17, %rd55, 0;
    mul.ftz.f32 %f31, %f41, %f491;
    cvt.s64.s32 %rd12, %r235;
    mul.wide.s32 %rd41, %r235, 2;
    add.s64 %rd13, %rd40, %rd41;
    mul.ftz.f32 %f32, %f41, %f490;
    cvt.s64.s32 %rd14, %r29;
    mul.wide.s32 %rd42, %r29, 2;
    add.s64 %rd15, %rd13, %rd42;
    mul.ftz.f32 %f33, %f41, %f489;
    add.s32 %r227, %r29, %r235;
    add.s32 %r228, %r227, %r29;
    cvt.s64.s32 %rd16, %r228;
    mul.wide.s32 %rd43, %r228, 2;
    add.s64 %rd17, %rd40, %rd43;
    mul.ftz.f32 %f34, %f41, %f488;
    add.s32 %r229, %r228, %r29;
    cvt.s64.s32 %rd18, %r229;
    mul.wide.s32 %rd44, %r229, 2;
    add.s64 %rd19, %rd40, %rd44;
    @%p17 bra $L__BB0_28;

    ld.param.u64 %rd56, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd45, %rd56;
    shl.b64 %rd46, %rd12, 1;
    add.s64 %rd47, %rd45, %rd46;
    ld.global.u16 %rs299, [%rd47];
    // begin inline asm
    { cvt.f32.f16 %f472, %rs299;}
    // end inline asm
    fma.rn.ftz.f32 %f473, %f42, %f472, %f31;
    // begin inline asm
    { cvt.rn.f16.f32 %rs300, %f473;}
    // end inline asm
    st.global.u16 [%rd13], %rs300;
    shl.b64 %rd48, %rd14, 1;
    add.s64 %rd49, %rd47, %rd48;
    ld.global.u16 %rs301, [%rd49];
    // begin inline asm
    { cvt.f32.f16 %f474, %rs301;}
    // end inline asm
    fma.rn.ftz.f32 %f475, %f42, %f474, %f32;
    // begin inline asm
    { cvt.rn.f16.f32 %rs302, %f475;}
    // end inline asm
    st.global.u16 [%rd15], %rs302;
    shl.b64 %rd50, %rd16, 1;
    add.s64 %rd51, %rd45, %rd50;
    ld.global.u16 %rs303, [%rd51];
    // begin inline asm
    { cvt.f32.f16 %f476, %rs303;}
    // end inline asm
    fma.rn.ftz.f32 %f477, %f42, %f476, %f33;
    // begin inline asm
    { cvt.rn.f16.f32 %rs304, %f477;}
    // end inline asm
    st.global.u16 [%rd17], %rs304;
    shl.b64 %rd52, %rd18, 1;
    add.s64 %rd53, %rd45, %rd52;
    ld.global.u16 %rs305, [%rd53];
    // begin inline asm
    { cvt.f32.f16 %f478, %rs305;}
    // end inline asm
    fma.rn.ftz.f32 %f479, %f42, %f478, %f34;
    // begin inline asm
    { cvt.rn.f16.f32 %rs306, %f479;}
    // end inline asm
    st.global.u16 [%rd19], %rs306;
    bra.uni $L__BB0_29;

$L__BB0_28:
    // begin inline asm
    { cvt.rn.f16.f32 %rs307, %f31;}
    // end inline asm
    st.global.u16 [%rd13], %rs307;
    // begin inline asm
    { cvt.rn.f16.f32 %rs308, %f32;}
    // end inline asm
    st.global.u16 [%rd15], %rs308;
    // begin inline asm
    { cvt.rn.f16.f32 %rs309, %f33;}
    // end inline asm
    st.global.u16 [%rd17], %rs309;
    // begin inline asm
    { cvt.rn.f16.f32 %rs310, %f34;}
    // end inline asm
    st.global.u16 [%rd19], %rs310;

$L__BB0_29:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}