Spaces:
Build error
Build error
//------------------------------------------------------------------------------ | |
// This file is contains additional kernels for data conversion. | |
// These kernels are used when loading the model, so its performance is less | |
// important. | |
//------------------------------------------------------------------------------ | |
// Always use subgroup size of 32 on Intel. | |
// Always use subgroups size of 64 on Adreno. | |
// TODO: do not know how to choose subgroup size on other GPUs. | |
typedef char int8_t; | |
typedef uchar uint8_t; | |
typedef short int16_t; | |
typedef ushort uint16_t; | |
typedef int int32_t; | |
typedef uint uint32_t; | |
//------------------------------------------------------------------------------ | |
// block_q4_0 | |
//------------------------------------------------------------------------------ | |
struct block_q4_0 | |
{ | |
half d; | |
uint8_t qs[QK4_0 / 2]; | |
}; | |
//------------------------------------------------------------------------------ | |
// mul_vec_q_n_f32_flat_noshuffle | |
// | |
// This variation uses flat arrays (struct of arrays, SOA) representation for | |
// quant tensors. It also uses non shuffled bit order for weights. | |
// | |
// The shuffled version is kept in the original file because moving it here | |
// seems to result in worse performance for adreno. | |
//------------------------------------------------------------------------------ | |
kernel void kernel_convert_block_q4_0_noshuffle( | |
global struct block_q4_0 * src0, | |
global uchar * dst_q, | |
global half * dst_d | |
) { | |
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0); | |
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0); | |
global half * d = (global half *) dst_d + get_global_id(0); | |
*d = b->d; | |
for (int i = 0; i < QK4_0/4; ++i) { | |
uchar x0 = b->qs[2*i + 0]; | |
uchar x1 = b->qs[2*i + 1]; | |
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); | |
q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); | |
// Workaround for adreno - must have the following printf statement for | |
// the kernel to work properly. Otherwise it produces incorrect result. | |
// convert_uchar above also seems necessary. | |
// Compare against a large number so that it does not print anything. | |
// get_sub_group_local_id() also works. | |
if (get_global_id(0) == 65536*4096) { | |
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0)); | |
} | |
} | |
} | |