Spaces:

YZ-TAN
/

flask-llama

Build error

App Files Files Community

flask-llama / llama.cpp /ggml /src /ggml-opencl /kernels /ggml-opencl_cvt.cl

YZ-TAN

Upload 2821 files

5a29263 verified 5 months ago

raw

history blame

4.15 kB

	//------------------------------------------------------------------------------
	// This file contains additional kernels for data conversion.
	// These kernels are used when loading the model, so their performance is less
	// important.
	//------------------------------------------------------------------------------
	// Enable half-precision (fp16) support; this file declares and copies `half`
	// values. The Khronos extension is preferred, the AMD one is the fallback,
	// and compilation is aborted with a readable message when neither macro is
	// defined by the OpenCL compiler.
	#ifdef cl_khr_fp16
	#pragma OPENCL EXTENSION cl_khr_fp16 : enable
	#elif defined(cl_amd_fp16)
	#pragma OPENCL EXTENSION cl_amd_fp16 : enable
	#else
	#error "Half precision floating point not supported by OpenCL implementation on your device."
	#endif

	// Enable a subgroup extension: the Khronos one when its macro is defined,
	// otherwise the Intel one. If neither macro is defined the build is stopped
	// by the #error below.
	#ifdef cl_khr_subgroups
	#pragma OPENCL EXTENSION cl_khr_subgroups : enable
	#elif defined(cl_intel_subgroups)
	#pragma OPENCL EXTENSION cl_intel_subgroups : enable
	#else
	#error "Subgroup not supported on your device."
	#endif

	// Vendor detection and "required subgroup size" kernel attributes.
	//   - Intel : defines INTEL_GPU and REQD_SUBGROUP_SIZE_16 / _32.
	//   - Adreno: defines ADRENO_GPU and REQD_SUBGROUP_SIZE_64 / _128.
	//   - Any other device: compilation fails with the #error below.
	// ADRENO_GPU is tested later in this file; the REQD_SUBGROUP_SIZE_* macros
	// are not referenced in the visible part of the file.
	#ifdef cl_intel_required_subgroup_size
	// Always use subgroup size of 32 on Intel.
	// NOTE(review): a 16-wide attribute macro is defined as well — confirm
	// whether "always 32" still holds.
	#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
	#define INTEL_GPU 1
	#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
	#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
	#elif defined(cl_qcom_reqd_sub_group_size)
	// Always use subgroups size of 64 on Adreno.
	// NOTE(review): the macro names suggest "half" maps to 64 and "full" to 128
	// lanes — presumably device dependent; confirm against Qualcomm docs.
	#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
	#define ADRENO_GPU 1
	#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
	#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
	#else
	// TODO: do not know how to choose subgroup size on other GPUs.
	#error "Selecting subgroup size is not supported on your device."
	#endif

	// Quantization constants.
	// Only QK4_0 is referenced in the visible part of this file: block_q4_0
	// stores QK4_0/2 bytes, i.e. QK4_0 4-bit values per block.
	// The remaining QK*/QR* macros are not used in the visible part of the file;
	// presumably they mirror the host-side ggml definitions — TODO confirm.
	#define QK4_0 32
	#define QR4_0 2
	#define QK4_1 32
	#define QR4_1 2
	#define QK5_0 32
	#define QR5_0 2
	#define QK5_1 32
	#define QR5_1 2
	#define QK8_0 32
	#define QR8_0 1
	#define QK_K 256
	#define K_QUANTS_PER_ITERATION 2

	// <stdint.h>-style names mapped onto the OpenCL C built-in scalar types, so
	// that declarations such as `uint8_t qs[...]` below can be written.
	typedef char int8_t;
	typedef uchar uint8_t;
	typedef short int16_t;
	typedef ushort uint16_t;
	typedef int int32_t;
	typedef uint uint32_t;

	//------------------------------------------------------------------------------
	// block_q4_0
	//------------------------------------------------------------------------------
	// One quantized block as read by the conversion kernel in this file
	// (array-of-structs input layout).
	struct block_q4_0
	{
	// Per-block half value; presumably the scale/delta — confirm against ggml.
	half d;
	// QK4_0/2 bytes, each holding two 4-bit values (low and high nibble).
	uint8_t qs[QK4_0 / 2];
	};

	//------------------------------------------------------------------------------
	// kernel_convert_block_q4_0_noshuffle
	//
	// This variation uses flat arrays (struct of arrays, SOA) representation for
	// quant tensors. It also uses non shuffled bit order for weights.
	//
	// The shuffled version is kept in the original file because moving it here
	// seems to result in worse performance for adreno.
	//------------------------------------------------------------------------------

	// Repack one block_q4_0 per work-item from the array-of-structs input into
	// two flat output buffers.
	//
	// Work-item g = get_global_id(0) reads src0[g] and writes:
	//   dst_d[g]                      <- the block's half value `d`
	//   dst_q[g*QK4_0/2 + 0 .. 15]    <- the block's 4-bit values, regrouped
	//
	// Regrouping ("noshuffle" order): for every source byte pair
	// (x0, x1) = (qs[2*i], qs[2*i + 1]) the two low nibbles are packed into
	// output byte i and the two high nibbles into output byte i + QK4_0/4. The
	// first half of each output block therefore holds all low nibbles and the
	// second half all high nibbles, in source order.
	//
	// Parameters:
	//   src0  - input blocks, indexed by get_global_id(0).
	//   dst_q - output nibble bytes, QK4_0/2 bytes per block.
	//   dst_d - output half values, one per block.
	//
	// No bounds check is performed: the launch size must not exceed the number
	// of blocks available in src0 / dst_q / dst_d.
	kernel void kernel_convert_block_q4_0_noshuffle(
	    global struct block_q4_0 * src0,
	    global uchar * dst_q,
	    global half  * dst_d
	) {
	    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
	    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
	    global half  * d = (global half *) dst_d + get_global_id(0);

	    *d = b->d;

	    // Each iteration consumes two source bytes and produces two output bytes.
	    for (int i = 0; i < QK4_0/4; ++i) {
	        uchar x0 = b->qs[2*i + 0];
	        uchar x1 = b->qs[2*i + 1];

	        // Low nibbles of (x0, x1) -> first half; high nibbles -> second half.
	        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
	        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

	#ifdef ADRENO_GPU
	        // Workaround for adreno - must have the following printf statement for
	        // the kernel to work properly. Otherwise it produces incorrect result.
	        // convert_uchar above also seems necessary.
	        // Compare against a large number so that it does not print anything.
	        // get_sub_group_local_id() also works.
	        if (get_global_id(0) == 65536*4096) {
	            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
	        }
	#endif
	    }
	}