llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures
作者:XD / 发表: 2024年1月25日 01:05 / 更新: 2024年1月25日 01:15 / 编程笔记 / 阅读量:1328
The llama.cpp source file ggml-quants.c includes detailed definitions of the quantization structures used to store the weights of neural networks and other computational models. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K, are designed to represent and process weights efficiently in a quantized format, reducing the memory footprint while maintaining an acceptable level of accuracy.
//
// Super-block quantization structures
//
// QK_K is the super-block size used to dimension the quantization blocks and
// their arrays; K_SCALE_SIZE is the byte length of the scales arrays in the
// structures that reference it.
// Building with GGML_QKK_64 defined selects the 64-wide configuration;
// otherwise the default 256-wide configuration is used.
#if !defined(GGML_QKK_64)
#define QK_K 256
#define K_SCALE_SIZE 12
#else
#define QK_K 64
#define K_SCALE_SIZE 4
#endif
// 2-bit quantization structure
// Each weight is represented as x = a * q + b, where a is the scale and b is the minimum value.
// qs holds QK_K/4 bytes for QK_K weights, i.e. four 2-bit quants per byte.
// With QK_K = 256 the super-block is divided into 16 blocks of 16 elements each
// (scales has one byte per block), leading to 2.625 bits per weight.
// Size for QK_K = 256: scales = 16 bytes, qs = 64 bytes, d = 2 bytes, dmin = 2 bytes.
// The total is 84 bytes = 84 * 8 bits = 672 bits, so we have 672 bits / 256 weights = 2.625 bits per weight (bpw).
typedef struct {
uint8_t scales[QK_K/16]; // Scales and minimums, quantized using 4 bits.
uint8_t qs[QK_K/4]; // Quantized values (2 bits each, four per byte).
ggml_fp16_t d; // Super-block scale for quantized scales.
ggml_fp16_t dmin; // Super-block scale for quantized minimums.
} block_q2_K;
// Compile-time check that the struct contains no padding: its size must equal the sum of its fields.
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
// 3-bit quantization structure
// Weights are represented as x = a * q, using only the scale factor a (there is no minimum term).
// Each 3-bit quant is split across two arrays: its low 2 bits live in qs and its high bit in hmask.
// With QK_K = 256 the super-block is divided into 16 blocks of 16 elements each, which achieves 3.4375 bits per weight.
#ifdef GGML_QKK_64
// Small super-block variant (QK_K = 64): hmask = 8 bytes, qs = 16 bytes, scales = 2 bytes, d = 2 bytes.
// The total is 28 bytes = 224 bits, so we have 224 bits / 64 weights = 3.5 bits per weight.
typedef struct {
uint8_t hmask[QK_K/8]; // High bit of the quantized values (one bit per weight).
uint8_t qs[QK_K/4]; // Low 2 bits of the quantized values (four per byte).
uint8_t scales[2]; // Scale values.
ggml_fp16_t d; // Super-block scale.
} block_q3_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else
// When QK_K = 256: hmask = 32 bytes, qs = 64 bytes, scales = 12 bytes, d = 2 bytes.
// The total is 110 bytes = 110 * 8 bits = 880 bits, so we have 880 bits / 256 weights = 3.4375 bits per weight (bpw).
typedef struct {
uint8_t hmask[QK_K/8]; // High bit of the quantized values (one bit per weight).
uint8_t qs[QK_K/4]; // Low 2 bits of the quantized values (four per byte).
uint8_t scales[12]; // Scales, quantized with 6 bits (16 scales * 6 bits = 12 bytes).
ggml_fp16_t d; // Super-block scale.
} block_q3_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif
// 4-bit quantization structure
// Weights are again represented as x = a * q + b.
// qs holds QK_K/2 bytes for QK_K weights, i.e. two 4-bit quants per byte.
// With QK_K = 256 the super-block is divided into 8 blocks of 32 elements each, achieving 4.5 bits per weight.
#ifdef GGML_QKK_64
// Small super-block variant (QK_K = 64), assuming a 2-byte ggml_fp16_t:
// d = 4 bytes, scales = 2 bytes, qs = 32 bytes.
// The total is 38 bytes = 304 bits, so we have 304 bits / 64 weights = 4.75 bits per weight.
typedef struct {
ggml_fp16_t d[2]; // Super-block scales/mins.
uint8_t scales[2]; // 4-bit block scales/mins.
uint8_t qs[QK_K/2]; // 4-bit quantized values (two per byte).
} block_q4_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
// When QK_K = 256, assuming a 2-byte ggml_fp16_t:
// d = 2 bytes, dmin = 2 bytes, scales = 12 bytes, qs = 128 bytes.
// The total is 144 bytes = 1152 bits, so we have 1152 bits / 256 weights = 4.5 bits per weight.
typedef struct {
ggml_fp16_t d; // Super-block scale for quantized scales.
ggml_fp16_t dmin; // Super-block scale for quantized mins.
uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
uint8_t qs[QK_K/2]; // 4-bit quantized values (two per byte).
} block_q4_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif
// 5-bit quantization structure
// Weights are represented as x = a * q + b.
// Each 5-bit quant is split across two arrays: its low 4 bits live in qs and its high bit in qh.
// With QK_K = 256 the super-block is divided into 8 blocks of 32 elements each, achieving 5.5 bits per weight.
#ifdef GGML_QKK_64
// Small super-block variant (QK_K = 64), assuming a 2-byte ggml_fp16_t:
// d = 2 bytes, scales = 4 bytes, qh = 8 bytes, qs = 32 bytes.
// The total is 46 bytes = 368 bits, so we have 368 bits / 64 weights = 5.75 bits per weight.
// Note that this variant has no dmin field and stores signed 8-bit scales.
typedef struct {
ggml_fp16_t d; // Super-block scale.
int8_t scales[QK_K/16]; // 8-bit block scales.
uint8_t qh[QK_K/8]; // High bit of the quantized values (one bit per weight).
uint8_t qs[QK_K/2]; // Low 4 bits of the quantized values (two per byte).
} block_q5_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
// When QK_K = 256, assuming a 2-byte ggml_fp16_t:
// d = 2 bytes, dmin = 2 bytes, scales = 12 bytes, qh = 32 bytes, qs = 128 bytes.
// The total is 176 bytes = 1408 bits, so we have 1408 bits / 256 weights = 5.5 bits per weight.
typedef struct {
ggml_fp16_t d; // Super-block scale for quantized scales.
ggml_fp16_t dmin; // Super-block scale for quantized mins.
uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
uint8_t qh[QK_K/8]; // High bit of the quantized values (one bit per weight).
uint8_t qs[QK_K/2]; // Low 4 bits of the quantized values (two per byte).
} block_q5_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
// 6-bit quantization structure
// Weights are represented as x = a * q (scale only, no minimum term).
// Each 6-bit quant is split across two arrays: its lower 4 bits live in ql and its upper 2 bits in qh.
// With QK_K = 256 the super-block is divided into 16 blocks of 16 elements each
// (scales has one entry per block), achieving 6.5625 bits per weight.
// Size for QK_K = 256, assuming a 2-byte ggml_fp16_t:
// ql = 128 bytes, qh = 64 bytes, scales = 16 bytes, d = 2 bytes.
// The total is 210 bytes = 1680 bits, so we have 1680 bits / 256 weights = 6.5625 bits per weight.
typedef struct {
uint8_t ql[QK_K/2]; // Lower 4 bits of the quantized values (two per byte).
uint8_t qh[QK_K/4]; // Upper 2 bits of the quantized values (four per byte).
int8_t scales[QK_K/16]; // Scales, quantized with 8 bits.
ggml_fp16_t d; // Super-block scale.
} block_q6_K;
// Compile-time check that the struct contains no padding; 3*QK_K/4 is ql (QK_K/2) plus qh (QK_K/4).
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
// Intermediate quantization and dot product structure
// Stores one full byte per weight plus a 32-bit float scale, unlike the other
// structures here, which use a ggml_fp16_t super-block scale.
// With QK_K = 256: qs = 256 bytes and bsums = 16 * sizeof(int16_t) = 32 bytes, in addition to d.
typedef struct {
float d; // Delta value for quantization.
int8_t qs[QK_K]; // Quantized values (one signed byte per weight).
int16_t bsums[QK_K/16]; // Sum of quants in groups of 16.
} block_q8_K;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to the 16-bit scale stored for each block of 256.
// Size for QK_K = 256, assuming a 2-byte ggml_fp16_t: d = 2 bytes, qs = 32 * 2 = 64 bytes.
// The total is 66 bytes = 528 bits, so we have 528 bits / 256 weights = 2.0625 bits per weight.
typedef struct {
ggml_fp16_t d; // Super-block scale.
uint16_t qs[QK_K/8]; // Quantized values (one 16-bit word per 8 weights).
} block_iq2_xxs;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
// 2.3125 bpw (bits per weight) quantization structure
// Size for QK_K = 256, assuming a 2-byte ggml_fp16_t:
// d = 2 bytes, qs = 32 * 2 = 64 bytes, scales = 8 bytes.
// The total is 74 bytes = 592 bits, so we have 592 bits / 256 weights = 2.3125 bits per weight.
typedef struct {
ggml_fp16_t d; // Super-block scale.
uint16_t qs[QK_K/8]; // Quantized values (one 16-bit word per 8 weights).
uint8_t scales[QK_K/32];// Scales for quantization (one byte per 32 weights).
} block_iq2_xs;
// Compile-time check that the struct contains no padding.
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
相关标签