llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

The source code in llama.cpp's ggml-quants.c includes detailed definitions of the quantization structures used to store model weights. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K, are designed for efficient representation and processing of weights in a quantized format, reducing the memory footprint while maintaining acceptable accuracy.

//
// Super-block quantization structures
//

// Define the super-block size based on a preprocessor directive. 
// This affects the size of quantization blocks and related arrays.
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif

// 2-bit quantization structure
// Each weight is represented as x = a * q + b, where a is the scale and b is the minimum value.
// The structure is divided into 16 blocks of 16 elements each, leading to 2.625 bits per weight.

// When QK_K = 256: scales = 16 bytes, qs = 64 bytes, d = 2 bytes, and dmin = 2 bytes. The total is 84 bytes = 84 * 8 bits = 672 bits, so we have 672 bits / 256 = 2.625 bits per weight (bpw).

typedef struct {
    uint8_t scales[QK_K/16];    // Scales and minimums, quantized using 4 bits.
    uint8_t qs[QK_K/4];         // Quantized values.
    ggml_fp16_t d;              // Super-block scale for quantized scales.
    ggml_fp16_t dmin;           // Super-block scale for quantized minimums.
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
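
To make the layout concrete, below is a minimal sketch of dequantizing one q2_K super-block back to floats, assuming QK_K = 256 and the GGML_FP16_TO_FP32 conversion macro from ggml. The bit-packing order shown follows the reference dequantization loop, but it is included here purely as an illustration, not as the exact kernel.

// Sketch: dequantize one q2_K super-block (QK_K = 256) into 256 floats.
static void dequantize_q2_K_block(const block_q2_K * x, float * y) {
    const float d    = GGML_FP16_TO_FP32(x->d);     // super-block scale for the scales
    const float dmin = GGML_FP16_TO_FP32(x->dmin);  // super-block scale for the minimums
    for (int idx = 0; idx < QK_K; ++idx) {
        const uint8_t sc = x->scales[idx/16];       // one packed scale/min per 16 weights
        const float a = d    * (sc & 0xF);          // block scale  a (low 4 bits)
        const float b = dmin * (sc >>  4);          // block minimum b (high 4 bits)
        // qs is stored in two 128-weight halves; within a half, byte l holds the
        // quants of weights l, l+32, l+64, l+96 in its four 2-bit slots.
        const int half = idx / 128, rem = idx % 128;
        const int q = (x->qs[32*half + (rem % 32)] >> (2*(rem / 32))) & 3;
        y[idx] = a * q - b;                         // x = a*q - b (the stored minimum is subtracted)
    }
}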

// 3-bit quantization structure
// Weights are represented as x = a * q, using only the scale factor a.
// Divided into 16 blocks of 16 elements each, this achieves 3.4375 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[2];        // Scale values.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else

// When QK_K = 256: hmask = 32 bytes, qs = 64 bytes, scales = 12 bytes, and d = 2 bytes. The total is 110 bytes = 110 * 8 bits = 880 bits, so we have 880 bits / 256 = 3.4375 bits per weight (bpw).

typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[12];       // Scales, quantized with 6 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif
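
As an illustration of how the two arrays combine, the sketch below recovers a single signed 3-bit quant. The index-to-bit mapping is an assumption chosen for readability; the reference kernels in ggml-quants.c walk the arrays in their own fixed order.

// Sketch: recover one signed 3-bit quant (range [-4, 3]) from q3_K.
static inline int q3_K_quant(const block_q3_K * x, int idx /* 0..QK_K-1 */) {
    const int low  = (x->qs[idx / 4]    >> (2*(idx % 4))) & 3;  // 2 low bits (illustrative packing)
    const int high = (x->hmask[idx / 8] >> (idx % 8)) & 1;      // high bit from the mask
    return (low | (high << 2)) - 4;                             // centre the 3-bit value
}

The weight itself is then x = d * scale * q, with the per-block scale taken from the 6-bit packed scales array.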

// 4-bit quantization structure
// Weights are again represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 4.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];         // Super-block scales/mins.
    uint8_t scales[2];        // 4-bit block scales/mins.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif
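
For the QK_K = 256 layout, the 12-byte scales array packs eight 6-bit scales and eight 6-bit mins. Below is a sketch of one way to unpack them, modelled on the get_scale_min_k4 helper in ggml-quants.c; treat the exact bit positions as an assumption.

// Sketch: unpack the j-th (0..7) 6-bit scale and min from the 12-byte scales array.
static inline void unpack_scale_min_q4_K(const uint8_t * s, int j, uint8_t * sc, uint8_t * m) {
    if (j < 4) {
        *sc = s[j]     & 63;                              // low 6 bits of bytes 0..3
        *m  = s[j + 4] & 63;                              // low 6 bits of bytes 4..7
    } else {
        *sc = (s[j + 4] & 0xF) | ((s[j - 4] >> 6) << 4);  // low nibble of bytes 8..11 + spare top bits
        *m  = (s[j + 4] >>  4) | ((s[j]     >> 6) << 4);  // high nibble of bytes 8..11 + spare top bits
    }
}

A weight then dequantizes as x = GGML_FP16_TO_FP32(d) * sc * q - GGML_FP16_TO_FP32(dmin) * m, where q is the 4-bit quant from qs.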

// 5-bit quantization structure
// Weights are represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 5.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;            // Super-block scale.
    int8_t  scales[QK_K/16];  // 8-bit block scales.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
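
The sketch below recombines the low nibble from qs with the extra bit from qh into an unsigned 5-bit quant (0..31); as before, the index mapping is illustrative rather than the order used by the reference kernels.

// Sketch: rebuild one unsigned 5-bit quant from q5_K.
static inline int q5_K_quant(const block_q5_K * x, int idx /* 0..QK_K-1 */) {
    const int low  = (x->qs[idx / 2] >> (4*(idx % 2))) & 0xF;  // low 4 bits (illustrative packing)
    const int high = (x->qh[idx / 8] >> (idx % 8)) & 1;        // 5th bit from qh
    return low | (high << 4);                                  // value in 0..31
}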

// 6-bit quantization structure
// Weights are represented as x = a * q.
// The structure is divided into 16 blocks of 16 elements each, achieving 6.5625 bits per weight.
typedef struct {
    uint8_t ql[QK_K/2];       // Lower 4 bits of the quantized values.
    uint8_t qh[QK_K/4];       // Upper 2 bits of the quantized values.
    int8_t  scales[QK_K/16];  // Scales, quantized with 8 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
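
A corresponding sketch for q6_K, again with an illustrative index mapping, recombines the two arrays and applies the signed 8-bit block scale:

// Sketch: rebuild one signed 6-bit quant from q6_K and apply its block scale.
static inline float q6_K_weight(const block_q6_K * x, int idx /* 0..QK_K-1 */) {
    const int low  = (x->ql[idx / 2] >> (4*(idx % 2))) & 0xF;  // 4 low bits
    const int high = (x->qh[idx / 4] >> (2*(idx % 4))) & 3;    // 2 high bits
    const int q    = (low | (high << 4)) - 32;                 // signed value in [-32, 31]
    return GGML_FP16_TO_FP32(x->d) * x->scales[idx / 16] * q;  // x = a*q
}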

// Intermediate quantization and dot product structure
typedef struct {
    float   d;               // Delta value for quantization.
    int8_t  qs[QK_K];        // Quantized values.
    int16_t bsums[QK_K/16];  // Sum of quants in groups of 16.
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
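
A simplified sketch of quantizing one row of QK_K floats into this format is shown below; the reference quantize_row_q8_K in ggml-quants.c uses a slightly different scaling and rounding scheme, so this is only an approximation of the idea.

#include <math.h>

// Sketch: quantize one row of QK_K floats into a block_q8_K (simplified).
static void quantize_row_q8_K_sketch(const float * x, block_q8_K * y) {
    float amax = 0.0f;
    for (int j = 0; j < QK_K; ++j) amax = fmaxf(amax, fabsf(x[j]));  // largest magnitude
    const float d = amax / 127.0f;                                   // delta so quants fit in int8
    y->d = d;
    for (int j = 0; j < QK_K; ++j)
        y->qs[j] = (int8_t)(d > 0.0f ? roundf(x[j] / d) : 0.0f);     // quantized values
    for (int j = 0; j < QK_K/16; ++j) {                              // partial sums per group of 16
        int sum = 0;
        for (int l = 0; l < 16; ++l) sum += y->qs[16*j + l];
        y->bsums[j] = (int16_t)sum;
    }
}

The bsums entries let the dot-product kernels fold the per-sub-block minimums of the K-quants into a single multiply per group of 16, instead of revisiting every individual quant.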

// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to the 16-bit scale stored for each block of 256.
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw (bits per weight) quantization structure
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
    uint8_t  scales[QK_K/32];// Scales for quantization.
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); 
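
All of the bits-per-weight figures quoted above follow directly from the struct sizes. With QK_K = 256 (and the packing verified by the static_asserts), they can be reproduced with a few sizeof expressions, for example:

#include <stdio.h>

// Sketch: print bits per weight for each super-block type (assumes QK_K = 256).
static void print_bits_per_weight(void) {
    printf("q2_K:    %.4f bpw\n", 8.0 * sizeof(block_q2_K)    / QK_K);  //  84*8/256 = 2.625
    printf("q3_K:    %.4f bpw\n", 8.0 * sizeof(block_q3_K)    / QK_K);  // 110*8/256 = 3.4375
    printf("q4_K:    %.4f bpw\n", 8.0 * sizeof(block_q4_K)    / QK_K);  // 144*8/256 = 4.5
    printf("q5_K:    %.4f bpw\n", 8.0 * sizeof(block_q5_K)    / QK_K);  // 176*8/256 = 5.5
    printf("q6_K:    %.4f bpw\n", 8.0 * sizeof(block_q6_K)    / QK_K);  // 210*8/256 = 6.5625
    printf("q8_K:    %.4f bpw\n", 8.0 * sizeof(block_q8_K)    / QK_K);  // 292*8/256 = 9.125
    printf("iq2_xxs: %.4f bpw\n", 8.0 * sizeof(block_iq2_xxs) / QK_K);  //  66*8/256 = 2.0625
    printf("iq2_xs:  %.4f bpw\n", 8.0 * sizeof(block_iq2_xs)  / QK_K);  //  74*8/256 = 2.3125
}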