EADST

llama.cpp: Definations of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

The source code from llama.cpp /ggml-quants.c includes detailed definitions of various quantization structures used in neural networks and computational models. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K, are designed for efficient representation and processing of weights in a quantized format, reducing memory footprint while maintaining acceptable levels of accuracy.

//
// Super-block quantization structures
//

// Define the super-block size based on a preprocessor directive. 
// This affects the size of quantization blocks and related arrays.
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif

// 2-bit quantization structure
// Each weight is represented as x = a * q + b, where a is the scale and b is the minimum value.
// The structure is divided into 16 blocks of 16 elements each, leading to 2.625 bits per weight.

// When QK_K = 256, then scales = 16 bytes, qs = 64 bytes, d = 2 bytes, dmin = 2 bytes. The total is 84 bytes = 84 * 8  bits = 672 bits, so have 672 bits / 256 = 2.625 (bpw) bits per weight.

typedef struct {
    uint8_t scales[QK_K/16];    // Scales and minimums, quantized using 4 bits.
    uint8_t qs[QK_K/4];         // Quantized values.
    ggml_fp16_t d;              // Super-block scale for quantized scales.
    ggml_fp16_t dmin;           // Super-block scale for quantized minimums.
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

// 3-bit quantization structure
// Weights are represented as x = a * q, using only the scale factor a.
// Divided into 16 blocks of 16 elements each, this achieves 3.4375 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[2];        // Scale values.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else

// When QK_K = 256, then hmask= 32 bytes, qs = 64 bytes, scales = 12 bytes, d = 2 bytes. The total is 110 bytes = 110 * 8  bits = 880 bits, so we have 880 bits / 256 = 3.4375 (bpw) bits per weight.

typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[12];       // Scales, quantized with 6 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif

// 4-bit quantization structure
// Weights are again represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 4.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];         // Super-block scales/mins.
    uint8_t scales[2];        // 4-bit block scales/mins.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization structure
// Weights are represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 5.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;            // Super-block scale.
    int8_t  scales[QK_K/16];  // 8-bit block scales.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif

// 6-bit quantization structure
// Weights are represented as x = a * q.
// The structure is divided into 16 blocks of 16 elements each, achieving 6.5625 bits per weight.
typedef struct {
    uint8_t ql[QK_K/2];       // Lower 4 bits of the quantized values.
    uint8_t qh[QK_K/4];       // Upper 2 bits of the quantized values.
    int8_t  scales[QK_K/16];  // Scales, quantized with 8 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// Intermediate quantization and dot product structure
typedef struct {
    float   d;               // Delta value for quantization.
    int8_t  qs[QK_K];        // Quantized values.
    int16_t bsums[QK_K/16];  // Sum of quants in groups of 16.
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw (bits per weight) quantization structure
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
    uint8_t  scales[QK_K/32];// Scales for quantization.
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); 
相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
算法题 Shortcut Jetson Input v0.dev EXCEL FP64 PIP TTS AI 飞书 RGB NameSilo Food DeepStream CUDA tqdm Use PDB NLP Python 证件照 Pickle 顶会 FP16 Heatmap CLAP Mixtral CSV logger Logo Password mmap WAN Docker LoRA Tracking 签证 Knowledge 版权 GPT4 Nginx 域名 IndexTTS2 CC Sklearn Card Vim RAR CV Safetensors Base64 XGBoost Plotly Windows Plate Land Firewall XML 报税 SQL Transformers Distillation Disk Qwen2 HaggingFace 云服务器 git-lfs 关于博主 Dataset DeepSeek TensorRT 递归学习法 Github torchinfo tar SPIE MD5 GPTQ Anaconda PyCharm Freesound OpenCV Augmentation 多线程 uwsgi LLAMA JSON BF16 VSCode Hungarian Math Crawler uWSGI diffusers LeetCode Image2Text GGML FP8 YOLO Ubuntu Review Clash Bipartite VGG-16 BTC Interview 继承 v2ray Excel LLM UNIX Attention Git 多进程 CEIR BeautifulSoup ModelScope Llama CTC COCO 图标 Vmess TensorFlow News Miniforge Web llama.cpp Domain PyTorch Pillow InvalidArgumentError Search CAM Algorithm TSV C++ Permission SAM VPN hf Paper HuggingFace printf GoogLeNet GIT Animate NLTK Paddle UI Pytorch Hotel ONNX FastAPI Django 第一性原理 Website Claude PDF Qwen Tensor Qwen2.5 Ptyhon Datetime Pandas QWEN 净利润 transformers Color OCR WebCrawler Translation SVR 音频 图形思考法 Video 财报 ResNet-50 FP32 Bitcoin Cloudreve Template SQLite scipy ChatGPT Baidu Agent Conda Gemma LaTeX Quantization Markdown Diagram 公式 强化学习 OpenAI Numpy Random Breakpoint Bert git icon Statistics 腾讯云 Bin Quantize Michelin Hilton Streamlit Jupyter 阿里云 Google FlashAttention 搞笑 Magnet Linux API Data Tiktoken Proxy Zip
站点统计

本站现有博文322篇,共被浏览783913

本站已经建立2477天!

热门文章
文章归档
回到顶部