EADST

llama.cpp: Definations of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

The source code from llama.cpp /ggml-quants.c includes detailed definitions of various quantization structures used in neural networks and computational models. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K, are designed for efficient representation and processing of weights in a quantized format, reducing memory footprint while maintaining acceptable levels of accuracy.

//
// Super-block quantization structures
//

// Define the super-block size based on a preprocessor directive. 
// This affects the size of quantization blocks and related arrays.
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif

// 2-bit quantization structure
// Each weight is represented as x = a * q + b, where a is the scale and b is the minimum value.
// The structure is divided into 16 blocks of 16 elements each, leading to 2.625 bits per weight.

// When QK_K = 256, then scales = 16 bytes, qs = 64 bytes, d = 2 bytes, dmin = 2 bytes. The total is 84 bytes = 84 * 8  bits = 672 bits, so have 672 bits / 256 = 2.625 (bpw) bits per weight.

typedef struct {
    uint8_t scales[QK_K/16];    // Scales and minimums, quantized using 4 bits.
    uint8_t qs[QK_K/4];         // Quantized values.
    ggml_fp16_t d;              // Super-block scale for quantized scales.
    ggml_fp16_t dmin;           // Super-block scale for quantized minimums.
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

// 3-bit quantization structure
// Weights are represented as x = a * q, using only the scale factor a.
// Divided into 16 blocks of 16 elements each, this achieves 3.4375 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[2];        // Scale values.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else

// When QK_K = 256, then hmask= 32 bytes, qs = 64 bytes, scales = 12 bytes, d = 2 bytes. The total is 110 bytes = 110 * 8  bits = 880 bits, so we have 880 bits / 256 = 3.4375 (bpw) bits per weight.

typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[12];       // Scales, quantized with 6 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif

// 4-bit quantization structure
// Weights are again represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 4.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];         // Super-block scales/mins.
    uint8_t scales[2];        // 4-bit block scales/mins.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization structure
// Weights are represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 5.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;            // Super-block scale.
    int8_t  scales[QK_K/16];  // 8-bit block scales.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif

// 6-bit quantization structure
// Weights are represented as x = a * q.
// The structure is divided into 16 blocks of 16 elements each, achieving 6.5625 bits per weight.
typedef struct {
    uint8_t ql[QK_K/2];       // Lower 4 bits of the quantized values.
    uint8_t qh[QK_K/4];       // Upper 2 bits of the quantized values.
    int8_t  scales[QK_K/16];  // Scales, quantized with 8 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// Intermediate quantization and dot product structure
typedef struct {
    float   d;               // Delta value for quantization.
    int8_t  qs[QK_K];        // Quantized values.
    int16_t bsums[QK_K/16];  // Sum of quants in groups of 16.
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw (bits per weight) quantization structure
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
    uint8_t  scales[QK_K/32];// Scales for quantization.
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); 
相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
API mmap Zip Jupyter News torchinfo uwsgi CLAP Password Use uWSGI Docker ResNet-50 GoogLeNet RAR 论文速读 Sklearn 腾讯云 HuggingFace Safetensors FlashAttention Disk Pandas Vmess FastAPI Nginx 净利润 Agent InvalidArgumentError CTC Website Bipartite 飞书 Algorithm Paddle BTC 图标 Bitcoin Llama SPIE UI Jetson Hilton Pytorch Excel TensorRT TSV Translation 递归学习法 Crawler Rebuttal Tracking PDF VSCode Review CC Qwen2 论文 Attention llama.cpp NameSilo Cloudreve PyTorch EXCEL NLP Animate 音频 CEIR Tensor transformers ChatGPT JSON GPT4 Hungarian Distillation VPN diffusers VGG-16 Bert scipy Hotel git Mixtral LLM Diagram BF16 LLAMA TTS 图形思考法 PIP AI FP16 Freesound CSV Plotly 关于博主 tar Firewall Conda SQLite BeautifulSoup v2ray Paper Transformers Land TensorFlow Web Permission Windows 强化学习 FP64 Clash Linux FP32 OpenCV 第一性原理 报税 Python XML 搞笑 继承 多线程 YOLO printf v0.dev Template Anaconda PyCharm Ubuntu 财报 RGB Bin Math LaTeX Food COCO GGML Data tqdm OCR Knowledge Google Michelin SQL 云服务器 Statistics WAN Qwen Git SAM SVR Pillow GIT ONNX Shortcut Claude Django Color Video Heatmap C++ icon Numpy PDB DeepSeek 版权 Vim Datetime 算法题 Baidu Proxy LeetCode GPTQ Interview 顶会 多进程 DeepStream CV 阿里云 Streamlit Gemma Logo CAM 签证 WebCrawler Ptyhon Plate IndexTTS2 OpenAI Image2Text 域名 QWEN Miniforge 公式 UNIX Card MD5 CUDA Github Base64 logger Dataset Qwen2.5 Magnet Quantize LoRA git-lfs ModelScope Random Augmentation Domain XGBoost 证件照 Search Breakpoint NLTK FP8 Pickle Markdown hf HaggingFace Input Tiktoken Quantization ms-swift
站点统计

本站现有博文330篇,共被浏览863124

本站已经建立2571天!

热门文章
文章归档
回到顶部