
llama.cpp: Definitions of the Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

The source file ggml-quants.c in llama.cpp contains detailed definitions of the quantization structures used to store model weights. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K (along with the IQ2_XXS and IQ2_XS variants shown at the end), represent weights in a compact quantized format, reducing the memory footprint while maintaining acceptable accuracy.

//
// Super-block quantization structures
//

// Define the super-block size based on a preprocessor directive. 
// This affects the size of quantization blocks and related arrays.
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif

// 2-bit quantization structure
// Each weight is represented as x = a * q + b, where a is the scale and b is the minimum value.
// The structure is divided into 16 blocks of 16 elements each, leading to 2.625 bits per weight.

// When QK_K = 256: scales = 16 bytes, qs = 64 bytes, d = 2 bytes, dmin = 2 bytes. The total is 84 bytes = 84 * 8 bits = 672 bits, so we have 672 bits / 256 weights = 2.625 bits per weight (bpw).

typedef struct {
    uint8_t scales[QK_K/16];    // Scales and minimums, quantized using 4 bits.
    uint8_t qs[QK_K/4];         // Quantized values.
    ggml_fp16_t d;              // Super-block scale for quantized scales.
    ggml_fp16_t dmin;           // Super-block scale for quantized minimums.
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
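
To make the x = a * q + b mapping concrete, here is a minimal, unoptimized sketch of decoding one q2_K super-block, following the interleaved layout used by dequantize_row_q2_K in ggml-quants.c. It assumes QK_K = 256 and that the GGML_FP16_TO_FP32 conversion macro is in scope:

// Sketch: decode one q2_K super-block into QK_K floats (assumes QK_K = 256).
static void dequantize_q2_K_sketch(const block_q2_K * b, float * y) {
    const float d    = GGML_FP16_TO_FP32(b->d);        // super-block scale
    const float dmin = GGML_FP16_TO_FP32(b->dmin);     // super-block min scale
    const uint8_t * q = b->qs;
    int is = 0;                                        // index into scales[]
    for (int n = 0; n < QK_K; n += 128) {              // two 128-element halves
        for (int shift = 0; shift < 8; shift += 2) {   // four 2-bit planes per byte
            for (int half = 0; half < 2; ++half) {     // two 16-element sub-blocks
                const uint8_t sc = b->scales[is++];
                const float dl = d    * (sc & 0xF);    // 4-bit sub-block scale
                const float ml = dmin * (sc >>  4);    // 4-bit sub-block min
                for (int l = 16*half; l < 16*half + 16; ++l) {
                    *y++ = dl * ((q[l] >> shift) & 3) - ml;  // x = a*q - m
                }
            }
        }
        q += 32;   // the next 32 bytes hold the next 128 quants
    }
}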

// 3-bit quantization structure
// Weights are represented as x = a * q, using only the scale factor a.
// Divided into 16 blocks of 16 elements each, this achieves 3.4375 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[2];        // Scale values.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else

// When QK_K = 256: hmask = 32 bytes, qs = 64 bytes, scales = 12 bytes, d = 2 bytes. The total is 110 bytes = 110 * 8 bits = 880 bits, so we have 880 bits / 256 weights = 3.4375 bits per weight (bpw).

typedef struct {
    uint8_t hmask[QK_K/8];    // High bit of the quantized values.
    uint8_t qs[QK_K/4];       // Low 2 bits of the quantized values.
    uint8_t scales[12];       // Scales, quantized with 6 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif
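
Because the third bit lives in hmask, each 3-bit quant has to be reassembled at decode time. A minimal sketch of that step, following ggml-quants.c, where the high bit is stored inverted so that an unset mask bit subtracts 4 and yields a signed value in [-4, 3]:

// Sketch: rebuild one signed 3-bit q3_K quant from its two storage fields.
static inline int8_t q3_value_sketch(uint8_t qs_bits, int high_bit_set) {
    // low 2 bits from qs; the inverted high bit from hmask shifts the range
    return (int8_t)(qs_bits & 3) - (high_bit_set ? 0 : 4);
}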

// 4-bit quantization structure
// Weights are again represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 4.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];         // Super-block scales/mins.
    uint8_t scales[2];        // 4-bit block scales/mins.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qs[QK_K/2];       // 4-bit quantized values.
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif
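
In the QK_K = 256 case, the 12-byte scales array packs eight 6-bit scales and eight 6-bit mins (96 bits in total). A sketch of the unpacking for sub-block j (0..7), following the layout of the get_scale_min_k4 helper in ggml-quants.c: the first 8 bytes carry the low bits, and the top 2 bits of each entry spill into the last 4 bytes:

// Sketch: extract the 6-bit scale (*d) and min (*m) for sub-block j.
static inline void get_scale_min_sketch(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j]     & 63;                            // 6-bit scale
        *m = q[j + 4] & 63;                            // 6-bit min
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);  // low nibble + spilled top bits
        *m = (q[j + 4] >>  4) | ((q[j]     >> 6) << 4);
    }
}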

// 5-bit quantization structure
// Weights are represented as x = a * q + b.
// The structure is divided into 8 blocks of 32 elements each, achieving 5.5 bits per weight.
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;            // Super-block scale.
    int8_t  scales[QK_K/16];  // 8-bit block scales.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;            // Super-block scale for quantized scales.
    ggml_fp16_t dmin;         // Super-block scale for quantized mins.
    uint8_t scales[K_SCALE_SIZE]; // Scales and mins, quantized with 6 bits.
    uint8_t qh[QK_K/8];       // High bit of the quantized values.
    uint8_t qs[QK_K/2];       // Low 4 bits of the quantized values.
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
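
Decoding q5_K mirrors q4_K, except each value regains its fifth bit from qh. A minimal sketch of reassembling one 5-bit quant (a value in [0, 31]), which is then mapped back as x = d * sc * q - dmin * m:

// Sketch: rebuild one 5-bit q5_K quant from a qs nibble and its qh bit.
static inline uint8_t q5_value_sketch(uint8_t qs_nibble, int high_bit_set) {
    // low 4 bits from qs, fifth bit from the matching position in qh
    return (uint8_t)((qs_nibble & 0xF) | (high_bit_set ? 16 : 0));
}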

// 6-bit quantization structure
// Weights are represented as x = a * q.
// The structure is divided into 16 blocks of 16 elements each, achieving 6.5625 bits per weight.
typedef struct {
    uint8_t ql[QK_K/2];       // Lower 4 bits of the quantized values.
    uint8_t qh[QK_K/4];       // Upper 2 bits of the quantized values.
    int8_t  scales[QK_K/16];  // Scales, quantized with 8 bits.
    ggml_fp16_t d;            // Super-block scale.
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
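
Here each 6-bit quant is split across ql and qh and recentered around zero. A minimal sketch of the reassembly, following the arithmetic in dequantize_row_q6_K: the result lies in [-32, 31] and the weight is recovered as x = d * scale * q:

// Sketch: rebuild one signed 6-bit q6_K quant from its two storage fields.
static inline int8_t q6_value_sketch(uint8_t ql_nibble, uint8_t qh_bits) {
    // 4 low bits from ql, 2 high bits from qh, then recenter by -32
    return (int8_t)((ql_nibble & 0xF) | ((qh_bits & 3) << 4)) - 32;
}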

// Structure used only for intermediate quantization and dot products.
typedef struct {
    float   d;               // Delta value for quantization.
    int8_t  qs[QK_K];        // Quantized values.
    int16_t bsums[QK_K/16];  // Sum of quants in groups of 16.
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
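
The bsums field caches the sum of every group of 16 quants, so dot-product kernels can fold the other operand's per-block minimums into one multiply per group rather than one per element. A minimal sketch of filling it once qs has been quantized:

// Sketch: populate bsums from the already-quantized int8 values in qs.
static void q8_K_fill_bsums_sketch(block_q8_K * b) {
    for (int j = 0; j < QK_K/16; ++j) {
        int sum = 0;
        for (int i = 0; i < 16; ++i) {
            sum += b->qs[j*16 + i];                   // sum one group of 16 quants
        }
        b->bsums[j] = (int16_t)sum;
    }
}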

// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw (bits per weight) quantization structure
typedef struct {
    ggml_fp16_t d;           // Super-block scale.
    uint16_t qs[QK_K/8];     // Quantized values.
    uint8_t  scales[QK_K/32];// Scales for quantization.
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); 
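
Since the static_asserts guarantee the structures contain no padding, the bits-per-weight figures quoted throughout can be reproduced directly from sizeof. A small hypothetical driver (not part of ggml-quants.c), assuming the definitions above are in scope with QK_K = 256:

#include <stdio.h>

// Print bits per weight for each super-block type: 8 * bytes / weights.
int main(void) {
    printf("q2_K:    %.4f bpw\n", 8.0 * sizeof(block_q2_K)    / QK_K);  // 2.6250
    printf("q3_K:    %.4f bpw\n", 8.0 * sizeof(block_q3_K)    / QK_K);  // 3.4375
    printf("q4_K:    %.4f bpw\n", 8.0 * sizeof(block_q4_K)    / QK_K);  // 4.5000
    printf("q5_K:    %.4f bpw\n", 8.0 * sizeof(block_q5_K)    / QK_K);  // 5.5000
    printf("q6_K:    %.4f bpw\n", 8.0 * sizeof(block_q6_K)    / QK_K);  // 6.5625
    printf("iq2_xxs: %.4f bpw\n", 8.0 * sizeof(block_iq2_xxs) / QK_K);  // 2.0625
    printf("iq2_xs:  %.4f bpw\n", 8.0 * sizeof(block_iq2_xs)  / QK_K);  // 2.3125
    return 0;
}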