EADST

PyTorch Q4_0 quantization and dequantization, aligned with llama.cpp's reference implementation.

import torch

# Pick the compute device once at module load: prefer the GPU when present.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

def q4_0_quantize_and_dequantize_tensor(tensor, device=None):
    """Round-trip ``tensor`` through Q4_0 quantization and return the lossy
    float16 result, matching llama.cpp's ``quantize_row_q4_0`` reference.

    Q4_0 packs each block of 32 values into 4-bit codes plus one fp16 scale
    ``d``.  llama.cpp derives ``d`` from the SIGNED element with the largest
    magnitude (``d = max / -8``), so ``d`` may be positive or negative.
    Using ``abs().max()`` alone loses that sign and yields different codes
    for blocks whose extreme value is negative (e.g. -8 would round-trip to
    -7 instead of -8); this implementation keeps the sign.

    Args:
        tensor: input tensor; its element count must be a multiple of 32
            (Q4_0 block size), otherwise ``view(-1, 32)`` raises.
        device: optional ``torch.device``; defaults to CUDA when available
            (same behavior as the original module-level device selection).

    Returns:
        A float16 tensor of the original shape after the Q4_0 round trip.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor = tensor.to(dtype=torch.float32, device=device)

    # Process each 32-value block independently.
    orig_shape = tensor.shape
    tensor = tensor.view(-1, 32)

    # Signed element with the largest magnitude per block — keeps the sign,
    # exactly like llama.cpp's quantize_row_q4_0 reference loop.
    amax_idx = tensor.abs().argmax(dim=1, keepdim=True)
    max_signed = tensor.gather(1, amax_idx).squeeze(1)

    # d = max / -8; id = d ? 1/d : 0, so all-zero blocks quantize to code 8
    # and dequantize back to exactly zero.
    d = max_signed / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale, shift by 8.5 and truncate: scaled values lie in [-8, 8], so the
    # shift-then-truncate equals llama.cpp's MIN(15, (int8_t)(x*id + 8.5f)).
    scaled = tensor * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # llama.cpp stores d as fp16; dequantize with the fp16-rounded scale to
    # reproduce the stored-format precision.
    d_fp16 = d.to(torch.float16).to(torch.float32)
    dequantized = (quantized.float() - 8.0) * d_fp16[:, None]

    # Restore the original shape; final dtype mirrors llama.cpp's fp16 data.
    return dequantized.view(orig_shape).to(dtype=torch.float16)

# Round-trip every quantizable weight in the checkpoint through Q4_0.
# The checkpoint is loaded on CPU; each tensor whose name contains one of
# the keyword substrings is replaced by its quantize/dequantize image,
# then the whole state dict is written back out.
model_part = torch.load(f"your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings naming the weight matrices llama.cpp quantizes with Q4_0.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

for name, data in model_part.items():
    if any(word in name for word in keywords):
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data)

# Persist the quantized-and-dequantized checkpoint.
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Diagram WebCrawler GoogLeNet Use 强化学习 NLP Interview 继承 ONNX NameSilo printf Food Safetensors Baidu uwsgi SAM Password MD5 TSV Data News Hilton PDB Bipartite OCR LeetCode Ptyhon Color 版权 AI PDF Card VPN OpenCV Base64 Logo v2ray CC Tiktoken 多进程 logger Nginx Google CTC Shortcut 财报 Domain Crawler Anaconda Disk ModelScope Pandas tar Animate Plate Math FP32 SVR Heatmap CSV Ubuntu v0.dev GPTQ DeepSeek API Mixtral Image2Text 证件照 ResNet-50 Permission Statistics Qwen2 Web Bert TTS Plotly SPIE Land XML Git 阿里云 Excel BTC 顶会 TensorFlow mmap CEIR C++ Video Python Breakpoint Pillow 云服务器 Paper Random Claude RGB WAN QWEN Bitcoin Qwen2.5 搞笑 第一性原理 Zip Pickle Miniforge Vmess BeautifulSoup XGBoost tqdm Review Windows CUDA Gemma GGML 递归学习法 RAR NLTK Template 签证 Hungarian PyTorch Jetson 音频 Markdown DeepStream Github Algorithm FlashAttention PIP Search git Translation Tensor Knowledge ChatGPT TensorRT JSON InvalidArgumentError Freesound Qwen Michelin Quantization Input IndexTTS2 LLM Linux 报税 COCO HaggingFace 图形思考法 UNIX Cloudreve YOLO Streamlit Transformers hf Agent LoRA Website 算法题 公式 uWSGI Clash PyCharm FP16 Quantize Sklearn Pytorch Dataset diffusers Proxy Attention Vim llama.cpp Llama CV LLAMA Magnet VSCode Paddle 净利润 多线程 VGG-16 域名 git-lfs Conda Datetime HuggingFace Jupyter CAM UI Docker FastAPI 关于博主 SQL GPT4 FP8 Numpy GIT OpenAI Augmentation Django SQLite Tracking transformers LaTeX torchinfo Bin Firewall EXCEL CLAP 飞书 FP64 腾讯云 Distillation BF16 Hotel scipy
站点统计

本站现有博文321篇,共被浏览783119

本站已经建立2476天!

热门文章
文章归档
回到顶部