EADST

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

import torch

# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_0_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through Q4_0 quantization and return the result.

    The tensor is flattened into blocks of 32 values. Each block gets one
    scale ``d`` and every value is mapped to a 4-bit code in ``[0, 15]``;
    the codes are then immediately mapped back to floats. The arithmetic
    follows the llama.cpp Q4_0 reference routines:

    * ``d`` is derived from the *signed* element with the largest
      magnitude (``d = max / -8``), so that element always lands exactly
      on code 0.
    * The inverse scale is ``1 / d``, or 0 when ``d`` is 0 (all-zero block).
    * ``d`` is rounded to float16 before dequantization, because that is
      the precision at which Q4_0 stores the scale.

    Args:
        tensor: A ``torch.Tensor`` whose element count is a multiple of 32.
            It may be non-contiguous. It is converted to float32 and moved
            to the module-level ``device`` before processing.

    Returns:
        A float16 tensor with the same shape as ``tensor``, located on
        ``device``, holding the dequantized values.

    Raises:
        RuntimeError: If the number of elements is not divisible by 32
            (raised by the reshape into 32-value blocks).
    """
    tensor = tensor.to(dtype=torch.float32, device=device)
    orig_shape = tensor.shape

    # Nothing to quantize; also avoids an ambiguous reshape(-1, 32) on an
    # empty tensor.
    if tensor.numel() == 0:
        return tensor.to(dtype=torch.float16)

    # reshape (rather than view) so non-contiguous inputs are accepted.
    blocks = tensor.reshape(-1, 32)

    # Signed value of the element with the largest magnitude in each block.
    # argmax returns the first index among ties, which matches the
    # strict ">" scan used by the reference implementation.
    max_idx = torch.argmax(torch.abs(blocks), dim=1, keepdim=True)
    max_vals = torch.gather(blocks, 1, max_idx).squeeze(1)

    # Per-block scale and its inverse; an all-zero block has d == 0 and
    # uses an inverse of 0 so every code becomes 8 (i.e. dequantizes to 0).
    d = max_vals / -8.0
    ids = torch.where(d == 0, torch.zeros_like(d), 1.0 / d)

    # Scale, shift by 8.5 and truncate to get the 4-bit codes. The clamp
    # caps the single possible overflow value (16) at 15.
    scaled = blocks * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # Dequantize with the scale rounded to float16, as it would be stored.
    d_stored = d.to(torch.float16).to(torch.float32)
    dequantized = (quantized.float() - 8.0) * d_stored[:, None]

    # Restore the caller's shape and hand back float16 values.
    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source (consider weights_only=True where the checkpoint allows it).
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that are quantized; every other
# entry of the checkpoint is left untouched.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only existing keys are reassigned, so the dict size never changes while it
# is being iterated.
for name, data in model_part.items():
    # any() stops at the first matching keyword, so a tensor is processed
    # exactly once even if its name matches several keywords.
    if any(word in name for word in keywords):
        # Quantize and dequantize the entire tensor, then move the result
        # back to the CPU so results do not pile up in GPU memory and the
        # saved checkpoint does not contain CUDA tensors.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
图形思考法 Excel 论文速读 icon BeautifulSoup Vim LaTeX PyCharm ONNX ResNet-50 云服务器 Disk Anaconda GPT4 Website Tracking Pandas PDB Quantize VSCode 多进程 Vmess Python Clash Animate VPN Google LLAMA GGML HuggingFace ms-swift Firewall Sklearn Hungarian SQLite printf Safetensors CAM AI 腾讯云 SQL Dataset ChatGPT 报税 Bipartite Bitcoin Nginx 版权 XML SPIE FP32 Git git Ptyhon FlashAttention Video 阿里云 Food Image2Text 证件照 关于博主 CSV v0.dev TensorRT Input Cloudreve NLTK llama.cpp Hotel Statistics COCO Land TSV Ubuntu Magnet 飞书 Qwen diffusers SVR Algorithm 顶会 RGB Jupyter Domain Heatmap NLP NameSilo 域名 DeepSeek PDF Michelin 多线程 Llama scipy 第一性原理 MD5 ModelScope Transformers BTC Hilton Web SAM Qwen2.5 Numpy GPTQ Pickle 算法题 PIP Translation 继承 PyTorch Use v2ray CTC Zip Permission Datetime Gemma Miniforge YOLO 论文 JSON Shortcut OpenCV Rebuttal FP16 Proxy tqdm Quantization LLM CC CV Template 强化学习 Django Color Windows Jetson QWEN Random uwsgi IndexTTS2 torchinfo CLAP Freesound logger OpenAI Pytorch LeetCode Agent Mixtral Pillow Password hf FP64 CEIR Data UNIX GIT UI WebCrawler uWSGI LoRA Interview Paddle Docker Streamlit Plotly TensorFlow 净利润 Plate EXCEL HaggingFace API 音频 Linux GoogLeNet 递归学习法 Base64 Attention mmap OCR XGBoost 公式 VGG-16 InvalidArgumentError RAR Search Baidu 签证 Knowledge Paper C++ Diagram Augmentation 图标 BF16 Markdown Logo Card tar Bert Tiktoken git-lfs Math transformers FP8 搞笑 Crawler News Claude Qwen2 DeepStream Github WAN FastAPI CUDA Bin Tensor Distillation 财报 Breakpoint TTS Review Conda
站点统计

本站现有博文330篇,共被浏览863192

本站已经建立2571天!

热门文章
文章归档
回到顶部