EADST

PyTorch Q4_0 quantize and dequantize, aligned with llama.cpp

import torch

# Select the compute device once at import time: prefer CUDA when a GPU
# is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

def q4_0_quantize_and_dequantize_tensor(tensor, device=None):
    """Round-trip ``tensor`` through llama.cpp's Q4_0 quantization scheme.

    Mirrors ``quantize_row_q4_0_reference`` in llama.cpp: each block of 32
    values (QK4_0) shares one scale ``d`` derived from the block's
    largest-magnitude element, and every value is mapped to a 4-bit code
    in [0, 15], then immediately dequantized back to floats.

    Args:
        tensor: input tensor; its total number of elements must be a
            multiple of 32 (QK4_0 block size), or ``view(-1, 32)`` raises.
        device: optional target device. Defaults to the module-level
            ``device`` when defined (previous behavior), otherwise the
            tensor's own device.

    Returns:
        A float16 tensor with the original shape holding the
        quantize-dequantize round-trip of the input.
    """
    if device is None:
        # Backward compatible: use the module-level `device` if this file
        # defined one; fall back to the tensor's device in isolation.
        device = globals().get("device", tensor.device)
    tensor = tensor.to(dtype=torch.float32, device=device)

    # Process each 32-value block independently (QK4_0 == 32).
    orig_shape = tensor.shape
    tensor = tensor.view(-1, 32)

    # llama.cpp keeps the SIGN of the largest-magnitude element: the scale
    # is d = max / -8 where `max` is the signed value with the largest
    # absolute value — NOT the absolute maximum. Using abs-max here would
    # flip the sign of d for blocks whose extreme value is negative and
    # diverge from llama.cpp's output.
    abs_argmax = tensor.abs().argmax(dim=1)
    max_vals = tensor.gather(1, abs_argmax.unsqueeze(1)).squeeze(1)

    # All-zero blocks: llama.cpp uses id = 0, quantizing everything to
    # code 8, which dequantizes back to 0. Substituting 1.0 here yields
    # the same round-trip result while avoiding division by zero.
    max_vals[max_vals == 0] = 1.0

    # Per-block scale and its reciprocal.
    d = max_vals / -8.0
    ids = 1.0 / d

    # Map each value to a 4-bit code. The +8.5 offset plus truncation on
    # the uint8 cast reproduces llama.cpp's (int8_t)(x*id + 8.5f),
    # clamped to the representable range [0, 15].
    scaled_tensors = tensor * ids[:, None]
    quantized_tensors = torch.clamp(scaled_tensors + 8.5, 0, 15).to(torch.uint8)

    # Dequantize: value = (code - 8) * d.
    dequantized_tensors = (quantized_tensors.float() - 8.0) * d[:, None]

    # Restore the original shape; fp16 matches typical HF checkpoint dtype.
    return dequantized_tensors.view(orig_shape).to(dtype=torch.float16)

# Round-trip the main weight matrices of a HF-style checkpoint through
# Q4_0 and save the result.
# NOTE(review): both paths are hard-coded placeholders — point them at the
# real checkpoint before running.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Only tensors whose name contains one of these substrings are quantized;
# everything else (norms, biases, ...) is left untouched.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]

for name, data in model_part.items():
    # any() stops at the first matching keyword, so each tensor is
    # quantized at most once (the original inner loop could redo the
    # work when several keywords matched the same name).
    if any(word in name for word in keywords):
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data)

# Persist the updated state dict.
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Attention ResNet-50 Safetensors Magnet FP8 版权 Ptyhon Vmess git-lfs 证件照 Card COCO Agent LLAMA TTS Llama PyCharm Markdown Tracking Breakpoint Translation logger Input printf Website LeetCode Plotly Docker hf Pytorch QWEN SQL MD5 GIT Proxy FlashAttention NLP WebCrawler Baidu Algorithm 飞书 搞笑 GPTQ Transformers Bipartite Password GPT4 音频 Data Datetime Use EXCEL Windows Google InvalidArgumentError Search Miniforge Freesound Python Anaconda Disk Quantize VPN FastAPI Nginx v0.dev Image2Text Django 图形思考法 公式 UI 关于博主 净利润 报税 BeautifulSoup git ONNX transformers 财报 OpenAI Quantization API AI UNIX Permission Shortcut tar Review diffusers SQLite NameSilo Logo HuggingFace CAM Linux 云服务器 Cloudreve Gemma Augmentation Template DeepSeek 域名 LaTeX Bin Pillow Numpy uWSGI Hotel Color 签证 阿里云 C++ Crawler uwsgi Knowledge Qwen2 BF16 Plate CV CUDA Bert Distillation Firewall Random Heatmap mmap Web Diagram 腾讯云 Pickle 多进程 Statistics TensorFlow BTC OCR Animate Math 顶会 Excel Pandas SAM Dataset Video YOLO WAN OpenCV Hungarian CTC RGB Streamlit 算法题 Bitcoin ChatGPT Git LoRA llama.cpp Tiktoken scipy Vim Domain Ubuntu Hilton GGML PyTorch Mixtral XGBoost FP16 Qwen2.5 多线程 递归学习法 XML Sklearn PIP Tensor ModelScope SVR CLAP 第一性原理 Paper Zip v2ray VGG-16 SPIE Michelin Base64 Land JSON DeepStream Food PDF CSV tqdm RAR Clash FP64 Jupyter VSCode PDB Github HaggingFace IndexTTS2 Claude Conda GoogLeNet 强化学习 CEIR LLM Paddle TSV TensorRT CC NLTK FP32 Qwen Jetson 继承 Interview torchinfo News
站点统计

本站现有博文321篇,共被浏览770810

本站已经建立2457天!

热门文章
文章归档
回到顶部