Pytorch Q4_0 Quantize and Dequantize aligning with llama.cpp

Q4_0 quantizes weights in blocks of 32 values: each block stores a single scale d (kept as fp16 in llama.cpp's block_q4_0) plus 32 four-bit codes in [0, 15], with code 8 representing zero. The PyTorch snippet below reproduces the same quantize-then-dequantize round trip, so you can inspect in PyTorch what the weights look like after llama.cpp's Q4_0 compression.

import torch

# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

def q4_0_quantize_and_dequantize_tensor(tensor):
    tensor = tensor.to(dtype=torch.float32, device=device)

    # Reshape tensor to process each 32-value block independently
    # (the element count must be divisible by the Q4_0 block size of 32)
    orig_shape = tensor.shape
    tensor = tensor.view(-1, 32)

    # Find the signed value with the largest magnitude per block;
    # llama.cpp keeps the sign of this extreme, not just the absolute maximum
    max_idx = tensor.abs().argmax(dim=1, keepdim=True)
    max_vals = tensor.gather(1, max_idx).squeeze(1)

    # Calculate d and its inverse per block (id = d ? 1/d : 0 in llama.cpp)
    d = max_vals / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale and quantize tensor elements to 4-bit codes in [0, 15]
    scaled_tensors = tensor * ids[:, None]
    quantized_tensors = torch.clamp(scaled_tensors + 8.5, 0, 15).to(torch.uint8)

    # llama.cpp stores d as fp16 in block_q4_0, so round it through half precision
    d = d.to(torch.float16).to(torch.float32)

    # Dequantize the tensor
    dequantized_tensors = (quantized_tensors.float() - 8.0) * d[:, None]

    # Reshape back to the original shape
    dequantized_tensors = dequantized_tensors.view(orig_shape).to(dtype=torch.float16)

    return dequantized_tensors
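
# Illustrative sanity check (shapes here are arbitrary placeholders): round-trip
# a random fp16 tensor and report the worst absolute error, which should be on
# the order of the per-block quantization step |d| = max|x| / 8
_x = torch.randn(64, 128, dtype=torch.float16)
_y = q4_0_quantize_and_dequantize_tensor(_x)
print("max abs round-trip error:", (_x.float() - _y.float().cpu()).abs().max().item())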

# Load the checkpoint onto the CPU (replace the path with your model file)
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]
for name, data in model_part.items():
    if any(word in name for word in keywords):
        # Quantize and dequantize the whole tensor; move it back to the CPU
        # so the saved checkpoint stays device-agnostic
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")
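
As a quick verification (a minimal sketch; the paths are the same placeholders used above), you can reload both checkpoints and confirm that only the targeted weights changed, with a per-tensor error on the order of the Q4_0 step max|x| / 8:

import torch

original = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")
quantized = torch.load("pytorch_model_quantized.bin", map_location="cpu")

for name in original:
    # Tensors that matched no keyword should report exactly 0
    diff = (original[name].float() - quantized[name].float()).abs().max().item()
    print(f"{name}: max abs diff = {diff:.6f}")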
