EADST

PyTorch Q4_0 Quantize and Dequantize aligning with llama.cpp

import torch

# Pick the compute device once at import time: GPU when available, else CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

def q4_0_quantize_and_dequantize_tensor(tensor):
    """Quantize a tensor to Q4_0 and immediately dequantize it.

    Q4_0 (llama.cpp) packs each block of 32 values into 4-bit codes plus one
    scale. This follows the reference ``quantize_row_q4_0``: the scale is
    ``d = max / -8`` where ``max`` is the SIGNED element of largest magnitude
    in the block, so that extreme element round-trips exactly.

    Args:
        tensor: input tensor whose element count is a multiple of 32.

    Returns:
        float16 tensor with the original shape, holding the dequantized values.
    """
    # Compute the device locally so the function has no module-level state.
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor = tensor.to(dtype=torch.float32, device=dev)

    # Process each 32-value block independently (Q4_0 block size).
    # reshape (not view) also accepts non-contiguous inputs.
    orig_shape = tensor.shape
    blocks = tensor.reshape(-1, 32)

    # llama.cpp keeps the SIGN of the max-magnitude element: d = max / -8.
    # Using abs() here would misquantize blocks whose extreme value is
    # negative (the extreme would no longer round-trip exactly).
    idx = blocks.abs().argmax(dim=1, keepdim=True)
    max_vals = blocks.gather(1, idx).squeeze(1)

    d = max_vals / -8.0
    # Reference: id = d ? 1/d : 0 — an all-zero block maps to code 8 (value 0).
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale, shift by 8.5, truncate into [0, 15] — mirrors
    # MIN(15, (int8_t)(x * id + 8.5f)) in the C reference.
    scaled = blocks * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # Dequantize: value = (q - 8) * d.
    dequantized = (quantized.float() - 8.0) * d[:, None]

    # Restore the caller's shape; checkpoints are stored in float16.
    return dequantized.view(orig_shape).to(dtype=torch.float16)

# Load a checkpoint on CPU, Q4_0 round-trip the weight tensors whose names
# match one of the keywords below, and save the result.
# NOTE(review): the load path is a placeholder — point it at the real file.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the tensors to process (embeddings, attention
# projections, MLP projections, and the output head).
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

for name, data in model_part.items():
    # any() processes each tensor at most once; the original inner loop would
    # re-quantize (lossy!) a tensor whose name matched several keywords.
    if any(word in name for word in keywords):
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data)

# Save the updated model part.
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Tiktoken Plotly LLAMA icon Quantize 继承 Github 图标 公式 Template Llama RGB 腾讯云 QWEN Sklearn Hotel Bipartite Google Distillation Nginx AI mmap HaggingFace BeautifulSoup CV Domain Hungarian BTC Food Safetensors Pillow Miniforge Qwen2 证件照 TensorFlow Password Heatmap Pandas JSON Tensor 阿里云 Hilton Animate Card Plate SQLite 净利润 tar 音频 CTC 递归学习法 多线程 v0.dev Qwen Web LeetCode Paddle GGML InvalidArgumentError Gemma Attention 云服务器 Disk 域名 v2ray EXCEL Cloudreve 顶会 Paper Datetime logger Image2Text Land NLP LoRA DeepStream VSCode SVR CC Website ModelScope UI Algorithm Diagram llama.cpp Python Magnet 飞书 财报 Data C++ 图形思考法 GoogLeNet Streamlit PIP Statistics COCO Excel Input diffusers Logo Random uWSGI XML git tqdm Freesound MD5 DeepSeek PDB hf RAR Michelin transformers GIT Transformers Pytorch Jetson XGBoost torchinfo Color BF16 News FP16 ResNet-50 Bin PyCharm LaTeX API Vmess CUDA 强化学习 Crawler Augmentation CAM Dataset NLTK Shortcut Ptyhon CLAP Zip NameSilo WAN Git Mixtral PDF git-lfs 多进程 FP64 PyTorch Math Translation Video WebCrawler SPIE Use VGG-16 IndexTTS2 FlashAttention UNIX Base64 Pickle 第一性原理 ONNX Review Django Search OpenCV Conda CSV FP8 版权 Bert Bitcoin 签证 Vim Knowledge GPTQ Baidu ChatGPT OCR LLM VPN Windows uwsgi 搞笑 SQL Docker Agent Quantization 报税 Ubuntu Qwen2.5 算法题 Markdown Rebuttal Firewall Linux FastAPI TSV YOLO Tracking Breakpoint GPT4 Interview OpenAI CEIR HuggingFace Numpy Anaconda Clash SAM Claude scipy FP32 Permission TensorRT Jupyter printf 关于博主 Proxy TTS
站点统计

本站现有博文324篇,共被浏览819207次

本站已经建立2523天!

热门文章
文章归档
回到顶部