EADST

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

import torch

# Pick the compute device once at import time: the GPU when CUDA is usable,
# otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def q4_0_quantize_and_dequantize_tensor(tensor, target_device=None):
    """Round-trip a tensor through Q4_0 quantization and return the result.

    The tensor is split into consecutive blocks of 32 values (row-major
    order). Each block gets one scale ``d`` and every value is mapped to a
    4-bit code in ``[0, 15]``; the codes are then immediately dequantized so
    the returned tensor carries the quantization error a Q4_0 model would
    have.

    Args:
        tensor: Input tensor whose number of elements is a multiple of 32.
        target_device: Device on which to run the computation. When ``None``
            the module-level ``device`` is used.

    Returns:
        A ``torch.float16`` tensor with the same shape as ``tensor``, located
        on the device the computation ran on.

    Raises:
        ValueError: If the number of elements is not a multiple of 32.
    """
    if target_device is None:
        # Fall back to the device chosen at module import time.
        target_device = device
    values = tensor.to(dtype=torch.float32, device=target_device)

    orig_shape = values.shape
    if values.numel() % 32 != 0:
        raise ValueError(
            "Q4_0 works on blocks of 32 values, but the tensor has "
            f"{values.numel()} elements"
        )
    if values.numel() == 0:
        # Nothing to quantize; reshape(-1, 32) would be ambiguous here.
        return values.to(dtype=torch.float16)

    # reshape (not view) so non-contiguous inputs are accepted as well.
    blocks = values.reshape(-1, 32)

    # llama.cpp derives the scale from the *signed* value that has the largest
    # magnitude in the block (first occurrence on ties), not from the absolute
    # maximum. This way that value always maps to code 0 and is reproduced
    # exactly, whatever its sign.
    peak_index = blocks.abs().argmax(dim=1, keepdim=True)
    signed_peak = blocks.gather(1, peak_index).squeeze(1)

    # Per-block scale and its inverse; an all-zero block uses an inverse of 0.
    d = signed_peak / -8.0
    inv_d = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale, shift by 8.5 and truncate to obtain the 4-bit codes.
    codes = torch.clamp(blocks * inv_d[:, None] + 8.5, 0, 15).to(torch.uint8)

    # The block scale is stored in half precision, so dequantize with the
    # fp16-rounded scale rather than the fp32 one used for quantizing.
    stored_d = d.to(torch.float16).to(torch.float32)
    dequantized = (codes.float() - 8.0) * stored_d[:, None]

    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints that come
# from a trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that should be quantized.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Replacing the value of an existing key while iterating is safe because the
# dictionary's size never changes.
for name, data in model_part.items():
    # any() guarantees each tensor is processed once even if its name
    # happens to match more than one keyword.
    if any(word in name for word in keywords):
        # Quantize and dequantize the entire tensor, then move the result
        # back to the CPU so results do not pile up on the compute device
        # and the saved checkpoint holds CPU tensors only.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
GoogLeNet 关于博主 FP16 CV 算法题 Miniforge Statistics Web Qwen2.5 Land PDB Proxy HuggingFace Breakpoint 递归学习法 飞书 Github uwsgi Color Michelin EXCEL Random Numpy CUDA 多进程 Pickle PyCharm Freesound tar AI transformers Magnet Streamlit DeepSeek Mixtral Claude CAM 图形思考法 Qwen2 Vmess Agent Windows Heatmap Safetensors Video VSCode XML Data logger Llama icon Hilton PIP BeautifulSoup CLAP COCO OCR 报税 FP8 NLP QWEN Baidu Excel YOLO Google LaTeX Qwen NameSilo Augmentation Password UNIX Pillow printf Tiktoken Plate Attention 腾讯云 mmap Pandas Disk XGBoost Food Conda Cloudreve 音频 CC uWSGI Use VGG-16 Sklearn VPN LeetCode Quantization CSV 财报 Linux Quantize 继承 图标 Vim Git FP32 CTC Docker Crawler JSON DeepStream Bin Firewall 云服务器 Clash BF16 Paddle v2ray Rebuttal ChatGPT SQL Jupyter 签证 FlashAttention v0.dev ONNX TSV CEIR Tracking Django LLM BTC Tensor Jetson Zip NLTK torchinfo RGB Gemma 域名 ModelScope Ubuntu RAR Image2Text git-lfs Paper 净利润 WebCrawler OpenAI Diagram tqdm Bitcoin Algorithm Bert News IndexTTS2 GGML Card 强化学习 Animate C++ Interview TensorFlow PyTorch HaggingFace SAM llama.cpp git SPIE PDF LLAMA Distillation scipy ResNet-50 Anaconda 多线程 hf 阿里云 Shortcut Dataset Bipartite 论文 GPTQ Translation Logo Python Datetime FastAPI Base64 搞笑 版权 Search Plotly TensorRT API FP64 LoRA Input InvalidArgumentError Domain Ptyhon SQLite Knowledge UI GIT Nginx OpenCV TTS diffusers Permission Hotel WAN 论文速读 Hungarian Pytorch Math SVR MD5 证件照 ms-swift GPT4 顶会 公式 第一性原理 Website Review Transformers Template Markdown
站点统计

本站现有博文330篇,共被浏览863133

本站已经建立2571天!

热门文章
文章归档
回到顶部