PyTorch Q4_0 Quantize and Dequantize Aligning with llama.cpp

This post shows a vectorized PyTorch implementation of Q4_0 quantization and dequantization that follows llama.cpp's quantize_row_q4_0: each block of 32 values shares one scale d = max / -8 (stored as FP16), and each value is rounded to a 4-bit integer in [0, 15] with an implicit offset of 8.

import torch

# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

def q4_0_quantize_and_dequantize_tensor(tensor):
    tensor = tensor.to(dtype=torch.float32, device=device)

    # Reshape to process each 32-value block independently
    # (Q4_0 uses a block size of 32, so the element count must be divisible by 32)
    orig_shape = tensor.shape
    assert tensor.numel() % 32 == 0, "tensor size must be a multiple of the Q4_0 block size (32)"
    tensor = tensor.reshape(-1, 32)

    # Find the value with the largest magnitude in each block, keeping its sign
    # (llama.cpp divides by the signed max, not the absolute max)
    abs_idx = torch.argmax(torch.abs(tensor), dim=1)
    max_vals = tensor.gather(1, abs_idx.unsqueeze(1)).squeeze(1)

    # Prevent division by zero in all-zero blocks
    max_vals[max_vals == 0] = 1.0

    # Calculate the scale d and its inverse ids for each block
    d = max_vals / -8.0
    ids = 1.0 / d

    # Scale, add 8.5, and truncate to the 4-bit range [0, 15],
    # matching llama.cpp's MIN(15, (int8_t)(x * id + 8.5f))
    scaled_tensors = tensor * ids[:, None]
    quantized_tensors = torch.clamp(scaled_tensors + 8.5, 0, 15).to(torch.uint8)

    # Dequantize with d rounded through FP16, since llama.cpp stores the scale as FP16
    d_fp16 = d.to(torch.float16).to(torch.float32)
    dequantized_tensors = (quantized_tensors.float() - 8.0) * d_fp16[:, None]

    # Reshape back to the original shape and store as FP16
    dequantized_tensors = dequantized_tensors.reshape(orig_shape).to(dtype=torch.float16)

    return dequantized_tensors
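
As a quick sanity check (a minimal sketch; the tensor shape and the expected error below are my own choices, not part of the original code), you can round-trip a random tensor and confirm the error stays within about one 4-bit quantization step per block:

x = torch.randn(4, 64)  # 8 blocks of 32 values each
y = q4_0_quantize_and_dequantize_tensor(x).float().cpu()
# With 16 levels spaced |d| apart and round-half-up, the per-element
# error should be at most about |d| / 2 plus FP16 rounding of d
print(f"max round-trip error: {(x - y).abs().max().item():.4f}")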

# Load the model state dict on CPU
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Tensor-name keywords marking the weights to quantize
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]
# Quantize and dequantize every tensor whose name matches one of the keywords
for name, data in model_part.items():
    if any(word in name for word in keywords):
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data)

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")
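
To verify the alignment claim directly, one can port llama.cpp's scalar quantize_row_q4_0 loop for a single block and compare it against the vectorized function above (a sketch written from the C reference; q4_0_block_reference is a hypothetical helper, not part of llama.cpp):

def q4_0_block_reference(block):
    """Scalar port of llama.cpp's quantize_row_q4_0 for one 32-value block."""
    amax, maxv = 0.0, 0.0
    for v in block:
        if abs(v) > amax:
            amax, maxv = abs(v), float(v)
    d = maxv / -8.0
    iscale = 1.0 / d if d else 0.0
    d = float(torch.tensor(d, dtype=torch.float16))  # d is stored as FP16
    return [(min(15, int(v * iscale + 8.5)) - 8) * d for v in block]

block = torch.randn(32)
ref = torch.tensor(q4_0_block_reference(block.tolist()))
out = q4_0_quantize_and_dequantize_tensor(block).float().cpu()
print(torch.allclose(ref, out, atol=1e-2))  # FP16 output adds small rounding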
