EADST

PyTorch Q4_1 Quantization and Dequantization Aligned with llama.cpp

PyTorch Q4_1 Quantization and Dequantization Aligned with llama.cpp

import torch

# Select the compute device once, at import time: the GPU when PyTorch
# reports that CUDA is usable, the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

def q4_1_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through Q4_1 block quantization.

    The tensor is flattened (row-major) into blocks of 32 consecutive values.
    Each block is mapped onto the 16 levels of an unsigned 4-bit code using
    its own minimum and its own scale ``d = (max - min) / 15``, and is then
    mapped straight back. The result therefore carries the Q4_1 rounding
    error while remaining an ordinary dense tensor.

    Args:
        tensor: Tensor whose number of elements is a multiple of 32. It is
            converted to float32 on the module-level ``device``; it does not
            need to be contiguous.

    Returns:
        A float16 tensor on ``device`` with the same shape as ``tensor``.

    Raises:
        RuntimeError: Raised by ``reshape`` when the number of elements is
            not a multiple of 32.
    """
    block_size = 32      # values per Q4_1 block
    max_code = 2**4 - 1  # largest code representable in 4 bits (15)

    tensor = tensor.to(dtype=torch.float32, device=device)

    # Reshape tensor to process each 32-value block independently.
    # reshape() is used instead of view() because .to() returns the input
    # unchanged when dtype and device already match, and view() refuses
    # non-contiguous tensors (e.g. a transposed weight).
    orig_shape = tensor.shape
    blocks = tensor.reshape(-1, block_size)

    # Find the min and max values per block
    min_vals = blocks.min(dim=1).values
    max_vals = blocks.max(dim=1).values

    # Calculate scale d for each block
    d = (max_vals - min_vals) / max_code
    # A constant block has d == 0; using 1.0 avoids dividing by zero and
    # still reproduces the block exactly (every code is 0, so code*d + min == min).
    d[d == 0] = 1.0

    # Calculate inverse of d
    inv_d = 1.0 / d

    # Quantize: adding 0.5 before the truncating uint8 cast rounds to the
    # nearest code; the clamp keeps codes within 0..15 (4 bits).
    codes = (blocks - min_vals[:, None]) * inv_d[:, None]
    codes = torch.clamp(codes + 0.5, 0, max_code).to(torch.uint8)

    # Dequantize the tensor
    # NOTE(review): llama.cpp stores the block scale and minimum as fp16,
    # whereas d and min_vals stay float32 here, so results may differ from
    # llama.cpp in the last bits - confirm whether bit-exactness is required.
    dequantized = (codes.float() * d[:, None]) + min_vals[:, None]

    # Reshape back to the original shape
    return dequantized.view(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU; each selected tensor is moved to the
# compute device only while it is being processed.
# NOTE: torch.load unpickles the file, so only load checkpoints that come
# from a trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings that identify the weight tensors to quantize. Entries whose
# name contains none of them are left untouched.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only values of existing keys are replaced, so the dict size does not
# change while it is being iterated.
for name, data in model_part.items():
    # any() stops at the first matching keyword, so a tensor is quantized
    # exactly once even if its name happens to match several keywords.
    if any(word in name for word in keywords):
        # Quantize and dequantize the entire tensor, then bring the result
        # back to the CPU so quantized weights do not accumulate in GPU
        # memory and the saved file does not contain CUDA tensors.
        model_part[name] = q4_1_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
tar Docker 递归学习法 Django VPN Safetensors RGB Firewall 净利润 diffusers TTS Qwen2 Pytorch ResNet-50 git-lfs Heatmap CSV Llama XGBoost Land 财报 Baidu VGG-16 Augmentation ChatGPT IndexTTS2 ONNX Card LLM Clash BF16 Distillation 论文 JSON CEIR git YOLO FP8 hf Search WebCrawler WAN Image2Text CTC NLP DeepSeek Sklearn 阿里云 报税 GGML Pandas Color 图标 COCO API 证件照 Breakpoint QWEN Rebuttal Quantize Markdown UI PIP uWSGI GIT Shortcut mmap Nginx Datetime Bipartite GPT4 Knowledge XML Hungarian Hilton Bert HuggingFace LaTeX 公式 FP16 Qwen2.5 HaggingFace Tracking scipy Anaconda printf 强化学习 Conda BTC NameSilo v2ray AI SQLite TSV Paddle transformers PDF Github Ptyhon Dataset TensorRT LLAMA ModelScope Vim Jupyter ms-swift LoRA FP32 CUDA PDB logger Video Hotel 搞笑 Logo 多线程 LeetCode 论文速读 Template tqdm Password VSCode Agent 算法题 Python Statistics Paper 域名 Cloudreve FP64 Gemma Windows Linux Website Math SQL Review Claude SVR Git GoogLeNet PyTorch Crawler Diagram CV 第一性原理 Tiktoken Numpy Pickle Freesound Vmess Mixtral 签证 Ubuntu CLAP Excel Google Disk Random RAR UNIX Plotly GPTQ CC Qwen Translation News Algorithm Bin Jetson Bitcoin Permission Pillow InvalidArgumentError Animate MD5 腾讯云 SAM v0.dev llama.cpp Attention 云服务器 顶会 PyCharm torchinfo uwsgi Tensor Interview Use Input Transformers Magnet 关于博主 BeautifulSoup icon Food 继承 FlashAttention NLTK Plate DeepStream 飞书 Zip Data Domain FastAPI Base64 版权 SPIE C++ CAM OpenCV OpenAI EXCEL Proxy TensorFlow 多进程 Miniforge Web Michelin OCR Quantization 音频 图形思考法 Streamlit
站点统计

本站现有博文330篇,共被浏览863127

本站已经建立2571天!

热门文章
文章归档
回到顶部