CANN大模型推理优化技术解读：从KV Cache到PagedAttention的优化策略

大语言模型推理面临着内存占用大、延迟高、吞吐量低等挑战。CANN针对大模型推理提供了一系列优化技术，包括KV Cache优化、连续批处理、PagedAttention等。本文将深入解读大模型推理的优化原理、实现机制以及性能调优策略，帮助你在NPU上高效部署大模型。内存优化：KV Cache、PagedAttention批处理：连续批处理、动态调度计算优化：Flash Attention、算子融合服

2201_76027234

985人浏览 · 2026-02-07 00:00:55

2201_76027234 · 2026-02-07 00:00:55 发布

本文基于CANN开源社区的多个仓库进行技术解读

CANN组织地址：https://atomgit.com/cann

ops-transformer仓库地址：https://atomgit.com/cann/ops-transformer

runtime仓库地址：https://atomgit.com/cann/runtime

前言

大语言模型推理面临着内存占用大、延迟高、吞吐量低等挑战。CANN针对大模型推理提供了一系列优化技术，包括KV Cache优化、连续批处理、PagedAttention等。

本文将深入解读大模型推理的优化原理、实现机制以及性能调优策略，帮助你在NPU上高效部署大模型。

大模型推理挑战

1. 内存占用分析

# 大模型内存占用计算
class MemoryCalculator:
    """
    内存占用计算器
    """
    def calculate_model_memory(self, num_params, dtype='fp16'):
        """计算模型参数内存"""
        bytes_per_param = {
            'fp32': 4,
            'fp16': 2,
            'int8': 1
        }
      
        memory_gb = num_params * bytes_per_param[dtype] / 1e9
        return memory_gb
  
    def calculate_kv_cache(self, batch_size, seq_len, hidden_size, num_layers, dtype='fp16'):
        """计算KV Cache内存"""
        bytes_per_elem = 2 if dtype == 'fp16' else 4
      
        # K和V各一份
        kv_memory = (batch_size * seq_len * hidden_size * num_layers * 2 * bytes_per_elem)
      
        return kv_memory / 1e9  # GB
  
    def estimate_total_memory(self, model_config, batch_size, seq_len):
        """估算总内存"""
        # 模型参数
        model_mem = self.calculate_model_memory(
            model_config['num_params'],
            model_config['dtype']
        )
      
        # KV Cache
        kv_mem = self.calculate_kv_cache(
            batch_size,
            seq_len,
            model_config['hidden_size'],
            model_config['num_layers'],
            model_config['dtype']
        )
      
        # 激活值（估算为KV的2倍）
        activation_mem = kv_mem * 2
      
        total = model_mem + kv_mem + activation_mem
      
        return {
            'model': model_mem,
            'kv_cache': kv_mem,
            'activation': activation_mem,
            'total': total
        }

# 使用示例：GPT-3 175B
calc = MemoryCalculator()

gpt3_config = {
    'num_params': 175e9,
    'hidden_size': 12288,
    'num_layers': 96,
    'dtype': 'fp16'
}

memory = calc.estimate_total_memory(gpt3_config, batch_size=1, seq_len=2048)

print("GPT-3 175B 内存占用估算:")
print(f"  模型参数: {memory['model']:.1f} GB")
print(f"  KV Cache: {memory['kv_cache']:.1f} GB")
print(f"  激活值: {memory['activation']:.1f} GB")
print(f"  总计: {memory['total']:.1f} GB")

# 内存优化方向：
# - 模型压缩：量化、剪枝
# - KV Cache优化：复用、分页
# - 激活重计算：用时间换空间

2. 推理性能瓶颈

# 推理性能分析
import torch
import torch_npu
import time

class InferenceProfiler:
    """
    推理性能分析器
    """
    def profile_generation(self, model, tokenizer, prompt, max_new_tokens=100):
        """分析生成过程"""
        model.eval()
      
        # 编码输入
        input_ids = tokenizer.encode(prompt, return_tensors='pt').npu()
      
        # Prefill阶段
        torch.npu.synchronize()
        prefill_start = time.time()
      
        with torch.no_grad():
            outputs = model(input_ids)
      
        torch.npu.synchronize()
        prefill_time = time.time() - prefill_start
      
        # Decode阶段
        decode_times = []
      
        for _ in range(max_new_tokens):
            torch.npu.synchronize()
            decode_start = time.time()
          
            with torch.no_grad():
                # 只处理最后一个token
                outputs = model(input_ids[:, -1:])
          
            torch.npu.synchronize()
            decode_time = time.time() - decode_start
            decode_times.append(decode_time)
          
            # 采样下一个token
            next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=1)
      
        # 统计
        avg_decode_time = sum(decode_times) / len(decode_times)
      
        print("\n=== 推理性能分析 ===")
        print(f"Prefill时间: {prefill_time*1000:.2f} ms")
        print(f"平均Decode时间: {avg_decode_time*1000:.2f} ms")
        print(f"总生成时间: {(prefill_time + sum(decode_times))*1000:.2f} ms")
        print(f"吞吐量: {max_new_tokens / (prefill_time + sum(decode_times)):.2f} tokens/s")

# 使用示例
# profiler = InferenceProfiler()
# profiler.profile_generation(model, tokenizer, "Hello, how are you?")

# 性能瓶颈：
# - Prefill阶段：计算密集，处理整个prompt
# - Decode阶段：内存密集，逐token生成
# - KV Cache：内存占用大，访问频繁

KV Cache优化

1. KV Cache基础实现

# KV Cache管理器
import torch

class KVCache:
    """
    KV Cache管理器
  
    存储和复用Key/Value
    """
    def __init__(self, num_layers, batch_size, max_seq_len, hidden_size, dtype=torch.float16):
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.hidden_size = hidden_size
      
        # 预分配KV Cache
        self.k_cache = [
            torch.zeros(batch_size, max_seq_len, hidden_size, dtype=dtype).npu()
            for _ in range(num_layers)
        ]
      
        self.v_cache = [
            torch.zeros(batch_size, max_seq_len, hidden_size, dtype=dtype).npu()
            for _ in range(num_layers)
        ]
      
        # 当前序列长度
        self.seq_lens = torch.zeros(batch_size, dtype=torch.long)
  
    def update(self, layer_idx, k, v, positions):
        """更新KV Cache"""
        batch_size, seq_len, _ = k.shape
      
        for b in range(batch_size):
            pos = positions[b]
            self.k_cache[layer_idx][b, pos:pos+seq_len] = k[b]
            self.v_cache[layer_idx][b, pos:pos+seq_len] = v[b]
      
        # 更新序列长度
        self.seq_lens = positions + seq_len
  
    def get(self, layer_idx, batch_idx=None):
        """获取KV Cache"""
        if batch_idx is not None:
            seq_len = self.seq_lens[batch_idx]
            k = self.k_cache[layer_idx][batch_idx, :seq_len]
            v = self.v_cache[layer_idx][batch_idx, :seq_len]
        else:
            k = self.k_cache[layer_idx]
            v = self.v_cache[layer_idx]
      
        return k, v
  
    def clear(self, batch_idx=None):
        """清空Cache"""
        if batch_idx is not None:
            self.seq_lens[batch_idx] = 0
        else:
            self.seq_lens.zero_()

# 使用示例：自回归生成
cache = KVCache(
    num_layers=32,
    batch_size=4,
    max_seq_len=2048,
    hidden_size=4096
)

# Prefill阶段
prompt_tokens = tokenize("Hello, how are")
k, v = model.compute_kv(prompt_tokens)  # 计算所有token的KV
cache.update(layer_idx=0, k=k, v=v, positions=torch.zeros(4))

# Decode阶段
for step in range(max_new_tokens):
    # 只计算新token的KV
    new_token = generate_next_token()
    k_new, v_new = model.compute_kv(new_token)
  
    # 更新cache
    positions = cache.seq_lens
    cache.update(layer_idx=0, k=k_new, v=v_new, positions=positions)
  
    # 使用完整的KV做attention
    k_full, v_full = cache.get(layer_idx=0)
    output = attention(q_new, k_full, v_full)

# KV Cache的作用：
# - 避免重复计算：已生成token的KV不再计算
# - 加速生成：decode阶段只计算一个token
# - 内存换时间：存储KV换取计算加速

2. KV Cache复用

# KV Cache复用策略
class KVCacheReuse:
    """
    KV Cache复用
  
    多个请求共享相同prefix的KV
    """
    def __init__(self, cache_manager):
        self.cache_manager = cache_manager
        self.prefix_cache = {}  # prefix -> cache_id
  
    def get_or_create_cache(self, prefix):
        """获取或创建cache"""
        prefix_key = tuple(prefix.tolist())
      
        if prefix_key in self.prefix_cache:
            # 复用已有cache
            cache_id = self.prefix_cache[prefix_key]
            return cache_id, True
        else:
            # 创建新cache
            cache_id = self.cache_manager.allocate()
            self.prefix_cache[prefix_key] = cache_id
            return cache_id, False
  
    def generate_with_reuse(self, model, prefix, continuation):
        """使用cache复用生成"""
        # 获取或创建cache
        cache_id, is_reused = self.get_or_create_cache(prefix)
      
        if is_reused:
            print(f"复用cache {cache_id}")
            # 直接使用已有的KV
            cache = self.cache_manager.get_cache(cache_id)
        else:
            print(f"创建新cache {cache_id}")
            # 计算prefix的KV
            cache = self.cache_manager.get_cache(cache_id)
            k, v = model.compute_kv(prefix)
            cache.update(k, v)
      
        # 生成continuation
        output = model.generate(continuation, cache=cache)
      
        return output

# 使用场景：
# - 多轮对话：共享对话历史
# - 批量推理：共享系统提示词
# - Few-shot学习：共享示例

连续批处理

1. 动态批处理

# 连续批处理调度器
import queue
import time

class ContinuousBatcher:
    """
    连续批处理调度器
  
    动态组batch，提高吞吐量
    """
    def __init__(self, max_batch_size=32, max_wait_time=0.01):
        self.max_batch_size = max_batch_size
        self.max_wait_time = max_wait_time
        self.running_requests = []
        self.waiting_requests = []
  
    def add_request(self, request):
        """添加新请求"""
        self.waiting_requests.append({
            'id': request.id,
            'prompt': request.prompt,
            'max_tokens': request.max_tokens,
            'generated_tokens': 0,
            'finished': False
        })
  
    def schedule_batch(self):
        """调度一个batch"""
        # 移除完成的请求
        self.running_requests = [r for r in self.running_requests if not r['finished']]
      
        # 计算可用空间
        available_slots = self.max_batch_size - len(self.running_requests)
      
        # 添加新请求
        while available_slots > 0 and self.waiting_requests:
            request = self.waiting_requests.pop(0)
            self.running_requests.append(request)
            available_slots -= 1
      
        return self.running_requests
  
    def process_batch(self, model, cache):
        """处理当前batch"""
        if not self.running_requests:
            return
      
        # 准备输入
        batch_size = len(self.running_requests)
        input_ids = []
        positions = []
      
        for req in self.running_requests:
            if req['generated_tokens'] == 0:
                # Prefill阶段：处理整个prompt
                input_ids.append(req['prompt'])
                positions.append(0)
            else:
                # Decode阶段：只处理最后一个token
                input_ids.append([req['last_token']])
                positions.append(req['generated_tokens'])
      
        # 模型推理
        outputs = model.forward(input_ids, cache, positions)
      
        # 更新请求状态
        for i, req in enumerate(self.running_requests):
            next_token = outputs[i].argmax()
            req['last_token'] = next_token
            req['generated_tokens'] += 1
          
            # 检查是否完成
            if next_token == EOS_TOKEN or req['generated_tokens'] >= req['max_tokens']:
                req['finished'] = True

# 使用示例
batcher = ContinuousBatcher(max_batch_size=32)

# 请求陆续到达
batcher.add_request(Request(prompt="Hello", max_tokens=50))
batcher.add_request(Request(prompt="How are you", max_tokens=30))

# 持续处理
while batcher.running_requests or batcher.waiting_requests:
    # 调度batch
    batch = batcher.schedule_batch()
  
    # 处理batch
    batcher.process_batch(model, cache)

# 连续批处理的优势：
# - 高吞吐量：充分利用硬件
# - 低延迟：请求立即开始处理
# - 灵活性：支持不同长度请求

PagedAttention技术

1. 分页KV Cache

# PagedAttention实现
class PagedKVCache:
    """
    分页KV Cache管理器
  
    类似操作系统的虚拟内存
    """
    def __init__(self, num_layers, page_size=16, hidden_size=4096, dtype=torch.float16):
        self.num_layers = num_layers
        self.page_size = page_size
        self.hidden_size = hidden_size
        self.dtype = dtype
      
        # 物理页池
        self.physical_pages = []
        self.free_pages = []
      
        # 逻辑到物理的映射
        self.page_tables = {}  # {request_id: [page_ids]}
  
    def allocate_page(self):
        """分配一个物理页"""
        if self.free_pages:
            # 复用空闲页
            page_id = self.free_pages.pop()
        else:
            # 分配新页
            page = {
                'k': torch.zeros(self.num_layers, self.page_size, self.hidden_size, dtype=self.dtype).npu(),
                'v': torch.zeros(self.num_layers, self.page_size, self.hidden_size, dtype=self.dtype).npu()
            }
            page_id = len(self.physical_pages)
            self.physical_pages.append(page)
      
        return page_id
  
    def allocate_request(self, request_id, initial_length):
        """为请求分配页"""
        # 计算需要的页数
        num_pages = (initial_length + self.page_size - 1) // self.page_size
      
        # 分配页
        page_ids = []
        for _ in range(num_pages):
            page_id = self.allocate_page()
            page_ids.append(page_id)
      
        self.page_tables[request_id] = page_ids
  
    def append_tokens(self, request_id, num_tokens):
        """追加tokens"""
        page_ids = self.page_tables[request_id]
        current_length = len(page_ids) * self.page_size
      
        # 计算需要的总页数
        new_length = current_length + num_tokens
        needed_pages = (new_length + self.page_size - 1) // self.page_size
      
        # 分配额外的页
        while len(page_ids) < needed_pages:
            page_id = self.allocate_page()
            page_ids.append(page_id)
  
    def get_kv(self, request_id, layer_idx):
        """获取KV"""
        page_ids = self.page_tables[request_id]
      
        k_list = []
        v_list = []
      
        for page_id in page_ids:
            page = self.physical_pages[page_id]
            k_list.append(page['k'][layer_idx])
            v_list.append(page['v'][layer_idx])
      
        k = torch.cat(k_list, dim=0)
        v = torch.cat(v_list, dim=0)
      
        return k, v
  
    def free_request(self, request_id):
        """释放请求的页"""
        if request_id in self.page_tables:
            page_ids = self.page_tables[request_id]
            self.free_pages.extend(page_ids)
            del self.page_tables[request_id]

# 使用示例
cache = PagedKVCache(num_layers=32, page_size=16, hidden_size=4096)

# 分配请求
cache.allocate_request(request_id=0, initial_length=100)

# 追加tokens
cache.append_tokens(request_id=0, num_tokens=50)

# 获取KV
k, v = cache.get_kv(request_id=0, layer_idx=0)

# 释放
cache.free_request(request_id=0)

# PagedAttention优势：
# - 内存利用率高：按需分配
# - 支持动态长度：灵活扩展
# - 减少碎片：页级管理

Flash Attention优化

1. Flash Attention原理

# Flash Attention简化实现
class FlashAttention:
    """
    Flash Attention
  
    IO感知的注意力优化
    """
    def __init__(self, block_size=64):
        self.block_size = block_size
  
    def forward(self, Q, K, V):
        """
        Flash Attention前向传播
      
        特点：
        - 分块计算
        - SRAM优化
        - 减少HBM访问
        """
        batch, seq_len, dim = Q.shape
        block_size = self.block_size
      
        # 输出和统计量
        O = torch.zeros_like(Q)
        l = torch.zeros(batch, seq_len, 1).to(Q.device)
        m = torch.full((batch, seq_len, 1), -float('inf')).to(Q.device)
      
        # 分块处理Q
        for i in range(0, seq_len, block_size):
            Q_block = Q[:, i:i+block_size, :]
          
            # 分块处理K和V
            for j in range(0, seq_len, block_size):
                K_block = K[:, j:j+block_size, :]
                V_block = V[:, j:j+block_size, :]
              
                # 计算注意力分数（在SRAM中）
                S_block = torch.matmul(Q_block, K_block.transpose(-2, -1))
                S_block = S_block / (dim ** 0.5)
              
                # 更新最大值
                m_new = torch.maximum(m[:, i:i+block_size], S_block.max(dim=-1, keepdim=True)[0])
              
                # 计算指数（数值稳定）
                exp_S = torch.exp(S_block - m_new)
              
                # 更新统计量
                l_new = torch.exp(m[:, i:i+block_size] - m_new) * l[:, i:i+block_size] + exp_S.sum(dim=-1, keepdim=True)
              
                # 更新输出
                O[:, i:i+block_size] = (O[:, i:i+block_size] * torch.exp(m[:, i:i+block_size] - m_new) * l[:, i:i+block_size] + torch.matmul(exp_S, V_block)) / l_new
              
                # 更新统计量
                m[:, i:i+block_size] = m_new
                l[:, i:i+block_size] = l_new
      
        return O

# 使用示例
flash_attn = FlashAttention(block_size=64)

Q = torch.randn(2, 1024, 64).npu()
K = torch.randn(2, 1024, 64).npu()
V = torch.randn(2, 1024, 64).npu()

output = flash_attn.forward(Q, K, V)

# Flash Attention优势：
# - 内存高效：O(N)而非O(N^2)
# - 速度快：减少HBM访问
# - 支持长序列：可处理更长输入

# 传统Attention问题：
# 1. 计算S = QK^T：写入HBM
# 2. 计算P = softmax(S)：读S，写P
# 3. 计算O = PV：读P，写O
# 总HBM访问：O(N^2)

# Flash Attention优化：
# 1. 分块计算
# 2. 在SRAM中完成softmax
# 3. 增量更新输出
# 总HBM访问：O(N)

推理服务优化

1. 请求调度

# 请求调度器
class RequestScheduler:
    """
    请求调度器
  
    优化吞吐量和延迟
    """
    def __init__(self, max_batch_size=32):
        self.max_batch_size = max_batch_size
        self.pending_requests = []
        self.running_requests = []
  
    def add_request(self, request):
        """添加新请求"""
        self.pending_requests.append(request)
        self.pending_requests.sort(key=lambda r: r.priority, reverse=True)
  
    def schedule(self):
        """调度策略"""
        # 移除完成的请求
        self.running_requests = [r for r in self.running_requests if not r.finished]
      
        # 按长度分组
        length_groups = {}
        for req in self.pending_requests:
            length = req.current_length
            if length not in length_groups:
                length_groups[length] = []
            length_groups[length].append(req)
      
        # 调度batch
        batch = []
        available_slots = self.max_batch_size - len(self.running_requests)
      
        # 优先调度相同长度的请求
        for length, requests in sorted(length_groups.items()):
            while requests and available_slots > 0:
                req = requests.pop(0)
                batch.append(req)
                available_slots -= 1
      
        # 添加到running
        self.running_requests.extend(batch)
      
        # 从pending移除
        for req in batch:
            self.pending_requests.remove(req)
      
        return self.running_requests

# 使用示例
scheduler = RequestScheduler(max_batch_size=32)

# 添加请求
for i in range(100):
    request = Request(id=i, prompt="Hello", priority=random.randint(1, 10))
    scheduler.add_request(request)

# 调度
batch = scheduler.schedule()
print(f"调度了 {len(batch)} 个请求")

# 调度策略：
# - FCFS：先来先服务
# - Priority：优先级调度
# - SJF：最短作业优先
# - Round Robin：轮转调度