# Inference

## Compiling an LLM for FHE Inference
import random
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Conv1D, Trainer, TrainingArguments
from concrete.ml.torch.hybrid_model import HybridFHEModel
# Load the GPT-2 model and its tokenizer from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
# GPT-2 defines no pad token; reuse EOS so padded batches are accepted.
model.config.pad_token_id = model.config.eos_token_id

# Determine which layers run with FHE: all linear ones. Conv1D is included
# because transformers implements GPT-2's projection layers with Conv1D,
# which acts as a linear layer.
remote_names = []
for name, module in model.named_modules():
    if isinstance(module, (torch.nn.Linear, Conv1D)):
        remote_names.append(name)

# Create the HybridFHEModel: the named modules are marked for remote FHE
# execution while the rest of the model runs locally in the clear.
hybrid_model = HybridFHEModel(model, module_names=remote_names)

# Random token IDs serve as calibration data — shape (batch=1, seq_len=32),
# values drawn from the tokenizer's vocabulary range.
input_tensor = torch.randint(0, tokenizer.vocab_size, (1, 32), dtype=torch.long)

# Calibrate and compile the remote modules to 8-bit FHE circuits, using
# dynamic quantization for the activations.
hybrid_model.compile_model(input_tensor, n_bits=8, use_dynamic_quantization=True)
Last updated
Was this helpful?