Chapter 11: HuggingFace Internals — How the Library Wraps Your PyTorch Model¶
Learning Outcome¶
Understand the architecture of the transformers library: how models are structured,
how configuration objects map to nn.Module hierarchies, and how to integrate a custom
model into the HuggingFace ecosystem.
Concepts¶
PretrainedConfig¶
Every HuggingFace model has a configuration class (e.g., GPT2Config, BertConfig)
that stores hyperparameters. This is serialized to config.json when saving.
from transformers import GPT2Config
# Hyperparameters are passed as keyword arguments and stored on the config object.
config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
)
# Printing the config shows the stored hyperparameters (abridged sample output below).
print(config)
# GPT2Config {
#   "n_embd": 768,
#   "n_head": 12,
#   ...
# }
AutoConfig.from_pretrained("gpt2") reads config.json and returns the correct
config subclass.
PreTrainedModel¶
The base class for all HuggingFace models. Provides:
- from_pretrained(name_or_path) — downloads the weights and maps them onto the nn.Module.
- save_pretrained(path) — saves config.json + pytorch_model.bin / model.safetensors.
- generate() — autoregressive text generation with all decoding strategies.
- push_to_hub() — uploads the model to the HuggingFace Hub.
The modeling_*.py Pattern¶
Each model family has a src/transformers/models/<name>/modeling_<name>.py file.
For example, modeling_gpt2.py contains:
- GPT2Attention — the attention module
- GPT2MLP — the feed-forward network
- GPT2Block — one transformer block
- GPT2Model — the full backbone (no head)
- GPT2LMHeadModel — GPT2Model + language model head
- GPT2ForSequenceClassification — GPT2Model + classification head
AutoModel Registry¶
AutoModel uses a registry to map model_type in config.json to the correct
model class:
from transformers import AutoModel
# AutoModel.from_pretrained picks the concrete model class from the checkpoint's config.
# Internally does: config = AutoConfig.from_pretrained("bert-base-uncased")
# Then: model_class = MODEL_MAPPING[type(config)]
# Then: model = model_class(config)
# NOTE(review): the last step is a simplification — the library presumably dispatches to
# model_class.from_pretrained(..., config=config) so the weights are loaded too; confirm
# against the installed transformers version.
model = AutoModel.from_pretrained("bert-base-uncased")
Weight Loading: _load_pretrained_model¶
When calling from_pretrained(), HuggingFace:
- Loads the weight file (.bin or .safetensors).
- Calls model.load_state_dict(state_dict, strict=False).
- Reports missing_keys and unexpected_keys.
- Handles tied weights (e.g., lm_head.weight = embed_tokens.weight).
Output Dataclasses¶
HuggingFace model outputs are dataclasses, not plain tuples:
outputs = model(input_ids)
# Can access by attribute
print(outputs.last_hidden_state.shape)
print(outputs.pooler_output.shape)
# Can also index/slice like a tuple (for backwards compatibility).
# Do NOT unpack the output object directly (`hidden, pooled = outputs`):
# it is dict-like, so iterating it yields the field *names*, not the tensors.
# Slice it (or call outputs.to_tuple()) first.
hidden, pooled = outputs[:2]
Hooks: output_hidden_states and output_attentions¶
# Ask the model to return per-layer intermediate results alongside its normal outputs.
outputs = model(input_ids, output_hidden_states=True, output_attentions=True)
# outputs.hidden_states: tuple of (batch, seq, d_model) per layer
# NOTE(review): for the built-in HF models this tuple presumably also contains the
# embedding output as its first entry (n_layers + 1 items) — confirm for the model in use.
# outputs.attentions: tuple of (batch, n_heads, seq, seq) per layer
Exercise 1 — Register a Custom Model with AutoModel¶
Guided Exercise
Make your custom GPT from Chapter 7 saveable and loadable with HuggingFace APIs.
from transformers import PretrainedConfig, PreTrainedModel, AutoConfig, AutoModel
from transformers.modeling_outputs import BaseModelOutput
import torch
import torch.nn as nn
class MyGPTConfig(PretrainedConfig):
    """Configuration object holding the hyperparameters of the custom GPT.

    The values are stored as plain attributes on the config after the base
    class has consumed any extra keyword arguments.

    Args:
        vocab_size: Vocabulary size.
        d_model: Model width.
        n_heads: Number of attention heads.
        n_layers: Number of layers.
        max_len: Maximum sequence length.
        dropout: Dropout probability.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    # Identifier used when registering this config with the Auto classes.
    model_type = "my_gpt"

    def __init__(
        self,
        vocab_size: int = 50257,
        d_model: int = 256,
        n_heads: int = 8,
        n_layers: int = 4,
        max_len: int = 1024,
        dropout: float = 0.1,
        **kwargs,
    ):
        # Let the base class handle the generic keyword arguments first.
        super().__init__(**kwargs)

        # Then attach the model-specific hyperparameters, in declaration order.
        hyperparameters = {
            "vocab_size": vocab_size,
            "d_model": d_model,
            "n_heads": n_heads,
            "n_layers": n_layers,
            "max_len": max_len,
            "dropout": dropout,
        }
        for field_name, field_value in hyperparameters.items():
            setattr(self, field_name, field_value)
class MyGPTModel(PreTrainedModel):
    """HuggingFace-compatible wrapper around the custom ``GPTModel`` backbone.

    The backbone is built from the fields of a ``MyGPTConfig`` and stored as
    ``self.gpt``; ``forward`` packages its result in a ``BaseModelOutput``.
    """

    # Tells the HuggingFace machinery which config class belongs to this model.
    config_class = MyGPTConfig

    def __init__(self, config: MyGPTConfig):
        """Build the backbone from ``config`` and run the post-init step."""
        super().__init__(config)

        # Pull the backbone's constructor arguments straight off the config.
        backbone_fields = (
            "vocab_size",
            "d_model",
            "n_heads",
            "n_layers",
            "max_len",
            "dropout",
        )
        backbone_kwargs = {
            field: getattr(config, field) for field in backbone_fields
        }
        self.gpt = GPTModel(**backbone_kwargs)

        # Initialize weights
        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs,
    ) -> BaseModelOutput:
        """Run the backbone on ``input_ids`` and wrap the result.

        ``attention_mask`` and any extra keyword arguments are accepted but
        not used: the mask is ignored here for simplicity.

        NOTE(review): assumes ``GPTModel`` returns per-token hidden states;
        if it returns logits, ``last_hidden_state`` is mislabeled — confirm.
        """
        backbone_output = self.gpt(input_ids)
        return BaseModelOutput(last_hidden_state=backbone_output)
# Register with Auto classes so AutoConfig/AutoModel can resolve "my_gpt".
AutoConfig.register("my_gpt", MyGPTConfig)
AutoModel.register(MyGPTConfig, MyGPTModel)

# Save and reload
config = MyGPTConfig(vocab_size=1000, d_model=64, n_heads=4, n_layers=2)
model = MyGPTModel(config)
# A freshly constructed module is in training mode (dropout active), while
# from_pretrained() returns the model in eval mode. Switch to eval mode here so
# both forward passes are deterministic and the comparison below is meaningful.
model.eval()

import os
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    model.save_pretrained(tmpdir)
    print("Saved files:", os.listdir(tmpdir))

    # Reload using AutoModel
    loaded_model = AutoModel.from_pretrained(tmpdir)
    print("Loaded model type:", type(loaded_model).__name__)

    # Verify outputs match; sample ids from the configured vocabulary so the
    # check stays valid if vocab_size above is changed.
    ids = torch.randint(0, config.vocab_size, (1, 16))
    with torch.no_grad():
        out1 = model(ids).last_hidden_state
        out2 = loaded_model(ids).last_hidden_state
    diff = (out1 - out2).abs().max().item()
    print(f"Output diff after reload: {diff:.2e}")
    assert diff < 1e-6, "Reload changed model outputs!"
    print("✓ Model saves and loads correctly")
Exercise 2 — Implement a Custom Pipeline¶
from transformers import Pipeline, AutoTokenizer
import torch
class TextClassificationWithMyGPT(Pipeline):
    """Pipeline that classifies text from the model's final-position hidden state.

    The post-processing step is a placeholder: it returns a fixed result and
    does not yet use ``label_names`` or the model outputs.
    """

    def __init__(self, model, tokenizer, label_names: list[str], **kwargs):
        """Set up the base pipeline and remember the label names."""
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
        self.label_names = label_names

    def _sanitize_parameters(self, **kwargs):
        """Return empty parameter dicts for the three pipeline stages."""
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, text: str):
        """Tokenize ``text`` into PyTorch tensors, truncated to 128 tokens."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
        )
        return encoded

    def _forward(self, inputs):
        """Run the model without gradients and keep the last position's vector."""
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state
            # Use last hidden state's last position for classification
            return hidden[:, -1, :]

    def postprocess(self, model_outputs):
        """Return the placeholder classification result."""
        # Simple nearest-centroid classification using cosine similarity
        # (In practice you'd have a linear head)
        placeholder = {"label": "positive", "score": 0.9}  # placeholder
        return placeholder
# Usage
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# The model built in Exercise 1 has vocab_size=1000, but the GPT-2 tokenizer
# emits ids far above that, which would index past the embedding table.
# Build a model whose vocabulary matches the tokenizer instead.
clf_config = MyGPTConfig(
    vocab_size=len(tokenizer),
    d_model=64,
    n_heads=4,
    n_layers=2,
)
clf_model = MyGPTModel(clf_config)

pipeline = TextClassificationWithMyGPT(
    model=clf_model,
    tokenizer=tokenizer,
    label_names=["negative", "positive"],
)
result = pipeline("This movie was absolutely fantastic!")
print(result)
Exercise 3 — Map Custom Parameter Names to HuggingFace Keys¶
from transformers import GPT2LMHeadModel
import torch
def load_hf_weights(custom_model: GPTModel, hf_model_name: str) -> dict[str, str]:
    """Build the custom-name -> HuggingFace-name parameter map for a GPT-2 checkpoint.

    Despite its name, this helper does not copy any tensors into
    ``custom_model``. It downloads the HuggingFace checkpoint, prints the
    first ten parameter names and shapes of both models for side-by-side
    inspection, and returns the name mapping.

    The mapping covers the embeddings, the final layer norm, the LM head and,
    per transformer block, the two layer norms and the two MLP biases.
    Attention projections and MLP weight matrices are not mapped here.
    NOTE(review): presumably because HF GPT-2 stores those in a transposed,
    fused layout, so they need more than a rename — confirm.

    Args:
        custom_model: The custom GPT whose parameter names are printed.
        hf_model_name: Name or path passed to ``GPT2LMHeadModel.from_pretrained``.

    Returns:
        Dict mapping custom parameter names to HuggingFace state-dict keys.
    """
    from itertools import islice

    hf = GPT2LMHeadModel.from_pretrained(hf_model_name)

    # Print both naming schemes for inspection
    print("=== Custom model parameter names (first 10) ===")
    for name, param in islice(custom_model.named_parameters(), 10):
        print(f" {name:50s} {tuple(param.shape)}")
    print("\n=== HuggingFace parameter names (first 10) ===")
    for name, param in islice(hf.named_parameters(), 10):
        print(f" {name:60s} {tuple(param.shape)}")

    # Build a name mapping
    name_map = {
        "token_embed.weight": "transformer.wte.weight",
        "pos_embed.weight": "transformer.wpe.weight",
        "norm_f.weight": "transformer.ln_f.weight",
        "norm_f.bias": "transformer.ln_f.bias",
        "lm_head.weight": "lm_head.weight",
    }
    # Take the block count from the checkpoint's config rather than assuming
    # 12, so other GPT-2 sizes get a complete mapping with no bogus entries.
    for i in range(hf.config.n_layer):
        cm = f"blocks.{i}"
        hm = f"transformer.h.{i}"
        name_map.update({
            f"{cm}.norm1.weight": f"{hm}.ln_1.weight",
            f"{cm}.norm1.bias": f"{hm}.ln_1.bias",
            f"{cm}.norm2.weight": f"{hm}.ln_2.weight",
            f"{cm}.norm2.bias": f"{hm}.ln_2.bias",
            f"{cm}.ff.fc1.bias": f"{hm}.mlp.c_fc.bias",
            f"{cm}.ff.fc2.bias": f"{hm}.mlp.c_proj.bias",
        })
    return name_map
# Build a GPT-2-small-sized custom model, then fetch and display the name mapping.
gpt_model = GPTModel(
    vocab_size=50257,
    d_model=768,
    n_heads=12,
    n_layers=12,
)
mapping = load_hf_weights(gpt_model, "gpt2")
print(f"\nMapped {len(mapping)} parameter groups")

# Show only the first five pairs as a sample.
print("\nSample mappings:")
sample_pairs = list(mapping.items())[:5]
for k, v in sample_pairs:
    print(f" custom: {k}")
    print(f" hf: {v}\n")
Summary¶
- PretrainedConfig stores hyperparameters; PreTrainedModel provides the full from_pretrained / save_pretrained / generate API.
- AutoModel and AutoConfig use a registry to map model_type to Python classes.
- HuggingFace outputs are dataclasses — accessible by attribute, by key, or by integer index/slice (call .to_tuple() before unpacking them).
- Custom models can be registered with AutoModel.register() to integrate fully.
- Weight loading requires a name mapping between your convention and HuggingFace's.