Lesson 4.5: Metadata Tables for ML Systems
Model Registry
CREATE TABLE ml_models (
id BIGSERIAL PRIMARY KEY,
name TEXT NOT NULL,
version TEXT NOT NULL,
framework TEXT,
config JSONB,
metrics JSONB,
artifact_path TEXT,
created_at TIMESTAMPTZ DEFAULT NOW(),
created_by TEXT,
status TEXT DEFAULT 'training',
UNIQUE(name, version)
);
CREATE INDEX ml_models_status_idx ON ml_models(status, created_at DESC);
Feature Metadata
CREATE TABLE feature_definitions (
id BIGSERIAL PRIMARY KEY,
name TEXT UNIQUE NOT NULL,
description TEXT,
data_type TEXT,
source_table TEXT,
source_column TEXT,
transformation TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE TABLE feature_values (
feature_id BIGINT NOT NULL REFERENCES feature_definitions(id),
entity_id BIGINT NOT NULL,
value JSONB,
computed_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (feature_id, entity_id)
);
Training Dataset Versioning
CREATE TABLE training_datasets (
id BIGSERIAL PRIMARY KEY,
name TEXT NOT NULL,
version TEXT NOT NULL,
description TEXT,
query TEXT,
row_count BIGINT,
file_path TEXT,
created_at TIMESTAMPTZ DEFAULT NOW(),
created_by TEXT,
UNIQUE(name, version)
);
CREATE TABLE model_datasets (
model_id BIGINT NOT NULL REFERENCES ml_models(id),
dataset_id BIGINT NOT NULL REFERENCES training_datasets(id),
split TEXT,
PRIMARY KEY (model_id, dataset_id, split)
);
Key Takeaways
- Model registry tracks all model versions with metadata and metrics
- Feature metadata tables document feature definitions and transformations
- Dataset versioning enables reproducibility of model training
- Link models to datasets to track exactly what data trained each model
- Store artifact paths (S3/GCS) to model files for deployment
- Track model status (training, deployed, archived) for lifecycle management
- Metadata tables are essential for ML governance and compliance