Lesson 4.5: Metadata Tables for ML Systems
Model Registry
CREATE TABLE ml_models (
id BIGSERIAL PRIMARY KEY,
name TEXT NOT NULL,
version TEXT NOT NULL,
framework TEXT, -- 'pytorch', 'tensorflow', 'sklearn'
config JSONB, -- Hyperparameters
metrics JSONB, -- Accuracy, F1, etc.
artifact_path TEXT, -- S3/GCS path to model file
created_at TIMESTAMPTZ DEFAULT NOW(),
created_by TEXT,
status TEXT DEFAULT 'training', -- 'training', 'deployed', 'archived'
UNIQUE(name, version)
);
CREATE INDEX ml_models_status_idx ON ml_models(status, created_at DESC);
Feature Metadata
CREATE TABLE feature_definitions (
id BIGSERIAL PRIMARY KEY,
name TEXT UNIQUE NOT NULL,
description TEXT,
data_type TEXT, -- 'numeric', 'categorical', 'embedding'
source_table TEXT,
source_column TEXT,
transformation TEXT, -- SQL or code
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE TABLE feature_values (
feature_id BIGINT NOT NULL REFERENCES feature_definitions(id),
entity_id BIGINT NOT NULL, -- user_id, product_id, etc.
value JSONB,
computed_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (feature_id, entity_id)
);
Training Dataset Versioning
CREATE TABLE training_datasets (
id BIGSERIAL PRIMARY KEY,
name TEXT NOT NULL,
version TEXT NOT NULL,
description TEXT,
query TEXT, -- SQL query that generated data
row_count BIGINT,
file_path TEXT, -- S3/GCS path
created_at TIMESTAMPTZ DEFAULT NOW(),
created_by TEXT,
UNIQUE(name, version)
);
-- Link models to datasets (reproducibility)
CREATE TABLE model_datasets (
model_id BIGINT NOT NULL REFERENCES ml_models(id),
dataset_id BIGINT NOT NULL REFERENCES training_datasets(id),
split TEXT, -- 'train', 'val', 'test'
PRIMARY KEY (model_id, dataset_id, split)
);
Key Takeaways
- Model registry tracks all model versions with metadata and metrics
- Feature metadata tables document feature definitions and transformations
- Dataset versioning enables reproducibility of model training
- Link models to datasets to track exactly what data trained each model
- Store artifact paths (S3/GCS) to model files for deployment
- Track model status (training, deployed, archived) for lifecycle management
- Metadata tables are essential for ML governance and compliance
Discussion
Sign in to join the discussion.
0 comments

