Appendix C: Reference Architectures
This appendix provides copy-pasteable Infrastructure as Code (IaC) for the most common MLOps patterns.
C.1. The “Standard RAG Stack” (AWS)
Pattern: Serverless Vector DB + Containerized LLM Service + Event-Driven Ingestion.
Terraform Implementation
# main.tf
provider "aws" { region = "us-east-1" }
# 1. Knowledge Base Storage (S3)
resource "aws_s3_bucket" "knowledge_base" {
bucket = "enterprise-rag-kb-prod-v1"
}
# 2. Vector Database (OpenSearch Serverless)
resource "aws_opensearchserverless_collection" "rag_search" {
name = "rag-vectors"
type = "VECTORSEARCH"
}
resource "aws_opensearchserverless_vpc_endpoint" "rag_vpce" {
name = "rag-vpce"
collection_arn = aws_opensearchserverless_collection.rag_search.arn
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
}
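Two policies are easy to forget: an OpenSearch Serverless collection will not finish creating until a matching encryption policy exists, and it is only reachable through the endpoint above once a network policy names that endpoint. A minimal sketch of both, assuming the AWS-owned KMS key is acceptable (add a depends_on from the collection to the encryption policy):
resource "aws_opensearchserverless_security_policy" "rag_encryption" {
  name = "rag-encryption"
  type = "encryption"
  policy = jsonencode({
    Rules = [{
      ResourceType = "collection"
      Resource     = ["collection/rag-vectors"]
    }]
    AWSOwnedKey = true
  })
}
resource "aws_opensearchserverless_security_policy" "rag_network" {
  name = "rag-network"
  type = "network"
  policy = jsonencode([{
    Rules = [{
      ResourceType = "collection"
      Resource     = ["collection/rag-vectors"]
    }]
    AllowFromPublic = false
    SourceVPCEs     = [aws_opensearchserverless_vpc_endpoint.rag_vpce.id]
  }])
}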
# 3. Embedding Generator (Lambda)
resource "aws_lambda_function" "ingest_pipeline" {
function_name = "rag-ingest"
image_uri = "${aws_ecr_repository.rag_repo.repository_url}:latest"
role = aws_iam_role.lambda_exec.arn
timeout = 300
memory_size = 2048
package_type = "Image"
environment {
variables = {
OPENSEARCH_ENDPOINT = aws_opensearchserverless_collection.rag_search.collection_endpoint
MODEL_ID = "text-embedding-ada-002"
}
}
}
# 4. Event Trigger (S3 -> Lambda)
resource "aws_s3_bucket_notification" "bucket_notification" {
bucket = aws_s3_bucket.knowledge_base.id
lambda_function {
lambda_function_arn = aws_lambda_function.ingest_pipeline.arn
events = ["s3:ObjectCreated:*"]
filter_suffix = ".pdf"
}
}
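One more resource is required: the notification above only works if S3 is allowed to invoke the function, and S3 validates this when the notification configuration is created, so without the grant below the apply itself can fail. A minimal sketch:
resource "aws_lambda_permission" "allow_s3" {
  statement_id  = "AllowS3Invoke"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.ingest_pipeline.function_name
  principal     = "s3.amazonaws.com"
  source_arn    = aws_s3_bucket.knowledge_base.arn
}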
# 5. The Inference Service (ECS Fargate)
resource "aws_ecs_service" "llm_api" {
name = "rag-chat-api"
cluster = aws_ecs_cluster.ml_cluster.id
task_definition = aws_ecs_task_definition.llm_task.arn
desired_count = 2
launch_type = "FARGATE"
network_configuration {
subnets = module.vpc.private_subnets
security_groups = [aws_security_group.api_sg.id]
}
load_balancer {
target_group_arn = aws_lb_target_group.api_tg.arn
container_name = "api"
container_port = 8000
}
}
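desired_count = 2 is a static floor, and LLM traffic rarely is. A hedged sketch of target-tracking autoscaling for the same service; the 60% CPU target is a placeholder to tune:
resource "aws_appautoscaling_target" "api" {
  service_namespace  = "ecs"
  resource_id        = "service/${aws_ecs_cluster.ml_cluster.name}/${aws_ecs_service.llm_api.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  min_capacity       = 2
  max_capacity       = 10
}
resource "aws_appautoscaling_policy" "api_cpu" {
  name               = "rag-api-cpu-tracking"
  policy_type        = "TargetTrackingScaling"
  service_namespace  = aws_appautoscaling_target.api.service_namespace
  resource_id        = aws_appautoscaling_target.api.resource_id
  scalable_dimension = aws_appautoscaling_target.api.scalable_dimension
  target_tracking_scaling_policy_configuration {
    target_value = 60
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
  }
}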
C.2. The “Real-Time CV Pipeline” (GCP)
Pattern: Pub/Sub Ingestion -> Dataflow (Preprocessing) -> Vertex AI (Inference) -> BigQuery.
Terraform Implementation
# gcp_cv_pipeline.tf
provider "google" { region = "us-central1" }
# 1. Ingestion Topic (Images from Edge Devices)
resource "google_pubsub_topic" "image_ingress" {
name = "cv-image-ingress"
}
# 2. Processing Pipeline (Dataflow / Apache Beam)
resource "google_dataflow_job" "preprocessor" {
name = "image-resize-and-norm"
template_gcs_path = "gs://dataflow-templates/latest/PubSub_to_VertexAI"
temp_gcs_location = "gs://my-temp-bucket/tmp_dir"
parameters = {
inputTopic = google_pubsub_topic.image_ingress.id
outputProject = var.project_id
modelEndpoint = google_vertex_ai_endpoint.detection_model.id
}
}
# 3. Model Registry & Endpoint
resource "google_vertex_ai_endpoint" "detection_model" {
display_name = "yolo-v8-production"
location = "us-central1"
}
resource "google_vertex_ai_model" "yolo_model" {
display_name = "yolo-v8-v1.0"
uri = "gs://model-bucket/yolo/saved_model"
container_spec {
image_uri = "us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-12:latest"
}
}
resource "google_vertex_ai_endpoint_traffic_split" "traffic_split" {
endpoint = google_vertex_ai_endpoint.detection_model.id
traffic_split = {
(google_vertex_ai_model.yolo_model.id) = 100
}
}
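If your provider version does not expose those Vertex AI resources, a common fallback is shelling out to gcloud, here via a null_resource; the values reuse the blocks above and are illustrative:
resource "null_resource" "deploy_model" {
  # Re-runs the upload when the artifact location changes
  triggers = {
    model_uri = "gs://model-bucket/yolo/saved_model"
  }
  provisioner "local-exec" {
    command = <<-EOT
      gcloud ai models upload \
        --region=us-central1 \
        --display-name=yolo-v8-v1.0 \
        --container-image-uri=us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-12:latest \
        --artifact-uri=gs://model-bucket/yolo/saved_model
    EOT
  }
}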
# 4. Analytics Storage (BigQuery)
resource "google_bigquery_dataset" "cv_analytics" {
dataset_id = "cv_production_logs"
location = "US"
}
resource "google_bigquery_table" "predictions" {
dataset_id = google_bigquery_dataset.cv_analytics.dataset_id
table_id = "raw_predictions"
schema = file("schemas/bq_predictions.json")
}
C.3. The “LLM Fine-Tuning Factory” (AWS)
Pattern: Scheduled Training (SageMaker) -> Model Registry -> Approval Gate -> Deployment.
CloudFormation (SAM) Template
# template.yaml
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Resources:
  # 1. The Training Pipeline Step Function
  FineTuningStateMachine:
    Type: AWS::Serverless::StateMachine
    Properties:
      # Execution Role/Policies omitted for brevity
      # The "scheduled" half of the pattern: kick off fine-tuning weekly
      Events:
        WeeklyRetrain:
          Type: Schedule
          Properties:
            Schedule: rate(7 days)
      Definition:
        StartAt: FetchData
        States:
          FetchData:
            Type: Task
            Resource: arn:aws:lambda:us-east-1:123456789012:function:FetchLatestData
            Next: TrainingJob
          TrainingJob:
            Type: Task
            Resource: arn:aws:states:::sagemaker:createTrainingJob.sync
            Parameters:
              TrainingJobName.$: $$.Execution.Name
              RoleArn: arn:aws:iam::123456789012:role/SageMakerExecutionRole
              AlgorithmSpecification:
                TrainingImage: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04
                TrainingInputMode: File
              OutputDataConfig:
                S3OutputPath: s3://my-model-bucket/output/
              ResourceConfig:
                InstanceCount: 4
                InstanceType: ml.p4d.24xlarge # 8 A100s each, 32 across the job
                VolumeSizeInGB: 500
              StoppingCondition:
                MaxRuntimeInSeconds: 86400
              HyperParameters:
                epochs: "3"
                batch_size: "32"
                learning_rate: "2e-5"
            Next: EvaluateModel
          EvaluateModel:
            Type: Task
            Resource: arn:aws:states:::sagemaker:createProcessingJob.sync
            # ProcessingJobName, AppSpecification, role, and the result-path
            # plumbing that feeds $.Evaluation.Accuracy are omitted for brevity
            Next: CheckAccuracy
          CheckAccuracy:
            Type: Choice
            Choices:
              - Variable: "$.Evaluation.Accuracy"
                NumericGreaterThan: 0.85
                Next: RegisterModel
            Default: NotifyFailure
          RegisterModel:
            Type: Task
            # createModelPackage is an AWS SDK integration, not an optimized one
            Resource: arn:aws:states:::aws-sdk:sagemaker:createModelPackage
            Parameters:
              ModelPackageGroupName: "llama-3-finetuned"
              ModelApprovalStatus: "PendingManualApproval"
            End: true
          NotifyFailure:
            Type: Task
            Resource: arn:aws:states:::sns:publish
            Parameters:
              TopicArn: !Ref AlertsTopic
              Message: "Model training failed to meet accuracy threshold."
            End: true
  # SNS topic referenced by the failure branch above
  AlertsTopic:
    Type: AWS::SNS::Topic
C.4. The “Hybrid Cloud Bursting” Stack
Pattern: On-Prem Data + Cloud Compute. Use Case: Training on massive datasets that cannot move (data sovereignty) but that need 1,000 GPUs for a week.
Solution: AWS Direct Connect + EKS Anywhere.
Terraform for Networking
# hybrid_network.tf
# 1. The Direct Connect Gateway
resource "aws_dx_gateway" "hybrid_gw" {
name = "hybrid-dx-gateway"
amazon_side_asn = "64512"
}
# 2. Virtual Interface to On-Prem
resource "aws_dx_private_virtual_interface" "primary" {
connection_id = var.dx_connection_id
name = "primary-vif"
vlan = 4096
address_family = "ipv4"
bgp_asn = 65000 # On-Prem ASN
dx_gateway_id = aws_dx_gateway.hybrid_gw.id
}
# 3. Route Propagation
resource "aws_vpn_gateway_route_propagation" "propagation" {
vpn_gateway_id = aws_vpn_gateway.vpn_gw.id
route_table_id = aws_route_table.private.id
}
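One link is implied but missing: the route propagation above references a VPN gateway, and that VGW only learns on-prem routes once it is associated with the Direct Connect gateway. A sketch, assuming aws_vpn_gateway.vpn_gw is attached to your VPC and 10.0.0.0/16 is the CIDR you advertise back on-prem:
resource "aws_dx_gateway_association" "hybrid" {
  dx_gateway_id         = aws_dx_gateway.hybrid_gw.id
  associated_gateway_id = aws_vpn_gateway.vpn_gw.id
  allowed_prefixes      = ["10.0.0.0/16"]
}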
C.5. Key Architecture Decisions Log (ADR)
When adopting these architectures, document your choices.
ADR-001: Vector DB Selection
- Decision: Use OpenSearch Serverless.
- Context: We need vector search for RAG.
- Alternatives: Pinecone (SaaS), Postgres (pgvector).
- Rationale: We are already in AWS. OpenSearch Serverless removes the need to manage EC2 instances or shards. It complies with our HIPAA BAA.
- Consequences: The cost floor is higher (~$700/mo minimum) than RDS (~$50/mo), but the operational load is near zero.
ADR-002: Inference Hardware
- Decision: Use Inf2 (Inferentia) for Llama-3 serving.
- Context: High throughput requirements (1000 req/s).
- Alternatives: g5.2xlarge (NVIDIA A10G).
- Rationale: Inf2 offers roughly 40% lower cost-per-inference than the GPU option, thanks to NeuronCore hardware optimized for transformer workloads.
- Risks: Requires compiling models with AWS Neuron SDK. Vendor lock-in to AWS chips.
C.6. The “Policy as Code” Guardrails (OPA)
Don’t trust developers to remember to tag resources. Enforce it.
OPA (Open Policy Agent) Rego Rule
Requirement: All Training Jobs must have a ProjectCostCenter tag.
package main
deny[msg] {
# Trigger for SageMaker Training Jobs
input.resourceType == "AWS::SageMaker::TrainingJob"
# Check for tags
not input.resource.Tags["ProjectCostCenter"]
msg = sprintf("Training Job %v is missing mandatory tag 'ProjectCostCenter'", [input.resourceName])
}
deny[msg] {
    # Ban P4d instances in the Dev account
    input.resourceType == "AWS::SageMaker::TrainingJob"
    input.resource.ResourceConfig.InstanceType == "ml.p4d.24xlarge"
    input.accountID == "123456789012" # Dev account ID
    msg = "P4d instances are not allowed in Dev. Use ml.p3.2xlarge."
}
C.7. The “Active-Active” Multi-Region Architecture
Pattern: Traffic goes to the nearest region; if us-east-1 fails its health checks, Route53 shifts traffic to us-west-2 within the failure-detection window (roughly 30 seconds with the settings below). Complexity: High. Requires global data replication.
Terraform for Global Traffic Manager
# global_routing.tf
# 1. Route53 Health Checks
resource "aws_route53_health_check" "us_east_1" {
fqdn = "api-us-east-1.mycompany.com"
port = 443
type = "HTTPS"
resource_path = "/health"
failure_threshold = "3"
request_interval = "10"
}
# 2. Global DNS Record (Latency-Based Routing)
resource "aws_route53_record" "api" {
zone_id = aws_route53_zone.main.zone_id
name = "api.mycompany.com"
type = "A"
alias {
name = aws_lb.us_east_1_alb.dns_name
zone_id = aws_lb.us_east_1_alb.zone_id
evaluate_target_health = true
}
set_identifier = "us-east-1"
latency_routing_policy {
region = "us-east-1"
}
  # If the health check fails, Route53 stops returning this record.
  # A mirrored record with set_identifier = "us-west-2" (not shown) is
  # required for the other region.
  health_check_id = aws_route53_health_check.us_east_1.id
}
The Data Challenge:
- Model Registry: Enable S3 Cross-Region Replication (CRR).
- Feature Store: DynamoDB Global Tables (Active-Active); see the sketch after this list.
- Vector DB: Manual dual-write or use a DB with Global capabilities (e.g., MongoDB Atlas).
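For the feature store, Global Tables turn a regional DynamoDB table into an active-active one with a single replica block; a minimal sketch (table and key names are hypothetical, and Global Tables require streams):
resource "aws_dynamodb_table" "feature_store" {
  name             = "online-features"
  billing_mode     = "PAY_PER_REQUEST"
  hash_key         = "entity_id"
  stream_enabled   = true
  stream_view_type = "NEW_AND_OLD_IMAGES"
  attribute {
    name = "entity_id"
    type = "S"
  }
  # Creates the us-west-2 replica of this table
  replica {
    region_name = "us-west-2"
  }
}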
These reference architectures are starting points. The “Best” architecture is the one your team can maintain at 3 AM.
C.8. The “Full Stack” Terraform (AWS)
A complete main.tf for a VPC, EKS Cluster, and RDS Database.
# main.tf
provider "aws" {
region = "us-east-1"
default_tags {
tags = {
Project = "MLOps-Platform"
ManagedBy = "Terraform"
}
}
}
# ==========================================
# 1. NETWORKING (VPC)
# ==========================================
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "5.0.0"
name = "mlops-vpc"
cidr = "10.0.0.0/16"
azs = ["us-east-1a", "us-east-1b", "us-east-1c"]
private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
enable_nat_gateway = true
single_nat_gateway = true # Save cost in Dev
enable_dns_hostnames = true
  # VPC endpoints for private access are configured via the vpc-endpoints
  # submodule below; the enable_s3_endpoint / enable_dynamodb_endpoint
  # flags no longer exist in v5 of this module.
}
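A minimal sketch of the companion submodule, assuming module version 5.0.0; gateway endpoints are free and keep S3/DynamoDB traffic off the NAT gateway:
module "vpc_endpoints" {
  source  = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
  version = "5.0.0"
  vpc_id  = module.vpc.vpc_id
  endpoints = {
    s3 = {
      service         = "s3"
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
    }
    dynamodb = {
      service         = "dynamodb"
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
    }
  }
}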
# ==========================================
# 2. DATABASE (RDS POSTGRES)
# ==========================================
resource "aws_db_subnet_group" "db_subnet" {
name = "ml-db-subnet-group"
subnet_ids = module.vpc.private_subnets
}
resource "aws_security_group" "rds_sg" {
name = "rds-sg"
vpc_id = module.vpc.vpc_id
ingress {
from_port = 5432
to_port = 5432
protocol = "tcp"
cidr_blocks = [module.vpc.vpc_cidr_block] # Allow entire VPC
}
}
resource "aws_db_instance" "mlflow_db" {
identifier = "mlflow-backend-store"
engine = "postgres"
engine_version = "14.7"
instance_class = "db.t4g.small"
allocated_storage = 20
storage_type = "gp3"
username = "mlflow_admin"
password = var.db_password # Pass via TF_VAR_db_password
db_subnet_group_name = aws_db_subnet_group.db_subnet.name
vpc_security_group_ids = [aws_security_group.rds_sg.id]
skip_final_snapshot = true
}
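If passing the password through TF_VAR_db_password feels fragile, one alternative is to generate it in Terraform and park it in Secrets Manager (the secret name here is hypothetical), then point password at random_password.db.result instead:
resource "random_password" "db" {
  length  = 24
  special = false
}
resource "aws_secretsmanager_secret" "db_password" {
  name = "mlflow/db-password"
}
resource "aws_secretsmanager_secret_version" "db_password" {
  secret_id     = aws_secretsmanager_secret.db_password.id
  secret_string = random_password.db.result
}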
# ==========================================
# 3. COMPUTE (EKS CLUSTER)
# ==========================================
module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "19.15.0"
cluster_name = "mlops-cluster"
cluster_version = "1.27"
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets
cluster_endpoint_public_access = true
# OIDC for Service Accounts (IRSA)
enable_irsa = true
eks_managed_node_groups = {
# 1. System Node Group (CoreDNS, Controllers)
system_nodes = {
min_size = 2
max_size = 3
desired_size = 2
instance_types = ["t3.medium"]
labels = {
"role" = "system"
}
}
# 2. CPU Workload Group (Spot Instances)
cpu_workers = {
min_size = 0
max_size = 10
desired_size = 1
instance_types = ["c6a.2xlarge", "c6i.2xlarge"]
capacity_type = "SPOT"
labels = {
"role" = "batch-processing"
}
}
# 3. GPU Workload Group (On-Demand)
gpu_workers = {
min_size = 0
max_size = 4
desired_size = 0
instance_types = ["g5.xlarge"]
capacity_type = "ON_DEMAND"
ami_type = "AL2_x86_64_GPU"
labels = {
"accelerator" = "nvidia-gpu"
}
taints = {
dedicated = {
key = "nvidia.com/gpu"
value = "true"
effect = "NO_SCHEDULE"
}
}
}
}
}
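The GPU node group taints itself with nvidia.com/gpu, but pods can only request GPUs once the NVIDIA device plugin is running; the EKS GPU AMI ships the driver, not the plugin. A sketch via the Helm provider, assuming it is already configured against this cluster:
resource "helm_release" "nvidia_device_plugin" {
  name       = "nvidia-device-plugin"
  repository = "https://nvidia.github.io/k8s-device-plugin"
  chart      = "nvidia-device-plugin"
  namespace  = "kube-system"
}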
# ==========================================
# 4. STORAGE (S3)
# ==========================================
resource "aws_s3_bucket" "artifacts" {
bucket = "mlops-artifacts-${random_id.suffix.hex}"
}
resource "aws_s3_bucket_lifecycle_configuration" "lifecycle" {
bucket = aws_s3_bucket.artifacts.id
rule {
id = "expire-temp-data"
filter {
prefix = "temp/"
}
expiration {
days = 7
}
status = "Enabled"
}
}
resource "random_id" "suffix" {
byte_length = 4
}
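For a bucket that stores model artifacts, versioning is cheap insurance against overwritten weights; one more resource closes that gap:
resource "aws_s3_bucket_versioning" "artifacts" {
  bucket = aws_s3_bucket.artifacts.id
  versioning_configuration {
    status = "Enabled"
  }
}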
This file alone can save you two days of debugging networking configurations.