Appendix C: Reference Architectures

This appendix provides copy-pasteable Infrastructure as Code (IaC) for the most common MLOps patterns.

C.1. The “Standard RAG Stack” (AWS)

Pattern: Serverless Vector DB + Containerized LLM Service + Event-Driven Ingestion.

[Figure: RAG architecture]

Terraform Implementation

# main.tf

provider "aws" { region = "us-east-1" }

# 1. Knowledge Base Storage (S3)
resource "aws_s3_bucket" "knowledge_base" {
  bucket = "enterprise-rag-kb-prod-v1"
}

# 2. Vector Database (OpenSearch Serverless)
resource "aws_opensearchserverless_collection" "rag_search" {
  name = "rag-vectors"
  type = "VECTORSEARCH"
}
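
# NOTE: a serverless collection will not be created until an encryption
# security policy covering its name exists. A minimal sketch, assuming an
# AWS-owned key; add a depends_on to the collection if ordering matters.
resource "aws_opensearchserverless_security_policy" "rag_encryption" {
  name = "rag-encryption"
  type = "encryption"
  policy = jsonencode({
    Rules = [{
      ResourceType = "collection"
      Resource     = ["collection/rag-vectors"]
    }]
    AWSOwnedKey = true
  })
}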

resource "aws_opensearchserverless_vpc_endpoint" "rag_vpce" {
  name       = "rag-vpce"
  collection_arn = aws_opensearchserverless_collection.rag_search.arn
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets
}

# 3. Embedding Generator (Lambda)
resource "aws_lambda_function" "ingest_pipeline" {
  function_name = "rag-ingest"
  image_uri     = "${aws_ecr_repository.rag_repo.repository_url}:latest"
  role          = aws_iam_role.lambda_exec.arn
  timeout       = 300
  memory_size   = 2048
  package_type  = "Image"

  environment {
    variables = {
      OPENSEARCH_ENDPOINT = aws_opensearchserverless_collection.rag_search.collection_endpoint
      MODEL_ID            = "text-embedding-ada-002"
    }
  }
}

# 4. Event Trigger (S3 -> Lambda)
resource "aws_s3_bucket_notification" "bucket_notification" {
  bucket = aws_s3_bucket.knowledge_base.id
  lambda_function {
    lambda_function_arn = aws_lambda_function.ingest_pipeline.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".pdf"
  }
}
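
# NOTE: the notification above only fires if S3 is allowed to invoke the
# function; this permission grant is required alongside it.
resource "aws_lambda_permission" "allow_s3" {
  statement_id  = "AllowS3Invoke"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.ingest_pipeline.function_name
  principal     = "s3.amazonaws.com"
  source_arn    = aws_s3_bucket.knowledge_base.arn
}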

# 5. The Inference Service (ECS Fargate)
resource "aws_ecs_service" "llm_api" {
  name            = "rag-chat-api"
  cluster         = aws_ecs_cluster.ml_cluster.id
  task_definition = aws_ecs_task_definition.llm_task.arn
  desired_count   = 2
  launch_type     = "FARGATE"
  
  network_configuration {
    subnets = module.vpc.private_subnets
    security_groups = [aws_security_group.api_sg.id]
  }
  
  load_balancer {
    target_group_arn = aws_lb_target_group.api_tg.arn
    container_name   = "api"
    container_port   = 8000
  }
}
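
The file above references aws_iam_role.lambda_exec and aws_ecr_repository.rag_repo without defining them (the ECS cluster, task definition, and load-balancer resources are similarly elided). A minimal sketch of the Lambda pieces, assuming the basic-execution managed policy; scope this down with OpenSearch data-access permissions for production:

resource "aws_ecr_repository" "rag_repo" {
  name = "rag-ingest"
}

resource "aws_iam_role" "lambda_exec" {
  name = "rag-ingest-exec"
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "lambda.amazonaws.com" }
    }]
  })
}

resource "aws_iam_role_policy_attachment" "lambda_logs" {
  role       = aws_iam_role.lambda_exec.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
}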

C.2. The “Real-Time CV Pipeline” (GCP)

Pattern: Pub/Sub Ingestion -> Dataflow (Preprocessing) -> Vertex AI (Inference) -> BigQuery.

Terraform Implementation

# gcp_cv_pipeline.tf

provider "google" { region = "us-central1" }

# 1. Ingestion Topic (Images from Edge Devices)
resource "google_pubsub_topic" "image_ingress" {
  name = "cv-image-ingress"
}

# 2. Processing Pipeline (Dataflow / Apache Beam)
resource "google_dataflow_job" "preprocessor" {
  name              = "image-resize-and-norm"
  template_gcs_path = "gs://dataflow-templates/latest/PubSub_to_VertexAI"
  temp_gcs_location = "gs://my-temp-bucket/tmp_dir"
  parameters = {
    inputTopic      = google_pubsub_topic.image_ingress.id
    outputProject   = var.project_id
    modelEndpoint   = google_vertex_ai_endpoint.detection_model.id
  }
}

# 3. Model Registry & Endpoint
resource "google_vertex_ai_endpoint" "detection_model" {
  display_name = "yolo-v8-production"
  location     = "us-central1"
}

resource "google_vertex_ai_model" "yolo_model" {
  display_name = "yolo-v8-v1.0"
  uri          = "gs://model-bucket/yolo/saved_model"
  container_spec {
    image_uri = "us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-12:latest"
  }
}

resource "google_vertex_ai_endpoint_traffic_split" "traffic_split" {
  endpoint = google_vertex_ai_endpoint.detection_model.id
  traffic_split = {
    (google_vertex_ai_model.yolo_model.id) = 100
  }
}

# 4. Analytics Storage (BigQuery)
resource "google_bigquery_dataset" "cv_analytics" {
  dataset_id = "cv_production_logs"
  location   = "US"
}

resource "google_bigquery_table" "predictions" {
  dataset_id = google_bigquery_dataset.cv_analytics.dataset_id
  table_id   = "raw_predictions"
  schema     = file("schemas/bq_predictions.json")
}
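
If you prefer not to maintain a separate schemas/bq_predictions.json, the schema can be inlined with jsonencode. A sketch with assumed column names (not taken from the original schema file):

resource "google_bigquery_table" "predictions_inline" {
  dataset_id = google_bigquery_dataset.cv_analytics.dataset_id
  table_id   = "raw_predictions_inline"

  # Hypothetical columns; replace with the real prediction schema
  schema = jsonencode([
    { name = "prediction_id", type = "STRING", mode = "REQUIRED" },
    { name = "label", type = "STRING", mode = "NULLABLE" },
    { name = "confidence", type = "FLOAT", mode = "NULLABLE" },
    { name = "predicted_at", type = "TIMESTAMP", mode = "NULLABLE" }
  ])
}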

C.3. The “LLM Fine-Tuning Factory” (AWS)

Pattern: Scheduled Training (SageMaker) -> Model Registry -> Approval Gate -> Deployment.

CloudFormation (SAM) Template

# template.yaml

AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31

Resources:
  # 1. The Training Pipeline Step Function
  FineTuningStateMachine:
    Type: AWS::Serverless::StateMachine
    Properties:
      Definition:
        StartAt: FetchData
        States:
          FetchData:
            Type: Task
            Resource: arn:aws:lambda:us-east-1:123456789012:function:FetchLatestData
            Next: TrainingJob
          
          TrainingJob:
            Type: Task
            Resource: arn:aws:states:::sagemaker:createTrainingJob.sync
            Parameters:
              # TrainingJobName and RoleArn are also required here; elided for brevity
              AlgorithmSpecification:
                TrainingImage: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04
                TrainingInputMode: File
              OutputDataConfig:
                S3OutputPath: s3://my-model-bucket/output/
              ResourceConfig:
                InstanceCount: 4
                InstanceType: ml.p4d.24xlarge # 8 A100s each, 32 across 4 instances
                VolumeSizeInGB: 500
              HyperParameters:
                epochs: "3"
                batch_size: "32"
                learning_rate: "2e-5"
            Next: EvaluateModel

          EvaluateModel:
            Type: Task
            Resource: arn:aws:states:::sagemaker:createProcessingJob.sync
            # ProcessingJob parameters (image, role, inputs) elided for brevity
            Next: CheckAccuracy
          
          CheckAccuracy:
            Type: Choice
            Choices:
              - Variable: "$.Evaluation.Accuracy"
                NumericGreaterThan: 0.85
                Next: RegisterModel
            Default: NotifyFailure

          RegisterModel:
            Type: Task
            # createModelPackage is reached via the AWS SDK service integration
            Resource: arn:aws:states:::aws-sdk:sagemaker:createModelPackage
            Parameters:
              ModelPackageGroupName: "llama-3-finetuned"
              ModelApprovalStatus: "PendingManualApproval"
            End: true

          NotifyFailure:
            Type: Task
            Resource: arn:aws:states:::sns:publish
            Parameters:
              TopicArn: !Ref AlertsTopic
              Message: "Model training failed to meet accuracy threshold."
            End: true

  # 2. The alerting topic referenced above
  AlertsTopic:
    Type: AWS::SNS::Topic
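
The pattern promises scheduled training, but the template defines no trigger. A sketch of a weekly EventBridge Scheduler rule, written in Terraform for consistency with the other appendices; the state-machine and role references are assumptions, not resources defined above:

resource "aws_scheduler_schedule" "weekly_finetune" {
  name                = "weekly-finetune"
  schedule_expression = "rate(7 days)"

  flexible_time_window {
    mode = "OFF"
  }

  target {
    arn      = aws_sfn_state_machine.fine_tuning.arn # hypothetical reference
    role_arn = aws_iam_role.scheduler_role.arn       # hypothetical reference
  }
}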

C.4. The “Hybrid Cloud Bursting” Stack

Pattern: On-prem data + cloud compute. Use case: training on massive datasets that cannot move (data sovereignty) but that need 1,000 GPUs for a week.

Solution: AWS Direct Connect + EKS Anywhere.

Terraform for Networking

# hybrid_network.tf

# 1. The Direct Connect Gateway
resource "aws_dx_gateway" "hybrid_gw" {
  name            = "hybrid-dx-gateway"
  amazon_side_asn = "64512"
}

# 2. Virtual Interface to On-Prem
resource "aws_dx_private_virtual_interface" "primary" {
  connection_id    = var.dx_connection_id
  name             = "primary-vif"
  vlan             = 4094 # Valid VLAN IDs are 1-4094
  address_family   = "ipv4"
  bgp_asn          = 65000 # On-Prem ASN
  dx_gateway_id    = aws_dx_gateway.hybrid_gw.id
}

# 3. Route Propagation
resource "aws_vpn_gateway_route_propagation" "propagation" {
  vpn_gateway_id = aws_vpn_gateway.vpn_gw.id
  route_table_id = aws_route_table.private.id
}
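
The propagation block assumes a VPN gateway that is associated with the Direct Connect gateway, but neither is defined above. A minimal sketch of the missing pieces (names assumed):

resource "aws_vpn_gateway" "vpn_gw" {
  vpc_id = module.vpc.vpc_id
}

resource "aws_dx_gateway_association" "hybrid_assoc" {
  dx_gateway_id         = aws_dx_gateway.hybrid_gw.id
  associated_gateway_id = aws_vpn_gateway.vpn_gw.id
}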

C.5. Key Architecture Decisions Log (ADR)

When adopting these architectures, document your choices.

ADR-001: Vector DB Selection

  • Decision: Use OpenSearch Serverless.
  • Context: We need vector search for RAG.
  • Alternatives: Pinecone (SaaS), Postgres (pgvector).
  • Rationale: We are already on AWS. OpenSearch Serverless removes the need to manage EC2 instances or shards, and it is covered by our HIPAA BAA with AWS.
  • Consequences: Cost is higher (~$700/mo minimum) than RDS with pgvector (~$50/mo), but the operational load is near zero.

ADR-002: Inference Hardware

  • Decision: Use Inf2 (Inferentia) for Llama-3 serving.
  • Context: High throughput requirements (1000 req/s).
  • Alternatives: g5.2xlarge (NVIDIA A10G).
  • Rationale: Inf2 offers roughly 40% lower cost-per-inference than the GPU option, thanks to transformer-focused optimizations in the Inferentia2 architecture.
  • Risks: Requires compiling models with AWS Neuron SDK. Vendor lock-in to AWS chips.

C.6. The “Policy as Code” Guardrails (OPA)

Don’t trust developers to remember to tag resources. Enforce it.

OPA (Open Policy Agent) Rego Rule

Requirement: All Training Jobs must have a ProjectCostCenter tag.

package main

deny[msg] {
  # Trigger for SageMaker Training Jobs
  input.resourceType == "AWS::SageMaker::TrainingJob"
  
  # Check for tags
  not input.resource.Tags["ProjectCostCenter"]
  
  msg = sprintf("Training Job %v is missing mandatory tag 'ProjectCostCenter'", [input.resourceName])
}

deny[msg] {
  # Ban P4d instances in Dev account
  input.resourceType == "AWS::SageMaker::TrainingJob"
  input.resource.ResourceConfig.InstanceType == "ml.p4d.24xlarge"
  input.accountID == "123456789012" # Dev account
  
  msg = "P4d instances are not allowed in Dev. Use p3.2xlarge."
}
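
OPA rejects non-compliant plans at review time; you can also make the violation rare in the first place with provider-level default tags. A sketch (the tag value is a hypothetical cost center):

provider "aws" {
  region = "us-east-1"

  default_tags {
    tags = {
      ProjectCostCenter = "CC-1234" # hypothetical value
    }
  }
}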

C.7. The “Active-Active” Multi-Region Architecture

Pattern: Traffic is routed to the nearest region; if us-east-1 fails, us-west-2 takes over automatically (within DNS TTLs, so near-instant rather than instant). Complexity: high, because it requires global data replication.

Terraform for Global Traffic Manager

# global_routing.tf

# 1. Route53 Health Checks
resource "aws_route53_health_check" "us_east_1" {
  fqdn              = "api-us-east-1.mycompany.com"
  port              = 443
  type              = "HTTPS"
  resource_path     = "/health"
  failure_threshold = "3"
  request_interval  = "10"
}

# 2. Global DNS Record (Latency-Based Routing)
resource "aws_route53_record" "api" {
  zone_id = aws_route53_zone.main.zone_id
  name    = "api.mycompany.com"
  type    = "A"
  
  alias {
    name                   = aws_lb.us_east_1_alb.dns_name
    zone_id                = aws_lb.us_east_1_alb.zone_id
    evaluate_target_health = true
  }
  
  set_identifier = "us-east-1"
  latency_routing_policy {
    region = "us-east-1"
  }
  
  # If the health check fails, Route53 stops serving this record
  health_check_id = aws_route53_health_check.us_east_1.id
}

# A mirror record with set_identifier = "us-west-2", pointing at the
# us-west-2 ALB and its own health check, completes the active-active pair.

The Data Challenge:

  • Model Registry: Enable S3 Cross-Region Replication (CRR).
  • Feature Store: DynamoDB Global Tables (active-active; see the sketch below).
  • Vector DB: Manual dual-write or use a DB with Global capabilities (e.g., MongoDB Atlas).
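
A minimal sketch of the Feature Store bullet: DynamoDB Global Tables are created by adding replica blocks, and streams are mandatory before replication works. The table name and key schema here are assumptions:

resource "aws_dynamodb_table" "feature_store" {
  name         = "online-features" # hypothetical table
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "entity_id"

  # Streams are required before replicas can be added
  stream_enabled   = true
  stream_view_type = "NEW_AND_OLD_IMAGES"

  attribute {
    name = "entity_id"
    type = "S"
  }

  # Each replica block is an active-active copy in another region
  replica {
    region_name = "us-west-2"
  }
}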

These reference architectures are starting points. The “Best” architecture is the one your team can maintain at 3 AM.


C.8. The “Full Stack” Terraform (AWS)

A complete main.tf for a VPC, EKS Cluster, and RDS Database.

# main.tf

provider "aws" {
  region = "us-east-1"
  default_tags {
    tags = {
      Project   = "MLOps-Platform"
      ManagedBy = "Terraform"
    }
  }
}

# ==========================================
# 1. NETWORKING (VPC)
# ==========================================
module "vpc" {
  source = "terraform-aws-modules/vpc/aws"
  version = "5.0.0"

  name = "mlops-vpc"
  cidr = "10.0.0.0/16"

  azs             = ["us-east-1a", "us-east-1b", "us-east-1c"]
  private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  public_subnets  = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]

  enable_nat_gateway   = true
  single_nat_gateway   = true # Save cost in Dev
  enable_dns_hostnames = true
  
  # NOTE: the enable_s3_endpoint / enable_dynamodb_endpoint flags were removed
  # in v3 of this module; gateway endpoints now live in the vpc-endpoints
  # submodule (see below).
}
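
# Gateway endpoints for private S3/DynamoDB access, via the dedicated
# submodule; a minimal sketch:
module "vpc_endpoints" {
  source  = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
  version = "5.0.0"

  vpc_id = module.vpc.vpc_id

  endpoints = {
    s3 = {
      service         = "s3"
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
    }
    dynamodb = {
      service         = "dynamodb"
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
    }
  }
}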

# ==========================================
# 2. DATABASE (RDS POSTGRES)
# ==========================================
resource "aws_db_subnet_group" "db_subnet" {
  name       = "ml-db-subnet-group"
  subnet_ids = module.vpc.private_subnets
}

resource "aws_security_group" "rds_sg" {
  name        = "rds-sg"
  vpc_id      = module.vpc.vpc_id

  ingress {
    from_port   = 5432
    to_port     = 5432
    protocol    = "tcp"
    cidr_blocks = [module.vpc.vpc_cidr_block] # Allow entire VPC
  }
}

resource "aws_db_instance" "mlflow_db" {
  identifier        = "mlflow-backend-store"
  engine            = "postgres"
  engine_version    = "14.7"
  instance_class    = "db.t4g.small"
  allocated_storage = 20
  storage_type      = "gp3"

  username = "mlflow_admin"
  password = var.db_password # Pass via TF_VAR_db_password

  db_subnet_group_name   = aws_db_subnet_group.db_subnet.name
  vpc_security_group_ids = [aws_security_group.rds_sg.id]
  skip_final_snapshot    = true
}
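
# Declared here so the file is self-contained; supply the value via
# TF_VAR_db_password (or a secrets manager), never a committed default.
variable "db_password" {
  type      = string
  sensitive = true
}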

# ==========================================
# 3. COMPUTE (EKS CLUSTER)
# ==========================================
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "19.15.0"

  cluster_name    = "mlops-cluster"
  cluster_version = "1.27"

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  cluster_endpoint_public_access = true

  # OIDC for Service Accounts (IRSA)
  enable_irsa = true

  eks_managed_node_groups = {
    # 1. System Node Group (CoreDNS, Controllers)
    system_nodes = {
      min_size     = 2
      max_size     = 3
      desired_size = 2
      instance_types = ["t3.medium"]
      labels = {
        "role" = "system"
      }
    }
    
    # 2. CPU Workload Group (Spot Instances)
    cpu_workers = {
      min_size     = 0
      max_size     = 10
      desired_size = 1
      instance_types = ["c6a.2xlarge", "c6i.2xlarge"]
      capacity_type  = "SPOT"
      labels = {
        "role" = "batch-processing"
      }
    }
    
    # 3. GPU Workload Group (On-Demand)
    gpu_workers = {
      min_size     = 0
      max_size     = 4
      desired_size = 0
      instance_types = ["g5.xlarge"]
      capacity_type  = "ON_DEMAND"
      ami_type       = "AL2_x86_64_GPU"
      labels = {
        "accelerator" = "nvidia-gpu"
      }
      taints = {
        dedicated = {
          key    = "nvidia.com/gpu"
          value  = "true"
          effect = "NO_SCHEDULE"
        }
      }
    }
  }
}

# ==========================================
# 4. STORAGE (S3)
# ==========================================
resource "aws_s3_bucket" "artifacts" {
  bucket = "mlops-artifacts-${random_id.suffix.hex}"
}

resource "aws_s3_bucket_lifecycle_configuration" "lifecycle" {
  bucket = aws_s3_bucket.artifacts.id

  rule {
    id = "expire-temp-data"
    filter {
      prefix = "temp/"
    }
    expiration {
      days = 7
    }
    status = "Enabled"
  }
}

resource "random_id" "suffix" {
  byte_length = 4
}

This file alone can save you two days of debugging networking configuration.