Edge & Routing

Amazon VPC

Virtual private cloud networking for isolated infrastructure

VPC

networking that decides your latency + cost + security

What seniors use it for (ML/GenAI)

  • Place inference workers in private subnets
  • Control egress + avoid accidental internet exposure
  • Add VPC endpoints for S3/DynamoDB/CloudWatch to reduce NAT cost + improve security posture
  • Enable flow logs for debugging/security

Knobs that matter

  • Subnets

    • Public subnets: only load balancers / NAT gateways / bastions
    • Private subnets: ECS/EKS/EC2/SageMaker (as needed)
  • NAT Gateway: convenient but can become a cost sink.

  • VPC endpoints

    • Gateway endpoints (S3/DynamoDB): usually must-have (free-ish, no hourly).
    • Interface endpoints (PrivateLink): hourly per AZ + data processing; use when you need private access to AWS APIs.
  • Security Groups vs NACLs: SGs do 95% of the job.

  • Cross-AZ data transfer: can be a hidden cost (and latency) if you’re careless.

Pricing mental models

  • NAT Gateway: “hourly + per GB” → expensive under high egress. Heuristic: if NAT data processing is noticeable on the bill, add endpoints and reduce internet-bound traffic.
  • Interface endpoints: “hourly per AZ + per GB” → don’t create 20 endpoints blindly.

Heuristics

  • Default pattern for production apps:

    • 2–3 AZs
    • private subnets for compute
    • S3/Dynamo gateway endpoints
    • NAT only if you truly need outbound internet (pip installs, external APIs)
  • Use endpoints first, NAT second.

Terraform template (minimal VPC: public+private, NAT, endpoints, flow logs)

# vpc.tf
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true
  tags                 = merge(var.tags, { Name = var.name })
}

resource "aws_internet_gateway" "igw" {
  vpc_id = aws_vpc.main.id
  tags   = var.tags
}

# Public subnets (one per AZ)
resource "aws_subnet" "public" {
  for_each                = var.public_subnets
  vpc_id                  = aws_vpc.main.id
  cidr_block              = each.value.cidr
  availability_zone       = each.value.az
  map_public_ip_on_launch = true
  tags                    = merge(var.tags, { Name = "${var.name}-public-${each.key}" })
}

# Private subnets (one per AZ)
resource "aws_subnet" "private" {
  for_each          = var.private_subnets
  vpc_id            = aws_vpc.main.id
  cidr_block        = each.value.cidr
  availability_zone = each.value.az
  tags              = merge(var.tags, { Name = "${var.name}-private-${each.key}" })
}

resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id
  tags   = var.tags
}

resource "aws_route" "public_internet" {
  route_table_id         = aws_route_table.public.id
  destination_cidr_block = "0.0.0.0/0"
  gateway_id             = aws_internet_gateway.igw.id
}

resource "aws_route_table_association" "public" {
  for_each       = aws_subnet.public
  subnet_id      = each.value.id
  route_table_id = aws_route_table.public.id
}

# NAT (1 per AZ is best practice; 1 total is cheaper but less resilient)
resource "aws_eip" "nat" {
  vpc  = true
  tags = var.tags
}

resource "aws_nat_gateway" "nat" {
  allocation_id = aws_eip.nat.id
  subnet_id     = values(aws_subnet.public)[0].id
  tags          = var.tags
}

resource "aws_route_table" "private" {
  vpc_id = aws_vpc.main.id
  tags   = var.tags
}

resource "aws_route" "private_nat" {
  route_table_id         = aws_route_table.private.id
  destination_cidr_block = "0.0.0.0/0"
  nat_gateway_id         = aws_nat_gateway.nat.id
}

resource "aws_route_table_association" "private" {
  for_each       = aws_subnet.private
  subnet_id      = each.value.id
  route_table_id = aws_route_table.private.id
}

# Gateway endpoints: S3 + DynamoDB (big cost saver vs NAT for AWS traffic)
resource "aws_vpc_endpoint" "s3" {
  vpc_id            = aws_vpc.main.id
  service_name      = "com.amazonaws.${var.region}.s3"
  vpc_endpoint_type = "Gateway"
  route_table_ids   = [aws_route_table.private.id]
  tags              = var.tags
}

resource "aws_vpc_endpoint" "dynamodb" {
  vpc_id            = aws_vpc.main.id
  service_name      = "com.amazonaws.${var.region}.dynamodb"
  vpc_endpoint_type = "Gateway"
  route_table_ids   = [aws_route_table.private.id]
  tags              = var.tags
}

# Optional: VPC Flow Logs (debug/security). Sends to CloudWatch logs.
resource "aws_cloudwatch_log_group" "vpc_flow" {
  name              = "/aws/vpc/${var.name}/flowlogs"
  retention_in_days = 14
  tags              = var.tags
}

resource "aws_iam_role" "flowlogs" {
  name               = "${var.name}-vpc-flowlogs-role"
  assume_role_policy = data.aws_iam_policy_document.flowlogs_assume.json
}

data "aws_iam_policy_document" "flowlogs_assume" {
  statement {
    effect = "Allow"
    principals { type = "Service", identifiers = ["vpc-flow-logs.amazonaws.com"] }
    actions = ["sts:AssumeRole"]
  }
}

resource "aws_iam_role_policy" "flowlogs" {
  name   = "${var.name}-vpc-flowlogs-policy"
  role   = aws_iam_role.flowlogs.id
  policy = data.aws_iam_policy_document.flowlogs_policy.json
}

data "aws_iam_policy_document" "flowlogs_policy" {
  statement {
    effect  = "Allow"
    actions = ["logs:CreateLogStream", "logs:PutLogEvents", "logs:DescribeLogGroups", "logs:DescribeLogStreams"]
    resources = ["*"]
  }
}

resource "aws_flow_log" "vpc" {
  vpc_id               = aws_vpc.main.id
  traffic_type         = "ALL"
  log_destination_type = "cloud-watch-logs"
  log_group_name       = aws_cloudwatch_log_group.vpc_flow.name
  iam_role_arn         = aws_iam_role.flowlogs.arn
  tags                 = var.tags
}

variable "name"     { type = string }
variable "region"   { type = string }
variable "vpc_cidr" { type = string }
variable "tags"     { type = map(string) default = {} }

# Example maps:
# public_subnets = { a = { cidr="10.0.0.0/24", az="us-east-1a" }, b = { cidr="10.0.1.0/24", az="us-east-1b" } }
variable "public_subnets" {
  type = map(object({ cidr = string, az = string }))
}
variable "private_subnets" {
  type = map(object({ cidr = string, az = string }))
}