API Design

📖 10 min read 📄 Part 5 of 10

Resource Allocation Service - API Design

Overview

The API layer provides the interface between tenants, administrators, and the resource allocation engine. It follows REST conventions for CRUD operations and uses gRPC for high-performance internal communication (scheduler-to-node). The design is inspired by the Kubernetes API server pattern: declarative desired state, watch-based notifications, and optimistic concurrency via resource versions.


Resource Request API

Submit Allocation Request

POST /api/v1/namespaces/{namespace}/allocations
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "labels": {
      "app": "model-training",
      "team": "ml-platform",
      "cost-center": "CC-1234"
    },
    "annotations": {
      "scheduler.example.com/gang-size": "4",
      "scheduler.example.com/max-wait": "300s"
    }
  },
  "spec": {
    "resources": {
      "requests": {
        "cpu": "8000m",
        "memory": "32Gi",
        "nvidia.com/gpu": "4"
      },
      "limits": {
        "cpu": "16000m",
        "memory": "64Gi",
        "nvidia.com/gpu": "4"
      }
    },
    "priority": 100,
    "priorityClass": "high-priority",
    "preemptionPolicy": "PreemptLowerPriority",
    "scheduling": {
      "nodeSelector": {
        "gpu-type": "nvidia-a100",
        "zone": "us-east-1a"
      },
      "affinity": {
        "nodeAffinity": {
          "requiredDuringScheduling": {
            "nodeSelectorTerms": [
              {
                "matchExpressions": [
                  {"key": "gpu-memory", "operator": "Gte", "values": ["80"]}
                ]
              }
            ]
          },
          "preferredDuringScheduling": [
            {
              "weight": 50,
              "preference": {
                "matchExpressions": [
                  {"key": "network", "operator": "In", "values": ["rdma"]}
                ]
              }
            }
          ]
        },
        "podAntiAffinity": {
          "requiredDuringScheduling": [
            {
              "topologyKey": "kubernetes.io/hostname",
              "labelSelector": {"matchLabels": {"app": "model-training"}}
            }
          ]
        }
      },
      "tolerations": [
        {"key": "gpu-only", "operator": "Exists", "effect": "NoSchedule"}
      ],
      "topologySpreadConstraints": [
        {
          "maxSkew": 1,
          "topologyKey": "topology.kubernetes.io/zone",
          "whenUnsatisfiable": "DoNotSchedule"
        }
      ]
    },
    "ttl": "7200s",
    "gangScheduling": {
      "groupId": "training-gang-001",
      "minMembers": 4,
      "timeout": "300s"
    }
  }
}

Response 201 Created:
{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
    "resourceVersion": "1",
    "creationTimestamp": "2024-01-15T10:30:00Z"
  },
  "spec": { ... },
  "status": {
    "phase": "Pending",
    "conditions": [
      {
        "type": "Scheduled",
        "status": "False",
        "reason": "Unscheduled",
        "message": "Waiting for scheduler"
      }
    ],
    "queuePosition": 12
  }
}

Response 403 Forbidden (quota exceeded):
{
  "kind": "Status",
  "code": 403,
  "reason": "Forbidden",
  "message": "exceeded quota: gpu limit 100, requested 4, used 98"
}

Get Allocation Status

GET /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}

Response 200:
{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
    "resourceVersion": "5"
  },
  "spec": { ... },
  "status": {
    "phase": "Running",
    "nodeId": "node-xyz789",
    "nodeName": "gpu-worker-042.us-east-1a",
    "startTime": "2024-01-15T10:30:05Z",
    "conditions": [
      {"type": "Scheduled", "status": "True", "lastTransitionTime": "2024-01-15T10:30:02Z"},
      {"type": "Ready", "status": "True", "lastTransitionTime": "2024-01-15T10:30:05Z"}
    ],
    "allocatedResources": {
      "cpu": "8000m",
      "memory": "32Gi",
      "nvidia.com/gpu": "4"
    }
  }
}

List Allocations with Filtering

GET /api/v1/namespaces/{namespace}/allocations?labelSelector=app%3Dmodel-training&fieldSelector=status.phase%3DRunning&limit=100&continue={token}
Authorization: Bearer {token}

Response 200:
{
  "kind": "AllocationList",
  "metadata": {
    "resourceVersion": "48291",
    "continue": "eyJvZmZzZXQiOjEwMH0="
  },
  "items": [ ... ]
}

Watch Allocation Changes (Server-Sent Events)

GET /api/v1/namespaces/{namespace}/allocations?watch=true&resourceVersion=48291
Authorization: Bearer {token}
Accept: text/event-stream

Response 200 (streaming):
data: {"type":"MODIFIED","object":{"metadata":{"name":"job-1","resourceVersion":"48292"},"status":{"phase":"Running"}}}

data: {"type":"ADDED","object":{"metadata":{"name":"job-2","resourceVersion":"48293"},"status":{"phase":"Pending"}}}

data: {"type":"DELETED","object":{"metadata":{"name":"job-3","resourceVersion":"48294"}}}

Release API

Release Resources (Graceful)

DELETE /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}
Content-Type: application/json

{
  "gracePeriodSeconds": 30,
  "propagationPolicy": "Foreground"
}

Response 200:
{
  "metadata": {
    "name": "ml-training-job-42",
    "deletionTimestamp": "2024-01-15T12:30:00Z",
    "deletionGracePeriodSeconds": 30
  },
  "status": {
    "phase": "Terminating",
    "conditions": [
      {
        "type": "Terminating",
        "status": "True",
        "reason": "UserRequested",
        "message": "Graceful termination in progress"
      }
    ]
  }
}

Bulk Release

POST /api/v1/namespaces/{namespace}/allocations/bulk-release
Authorization: Bearer {token}
Content-Type: application/json

{
  "labelSelector": "batch-id=job-20240115",
  "gracePeriodSeconds": 60,
  "dryRun": false
}

Response 200:
{
  "released": 24,
  "failed": 0,
  "totalResourcesFreed": {
    "cpu": "192000m",
    "memory": "768Gi",
    "nvidia.com/gpu": "96"
  }
}

Quota Management API

Set Resource Quota

POST /api/v1/namespaces/{namespace}/resourcequotas
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "ml-team-quota",
    "namespace": "ml-team"
  },
  "spec": {
    "hard": {
      "cpu": "10000",
      "memory": "40Ti",
      "nvidia.com/gpu": "100",
      "pods": "5000",
      "persistentvolumeclaims": "200",
      "services": "100"
    },
    "soft": {
      "cpu": "8000",
      "memory": "32Ti",
      "nvidia.com/gpu": "80"
    },
    "scopes": ["NotTerminating"],
    "scopeSelector": {
      "matchExpressions": [
        {"scopeName": "PriorityClass", "operator": "In", "values": ["high-priority", "normal"]}
      ]
    }
  }
}

Response 201 Created:
{
  "metadata": { ... },
  "spec": { ... },
  "status": {
    "hard": {
      "cpu": "10000",
      "memory": "40Ti",
      "nvidia.com/gpu": "100"
    },
    "used": {
      "cpu": "6500",
      "memory": "26Ti",
      "nvidia.com/gpu": "72"
    }
  }
}

Get Quota Usage

GET /api/v1/namespaces/{namespace}/resourcequotas/{name}/status
Authorization: Bearer {token}

Response 200:
{
  "metadata": {"name": "ml-team-quota"},
  "status": {
    "hard": {"cpu": "10000", "memory": "40Ti", "nvidia.com/gpu": "100"},
    "used": {"cpu": "6500", "memory": "26Ti", "nvidia.com/gpu": "72"},
    "utilization": {"cpu": "65%", "memory": "65%", "nvidia.com/gpu": "72%"},
    "trend": {
      "cpu": "+5% over 24h",
      "nvidia.com/gpu": "+12% over 24h"
    }
  }
}

Update Quota Limits

PATCH /api/v1/namespaces/{namespace}/resourcequotas/{name}
Authorization: Bearer {token}
Content-Type: application/merge-patch+json

{
  "spec": {
    "hard": {
      "nvidia.com/gpu": "150"
    }
  }
}

Response 200:
{
  "metadata": {"name": "ml-team-quota", "resourceVersion": "12"},
  "spec": {
    "hard": {"nvidia.com/gpu": "150"}
  }
}

Scheduling API

Submit Batch Job

POST /api/v1/namespaces/{namespace}/jobs
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "distributed-training-v2",
    "namespace": "ml-team"
  },
  "spec": {
    "parallelism": 8,
    "completions": 8,
    "backoffLimit": 3,
    "activeDeadlineSeconds": 86400,
    "template": {
      "spec": {
        "resources": {
          "requests": {"cpu": "16000m", "memory": "64Gi", "nvidia.com/gpu": "8"},
          "limits": {"cpu": "32000m", "memory": "128Gi", "nvidia.com/gpu": "8"}
        },
        "scheduling": {
          "gangScheduling": {"minMembers": 8, "timeout": "600s"},
          "nodeSelector": {"gpu-type": "nvidia-a100"},
          "topologySpreadConstraints": [
            {"maxSkew": 1, "topologyKey": "rack", "whenUnsatisfiable": "DoNotSchedule"}
          ]
        }
      }
    },
    "priority": 200,
    "priorityClass": "batch-high",
    "queue": "gpu-training"
  }
}

Response 201 Created:
{
  "metadata": {"name": "distributed-training-v2", "uid": "job-abc123"},
  "status": {
    "phase": "Queued",
    "queuePosition": 3,
    "estimatedStartTime": "2024-01-15T11:00:00Z",
    "conditions": [
      {"type": "Queued", "status": "True", "reason": "WaitingForResources"}
    ]
  }
}

Get Job Status

GET /api/v1/namespaces/{namespace}/jobs/{name}/status
Authorization: Bearer {token}

Response 200:
{
  "status": {
    "phase": "Running",
    "active": 8,
    "succeeded": 0,
    "failed": 0,
    "startTime": "2024-01-15T11:02:00Z",
    "allocations": [
      {"name": "training-0", "node": "gpu-001", "phase": "Running"},
      {"name": "training-1", "node": "gpu-002", "phase": "Running"},
      ...
    ]
  }
}

Cancel Job

POST /api/v1/namespaces/{namespace}/jobs/{name}/cancel
Authorization: Bearer {token}
Content-Type: application/json

{
  "reason": "Superseded by newer model version",
  "gracePeriodSeconds": 60
}

Response 200:
{
  "status": {
    "phase": "Cancelling",
    "message": "Graceful cancellation initiated, 60s grace period"
  }
}

Priority Boost

POST /api/v1/namespaces/{namespace}/jobs/{name}/boost
Authorization: Bearer {token}
Content-Type: application/json

{
  "newPriority": 500,
  "reason": "Production incident requires immediate training",
  "duration": "3600s",
  "approvedBy": "admin@example.com"
}

Response 200:
{
  "previousPriority": 200,
  "newPriority": 500,
  "boostExpires": "2024-01-15T12:02:00Z",
  "newQueuePosition": 1
}

Node Management API

Register Node

POST /api/v1/nodes
Authorization: Bearer {node-bootstrap-token}
Content-Type: application/json

{
  "metadata": {
    "name": "gpu-worker-043",
    "labels": {
      "zone": "us-east-1a",
      "rack": "rack-12",
      "gpu-type": "nvidia-a100",
      "instance-type": "p4d.24xlarge"
    }
  },
  "spec": {
    "capacity": {
      "cpu": "96000m",
      "memory": "1152Gi",
      "nvidia.com/gpu": "8",
      "ephemeral-storage": "3800Gi",
      "pods": "110"
    },
    "allocatable": {
      "cpu": "94000m",
      "memory": "1100Gi",
      "nvidia.com/gpu": "8",
      "ephemeral-storage": "3500Gi",
      "pods": "110"
    },
    "taints": [
      {"key": "nvidia.com/gpu", "effect": "NoSchedule"}
    ]
  }
}

Response 201 Created:
{
  "metadata": {"name": "gpu-worker-043", "uid": "node-def456"},
  "spec": { ... },
  "status": {"phase": "Ready", "leaseId": "lease-xyz"}
}

Update Node Capacity (e.g., after hardware change)

PATCH /api/v1/nodes/{name}/status
Authorization: Bearer {node-token}
Content-Type: application/strategic-merge-patch+json

{
  "status": {
    "capacity": {"nvidia.com/gpu": "6"},
    "allocatable": {"nvidia.com/gpu": "6"},
    "conditions": [
      {"type": "Ready", "status": "True", "lastHeartbeatTime": "2024-01-15T10:35:00Z"}
    ]
  }
}

Drain Node (Graceful Evacuation)

POST /api/v1/nodes/{name}/drain
Authorization: Bearer {admin-token}
Content-Type: application/json

{
  "gracePeriodSeconds": 300,
  "deleteEmptyDirData": true,
  "ignoreDaemonSets": true,
  "force": false,
  "reason": "Scheduled maintenance window"
}

Response 200:
{
  "status": "Draining",
  "allocationsToEvict": 23,
  "estimatedCompletionTime": "2024-01-15T10:40:00Z",
  "progress": {
    "evicted": 0,
    "pending": 23,
    "failed": 0
  }
}

Cordon Node (Prevent New Scheduling)

PATCH /api/v1/nodes/{name}
Authorization: Bearer {admin-token}
Content-Type: application/merge-patch+json

{
  "spec": {
    "unschedulable": true
  }
}

Monitoring API

Cluster Utilization

GET /api/v1/metrics/cluster/utilization
Authorization: Bearer {token}

Response 200:
{
  "timestamp": "2024-01-15T10:35:00Z",
  "cluster": {
    "nodes": {"total": 5000, "ready": 4987, "notReady": 8, "cordoned": 5},
    "cpu": {
      "capacity": "320000000m",
      "allocatable": "310000000m",
      "requested": "248000000m",
      "used": "186000000m",
      "utilization": 0.60
    },
    "memory": {
      "capacity": "1280Ti",
      "allocatable": "1240Ti",
      "requested": "992Ti",
      "used": "868Ti",
      "utilization": 0.70
    },
    "gpu": {
      "capacity": 2000,
      "allocatable": 2000,
      "requested": 1680,
      "used": 1520,
      "utilization": 0.76
    }
  },
  "scheduling": {
    "pendingAllocations": 142,
    "schedulingRate": "487/sec",
    "avgSchedulingLatencyMs": 12,
    "p99SchedulingLatencyMs": 85,
    "preemptionsLast1h": 23
  }
}

Per-Tenant Usage

GET /api/v1/metrics/tenants/{tenant_id}/usage?period=24h
Authorization: Bearer {token}

Response 200:
{
  "tenant": "ml-team",
  "period": "24h",
  "resources": {
    "cpu": {
      "quota": "10000",
      "currentUsage": "6500",
      "peakUsage": "8200",
      "avgUsage": "5800",
      "utilizationOfQuota": 0.65
    },
    "nvidia.com/gpu": {
      "quota": "100",
      "currentUsage": "72",
      "peakUsage": "95",
      "avgUsage": "68",
      "utilizationOfQuota": 0.72
    }
  },
  "cost": {
    "period": "24h",
    "total": "$12,450.00",
    "breakdown": {
      "cpu": "$3,200.00",
      "memory": "$2,100.00",
      "gpu": "$6,800.00",
      "storage": "$350.00"
    }
  }
}

Queue Depth and Wait Times

GET /api/v1/metrics/queues
Authorization: Bearer {token}

Response 200:
{
  "queues": [
    {
      "name": "default",
      "depth": 45,
      "oldestItem": "2024-01-15T10:28:00Z",
      "avgWaitTime": "12s",
      "p99WaitTime": "120s"
    },
    {
      "name": "gpu-training",
      "depth": 23,
      "oldestItem": "2024-01-15T09:45:00Z",
      "avgWaitTime": "300s",
      "p99WaitTime": "1800s"
    }
  ]
}

Preemption API

Trigger Preemption (Admin)

POST /api/v1/namespaces/{namespace}/preemptions
Authorization: Bearer {admin-token}
Content-Type: application/json

{
  "targetAllocation": "low-priority-batch-job",
  "reason": "Emergency capacity needed for production incident",
  "gracePeriodSeconds": 30,
  "initiatedBy": "oncall-engineer@example.com"
}

Response 201 Created:
{
  "preemptionId": "preempt-abc123",
  "targetAllocation": "low-priority-batch-job",
  "status": "Initiated",
  "gracePeriodEnds": "2024-01-15T10:35:30Z",
  "resourcesFreed": {
    "cpu": "16000m",
    "memory": "64Gi",
    "nvidia.com/gpu": "4"
  }
}

Get Eviction Notice (Node Agent Watches)

GET /api/v1/nodes/{node_name}/evictions?watch=true
Authorization: Bearer {node-token}

Response 200 (streaming):
{
  "type": "EVICTION",
  "allocation": "low-priority-batch-job",
  "reason": "Preempted by higher priority job",
  "gracePeriodSeconds": 30,
  "deadline": "2024-01-15T10:35:30Z"
}

gRPC Internal API (Scheduler to Node)

syntax = "proto3";
package scheduler.v1;

// NodeAgent is the internal gRPC service used for scheduler-to-node
// communication. All RPCs except Heartbeat are unary.
service NodeAgent {
  // Heartbeat is a bidirectional stream: the caller sends a stream of
  // HeartbeatRequest messages and receives a stream of HeartbeatResponse
  // messages on the same call.
  rpc Heartbeat(stream HeartbeatRequest) returns (stream HeartbeatResponse);

  // BindAllocation binds an allocation to a node.
  rpc BindAllocation(BindRequest) returns (BindResponse);

  // EvictAllocation evicts an allocation from a node.
  // NOTE(review): EvictRequest and EvictResponse are not defined in this
  // excerpt; they must be declared elsewhere for this file to compile.
  rpc EvictAllocation(EvictRequest) returns (EvictResponse);

  // GetResourceSnapshot returns a snapshot of the node's resources.
  // NOTE(review): SnapshotRequest and ResourceSnapshot are not defined in
  // this excerpt; they must be declared elsewhere for this file to compile.
  rpc GetResourceSnapshot(SnapshotRequest) returns (ResourceSnapshot);
}

// HeartbeatRequest is the message streamed into the Heartbeat RPC.
// NOTE(review): ResourceUsage, PodStatus and NodeCondition are not defined in
// this excerpt.
message HeartbeatRequest {
  // Identifier of the node this heartbeat refers to.
  string node_id = 1;
  // Time of the heartbeat. The unit/epoch is not specified here
  // (presumably Unix time) — TODO confirm.
  int64 timestamp = 2;
  // Resource usage reported with this heartbeat.
  ResourceUsage current_usage = 3;
  // Zero or more pod status entries.
  repeated PodStatus pod_statuses = 4;
  // Zero or more node condition entries.
  repeated NodeCondition conditions = 5;
}

// HeartbeatResponse is the message streamed back from the Heartbeat RPC.
// NOTE(review): SchedulingDirective is not defined in this excerpt.
message HeartbeatResponse {
  // Zero or more scheduling directives for the receiver.
  repeated SchedulingDirective directives = 1;
  // Lease duration, in seconds.
  // NOTE(review): presumably the window within which the next heartbeat is
  // expected — confirm against the lease implementation.
  int64 lease_duration_seconds = 2;
}

// BindRequest is the input to the BindAllocation RPC.
// NOTE(review): ResourceRequirements is not defined in this excerpt.
message BindRequest {
  // Identifier of the allocation to bind.
  string allocation_id = 1;
  // Identifier of the node to bind the allocation to.
  string node_id = 2;
  // Resource requirements carried with the bind request.
  ResourceRequirements resources = 3;
  // Free-form string key/value annotations.
  map<string, string> annotations = 4;
}

// BindResponse is the result of the BindAllocation RPC.
message BindResponse {
  // Whether the bind succeeded.
  bool success = 1;
  // Error text.
  // NOTE(review): presumably only populated when success is false — confirm.
  string error_message = 2;
  // Failure reason.
  // NOTE(review): the distinction from error_message is not shown here
  // (possibly a machine-readable code vs. human-readable text) — confirm.
  string failure_reason = 3;
}

Error Handling and Status Codes

| Status Code | Meaning | Example |
|-------------|---------|---------|
| 200 | Success | Resource retrieved/updated |
| 201 | Created | Allocation submitted |
| 202 | Accepted | Async operation started |
| 400 | Bad Request | Invalid resource specification |
| 401 | Unauthorized | Invalid/expired token |
| 403 | Forbidden | Quota exceeded, insufficient permissions |
| 404 | Not Found | Allocation/node doesn't exist |
| 409 | Conflict | Resource version conflict (OCC) |
| 422 | Unprocessable Entity | Valid JSON but unsatisfiable constraints |
| 429 | Too Many Requests | Rate limit exceeded |
| 503 | Service Unavailable | Scheduler overloaded |

Rate Limiting Headers

HTTP/1.1 429 Too Many Requests
X-RateLimit-Limit: 1000
X-RateLimit-Remaining: 0
X-RateLimit-Reset: 1705312500
Retry-After: 30

{
  "kind": "Status",
  "code": 429,
  "reason": "TooManyRequests",
  "message": "Rate limit exceeded. Tenant 'ml-team' limited to 1000 req/min."
}

Client SDK Example (Go)

package main

import (
    "context"
    "fmt"
    "os"
    "time"

    allocator "github.com/example/resource-allocator/sdk/go"
)

// main runs the example and exits with a non-zero status if any step fails.
func main() {
    if err := run(); err != nil {
        fmt.Fprintln(os.Stderr, "error:", err)
        os.Exit(1)
    }
}

// run walks through the allocation lifecycle with the Go SDK: it submits an
// allocation in the "ml-team" namespace, watches it until it reports the
// "Running" phase, and then releases it. Every SDK error is wrapped with the
// step that produced it and returned to the caller.
func run() error {
    ctx := context.Background()

    client, err := allocator.NewClient(allocator.Config{
        Endpoint: "https://scheduler.internal:6443",
        Token:    os.Getenv("SCHEDULER_TOKEN"),
    })
    if err != nil {
        return fmt.Errorf("creating client: %w", err)
    }
    allocations := client.Allocations("ml-team")

    // Submit allocation with constraints. The returned object is not needed
    // here because the watch below delivers the up-to-date status.
    if _, err := allocations.Create(ctx, &allocator.Allocation{
        Name: "training-job-1",
        Spec: allocator.AllocationSpec{
            Resources: allocator.ResourceRequirements{
                Requests: allocator.ResourceList{"cpu": "8", "memory": "32Gi", "nvidia.com/gpu": "4"},
                Limits:   allocator.ResourceList{"cpu": "16", "memory": "64Gi", "nvidia.com/gpu": "4"},
            },
            Priority:      100,
            PriorityClass: "high-priority",
            NodeSelector:  map[string]string{"gpu-type": "nvidia-a100"},
        },
    }); err != nil {
        return fmt.Errorf("submitting allocation: %w", err)
    }

    // Watch for status changes. The watch gets its own deadline so the loop
    // below cannot wait indefinitely for an allocation that never schedules.
    // NOTE(review): assumes the SDK closes ResultChan when the context is
    // cancelled — confirm against the SDK.
    watchCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
    defer cancel()

    watcher, err := allocations.Watch(watchCtx,
        allocator.WatchOptions{FieldSelector: "metadata.name=training-job-1"})
    if err != nil {
        return fmt.Errorf("watching allocation: %w", err)
    }

    for event := range watcher.ResultChan() {
        // Skip events that do not carry an allocation instead of panicking on
        // a failed type assertion.
        alloc, ok := event.Object.(*allocator.Allocation)
        if !ok {
            continue
        }
        if alloc.Status.Phase == "Running" {
            fmt.Printf("Allocated on node: %s\n", alloc.Status.NodeName)
            break
        }
    }

    // Release when done. This uses the un-timed context so the release is
    // still attempted after the watch deadline has passed.
    // NOTE(review): assumes Delete returns a single error — confirm against
    // the SDK.
    if err := allocations.Delete(ctx, "training-job-1",
        allocator.DeleteOptions{GracePeriodSeconds: 30}); err != nil {
        return fmt.Errorf("releasing allocation: %w", err)
    }
    return nil
}

This API design provides a complete interface for resource allocation, from simple single-resource requests to complex gang-scheduled distributed training jobs with topology constraints.