Resource Allocation Service - API Design
Overview
The API layer provides the interface between tenants, administrators, and the resource allocation engine. It follows REST conventions for CRUD operations and uses gRPC for high-performance internal communication (scheduler-to-node). The design is inspired by the Kubernetes API server pattern: declarative desired state, watch-based notifications, and optimistic concurrency via resource versions.
Resource Request API
Submit Allocation Request
POST /api/v1/namespaces/{namespace}/allocations
Authorization: Bearer {token}
Content-Type: application/json
{
"metadata": {
"name": "ml-training-job-42",
"namespace": "ml-team",
"labels": {
"app": "model-training",
"team": "ml-platform",
"cost-center": "CC-1234"
},
"annotations": {
"scheduler.example.com/gang-size": "4",
"scheduler.example.com/max-wait": "300s"
}
},
"spec": {
"resources": {
"requests": {
"cpu": "8000m",
"memory": "32Gi",
"nvidia.com/gpu": "4"
},
"limits": {
"cpu": "16000m",
"memory": "64Gi",
"nvidia.com/gpu": "4"
}
},
"priority": 100,
"priorityClass": "high-priority",
"preemptionPolicy": "PreemptLowerPriority",
"scheduling": {
"nodeSelector": {
"gpu-type": "nvidia-a100",
"zone": "us-east-1a"
},
"affinity": {
"nodeAffinity": {
"requiredDuringScheduling": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{"key": "gpu-memory", "operator": "Gte", "values": ["80"]}
]
}
]
},
"preferredDuringScheduling": [
{
"weight": 50,
"preference": {
"matchExpressions": [
{"key": "network", "operator": "In", "values": ["rdma"]}
]
}
}
]
},
"podAntiAffinity": {
"requiredDuringScheduling": [
{
"topologyKey": "kubernetes.io/hostname",
"labelSelector": {"matchLabels": {"app": "model-training"}}
}
]
}
},
"tolerations": [
{"key": "gpu-only", "operator": "Exists", "effect": "NoSchedule"}
],
"topologySpreadConstraints": [
{
"maxSkew": 1,
"topologyKey": "topology.kubernetes.io/zone",
"whenUnsatisfiable": "DoNotSchedule"
}
]
},
"ttl": "7200s",
"gangScheduling": {
"groupId": "training-gang-001",
"minMembers": 4,
"timeout": "300s"
}
}
}
Response 201 Created:
{
"metadata": {
"name": "ml-training-job-42",
"namespace": "ml-team",
"uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
"resourceVersion": "1",
"creationTimestamp": "2024-01-15T10:30:00Z"
},
"spec": { ... },
"status": {
"phase": "Pending",
"conditions": [
{
"type": "Scheduled",
"status": "False",
"reason": "Unscheduled",
"message": "Waiting for scheduler"
}
],
"queuePosition": 12
}
}
Response 403 Forbidden (quota exceeded):
{
"kind": "Status",
"code": 403,
"reason": "Forbidden",
"message": "exceeded quota: gpu limit 100, requested 4, used 98"
}
Get Allocation Status
GET /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}
Response 200:
{
"metadata": {
"name": "ml-training-job-42",
"namespace": "ml-team",
"uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
"resourceVersion": "5"
},
"spec": { ... },
"status": {
"phase": "Running",
"nodeId": "node-xyz789",
"nodeName": "gpu-worker-042.us-east-1a",
"startTime": "2024-01-15T10:30:05Z",
"conditions": [
{"type": "Scheduled", "status": "True", "lastTransitionTime": "2024-01-15T10:30:02Z"},
{"type": "Ready", "status": "True", "lastTransitionTime": "2024-01-15T10:30:05Z"}
],
"allocatedResources": {
"cpu": "8000m",
"memory": "32Gi",
"nvidia.com/gpu": "4"
}
}
}
List Allocations with Filtering
GET /api/v1/namespaces/{namespace}/allocations?labelSelector=app%3Dmodel-training&fieldSelector=status.phase%3DRunning&limit=100&continue={token}
Authorization: Bearer {token}
Response 200:
{
"kind": "AllocationList",
"metadata": {
"resourceVersion": "48291",
"continue": "eyJvZmZzZXQiOjEwMH0="
},
"items": [ ... ]
}
Watch Allocation Changes (Server-Sent Events)
GET /api/v1/namespaces/{namespace}/allocations?watch=true&resourceVersion=48291
Authorization: Bearer {token}
Accept: text/event-stream
Response 200 (streaming):
data: {"type":"MODIFIED","object":{"metadata":{"name":"job-1","resourceVersion":"48292"},"status":{"phase":"Running"}}}
data: {"type":"ADDED","object":{"metadata":{"name":"job-2","resourceVersion":"48293"},"status":{"phase":"Pending"}}}
data: {"type":"DELETED","object":{"metadata":{"name":"job-3","resourceVersion":"48294"}}}
Release API
Release Resources (Graceful)
DELETE /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}
Content-Type: application/json
{
"gracePeriodSeconds": 30,
"propagationPolicy": "Foreground"
}
Response 200:
{
"metadata": {
"name": "ml-training-job-42",
"deletionTimestamp": "2024-01-15T12:30:00Z",
"deletionGracePeriodSeconds": 30
},
"status": {
"phase": "Terminating",
"conditions": [
{
"type": "Terminating",
"status": "True",
"reason": "UserRequested",
"message": "Graceful termination in progress"
}
]
}
}
Bulk Release
POST /api/v1/namespaces/{namespace}/allocations/bulk-release
Authorization: Bearer {token}
Content-Type: application/json
{
"labelSelector": "batch-id=job-20240115",
"gracePeriodSeconds": 60,
"dryRun": false
}
Response 200:
{
"released": 24,
"failed": 0,
"totalResourcesFreed": {
"cpu": "192000m",
"memory": "768Gi",
"nvidia.com/gpu": "96"
}
}
Quota Management API
Set Resource Quota
POST /api/v1/namespaces/{namespace}/resourcequotas
Authorization: Bearer {token}
Content-Type: application/json
{
"metadata": {
"name": "ml-team-quota",
"namespace": "ml-team"
},
"spec": {
"hard": {
"cpu": "10000",
"memory": "40Ti",
"nvidia.com/gpu": "100",
"pods": "5000",
"persistentvolumeclaims": "200",
"services": "100"
},
"soft": {
"cpu": "8000",
"memory": "32Ti",
"nvidia.com/gpu": "80"
},
"scopes": ["NotTerminating"],
"scopeSelector": {
"matchExpressions": [
{"scopeName": "PriorityClass", "operator": "In", "values": ["high-priority", "normal"]}
]
}
}
}
Response 201 Created:
{
"metadata": { ... },
"spec": { ... },
"status": {
"hard": {
"cpu": "10000",
"memory": "40Ti",
"nvidia.com/gpu": "100"
},
"used": {
"cpu": "6500",
"memory": "26Ti",
"nvidia.com/gpu": "72"
}
}
}
Get Quota Usage
GET /api/v1/namespaces/{namespace}/resourcequotas/{name}/status
Authorization: Bearer {token}
Response 200:
{
"metadata": {"name": "ml-team-quota"},
"status": {
"hard": {"cpu": "10000", "memory": "40Ti", "nvidia.com/gpu": "100"},
"used": {"cpu": "6500", "memory": "26Ti", "nvidia.com/gpu": "72"},
"utilization": {"cpu": "65%", "memory": "65%", "nvidia.com/gpu": "72%"},
"trend": {
"cpu": "+5% over 24h",
"nvidia.com/gpu": "+12% over 24h"
}
}
}
Update Quota Limits
PATCH /api/v1/namespaces/{namespace}/resourcequotas/{name}
Authorization: Bearer {token}
Content-Type: application/merge-patch+json
{
"spec": {
"hard": {
"nvidia.com/gpu": "150"
}
}
}
Response 200:
{
"metadata": {"name": "ml-team-quota", "resourceVersion": "12"},
"spec": {
"hard": {"nvidia.com/gpu": "150"}
}
}
Scheduling API
Submit Batch Job
POST /api/v1/namespaces/{namespace}/jobs
Authorization: Bearer {token}
Content-Type: application/json
{
"metadata": {
"name": "distributed-training-v2",
"namespace": "ml-team"
},
"spec": {
"parallelism": 8,
"completions": 8,
"backoffLimit": 3,
"activeDeadlineSeconds": 86400,
"template": {
"spec": {
"resources": {
"requests": {"cpu": "16000m", "memory": "64Gi", "nvidia.com/gpu": "8"},
"limits": {"cpu": "32000m", "memory": "128Gi", "nvidia.com/gpu": "8"}
},
"scheduling": {
"gangScheduling": {"minMembers": 8, "timeout": "600s"},
"nodeSelector": {"gpu-type": "nvidia-a100"},
"topologySpreadConstraints": [
{"maxSkew": 1, "topologyKey": "rack", "whenUnsatisfiable": "DoNotSchedule"}
]
}
}
},
"priority": 200,
"priorityClass": "batch-high",
"queue": "gpu-training"
}
}
Response 201 Created:
{
"metadata": {"name": "distributed-training-v2", "uid": "job-abc123"},
"status": {
"phase": "Queued",
"queuePosition": 3,
"estimatedStartTime": "2024-01-15T11:00:00Z",
"conditions": [
{"type": "Queued", "status": "True", "reason": "WaitingForResources"}
]
}
}
Get Job Status
GET /api/v1/namespaces/{namespace}/jobs/{name}/status
Authorization: Bearer {token}
Response 200:
{
"status": {
"phase": "Running",
"active": 8,
"succeeded": 0,
"failed": 0,
"startTime": "2024-01-15T11:02:00Z",
"allocations": [
{"name": "training-0", "node": "gpu-001", "phase": "Running"},
{"name": "training-1", "node": "gpu-002", "phase": "Running"},
...
]
}
}
Cancel Job
POST /api/v1/namespaces/{namespace}/jobs/{name}/cancel
Authorization: Bearer {token}
Content-Type: application/json
{
"reason": "Superseded by newer model version",
"gracePeriodSeconds": 60
}
Response 200:
{
"status": {
"phase": "Cancelling",
"message": "Graceful cancellation initiated, 60s grace period"
}
}
Priority Boost
POST /api/v1/namespaces/{namespace}/jobs/{name}/boost
Authorization: Bearer {token}
Content-Type: application/json
{
"newPriority": 500,
"reason": "Production incident requires immediate training",
"duration": "3600s",
"approvedBy": "admin@example.com"
}
Response 200:
{
"previousPriority": 200,
"newPriority": 500,
"boostExpires": "2024-01-15T12:02:00Z",
"newQueuePosition": 1
}
Node Management API
Register Node
POST /api/v1/nodes
Authorization: Bearer {node-bootstrap-token}
Content-Type: application/json
{
"metadata": {
"name": "gpu-worker-043",
"labels": {
"zone": "us-east-1a",
"rack": "rack-12",
"gpu-type": "nvidia-a100",
"instance-type": "p4d.24xlarge"
}
},
"spec": {
"capacity": {
"cpu": "96000m",
"memory": "1152Gi",
"nvidia.com/gpu": "8",
"ephemeral-storage": "3800Gi",
"pods": "110"
},
"allocatable": {
"cpu": "94000m",
"memory": "1100Gi",
"nvidia.com/gpu": "8",
"ephemeral-storage": "3500Gi",
"pods": "110"
},
"taints": [
{"key": "nvidia.com/gpu", "effect": "NoSchedule"}
]
}
}
Response 201 Created:
{
"metadata": {"name": "gpu-worker-043", "uid": "node-def456"},
"spec": { ... },
"status": {"phase": "Ready", "leaseId": "lease-xyz"}
}
Update Node Capacity (e.g., after hardware change)
PATCH /api/v1/nodes/{name}/status
Authorization: Bearer {node-token}
Content-Type: application/strategic-merge-patch+json
{
"status": {
"capacity": {"nvidia.com/gpu": "6"},
"allocatable": {"nvidia.com/gpu": "6"},
"conditions": [
{"type": "Ready", "status": "True", "lastHeartbeatTime": "2024-01-15T10:35:00Z"}
]
}
}
Drain Node (Graceful Evacuation)
POST /api/v1/nodes/{name}/drain
Authorization: Bearer {admin-token}
Content-Type: application/json
{
"gracePeriodSeconds": 300,
"deleteEmptyDirData": true,
"ignoreDaemonSets": true,
"force": false,
"reason": "Scheduled maintenance window"
}
Response 200:
{
"status": "Draining",
"allocationsToEvict": 23,
"estimatedCompletionTime": "2024-01-15T10:40:00Z",
"progress": {
"evicted": 0,
"pending": 23,
"failed": 0
}
}
Cordon Node (Prevent New Scheduling)
PATCH /api/v1/nodes/{name}
Authorization: Bearer {admin-token}
Content-Type: application/merge-patch+json
{
"spec": {
"unschedulable": true
}
}
Monitoring API
Cluster Utilization
GET /api/v1/metrics/cluster/utilization
Authorization: Bearer {token}
Response 200:
{
"timestamp": "2024-01-15T10:35:00Z",
"cluster": {
"nodes": {"total": 5000, "ready": 4987, "notReady": 8, "cordoned": 5},
"cpu": {
"capacity": "320000000m",
"allocatable": "310000000m",
"requested": "248000000m",
"used": "186000000m",
"utilization": 0.60
},
"memory": {
"capacity": "1280Ti",
"allocatable": "1240Ti",
"requested": "992Ti",
"used": "868Ti",
"utilization": 0.70
},
"gpu": {
"capacity": 2000,
"allocatable": 2000,
"requested": 1680,
"used": 1520,
"utilization": 0.76
}
},
"scheduling": {
"pendingAllocations": 142,
"schedulingRate": "487/sec",
"avgSchedulingLatencyMs": 12,
"p99SchedulingLatencyMs": 85,
"preemptionsLast1h": 23
}
}
Per-Tenant Usage
GET /api/v1/metrics/tenants/{tenant_id}/usage?period=24h
Authorization: Bearer {token}
Response 200:
{
"tenant": "ml-team",
"period": "24h",
"resources": {
"cpu": {
"quota": "10000",
"currentUsage": "6500",
"peakUsage": "8200",
"avgUsage": "5800",
"utilizationOfQuota": 0.65
},
"nvidia.com/gpu": {
"quota": "100",
"currentUsage": "72",
"peakUsage": "95",
"avgUsage": "68",
"utilizationOfQuota": 0.72
}
},
"cost": {
"period": "24h",
"total": "$12,450.00",
"breakdown": {
"cpu": "$3,200.00",
"memory": "$2,100.00",
"gpu": "$6,800.00",
"storage": "$350.00"
}
}
}
Queue Depth and Wait Times
GET /api/v1/metrics/queues
Authorization: Bearer {token}
Response 200:
{
"queues": [
{
"name": "default",
"depth": 45,
"oldestItem": "2024-01-15T10:28:00Z",
"avgWaitTime": "12s",
"p99WaitTime": "120s"
},
{
"name": "gpu-training",
"depth": 23,
"oldestItem": "2024-01-15T09:45:00Z",
"avgWaitTime": "300s",
"p99WaitTime": "1800s"
}
]
}
Preemption API
Trigger Preemption (Admin)
POST /api/v1/namespaces/{namespace}/preemptions
Authorization: Bearer {admin-token}
Content-Type: application/json
{
"targetAllocation": "low-priority-batch-job",
"reason": "Emergency capacity needed for production incident",
"gracePeriodSeconds": 30,
"initiatedBy": "oncall-engineer@example.com"
}
Response 201:
{
"preemptionId": "preempt-abc123",
"targetAllocation": "low-priority-batch-job",
"status": "Initiated",
"gracePeriodEnds": "2024-01-15T10:35:30Z",
"resourcesFreed": {
"cpu": "16000m",
"memory": "64Gi",
"nvidia.com/gpu": "4"
}
}
Get Eviction Notice (Node Agent Polls)
GET /api/v1/nodes/{node_name}/evictions?watch=true
Authorization: Bearer {node-token}
Response 200 (streaming):
{
"type": "EVICTION",
"allocation": "low-priority-batch-job",
"reason": "Preempted by higher priority job",
"gracePeriodSeconds": 30,
"deadline": "2024-01-15T10:35:30Z"
}
gRPC Internal API (Scheduler to Node)
syntax = "proto3";
package scheduler.v1;
// NodeAgent groups the RPCs used between the scheduler and a node:
// heartbeating, binding and evicting allocations, and fetching a resource
// snapshot.
// NOTE(review): which side hosts this service is not stated in this excerpt;
// the name suggests the node agent — confirm.
service NodeAgent {
// Heartbeat is a bidirectional stream: HeartbeatRequest messages flow one way
// and HeartbeatResponse messages flow back on the same call.
rpc Heartbeat(stream HeartbeatRequest) returns (stream HeartbeatResponse);
// BindAllocation binds an allocation to a node (unary request/response).
rpc BindAllocation(BindRequest) returns (BindResponse);
// EvictAllocation evicts an allocation from a node (unary request/response).
// NOTE(review): EvictRequest and EvictResponse are not defined in this excerpt.
rpc EvictAllocation(EvictRequest) returns (EvictResponse);
// GetResourceSnapshot returns a node resource snapshot (unary request/response).
// NOTE(review): SnapshotRequest and ResourceSnapshot are not defined in this excerpt.
rpc GetResourceSnapshot(SnapshotRequest) returns (ResourceSnapshot);
}
// HeartbeatRequest is the message streamed into the Heartbeat RPC.
// NOTE(review): ResourceUsage, PodStatus and NodeCondition are referenced here
// but not defined in this excerpt.
message HeartbeatRequest {
// Identifier of the node the heartbeat refers to.
string node_id = 1;
// NOTE(review): unit and epoch are not specified (seconds vs. milliseconds) — confirm.
int64 timestamp = 2;
ResourceUsage current_usage = 3;
repeated PodStatus pod_statuses = 4;
repeated NodeCondition conditions = 5;
}
// HeartbeatResponse is the message streamed back from the Heartbeat RPC.
message HeartbeatResponse {
// Zero or more directives per response.
// NOTE(review): SchedulingDirective is not defined in this excerpt.
repeated SchedulingDirective directives = 1;
// Lease duration, in seconds per the field name.
// NOTE(review): what the lease covers is not stated here — confirm.
int64 lease_duration_seconds = 2;
}
// BindRequest is the request message of the BindAllocation RPC.
message BindRequest {
// Allocation to bind.
string allocation_id = 1;
// Node the allocation is bound to.
string node_id = 2;
// NOTE(review): ResourceRequirements is not defined in this excerpt.
ResourceRequirements resources = 3;
// Free-form string key/value annotations carried with the request.
map<string, string> annotations = 4;
}
// BindResponse is the response message of the BindAllocation RPC.
message BindResponse {
// Whether the bind succeeded.
bool success = 1;
// NOTE(review): error_message and failure_reason are both plain strings; the
// intended split (presumably human-readable text vs. a machine-readable
// reason) is not documented here — confirm.
string error_message = 2;
string failure_reason = 3;
}Error Handling and Status Codes
| Status Code | Meaning | Example |
|---|---|---|
| 200 | Success | Resource retrieved/updated |
| 201 | Created | Allocation submitted |
| 202 | Accepted | Async operation started |
| 400 | Bad Request | Invalid resource specification |
| 401 | Unauthorized | Invalid/expired token |
| 403 | Forbidden | Quota exceeded, insufficient permissions |
| 404 | Not Found | Allocation/node doesn't exist |
| 409 | Conflict | Resource version conflict (OCC) |
| 422 | Unprocessable | Valid JSON but unsatisfiable constraints |
| 429 | Too Many Requests | Rate limit exceeded |
| 503 | Service Unavailable | Scheduler overloaded |
Rate Limiting Headers
HTTP/1.1 429 Too Many Requests
X-RateLimit-Limit: 1000
X-RateLimit-Remaining: 0
X-RateLimit-Reset: 1705312500
Retry-After: 30
{
"kind": "Status",
"code": 429,
"reason": "TooManyRequests",
"message": "Rate limit exceeded. Tenant 'ml-team' limited to 1000 req/min."
}
Client SDK Example (Go)
package main
import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	allocator "github.com/example/resource-allocator/sdk/go"
)
func main() {
client, _ := allocator.NewClient(allocator.Config{
Endpoint: "https://scheduler.internal:6443",
Token: os.Getenv("SCHEDULER_TOKEN"),
})
// Submit allocation with constraints
alloc, err := client.Allocations("ml-team").Create(context.Background(), &allocator.Allocation{
Name: "training-job-1",
Spec: allocator.AllocationSpec{
Resources: allocator.ResourceRequirements{
Requests: allocator.ResourceList{"cpu": "8", "memory": "32Gi", "nvidia.com/gpu": "4"},
Limits: allocator.ResourceList{"cpu": "16", "memory": "64Gi", "nvidia.com/gpu": "4"},
},
Priority: 100,
PriorityClass: "high-priority",
NodeSelector: map[string]string{"gpu-type": "nvidia-a100"},
},
})
// Watch for status changes
watcher, _ := client.Allocations("ml-team").Watch(context.Background(),
allocator.WatchOptions{FieldSelector: "metadata.name=training-job-1"})
for event := range watcher.ResultChan() {
alloc := event.Object.(*allocator.Allocation)
if alloc.Status.Phase == "Running" {
fmt.Printf("Allocated on node: %s\n", alloc.Status.NodeName)
break
}
}
// Release when done
client.Allocations("ml-team").Delete(context.Background(), "training-job-1",
allocator.DeleteOptions{GracePeriodSeconds: 30})
}This API design provides a complete interface for resource allocation, from simple single-resource requests to complex gang-scheduled distributed training jobs with topology constraints.