Resource Allocation Service - API Design

Overview

The API layer provides the interface between tenants, administrators, and the resource allocation engine. It follows REST conventions for CRUD operations and uses gRPC for high-performance internal communication (scheduler-to-node). The design is inspired by the Kubernetes API server pattern: declarative desired state, watch-based notifications, and optimistic concurrency via resource versions.

Resource Request API

Submit Allocation Request

POST /api/v1/namespaces/{namespace}/allocations
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "labels": {
      "app": "model-training",
      "team": "ml-platform",
      "cost-center": "CC-1234"
    },
    "annotations": {
      "scheduler.example.com/gang-size": "4",
      "scheduler.example.com/max-wait": "300s"
    }
  },
  "spec": {
    "resources": {
      "requests": {
        "cpu": "8000m",
        "memory": "32Gi",
        "nvidia.com/gpu": "4"
      },
      "limits": {
        "cpu": "16000m",
        "memory": "64Gi",
        "nvidia.com/gpu": "4"
      }
    },
    "priority": 100,
    "priorityClass": "high-priority",
    "preemptionPolicy": "PreemptLowerPriority",
    "scheduling": {
      "nodeSelector": {
        "gpu-type": "nvidia-a100",
        "zone": "us-east-1a"
      },
      "affinity": {
        "nodeAffinity": {
          "requiredDuringScheduling": {
            "nodeSelectorTerms": [
              {
                "matchExpressions": [
                  {"key": "gpu-memory", "operator": "Gte", "values": ["80"]}
                ]
              }
            ]
          },
          "preferredDuringScheduling": [
            {
              "weight": 50,
              "preference": {
                "matchExpressions": [
                  {"key": "network", "operator": "In", "values": ["rdma"]}
                ]
              }
            }
          ]
        },
        "podAntiAffinity": {
          "requiredDuringScheduling": [
            {
              "topologyKey": "kubernetes.io/hostname",
              "labelSelector": {"matchLabels": {"app": "model-training"}}
            }
          ]
        }
      },
      "tolerations": [
        {"key": "gpu-only", "operator": "Exists", "effect": "NoSchedule"}
      ],
      "topologySpreadConstraints": [
        {
          "maxSkew": 1,
          "topologyKey": "topology.kubernetes.io/zone",
          "whenUnsatisfiable": "DoNotSchedule"
        }
      ]
    },
    "ttl": "7200s",
    "gangScheduling": {
      "groupId": "training-gang-001",
      "minMembers": 4,
      "timeout": "300s"
    }
  }
}

Response 201 Created:
{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
    "resourceVersion": "1",
    "creationTimestamp": "2024-01-15T10:30:00Z"
  },
  "spec": { ... },
  "status": {
    "phase": "Pending",
    "conditions": [
      {
        "type": "Scheduled",
        "status": "False",
        "reason": "Unscheduled",
        "message": "Waiting for scheduler"
      }
    ],
    "queuePosition": 12
  }
}

Response 403 Forbidden (quota exceeded):
{
  "kind": "Status",
  "code": 403,
  "reason": "Forbidden",
  "message": "exceeded quota: gpu limit 100, requested 4, used 98"
}

Get Allocation Status

GET /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}

Response 200:
{
  "metadata": {
    "name": "ml-training-job-42",
    "namespace": "ml-team",
    "uid": "alloc-7f3a2b1c-4d5e-6f7a-8b9c-0d1e2f3a4b5c",
    "resourceVersion": "5"
  },
  "spec": { ... },
  "status": {
    "phase": "Running",
    "nodeId": "node-xyz789",
    "nodeName": "gpu-worker-042.us-east-1a",
    "startTime": "2024-01-15T10:30:05Z",
    "conditions": [
      {"type": "Scheduled", "status": "True", "lastTransitionTime": "2024-01-15T10:30:02Z"},
      {"type": "Ready", "status": "True", "lastTransitionTime": "2024-01-15T10:30:05Z"}
    ],
    "allocatedResources": {
      "cpu": "8000m",
      "memory": "32Gi",
      "nvidia.com/gpu": "4"
    }
  }
}

List Allocations with Filtering

GET /api/v1/namespaces/{namespace}/allocations?labelSelector=app%3Dmodel-training&fieldSelector=status.phase%3DRunning&limit=100&continue={continue-token}
Authorization: Bearer {token}

Response 200:
{
  "kind": "AllocationList",
  "metadata": {
    "resourceVersion": "48291",
    "continue": "eyJvZmZzZXQiOjEwMH0="
  },
  "items": [ ... ]
}

Watch Allocation Changes (Server-Sent Events)

GET /api/v1/namespaces/{namespace}/allocations?watch=true&resourceVersion=48291
Authorization: Bearer {token}
Accept: text/event-stream

Response 200 (streaming):
data: {"type":"MODIFIED","object":{"metadata":{"name":"job-1","resourceVersion":"48292"},"status":{"phase":"Running"}}}

data: {"type":"ADDED","object":{"metadata":{"name":"job-2","resourceVersion":"48293"},"status":{"phase":"Pending"}}}

data: {"type":"DELETED","object":{"metadata":{"name":"job-3","resourceVersion":"48294"}}}

Release API

Release Resources (Graceful)

DELETE /api/v1/namespaces/{namespace}/allocations/{name}
Authorization: Bearer {token}
Content-Type: application/json

{
  "gracePeriodSeconds": 30,
  "propagationPolicy": "Foreground"
}

Response 200:
{
  "metadata": {
    "name": "ml-training-job-42",
    "deletionTimestamp": "2024-01-15T12:30:00Z",
    "deletionGracePeriodSeconds": 30
  },
  "status": {
    "phase": "Terminating",
    "conditions": [
      {
        "type": "Terminating",
        "status": "True",
        "reason": "UserRequested",
        "message": "Graceful termination in progress"
      }
    ]
  }
}

Bulk Release

POST /api/v1/namespaces/{namespace}/allocations/bulk-release
Authorization: Bearer {token}
Content-Type: application/json

{
  "labelSelector": "batch-id=job-20240115",
  "gracePeriodSeconds": 60,
  "dryRun": false
}

Response 200:
{
  "released": 24,
  "failed": 0,
  "totalResourcesFreed": {
    "cpu": "192000m",
    "memory": "768Gi",
    "nvidia.com/gpu": "96"
  }
}

Quota Management API

Set Resource Quota

POST /api/v1/namespaces/{namespace}/resourcequotas
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "ml-team-quota",
    "namespace": "ml-team"
  },
  "spec": {
    "hard": {
      "cpu": "10000",
      "memory": "40Ti",
      "nvidia.com/gpu": "100",
      "pods": "5000",
      "persistentvolumeclaims": "200",
      "services": "100"
    },
    "soft": {
      "cpu": "8000",
      "memory": "32Ti",
      "nvidia.com/gpu": "80"
    },
    "scopes": ["NotTerminating"],
    "scopeSelector": {
      "matchExpressions": [
        {"scopeName": "PriorityClass", "operator": "In", "values": ["high-priority", "normal"]}
      ]
    }
  }
}

Response 201 Created:
{
  "metadata": { ... },
  "spec": { ... },
  "status": {
    "hard": {
      "cpu": "10000",
      "memory": "40Ti",
      "nvidia.com/gpu": "100"
    },
    "used": {
      "cpu": "6500",
      "memory": "26Ti",
      "nvidia.com/gpu": "72"
    }
  }
}

Get Quota Usage

GET /api/v1/namespaces/{namespace}/resourcequotas/{name}/status
Authorization: Bearer {token}

Response 200:
{
  "metadata": {"name": "ml-team-quota"},
  "status": {
    "hard": {"cpu": "10000", "memory": "40Ti", "nvidia.com/gpu": "100"},
    "used": {"cpu": "6500", "memory": "26Ti", "nvidia.com/gpu": "72"},
    "utilization": {"cpu": "65%", "memory": "65%", "nvidia.com/gpu": "72%"},
    "trend": {
      "cpu": "+5% over 24h",
      "nvidia.com/gpu": "+12% over 24h"
    }
  }
}

Update Quota Limits

PATCH /api/v1/namespaces/{namespace}/resourcequotas/{name}
Authorization: Bearer {token}
Content-Type: application/merge-patch+json

{
  "spec": {
    "hard": {
      "nvidia.com/gpu": "150"
    }
  }
}

Response 200:
{
  "metadata": {"name": "ml-team-quota", "resourceVersion": "12"},
  "spec": {
    "hard": {"nvidia.com/gpu": "150"}
  }
}

Scheduling API

Submit Batch Job

POST /api/v1/namespaces/{namespace}/jobs
Authorization: Bearer {token}
Content-Type: application/json

{
  "metadata": {
    "name": "distributed-training-v2",
    "namespace": "ml-team"
  },
  "spec": {
    "parallelism": 8,
    "completions": 8,
    "backoffLimit": 3,
    "activeDeadlineSeconds": 86400,
    "template": {
      "spec": {
        "resources": {
          "requests": {"cpu": "16000m", "memory": "64Gi", "nvidia.com/gpu": "8"},
          "limits": {"cpu": "32000m", "memory": "128Gi", "nvidia.com/gpu": "8"}
        },
        "scheduling": {
          "gangScheduling": {"minMembers": 8, "timeout": "600s"},
          "nodeSelector": {"gpu-type": "nvidia-a100"},
          "topologySpreadConstraints": [
            {"maxSkew": 1, "topologyKey": "rack", "whenUnsatisfiable": "DoNotSchedule"}
          ]
        }
      }
    },
    "priority": 200,
    "priorityClass": "batch-high",
    "queue": "gpu-training"
  }
}

Response 201 Created:
{
  "metadata": {"name": "distributed-training-v2", "uid": "job-abc123"},
  "status": {
    "phase": "Queued",
    "queuePosition": 3,
    "estimatedStartTime": "2024-01-15T11:00:00Z",
    "conditions": [
      {"type": "Queued", "status": "True", "reason": "WaitingForResources"}
    ]
  }
}

Get Job Status

GET /api/v1/namespaces/{namespace}/jobs/{name}/status
Authorization: Bearer {token}

Response 200:
{
  "status": {
    "phase": "Running",
    "active": 8,
    "succeeded": 0,
    "failed": 0,
    "startTime": "2024-01-15T11:02:00Z",
    "allocations": [
      {"name": "training-0", "node": "gpu-001", "phase": "Running"},
      {"name": "training-1", "node": "gpu-002", "phase": "Running"},
      ...
    ]
  }
}

Cancel Job

POST /api/v1/namespaces/{namespace}/jobs/{name}/cancel
Authorization: Bearer {token}
Content-Type: application/json

{
  "reason": "Superseded by newer model version",
  "gracePeriodSeconds": 60
}

Response 200:
{
  "status": {
    "phase": "Cancelling",
    "message": "Graceful cancellation initiated, 60s grace period"
  }
}

Priority Boost

POST /api/v1/namespaces/{namespace}/jobs/{name}/boost
Authorization: Bearer {token}
Content-Type: application/json

{
  "newPriority": 500,
  "reason": "Production incident requires immediate training",
  "duration": "3600s",
  "approvedBy": "admin@example.com"
}

Response 200:
{
  "previousPriority": 200,
  "newPriority": 500,
  "boostExpires": "2024-01-15T12:02:00Z",
  "newQueuePosition": 1
}

Node Management API

Register Node

POST /api/v1/nodes
Authorization: Bearer {node-bootstrap-token}
Content-Type: application/json

{
  "metadata": {
    "name": "gpu-worker-043",
    "labels": {
      "zone": "us-east-1a",
      "rack": "rack-12",
      "gpu-type": "nvidia-a100",
      "instance-type": "p4d.24xlarge"
    }
  },
  "spec": {
    "capacity": {
      "cpu": "96000m",
      "memory": "1152Gi",
      "nvidia.com/gpu": "8",
      "ephemeral-storage": "3800Gi",
      "pods": "110"
    },
    "allocatable": {
      "cpu": "94000m",
      "memory": "1100Gi",
      "nvidia.com/gpu": "8",
      "ephemeral-storage": "3500Gi",
      "pods": "110"
    },
    "taints": [
      {"key": "nvidia.com/gpu", "effect": "NoSchedule"}
    ]
  }
}

Response 201 Created:
{
  "metadata": {"name": "gpu-worker-043", "uid": "node-def456"},
  "spec": { ... },
  "status": {"phase": "Ready", "leaseId": "lease-xyz"}
}

Update Node Capacity (e.g., after hardware change)

PATCH /api/v1/nodes/{name}/status
Authorization: Bearer {node-token}
Content-Type: application/strategic-merge-patch+json

{
  "status": {
    "capacity": {"nvidia.com/gpu": "6"},
    "allocatable": {"nvidia.com/gpu": "6"},
    "conditions": [
      {"type": "Ready", "status": "True", "lastHeartbeatTime": "2024-01-15T10:35:00Z"}
    ]
  }
}

Drain Node (Graceful Evacuation)

POST /api/v1/nodes/{name}/drain
Authorization: Bearer {admin-token}
Content-Type: application/json

{
  "gracePeriodSeconds": 300,
  "deleteEmptyDirData": true,
  "ignoreDaemonSets": true,
  "force": false,
  "reason": "Scheduled maintenance window"
}

Response 200:
{
  "status": "Draining",
  "allocationsToEvict": 23,
  "estimatedCompletionTime": "2024-01-15T10:40:00Z",
  "progress": {
    "evicted": 0,
    "pending": 23,
    "failed": 0
  }
}

Cordon Node (Prevent New Scheduling)

PATCH /api/v1/nodes/{name}
Authorization: Bearer {admin-token}
Content-Type: application/merge-patch+json

{
  "spec": {
    "unschedulable": true
  }
}

Monitoring API

Cluster Utilization

GET /api/v1/metrics/cluster/utilization
Authorization: Bearer {token}

Response 200:
{
  "timestamp": "2024-01-15T10:35:00Z",
  "cluster": {
    "nodes": {"total": 5000, "ready": 4987, "notReady": 8, "cordoned": 5},
    "cpu": {
      "capacity": "320000000m",
      "allocatable": "310000000m",
      "requested": "248000000m",
      "used": "186000000m",
      "utilization": 0.60
    },
    "memory": {
      "capacity": "1280Ti",
      "allocatable": "1240Ti",
      "requested": "992Ti",
      "used": "868Ti",
      "utilization": 0.70
    },
    "gpu": {
      "capacity": 2000,
      "allocatable": 2000,
      "requested": 1680,
      "used": 1520,
      "utilization": 0.76
    }
  },
  "scheduling": {
    "pendingAllocations": 142,
    "schedulingRate": "487/sec",
    "avgSchedulingLatencyMs": 12,
    "p99SchedulingLatencyMs": 85,
    "preemptionsLast1h": 23
  }
}

Per-Tenant Usage

GET /api/v1/metrics/tenants/{tenant_id}/usage?period=24h
Authorization: Bearer {token}

Response 200:
{
  "tenant": "ml-team",
  "period": "24h",
  "resources": {
    "cpu": {
      "quota": "10000",
      "currentUsage": "6500",
      "peakUsage": "8200",
      "avgUsage": "5800",
      "utilizationOfQuota": 0.65
    },
    "nvidia.com/gpu": {
      "quota": "100",
      "currentUsage": "72",
      "peakUsage": "95",
      "avgUsage": "68",
      "utilizationOfQuota": 0.72
    }
  },
  "cost": {
    "period": "24h",
    "total": "$12,450.00",
    "breakdown": {
      "cpu": "$3,200.00",
      "memory": "$2,100.00",
      "gpu": "$6,800.00",
      "storage": "$350.00"
    }
  }
}

Queue Depth and Wait Times

GET /api/v1/metrics/queues
Authorization: Bearer {token}

Response 200:
{
  "queues": [
    {
      "name": "default",
      "depth": 45,
      "oldestItem": "2024-01-15T10:28:00Z",
      "avgWaitTime": "12s",
      "p99WaitTime": "120s"
    },
    {
      "name": "gpu-training",
      "depth": 23,
      "oldestItem": "2024-01-15T09:45:00Z",
      "avgWaitTime": "300s",
      "p99WaitTime": "1800s"
    }
  ]
}

Preemption API

Trigger Preemption (Admin)

POST /api/v1/namespaces/{namespace}/preemptions
Authorization: Bearer {admin-token}
Content-Type: application/json

{
  "targetAllocation": "low-priority-batch-job",
  "reason": "Emergency capacity needed for production incident",
  "gracePeriodSeconds": 30,
  "initiatedBy": "oncall-engineer@example.com"
}

Response 201 Created:
{
  "preemptionId": "preempt-abc123",
  "targetAllocation": "low-priority-batch-job",
  "status": "Initiated",
  "gracePeriodEnds": "2024-01-15T10:35:30Z",
  "resourcesFreed": {
    "cpu": "16000m",
    "memory": "64Gi",
    "nvidia.com/gpu": "4"
  }
}

Get Eviction Notice (Node Agent Watches)

GET /api/v1/nodes/{node_name}/evictions?watch=true
Authorization: Bearer {node-token}

Response 200 (streaming):
{
  "type": "EVICTION",
  "allocation": "low-priority-batch-job",
  "reason": "Preempted by higher priority job",
  "gracePeriodSeconds": 30,
  "deadline": "2024-01-15T10:35:30Z"
}

gRPC Internal API (Scheduler to Node)

syntax = "proto3";
package scheduler.v1;

// NodeAgent is the internal gRPC service for scheduler-to-node communication.
// It exposes one streaming RPC (Heartbeat) and three unary RPCs.
// NOTE(review): EvictRequest, EvictResponse, SnapshotRequest and
// ResourceSnapshot are not defined in this excerpt; confirm they are declared
// elsewhere in the full .proto.
service NodeAgent {
  // Heartbeat is a bidirectional stream: HeartbeatRequest messages flow in one
  // direction and HeartbeatResponse messages flow back on the same RPC.
  rpc Heartbeat(stream HeartbeatRequest) returns (stream HeartbeatResponse);

  // BindAllocation binds an allocation to a node (unary request/response).
  rpc BindAllocation(BindRequest) returns (BindResponse);

  // EvictAllocation evicts an allocation from a node (unary request/response).
  rpc EvictAllocation(EvictRequest) returns (EvictResponse);

  // GetResourceSnapshot returns a snapshot of a node's resources.
  rpc GetResourceSnapshot(SnapshotRequest) returns (ResourceSnapshot);
}

// HeartbeatRequest is the message type streamed into NodeAgent.Heartbeat.
// NOTE(review): ResourceUsage, PodStatus and NodeCondition are not defined in
// this excerpt; confirm they are declared elsewhere.
message HeartbeatRequest {
  // Node identifier.
  string node_id = 1;
  // NOTE(review): unit and epoch are not specified here (seconds vs.
  // milliseconds); confirm and document.
  int64 timestamp = 2;
  // Resource usage reported with this heartbeat.
  ResourceUsage current_usage = 3;
  // Zero or more pod status entries.
  repeated PodStatus pod_statuses = 4;
  // Zero or more node condition entries.
  repeated NodeCondition conditions = 5;
}

// HeartbeatResponse is the message type streamed back from NodeAgent.Heartbeat.
// NOTE(review): SchedulingDirective is not defined in this excerpt.
message HeartbeatResponse {
  // Zero or more scheduling directives carried with this response.
  repeated SchedulingDirective directives = 1;
  // Lease duration, in seconds (per the field name).
  int64 lease_duration_seconds = 2;
}

// BindRequest is the request message for NodeAgent.BindAllocation.
// NOTE(review): ResourceRequirements is not defined in this excerpt.
message BindRequest {
  // Identifier of the allocation being bound.
  string allocation_id = 1;
  // Identifier of the node the allocation is bound to.
  string node_id = 2;
  // Resource requirements for the allocation.
  ResourceRequirements resources = 3;
  // Free-form string key/value annotations.
  map<string, string> annotations = 4;
}

// BindResponse is the response message for NodeAgent.BindAllocation.
message BindResponse {
  // Whether the bind succeeded.
  bool success = 1;
  // NOTE(review): the distinction between error_message and failure_reason is
  // not stated here (presumably free text vs. a machine-readable code);
  // confirm and document.
  string error_message = 2;
  string failure_reason = 3;
}

Error Handling and Status Codes

Status Code	Meaning	Example
200	Success	Resource retrieved/updated
201	Created	Allocation submitted
202	Accepted	Async operation started
400	Bad Request	Invalid resource specification
401	Unauthorized	Invalid/expired token
403	Forbidden	Quota exceeded, insufficient permissions
404	Not Found	Allocation/node doesn't exist
409	Conflict	Resource version conflict (OCC)
422	Unprocessable Entity	Valid JSON but unsatisfiable constraints
429	Too Many Requests	Rate limit exceeded
503	Service Unavailable	Scheduler overloaded

Rate Limiting Headers

HTTP/1.1 429 Too Many Requests
X-RateLimit-Limit: 1000
X-RateLimit-Remaining: 0
X-RateLimit-Reset: 1705312500
Retry-After: 30

{
  "kind": "Status",
  "code": 429,
  "reason": "TooManyRequests",
  "message": "Rate limit exceeded. Tenant 'ml-team' limited to 1000 req/min."
}

Client SDK Example (Go)

package main

import (
    "context"
    "fmt"
    "log"
    "os"
    "time"

    allocator "github.com/example/resource-allocator/sdk/go"
)

// main runs the example and exits with a non-zero status if any step fails.
func main() {
    if err := run(); err != nil {
        log.Fatal(err)
    }
}

// run submits an allocation in the "ml-team" namespace, waits for it to be
// reported as Running, and then releases it.
//
// It returns the first error encountered. The allocation is released even
// when waiting fails, so a failed wait does not leave resources held.
func run() error {
    client, err := allocator.NewClient(allocator.Config{
        Endpoint: "https://scheduler.internal:6443",
        Token:    os.Getenv("SCHEDULER_TOKEN"),
    })
    if err != nil {
        return fmt.Errorf("creating client: %w", err)
    }

    allocations := client.Allocations("ml-team")

    // Submit allocation with constraints
    _, err = allocations.Create(context.Background(), &allocator.Allocation{
        Name: "training-job-1",
        Spec: allocator.AllocationSpec{
            Resources: allocator.ResourceRequirements{
                Requests: allocator.ResourceList{"cpu": "8", "memory": "32Gi", "nvidia.com/gpu": "4"},
                Limits:   allocator.ResourceList{"cpu": "16", "memory": "64Gi", "nvidia.com/gpu": "4"},
            },
            Priority:      100,
            PriorityClass: "high-priority",
            NodeSelector:  map[string]string{"gpu-type": "nvidia-a100"},
        },
    })
    if err != nil {
        return fmt.Errorf("submitting allocation: %w", err)
    }

    // Watch for status changes
    waitErr := func() error {
        // Bound the wait so the example cannot block forever; 300s mirrors the
        // max-wait used in the REST request example.
        // NOTE(review): assumes the SDK closes ResultChan once ctx is done;
        // confirm against the SDK.
        ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second)
        defer cancel()

        watcher, err := allocations.Watch(ctx,
            allocator.WatchOptions{FieldSelector: "metadata.name=training-job-1"})
        if err != nil {
            return fmt.Errorf("watching allocation: %w", err)
        }

        for event := range watcher.ResultChan() {
            // Skip events that do not carry an Allocation rather than panicking
            // on a failed type assertion.
            alloc, ok := event.Object.(*allocator.Allocation)
            if !ok {
                continue
            }
            if alloc.Status.Phase == "Running" {
                fmt.Printf("Allocated on node: %s\n", alloc.Status.NodeName)
                return nil
            }
        }
        return fmt.Errorf("watch ended before allocation reached Running")
    }()

    // Release when done
    releaseErr := allocations.Delete(context.Background(), "training-job-1",
        allocator.DeleteOptions{GracePeriodSeconds: 30})

    if waitErr != nil {
        // The wait error is the primary failure; still surface a release
        // failure so it is not lost.
        if releaseErr != nil {
            log.Printf("releasing allocation: %v", releaseErr)
        }
        return waitErr
    }
    if releaseErr != nil {
        return fmt.Errorf("releasing allocation: %w", releaseErr)
    }
    return nil
}

This API design provides a complete interface for resource allocation, from simple single-resource requests to complex gang-scheduled distributed training jobs with topology constraints.