package metrics

kubernetes – k8s.io/kubernetes/pkg/kubelet/metrics Index | Files | Directories

import "k8s.io/kubernetes/pkg/kubelet/metrics"

Index ¶

Constants
Variables
func GetGather() metrics.Gatherer
func GetImageSizeBucket(sizeInBytes uint64) string
func Register(collectors ...metrics.StableCollector)
func SetNodeName(name types.NodeName)
func SinceInSeconds(start time.Time) float64

Constants ¶

// Metric name keys, subsystem names, label names and label values used by the
// kubelet metrics declared in this package.
const (
	// Subsystem names, label names, and general kubelet metric name keys.
	FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
	KubeletSubsystem                   = "kubelet"
	DRASubsystem                       = "dra"
	NodeNameKey                        = "node_name"
	NodeLabelKey                       = "node"
	NodeStartupPreKubeletKey           = "node_startup_pre_kubelet_duration_seconds"
	NodeStartupPreRegistrationKey      = "node_startup_pre_registration_duration_seconds"
	NodeStartupRegistrationKey         = "node_startup_registration_duration_seconds"
	NodeStartupPostRegistrationKey     = "node_startup_post_registration_duration_seconds"
	NodeStartupKey                     = "node_startup_duration_seconds"
	PodWorkerDurationKey               = "pod_worker_duration_seconds"
	PodStartDurationKey                = "pod_start_duration_seconds"
	PodStartSLIDurationKey             = "pod_start_sli_duration_seconds"
	PodStartTotalDurationKey           = "pod_start_total_duration_seconds"
	CgroupManagerOperationsKey         = "cgroup_manager_duration_seconds"
	PodWorkerStartDurationKey          = "pod_worker_start_duration_seconds"
	PodStatusSyncDurationKey           = "pod_status_sync_duration_seconds"
	PLEGRelistDurationKey              = "pleg_relist_duration_seconds"
	PLEGDiscardEventsKey               = "pleg_discard_events"
	PLEGRelistIntervalKey              = "pleg_relist_interval_seconds"
	PLEGLastSeenKey                    = "pleg_last_seen_seconds"
	EventedPLEGConnErrKey              = "evented_pleg_connection_error_count"
	EventedPLEGConnKey                 = "evented_pleg_connection_success_count"
	EventedPLEGConnLatencyKey          = "evented_pleg_connection_latency_seconds"
	EvictionsKey                       = "evictions"
	EvictionStatsAgeKey                = "eviction_stats_age_seconds"
	PreemptionsKey                     = "preemptions"
	VolumeStatsCapacityBytesKey        = "volume_stats_capacity_bytes"
	VolumeStatsAvailableBytesKey       = "volume_stats_available_bytes"
	VolumeStatsUsedBytesKey            = "volume_stats_used_bytes"
	VolumeStatsInodesKey               = "volume_stats_inodes"
	VolumeStatsInodesFreeKey           = "volume_stats_inodes_free"
	VolumeStatsInodesUsedKey           = "volume_stats_inodes_used"
	VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal"
	RunningPodsKey                     = "running_pods"
	RunningContainersKey               = "running_containers"
	DesiredPodCountKey                 = "desired_pods"
	ActivePodCountKey                  = "active_pods"
	MirrorPodCountKey                  = "mirror_pods"
	WorkingPodCountKey                 = "working_pods"
	OrphanedRuntimePodTotalKey         = "orphaned_runtime_pods_total"
	RestartedPodTotalKey               = "restarted_pods_total"
	ImagePullDurationKey               = "image_pull_duration_seconds"
	CgroupVersionKey                   = "cgroup_version"

	// Metrics keys of remote runtime operations
	RuntimeOperationsKey         = "runtime_operations_total"
	RuntimeOperationsDurationKey = "runtime_operations_duration_seconds"
	RuntimeOperationsErrorsKey   = "runtime_operations_errors_total"
	// Metrics keys of device plugin operations
	DevicePluginRegistrationCountKey  = "device_plugin_registration_total"
	DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds"
	// Metrics keys of pod resources operations
	PodResourcesEndpointRequestsTotalKey          = "pod_resources_endpoint_requests_total"
	PodResourcesEndpointRequestsListKey           = "pod_resources_endpoint_requests_list"
	PodResourcesEndpointRequestsGetAllocatableKey = "pod_resources_endpoint_requests_get_allocatable"
	PodResourcesEndpointErrorsListKey             = "pod_resources_endpoint_errors_list"
	PodResourcesEndpointErrorsGetAllocatableKey   = "pod_resources_endpoint_errors_get_allocatable"
	PodResourcesEndpointRequestsGetKey            = "pod_resources_endpoint_requests_get"
	PodResourcesEndpointErrorsGetKey              = "pod_resources_endpoint_errors_get"

	// Metrics keys for RuntimeClass
	RunPodSandboxDurationKey = "run_podsandbox_duration_seconds"
	RunPodSandboxErrorsKey   = "run_podsandbox_errors_total"

	// Metrics to keep track of total number of Pods and Containers started
	StartedPodsTotalKey             = "started_pods_total"
	StartedPodsErrorsTotalKey       = "started_pods_errors_total"
	StartedContainersTotalKey       = "started_containers_total"
	StartedContainersErrorsTotalKey = "started_containers_errors_total"

	// Metrics to track HostProcess container usage by this kubelet
	StartedHostProcessContainersTotalKey       = "started_host_process_containers_total"
	StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total"

	// Metrics to track ephemeral container usage by this kubelet
	ManagedEphemeralContainersKey = "managed_ephemeral_containers"

	// Metrics to track the CPU manager behavior
	CPUManagerPinningRequestsTotalKey         = "cpu_manager_pinning_requests_total"
	CPUManagerPinningErrorsTotalKey           = "cpu_manager_pinning_errors_total"
	CPUManagerSharedPoolSizeMilliCoresKey     = "cpu_manager_shared_pool_size_millicores"
	CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"
	CPUManagerAllocationPerNUMAKey            = "cpu_manager_allocation_per_numa"

	// Metrics to track the Memory manager behavior
	MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
	MemoryManagerPinningErrorsTotalKey   = "memory_manager_pinning_errors_total"

	// Metrics to track the Topology manager behavior
	TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
	TopologyManagerAdmissionErrorsTotalKey   = "topology_manager_admission_errors_total"
	TopologyManagerAdmissionDurationKey      = "topology_manager_admission_duration_ms"

	// Metric for tracking garbage collected images
	ImageGarbageCollectedTotalKey = "image_garbage_collected_total"

	// Metric names and label names for tracking alignment of compute resources
	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
	ContainerAlignedComputeResourcesFailureNameKey   = "container_aligned_compute_resources_failure_count"
	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"

	// Metric keys for DRA operations
	DRAOperationsDurationKey     = "operations_duration_seconds"
	DRAGRPCOperationsDurationKey = "grpc_operations_duration_seconds"

	// Values used in metric labels
	Container          = "container"
	InitContainer      = "init_container"
	EphemeralContainer = "ephemeral_container"

	// Alignment scopes (presumably values for the "scope" label; confirm against callers).
	AlignScopePod       = "pod"
	AlignScopeContainer = "container"

	// Alignment boundaries. AlignedNUMANode is also used as the label name of
	// CPUManagerAllocationPerNUMA; the others are presumably values for the
	// "boundary" label (confirm against callers).
	AlignedPhysicalCPU = "physical_cpu"
	AlignedNUMANode    = "numa_node"
	AlignedUncoreCache = "uncore_cache"

	// Metrics to track kubelet admission rejections.
	AdmissionRejectionsTotalKey = "admission_rejections_total"

	// Image Volume metrics
	ImageVolumeRequestedTotalKey      = "image_volume_requested_total"
	ImageVolumeMountedSucceedTotalKey = "image_volume_mounted_succeed_total"
	ImageVolumeMountedErrorsTotalKey  = "image_volume_mounted_errors_total"
)

This const block defines the metric names for the kubelet metrics.

Variables ¶

var (
	// NodeName is a gauge vector, labeled by NodeLabelKey, that reports the
	// node's name; its help text states the value is always 1.
	NodeName = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeNameKey,
			Help:           "The node's name. The count is always 1.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{NodeLabelKey},
	)

	// ContainersPerPodCount is a histogram of the number of containers per
	// pod, bucketed by metrics.ExponentialBuckets(1, 2, 5).
	ContainersPerPodCount = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "containers_per_pod_count",
			Help:           "The number of containers per pod.",
			Buckets:        metrics.ExponentialBuckets(1, 2, 5),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodWorkerDuration is a histogram vector of the seconds taken to sync a
	// single pod, labeled by operation_type (create, update, or sync per the
	// help text).
	PodWorkerDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodWorkerDurationKey,
			Help:           "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// PodStartDuration is a histogram of the seconds from the kubelet first
	// seeing a pod to the pod starting to run, bucketed by
	// podStartupDurationBuckets.
	PodStartDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartDurationKey,
			Help:           "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run",
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodStartSLIDuration is a histogram vector with an empty label set, named
	// by PodStartSLIDurationKey and bucketed by podStartupDurationBuckets.
	PodStartSLIDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartSLIDurationKey,
			Help:           "" /* 203 byte string literal not displayed */,
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{},
	)

	// PodStartTotalDuration is a histogram vector with an empty label set,
	// named by PodStartTotalDurationKey and bucketed by
	// podStartupDurationBuckets.
	PodStartTotalDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartTotalDurationKey,
			Help:           "" /* 218 byte string literal not displayed */,
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{},
	)

	// FirstNetworkPodStartSLIDuration is a gauge named by
	// FirstNetworkPodStartSLIDurationKey; it is declared with stability level
	// metrics.INTERNAL.
	FirstNetworkPodStartSLIDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           FirstNetworkPodStartSLIDurationKey,
			Help:           "" /* 219 byte string literal not displayed */,
			StabilityLevel: metrics.INTERNAL,
		},
	)

	// CgroupManagerDuration is a histogram vector of the seconds spent in
	// cgroup manager operations, labeled by operation_type.
	CgroupManagerDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CgroupManagerOperationsKey,
			Help:           "Duration in seconds for cgroup manager operations. Broken down by method.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// PodWorkerStartDuration is a histogram of the seconds from the kubelet
	// seeing a pod to starting a worker for it.
	PodWorkerStartDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodWorkerStartDurationKey,
			Help:           "Duration in seconds from kubelet seeing a pod to starting a worker.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodStatusSyncDuration is a histogram named by PodStatusSyncDurationKey
	// with explicitly listed bucket bounds ranging from 0.010 to 60.
	PodStatusSyncDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStatusSyncDurationKey,
			Help:           "" /* 214 byte string literal not displayed */,
			Buckets:        []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGRelistDuration is a histogram of the seconds taken to relist pods
	// in PLEG.
	PLEGRelistDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGRelistDurationKey,
			Help:           "Duration in seconds for relisting pods in PLEG.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGDiscardEvents is a counter of discard events in PLEG.
	PLEGDiscardEvents = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGDiscardEventsKey,
			Help:           "The number of discard events in PLEG.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGRelistInterval is a histogram of the interval in seconds between
	// relists in PLEG.
	PLEGRelistInterval = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGRelistIntervalKey,
			Help:           "Interval in seconds between relisting in PLEG.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGLastSeen is a gauge holding the timestamp, in seconds, at which PLEG
	// was last seen active.
	PLEGLastSeen = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGLastSeenKey,
			Help:           "Timestamp in seconds when PLEG was last seen active.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConnErr is a counter of errors encountered while establishing
	// the streaming connection with the CRI runtime.
	EventedPLEGConnErr = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnErrKey,
			Help:           "The number of errors encountered during the establishment of streaming connection with the CRI runtime.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConn is a counter of the times a streaming client was
	// obtained to receive CRI events.
	EventedPLEGConn = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnKey,
			Help:           "The number of times a streaming client was obtained to receive CRI Events.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConnLatency is a histogram of the latency, in seconds, of the
	// streaming connection with the CRI runtime.
	EventedPLEGConnLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnLatencyKey,
			Help:           "The latency of streaming connection with the CRI runtime, measured in seconds.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RuntimeOperations is a counter vector of runtime operations, labeled by
	// operation_type.
	RuntimeOperations = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsKey,
			Help:           "Cumulative number of runtime operations by operation type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// RuntimeOperationsDuration is a histogram vector of the seconds taken by
	// runtime operations, labeled by operation_type and bucketed by
	// metrics.ExponentialBuckets(.005, 2.5, 14).
	RuntimeOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsDurationKey,
			Help:           "Duration in seconds of runtime operations. Broken down by operation type.",
			Buckets:        metrics.ExponentialBuckets(.005, 2.5, 14),
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// RuntimeOperationsErrors is a counter vector of runtime operation errors,
	// labeled by operation_type.
	RuntimeOperationsErrors = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsErrorsKey,
			Help:           "Cumulative number of runtime operation errors by operation type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// Evictions is a counter vector of pod evictions, labeled by
	// eviction_signal.
	Evictions = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EvictionsKey,
			Help:           "Cumulative number of pod evictions by eviction signal",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"eviction_signal"},
	)

	// EvictionStatsAge is a histogram vector of the time between stats being
	// collected and a pod being evicted based on those stats, labeled by
	// eviction_signal.
	EvictionStatsAge = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EvictionStatsAgeKey,
			Help:           "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"eviction_signal"},
	)

	// Preemptions is a counter vector of pod preemptions, labeled by
	// preemption_signal.
	Preemptions = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PreemptionsKey,
			Help:           "Cumulative number of pod preemptions by preemption resource",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"preemption_signal"},
	)

	// DevicePluginRegistrationCount is a counter vector of device plugin
	// registrations, labeled by resource_name.
	DevicePluginRegistrationCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DevicePluginRegistrationCountKey,
			Help:           "Cumulative number of device plugin registrations. Broken down by resource name.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"resource_name"},
	)

	// DevicePluginAllocationDuration is a histogram vector of the seconds
	// taken to serve a device plugin Allocation request, labeled by
	// resource_name.
	DevicePluginAllocationDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DevicePluginAllocationDurationKey,
			Help:           "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"resource_name"},
	)

	// PodResourcesEndpointRequestsTotalCount is a counter vector of requests
	// to the PodResource endpoint, labeled by server_api_version.
	PodResourcesEndpointRequestsTotalCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsTotalKey,
			Help:           "Cumulative number of requests to the PodResource endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsListCount is a counter vector of requests to
	// the PodResource List endpoint, labeled by server_api_version.
	PodResourcesEndpointRequestsListCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsListKey,
			Help:           "Number of requests to the PodResource List endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsGetAllocatableCount is a counter vector of
	// requests to the PodResource GetAllocatableResources endpoint, labeled by
	// server_api_version.
	PodResourcesEndpointRequestsGetAllocatableCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsGetAllocatableKey,
			Help:           "Number of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsListCount is a counter vector of requests to
	// the PodResource List endpoint that returned an error, labeled by
	// server_api_version.
	PodResourcesEndpointErrorsListCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsListKey,
			Help:           "Number of requests to the PodResource List endpoint which returned error. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsGetAllocatableCount is a counter vector named
	// by PodResourcesEndpointErrorsGetAllocatableKey and labeled by
	// server_api_version.
	PodResourcesEndpointErrorsGetAllocatableCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsGetAllocatableKey,
			Help:           "" /* 129 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsGetCount is a counter vector of requests to
	// the PodResource Get endpoint, labeled by server_api_version.
	PodResourcesEndpointRequestsGetCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsGetKey,
			Help:           "Number of requests to the PodResource Get endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsGetCount is a counter vector of requests to
	// the PodResource Get endpoint that returned an error, labeled by
	// server_api_version.
	PodResourcesEndpointErrorsGetCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsGetKey,
			Help:           "Number of requests to the PodResource Get endpoint which returned error. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// RunPodSandboxDuration is a histogram vector of the seconds taken by
	// run_podsandbox operations, labeled by runtime_handler.
	RunPodSandboxDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem: KubeletSubsystem,
			Name:      RunPodSandboxDurationKey,
			Help:      "Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.",

			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"runtime_handler"},
	)

	// RunPodSandboxErrors is a counter vector of run_podsandbox operation
	// errors, labeled by runtime_handler.
	RunPodSandboxErrors = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunPodSandboxErrorsKey,
			Help:           "Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"runtime_handler"},
	)

	// RunningPodCount is a gauge of the number of pods that have a running
	// pod sandbox.
	RunningPodCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunningPodsKey,
			Help:           "Number of pods that have a running pod sandbox",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RunningContainerCount is a gauge vector of the number of containers
	// currently running, labeled by container_state.
	RunningContainerCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunningContainersKey,
			Help:           "Number of containers currently running",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_state"},
	)

	// DesiredPodCount is a gauge vector of the number of pods the kubelet is
	// being instructed to run. Per the help text, the static label is true
	// when the pod does not come from the apiserver.
	DesiredPodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DesiredPodCountKey,
			Help:           "The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// ActivePodCount is a gauge vector named by ActivePodCountKey and labeled
	// by static.
	ActivePodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ActivePodCountKey,
			Help:           "" /* 158 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// MirrorPodCount is a gauge of the number of mirror pods the kubelet will
	// try to create (one per admitted static pod, per the help text).
	MirrorPodCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MirrorPodCountKey,
			Help:           "The number of mirror pods the kubelet will try to create (one per admitted static pod)",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// WorkingPodCount is a gauge vector named by WorkingPodCountKey and
	// labeled by lifecycle, config, and static.
	WorkingPodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           WorkingPodCountKey,
			Help:           "" /* 324 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"lifecycle", "config", "static"},
	)

	// OrphanedRuntimePodTotal is a counter named by
	// OrphanedRuntimePodTotalKey.
	OrphanedRuntimePodTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           OrphanedRuntimePodTotalKey,
			Help:           "" /* 253 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RestartedPodTotal is a counter vector named by RestartedPodTotalKey and
	// labeled by static.
	RestartedPodTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RestartedPodTotalKey,
			Help:           "" /* 193 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// StartedPodsTotal is a counter of pods started.
	StartedPodsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedPodsTotalKey,
			Help:           "Cumulative number of pods started",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// StartedPodsErrorsTotal is a counter of errors encountered when starting
	// pods.
	StartedPodsErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedPodsErrorsTotalKey,
			Help:           "Cumulative number of errors when starting pods",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// StartedContainersTotal is a counter vector of containers started,
	// labeled by container_type.
	StartedContainersTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedContainersTotalKey,
			Help:           "Cumulative number of containers started",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type"},
	)

	// StartedContainersErrorsTotal is a counter vector of errors encountered
	// when starting containers, labeled by container_type and code.
	StartedContainersErrorsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedContainersErrorsTotalKey,
			Help:           "Cumulative number of errors when starting containers",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type", "code"},
	)

	// StartedHostProcessContainersTotal is a counter vector of hostprocess
	// containers started, labeled by container_type. Its help text states it
	// is only collected on Windows.
	StartedHostProcessContainersTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedHostProcessContainersTotalKey,
			Help:           "Cumulative number of hostprocess containers started. This metric will only be collected on Windows.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type"},
	)

	// StartedHostProcessContainersErrorsTotal is a counter vector of errors
	// encountered when starting hostprocess containers, labeled by
	// container_type and code. Its help text states it is only collected on
	// Windows.
	StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedHostProcessContainersErrorsTotalKey,
			Help:           "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type", "code"},
	)

	// ManagedEphemeralContainers is a gauge of the current number of ephemeral
	// containers in pods managed by this kubelet.
	ManagedEphemeralContainers = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ManagedEphemeralContainersKey,
			Help:           "Current number of ephemeral containers in pods managed by this kubelet.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// GracefulShutdownStartTime is a gauge holding the start time of the last
	// graceful shutdown, in seconds since the unix epoch.
	GracefulShutdownStartTime = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "graceful_shutdown_start_time_seconds",
			Help:           "Last graceful shutdown start time since unix epoch in seconds",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// GracefulShutdownEndTime is a gauge holding the end time of the last
	// graceful shutdown, in seconds since the unix epoch.
	GracefulShutdownEndTime = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "graceful_shutdown_end_time_seconds",
			Help:           "Last graceful shutdown end time since unix epoch in seconds",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// LifecycleHandlerHTTPFallbacks is a counter of the times lifecycle
	// handlers successfully fell back from https to http.
	LifecycleHandlerHTTPFallbacks = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "lifecycle_handler_http_fallbacks_total",
			Help:           "The number of times lifecycle handlers successfully fell back to http from https.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerPinningRequestsTotal is a counter of cpu core allocations that
	// required pinning.
	CPUManagerPinningRequestsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerPinningRequestsTotalKey,
			Help:           "The number of cpu core allocations which required pinning.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerPinningErrorsTotal is a counter of cpu core allocations that
	// required pinning and failed.
	CPUManagerPinningErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerPinningErrorsTotalKey,
			Help:           "The number of cpu core allocations which required pinning failed.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerSharedPoolSizeMilliCores is a gauge of the size, in
	// millicores, of the shared CPU pool for non-guaranteed QoS pods.
	CPUManagerSharedPoolSizeMilliCores = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerSharedPoolSizeMilliCoresKey,
			Help:           "The size of the shared CPU pool for non-guaranteed QoS pods, in millicores.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerExclusiveCPUsAllocationCount is a gauge of the total number of
	// CPUs exclusively allocated to containers running on this node.
	CPUManagerExclusiveCPUsAllocationCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerExclusiveCPUsAllocationCountKey,
			Help:           "The total number of CPUs exclusively allocated to containers running on this node",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerAllocationPerNUMA is a gauge vector of the number of CPUs
	// allocated per NUMA node, labeled by AlignedNUMANode.
	CPUManagerAllocationPerNUMA = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerAllocationPerNUMAKey,
			Help:           "Number of CPUs allocated per NUMA node",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{AlignedNUMANode},
	)

	// ContainerAlignedComputeResources is a counter vector of aligned compute
	// resources allocated to containers, labeled by the scope and boundary
	// label keys.
	ContainerAlignedComputeResources = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ContainerAlignedComputeResourcesNameKey,
			Help:           "Cumulative number of aligned compute resources allocated to containers by alignment type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
	)

	// ContainerAlignedComputeResourcesFailure is a counter vector of failures
	// to allocate aligned compute resources to containers, labeled by the
	// scope and boundary label keys.
	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ContainerAlignedComputeResourcesFailureNameKey,
			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
	)

	// MemoryManagerPinningRequestTotal is a counter of memory page allocations
	// that required pinning. Note that the variable name uses the singular
	// "Request" while its name key is MemoryManagerPinningRequestsTotalKey.
	MemoryManagerPinningRequestTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MemoryManagerPinningRequestsTotalKey,
			Help:           "The number of memory pages allocations which required pinning.",
			StabilityLevel: metrics.ALPHA,
		})

	// MemoryManagerPinningErrorsTotal is a counter of memory page allocations
	// that required pinning and failed.
	MemoryManagerPinningErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MemoryManagerPinningErrorsTotalKey,
			Help:           "The number of memory pages allocations which required pinning that failed.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionRequestsTotal is a counter of admission requests
	// where resources have to be aligned.
	TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionRequestsTotalKey,
			Help:           "The number of admission requests where resources have to be aligned.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionErrorsTotal is a counter of admission request
	// failures where resources could not be aligned.
	TopologyManagerAdmissionErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionErrorsTotalKey,
			Help:           "The number of admission request failures where resources could not be aligned.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionDuration is a histogram of the time, in
	// milliseconds per its help text, taken to serve a pod admission request;
	// buckets come from metrics.ExponentialBuckets(.05, 2, 15).
	TopologyManagerAdmissionDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionDurationKey,
			Help:           "Duration in milliseconds to serve a pod admission request.",
			Buckets:        metrics.ExponentialBuckets(.05, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// OrphanPodCleanedVolumes is a gauge of the total number of orphaned pods
	// whose volumes were cleaned in the last periodic sweep. Its name comes
	// from the unexported orphanPodCleanedVolumesKey.
	OrphanPodCleanedVolumes = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           orphanPodCleanedVolumesKey,
			Help:           "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// OrphanPodCleanedVolumesErrors is a gauge of the number of orphaned pods
	// whose volumes failed to be cleaned in the last periodic sweep. Its name
	// comes from the unexported orphanPodCleanedVolumesErrorsKey.
	OrphanPodCleanedVolumesErrors = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           orphanPodCleanedVolumesErrorsKey,
			Help:           "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPreKubeletDuration is a gauge of the seconds of node startup
	// spent before the kubelet starts.
	NodeStartupPreKubeletDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPreKubeletKey,
			Help:           "Duration in seconds of node startup before kubelet starts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPreRegistrationDuration is a gauge of the seconds of node
	// startup spent before registration.
	NodeStartupPreRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPreRegistrationKey,
			Help:           "Duration in seconds of node startup before registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupRegistrationDuration is a gauge of the seconds of node
	// startup spent during registration.
	NodeStartupRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupRegistrationKey,
			Help:           "Duration in seconds of node startup during registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPostRegistrationDuration is a gauge of the seconds of node
	// startup spent after registration.
	NodeStartupPostRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPostRegistrationKey,
			Help:           "Duration in seconds of node startup after registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupDuration is a gauge of the total seconds taken by node
	// startup.
	NodeStartupDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupKey,
			Help:           "Duration in seconds of node startup in total.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageGarbageCollectedTotal is a counter vector of images garbage
	// collected by the kubelet, labeled by reason (disk usage or image age
	// per the help text).
	ImageGarbageCollectedTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageGarbageCollectedTotalKey,
			Help:           "Total number of images garbage collected by the kubelet, whether through disk usage or image age.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"reason"},
	)

	// ImagePullDuration is a histogram vector of the seconds taken to pull an
	// image, labeled by image_size_in_bytes and bucketed by
	// imagePullDurationBuckets.
	ImagePullDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImagePullDurationKey,
			Help:           "Duration in seconds to pull an image.",
			Buckets:        imagePullDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"image_size_in_bytes"},
	)

	// LifecycleHandlerSleepTerminated is a counter of the times the lifecycle
	// sleep handler was terminated before it finished.
	LifecycleHandlerSleepTerminated = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "sleep_action_terminated_early_total",
			Help:           "The number of times lifecycle sleep handler got terminated before it finishes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CgroupVersion is a gauge reporting the cgroup version on the host.
	CgroupVersion = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CgroupVersionKey,
			Help:           "cgroup version on the hosts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// DRAOperationsDuration is a histogram vector under DRASubsystem, named by
	// DRAOperationsDurationKey, labeled by operation_name and is_error, and
	// bucketed by DRADurationBuckets.
	DRAOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      DRASubsystem,
			Name:           DRAOperationsDurationKey,
			Help:           "" /* 339 byte string literal not displayed */,
			Buckets:        DRADurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_name", "is_error"},
	)

	// DRAGRPCOperationsDuration is a histogram vector under DRASubsystem of
	// the seconds taken by DRA gRPC operations, labeled by driver_name,
	// method_name, and grpc_status_code, and bucketed by DRADurationBuckets.
	DRAGRPCOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      DRASubsystem,
			Name:           DRAGRPCOperationsDurationKey,
			Help:           "Duration in seconds of the DRA gRPC operations",
			Buckets:        DRADurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"driver_name", "method_name", "grpc_status_code"},
	)

	// AdmissionRejectionsTotal is a counter vector of pod admission rejections
	// by the kubelet, labeled by reason.
	// NOTE(review): the help text appears to be missing "of" after "number";
	// it is left unchanged here because it is emitted at runtime.
	AdmissionRejectionsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           AdmissionRejectionsTotalKey,
			Help:           "Cumulative number pod admission rejections by the Kubelet.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"reason"},
	)

	// ImageVolumeRequestedTotal is a counter of requested image volumes.
	ImageVolumeRequestedTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeRequestedTotalKey,
			Help:           "Number of requested image volumes.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageVolumeMountedSucceedTotal is a counter of successful image volume
	// mounts.
	ImageVolumeMountedSucceedTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeMountedSucceedTotalKey,
			Help:           "Number of successful image volume mounts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageVolumeMountedErrorsTotal is a counter of failed image volume
	// mounts.
	ImageVolumeMountedErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeMountedErrorsTotalKey,
			Help:           "Number of failed image volume mounts.",
			StabilityLevel: metrics.ALPHA,
		},
	)
)

// DRADurationBuckets holds the histogram bucket boundaries shared by the DRA
// duration metrics DRAOperationsDuration and DRAGRPCOperationsDuration.
// The upper bound of 40 is based on the 45sec max gRPC timeout value defined
// for the DRA gRPC calls in pkg/kubelet/cm/dra/plugin/registration.go.
var DRADurationBuckets = metrics.ExponentialBucketsRange(0.1, 40, 15)

Functions ¶

func GetGather ¶

func GetGather() metrics.Gatherer

GetGather returns the gatherer. It is used by test cases outside the current package.

func GetImageSizeBucket ¶

func GetImageSizeBucket(sizeInBytes uint64) string

func Register ¶

func Register(collectors ...metrics.StableCollector)

func SetNodeName ¶

func SetNodeName(name types.NodeName)

SetNodeName sets the NodeName Gauge to 1.

func SinceInSeconds ¶

func SinceInSeconds(start time.Time) float64

SinceInSeconds gets the time since the specified start in seconds.

Source Files ¶

metrics.go

Directories ¶

Path	Synopsis
pkg/kubelet/metrics/collectors

Version: v1.33.3 (latest)
Published: Jul 15, 2025
Platform: linux/amd64
Imports: 7 packages
Last checked: 4 minutes ago –

Tools for package owners.

?	: This menu
/	: Search site
f	: Jump to identifier
g then g	: Go to top of page
g then b	: Go to end of page
G	: Go to end of page
g then i	: Go to index
g then e	: Go to examples