kubernetesk8s.io/kubernetes/pkg/kubelet/metrics Index | Files | Directories

package metrics

import "k8s.io/kubernetes/pkg/kubelet/metrics"

Index

Constants

// Subsystem names, metric name keys, label names, and label values used by the
// kubelet metric collectors declared in this package.
const (
	// Subsystems, label names, and metric name keys for the core kubelet metrics.
	FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
	KubeletSubsystem                   = "kubelet"
	DRASubsystem                       = "dra"
	NodeNameKey                        = "node_name"
	NodeLabelKey                       = "node"
	NodeStartupPreKubeletKey           = "node_startup_pre_kubelet_duration_seconds"
	NodeStartupPreRegistrationKey      = "node_startup_pre_registration_duration_seconds"
	NodeStartupRegistrationKey         = "node_startup_registration_duration_seconds"
	NodeStartupPostRegistrationKey     = "node_startup_post_registration_duration_seconds"
	NodeStartupKey                     = "node_startup_duration_seconds"
	PodWorkerDurationKey               = "pod_worker_duration_seconds"
	PodStartDurationKey                = "pod_start_duration_seconds"
	PodStartSLIDurationKey             = "pod_start_sli_duration_seconds"
	PodStartTotalDurationKey           = "pod_start_total_duration_seconds"
	CgroupManagerOperationsKey         = "cgroup_manager_duration_seconds"
	PodWorkerStartDurationKey          = "pod_worker_start_duration_seconds"
	PodStatusSyncDurationKey           = "pod_status_sync_duration_seconds"
	PLEGRelistDurationKey              = "pleg_relist_duration_seconds"
	PLEGDiscardEventsKey               = "pleg_discard_events"
	PLEGRelistIntervalKey              = "pleg_relist_interval_seconds"
	PLEGLastSeenKey                    = "pleg_last_seen_seconds"
	EventedPLEGConnErrKey              = "evented_pleg_connection_error_count"
	EventedPLEGConnKey                 = "evented_pleg_connection_success_count"
	EventedPLEGConnLatencyKey          = "evented_pleg_connection_latency_seconds"
	EvictionsKey                       = "evictions"
	EvictionStatsAgeKey                = "eviction_stats_age_seconds"
	PreemptionsKey                     = "preemptions"
	VolumeStatsCapacityBytesKey        = "volume_stats_capacity_bytes"
	VolumeStatsAvailableBytesKey       = "volume_stats_available_bytes"
	VolumeStatsUsedBytesKey            = "volume_stats_used_bytes"
	VolumeStatsInodesKey               = "volume_stats_inodes"
	VolumeStatsInodesFreeKey           = "volume_stats_inodes_free"
	VolumeStatsInodesUsedKey           = "volume_stats_inodes_used"
	VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal"
	RunningPodsKey                     = "running_pods"
	RunningContainersKey               = "running_containers"
	DesiredPodCountKey                 = "desired_pods"
	ActivePodCountKey                  = "active_pods"
	MirrorPodCountKey                  = "mirror_pods"
	WorkingPodCountKey                 = "working_pods"
	OrphanedRuntimePodTotalKey         = "orphaned_runtime_pods_total"
	RestartedPodTotalKey               = "restarted_pods_total"
	ImagePullDurationKey               = "image_pull_duration_seconds"
	CgroupVersionKey                   = "cgroup_version"

	// Metrics keys of remote runtime operations
	RuntimeOperationsKey         = "runtime_operations_total"
	RuntimeOperationsDurationKey = "runtime_operations_duration_seconds"
	RuntimeOperationsErrorsKey   = "runtime_operations_errors_total"
	// Metrics keys of device plugin operations
	DevicePluginRegistrationCountKey  = "device_plugin_registration_total"
	DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds"
	// Metrics keys of pod resources operations
	PodResourcesEndpointRequestsTotalKey          = "pod_resources_endpoint_requests_total"
	PodResourcesEndpointRequestsListKey           = "pod_resources_endpoint_requests_list"
	PodResourcesEndpointRequestsGetAllocatableKey = "pod_resources_endpoint_requests_get_allocatable"
	PodResourcesEndpointErrorsListKey             = "pod_resources_endpoint_errors_list"
	PodResourcesEndpointErrorsGetAllocatableKey   = "pod_resources_endpoint_errors_get_allocatable"
	PodResourcesEndpointRequestsGetKey            = "pod_resources_endpoint_requests_get"
	PodResourcesEndpointErrorsGetKey              = "pod_resources_endpoint_errors_get"

	// Metrics keys for RuntimeClass
	RunPodSandboxDurationKey = "run_podsandbox_duration_seconds"
	RunPodSandboxErrorsKey   = "run_podsandbox_errors_total"

	// Metrics to keep track of total number of Pods and Containers started
	StartedPodsTotalKey             = "started_pods_total"
	StartedPodsErrorsTotalKey       = "started_pods_errors_total"
	StartedContainersTotalKey       = "started_containers_total"
	StartedContainersErrorsTotalKey = "started_containers_errors_total"

	// Metrics to track HostProcess container usage by this kubelet
	StartedHostProcessContainersTotalKey       = "started_host_process_containers_total"
	StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total"

	// Metrics to track ephemeral container usage by this kubelet
	ManagedEphemeralContainersKey = "managed_ephemeral_containers"

	// Metrics to track the CPU manager behavior
	CPUManagerPinningRequestsTotalKey         = "cpu_manager_pinning_requests_total"
	CPUManagerPinningErrorsTotalKey           = "cpu_manager_pinning_errors_total"
	CPUManagerSharedPoolSizeMilliCoresKey     = "cpu_manager_shared_pool_size_millicores"
	CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"
	CPUManagerAllocationPerNUMAKey            = "cpu_manager_allocation_per_numa"

	// Metrics to track the Memory manager behavior
	MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
	MemoryManagerPinningErrorsTotalKey   = "memory_manager_pinning_errors_total"

	// Metrics to track the Topology manager behavior
	TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
	TopologyManagerAdmissionErrorsTotalKey   = "topology_manager_admission_errors_total"
	TopologyManagerAdmissionDurationKey      = "topology_manager_admission_duration_ms"

	// Metric for tracking garbage collected images
	ImageGarbageCollectedTotalKey = "image_garbage_collected_total"

	// Metric names and label names for tracking alignment of compute resources
	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
	ContainerAlignedComputeResourcesFailureNameKey   = "container_aligned_compute_resources_failure_count"
	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"

	// Metric keys for DRA operations
	DRAOperationsDurationKey     = "operations_duration_seconds"
	DRAGRPCOperationsDurationKey = "grpc_operations_duration_seconds"

	// Values used in metric labels
	Container          = "container"
	InitContainer      = "init_container"
	EphemeralContainer = "ephemeral_container"

	// Alignment scope values.
	// NOTE(review): presumably the values of the "scope" label
	// (ContainerAlignedComputeResourcesScopeLabelKey) — confirm at the call sites.
	AlignScopePod       = "pod"
	AlignScopeContainer = "container"

	// Alignment boundary names. AlignedNUMANode is also used as the label name
	// of the CPUManagerAllocationPerNUMA gauge vector.
	// NOTE(review): presumably also the values of the "boundary" label
	// (ContainerAlignedComputeResourcesBoundaryLabelKey) — confirm at the call sites.
	AlignedPhysicalCPU = "physical_cpu"
	AlignedNUMANode    = "numa_node"
	AlignedUncoreCache = "uncore_cache"

	// Metrics to track kubelet admission rejections.
	AdmissionRejectionsTotalKey = "admission_rejections_total"

	// Image Volume metrics
	ImageVolumeRequestedTotalKey      = "image_volume_requested_total"
	ImageVolumeMountedSucceedTotalKey = "image_volume_mounted_succeed_total"
	ImageVolumeMountedErrorsTotalKey  = "image_volume_mounted_errors_total"
)

This const block defines the metric names for the kubelet metrics.

Variables

// Kubelet metric collectors. Most collectors take their Name from one of the
// key constants of this package; a few use a literal name or an unexported key.
// Every collector uses KubeletSubsystem except the two DRA histograms, which
// use DRASubsystem. Help literals shown as "" followed by a
// "string literal not displayed" marker are elided in this listing.
var (
	// NodeName is a gauge vector labeled by NodeLabelKey that exposes the
	// node's name; per its help text the value is always 1.
	NodeName = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeNameKey,
			Help:           "The node's name. The count is always 1.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{NodeLabelKey},
	)

	// ContainersPerPodCount is a histogram of the number of containers per pod.
	ContainersPerPodCount = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "containers_per_pod_count",
			Help:           "The number of containers per pod.",
			Buckets:        metrics.ExponentialBuckets(1, 2, 5),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodWorkerDuration is a histogram vector, labeled by operation_type, of
	// the seconds taken to sync a single pod.
	PodWorkerDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodWorkerDurationKey,
			Help:           "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// PodStartDuration is a histogram of the seconds from the kubelet first
	// seeing a pod to the pod starting to run.
	PodStartDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartDurationKey,
			Help:           "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run",
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodStartSLIDuration is a histogram vector with no labels, named by
	// PodStartSLIDurationKey. Its help text is not displayed in this listing.
	PodStartSLIDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartSLIDurationKey,
			Help:           "" /* 203 byte string literal not displayed */,
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{},
	)

	// PodStartTotalDuration is a histogram vector with no labels, named by
	// PodStartTotalDurationKey. Its help text is not displayed in this listing.
	PodStartTotalDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStartTotalDurationKey,
			Help:           "" /* 218 byte string literal not displayed */,
			Buckets:        podStartupDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{},
	)

	// FirstNetworkPodStartSLIDuration is a gauge named by
	// FirstNetworkPodStartSLIDurationKey. It is the only collector in this
	// block with INTERNAL stability. Its help text is not displayed in this
	// listing.
	FirstNetworkPodStartSLIDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           FirstNetworkPodStartSLIDurationKey,
			Help:           "" /* 219 byte string literal not displayed */,
			StabilityLevel: metrics.INTERNAL,
		},
	)

	// CgroupManagerDuration is a histogram vector, labeled by operation_type,
	// of the seconds taken by cgroup manager operations.
	CgroupManagerDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CgroupManagerOperationsKey,
			Help:           "Duration in seconds for cgroup manager operations. Broken down by method.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// PodWorkerStartDuration is a histogram of the seconds from the kubelet
	// seeing a pod to starting a worker.
	PodWorkerStartDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodWorkerStartDurationKey,
			Help:           "Duration in seconds from kubelet seeing a pod to starting a worker.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PodStatusSyncDuration is a histogram named by PodStatusSyncDurationKey
	// with explicit buckets from 0.010 to 60. Its help text is not displayed
	// in this listing.
	PodStatusSyncDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodStatusSyncDurationKey,
			Help:           "" /* 214 byte string literal not displayed */,
			Buckets:        []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGRelistDuration is a histogram of the seconds taken to relist pods
	// in PLEG.
	PLEGRelistDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGRelistDurationKey,
			Help:           "Duration in seconds for relisting pods in PLEG.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGDiscardEvents is a counter of discard events in PLEG.
	PLEGDiscardEvents = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGDiscardEventsKey,
			Help:           "The number of discard events in PLEG.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGRelistInterval is a histogram of the seconds between relists in PLEG.
	PLEGRelistInterval = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGRelistIntervalKey,
			Help:           "Interval in seconds between relisting in PLEG.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// PLEGLastSeen is a gauge holding the timestamp, in seconds, at which
	// PLEG was last seen active.
	PLEGLastSeen = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PLEGLastSeenKey,
			Help:           "Timestamp in seconds when PLEG was last seen active.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConnErr is a counter of errors encountered while establishing
	// the streaming connection with the CRI runtime.
	EventedPLEGConnErr = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnErrKey,
			Help:           "The number of errors encountered during the establishment of streaming connection with the CRI runtime.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConn is a counter of the times a streaming client was
	// obtained to receive CRI events.
	EventedPLEGConn = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnKey,
			Help:           "The number of times a streaming client was obtained to receive CRI Events.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EventedPLEGConnLatency is a histogram of the latency, in seconds, of the
	// streaming connection with the CRI runtime.
	EventedPLEGConnLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EventedPLEGConnLatencyKey,
			Help:           "The latency of streaming connection with the CRI runtime, measured in seconds.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RuntimeOperations is a counter vector, labeled by operation_type, of
	// runtime operations.
	RuntimeOperations = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsKey,
			Help:           "Cumulative number of runtime operations by operation type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// RuntimeOperationsDuration is a histogram vector, labeled by
	// operation_type, of the seconds taken by runtime operations.
	RuntimeOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsDurationKey,
			Help:           "Duration in seconds of runtime operations. Broken down by operation type.",
			Buckets:        metrics.ExponentialBuckets(.005, 2.5, 14),
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// RuntimeOperationsErrors is a counter vector, labeled by operation_type,
	// of runtime operation errors.
	RuntimeOperationsErrors = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RuntimeOperationsErrorsKey,
			Help:           "Cumulative number of runtime operation errors by operation type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_type"},
	)

	// Evictions is a counter vector, labeled by eviction_signal, of pod
	// evictions.
	Evictions = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EvictionsKey,
			Help:           "Cumulative number of pod evictions by eviction signal",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"eviction_signal"},
	)

	// EvictionStatsAge is a histogram vector, labeled by eviction_signal, of
	// the time between stats collection and the eviction based on those stats.
	EvictionStatsAge = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           EvictionStatsAgeKey,
			Help:           "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"eviction_signal"},
	)

	// Preemptions is a counter vector, labeled by preemption_signal, of pod
	// preemptions.
	Preemptions = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PreemptionsKey,
			Help:           "Cumulative number of pod preemptions by preemption resource",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"preemption_signal"},
	)

	// DevicePluginRegistrationCount is a counter vector, labeled by
	// resource_name, of device plugin registrations.
	DevicePluginRegistrationCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DevicePluginRegistrationCountKey,
			Help:           "Cumulative number of device plugin registrations. Broken down by resource name.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"resource_name"},
	)

	// DevicePluginAllocationDuration is a histogram vector, labeled by
	// resource_name, of the seconds taken to serve a device plugin Allocation
	// request.
	DevicePluginAllocationDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DevicePluginAllocationDurationKey,
			Help:           "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"resource_name"},
	)

	// PodResourcesEndpointRequestsTotalCount is a counter vector, labeled by
	// server_api_version, of requests to the PodResource endpoint.
	PodResourcesEndpointRequestsTotalCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsTotalKey,
			Help:           "Cumulative number of requests to the PodResource endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsListCount is a counter vector, labeled by
	// server_api_version, of requests to the PodResource List endpoint.
	PodResourcesEndpointRequestsListCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsListKey,
			Help:           "Number of requests to the PodResource List endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsGetAllocatableCount is a counter vector,
	// labeled by server_api_version, of requests to the PodResource
	// GetAllocatableResources endpoint.
	PodResourcesEndpointRequestsGetAllocatableCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsGetAllocatableKey,
			Help:           "Number of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsListCount is a counter vector, labeled by
	// server_api_version, of PodResource List requests that returned an error.
	PodResourcesEndpointErrorsListCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsListKey,
			Help:           "Number of requests to the PodResource List endpoint which returned error. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsGetAllocatableCount is a counter vector,
	// labeled by server_api_version, named by
	// PodResourcesEndpointErrorsGetAllocatableKey. Its help text is not
	// displayed in this listing.
	PodResourcesEndpointErrorsGetAllocatableCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsGetAllocatableKey,
			Help:           "" /* 129 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointRequestsGetCount is a counter vector, labeled by
	// server_api_version, of requests to the PodResource Get endpoint.
	PodResourcesEndpointRequestsGetCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointRequestsGetKey,
			Help:           "Number of requests to the PodResource Get endpoint. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// PodResourcesEndpointErrorsGetCount is a counter vector, labeled by
	// server_api_version, of PodResource Get requests that returned an error.
	PodResourcesEndpointErrorsGetCount = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           PodResourcesEndpointErrorsGetKey,
			Help:           "Number of requests to the PodResource Get endpoint which returned error. Broken down by server api version.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"server_api_version"},
	)

	// RunPodSandboxDuration is a histogram vector, labeled by runtime_handler,
	// of the seconds taken by run_podsandbox operations.
	RunPodSandboxDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunPodSandboxDurationKey,
			Help:           "Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.",
			Buckets:        metrics.DefBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"runtime_handler"},
	)

	// RunPodSandboxErrors is a counter vector, labeled by runtime_handler, of
	// run_podsandbox operation errors.
	RunPodSandboxErrors = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunPodSandboxErrorsKey,
			Help:           "Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"runtime_handler"},
	)

	// RunningPodCount is a gauge of the pods that have a running pod sandbox.
	RunningPodCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunningPodsKey,
			Help:           "Number of pods that have a running pod sandbox",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RunningContainerCount is a gauge vector, labeled by container_state, of
	// the containers currently running.
	RunningContainerCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RunningContainersKey,
			Help:           "Number of containers currently running",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_state"},
	)

	// DesiredPodCount is a gauge vector, labeled by static, of the pods the
	// kubelet is being instructed to run.
	DesiredPodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           DesiredPodCountKey,
			Help:           "The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// ActivePodCount is a gauge vector, labeled by static, named by
	// ActivePodCountKey. Its help text is not displayed in this listing.
	ActivePodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ActivePodCountKey,
			Help:           "" /* 158 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// MirrorPodCount is a gauge of the mirror pods the kubelet will try to
	// create (one per admitted static pod).
	MirrorPodCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MirrorPodCountKey,
			Help:           "The number of mirror pods the kubelet will try to create (one per admitted static pod)",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// WorkingPodCount is a gauge vector, labeled by lifecycle, config, and
	// static, named by WorkingPodCountKey. Its help text is not displayed in
	// this listing.
	WorkingPodCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           WorkingPodCountKey,
			Help:           "" /* 324 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"lifecycle", "config", "static"},
	)

	// OrphanedRuntimePodTotal is a counter named by OrphanedRuntimePodTotalKey.
	// Its help text is not displayed in this listing.
	OrphanedRuntimePodTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           OrphanedRuntimePodTotalKey,
			Help:           "" /* 253 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
	)

	// RestartedPodTotal is a counter vector, labeled by static, named by
	// RestartedPodTotalKey. Its help text is not displayed in this listing.
	RestartedPodTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           RestartedPodTotalKey,
			Help:           "" /* 193 byte string literal not displayed */,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"static"},
	)

	// StartedPodsTotal is a counter of pods started.
	StartedPodsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedPodsTotalKey,
			Help:           "Cumulative number of pods started",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// StartedPodsErrorsTotal is a counter of errors when starting pods.
	StartedPodsErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedPodsErrorsTotalKey,
			Help:           "Cumulative number of errors when starting pods",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// StartedContainersTotal is a counter vector, labeled by container_type,
	// of containers started.
	StartedContainersTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedContainersTotalKey,
			Help:           "Cumulative number of containers started",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type"},
	)

	// StartedContainersErrorsTotal is a counter vector, labeled by
	// container_type and code, of errors when starting containers.
	StartedContainersErrorsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedContainersErrorsTotalKey,
			Help:           "Cumulative number of errors when starting containers",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type", "code"},
	)

	// StartedHostProcessContainersTotal is a counter vector, labeled by
	// container_type, of hostprocess containers started. Per its help text it
	// is only collected on Windows.
	StartedHostProcessContainersTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedHostProcessContainersTotalKey,
			Help:           "Cumulative number of hostprocess containers started. This metric will only be collected on Windows.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type"},
	)

	// StartedHostProcessContainersErrorsTotal is a counter vector, labeled by
	// container_type and code, of errors when starting hostprocess containers.
	// Per its help text it is only collected on Windows.
	StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           StartedHostProcessContainersErrorsTotalKey,
			Help:           "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_type", "code"},
	)

	// ManagedEphemeralContainers is a gauge of the ephemeral containers in
	// pods managed by this kubelet.
	ManagedEphemeralContainers = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ManagedEphemeralContainersKey,
			Help:           "Current number of ephemeral containers in pods managed by this kubelet.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// GracefulShutdownStartTime is a gauge holding the last graceful shutdown
	// start time, in seconds since the unix epoch.
	GracefulShutdownStartTime = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "graceful_shutdown_start_time_seconds",
			Help:           "Last graceful shutdown start time since unix epoch in seconds",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// GracefulShutdownEndTime is a gauge holding the last graceful shutdown
	// end time, in seconds since the unix epoch.
	GracefulShutdownEndTime = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "graceful_shutdown_end_time_seconds",
			Help:           "Last graceful shutdown end time since unix epoch in seconds",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// LifecycleHandlerHTTPFallbacks is a counter of the times lifecycle
	// handlers successfully fell back from https to http.
	LifecycleHandlerHTTPFallbacks = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "lifecycle_handler_http_fallbacks_total",
			Help:           "The number of times lifecycle handlers successfully fell back to http from https.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerPinningRequestsTotal is a counter of cpu core allocations
	// which required pinning.
	CPUManagerPinningRequestsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerPinningRequestsTotalKey,
			Help:           "The number of cpu core allocations which required pinning.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerPinningErrorsTotal is a counter of cpu core allocations which
	// required pinning and failed.
	CPUManagerPinningErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerPinningErrorsTotalKey,
			Help:           "The number of cpu core allocations which required pinning that failed.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerSharedPoolSizeMilliCores is a gauge of the size, in
	// millicores, of the shared CPU pool for non-guaranteed QoS pods.
	CPUManagerSharedPoolSizeMilliCores = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerSharedPoolSizeMilliCoresKey,
			Help:           "The size of the shared CPU pool for non-guaranteed QoS pods, in millicores.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerExclusiveCPUsAllocationCount is a gauge of the CPUs
	// exclusively allocated to containers running on this node.
	CPUManagerExclusiveCPUsAllocationCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerExclusiveCPUsAllocationCountKey,
			Help:           "The total number of CPUs exclusively allocated to containers running on this node",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CPUManagerAllocationPerNUMA is a gauge vector, labeled by
	// AlignedNUMANode, of the CPUs allocated per NUMA node.
	CPUManagerAllocationPerNUMA = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CPUManagerAllocationPerNUMAKey,
			Help:           "Number of CPUs allocated per NUMA node",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{AlignedNUMANode},
	)

	// ContainerAlignedComputeResources is a counter vector, labeled by scope
	// and boundary, of aligned compute resources allocated to containers.
	ContainerAlignedComputeResources = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ContainerAlignedComputeResourcesNameKey,
			Help:           "Cumulative number of aligned compute resources allocated to containers by alignment type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
	)

	// ContainerAlignedComputeResourcesFailure is a counter vector, labeled by
	// scope and boundary, of failures to allocate aligned compute resources to
	// containers.
	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ContainerAlignedComputeResourcesFailureNameKey,
			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
	)

	// MemoryManagerPinningRequestTotal is a counter of memory pages
	// allocations which required pinning.
	MemoryManagerPinningRequestTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MemoryManagerPinningRequestsTotalKey,
			Help:           "The number of memory pages allocations which required pinning.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// MemoryManagerPinningErrorsTotal is a counter of memory pages allocations
	// which required pinning and failed.
	MemoryManagerPinningErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           MemoryManagerPinningErrorsTotalKey,
			Help:           "The number of memory pages allocations which required pinning that failed.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionRequestsTotal is a counter of admission requests
	// where resources have to be aligned.
	TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionRequestsTotalKey,
			Help:           "The number of admission requests where resources have to be aligned.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionErrorsTotal is a counter of admission request
	// failures where resources could not be aligned.
	TopologyManagerAdmissionErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionErrorsTotalKey,
			Help:           "The number of admission request failures where resources could not be aligned.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// TopologyManagerAdmissionDuration is a histogram of the time taken to
	// serve a pod admission request. Unlike the other duration metrics in this
	// block, its help text and key name state milliseconds, not seconds.
	TopologyManagerAdmissionDuration = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           TopologyManagerAdmissionDurationKey,
			Help:           "Duration in milliseconds to serve a pod admission request.",
			Buckets:        metrics.ExponentialBuckets(.05, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// OrphanPodCleanedVolumes is a gauge of the orphaned pods whose volumes
	// were cleaned in the last periodic sweep.
	OrphanPodCleanedVolumes = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           orphanPodCleanedVolumesKey,
			Help:           "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// OrphanPodCleanedVolumesErrors is a gauge of the orphaned pods whose
	// volumes failed to be cleaned in the last periodic sweep.
	OrphanPodCleanedVolumesErrors = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           orphanPodCleanedVolumesErrorsKey,
			Help:           "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPreKubeletDuration is a gauge of the seconds of node startup
	// before the kubelet starts.
	NodeStartupPreKubeletDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPreKubeletKey,
			Help:           "Duration in seconds of node startup before kubelet starts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPreRegistrationDuration is a gauge of the seconds of node
	// startup before registration.
	NodeStartupPreRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPreRegistrationKey,
			Help:           "Duration in seconds of node startup before registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupRegistrationDuration is a gauge of the seconds of node
	// startup during registration.
	NodeStartupRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupRegistrationKey,
			Help:           "Duration in seconds of node startup during registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupPostRegistrationDuration is a gauge of the seconds of node
	// startup after registration.
	NodeStartupPostRegistrationDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupPostRegistrationKey,
			Help:           "Duration in seconds of node startup after registration.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NodeStartupDuration is a gauge of the total seconds of node startup.
	NodeStartupDuration = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           NodeStartupKey,
			Help:           "Duration in seconds of node startup in total.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageGarbageCollectedTotal is a counter vector, labeled by reason, of
	// images garbage collected by the kubelet.
	ImageGarbageCollectedTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageGarbageCollectedTotalKey,
			Help:           "Total number of images garbage collected by the kubelet, whether through disk usage or image age.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"reason"},
	)

	// ImagePullDuration is a histogram vector, labeled by image_size_in_bytes,
	// of the seconds taken to pull an image.
	ImagePullDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImagePullDurationKey,
			Help:           "Duration in seconds to pull an image.",
			Buckets:        imagePullDurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"image_size_in_bytes"},
	)

	// LifecycleHandlerSleepTerminated is a counter of the times the lifecycle
	// sleep handler was terminated before it finished.
	LifecycleHandlerSleepTerminated = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "sleep_action_terminated_early_total",
			Help:           "The number of times lifecycle sleep handler got terminated before it finishes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// CgroupVersion is a gauge of the cgroup version on the host.
	CgroupVersion = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           CgroupVersionKey,
			Help:           "cgroup version on the hosts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// DRAOperationsDuration is a histogram vector in DRASubsystem, labeled by
	// operation_name and is_error, using DRADurationBuckets. Its help text is
	// not displayed in this listing.
	DRAOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      DRASubsystem,
			Name:           DRAOperationsDurationKey,
			Help:           "" /* 339 byte string literal not displayed */,
			Buckets:        DRADurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"operation_name", "is_error"},
	)

	// DRAGRPCOperationsDuration is a histogram vector in DRASubsystem, labeled
	// by driver_name, method_name, and grpc_status_code, of the seconds taken
	// by DRA gRPC operations.
	DRAGRPCOperationsDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      DRASubsystem,
			Name:           DRAGRPCOperationsDurationKey,
			Help:           "Duration in seconds of the DRA gRPC operations",
			Buckets:        DRADurationBuckets,
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"driver_name", "method_name", "grpc_status_code"},
	)

	// AdmissionRejectionsTotal is a counter vector, labeled by reason, of pod
	// admission rejections by the kubelet.
	AdmissionRejectionsTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           AdmissionRejectionsTotalKey,
			Help:           "Cumulative number of pod admission rejections by the Kubelet.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"reason"},
	)

	// ImageVolumeRequestedTotal is a counter of requested image volumes.
	ImageVolumeRequestedTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeRequestedTotalKey,
			Help:           "Number of requested image volumes.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageVolumeMountedSucceedTotal is a counter of successful image volume
	// mounts.
	ImageVolumeMountedSucceedTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeMountedSucceedTotalKey,
			Help:           "Number of successful image volume mounts.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ImageVolumeMountedErrorsTotal is a counter of failed image volume mounts.
	ImageVolumeMountedErrorsTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      KubeletSubsystem,
			Name:           ImageVolumeMountedErrorsTotalKey,
			Help:           "Number of failed image volume mounts.",
			StabilityLevel: metrics.ALPHA,
		},
	)
)
// DRADurationBuckets holds the histogram bucket boundaries shared by the DRA
// duration metrics DRAOperationsDuration and DRAGRPCOperationsDuration in this
// package. The largest bucket, 40, is based on the 45-second maximum gRPC
// timeout defined for DRA gRPC calls in
// pkg/kubelet/cm/dra/plugin/registration.go.
var DRADurationBuckets = metrics.ExponentialBucketsRange(0.1, 40, 15)

Functions

func GetGather

func GetGather() metrics.Gatherer

GetGather returns the gatherer. It is used by test cases outside the current package.

func GetImageSizeBucket

func GetImageSizeBucket(sizeInBytes uint64) string

func Register

func Register(collectors ...metrics.StableCollector)

Register registers all metrics.

func SetNodeName

func SetNodeName(name types.NodeName)

SetNodeName sets the NodeName Gauge to 1.

func SinceInSeconds

func SinceInSeconds(start time.Time) float64

SinceInSeconds gets the time since the specified start in seconds.

Source Files

metrics.go

Directories

Path | Synopsis
pkg/kubelet/metrics/collectors
Version
v1.33.0 (latest)
Published
Apr 23, 2025
Platform
linux/amd64
Imports
7 packages
Last checked
3 hours ago

Tools for package owners.