Skip to content

Commit a213b4d

Browse files
committed
Enable cluster hibernation
1 parent 7f8583f commit a213b4d

18 files changed

Lines changed: 1941 additions & 1 deletion

cmd/manager/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"github.com/openshift/hive/pkg/controller/controlplanecerts"
4242
"github.com/openshift/hive/pkg/controller/dnsendpoint"
4343
"github.com/openshift/hive/pkg/controller/dnszone"
44+
"github.com/openshift/hive/pkg/controller/hibernation"
4445
"github.com/openshift/hive/pkg/controller/metrics"
4546
"github.com/openshift/hive/pkg/controller/remoteingress"
4647
"github.com/openshift/hive/pkg/controller/remotemachineset"
@@ -83,6 +84,7 @@ var controllerFuncs = map[string]controllerSetupFunc{
8384
unreachable.ControllerName: unreachable.Add,
8485
velerobackup.ControllerName: velerobackup.Add,
8586
clusterpool.ControllerName: clusterpool.Add,
87+
hibernation.ControllerName: hibernation.Add,
8688
}
8789

8890
type controllerManagerOptions struct {

config/crds/hive.openshift.io_clusterdeployments.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,14 @@ spec:
541541
- vCenter
542542
type: object
543543
type: object
544+
powerState:
545+
description: PowerState indicates whether a cluster should be running
546+
or hibernating. When omitted, PowerState defaults to the Running state.
547+
enum:
548+
- ""
549+
- Running
550+
- Hibernating
551+
type: string
544552
preserveOnDelete:
545553
description: PreserveOnDelete allows the user to disconnect a cluster
546554
from Hive without deprovisioning it

pkg/apis/hive/v1/clusterdeployment_types.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,21 @@ const (
4747
HiveClusterRegionLabel = "hive.openshift.io/cluster-region"
4848
)
4949

50+
// ClusterPowerState is used to indicate whether a cluster is running or in a
51+
// hibernating state.
// Besides the two named constants declared below, the validation enum on this
// type also admits the empty string.
52+
// +kubebuilder:validation:Enum="";Running;Hibernating
53+
type ClusterPowerState string
54+
55+
const (
56+
// RunningClusterPowerState is the default state of a cluster after it has
57+
// been installed. All of its machines should be running.
58+
RunningClusterPowerState ClusterPowerState = "Running"
59+
60+
// HibernatingClusterPowerState is used to stop the machines belonging to a cluster
61+
// and move it to a hibernating state.
62+
HibernatingClusterPowerState ClusterPowerState = "Hibernating"
63+
)
64+
5065
// ClusterDeploymentSpec defines the desired state of ClusterDeployment
5166
type ClusterDeploymentSpec struct {
5267

@@ -103,6 +118,11 @@ type ClusterDeploymentSpec struct {
103118
// ClusterPoolRef is a reference to the ClusterPool that this ClusterDeployment originated from.
104119
// +optional
105120
ClusterPoolRef *ClusterPoolReference `json:"clusterPoolRef,omitempty"`
121+
122+
// PowerState indicates whether a cluster should be running or hibernating. When omitted,
123+
// PowerState defaults to the Running state.
124+
// +optional
125+
PowerState ClusterPowerState `json:"powerState,omitempty"`
106126
}
107127

108128
// Provisioning contains settings used only for initial cluster provisioning.
@@ -288,6 +308,10 @@ const (
288308

289309
// RelocationFailedCondition indicates if a relocation to another Hive instance has failed
290310
RelocationFailedCondition ClusterDeploymentConditionType = "RelocationFailed"
311+
312+
// ClusterHibernatingCondition is set when the ClusterDeployment is either
313+
// transitioning to/from a hibernating state or is in a hibernating state.
314+
ClusterHibernatingCondition ClusterDeploymentConditionType = "Hibernating"
291315
)
292316

293317
// AllClusterDeploymentConditions is a slice containing all condition types. This can be used for dealing with
@@ -304,8 +328,30 @@ var AllClusterDeploymentConditions = []ClusterDeploymentConditionType{
304328
ProvisionFailedCondition,
305329
SyncSetFailedCondition,
306330
RelocationFailedCondition,
331+
ClusterHibernatingCondition,
307332
}
308333

334+
// Cluster hibernating reasons. Each constant is a reason string used together
// with the Hibernating condition to describe where the cluster currently is
// between the Running and Hibernating states.
335+
const (
336+
// ResumingHibernationReason is used as the reason when the cluster is transitioning
337+
// from a Hibernating state to a Running state.
338+
ResumingHibernationReason = "Resuming"
339+
// RunningHibernationReason is used as the reason when the cluster is running and
340+
// the Hibernating condition is false.
341+
RunningHibernationReason = "Running"
342+
// StoppingHibernationReason is used as the reason when the cluster is transitioning
343+
// from a Running state to a Hibernating state.
344+
StoppingHibernationReason = "Stopping"
345+
// HibernatingHibernationReason is used as the reason when the cluster is in a
346+
// Hibernating state.
347+
HibernatingHibernationReason = "Hibernating"
348+
// UnsupportedHibernationReason is used as the reason when the cluster spec
349+
// specifies that the cluster be moved to a Hibernating state, but either the cluster
350+
// version is not compatible with hibernation (< 4.4.8) or the cloud provider of
351+
// the cluster is not supported.
352+
UnsupportedHibernationReason = "Unsupported"
353+
)
354+
309355
// +genclient
310356
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
311357

pkg/apis/hive/v1/validating-webhooks/clusterdeployment_validating_admission_hook.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ const (
3636
)
3737

3838
var (
39-
mutableFields = []string{"CertificateBundles", "ClusterMetadata", "ControlPlaneConfig", "Ingress", "Installed", "PreserveOnDelete", "ClusterPoolRef"}
39+
mutableFields = []string{"CertificateBundles", "ClusterMetadata", "ControlPlaneConfig", "Ingress", "Installed", "PreserveOnDelete", "ClusterPoolRef", "PowerState"}
4040
)
4141

4242
// ClusterDeploymentValidatingAdmissionHook is a struct that is used to reference what code should be run by the generic-admission-server.

pkg/awsclient/client.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ type Client interface {
5959
RunInstances(*ec2.RunInstancesInput) (*ec2.Reservation, error)
6060
DescribeInstances(*ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error)
6161
TerminateInstances(*ec2.TerminateInstancesInput) (*ec2.TerminateInstancesOutput, error)
62+
StopInstances(*ec2.StopInstancesInput) (*ec2.StopInstancesOutput, error)
63+
StartInstances(*ec2.StartInstancesInput) (*ec2.StartInstancesOutput, error)
6264

6365
// ELB
6466
RegisterInstancesWithLoadBalancer(*elb.RegisterInstancesWithLoadBalancerInput) (*elb.RegisterInstancesWithLoadBalancerOutput, error)
@@ -146,6 +148,16 @@ func (c *awsClient) TerminateInstances(input *ec2.TerminateInstancesInput) (*ec2
146148
return c.ec2Client.TerminateInstances(input)
147149
}
148150

151+
// StopInstances increments the metricAWSAPICalls counter under the
// "StopInstances" label and then forwards the request unchanged to the
// wrapped EC2 client, returning its output and error as-is.
func (c *awsClient) StopInstances(input *ec2.StopInstancesInput) (*ec2.StopInstancesOutput, error) {
152+
metricAWSAPICalls.WithLabelValues("StopInstances").Inc()
153+
return c.ec2Client.StopInstances(input)
154+
}
155+
156+
// StartInstances increments the metricAWSAPICalls counter under the
// "StartInstances" label and then forwards the request unchanged to the
// wrapped EC2 client, returning its output and error as-is.
func (c *awsClient) StartInstances(input *ec2.StartInstancesInput) (*ec2.StartInstancesOutput, error) {
157+
metricAWSAPICalls.WithLabelValues("StartInstances").Inc()
158+
return c.ec2Client.StartInstances(input)
159+
}
160+
149161
func (c *awsClient) RegisterInstancesWithLoadBalancer(input *elb.RegisterInstancesWithLoadBalancerInput) (*elb.RegisterInstancesWithLoadBalancerOutput, error) {
150162
metricAWSAPICalls.WithLabelValues("RegisterInstancesWithLoadBalancer").Inc()
151163
return c.elbClient.RegisterInstancesWithLoadBalancer(input)

pkg/awsclient/mock/client_generated.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
package hibernation
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/aws/aws-sdk-go/aws"
7+
"github.com/aws/aws-sdk-go/service/ec2"
8+
log "github.com/sirupsen/logrus"
9+
10+
"k8s.io/apimachinery/pkg/util/sets"
11+
"sigs.k8s.io/controller-runtime/pkg/client"
12+
13+
hivev1 "github.com/openshift/hive/pkg/apis/hive/v1"
14+
awsclient "github.com/openshift/hive/pkg/awsclient"
15+
)
16+
17+
var (
18+
runningStates = sets.NewString("running")
19+
stoppedStates = sets.NewString("stopped")
20+
pendingStates = sets.NewString("pending")
21+
stoppingStates = sets.NewString("stopping", "shutting-down")
22+
runningOrPendingStates = runningStates.Union(pendingStates)
23+
stoppedOrStoppingStates = stoppedStates.Union(stoppingStates)
24+
notRunningStates = stoppedOrStoppingStates.Union(pendingStates)
25+
notStoppedStates = runningOrPendingStates.Union(stoppingStates)
26+
)
27+
28+
func init() {
29+
RegisterActuator(&awsActuator{awsClientFn: getAWSClient})
30+
}
31+
32+
// awsActuator starts, stops and inspects the EC2 instances of AWS-based
// ClusterDeployments. An instance of it is registered via RegisterActuator in
// this file's init function.
type awsActuator struct {
33+
// awsClientFn is the function to build an AWS client, here for testing
// (init wires it to getAWSClient; tests can substitute their own builder).
34+
awsClientFn func(*hivev1.ClusterDeployment, client.Client, log.FieldLogger) (awsclient.Client, error)
35+
}
36+
37+
// CanHandle returns true if the actuator can handle a particular ClusterDeployment
38+
func (a *awsActuator) CanHandle(cd *hivev1.ClusterDeployment) bool {
39+
return cd.Spec.Platform.AWS != nil
40+
}
41+
42+
// StopMachines will stop machines belonging to the given ClusterDeployment
43+
func (a *awsActuator) StopMachines(cd *hivev1.ClusterDeployment, c client.Client, logger log.FieldLogger) error {
44+
logger = logger.WithField("cloud", "aws")
45+
awsClient, err := a.awsClientFn(cd, c, logger)
46+
if err != nil {
47+
return err
48+
}
49+
instanceIDs, err := getClusterInstanceIDs(cd, awsClient, runningOrPendingStates, logger)
50+
if err != nil {
51+
return err
52+
}
53+
if len(instanceIDs) == 0 {
54+
logger.Warning("No instances were found to stop")
55+
return nil
56+
}
57+
logger.WithField("instanceIDs", instanceIDs).Info("Stopping cluster instances")
58+
_, err = awsClient.StopInstances(&ec2.StopInstancesInput{
59+
InstanceIds: instanceIDs,
60+
})
61+
if err != nil {
62+
logger.WithError(err).Error("failed to stop instances")
63+
}
64+
return err
65+
}
66+
67+
// StartMachines will select machines belonging to the given ClusterDeployment
68+
func (a *awsActuator) StartMachines(cd *hivev1.ClusterDeployment, c client.Client, logger log.FieldLogger) error {
69+
logger = logger.WithField("cloud", "aws")
70+
awsClient, err := a.awsClientFn(cd, c, logger)
71+
if err != nil {
72+
return err
73+
}
74+
instanceIDs, err := getClusterInstanceIDs(cd, awsClient, stoppedOrStoppingStates, logger)
75+
if err != nil {
76+
return err
77+
}
78+
if len(instanceIDs) == 0 {
79+
logger.Warning("No instances were found to start")
80+
return nil
81+
}
82+
logger.WithField("instanceIDs", instanceIDs).Info("Starting cluster instances")
83+
_, err = awsClient.StartInstances(&ec2.StartInstancesInput{
84+
InstanceIds: instanceIDs,
85+
})
86+
if err != nil {
87+
logger.WithError(err).Error("failed to start instances")
88+
}
89+
return err
90+
}
91+
92+
// MachinesRunning will return true if the machines associated with the given
93+
// ClusterDeployment are in a running state.
94+
func (a *awsActuator) MachinesRunning(cd *hivev1.ClusterDeployment, c client.Client, logger log.FieldLogger) (bool, error) {
95+
logger = logger.WithField("cloud", "aws")
96+
logger.Infof("checking whether machines are running")
97+
awsClient, err := a.awsClientFn(cd, c, logger)
98+
if err != nil {
99+
return false, err
100+
}
101+
instanceIDs, err := getClusterInstanceIDs(cd, awsClient, notRunningStates, logger)
102+
if err != nil {
103+
return false, err
104+
}
105+
return len(instanceIDs) == 0, nil
106+
}
107+
108+
// MachinesStopped will return true if the machines associated with the given
109+
// ClusterDeployment are in a stopped state.
110+
func (a *awsActuator) MachinesStopped(cd *hivev1.ClusterDeployment, c client.Client, logger log.FieldLogger) (bool, error) {
111+
logger = logger.WithField("cloud", "aws")
112+
logger.Infof("checking whether machines are stopped")
113+
awsClient, err := a.awsClientFn(cd, c, logger)
114+
if err != nil {
115+
return false, err
116+
}
117+
instanceIDs, err := getClusterInstanceIDs(cd, awsClient, notStoppedStates, logger)
118+
if err != nil {
119+
return false, err
120+
}
121+
return len(instanceIDs) == 0, nil
122+
}
123+
124+
func getAWSClient(cd *hivev1.ClusterDeployment, c client.Client, logger log.FieldLogger) (awsclient.Client, error) {
125+
awsClient, err := awsclient.NewClient(c, cd.Spec.Platform.AWS.CredentialsSecretRef.Name, cd.Namespace, cd.Spec.Platform.AWS.Region)
126+
if err != nil {
127+
logger.WithError(err).Error("failed to get AWS client")
128+
}
129+
return awsClient, err
130+
}
131+
132+
func getClusterInstanceIDs(cd *hivev1.ClusterDeployment, c awsclient.Client, states sets.String, logger log.FieldLogger) ([]*string, error) {
133+
infraID := cd.Spec.ClusterMetadata.InfraID
134+
logger = logger.WithField("infraID", infraID)
135+
logger.Debug("listing cluster instances")
136+
out, err := c.DescribeInstances(&ec2.DescribeInstancesInput{
137+
Filters: []*ec2.Filter{
138+
{
139+
Name: aws.String(fmt.Sprintf("tag:kubernetes.io/cluster/%s", infraID)),
140+
Values: []*string{aws.String("owned")},
141+
},
142+
},
143+
})
144+
if err != nil {
145+
logger.WithError(err).Error("failed to list instances")
146+
return nil, err
147+
}
148+
result := []*string{}
149+
for _, r := range out.Reservations {
150+
for _, i := range r.Instances {
151+
if states.Has(aws.StringValue(i.State.Name)) {
152+
result = append(result, i.InstanceId)
153+
}
154+
}
155+
}
156+
logger.WithField("count", len(result)).WithField("states", states).Debug("result of listing instances")
157+
return result, nil
158+
}

0 commit comments

Comments
 (0)