Skip to content

Commit 4d51b56

Browse files
Merge pull request #6916 from smarterclayton/static_masters
Automatic merge from submit-queue.

Make openshift-ansible use static pods to install the control plane, make nodes prefer bootstrapping

1. Nodes continue to be configured for bootstrapping (as today).
2. For bootstrap nodes, we write a generic bootstrap-node-config.yaml that contains static pod references and any bootstrap config, and then use that to start a child kubelet using `--write-flags` instead of launching the node ourselves. If a node-config.yaml is laid down in `/etc/origin/node`, it takes precedence.
3. For 3.10 we want dynamic node config from Kubernetes to pull down additional files, but there are functional gaps. For now, the openshift SDN container has a sidecar that syncs node config to disk and updates labels (kubelet doesn't update labels, kubernetes/kubernetes#59314).
4. On the masters, if `openshift_master_bootstrap_enabled` is set, we generate the master-config.yaml and the etcd config, but we don't start etcd or the masters (no services installed).
5. On the masters, we copy the static files into the correct pod-manifest-path (`/etc/origin/node/pods`) or similar.
6. The kubelet at that point should automatically pick up the new static files and launch the components.
7. We wait for them to converge.
8. We install openshift-sdn as the first component, which allows nodes to go ready and start installing things. There is a gap here where the masters are up and the nodes can bootstrap, but the nodes are not ready because no network plugin is installed.

Challenges at this point:

* The master shims (`master-logs` and `master-restart`) need to deal with CRI-O and systemd. Ideally this is a temporary shim until we remove systemd for these components and have cri-ctl installed.
* We need to test failure modes of the static pods
* Testing

Things to explore further:

* need to get all the images using image streams or properly replaced into the static pods
* need to look at upgrades and updates
* disk locations become our API (`/var/lib/origin`, `/var/lib/etcd`) - how many customers have fiddled with this?
* may need to make the kubelet halt if it hasn't been able to get server/client certs within a bounded window (5m?) to ensure that autoheals happen (openshift/origin#18430)
* have to figure out whether dynamic kubelet config is a thing we can rely on for 3.10 (@liggitt), and what gaps there are with dynamic reconfig
* client-ca.crt is not handled by bootstrapping or dynamic config. This needs a solution unless we keep the openshift-sdn sidecar around
* kubelet doesn't send sd notify to systemd (kubernetes/kubernetes#59079)

@derekwaynecarr @sdodson @liggitt @deads2k this is the core of self-hosting.
2 parents 570ea7d + c826c43 commit 4d51b56

File tree

84 files changed

+2289
-551
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+2289
-551
lines changed

.papr.inventory

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,24 @@ etcd
77
ansible_ssh_user=root
88
ansible_python_interpreter=/usr/bin/python3
99
openshift_deployment_type=origin
10-
openshift_image_tag="{{ lookup('env', 'OPENSHIFT_IMAGE_TAG') }}"
1110
openshift_master_default_subdomain="{{ lookup('env', 'RHCI_ocp_node1_IP') }}.xip.io"
1211
openshift_check_min_host_disk_gb=1.5
1312
openshift_check_min_host_memory_gb=1.9
1413
osm_cluster_network_cidr=10.128.0.0/14
1514
openshift_portal_net=172.30.0.0/16
1615
osm_host_subnet_length=9
1716

17+
[all:vars]
18+
# bootstrap configs
19+
openshift_node_groups=[{"name":"node-config-master","labels":["node-role.kubernetes.io/master=true","node-role.kubernetes.io/infra=true"]},{"name":"node-config-node","labels":["node-role.kubernetes.io/compute=true"]}]
20+
openshift_master_bootstrap_enabled=true
21+
openshift_master_bootstrap_auto_approve=true
22+
openshift_master_bootstrap_auto_approver_node_selector={"region":"infra"}
23+
osm_controller_args={"experimental-cluster-signing-duration": ["20m"]}
24+
openshift_node_bootstrap=true
25+
openshift_hosted_infra_selector="node-role.kubernetes.io/infra=true"
26+
osm_default_node_selector="node-role.kubernetes.io/compute=true"
27+
1828
[masters]
1929
ocp-master
2030

@@ -23,5 +33,5 @@ ocp-master
2333

2434
[nodes]
2535
ocp-master openshift_schedulable=true
26-
ocp-node1 openshift_node_labels="{'region':'infra'}"
27-
ocp-node2 openshift_node_labels="{'region':'infra'}"
36+
ocp-node1
37+
ocp-node2

.papr.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,16 @@ set -xeuo pipefail
66
# specific version which quickly becomes stale.
77

88
if [ -n "${PAPR_BRANCH:-}" ]; then
9-
target_branch=$PAPR_BRANCH
9+
target_branch=$PAPR_BRANCH
1010
else
11-
target_branch=$PAPR_PULL_TARGET_BRANCH
11+
target_branch=$PAPR_PULL_TARGET_BRANCH
12+
fi
13+
if [[ "${target_branch}" =~ ^release- ]]; then
14+
target_branch="${target_branch/release-/v}"
15+
else
16+
dnf install -y sed
17+
target_branch="$( git describe | sed 's/^openshift-ansible-\([0-9]*\.[0-9]*\)\.[0-9]*-.*/v\1/' )"
1218
fi
13-
14-
# this is a bit wasteful, though there's no easy way to say "only clone up to
15-
# the first tag in the branch" -- ideally, PAPR could help with caching here
16-
git clone --branch $target_branch --single-branch https://github.com/openshift/origin
17-
export OPENSHIFT_IMAGE_TAG=$(git -C origin describe --abbrev=0)
18-
19-
echo "Targeting OpenShift Origin $OPENSHIFT_IMAGE_TAG"
2019

2120
pip install -r requirements.txt
2221

@@ -32,10 +31,11 @@ upload_journals() {
3231

3332
trap upload_journals ERR
3433

34+
# make all nodes ready for bootstrapping
35+
ansible-playbook -vvv -i .papr.inventory playbooks/openshift-node/private/image_prep.yml
36+
3537
# run the actual installer
36-
# FIXME: override openshift_image_tag defined in the inventory until
37-
# https://github.com/openshift/openshift-ansible/issues/4478 is fixed.
38-
ansible-playbook -vvv -i .papr.inventory playbooks/deploy_cluster.yml -e "openshift_image_tag=$OPENSHIFT_IMAGE_TAG"
38+
ansible-playbook -vvv -i .papr.inventory playbooks/deploy_cluster.yml -e "openshift_release=${target_release}"
3939

4040
### DISABLING TESTS FOR NOW, SEE:
4141
### https://github.com/openshift/openshift-ansible/pull/6132

playbooks/common/openshift-cluster/upgrades/docker/tasks/restart.yml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,23 @@
66
retries: 3
77
delay: 30
88

9+
- name: Restart static master services
10+
command: /usr/local/bin/master-restart "{{ item }}"
11+
with_items:
12+
- api
13+
- controllers
14+
- etcd
15+
failed_when: false
16+
when: openshift_is_containerized | bool
17+
918
- name: Restart containerized services
1019
service: name={{ item }} state=started
1120
with_items:
12-
- etcd_container
13-
- openvswitch
14-
- "{{ openshift_service_type }}-master-api"
15-
- "{{ openshift_service_type }}-master-controllers"
16-
- "{{ openshift_service_type }}-node"
21+
- etcd_container
22+
- openvswitch
23+
- "{{ openshift_service_type }}-master-api"
24+
- "{{ openshift_service_type }}-master-controllers"
25+
- "{{ openshift_service_type }}-node"
1726
failed_when: false
1827
when: openshift_is_containerized | bool
1928

playbooks/common/openshift-cluster/upgrades/docker/tasks/upgrade.yml

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,22 @@
44
- name: Stop containerized services
55
service: name={{ item }} state=stopped
66
with_items:
7-
- "{{ openshift_service_type }}-master-api"
8-
- "{{ openshift_service_type }}-master-controllers"
9-
- "{{ openshift_service_type }}-node"
10-
- etcd_container
11-
- openvswitch
7+
- "{{ openshift_service_type }}-master-api"
8+
- "{{ openshift_service_type }}-master-controllers"
9+
- "{{ openshift_service_type }}-node"
10+
- etcd_container
11+
- openvswitch
1212
failed_when: false
1313
when: openshift_is_containerized | bool
1414

15+
- name: Restart static master services
16+
command: /usr/local/bin/master-restart "{{ item }}"
17+
with_items:
18+
- api
19+
- controllers
20+
- etcd
21+
failed_when: false
22+
1523
- name: Check Docker image count
1624
shell: "docker images -aq | wc -l"
1725
register: docker_image_count

playbooks/init/base_packages.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
- >
3636
(openshift_use_system_containers | default(False)) | bool
3737
or (openshift_use_etcd_system_container | default(False)) | bool
38-
or (openshift_use_openvswitch_system_container | default(False)) | bool
3938
or (openshift_use_node_system_container | default(False)) | bool
4039
or (openshift_use_master_system_container | default(False)) | bool
4140
register: result

playbooks/openshift-master/private/additional_config.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
etcd_urls: "{{ openshift.master.etcd_urls }}"
1919
omc_cluster_hosts: "{{ groups.oo_masters | join(' ')}}"
2020
roles:
21+
# TODO: this is currently required in order to schedule pods onto the masters, but
22+
# should be moved into components once nodes are using dynamic config
23+
- role: openshift_sdn
2124
- role: openshift_project_request_template
2225
when: openshift_project_request_template_manage
2326
- role: openshift_examples

playbooks/openshift-master/private/config.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,18 @@
176176
openshift_no_proxy_etcd_host_ips: "{{ hostvars | lib_utils_oo_select_keys(groups['oo_etcd_to_config'] | default([]))
177177
| lib_utils_oo_collect('openshift.common.ip') | default([]) | join(',')
178178
}}"
179+
pre_tasks:
180+
# This will be moved into the control plane role once openshift_master is removed
181+
- name: Add static pod and systemd shim commands
182+
import_role:
183+
name: openshift_control_plane
184+
tasks_from: static_shim
185+
- name: Prepare the bootstrap node config on masters for self-hosting
186+
import_role:
187+
name: openshift_node_group
188+
tasks_from: bootstrap
189+
when: openshift_master_bootstrap_enabled | default(false) | bool
190+
179191
roles:
180192
- role: openshift_master_facts
181193
- role: openshift_clock
@@ -184,6 +196,8 @@
184196
- role: openshift_builddefaults
185197
- role: openshift_buildoverrides
186198
- role: nickhammond.logrotate
199+
200+
# DEPRECATED: begin moving away from this
187201
- role: openshift_master
188202
openshift_master_ha: "{{ (groups.oo_masters | length > 1) | bool }}"
189203
openshift_master_hosts: "{{ groups.oo_masters_to_config }}"
@@ -193,6 +207,10 @@
193207
openshift_master_default_registry_value: "{{ hostvars[groups.oo_first_master.0].l_default_registry_value }}"
194208
openshift_master_default_registry_value_api: "{{ hostvars[groups.oo_first_master.0].l_default_registry_value_api }}"
195209
openshift_master_default_registry_value_controllers: "{{ hostvars[groups.oo_first_master.0].l_default_registry_value_controllers }}"
210+
when: not ( openshift_master_bootstrap_enabled | default(false) | bool )
211+
212+
- role: openshift_control_plane
213+
when: openshift_master_bootstrap_enabled | default(false) | bool
196214
- role: tuned
197215
- role: nuage_ca
198216
when: openshift_use_nuage | default(false) | bool

playbooks/openshift-master/private/scaleup.yml

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,14 @@
1515
yaml_key: 'kubernetesMasterConfig.masterCount'
1616
yaml_value: "{{ openshift.master.master_count }}"
1717
notify:
18-
- restart master api
19-
- restart master controllers
18+
- restart master
2019
handlers:
21-
- name: restart master api
22-
service: name={{ openshift_service_type }}-master-controllers state=restarted
20+
- name: restart master
21+
command: /usr/local/bin/master-restart "{{ item }}"
22+
with_items:
23+
- api
24+
- controllers
2325
notify: verify api server
24-
# We retry the controllers because the API may not be 100% initialized yet.
25-
- name: restart master controllers
26-
command: "systemctl restart {{ openshift_service_type }}-master-controllers"
27-
retries: 3
28-
delay: 5
29-
register: result
30-
until: result.rc == 0
3126
- name: verify api server
3227
command: >
3328
curl --silent --tlsv1.2

playbooks/openshift-master/private/tasks/wire_aggregator.yml

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -191,16 +191,11 @@
191191
#restart master serially here
192192
- when: yedit_output.changed or (yedit_asset_config_output is defined and yedit_asset_config_output.changed)
193193
block:
194-
- name: restart master api
195-
systemd: name={{ openshift_service_type }}-master-api state=restarted
196-
197-
# We retry the controllers because the API may not be 100% initialized yet.
198-
- name: restart master controllers
199-
command: "systemctl restart {{ openshift_service_type }}-master-controllers"
200-
retries: 3
201-
delay: 5
202-
register: result
203-
until: result.rc == 0
194+
- name: restart master
195+
command: /usr/local/bin/master-restart "{{ item }}"
196+
with_items:
197+
- api
198+
- controllers
204199

205200
- name: Verify API Server
206201
# Using curl here since the uri module requires python-httplib2 and

playbooks/openshift-node/private/image_prep.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
- import_role:
2626
name: openshift_node
2727
tasks_from: bootstrap.yml
28+
- import_role:
29+
name: openshift_node_group
30+
tasks_from: bootstrap.yml
31+
2832

2933
- name: Re-enable excluders
3034
import_playbook: enable_excluders.yml

roles/container_runtime/tasks/package_docker.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
- >
99
(openshift_use_system_containers | default(False)) | bool
1010
or (openshift_use_etcd_system_container | default(False)) | bool
11-
or (openshift_use_openvswitch_system_container | default(False)) | bool
1211
or (openshift_use_node_system_container | default(False)) | bool
1312
or (openshift_use_master_system_container | default(False)) | bool
1413

roles/etcd/defaults/main.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ etcd_listen_client_urls: "{{ etcd_url_scheme }}://{{ etcd_ip }}:{{ etcd_client_p
8080
#etcd_peer: 127.0.0.1
8181
etcdctlv2: "{{ r_etcd_common_etcdctl_command }} --cert-file {{ etcd_peer_cert_file }} --key-file {{ etcd_peer_key_file }} --ca-file {{ etcd_peer_ca_file }} -C https://{{ etcd_peer }}:{{ etcd_client_port }}"
8282

83-
etcd_service: "{{ 'etcd_container' if r_etcd_common_etcd_runtime == 'docker' else 'etcd' }}"
83+
etcd_service: etcd
8484
# Location of the service file is fixed and not meant to be changed
8585
etcd_service_file: "/etc/systemd/system/{{ etcd_service }}.service"
8686

roles/etcd/files/etcd.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
kind: Pod
2+
apiVersion: v1
3+
metadata:
4+
name: master-etcd
5+
namespace: kube-system
6+
labels:
7+
openshift.io/control-plane: "true"
8+
openshift.io/component: etcd
9+
spec:
10+
restartPolicy: Always
11+
hostNetwork: true
12+
containers:
13+
- name: etcd
14+
image: quay.io/coreos/etcd:v3.3
15+
workingDir: /var/lib/etcd
16+
command: ["/bin/sh", "-c"]
17+
args:
18+
- |
19+
#!/bin/sh
20+
set -o allexport
21+
source /etc/etcd/etcd.conf
22+
exec etcd
23+
securityContext:
24+
privileged: true
25+
volumeMounts:
26+
- mountPath: /etc/etcd/
27+
name: master-config
28+
readOnly: true
29+
- mountPath: /var/lib/etcd/
30+
name: master-data
31+
livenessProbe:
32+
tcpSocket:
33+
port: 2379
34+
volumes:
35+
- name: master-config
36+
hostPath:
37+
path: /etc/etcd/
38+
- name: master-data
39+
hostPath:
40+
path: /var/lib/etcd

roles/etcd/tasks/certificates/fetch_server_certificates_from_ca.yml

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
package:
44
name: "etcd{{ '-' + etcd_version if etcd_version is defined else '' }}"
55
state: present
6-
when: not etcd_is_containerized | bool
6+
when: not etcd_is_atomic | bool
7+
delegate_to: "{{ etcd_ca_host }}"
8+
run_once: true
79
register: result
810
until: result is succeeded
911

@@ -178,8 +180,8 @@
178180
file:
179181
path: "{{ item }}"
180182
mode: 0600
181-
owner: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
182-
group: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
183+
owner: "etcd"
184+
group: "etcd"
183185
when: etcd_url_scheme == 'https'
184186
with_items:
185187
- "{{ etcd_ca_file }}"
@@ -190,8 +192,8 @@
190192
file:
191193
path: "{{ item }}"
192194
mode: 0600
193-
owner: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
194-
group: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
195+
owner: "etcd"
196+
group: "etcd"
195197
when: etcd_peer_url_scheme == 'https'
196198
with_items:
197199
- "{{ etcd_peer_ca_file }}"
@@ -202,6 +204,6 @@
202204
file:
203205
path: "{{ etcd_conf_dir }}"
204206
state: directory
205-
owner: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
206-
group: "{{ 'etcd' if not etcd_is_containerized | bool else omit }}"
207+
owner: "etcd"
208+
group: "etcd"
207209
mode: 0700

0 commit comments

Comments
 (0)