Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
*.log
.vscode
.venv
.claude
79 changes: 79 additions & 0 deletions collection/stages/roles/day2ops/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,82 @@
# defaults file for day2ops

# Ordered list of procedure names to run. tasks/main.yml loops over this list
# and resolves each entry to the task file "<entry>.yml".
day2ops_steps: []

# File name of the day2ops results report (XML, judging by the extension).
# NOTE(review): the directory it is written to is not visible here.
day2ops_report_filename: shiftstack-qa-day2ops-results.xml

# Telco MachineSet settings for SRIOV/DPDK workers.
# Consumed by the create-telco-machinesets procedure.
telco_machinesets:
  # Whether to delete the default worker machineset
  delete_default_workers: true
  # Set mastersSchedulable: false after adding workers
  disable_masters_schedulable: true
  # List of machinesets to create (configure in job definition).
  # The per-machineset helper task reads networks[0] and networks[1], so each
  # entry needs two networks, each with a name and a port_name_suffix.
  machinesets: []
  # Example machineset configuration:
  #   machinesets:
  #     - name: sriov
  #       role: worker
  #       type: sriov
  #       replicas: 1
  #       flavor: worker_0_numa_0
  #       networks:
  #         - name: sriov_net_nic0
  #           port_name_suffix: sriov_net_nic0_direct_worker_port
  #         - name: sriov_net_nic1
  #           port_name_suffix: sriov_net_nic1_direct_worker_port
  #     - name: dpdk
  #       role: worker
  #       type: dpdk
  #       replicas: 1
  #       flavor: worker_1_numa_1
  #       networks:
  #         - name: dpdk_net_nic0
  #           port_name_suffix: dpdk_net_nic0_normal_worker_port
  #         - name: dpdk_net_nic1
  #           port_name_suffix: dpdk_net_nic1_normal_worker_port

# Telco tuning settings for SRIOV/DPDK workloads.
# Consumed by the configure-telco-tuning procedure.
telco_tuning:
  # --- SRIOV workers ---
  sriov:
    performance_profile:
      # CPU lists are kept as quoted strings so YAML never retypes them.
      cpu_isolated: "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20"
      cpu_reserved: "0,1,2,3"
      hugepages_size: "1G"
      hugepages_count: 7
      numa_node: 0
      numa_topology_policy: "best-effort"
      realtime_kernel: false
      additional_kernel_args:
        - nosmt
        - tsc=reliable
    # List of SRIOV networks for SriovNetworkNodePolicy
    # (configure in job definition)
    networks: []
    # Example network configuration:
    #   networks:
    #     - name: sriov_net_nic0_9
    #       resource_name: sriov9
    #       device_type: vfio-pci
    #       num_vfs: 1
    #     - name: sriov_net_nic1_10
    #       resource_name: sriov10
    #       device_type: vfio-pci
    #       num_vfs: 1

  # --- DPDK workers ---
  dpdk:
    performance_profile:
      # CPU lists are kept as quoted strings so YAML never retypes them.
      cpu_isolated: "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20"
      cpu_reserved: "0,1,2,3"
      hugepages_size: "1G"
      hugepages_count: 7
      numa_node: 0
      numa_topology_policy: "best-effort"
      realtime_kernel: false
      additional_kernel_args:
        - nosmt
        - tsc=reliable
    # List of DPDK networks for SriovNetworkNodePolicy
    # (configure in job definition)
    networks: []
    # Example network configuration:
    #   networks:
    #     - name: dpdk_net_nic0_9
    #       resource_name: dpdk9
    #       device_type: vfio-pci
    #       num_vfs: 1
    #     - name: dpdk_net_nic1_10
    #       resource_name: dpdk10
    #       device_type: vfio-pci
    #       num_vfs: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
---
# MachineConfigPool "sriov" for SRIOV worker nodes.
# Nodes join the pool through the node-role.kubernetes.io/sriov label; the pool
# picks up MachineConfigs whose role label is either "sriov" or "worker".
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigPool
metadata:
  name: sriov
  labels:
    machineconfiguration.openshift.io/role: sriov
spec:
  paused: false
  nodeSelector:
    matchLabels:
      node-role.kubernetes.io/sriov: ""
  machineConfigSelector:
    matchExpressions:
      - key: machineconfiguration.openshift.io/role
        operator: In
        values:
          - sriov
          - worker

---
# MachineConfigPool "dpdk" for DPDK worker nodes.
# Nodes join the pool through the node-role.kubernetes.io/dpdk label; the pool
# picks up MachineConfigs whose role label is either "dpdk" or "worker".
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfigPool
metadata:
  name: dpdk
  labels:
    machineconfiguration.openshift.io/role: dpdk
spec:
  paused: false
  nodeSelector:
    matchLabels:
      node-role.kubernetes.io/dpdk: ""
  machineConfigSelector:
    matchExpressions:
      - key: machineconfiguration.openshift.io/role
        operator: In
        values:
          - dpdk
          - worker
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# Namespace that hosts the SRIOV Network Operator.
apiVersion: v1
kind: Namespace
metadata:
  annotations:
    workload.openshift.io/allowed: management
  name: openshift-sriov-network-operator

---
# OperatorGroup for the SRIOV Network Operator.
# It targets only the operator's own namespace.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  namespace: openshift-sriov-network-operator
  name: sriov-network-operators
spec:
  targetNamespaces:
    - openshift-sriov-network-operator
12 changes: 12 additions & 0 deletions collection/stages/roles/day2ops/files/sriov-operator-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# SriovOperatorConfig "default" for the SRIOV Network Operator.
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovOperatorConfig
metadata:
  namespace: openshift-sriov-network-operator
  name: default
spec:
  logLevel: 2
  disableDrain: false
  enableInjector: true
  enableOperatorWebhook: true
2 changes: 1 addition & 1 deletion collection/stages/roles/day2ops/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
mode: u=rw,g=rw,o=r

# Runs every procedure listed in day2ops_steps, in list order.
# Each entry is handed to the wrapper task file as procedure_task_file
# ("<entry>.yml"). The wrapper defaults to run_procedure.yml and can be
# replaced by setting day2ops_run_procedure_task.
# The task previously declared ansible.builtin.include_tasks twice (the
# hard-coded form and the templated form); only the templated form is kept
# because a duplicate mapping key is invalid YAML and the last value wins.
- name: Run day2ops procedures sequentially
  ansible.builtin.include_tasks: "{{ day2ops_run_procedure_task | default('run_procedure.yml') }}"
  vars:
    procedure_task_file: "{{ item }}.yml"
  loop: "{{ day2ops_steps }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# Helper task to apply a single telco MachineSet
# This task is included by create-telco-machinesets.yml for each machineset

# Resolve the two OpenStack networks of the current machineset
# (telco_machineset.networks[0] and [1]) to their network ID and first
# subnet ID. Publishes the facts network_id_0/1 and network_subnet_id_0/1.

# The tasks below index networks[0] and networks[1] directly, so fail early
# with a readable message instead of a Jinja index error.
- name: "Validate network definitions for {{ telco_machineset.name }}"
  ansible.builtin.assert:
    that:
      - telco_machineset.networks is defined
      - telco_machineset.networks | length >= 2
    fail_msg: >-
      Telco machineset '{{ telco_machineset.name }}' must define at least two
      networks, each with a name and a port_name_suffix.

- name: "Get network details for {{ telco_machineset.networks[0].name }}"
  openstack.cloud.networks_info:
    cloud: "{{ user_cloud }}"
    name: "{{ telco_machineset.networks[0].name }}"
  register: network_0

- name: "Get network details for {{ telco_machineset.networks[1].name }}"
  openstack.cloud.networks_info:
    cloud: "{{ user_cloud }}"
    name: "{{ telco_machineset.networks[1].name }}"
  register: network_1

# A lookup by name returns an empty list when the network does not exist, and
# a network may have no subnets; both cases would break the set_fact below.
- name: "Validate OpenStack lookup results for {{ telco_machineset.name }}"
  ansible.builtin.assert:
    that:
      - network_0.networks | length > 0
      - network_0.networks[0].subnet_ids | length > 0
      - network_1.networks | length > 0
      - network_1.networks[0].subnet_ids | length > 0
    fail_msg: >-
      Could not resolve networks '{{ telco_machineset.networks[0].name }}' and
      '{{ telco_machineset.networks[1].name }}' to an existing OpenStack
      network with at least one subnet.

# Only the first match and its first subnet are used.
- name: Set the telco network and subnet IDs
  ansible.builtin.set_fact:
    network_id_0: "{{ network_0.networks[0].id }}"
    network_subnet_id_0: "{{ network_0.networks[0].subnet_ids[0] }}"
    network_id_1: "{{ network_1.networks[0].id }}"
    network_subnet_id_1: "{{ network_1.networks[0].subnet_ids[0] }}"

# Render telco-machineset.yaml.j2 into
# <ocp_installation_dir>/<machineset name>-machineset.yaml.
# The underscore-prefixed vars are the values passed to the template.
- name: "Generate telco MachineSet manifest for {{ telco_machineset.name }}"
  vars:
    # Cluster identity and machineset shape
    _infrastructure_id: "{{ infrastructure_id }}"
    _machine_role: "{{ telco_machineset.role | default('worker') }}"
    _machineset_type: "{{ telco_machineset.type }}"
    _machineset_replicas: "{{ telco_machineset.replicas }}"
    _osp_flavor: "{{ telco_machineset.flavor }}"
    # Cluster VIPs, machines network and security group
    _api_vip_port_ip: "{{ api_ip }}"
    _ingress_vip_port_ip: "{{ apps_ip }}"
    _machines_subnet: "{{ machines_subnet_id }}"
    _machines_network: "{{ machines_subnet_net_id }}"
    _security_group: "{{ machines_security_group }}"
    # First additional network
    _network_id_0: "{{ network_id_0 }}"
    _subnet_id_0: "{{ network_subnet_id_0 }}"
    _name_suffix_0: "{{ telco_machineset.networks[0].port_name_suffix }}"
    # Second additional network
    _network_id_1: "{{ network_id_1 }}"
    _subnet_id_1: "{{ network_subnet_id_1 }}"
    _name_suffix_1: "{{ telco_machineset.networks[1].port_name_suffix }}"
  ansible.builtin.template:
    src: telco-machineset.yaml.j2
    dest: "{{ ocp_installation_dir }}/{{ telco_machineset.name }}-machineset.yaml"
    mode: u=rw,g=rw,o=r

# Apply the rendered manifest to the cluster and wait on the result for up to
# manifests_wait_timeout.
- name: "Apply telco MachineSet manifest for {{ telco_machineset.name }}"
  kubernetes.core.k8s:
    kubeconfig: "{{ kubeconfig }}"
    src: "{{ ocp_installation_dir }}/{{ telco_machineset.name }}-machineset.yaml"
    state: present
    wait: true
    wait_timeout: "{{ manifests_wait_timeout }}"

# After the machineset is applied, run the tools_cluster_checks wait tasks in
# order: MCP update, cluster health, node readiness, ClusterOperators.
- name: "Wait for cluster health after applying {{ telco_machineset.name }} machineset"
  block:
    # NOTE(review): presumably 60 attempts spaced 60 apart; confirm the units
    # and meaning against wait_mcp_updated.yml.
    - name: Wait for MCP updates
      vars:
        wait_retries: 60
        wait_delay: 60
      ansible.builtin.include_role:
        name: tools_cluster_checks
        tasks_from: wait_mcp_updated.yml

    - name: Wait until cluster is healthy
      ansible.builtin.include_role:
        name: tools_cluster_checks
        tasks_from: wait_until_cluster_is_healthy.yml

    - name: Wait until nodes are ready
      ansible.builtin.include_role:
        name: tools_cluster_checks
        tasks_from: wait_until_nodes_ready.yml

    - name: Wait until ClusterOperators are ready
      ansible.builtin.include_role:
        name: tools_cluster_checks
        tasks_from: wait_until_cluster_operators_ready.yml
Loading