diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..37d1834ca --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2025 simplyblock GmbH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base index 226188c96..735d331b1 100644 --- a/docker/Dockerfile_base +++ b/docker/Dockerfile_base @@ -38,3 +38,4 @@ RUN pip3 install setuptools --upgrade COPY requirements.txt requirements.txt RUN pip3 install -r requirements.txt + diff --git a/docs/talos.md b/docs/talos.md index 47ff817d5..f1406ef38 100644 --- a/docs/talos.md +++ b/docs/talos.md @@ -19,26 +19,12 @@ kubectl label namespace simplyblock \ --overwrite ``` - -Patch the host machine so that OpenEBS could work - Create a machine config patch with the contents below and save as patch.yaml ``` cat > patch.yaml <<'EOF' machine: sysctls: vm.nr_hugepages: "1024" - nodeLabels: - openebs.io/engine: mayastor - kubelet: - extraMounts: - - destination: /var/openebs/local - type: bind - source: /var/openebs/local - options: - - rbind - - rshared - - rw EOF talosctl -e -n patch mc -p @patch.yaml diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 103123934..dc429b8f9 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -371,8 +371,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.write_to_db(db_controller.kv_store) - qos_controller.add_class("Default", 100, cluster.get_id()) - cluster_events.cluster_create(cluster) mgmt_node_ops.add_mgmt_node(dev_ip, mode, cluster.uuid) @@ -459,6 +457,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.strict_node_anti_affinity = strict_node_anti_affinity default_cluster = clusters[0] + cluster.mode = default_cluster.mode cluster.db_connection = default_cluster.db_connection cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret cluster.grafana_endpoint = default_cluster.grafana_endpoint @@ -1176,9 +1175,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, for service in cluster_docker.services.list(): if 
image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \ "simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']: - logger.info(f"Updating service {service.name}") - service.update(image=service_image, force_update=True) - service_names.append(service.attrs['Spec']['Name']) + if service.name == "app_CachingNodeMonitor": + logger.info(f"Removing service {service.name}") + service.remove() + else: + logger.info(f"Updating service {service.name}") + service.update(image=service_image, force_update=True) + service_names.append(service.attrs['Spec']['Name']) if "app_SnapshotMonitor" not in service_names: logger.info("Creating snapshot monitor service") @@ -1191,6 +1194,18 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, networks=["host"], constraints=["node.role == manager"] ) + + if "app_TasksRunnerLVolSyncDelete" not in service_names: + logger.info("Creating lvol sync delete service") + cluster_docker.services.create( + image=service_image, + command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py", + name="app_TasksRunnerLVolSyncDelete", + mounts=["/etc/foundationdb:/etc/foundationdb"], + env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], + networks=["host"], + constraints=["node.role == manager"] + ) logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index 41824c73a..36ba14a9e 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -133,7 +133,8 @@ def get_config_var(name, default=None): LVOL_NVME_CONNECT_NR_IO_QUEUES=3 LVOL_NVME_KEEP_ALIVE_TO=10 LVOL_NVME_KEEP_ALIVE_TO_TCP=7 -LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100)) +LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "") +LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100 QPAIR_COUNT=32 CLIENT_QPAIR_COUNT=3 NVME_TIMEOUT_US=8000000 @@ -224,4 +225,4 @@ def 
get_config_var(name, default=None): qos_class_meta_and_migration_weight_percent = 25 -MIG_PARALLEL_JOBS = 16 \ No newline at end of file +MIG_PARALLEL_JOBS = 64 \ No newline at end of file diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 689027d08..b7c434f63 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -70,6 +70,11 @@ def _add_task(function_name, cluster_id, node_id, device_id, if task_id: logger.info(f"Task found, skip adding new task: {task_id}") return False + elif function_name == JobSchedule.FN_LVOL_SYNC_DEL: + task_id = get_lvol_sync_del_task(cluster_id, node_id, function_params['lvol_bdev_name']) + if task_id: + logger.info(f"Task found, skip adding new task: {task_id}") + return False task_obj = JobSchedule() task_obj.uuid = str(uuid.uuid4()) @@ -386,3 +391,21 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0): if jm_vuid and "jm_vuid" in task.function_params and task.function_params["jm_vuid"] == jm_vuid: return task.uuid return False + + +def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name): + return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "", + function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10) + +def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id : + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + if lvol_bdev_name: + if "lvol_bdev_name" in task.function_params and task.function_params["lvol_bdev_name"] == lvol_bdev_name: + return task.uuid + else: + return task.uuid + return False + diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index f3e377ee4..468ba7a02 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ 
SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.23 +SIMPLY_BLOCK_VERSION=19.2.25 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-lvol-sync-delete SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index fd4802771..620309f77 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -45,7 +45,7 @@ class Cluster(BaseModel): distr_npcs: int = 0 enable_node_affinity: bool = False grafana_endpoint: str = "" - mode: str = "" + mode: str = "docker" grafana_secret: str = "" contact_point: str = "" ha_type: str = "single" diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index 3d87a9aca..bbdcd7871 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -22,6 +22,7 @@ class JobSchedule(BaseModel): FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem" FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" FN_JC_COMP_RESUME = "jc_comp_resume" + FN_LVOL_SYNC_DEL = "lvol_sync_del" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 8c76d3649..81639c556 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -102,7 +102,6 @@ class StorageNode(BaseNodeObject): hublvol: HubLVol = None # type: ignore[assignment] active_tcp: bool = True active_rdma: bool = False - lvol_sync_del_queue: List[str] = [] def rpc_client(self, **kwargs): """Return rpc client to this node diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 62f37b1e9..ce48e1796 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -379,11 +379,11 @@ def create_lvol(self, name, 
size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0 "clear_method": "unmap", "lvol_priority_class": lvol_priority_class, } - # if ndcs or npcs: - # params.update({ - # 'ndcs' : ndcs, - # 'npcs' : npcs, - # }) + if ndcs or npcs: + params.update({ + 'ndcs' : ndcs, + 'npcs' : npcs, + }) return self._request("bdev_lvol_create", params) def delete_lvol(self, name, del_async=False): @@ -922,7 +922,7 @@ def distr_migration_status(self, name): params = {"name": name} return self._request("distr_migration_status", params) - def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=64, jobs=64): params = { "name": name, "storage_ID": storage_ID, @@ -935,7 +935,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals params["jobs"] = jobs return self._request("distr_migration_failure_start", params) - def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=64, jobs=64): params = { "name": name, } diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 9d1b62643..380f67bcd 100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -26,11 +26,6 @@ dependencies: version: "25.18.0" repository: "https://prometheus-community.github.io/helm-charts" condition: monitoring.enabled - - name: openebs - version: 3.9.0 - repository: https://openebs.github.io/charts - alias: openebs - condition: openebs.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml new file mode 100644 index 
000000000..2a9d7d044 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: CSIDriver +metadata: + name: hostpath.csi.k8s.io + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: hostpath.csi.k8s.io + app.kubernetes.io/component: csi-driver +spec: + # Supports persistent and ephemeral inline volumes. + volumeLifecycleModes: + - Persistent + - Ephemeral + # To determine at runtime which mode a volume uses, pod info and its + # "csi.storage.k8s.io/ephemeral" entry are needed. + podInfoOnMount: true + # No attacher needed. + attachRequired: false + storageCapacity: false + # Kubernetes may use fsGroup to change permissions and ownership + # of the volume to match user requested fsGroup in the pod's SecurityPolicy + fsGroupPolicy: File + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml new file mode 100644 index 000000000..8e695e593 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml @@ -0,0 +1,233 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpathplugin +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["get", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", 
"watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csistoragecapacities"] + verbs: ["get", "list", "watch", "create", "update", "delete"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch", "update", "get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpathplugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: csi-hostpathplugin +subjects: + - kind: ServiceAccount + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} + +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + serviceAccountName: csi-hostpathplugin-sa + containers: + - name: csi-provisioner + image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 + args: + - -v=5 + - --csi-address=/csi/csi.sock + - --feature-gates=Topology=true + - --node-deployment=true + - --strict-topology=true + - --immediate-topology=false + - --worker-threads=5 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: 
NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + - name: csi-resizer + image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 + args: + - -v=5 + - -csi-address=/csi/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + - name: node-driver-registrar + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 + args: + - --v=5 + - --csi-address=/csi/csi.sock + - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. 
+ privileged: true + env: + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /registration + name: registration-dir + - mountPath: /csi-data-dir + name: csi-data-dir + + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 + args: + - --drivername=hostpath.csi.k8s.io + - --v=5 + - --endpoint=$(CSI_ENDPOINT) + - --nodeid=$(KUBE_NODE_NAME) + - --capacity=slow=10Gi + - --capacity=fast=100Gi + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 + args: + - --csi-address=/csi/csi.sock + - --health-port=9898 + + volumes: + - hostPath: + path: /var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. 
+ # using /tmp is also possible while the PVs will not be available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index 1a3134e58..4eb7f1410 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -2,20 +2,20 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager labels: - control-plane: controller-manager - app: controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: selector: matchLabels: - app: controller-manager + app: simplyblock-fdb-controller-manager replicas: 1 template: metadata: labels: - control-plane: controller-manager - app: controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: securityContext: runAsUser: 4059 @@ -28,7 +28,7 @@ spec: emptyDir: {} - name: fdb-binaries emptyDir: {} - serviceAccountName: controller-manager + serviceAccountName: simplyblock-fdb-controller-manager initContainers: - name: foundationdb-kubernetes-init-7-3 image: foundationdb/fdb-kubernetes-monitor:7.3.63 @@ -51,6 +51,8 @@ spec: containers: - command: - /manager + args: + - "--health-probe-bind-address=:9443" image: foundationdb/fdb-kubernetes-operator:v2.13.0 name: manager env: @@ -86,13 +88,13 @@ spec: apiVersion: v1 kind: ServiceAccount metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: manager-role + name: simplyblock-fdb-manager-role rules: - apiGroups: - "" @@ -164,7 +166,7 @@ apiVersion: 
rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: creationTimestamp: null - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole rules: - apiGroups: - "" @@ -179,27 +181,27 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: creationTimestamp: null - name: manager-rolebinding + name: simplyblock-fdb-manager-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-role + name: simplyblock-fdb-manager-role subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: creationTimestamp: null - name: manager-clusterrolebinding + name: simplyblock-fdb-manager-clusterrolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager namespace: metadata.namespace ##### cluster file ################# @@ -213,7 +215,11 @@ spec: replacements: enabled: true faultDomain: + {{- if .Values.foundationdb.multiAZ }} + key: topology.kubernetes.io/zone + {{- else }} key: foundationdb.org/none + {{- end }} imageType: split labels: filterOnOwnerReference: false @@ -225,10 +231,17 @@ spec: - foundationdb.org/fdb-process-group-id minimumUptimeSecondsForBounce: 60 processCounts: + {{- if .Values.foundationdb.multiAZ }} + cluster_controller: 1 + log: 4 + storage: 4 + stateless: -1 + {{- else }} cluster_controller: 1 log: 3 storage: 3 stateless: -1 + {{- end }} processes: general: customParameters: @@ -270,7 +283,7 @@ spec: runAsUser: 0 volumeClaimTemplate: spec: - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath accessModes: - ReadWriteOnce resources: @@ -285,10 +298,10 @@ spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi 
securityContext: runAsUser: 0 affinity: @@ -308,10 +321,10 @@ spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi securityContext: runAsUser: 0 affinity: diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml index 740dd7642..815df6505 100644 --- a/simplyblock_core/scripts/charts/templates/mongodb.yaml +++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml @@ -14,7 +14,7 @@ spec: name: data-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi @@ -22,7 +22,7 @@ spec: name: logs-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml index 64e5e6280..b23cb4a07 100644 --- a/simplyblock_core/scripts/charts/templates/storage_class.yaml +++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml @@ -2,9 +2,22 @@ apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: - name: openebs-local-hostpath -provisioner: openebs.io/local + name: local-hostpath + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpath-fast + app.kubernetes.io/component: storageclass +provisioner: hostpath.csi.k8s.io allowVolumeExpansion: true reclaimPolicy: Retain volumeBindingMode: WaitForFirstConsumer - +{{- if .Values.storageclass.allowedTopologyZones }} +allowedTopologies: +- matchLabelExpressions: + - key: topology.kubernetes.io/zone + values: +{{- range .Values.storageclass.allowedTopologyZones }} + - {{ . 
}} +{{- end }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/values-template.yaml b/simplyblock_core/scripts/charts/values-template.yaml deleted file mode 100644 index 79693e7cd..000000000 --- a/simplyblock_core/scripts/charts/values-template.yaml +++ /dev/null @@ -1,194 +0,0 @@ -graylog: - rootPasswordSha2: "${GRAYLOG_ROOT_PASSWORD_SHA2}" - passwordSecret: "${GRAYLOG_PASSWORD_SECRET}" - -cluster: - secret: "${CLUSTER_SECRET}" - id: "${CLUSTER_ID}" - ip: "${CLUSTER_IP}" - -monitoring: - enabled: ${ENABLE_MONITORING} - -log: - deletionInterval: "${LOG_DELETION_INTERVAL}" - retentionPeriod: "${RETENTION_PERIOD}" - level: "${LOG_LEVEL}" - maxNumberIndex: "${MAX_NUMBER_OF_INDICES}" - -grafana: - endpoint: "${GRAFANA_ENDPOINT}" - contactPoint: "${CONTACT_POINT}" - -image: - simplyblock: - repository: "${SIMPLYBLOCK_REPOSITORY}" - tag: "${SIMPLYBLOCK_TAG}" - pullPolicy: "Always" - -openebs: - enabled: true - -mongodb: - name: "simplyblock-mongodb" - deployment_name: "simplyblock-mongodb" - resources: - requests: - cpu: 100m - memory: 300Mi - limits: - cpu: 250m - memory: 1Gi - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - mongodb - topologyKey: "kubernetes.io/hostname" - -opensearch: - fullnameOverride: "simplyblock-opensearch" - singleNode: true - replicas: 1 - - antiAffinity: "hard" - persistence: - enabled: true - storageClass: openebs-local-hostpath - size: 10Gi - - resources: - requests: - cpu: "100m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "3Gi" - - extraEnvs: - - name: OPENSEARCH_JAVA_OPTS - value: "-Xms1g -Xmx1g" - - name: bootstrap.memory_lock - value: "true" - - name: action.auto_create_index - value: "false" - - name: plugins.security.ssl.http.enabled - value: "false" - - name: plugins.security.disabled - value: "true" - - securityConfig: - enabled: false - -prometheus: - server: - fullnameOverride: 
simplyblock-prometheus - enabled: true - statefulSet: - enabled: true - name: simplyblock-prometheus - replicaCount: 1 - podLabels: - app: simplyblock-prometheus - podAnnotations: {} - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - simplyblock-prometheus - topologyKey: "kubernetes.io/hostname" - service: - servicePort: 9090 - type: ClusterIP - gRPC: - enabled: true - servicePort: 10901 - additionalPorts: - - name: http-thanos - port: 10902 - targetPort: 10902 - protocol: TCP - securityContext: - fsGroup: 65534 - persistentVolume: - enabled: true - size: 5Gi - storageClass: openebs-local-hostpath - extraArgs: - storage.tsdb.min-block-duration: 2h - storage.tsdb.max-block-duration: 2h - sidecarContainers: - thanos-sidecar: - image: thanosio/thanos:v0.31.0 - args: - - sidecar - - --tsdb.path=/prometheus - - --prometheus.url=http://localhost:9090 - - --objstore.config-file=/etc/thanos/objstore.yml - ports: - - name: grpc - containerPort: 10901 - - name: http - containerPort: 10902 - volumeMounts: - - name: storage-volume - mountPath: /prometheus - - name: objstore-config - mountPath: /etc/thanos - resources: - requests: - cpu: "100m" - memory: "256Mi" - limits: - cpu: "250m" - memory: "1Gi" - resources: - requests: - cpu: "100m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "1Gi" - configMapOverrideName: simplyblock-prometheus-config - extraVolumes: - - name: objstore-config - configMap: - name: simplyblock-objstore-config - alertmanager: - enabled: false - - prometheus-pushgateway: - enabled: false - - prometheus-node-exporter: - enabled: false - - kube-state-metrics: - enabled: false - -ingress: - enabled: true - ingressClassName: nginx - useDNS: ${USE_DNS} - host: "${DNS_NAME}" - tlsSecret: ${TLS_SECRET} - controller: - hostNetwork: ${USE_HOST} - dnsPolicy: ClusterFirstWithHostNet - service: - type: ${SERVICE_TYPE} - 
nodePorts: - tcp: - 4501: 32451 - extraArgs: - tcp-services-configmap: "${K8S_NAMESPACE}/simplyblock-tcp-services" - nodeSelector: - simplyblock.io/role: mgmt-plane diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 467734176..0b70f321e 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -24,9 +24,12 @@ image: ports: lvolNvmfPortStart: - -openebs: - enabled: true + +storageclass: + allowedTopologyZones: [] + +foundationdb: + multiAZ: false mongodb: name: "simplyblock-mongodb" @@ -57,7 +60,7 @@ opensearch: antiAffinity: "hard" persistence: enabled: true - storageClass: openebs-local-hostpath + storageClass: local-hostpath size: 10Gi resources: @@ -123,7 +126,7 @@ prometheus: persistentVolume: enabled: true size: 5Gi - storageClass: openebs-local-hostpath + storageClass: local-hostpath extraArgs: storage.tsdb.min-block-duration: 2h storage.tsdb.max-block-duration: 2h diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index ba0f8b61d..fd79f43c1 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -349,6 +349,20 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + TasksRunnerLVolSyncDelete: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/tasks_runner_sync_lvol_del.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py index 884b67396..8486f3a32 100644 --- a/simplyblock_core/services/lvol_monitor.py +++ b/simplyblock_core/services/lvol_monitor.py @@ -132,8 +132,7 @@ def 
process_lvol_delete_finish(lvol): sec_node = db.get_storage_node_by_id(snode.get_id()) if sec_node: - sec_node.lvol_sync_del_queue.append(f"{lvol.lvs_name}/{lvol.lvol_bdev}") - sec_node.write_to_db() + tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}") lvol_events.lvol_delete(lvol) lvol.remove(db.kv_store) @@ -349,19 +348,6 @@ def process_lvol_delete_try_again(lvol): present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) set_snapshot_health_check(snap, present) - snode = db.get_storage_node_by_id(snode.get_id()) - if snode.status == StorageNode.STATUS_ONLINE: - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error(f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() + time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py index c82476e7b..a99ed89f3 100644 --- a/simplyblock_core/services/snapshot_monitor.py +++ b/simplyblock_core/services/snapshot_monitor.py @@ -5,7 +5,7 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.cluster import Cluster -from simplyblock_core.controllers import health_controller, snapshot_events +from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.rpc_client import RPCClient @@ -76,8 +76,7 @@ def process_snap_delete_finish(snap, 
leader_node): non_leader = db.get_storage_node_by_id(non_leader_id) if non_leader: - non_leader.lvol_sync_del_queue.append(snap.snap_bdev) - non_leader.write_to_db() + tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev) snapshot_events.snapshot_delete(snap) snap.remove(db.kv_store) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 06eeee008..17306e56a 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -6,6 +6,7 @@ import os import socket import sys +import itertools from http.server import HTTPServer from http.server import ThreadingHTTPServer @@ -19,6 +20,9 @@ logger.addHandler(logger_handler) logger.setLevel(logging.INFO) +# Thread-safe request counter +request_counter = itertools.count(1) + def get_env_var(name, default=None, is_required=False): if not name: @@ -30,12 +34,12 @@ def get_env_var(name, default=None, is_required=False): return os.environ.get(name, default) -def rpc_call(req): +def rpc_call(req, request_id): req_data = json.loads(req.decode('ascii')) params = "" if "params" in req_data: params = str(req_data['params']) - logger.info(f"Request function: {str(req_data['method'])}, params: {params}") + logger.info(f"[{request_id}] Request function: {str(req_data['method'])}, params: {params}") sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.settimeout(TIMEOUT) sock.connect(rpc_sock) @@ -65,7 +69,8 @@ def rpc_call(req): if not response and len(buf) > 0: raise ValueError('Invalid response') - logger.debug(f"Response data: {buf}") + logger.debug(f"[{request_id}] Response data: {buf}") + logger.info(f"[{request_id}] Response ready") return buf @@ -96,9 +101,13 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): - if self.headers['Authorization'] != 'Basic ' + self.key: - self.do_AUTHHEAD() - else: + # Unique request ID + request_id 
= next(request_counter) + try: + if self.headers['Authorization'] != 'Basic ' + self.key: + self.do_AUTHHEAD() + return + if "Content-Length" in self.headers: data_string = self.rfile.read(int(self.headers['Content-Length'])) elif "chunked" in self.headers.get("Transfer-Encoding", ""): @@ -120,16 +129,20 @@ def do_POST(self): break try: - response = rpc_call(data_string) + response = rpc_call(data_string, request_id) if response is not None: self.do_HEAD() self.wfile.write(bytes(response.encode(encoding='ascii'))) else: self.do_HEAD_no_content() - except ValueError: + except ValueError as e: + logger.error(f"[{request_id}] Invalid RPC request from {self.client_address[0]}: {e}") self.do_INTERNALERROR() + except Exception: + logger.error(f"[{request_id}] Error processing request from {self.client_address[0]}", exc_info=True) + def run_server(host, port, user, password, is_threading_enabled=False): # encoding user and password diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py index fce4fd8ef..7d0b3e89f 100644 --- a/simplyblock_core/services/tasks_runner_failed_migration.py +++ b/simplyblock_core/services/tasks_runner_failed_migration.py @@ -88,7 +88,7 @@ def task_runner(task): if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True rsp = rpc_client.distr_migration_failure_start( - distr_name, device.cluster_device_order, qos_high_priority, job_size=1024, jobs=constants.MIG_PARALLEL_JOBS) + distr_name, device.cluster_device_order, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py index fb085e4aa..e325e3d7e 100644 --- 
a/simplyblock_core/services/tasks_runner_migration.py +++ b/simplyblock_core/services/tasks_runner_migration.py @@ -93,7 +93,7 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py index f62a7f210..9feec7a56 100644 --- a/simplyblock_core/services/tasks_runner_new_dev_migration.py +++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py @@ -98,7 +98,7 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index a39de42ab..e95dbdf94 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -3,7 +3,7 @@ from simplyblock_core import db_controller, utils, storage_node_ops, distr_controller -from simplyblock_core.controllers import tcp_ports_events, health_controller +from simplyblock_core.controllers import tcp_ports_events, health_controller, tasks_controller from simplyblock_core.fw_api_client import FirewallClient from 
simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster @@ -196,19 +196,13 @@ task.status = JobSchedule.STATUS_RUNNING task.write_to_db(db.kv_store) - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error( - f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() + # wait for lvol sync delete + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + while lvol_sync_del_found: + logger.info("Lvol sync delete task found, waiting") + can_continue = False + time.sleep(3) + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: sec_rpc_client = sec_node.rpc_client() diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py new file mode 100644 index 000000000..fbf0c1ee4 --- /dev/null +++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py @@ -0,0 +1,77 @@ +# coding=utf-8 +import time + + +from simplyblock_core import db_controller, utils +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.storage_node import StorageNode + +logger = utils.get_logger(__name__) + +# get DB controller +db = db_controller.DBController() + + +logger.info("Starting Tasks runner...") +while True: + + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + if cl.status == 
Cluster.STATUS_IN_ACTIVATION: + continue + + tasks = db.get_job_tasks(cl.get_id(), reverse=False) + for task in tasks: + + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL: + if task.status != JobSchedule.STATUS_DONE: + + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + node = db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + lvol_bdev_name = task.function_params["lvol_bdev_name"] + + logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True) + if not ret: + if "code" in err and err["code"] == -19: + logger.error(f"Sync delete completed with error: {err}") + else: + logger.error( + f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + + task.function_result = f"bdev {lvol_bdev_name} deleted" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + time.sleep(3) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 2e8504b08..5e5f66f60 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -81,7 +81,7 @@ def info(self): def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None, fdb_connection=None, namespace=None, 
server_ip=None, rpc_port=None, rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None, - total_mem=None, system_mem=None, cluster_mode=None): + total_mem=None, system_mem=None, cluster_mode=None, cluster_id=None): params = { "cluster_ip": cluster_ip, "server_ip": server_ip, @@ -113,6 +113,8 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None params["system_mem"] = system_mem if cluster_mode: params["cluster_mode"] = cluster_mode + if cluster_id: + params["cluster_id"] = cluster_id return self._request("POST", "spdk_process_start", params) def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): @@ -124,8 +126,8 @@ def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): # "db_connection": db_connection} # return self._request("POST", "join_swarm", params) - def spdk_process_kill(self, rpc_port): - return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port}) + def spdk_process_kill(self, rpc_port, cluster_id=None): + return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port, "cluster_id": cluster_id}) def leave_swarm(self): return True diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 3d32dd17a..162f0dd1a 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -998,7 +998,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=cluster_id) time.sleep(5) except Exception as e: @@ -1454,7 +1454,7 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False): if 
health_controller._check_node_api(snode.mgmt_ip): logger.info("Stopping SPDK container") snode_api = SNodeClient(snode.api_endpoint, timeout=20) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) snode_api.leave_swarm() pci_address = [] for dev in snode.nvme_devices: @@ -1676,7 +1676,7 @@ def restart_storage_node( snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection, snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=snode.cluster_id) except Exception as e: logger.error(e) @@ -2250,7 +2250,7 @@ def shutdown_storage_node(node_id, force=False): logger.info("Stopping SPDK") try: - SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port) + SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port, snode.cluster_id) except SNodeClientException: logger.error('Failed to kill SPDK') return False @@ -3214,7 +3214,7 @@ def recreate_lvstore(snode, force=False): def _kill_app(): storage_events.snode_restart_failed(snode) snode_api = SNodeClient(snode.api_endpoint, timeout=5, retry=5) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) # If LVol Store recovery failed then stop spdk process diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 941414708..0892db54a 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -725,7 +725,13 @@ def convert_size(size: Union[int, str], 
unit: str, round_up: bool = False) -> in raw = size / (base ** exponent) return math.ceil(raw) if round_up else int(raw) - +def first_six_chars(s: str) -> str: + """ + Returns the first six characters of a given string. + If the string is shorter than six characters, returns the entire string. + """ + return s[:6] + def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 8e18fc276..d1ee4f9f0 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -142,6 +142,7 @@ class SPDKParams(BaseModel): spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN) cluster_mode: str + cluster_id: str @api.post('/spdk_process_start', responses={ diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index be3193138..56b4ca563 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -268,6 +268,7 @@ class SPDKParams(BaseModel): spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: str = Field(pattern=utils.IP_PATTERN) cluster_mode: str + cluster_id: str @api.post('/spdk_process_start', responses={ @@ -286,9 +287,10 @@ def spdk_process_start(body: SPDKParams): total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MB') if body.total_mem else "" - if _is_pod_up(body.rpc_port) or _is_pod_present(body.rpc_port): + first_six_cluster_id = core_utils.first_six_chars(body.cluster_id) + if _is_pod_up(body.rpc_port, first_six_cluster_id) or _is_pod_present(body.rpc_port, first_six_cluster_id): logger.info("SPDK pod found, removing...") - query = 
utils.RPCPortParams(rpc_port=body.rpc_port) + query = utils.RPCPortParams(rpc_port=body.rpc_port, cluster_id=body.cluster_id) spdk_process_kill(query) node_prepration_job_name = "snode-spdk-job-" @@ -351,6 +353,7 @@ def spdk_process_start(body: SPDKParams): 'SIMPLYBLOCK_DOCKER_IMAGE': constants.SIMPLY_BLOCK_DOCKER_IMAGE, 'GRAYLOG_SERVER_IP': body.cluster_ip, 'MODE': body.cluster_mode, + 'CLUSTER_ID': first_six_cluster_id, 'SSD_PCIE': ssd_pcie_params, 'PCI_ALLOWED': ssd_pcie_list, 'TOTAL_HP': total_mem_mib @@ -463,7 +466,8 @@ def spdk_process_kill(query: utils.RPCPortParams): k8s_core_v1 = core_utils.get_k8s_core_client() try: namespace = node_utils_k8s.get_namespace() - pod_name = f"snode-spdk-pod-{query.rpc_port}" + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace) retries = 10 while retries > 0: @@ -486,9 +490,9 @@ def spdk_process_kill(query: utils.RPCPortParams): return utils.get_response(True) -def _is_pod_up(rpc_port): +def _is_pod_up(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -502,9 +506,9 @@ def _is_pod_up(rpc_port): return False return False -def _is_pod_present(rpc_port): +def _is_pod_present(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -525,7 +529,8 @@ def _is_pod_present(rpc_port): })}}}, }) def spdk_process_is_up(query: utils.RPCPortParams): - if _is_pod_up(query.rpc_port): + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + if 
_is_pod_up(query.rpc_port, first_six_cluster_id): return utils.get_response(True) else: return utils.get_response(False, "SPDK container is not running") diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index f10478c75..e49aca2e2 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: snode-spdk-pod-{{ RPC_PORT }} + name: snode-spdk-pod-{{ RPC_PORT }}-{{ CLUSTER_ID }} namespace: {{ NAMESPACE }} labels: app: spdk-app-{{ RPC_PORT }} diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py index b0d1795df..27ff2ce18 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -149,6 +149,7 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) + cluster_id: str class DeviceParams(BaseModel):