
Commit bee6d40

Fix NIC passthrough test issues
- Add missing __init__.py for device_passthrough package
- Fix unsafe pool.devices[0] access with validation
- Fix class name typo: NetworkPerformace -> NetworkPerformance
- Add validation for device_addr to prevent None usage
- Fix logic bug: validate only the node parameter, not all node spaces
- Remove unused parameters (log, log_path, variables) from functional test
- Update descriptions: replace 'sriov' with 'passthrough' for accuracy
- Improve error messages with more context
- Add timeout documentation comments
- Clarify server_ip vs interface_ip usage in common.py
- Add pool_type validation before dictionary access
- Move device_passthrough tests to correct location for test discovery

The guest NIC configuration sequence is now:
1. Identifies the interface by PCI address first (before dhclient)
2. Brings the interface up with 'ip link set up'
3. Waits for carrier detection (30s timeout)
4. Then runs dhclient to get an IP address

This ensures the physical link is established before DHCP configuration (a condensed sketch of the sequence follows below).
1 parent 4f32b34 commit bee6d40
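
The numbered steps above are implemented in _configure_passthrough_nic_for_node (full diff further down). As a rough illustration only, the sequence boils down to the following sketch; the wrapper function is hypothetical and condenses the inline code from the diff:

from lisa import Node
from lisa.tools import Dhclient


def bring_up_passthrough_nic(node: Node, interface_name: str) -> None:
    # Hypothetical wrapper: the real logic lives inline in
    # _configure_passthrough_nic_for_node, after the interface name has been
    # resolved from the passthrough device's PCI address.

    # 1. Bring the link up before requesting an address.
    node.execute(
        cmd=f"ip link set {interface_name} up",
        sudo=True,
        expected_exit_code=0,
    )

    # 2. Wait up to 30 seconds for carrier; exit code 124 (timeout expired)
    #    is tolerated and only logged as a warning in the real test.
    node.execute(
        cmd=f"timeout 30 sh -c 'until cat /sys/class/net/{interface_name}/carrier"
        f" 2>/dev/null | grep -q 1; do sleep 1; done'",
        sudo=True,
        shell=True,
    )

    # 3. Only then run dhclient, scoped to this specific interface.
    node.tools[Dhclient].renew(interface_name)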

6 files changed: +160 -82 lines changed

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.

microsoft/testsuites/device_passthrough/functional_tests.py renamed to lisa/microsoft/testsuites/device_passthrough/functional_tests.py

Lines changed: 43 additions & 39 deletions
@@ -1,30 +1,28 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
-from pathlib import Path
-from typing import Any, Dict, cast
+from typing import TYPE_CHECKING, cast

-from lisa import (
-    Environment,
-    Logger,
-    Node,
-    TestCaseMetadata,
-    TestSuite,
-    TestSuiteMetadata,
-)
+from lisa import Environment, Node, TestCaseMetadata, TestSuite, TestSuiteMetadata
+from lisa.operating_system import Windows
 from lisa.sut_orchestrator import CLOUD_HYPERVISOR
-from lisa.sut_orchestrator.libvirt.ch_platform import CloudHypervisorPlatform
-from lisa.sut_orchestrator.libvirt.schema import BaseLibvirtNodeSchema
 from lisa.testsuite import TestResult, simple_requirement
 from lisa.tools import Lspci
 from lisa.util import LisaException, SkippedException

+if TYPE_CHECKING:
+    from lisa.sut_orchestrator.libvirt.ch_platform import CloudHypervisorPlatform
+

 @TestSuiteMetadata(
     area="device_passthrough",
     category="functional",
     description="""
     This test suite is for testing device passthrough functional tests.
     """,
+    requirement=simple_requirement(
+        supported_platform_type=[CLOUD_HYPERVISOR],
+        unsupported_os=[Windows],
+    ),
 )
 class DevicePassthroughFunctionalTests(TestSuite):
     @TestCaseMetadata(
@@ -65,46 +63,52 @@ class DevicePassthroughFunctionalTests(TestSuite):
     )
     def verify_device_passthrough_on_guest(
         self,
-        log: Logger,
         node: Node,
         environment: Environment,
-        log_path: Path,
         result: TestResult,
-        variables: Dict[str, Any],
     ) -> None:
         lspci = node.tools[Lspci]
-        platform = cast(CloudHypervisorPlatform, environment.platform)
+        platform = cast("CloudHypervisorPlatform", environment.platform)
         pool_vendor_device_map = {}
-        assert platform.platform_runbook.device_pools, "Device pool cant be empty"
+        assert platform.platform_runbook.device_pools, "Device pool can't be empty"
        for pool in platform.platform_runbook.device_pools:
             pool_type = str(pool.type.value)
+            if not pool.devices:
+                raise LisaException(f"No devices defined for pool type: {pool_type}")
             vendor_device_id = {
                 "vendor_id": pool.devices[0].vendor_id,
                 "device_id": pool.devices[0].device_id,
             }
             pool_vendor_device_map[pool_type] = vendor_device_id

-        assert environment.runbook.nodes_requirement, "requirement cant be empty"
-        for node_space in environment.runbook.nodes_requirement:
-            node_runbook: BaseLibvirtNodeSchema = node_space.get_extended_runbook(
-                BaseLibvirtNodeSchema, CLOUD_HYPERVISOR
+        # Get the node's runbook to check its passthrough requirements
+        # Import at runtime to avoid libvirt dependency on non-libvirt platforms
+        from lisa.sut_orchestrator.libvirt.schema import BaseLibvirtNodeSchema
+
+        node_runbook: "BaseLibvirtNodeSchema" = node.capability.get_extended_runbook(
+            BaseLibvirtNodeSchema, CLOUD_HYPERVISOR
+        )
+        if not node_runbook.device_passthrough:
+            raise SkippedException("No device-passthrough is set for node")
+
+        for req in node_runbook.device_passthrough:
+            pool_type = str(req.pool_type.value)
+            if pool_type not in pool_vendor_device_map:
+                raise LisaException(
+                    f"Pool type '{pool_type}' not found in platform device pools"
+                )
+            ven_dev_id_of_pool = pool_vendor_device_map[pool_type]
+            ven_id = ven_dev_id_of_pool["vendor_id"]
+            dev_id = ven_dev_id_of_pool["device_id"]
+            devices = lspci.get_devices_by_vendor_device_id(
+                vendor_id=ven_id,
+                device_id=dev_id,
+                force_run=True,
             )
-            if not node_runbook.device_passthrough:
-                raise SkippedException("No device-passthrough is set for node")
-            for req in node_runbook.device_passthrough:
-                pool_type = str(req.pool_type.value)
-                ven_dev_id_of_pool = pool_vendor_device_map[pool_type]
-                ven_id = ven_dev_id_of_pool["vendor_id"]
-                dev_id = ven_dev_id_of_pool["device_id"]
-                devices = lspci.get_devices_by_vendor_device_id(
-                    vendor_id=ven_id,
-                    device_id=dev_id,
-                    force_run=True,
+            if len(devices) < req.count:
+                raise LisaException(
+                    f"Passthrough device validation failed for "
+                    f"pool_type '{pool_type}': Found {len(devices)} "
+                    f"device(s) but expected {req.count}. "
+                    f"Vendor/Device ID: {ven_id}:{dev_id}"
                 )
-                if len(devices) < req.count:
-                    raise LisaException(
-                        f"Device count don't match, got total: {len(devices)} "
-                        f"As per runbook, Required device count: {req.count}, "
-                        f"for pool_type: {pool_type} having Vendor/Device ID as "
-                        f"vendor_id = {ven_id}, device_id={dev_id}"
-                    )

lisa/microsoft/testsuites/performance/common.py

Lines changed: 2 additions & 0 deletions
@@ -301,6 +301,8 @@ def perf_tcp_pps(
     for port in ports:
         server_netperf.run_as_server(port, interface_ip=server_interface_ip)
     for port in ports:
+        # Use server.internal_address as target since netperf client needs
+        # the server's IP (which may differ from the interface it binds to)
         client_netperf.run_as_client_async(
             server_ip=server.internal_address,
             core_count=thread_count,
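
The added comment is the point of this hunk: interface_ip pins where the netperf server binds, while server_ip is the address the client dials. In the passthrough suite, internal_address is repointed at the passthrough NIC (see networkperf_passthrough.py below), so the client ends up dialing the passthrough device's address. A rough sketch of that repointing, with a hypothetical helper name:

from typing import cast

from lisa import Node, RemoteNode


def use_passthrough_nic_for_traffic(node: Node, passthrough_nic_ip: str) -> RemoteNode:
    # Hypothetical helper: mirrors the assignment done at the end of
    # _configure_passthrough_nic_for_node. After this, the node's
    # internal_address (later passed as server_ip to run_as_client_async)
    # resolves to the passthrough NIC's DHCP-assigned address.
    test_node = cast(RemoteNode, node)
    test_node.internal_address = passthrough_nic_ip
    return test_node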

lisa/microsoft/testsuites/performance/networkperf_passthrough.py

Lines changed: 100 additions & 42 deletions
@@ -2,7 +2,13 @@
 # Licensed under the MIT license.
 import re
 from functools import partial
-from typing import Any, Dict, Tuple, cast
+from typing import Any, Callable, Dict, List, Tuple, cast
+
+from microsoft.testsuites.performance.common import (
+    perf_iperf,
+    perf_ntttcp,
+    perf_tcp_pps,
+)

 from lisa import (
     Logger,
@@ -20,27 +26,22 @@
 from lisa.sut_orchestrator import CLOUD_HYPERVISOR
 from lisa.sut_orchestrator.libvirt.context import get_node_context
 from lisa.testsuite import TestResult
-from lisa.tools import Dhclient, Lspci, Sysctl
+from lisa.tools import Dhclient, Kill, Lspci, Sysctl
 from lisa.tools.iperf3 import (
     IPERF_TCP_BUFFER_LENGTHS,
     IPERF_TCP_CONCURRENCY,
     IPERF_UDP_BUFFER_LENGTHS,
     IPERF_UDP_CONCURRENCY,
 )
 from lisa.util import (
+    LisaException,
     SkippedException,
     constants,
     find_group_in_lines,
     find_groups_in_lines,
 )
 from lisa.util.logger import get_logger
 from lisa.util.parallel import run_in_parallel
-from microsoft.testsuites.performance.common import (
-    cleanup_process,
-    perf_iperf,
-    perf_ntttcp,
-    perf_tcp_pps,
-)


 @TestSuiteMetadata(
@@ -50,11 +51,21 @@
     This test suite is to validate linux network performance
     for various NIC passthrough scenarios.
     """,
+    requirement=simple_requirement(
+        supported_platform_type=[CLOUD_HYPERVISOR],
+        unsupported_os=[Windows],
+    ),
 )
-class NetworkPerformace(TestSuite):
+class NetworkPerformance(TestSuite):
+    # Timeout values:
+    # TIMEOUT: 12000s (3.3 hrs) - accounts for test execution + network setup overhead
+    # PPS_TIMEOUT: 3000s (50 min) - shorter for PPS tests which are less intensive
     TIMEOUT = 12000
     PPS_TIMEOUT = 3000

+    # Track baremetal host nodes for cleanup
+    _baremetal_hosts: list[RemoteNode] = []
+
     # Network device passthrough tests between host and guest
     @TestCaseMetadata(
         description="""
@@ -239,7 +250,7 @@ def perf_tcp_ntttcp_passthrough_host_guest(

     @TestCaseMetadata(
         description="""
-        This test case uses ntttcp to test sriov tcp network throughput.
+        This test case uses ntttcp to test passthrough udp network throughput.
         """,
         priority=3,
         timeout=TIMEOUT,
@@ -423,7 +434,8 @@ def perf_tcp_max_pps_passthrough_two_guest(self, result: TestResult) -> None:

     @TestCaseMetadata(
         description="""
-        This test case uses ntttcp to test sriov tcp network throughput.
+        This test case uses ntttcp to test passthrough tcp network throughput
+        between two guest VMs.
         """,
         priority=3,
         timeout=TIMEOUT,
@@ -458,7 +470,8 @@ def perf_tcp_ntttcp_passthrough_two_guest(self, result: TestResult) -> None:

     @TestCaseMetadata(
         description="""
-        This test case uses ntttcp to test sriov tcp network throughput.
+        This test case uses ntttcp to test passthrough udp network throughput
+        between two guest VMs.
         """,
         priority=3,
         timeout=TIMEOUT,
@@ -500,14 +513,6 @@ def _configure_passthrough_nic_for_node(
         if not ctx.passthrough_devices:
             raise SkippedException("No passthrough devices found for node")

-        # Configure the nw interface on guest
-        node.tools[Dhclient].run(
-            force_run=True,
-            sudo=True,
-            expected_exit_code=0,
-            expected_exit_code_failure_message="dhclient run failed",
-        )
-
         lspci = node.tools[Lspci]
         pci_devices = lspci.get_devices_by_type(
             constants.DEVICE_TYPE_SRIOV, force_run=True
@@ -521,8 +526,14 @@ def _configure_passthrough_nic_for_node(
                 device_addr = device.slot
                 break

+        if device_addr is None:
+            raise LisaException(
+                f"No non-virtio passthrough device found. "
+                f"Available devices: {[d.slot for d in pci_devices]}"
+            )
+
         # Get the interface name
-        err_msg: str = "Can't find interface from PCI address"
+        err_msg: str = f"Can't find interface from PCI address: {device_addr}"
         device_path = node.execute(
             cmd=(
                 "find /sys/class/net/*/device/subsystem/devices"
@@ -542,6 +553,38 @@ def _configure_passthrough_nic_for_node(
         interface_name = interface_name_raw.get("INTERFACE_NAME", "")
         assert interface_name, "Can not find interface name"

+        # Bring the interface up before configuring it
+        node.execute(
+            cmd=f"ip link set {interface_name} up",
+            sudo=True,
+            expected_exit_code=0,
+            expected_exit_code_failure_message=(
+                f"Failed to bring up interface {interface_name}"
+            ),
+        )
+
+        # Wait for carrier (link up) - some NICs take time to negotiate
+        # Exit code 124 = timeout (no carrier), which is acceptable if DHCP works anyway
+        carrier_result = node.execute(
+            cmd=f"timeout 30 sh -c 'until cat /sys/class/net/{interface_name}/carrier"
+            f" 2>/dev/null | grep -q 1; do sleep 1; done'",
+            sudo=True,
+            shell=True,
+        )
+        if carrier_result.exit_code == 124:
+            node.log.warning(
+                f"Interface {interface_name} carrier not detected after 30s. "
+                f"Proceeding with DHCP anyway - may fail if no physical link."
+            )
+        elif carrier_result.exit_code != 0:
+            raise LisaException(
+                f"Failed to check carrier on {interface_name}: "
+                f"exit code {carrier_result.exit_code}"
+            )
+
+        # Configure the nw interface on guest with dhclient for the specific interface
+        node.tools[Dhclient].renew(interface_name)
+
         # Get the interface ip
         err_msg = f"Failed to get interface details for: {interface_name}"
         interface_details = node.execute(
@@ -557,7 +600,11 @@
             single_line=False,
         )
         passthrough_nic_ip = interface_ip.get("INTERFACE_IP", "")
-        assert passthrough_nic_ip, "Can not find interface IP"
+        if not passthrough_nic_ip:
+            raise LisaException(
+                f"Failed to get IP for passthrough interface '{interface_name}'. "
+                f"Interface details: {interface_details[:200]}"
+            )

         test_node = cast(RemoteNode, node)
         test_node.internal_address = passthrough_nic_ip
@@ -568,9 +615,14 @@ def _get_host_as_server(self, variables: Dict[str, Any]) -> RemoteNode:
         ip = variables.get("baremetal_host_ip", "")
         username = variables.get("baremetal_host_username", "")
         passwd = variables.get("baremetal_host_password", "")
+        private_key = variables.get("baremetal_host_private_key_file", "")

-        if not (ip and username and passwd):
-            raise SkippedException("Server-Node details are not provided")
+        if not (ip and username and (passwd or private_key)):
+            raise SkippedException(
+                "Server-Node details are not provided. Required: "
+                "baremetal_host_ip, baremetal_host_username, and either "
+                "baremetal_host_password or baremetal_host_private_key_file"
+            )

         server = RemoteNode(
             runbook=schema.Node(name="baremetal-host"),
@@ -584,7 +636,11 @@ def _get_host_as_server(self, variables: Dict[str, Any]) -> RemoteNode:
             public_port=22,
             username=username,
             password=passwd,
+            private_key_file=private_key,
         )
+        # Track baremetal host for cleanup
+        if server not in self._baremetal_hosts:
+            self._baremetal_hosts.append(server)
         return server

     def _get_host_nic_name(self, node: RemoteNode) -> str:
@@ -620,26 +676,28 @@ def _get_host_nic_name(self, node: RemoteNode) -> str:
     def after_case(self, log: Logger, **kwargs: Any) -> None:
         environment: Environment = kwargs.pop("environment")

+        # Combine environment nodes and baremetal host nodes for cleanup
+        all_nodes = list(environment.nodes.list())
+        if self._baremetal_hosts:
+            all_nodes.extend(self._baremetal_hosts)
+
         # use these cleanup functions
-        def do_process_cleanup(process: str) -> None:
-            cleanup_process(environment, process)
+        def do_process_cleanup(process: str, node: Node) -> None:
+            # Kill the process on the specific node
+            kill = node.tools[Kill]
+            kill.by_name(process, ignore_not_exist=True)

         def do_sysctl_cleanup(node: Node) -> None:
             node.tools[Sysctl].reset()

-        # to run parallel cleanup of processes and sysctl settings
-        run_in_parallel(
-            [
-                partial(do_process_cleanup, x)
-                for x in [
-                    "lagscope",
-                    "netperf",
-                    "netserver",
-                    "ntttcp",
-                    "iperf3",
-                ]
-            ]
-        )
-        run_in_parallel(
-            [partial(do_sysctl_cleanup, x) for x in environment.nodes.list()]
-        )
+        # to run parallel cleanup of processes on all nodes
+        cleanup_tasks: List[Callable[[], None]] = []
+        for process in ["lagscope", "netperf", "netserver", "ntttcp", "iperf3"]:
+            for node in all_nodes:
+                cleanup_tasks.append(partial(do_process_cleanup, process, node))
+
+        run_in_parallel(cleanup_tasks)
+        run_in_parallel([partial(do_sysctl_cleanup, x) for x in all_nodes])
+
+        # Clear the baremetal hosts list after cleanup
+        self._baremetal_hosts.clear()
