-
Notifications
You must be signed in to change notification settings - Fork 5
Fix check for draining #458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,25 +10,27 @@ | |
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| def can_be_migrated(server: Server): | ||
| if server.flavor.name.startswith("g-") or server.flavor.name.startswith("f-"): | ||
| raise ValueError( | ||
| f"Attempted to move GPU or FPGA flavor, {server.flavor.name}, which is not allowed!" | ||
| ) | ||
| def can_be_migrated(server: Server, check_flavor: bool): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put a default value here as
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it was the other way around, before this PR we always did the check as it was always a live migration |
||
| if check_flavor: | ||
| if server.flavor.name.startswith("g-") or server.flavor.name.startswith("f-"): | ||
| raise ValueError( | ||
| f"Attempted to move GPU or FPGA flavor, {server.flavor.name}, which is not allowed!" | ||
| ) | ||
| if server.flavor.vcpus > 60: | ||
| raise ValueError( | ||
| f"Attempted to move flavor with greater than 60 cores, {server.flavor.name}, which is not allowed!" | ||
| ) | ||
| if server.status not in ["ACTIVE", "SHUTOFF"]: | ||
| raise ValueError( | ||
| f"Server status: {server.status}. The server must be ACTIVE or SHUTOFF to be migrated" | ||
| ) | ||
| if server.flavor.vcpus > 60: | ||
| raise ValueError( | ||
| f"Attempted to move flavor with greater than 60 cores, {server.flavor.name}, which is not allowed!" | ||
| ) | ||
|
|
||
|
|
||
| def snapshot_and_migrate_server( | ||
| conn: Connection, | ||
| server_id: str, | ||
| snapshot: bool, | ||
| check_flavor: bool, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do the same here |
||
| dest_host: Optional[str] = None, | ||
| ) -> None: | ||
| """ | ||
|
|
@@ -38,9 +40,10 @@ def snapshot_and_migrate_server( | |
| :param server_status: Status of machine to migrate - must be ACTIVE or SHUTOFF | ||
| :param flavor_name: Server flavor name | ||
| :param dest_host: Optional host to migrate to, otherwise chosen by scheduler | ||
| :param check_flavor: boolean. When value is True, GPUs, FPGAs and Flavors with many CPUs will not be migrated | ||
| """ | ||
| server = conn.compute.get_server(server_id) | ||
| can_be_migrated(server) | ||
| can_be_migrated(server, check_flavor) | ||
| if snapshot: | ||
| snapshot_server(conn=conn, server_id=server_id) | ||
| time.sleep(10) # Ensure server task status has updated after snapshot | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,7 @@ def test_active_migration( | |
| server_id=mock_server_id, | ||
| dest_host=dest_host, | ||
| snapshot=True, | ||
| check_flavor=True, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. change this to |
||
| ) | ||
| mock_snapshot_server.assert_called_once_with( | ||
| conn=mock_connection, server_id=mock_server_id | ||
|
|
@@ -79,6 +80,7 @@ def test_shutoff_migration( | |
| server_id=mock_server_id, | ||
| dest_host=dest_host, | ||
| snapshot=True, | ||
| check_flavor=True, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. dont need this if you put a default value |
||
| ) | ||
| mock_snapshot_server.assert_called_once_with( | ||
| conn=mock_connection, server_id=mock_server_id | ||
|
|
@@ -119,6 +121,7 @@ def test_active_migration_failed_wait_for_status( | |
| server_id=mock_server_id, | ||
| dest_host=None, | ||
| snapshot=True, | ||
| check_flavor=True, | ||
| ) | ||
| mock_snapshot_server.assert_called_once_with( | ||
| conn=mock_connection, server_id=mock_server_id | ||
|
|
@@ -150,6 +153,7 @@ def test_no_snapshot_migration(mock_wait_for_migration_status, mock_snapshot_ser | |
| conn=mock_connection, | ||
| server_id=mock_server_id, | ||
| snapshot=False, | ||
| check_flavor=True, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't need this anymore if you put a default value |
||
| ) | ||
| mock_snapshot_server.assert_not_called() | ||
| mock_connection.compute.live_migrate_server.assert_called_once_with( | ||
|
|
@@ -181,6 +185,7 @@ def test_migration_fail(mock_snapshot_server): | |
| conn=mock_connection, | ||
| server_id=mock_server_id, | ||
| snapshot=True, | ||
| check_flavor=True, | ||
| ) | ||
| mock_snapshot_server.assert_not_called() | ||
| mock_connection.compute.live_migrate_server.assert_not_called() | ||
|
|
@@ -242,6 +247,7 @@ def test_raise_invalid_migration(flavor_name, flavor_vcpu): | |
| server_id=mock_server_id, | ||
| dest_host=None, | ||
| snapshot=True, | ||
| check_flavor=True, | ||
| ) | ||
|
|
||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, I would call it
live_migrationThen the tick box (default) would do a live migration on CPU instances <60 cores
Unticking it would do a cold migration which can be done on anything but will shutdown while it happens
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That may be a good idea for the long term, as a separate flag.
Note that the purpose here is slightly different. We are NOT doing live migrations during the outage week in July.
This change is just to allow the migration of VMs with flavors for GPUs/FPGAs and/or more that 60 CPUs, which are currently blocked by st2. This PR allows a way to keep that check for those types of flavors in place, to avoid trying risky migrations by mistake, but at the same time having a mechanism to force the migration this time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah we need to do a "cold migration" for these types of flavors
But we'll need to do this post-outage too (e.g. HW failure) and the steps will be the same
The fact it skips the checks is an implementation detail - we just want a way to force drain these types of flavors which will incur an offline