diff --git a/examples/example_data_collection_vx300s.py b/examples/example_data_collection_vx300s.py index 51da710fd..dc3c03a97 100644 --- a/examples/example_data_collection_vx300s.py +++ b/examples/example_data_collection_vx300s.py @@ -28,7 +28,7 @@ def main(args): if record: nc.create_dataset( - name="My Example Dataset", + name="Test ice roxy-2", description="This is an example dataset", ) print("Created Dataset...") diff --git a/neuracore-dictionary.txt b/neuracore-dictionary.txt index d8855827a..453f672f7 100644 --- a/neuracore-dictionary.txt +++ b/neuracore-dictionary.txt @@ -6,6 +6,7 @@ addfinalizer addinivalue agentview agilex +iceoryx aiolimiter aiortc allclose @@ -16,6 +17,7 @@ attns autocast Autoencoders autoregressively +uninit autoset autouse avgpool diff --git a/neuracore/core/streaming/data_stream.py b/neuracore/core/streaming/data_stream.py index e895d98eb..fc9316393 100644 --- a/neuracore/core/streaming/data_stream.py +++ b/neuracore/core/streaming/data_stream.py @@ -150,11 +150,11 @@ def stop_recording( if producer_channel.trace_id: producer_channel.cleanup_producer_channel( stop_cutoff_sequence_number=stop_cutoff_sequence_number, - wait_for_slot_drain=wait_for_producer_drain, + wait_for_transport_drain=wait_for_producer_drain, ) finally: producer_channel.stop_producer_channel( - wait_for_slot_drain=wait_for_producer_drain, + wait_for_transport_drain=wait_for_producer_drain, ) def is_recording(self) -> bool: diff --git a/neuracore/data_daemon/communications_management/consumer/bridge_chunk_spool.py b/neuracore/data_daemon/communications_management/consumer/bridge_chunk_spool.py index 97b12f4d0..8f799b5c4 100644 --- a/neuracore/data_daemon/communications_management/consumer/bridge_chunk_spool.py +++ b/neuracore/data_daemon/communications_management/consumer/bridge_chunk_spool.py @@ -1,4 +1,4 @@ -"""Disk-backed chunk spool for shared-slot completion work.""" +"""Disk-backed chunk spool for video-frame completion work.""" from __future__ import 
annotations @@ -43,7 +43,7 @@ def __init__( ) def append(self, chunk: bytes | bytearray | memoryview) -> ChunkSpoolRef: - """Copy one chunk from shared memory into the spool and return its ref.""" + """Copy one video chunk into the spool and return its ref.""" chunk_view = chunk if isinstance(chunk, memoryview) else memoryview(chunk) chunk_len = len(chunk_view) diff --git a/neuracore/data_daemon/communications_management/consumer/completion_worker.py b/neuracore/data_daemon/communications_management/consumer/completion_worker.py index 93749ff81..e514bef6c 100644 --- a/neuracore/data_daemon/communications_management/consumer/completion_worker.py +++ b/neuracore/data_daemon/communications_management/consumer/completion_worker.py @@ -226,7 +226,7 @@ def _release_partial_refs(self, partial: SpoolPartialMessage) -> None: class CompletionWorker: - """Non-blocking sharded completion pipeline for shared-slot ingest.""" + """Non-blocking sharded completion pipeline for video-frame ingest.""" def __init__( self, diff --git a/neuracore/data_daemon/communications_management/consumer/data_bridge.py b/neuracore/data_daemon/communications_management/consumer/data_bridge.py index e4fbaa145..84089817f 100644 --- a/neuracore/data_daemon/communications_management/consumer/data_bridge.py +++ b/neuracore/data_daemon/communications_management/consumer/data_bridge.py @@ -24,7 +24,7 @@ ) from ..shared_transport.communications_manager import CommunicationsManager -from ..shared_transport.shared_slot_daemon_handler import SharedSlotDaemonHandler +from ..shared_transport.iox2_daemon_drain import Iox2DaemonDrain from .completion_worker import CompletionWorker from .helpers import str_or_none from .models import ( @@ -32,15 +32,27 @@ ChannelState, ClosedProducerRegistry, RecordingDataDropRequest, - SharedSlotSequenceProgressRequest, TraceMetadataRegistrationRequest, TraceMetadataSnapshot, TraceRecordingLookupRequest, TransportMode, + VideoFrameSequenceProgressRequest, ) from .spool_worker 
import SpoolWorker from .trace_lifecycle_coordinator import TraceLifecycleCoordinator +# Data types whose frames travel over the iceoryx2 zero-copy video transport +# instead of the ZMQ control channel. +_VIDEO_DATA_TYPES = frozenset( + {DataType.RGB_IMAGES, DataType.DEPTH_IMAGES, DataType.POINT_CLOUDS} +) + + +def _data_type_uses_video_transport(data_type: DataType | None) -> bool: + """Return True when the channel's frames arrive over iceoryx2.""" + return data_type in _VIDEO_DATA_TYPES + + RecordingDiskManager = rdm_module.RecordingDiskManager logger = logging.getLogger(__name__) @@ -77,7 +89,7 @@ def __init__( self.recording_disk_manager = recording_disk_manager self.channels = ChannelRegistry() self._closed_producers = ClosedProducerRegistry() - self._shared_slot_handler = SharedSlotDaemonHandler(self.comm) + self._iox2_drain = Iox2DaemonDrain() self._spool_admission = threading.BoundedSemaphore(DEFAULT_MAX_SPOOLED_CHUNKS) self._completion_worker = CompletionWorker( recording_disk_manager=self.recording_disk_manager, @@ -90,13 +102,12 @@ def __init__( ) self._spool_worker = SpoolWorker( root=get_daemon_recordings_root_path() / ".bridge_chunk_spool", - shared_slot_handler=self._shared_slot_handler, completion_worker=self._completion_worker, acquire_spool_admission=self._spool_admission.acquire, release_spool_admission=self._spool_admission.release, should_drop_recording_data=self._trace_lifecycle.should_drop_recording_data, mark_sequence_completed=( - self._trace_lifecycle.mark_shared_slot_sequence_completed + self._trace_lifecycle.mark_video_frame_sequence_completed ), register_trace=self._trace_lifecycle.register_trace, register_trace_metadata=self._trace_lifecycle.register_trace_metadata, @@ -105,8 +116,6 @@ def __init__( shard_count=4, ) self._command_handlers: dict[CommandType, CommandHandler] = { - CommandType.OPEN_FIXED_SHARED_SLOTS: self._handle_open_fixed_shared_slots, - CommandType.SHARED_SLOT_DESCRIPTOR: self._handle_shared_slot_descriptor, 
CommandType.DATA_CHUNK: self._handle_write_data_chunk, CommandType.BATCHED_JOINT_DATA: self._handle_batched_joint_data, CommandType.HEARTBEAT: self._handle_heartbeat, @@ -145,14 +154,15 @@ def run(self) -> None: if raw: self.process_raw_message(raw) + self._iox2_drain.drain_all(self._on_iox2_frame) self._cleanup_expired_channels() except KeyboardInterrupt: logger.info("Shutting down daemon...") finally: + self._iox2_drain.close() self._spool_worker.close() self._completion_worker.close() self._spool_worker.cleanup() - self._shared_slot_handler.close() self.comm.cleanup_daemon() def stop( @@ -208,16 +218,12 @@ def handle_message(self, message: MessageEnvelope) -> None: self._trace_lifecycle.handle_recording_stopped(message) return - if ( - self._closed_producers.contains(producer_id) - and cmd != CommandType.OPEN_FIXED_SHARED_SLOTS - ): - return - - if ( - cmd == CommandType.OPEN_FIXED_SHARED_SLOTS - and self._closed_producers.contains(producer_id) - ): + if self._closed_producers.contains(producer_id): + # A heartbeat from a previously closed producer means it has come + # back for a new recording session; revive the channel. Any other + # late command from a closed producer is ignored. 
+ if cmd != CommandType.HEARTBEAT: + return self._closed_producers.discard(producer_id) existing = self.channels.get(producer_id) @@ -256,75 +262,46 @@ def handle_message(self, message: MessageEnvelope) -> None: producer_id, ) - def _handle_open_fixed_shared_slots( - self, channel: ChannelState, message: MessageEnvelope - ) -> None: - """Handle an OPEN_FIXED_SHARED_SLOTS command from a producer.""" - payload = message.payload.get(message.command.value, {}) - previous_trace_id = channel.trace_id - self._shared_slot_handler.handle_open( - channel, - payload, - on_abandoned_sequences=self._handle_abandoned_shared_slot_sequences, - ) - if previous_trace_id is not None: - self.channels.set_trace_id(channel, None) - - def _handle_shared_slot_descriptor( - self, channel: ChannelState, message: MessageEnvelope + def _on_iox2_frame( + self, + channel_id: str, + sequence_id: int, + metadata: dict, + chunk: bytes, ) -> None: - """Queue one shared-slot descriptor for sharded spool processing.""" - descriptor_payload = message.payload.get(message.command.value, {}) - sequence_number = message.sequence_number - if sequence_number is None: - raise ValueError("Shared-slot descriptor missing sequence_number") - - descriptor = self._shared_slot_handler.mark_descriptor_pending( - channel, - descriptor_payload, - ) - self._mark_shared_slot_sequence_pending( - SharedSlotSequenceProgressRequest( - producer_id=channel.producer_id, - sequence_number=sequence_number, + """Handle one video frame drained from an iceoryx2 subscriber. + + Advances the producer sequence (so end-of-recording cutoffs account for + video frames), marks the sequence pending, and hands the decoded chunk to + the spool worker. The spool worker marks the sequence completed once the + chunk has been enqueued to the completion worker, which preserves the + ordering guarantee that a trace is never finalized before its frames are + spooled. 
+ """ + channel = self.channels.get(channel_id) + if channel is None: + logger.debug( + "Iox2 frame for unknown channel channel_id=%s sequence_id=%s", + channel_id, + sequence_id, ) + return + + if sequence_id > channel.last_sequence_number: + channel.last_sequence_number = sequence_id + self._trace_lifecycle.set_max_producer_sequence(channel_id, sequence_id) + + request = VideoFrameSequenceProgressRequest( + producer_id=channel_id, + sequence_number=sequence_id, ) + self._mark_video_frame_sequence_pending(request) try: - self._spool_worker.enqueue(channel, descriptor_payload) + self._spool_worker.enqueue_frame(channel, sequence_id, metadata, chunk) except Exception: - self._shared_slot_handler.mark_descriptor_completed( - channel.producer_id, - descriptor, - ) - self._mark_shared_slot_sequence_completed( - SharedSlotSequenceProgressRequest( - producer_id=channel.producer_id, - sequence_number=sequence_number, - ) - ) + self._mark_video_frame_sequence_completed(request) raise - def _handle_abandoned_shared_slot_sequences( - self, - producer_id: str, - sequence_numbers: list[int], - ) -> None: - """Unblock recording finalization for abandoned shared-slot descriptors.""" - if not sequence_numbers: - return - for sequence_number in sequence_numbers: - self._mark_shared_slot_sequence_completed( - SharedSlotSequenceProgressRequest( - producer_id=producer_id, - sequence_number=sequence_number, - ) - ) - self._trace_lifecycle.set_max_producer_sequence( - producer_id, - max(sequence_numbers), - ) - self._trace_lifecycle.finalize_closing_recordings() - def _on_complete_message( self, channel: ChannelState, @@ -377,27 +354,54 @@ def _on_complete_message( channel.producer_id, ) - def _handle_heartbeat(self, channel: ChannelState, _: MessageEnvelope) -> None: - """Update the heartbeat timestamp for a producer. 
+ def _handle_heartbeat( + self, channel: ChannelState, message: MessageEnvelope + ) -> None: + """Update heartbeat state and register video subscribers on first contact. - This does not perform any logic beyond updating the timestamp, so it is - suitable for use in a high-throughput system. + The heartbeat payload must carry the producer's data type. For video + data types the daemon creates the matching iceoryx2 subscriber + (idempotent), so frames published by the producer are received. + Non-video channels fall back to the ZMQ socket transport. """ channel.touch() - if channel.transport_mode is TransportMode.NONE: + + raw_data_type = message.payload.get("data_type") if message.payload else None + if raw_data_type is None: + logger.warning( + "HEARTBEAT from producer_id=%s missing required data_type", + channel.producer_id, + ) + return + try: + data_type = DataType(raw_data_type) + except ValueError: + logger.warning( + "HEARTBEAT from producer_id=%s carried unknown data_type=%r", + channel.producer_id, + raw_data_type, + ) + return + + channel.data_type = data_type + + if _data_type_uses_video_transport(data_type): + channel.mark_video_transport_open() + self._iox2_drain.register_channel(channel.producer_id) + elif channel.transport_mode is TransportMode.NONE: channel.mark_socket_transport_open() - def _mark_shared_slot_sequence_pending( - self, request: SharedSlotSequenceProgressRequest + def _mark_video_frame_sequence_pending( + self, request: VideoFrameSequenceProgressRequest ) -> None: - """Record that one shared-slot descriptor still needs spool processing.""" - self._trace_lifecycle.mark_shared_slot_sequence_pending(request) + """Record that one video frame still needs spool processing.""" + self._trace_lifecycle.mark_video_frame_sequence_pending(request) - def _mark_shared_slot_sequence_completed( - self, request: SharedSlotSequenceProgressRequest + def _mark_video_frame_sequence_completed( + self, request: VideoFrameSequenceProgressRequest ) -> None: - 
"""Record that one shared-slot descriptor reached completion handoff.""" - self._trace_lifecycle.mark_shared_slot_sequence_completed(request) + """Record that one video frame reached completion handoff.""" + self._trace_lifecycle.mark_video_frame_sequence_completed(request) def _should_drop_recording_data(self, request: RecordingDataDropRequest) -> bool: """Return True when recording state says this data should be dropped.""" @@ -563,24 +567,18 @@ def _handle_end_trace( *, reason: str = "producer_trace_end", ) -> None: - """Handle an END_TRACE command from a producer.""" + """Handle an END_TRACE command from a producer. + + TRACE_END is sent over ZMQ after all video frames have been published to + iceoryx2, so any frame still buffered in the ring was produced before + this trace ended. Drain them now (marking their sequences pending) before + finalizing, so finalization defers until those frames are spooled. The + iceoryx2 subscriber itself is left in place; it persists for the lifetime + of the channel across recording sessions. 
+ """ + self._iox2_drain.drain_all(self._on_iox2_frame) self._trace_lifecycle.handle_trace_end(channel, message) - sequence_number = message.sequence_number - has_pending_shared_slots = ( - sequence_number is not None - and self._trace_lifecycle.has_pending_shared_slot_sequences_at_or_before( - channel.producer_id, - sequence_number, - ) - ) - if ( - channel.uses_shared_memory_transport() - and channel.shared_slot.shm_name is not None - and not has_pending_shared_slots - ): - self._shared_slot_handler.cleanup_channel_resources(channel) - def cleanup_channel_on_trace_written( self, trace_id: str, @@ -600,22 +598,10 @@ def cleanup_channel_on_trace_written( channel = self.channels.get_by_trace_id(trace_id) if channel is not None: - if channel.uses_shared_memory_transport() and ( - channel.shared_slot.shm_name is not None - or channel.socket_pending_messages - ): - logger.debug( - "Cleaning up channel after TRACE_WRITTEN producer_id=%s " - "trace_id=%s shm_name=%s pending_partial_traces=%d", - channel.producer_id, - trace_id, - channel.shared_slot.shm_name, - len(channel.socket_pending_messages), - ) + # Keep the iceoryx2 subscriber registered: the channel may record + # further traces. It is unregistered only when the channel itself is + # removed on heartbeat expiry. 
self.channels.set_trace_id(channel, None) - if channel.uses_shared_memory_transport(): - self._shared_slot_handler.cleanup_channel_resources(channel) - channel.clear_transport_state() def _cleanup_expired_channels(self) -> None: """Remove channels whose heartbeat has not been seen within the timeout.""" @@ -649,7 +635,7 @@ def _cleanup_expired_channels(self) -> None: continue if cutoff_sequence_number is not None and ( - self._trace_lifecycle.has_pending_shared_slot_sequences_at_or_before( + self._trace_lifecycle.has_pending_video_frame_sequences_at_or_before( producer_id, cutoff_sequence_number, ) @@ -685,8 +671,7 @@ def _cleanup_expired_channels(self) -> None: self._trace_lifecycle.set_max_producer_sequence( channel.producer_id, channel.last_sequence_number ) - if channel.uses_shared_memory_transport(): - self._shared_slot_handler.cleanup_channel_resources(channel) + self._iox2_drain.unregister_channel(producer_id) channel.clear_transport_state() self.channels.remove(producer_id) self._closed_producers.add(producer_id) diff --git a/neuracore/data_daemon/communications_management/consumer/models.py b/neuracore/data_daemon/communications_management/consumer/models.py index 28860b2a7..c114e93ba 100644 --- a/neuracore/data_daemon/communications_management/consumer/models.py +++ b/neuracore/data_daemon/communications_management/consumer/models.py @@ -114,20 +114,7 @@ class TransportMode(str, Enum): NONE = "none" SOCKET = "socket" - SHARED_MEMORY = "shared_memory" - - -@dataclass -class SharedSlotTransportState: - """Mutable daemon-side shared-slot attachment state for one channel.""" - - control_endpoint: str | None = None - shm_name: str | None = None - - def reset(self) -> None: - """Clear any shared-slot attachment details for the channel.""" - self.control_endpoint = None - self.shm_name = None + VIDEO = "video" @dataclass(frozen=True) @@ -220,11 +207,19 @@ class FinalTraceWork: @dataclass(frozen=True) -class SpoolDescriptorWork: - """Spool-worker input for one 
shared-slot descriptor payload.""" +class DecodedFrameWork: + """Spool-worker input for one already-decoded iceoryx2 video frame. + + The frame's payload has already been copied out of the iceoryx2 ring buffer + by the daemon drain, so no transport-buffer read is needed here. ``metadata`` is + the per-chunk metadata dict and ``sequence_id`` is the channel sequence used + for end-of-recording ordering and drop decisions. + """ channel: ChannelState - descriptor_payload: dict + sequence_id: int + metadata: dict + chunk: bytes @dataclass(frozen=True) @@ -278,8 +273,8 @@ class TraceRecordingLookupRequest: @dataclass(frozen=True) -class SharedSlotSequenceProgressRequest: - """Request to update shared-slot descriptor progress for one producer.""" +class VideoFrameSequenceProgressRequest: + """Request to update video-frame spool progress for one producer.""" producer_id: str sequence_number: int @@ -378,21 +373,21 @@ def set_max(self, producer_id: str, sequence_number: int) -> None: @dataclass -class PendingSharedSlotSequenceRegistry: - """Track shared-slot descriptors still pending completion per producer.""" +class PendingVideoFrameSequenceRegistry: + """Track video-frame sequences still pending completion per producer.""" _pending_by_producer: dict[str, set[int]] = field(default_factory=dict) _lock: threading.Lock = field(default_factory=threading.Lock) def add(self, producer_id: str, sequence_number: int) -> None: - """Record a descriptor sequence as pending completion.""" + """Record a video-frame sequence as pending completion.""" with self._lock: self._pending_by_producer.setdefault(producer_id, set()).add( sequence_number ) def complete(self, producer_id: str, sequence_number: int) -> None: - """Mark a pending descriptor sequence as completed.""" + """Mark a pending video-frame sequence as completed.""" with self._lock: pending = self._pending_by_producer.get(producer_id) if pending is None: @@ -426,10 +421,8 @@ class ChannelState: opened_at: datetime | None = 
None heartbeat_expired_at: datetime | None = None transport_mode: TransportMode = TransportMode.NONE + data_type: DataType | None = None socket_pending_messages: dict[str, PartialMessage] = field(default_factory=dict) - shared_slot: SharedSlotTransportState = field( - default_factory=SharedSlotTransportState - ) def touch(self) -> None: """Refresh heartbeat tracking for this channel.""" @@ -446,34 +439,20 @@ def mark_socket_transport_open(self) -> None: if self.opened_at is None: self.opened_at = datetime.now(timezone.utc) - def mark_shared_slot_transport_open( - self, - *, - control_endpoint: str, - shm_name: str, - ) -> None: - """Mark the channel as using shared-slot transport.""" - self.transport_mode = TransportMode.SHARED_MEMORY - self.shared_slot.control_endpoint = control_endpoint - self.shared_slot.shm_name = shm_name - self.opened_at = datetime.now(timezone.utc) - - def mark_shared_slot_descriptor_seen(self, *, shm_name: str) -> None: - """Record that shared-slot traffic has been observed for this channel.""" - self.transport_mode = TransportMode.SHARED_MEMORY - self.shared_slot.shm_name = shm_name + def mark_video_transport_open(self) -> None: + """Mark the channel as using the iceoryx2 zero-copy video transport.""" + self.transport_mode = TransportMode.VIDEO if self.opened_at is None: self.opened_at = datetime.now(timezone.utc) - def uses_shared_memory_transport(self) -> bool: - """Return whether the channel is currently using shared-slot transport.""" - return self.transport_mode is TransportMode.SHARED_MEMORY + def uses_video_transport(self) -> bool: + """Return whether the channel is using the iceoryx2 video transport.""" + return self.transport_mode is TransportMode.VIDEO def clear_transport_state(self) -> None: """Reset transport-specific state after a trace finishes or closes.""" self.opened_at = None self.transport_mode = TransportMode.NONE - self.shared_slot.reset() self.socket_pending_messages.clear() def add_socket_data_chunk( diff --git 
a/neuracore/data_daemon/communications_management/consumer/spool_worker.py b/neuracore/data_daemon/communications_management/consumer/spool_worker.py index 850c3a074..bb88f55e1 100644 --- a/neuracore/data_daemon/communications_management/consumer/spool_worker.py +++ b/neuracore/data_daemon/communications_management/consumer/spool_worker.py @@ -1,4 +1,10 @@ -"""Shared-slot spool worker that copies daemon payloads before ACKing them.""" +"""Spool worker that persists decoded video frames before completion. + +Frames arrive already copied out of the iceoryx2 ring buffer by the daemon +drain, so this worker only spools the chunk to disk, resolves recording +metadata, applies drop policy, and hands the chunk to the completion worker. +Work is sharded by producer so ordering is preserved per channel. +""" from __future__ import annotations @@ -9,44 +15,36 @@ from collections.abc import Callable from pathlib import Path -from neuracore.data_daemon.communications_management.shared_transport.models import ( - SharedSlotTransportResult, -) -from neuracore.data_daemon.models import SharedSlotDescriptor +from neuracore.data_daemon.models import VideoTransportChunkMetadata -from ..shared_transport.shared_slot_daemon_handler import ( - SharedSlotDaemonHandler, - SharedSlotDescriptorAbandoned, -) -from .bridge_chunk_spool import BridgeChunkSpool, ChunkSpoolRef +from .bridge_chunk_spool import BridgeChunkSpool from .completion_worker import CompletionWorker from .models import ( ChannelState, CompletionChunkWork, + DecodedFrameWork, RecordingDataDropRequest, - SharedSlotSequenceProgressRequest, - SpoolDescriptorWork, TraceMetadataRegistrationRequest, TraceMetadataSnapshot, TraceRecordingLookupRequest, + VideoFrameSequenceProgressRequest, ) logger = logging.getLogger(__name__) class _SpoolShard: - """One spool shard that copies shared-slot chunks before ACKing them.""" + """One spool shard that copies decoded frames before completing them.""" def __init__( self, *, chunk_spool: 
BridgeChunkSpool, - shared_slot_handler: SharedSlotDaemonHandler, completion_worker: CompletionWorker, acquire_spool_admission: Callable[[], object], release_spool_admission: Callable[[], None], should_drop_recording_data: Callable[[RecordingDataDropRequest], bool], - mark_sequence_completed: Callable[[SharedSlotSequenceProgressRequest], None], + mark_sequence_completed: Callable[[VideoFrameSequenceProgressRequest], None], register_trace: Callable[[str, str], None], register_trace_metadata: Callable[[TraceMetadataRegistrationRequest], None], get_trace_recording: Callable[[TraceRecordingLookupRequest], str | None], @@ -54,7 +52,6 @@ def __init__( shard_index: int, ) -> None: self._chunk_spool = chunk_spool - self._shared_slot_handler = shared_slot_handler self._completion_worker = completion_worker self._acquire_spool_admission = acquire_spool_admission self._release_spool_admission = release_spool_admission @@ -64,7 +61,7 @@ def __init__( self._register_trace_metadata = register_trace_metadata self._get_trace_recording = get_trace_recording self._set_channel_trace_id = set_channel_trace_id - self._queue: queue.Queue[SpoolDescriptorWork | None] = queue.Queue(maxsize=32) + self._queue: queue.Queue[DecodedFrameWork | None] = queue.Queue(maxsize=32) self._error: Exception | None = None self._error_lock = threading.Lock() self._thread = threading.Thread( @@ -74,11 +71,10 @@ def __init__( ) self._thread.start() - def enqueue(self, channel: ChannelState, descriptor_payload: dict) -> None: + def enqueue(self, work: DecodedFrameWork) -> None: + """Queue one decoded frame for spool processing.""" self._ensure_running() - self._queue.put( - SpoolDescriptorWork(channel=channel, descriptor_payload=descriptor_payload) - ) + self._queue.put(work) def close(self) -> None: self._queue.put(None) @@ -109,66 +105,24 @@ def _worker_loop(self) -> None: finally: self._queue.task_done() - def _get_transport_result( - self, work: SpoolDescriptorWork - ) -> tuple[SharedSlotTransportResult, 
bool] | None: - """Gets a transport result from the shared-slot handler.""" - descriptor: SharedSlotDescriptor | None = None - release_admission = True - - try: - transport_result = self._shared_slot_handler.handle_descriptor( - work.channel, - work.descriptor_payload, - self._chunk_spool, - ) - descriptor = transport_result.descriptor - release_admission = False - except SharedSlotDescriptorAbandoned: - descriptor = self._descriptor_from_payload_or_none(work.descriptor_payload) - logger.warning( - "Skipping abandoned shared-slot descriptor " - "producer_id=%s shm_name=%s sequence_id=%s", - work.channel.producer_id, - ( - descriptor.shm_name - if descriptor is not None - else work.descriptor_payload.get("shm_name") - ), - ( - descriptor.sequence_id - if descriptor is not None - else work.descriptor_payload.get("sequence_id") - ), - ) - return None - finally: - if descriptor is None: - descriptor = self._descriptor_from_payload_or_none( - work.descriptor_payload - ) - if descriptor is not None: - self._shared_slot_handler.mark_descriptor_completed( - work.channel.producer_id, - descriptor, - ) - return (transport_result, release_admission) + def _process(self, work: DecodedFrameWork) -> None: + """Spool one decoded frame and hand it to the completion worker. - def _process(self, work: SpoolDescriptorWork) -> None: + Admission and the chunk-spool reference are owned by this method until + the chunk is handed to the completion worker, which then releases both + once the chunk has been materialized. On drop/error paths this method + releases them itself. The channel sequence is always marked completed so + end-of-recording finalization never stalls on this frame. 
+ """ self._acquire_spool_admission() - chunk_spool_ref: ChunkSpoolRef | None = None - release_admission = True - + chunk_spool_ref = None + handed_off = False try: - transport_result_data = self._get_transport_result(work) - if transport_result_data is None: - return + chunk_metadata = VideoTransportChunkMetadata.from_dict(work.metadata) + chunk_spool_ref = self._chunk_spool.append(work.chunk) - transport_result, release_admission = transport_result_data - descriptor = transport_result.descriptor - chunk_metadata = transport_result.chunk_metadata - trace_id = transport_result.trace_id - trace_metadata = transport_result.trace_metadata + trace_id = chunk_metadata.trace_id + trace_metadata = chunk_metadata.trace_metadata recording_id = self._get_trace_recording( TraceRecordingLookupRequest(trace_id=trace_id) @@ -177,20 +131,12 @@ def _process(self, work: SpoolDescriptorWork) -> None: recording_id = trace_metadata.recording_id if recording_id is None: - self._release_chunk_ref(transport_result.chunk_spool_ref) - chunk_spool_ref = None - self._mark_sequence_completed( - SharedSlotSequenceProgressRequest( - producer_id=work.channel.producer_id, - sequence_number=descriptor.sequence_id, - ) - ) logger.debug( - "Shared-slot packet missing recording metadata " - "trace_id=%s producer_id=%s sequence_id=%s", + "Decoded frame missing recording metadata trace_id=%s " + "producer_id=%s sequence_id=%s", trace_id, work.channel.producer_id, - descriptor.sequence_id, + work.sequence_id, ) return @@ -199,17 +145,9 @@ def _process(self, work: SpoolDescriptorWork) -> None: channel=work.channel, recording_id=recording_id, trace_id=trace_id, - sequence_number=descriptor.sequence_id, + sequence_number=work.sequence_id, ) ): - self._release_chunk_ref(transport_result.chunk_spool_ref) - chunk_spool_ref = None - self._mark_sequence_completed( - SharedSlotSequenceProgressRequest( - producer_id=work.channel.producer_id, - sequence_number=descriptor.sequence_id, - ) - ) return 
self._set_channel_trace_id(work.channel, trace_id) @@ -238,57 +176,44 @@ def _process(self, work: SpoolDescriptorWork) -> None: recording_id=str(recording_id), chunk_index=chunk_metadata.chunk_index, total_chunks=chunk_metadata.total_chunks, - sequence_number=descriptor.sequence_id, + sequence_number=work.sequence_id, chunk_spool=self._chunk_spool, - chunk_spool_ref=transport_result.chunk_spool_ref, + chunk_spool_ref=chunk_spool_ref, trace_metadata=trace_metadata, fallback_data_type=( trace_metadata.data_type if trace_metadata is not None else None ), ) ) + handed_off = True + finally: + if not handed_off: + if chunk_spool_ref is not None: + self._chunk_spool.release(chunk_spool_ref) + self._release_spool_admission() + # Mark the sequence completed only after the chunk has been enqueued + # to the completion worker (when handed off), so finalization never + # enqueues a final-trace marker ahead of this chunk. self._mark_sequence_completed( - SharedSlotSequenceProgressRequest( + VideoFrameSequenceProgressRequest( producer_id=work.channel.producer_id, - sequence_number=descriptor.sequence_id, + sequence_number=work.sequence_id, ) ) - chunk_spool_ref = None - finally: - if chunk_spool_ref is not None: - self._release_chunk_ref(chunk_spool_ref) - elif release_admission: - self._release_spool_admission() - - @staticmethod - def _descriptor_from_payload_or_none( - descriptor_payload: dict, - ) -> SharedSlotDescriptor | None: - try: - return SharedSlotDescriptor.from_dict(descriptor_payload) - except Exception: - return None - - def _release_chunk_ref(self, ref: ChunkSpoolRef) -> None: - try: - self._chunk_spool.release(ref) - finally: - self._release_spool_admission() class SpoolWorker: - """Route shared-slot descriptors onto per-producer spool shards.""" + """Route decoded video frames onto per-producer spool shards.""" def __init__( self, *, root: Path, - shared_slot_handler: SharedSlotDaemonHandler, completion_worker: CompletionWorker, acquire_spool_admission: 
Callable[[], object], release_spool_admission: Callable[[], None], should_drop_recording_data: Callable[[RecordingDataDropRequest], bool], - mark_sequence_completed: Callable[[SharedSlotSequenceProgressRequest], None], + mark_sequence_completed: Callable[[VideoFrameSequenceProgressRequest], None], register_trace: Callable[[str, str], None], register_trace_metadata: Callable[[TraceMetadataRegistrationRequest], None], get_trace_recording: Callable[[TraceRecordingLookupRequest], str | None], @@ -299,7 +224,6 @@ def __init__( self._shards = [ _SpoolShard( chunk_spool=BridgeChunkSpool(root / f"shard-{index:02d}"), - shared_slot_handler=shared_slot_handler, completion_worker=completion_worker, acquire_spool_admission=acquire_spool_admission, release_spool_admission=release_spool_admission, @@ -314,11 +238,23 @@ def __init__( for index in range(shard_count) ] - def enqueue(self, channel: ChannelState, descriptor_payload: dict) -> None: - """Queue one shared-slot descriptor onto its owning shard.""" - key = channel.producer_id.encode("utf-8", errors="replace") - shard = self._shards[zlib.crc32(key) % len(self._shards)] - shard.enqueue(channel, descriptor_payload) + def enqueue_frame( + self, + channel: ChannelState, + sequence_id: int, + metadata: dict, + chunk: bytes, + ) -> None: + """Queue one decoded iceoryx2 frame onto its owning shard.""" + shard = self._shard_for(channel.producer_id) + shard.enqueue( + DecodedFrameWork( + channel=channel, + sequence_id=sequence_id, + metadata=metadata, + chunk=chunk, + ) + ) def close(self) -> None: """Stop all spool shards.""" @@ -329,3 +265,7 @@ def cleanup(self) -> None: """Remove spool files created by all owned shards.""" for shard in self._shards: shard.cleanup() + + def _shard_for(self, producer_id: str) -> _SpoolShard: + key = producer_id.encode("utf-8", errors="replace") + return self._shards[zlib.crc32(key) % len(self._shards)] diff --git 
a/neuracore/data_daemon/communications_management/consumer/trace_lifecycle_coordinator.py b/neuracore/data_daemon/communications_management/consumer/trace_lifecycle_coordinator.py index 48373a31d..06a586308 100644 --- a/neuracore/data_daemon/communications_management/consumer/trace_lifecycle_coordinator.py +++ b/neuracore/data_daemon/communications_management/consumer/trace_lifecycle_coordinator.py @@ -17,20 +17,20 @@ CompletedChannelMessage, FinalChunkRegistry, FinalTraceWork, - PendingSharedSlotSequenceRegistry, PendingTraceEnd, PendingTraceEndRegistry, + PendingVideoFrameSequenceRegistry, ProducerCutoffWatchRegistry, ProducerSequenceRegistry, RecordingCloseRegistry, RecordingClosingState, RecordingDataDropRequest, - SharedSlotSequenceProgressRequest, TraceMetadataRegistrationRequest, TraceMetadataRegistry, TraceMetadataSnapshot, TraceRecordingLookupRequest, TraceRecordingRegistry, + VideoFrameSequenceProgressRequest, ) logger = logging.getLogger(__name__) @@ -57,7 +57,7 @@ def __init__( self._closing_recordings = RecordingCloseRegistry() self._producer_cutoff_watches = ProducerCutoffWatchRegistry() self._producer_last_sequence_numbers = ProducerSequenceRegistry() - self._pending_shared_slot_sequences = PendingSharedSlotSequenceRegistry() + self._pending_video_frame_sequences = PendingVideoFrameSequenceRegistry() def ensure_result_trace_registered( self, @@ -137,13 +137,13 @@ def channel_stop_cutoff_sequence_number( return None return int(cutoff) - def has_pending_shared_slot_sequences_at_or_before( + def has_pending_video_frame_sequences_at_or_before( self, producer_id: str, sequence_number: int, ) -> bool: - """Return whether shared-slot spool work is still pending up to a cutoff.""" - return self._pending_shared_slot_sequences.has_pending_at_or_before( + """Return whether video-frame spool work is still pending up to a cutoff.""" + return self._pending_video_frame_sequences.has_pending_at_or_before( producer_id, sequence_number, ) @@ -175,22 +175,22 @@ def 
register_trace_metadata( incoming_value, ) - def mark_shared_slot_sequence_pending( + def mark_video_frame_sequence_pending( self, - request: SharedSlotSequenceProgressRequest, + request: VideoFrameSequenceProgressRequest, ) -> None: - """Record that one shared-slot descriptor still needs spool processing.""" - self._pending_shared_slot_sequences.add( + """Record that one video frame still needs spool processing.""" + self._pending_video_frame_sequences.add( request.producer_id, request.sequence_number, ) - def mark_shared_slot_sequence_completed( + def mark_video_frame_sequence_completed( self, - request: SharedSlotSequenceProgressRequest, + request: VideoFrameSequenceProgressRequest, ) -> None: - """Record that one shared-slot descriptor reached completion handoff.""" - self._pending_shared_slot_sequences.complete( + """Record that one video frame reached completion handoff.""" + self._pending_video_frame_sequences.complete( request.producer_id, request.sequence_number, ) @@ -461,7 +461,7 @@ def _has_reached_sequence_cutoffs( or last_sequence_number < cutoff_sequence_number ): return False - if self._pending_shared_slot_sequences.has_pending_at_or_before( + if self._pending_video_frame_sequences.has_pending_at_or_before( producer_id, cutoff_sequence_number, ): diff --git a/neuracore/data_daemon/communications_management/producer/producer_channel.py b/neuracore/data_daemon/communications_management/producer/producer_channel.py index b6cb78be2..eade9e010 100644 --- a/neuracore/data_daemon/communications_management/producer/producer_channel.py +++ b/neuracore/data_daemon/communications_management/producer/producer_channel.py @@ -6,7 +6,6 @@ import math import queue import threading -import time import uuid from collections.abc import Iterator, Sequence @@ -20,7 +19,7 @@ ) from neuracore.data_daemon.const import ( DEFAULT_CHUNK_SIZE, - DEFAULT_SHARED_MEMORY_SIZE, + DEFAULT_TRANSPORT_BUFFER_SIZE, DEFAULT_VIDEO_CHUNK_SIZE, DEFAULT_VIDEO_SEND_QUEUE_MAXSIZE, 
DEFAULT_VIDEO_SLOT_SIZE, @@ -30,17 +29,12 @@ CommandType, DataChunkPayload, DataType, - SharedMemoryChunkMetadata, TraceTransportMetadata, + VideoTransportChunkMetadata, ) from ..shared_transport.communications_manager import CommunicationsManager -from ..shared_transport.registry import ( - SharedSlotOpenFailedError, - SharedSlotTimeout, - SharedSlotUnhealthyError, -) -from ..shared_transport.shared_slot_transport import SharedSlotVideoTransport +from ..shared_transport.iox2_video_transport import Iox2VideoTransport from .producer_channel_message_sender import ProducerChannelMessageSender from .producer_heartbeat_service import ProducerHeartbeatService @@ -50,13 +44,13 @@ __all__ = [ "ProducerChannel", - "data_type_uses_shared_slot_transport", + "data_type_uses_video_transport", "producer_transport_args_for_data_type", ] -def data_type_uses_shared_slot_transport(data_type: DataType) -> bool: - """Return True when the data type should use shared-slot transport.""" +def data_type_uses_video_transport(data_type: DataType) -> bool: + """Return True when the data type should use the iceoryx2 video transport.""" return data_type in ( DataType.RGB_IMAGES, DataType.DEPTH_IMAGES, @@ -81,7 +75,7 @@ def producer_transport_args_for_data_type( return ( DEFAULT_CHUNK_SIZE, - DEFAULT_SHARED_MEMORY_SIZE, + DEFAULT_TRANSPORT_BUFFER_SIZE, 512, ) @@ -97,7 +91,7 @@ def __init__( chunk_size: int | None = None, send_queue_maxsize: int | None = None, recording_id: str | None = None, - shared_memory_size: int | None = None, + max_frame_bytes: int | None = None, ) -> None: """Initialize the producer channel.""" if data_type is None: @@ -105,7 +99,7 @@ def __init__( ( default_chunk_size, - default_shared_memory_size, + default_max_frame_bytes, default_send_queue_maxsize, ) = producer_transport_args_for_data_type(data_type) @@ -125,20 +119,19 @@ def __init__( self.recording_id: str | None = recording_id self._heartbeat_interval = 1.0 self._data_type = data_type - 
self._use_shared_slot_transport = data_type_uses_shared_slot_transport( - data_type - ) + self._use_video_transport = data_type_uses_video_transport(data_type) self._sequence_allocator = ChannelSequenceAllocator() - self._shared_slot_transport: SharedSlotVideoTransport | None = ( - SharedSlotVideoTransport( + self._iox2_transport: Iox2VideoTransport | None = ( + Iox2VideoTransport( + channel_id=self.channel_id, sequence_allocator=self._sequence_allocator, - slot_size=int( - default_shared_memory_size - if shared_memory_size is None - else shared_memory_size + max_frame_bytes=int( + default_max_frame_bytes + if max_frame_bytes is None + else max_frame_bytes ), ) - if self._use_shared_slot_transport + if self._use_video_transport else None ) self._message_sender = ProducerChannelMessageSender( @@ -170,23 +163,31 @@ def start_producer_channel(self) -> None: self._heartbeat_service.start() def heartbeat(self) -> None: - """Send a heartbeat message to the daemon.""" - self._send(CommandType.HEARTBEAT, {}) + """Send a heartbeat message to the daemon. + + The heartbeat carries the channel data type so the daemon can create the + matching iceoryx2 subscriber for video channels on first contact. It also + refreshes the publisher connections so a daemon subscriber that just + registered receives buffered history frames even while idle. 
+ """ + if self._iox2_transport is not None: + self._iox2_transport.update_connections() + self._send(CommandType.HEARTBEAT, {"data_type": self._data_type.value}) def set_recording_id(self, recording_id: str | None) -> None: """Set the recording ID for the producer.""" self.recording_id = recording_id def get_last_accepted_sequence_number(self) -> int: - """Return the latest sequence accepted by either sender or shared-slot queue.""" + """Return the latest sequence accepted by the sender or video transport.""" last_enqueued = self.get_last_enqueued_sequence_number() - if self._shared_slot_transport is None: + if self._iox2_transport is None: return last_enqueued return max( last_enqueued, - self._shared_slot_transport.get_last_reserved_sequence_number(), + self._iox2_transport.get_last_reserved_sequence_number(), ) def mark_recording_stop_requested(self) -> int: @@ -204,7 +205,7 @@ def _recording_data_stopped(self) -> bool: def start_recording_session( self, recording_id: str | None = None, - shared_memory_size: int | None = None, + max_frame_bytes: int | None = None, ) -> None: """Start a fresh recording session for this producer channel.""" with self._recording_send_lock: @@ -224,8 +225,21 @@ def start_recording_session( self.start_producer_channel() self.start_new_trace() - if self._use_shared_slot_transport: - self.open_fixed_shared_slots(slot_size=shared_memory_size) + if self._use_video_transport: + self._announce_video_channel() + + def _announce_video_channel(self) -> None: + """Prompt the daemon to register its iceoryx2 subscriber for this channel. + + Sends a heartbeat (which carries the data type) and waits for it to be + flushed so the daemon learns about the channel before video frames flow. + Combined with the iceoryx2 service history, this avoids losing the first + frames of a recording to the subscriber-registration race. 
+ """ + sequence_number = self._send( + CommandType.HEARTBEAT, {"data_type": self._data_type.value} + ) + self.wait_until_sequence_sent(sequence_number) def start_new_trace(self) -> None: """Start a new trace for the given recording.""" @@ -256,18 +270,16 @@ def end_trace(self) -> None: def stop_producer_channel( self, - wait_for_slot_drain: bool = True, + wait_for_transport_drain: bool = True, ) -> None: """Stop the producer channel and release local resources.""" self._stop_heartbeat_service() final_flush_sequence = self.get_last_enqueued_sequence_number() stop_failure: RuntimeError | None = None - sender_failed = False if not self.wait_until_sequence_sent(final_flush_sequence): sender_error = self._get_message_sender_error() if sender_error is not None: - sender_failed = True logger.warning( "Producer channel stopping after sender failure without " "flushing final sequence_number=%s error=%r", @@ -285,36 +297,24 @@ def stop_producer_channel( "before stopping producer channel" ) - try: - if ( - stop_failure is None - and not sender_failed - and self._shared_slot_transport is not None - ): - self._shared_slot_transport.wait_until_drained(timeout_s=30.0) - finally: - self._close_shared_slot_transport() - self._stop_message_sender() - self._comm.cleanup_producer() + # Video frames are published synchronously into the iceoryx2 ring as they + # are produced, so there is no producer-side queue to drain before + # shutdown. The daemon's sequence tracking guarantees all frames up to the + # stop cutoff are spooled before finalization. 
+ self._close_iox2_transport() + self._stop_message_sender() + self._comm.cleanup_producer() if stop_failure is not None: raise stop_failure def _send(self, command: CommandType, payload: dict | None = None) -> int: """Send a message to the daemon.""" - shared_slot_transport = ( - self._shared_slot_transport if self._use_shared_slot_transport else None - ) sequence_number = self._sequence_allocator.reserve() return self._message_sender.send( command, payload, sequence_number=sequence_number, - on_failed_send=( - shared_slot_transport.notify_sender_failure - if shared_slot_transport is not None - else None - ), ) def get_last_sent_sequence_number(self) -> int: @@ -336,33 +336,6 @@ def wait_until_sequence_sent( timeout_s=timeout_s, ) - def open_fixed_shared_slots(self, slot_size: int | None = None) -> None: - """Announce the fixed shared-slot transport for this producer.""" - if not self._use_shared_slot_transport or self._shared_slot_transport is None: - return - if ( - slot_size is not None - and not self._shared_slot_transport.is_announced() - and int(slot_size) != self._shared_slot_transport.slot_size - ): - self._shared_slot_transport.close() - self._shared_slot_transport = SharedSlotVideoTransport( - sequence_allocator=self._sequence_allocator, slot_size=int(slot_size) - ) - if self._shared_slot_transport.is_announced(): - return - payload = self._shared_slot_transport.open_payload() - sequence_number = self._send( - CommandType.OPEN_FIXED_SHARED_SLOTS, - { - "open_fixed_shared_slots": payload.model_dump(exclude_none=True), - }, - ) - if not self.wait_until_sequence_sent(sequence_number): - raise RuntimeError( - "Failed to send OPEN_FIXED_SHARED_SLOTS before video transport use" - ) - def _send_socket_data_chunk(self, payload: DataChunkPayload) -> None: """Send one DATA_CHUNK payload directly over the producer socket.""" with self._recording_send_lock: @@ -454,75 +427,32 @@ def _iter_chunk_views( if chunk_parts: yield chunk_parts[0] if len(chunk_parts) == 1 
else b"".join(chunk_parts) - def _ping_daemon_for_shared_slot_recovery( - self, - timeout_s: float = 2.0, - ) -> bool: - """Return True when a heartbeat can be sent during slot recovery.""" - started_at = time.monotonic() - try: - sequence_number = self._send(CommandType.HEARTBEAT, {}) - alive = self.wait_until_sequence_sent( - sequence_number, - timeout_s=timeout_s, - ) - except Exception: - elapsed_s = time.monotonic() - started_at - logger.warning( - "Shared-slot recovery daemon ping failed elapsed=%.3fs", - elapsed_s, - exc_info=True, - ) - return False - - elapsed_s = time.monotonic() - started_at - logger.info( - "Shared-slot recovery daemon ping alive=%s elapsed=%.3fs", - alive, - elapsed_s, - ) - return alive - - def _reset_shared_slot_transport_for_recovery(self) -> None: - """Replace the shared-slot transport after a recoverable slot failure.""" - old_transport = self._shared_slot_transport - slot_size = old_transport.slot_size if old_transport is not None else None - self._close_shared_slot_transport() - self._shared_slot_transport = SharedSlotVideoTransport( - sequence_allocator=self._sequence_allocator, - slot_size=slot_size or DEFAULT_VIDEO_SLOT_SIZE, - allocate_timeout_s=3.0, - ) - self.open_fixed_shared_slots(slot_size=slot_size) - - def _stop_shared_slot_logging_after_failure(self) -> None: - """Stop accepting more recording data after unrecoverable slot failure.""" + def _stop_video_logging_after_failure(self) -> None: + """Stop accepting more recording data after an unrecoverable video error.""" with self._recording_send_lock: if self._stop_cutoff_sequence_number is None: self._stop_cutoff_sequence_number = ( self.get_last_accepted_sequence_number() ) - self._close_shared_slot_transport() + self._close_iox2_transport() - def _send_data_parts_shared_slots( + def _send_data_parts_iox2( self, normalised_parts: Sequence[memoryview], total_chunks: int, trace_metadata: TraceTransportMetadata, ) -> None: - """Send one logical payload over the 
shared-slot transport.""" - self.open_fixed_shared_slots() - shared_slot_transport = self._shared_slot_transport - if shared_slot_transport is None: - raise SharedSlotUnhealthyError("Shared-slot transport is not available") + """Publish one logical payload over the iceoryx2 video transport.""" + transport = self._iox2_transport + if transport is None: + raise RuntimeError("iceoryx2 video transport is not available") - produced_chunks = 0 trace_id = self.trace_id if trace_id is None: - raise RuntimeError("Trace ID required for shared-slot transport") + raise RuntimeError("Trace ID required for video transport") for idx, chunk in enumerate(self._iter_chunk_views(normalised_parts)): - metadata = SharedMemoryChunkMetadata( + metadata = VideoTransportChunkMetadata( trace_id=trace_id, chunk_index=idx, total_chunks=total_chunks, @@ -533,24 +463,21 @@ def _send_data_parts_shared_slots( if self._recording_data_stopped(): return - sequence_number = shared_slot_transport.enqueue_packet( - producer_id=self.channel_id, - sender=self._message_sender, + sequence_number = transport.send_frame( metadata=metadata, chunk=chunk, stop_cutoff_sequence_number=self._stop_cutoff_sequence_number, ) if sequence_number is None: + # send_frame returns None either because the frame was rejected + # by the stop cutoff (transport still healthy) or because the + # publisher errored (transport unhealthy). 
+ if not transport.is_healthy(): + self._stop_video_logging_after_failure() + raise RuntimeError("iceoryx2 video transport became unhealthy") return - produced_chunks += 1 - - if produced_chunks != total_chunks: - raise RuntimeError( - "Chunk count mismatch while serializing payload for transport" - ) - def send_data_parts( self, parts: Sequence[BytePart], @@ -598,7 +525,7 @@ def send_data_parts( robot_instance=robot_instance, ) - if not self._use_shared_slot_transport: + if not self._use_video_transport: if not normalised_parts: return payload_bytes = ( @@ -625,89 +552,48 @@ def send_data_parts( self._send_socket_data_chunk(payload) return - initial_exc: Exception | None = None - for attempt in range(2): - try: - self._send_data_parts_shared_slots( - normalised_parts, - total_chunks, - trace_metadata, - ) - return - except (SharedSlotTimeout, SharedSlotUnhealthyError) as exc: - if attempt > 0 or isinstance(exc, SharedSlotOpenFailedError): - self._stop_shared_slot_logging_after_failure() - raise RuntimeError( - "Shared-slot transport remained unhealthy after recovery" - ) from (initial_exc or exc) - - initial_exc = exc - if not self._ping_daemon_for_shared_slot_recovery(timeout_s=2.0): - self._stop_shared_slot_logging_after_failure() - raise RuntimeError( - "Shared-slot transport failed and daemon did not respond " - "to recovery ping" - ) from exc - - logger.warning( - "Shared-slot transport unhealthy; resetting transport and " - "retrying payload once. 
Initial error: %s", - exc, - ) - self._reset_shared_slot_transport_for_recovery() + self._send_data_parts_iox2( + normalised_parts, + total_chunks, + trace_metadata, + ) def initialize_new_producer_channel( self, - shared_memory_size: int | None = None, + max_frame_bytes: int | None = None, ) -> None: """Initialize a new producer channel for recording.""" - self.start_recording_session(shared_memory_size=shared_memory_size) + self.start_recording_session(max_frame_bytes=max_frame_bytes) def cleanup_producer_channel( self, stop_cutoff_sequence_number: int, - wait_for_slot_drain: bool = True, + wait_for_transport_drain: bool = True, ) -> None: - """Finish one trace after queued recording data up to stop cutoff is sent. + """Finish one trace after recording data up to the stop cutoff is sent. - Shared-slot transports must always drain before a recording session is - reset. The wait flag controls higher-level/backend waiting, not whether - daemon-owned shm slots may be abandoned with credits still in flight. + Video frames are published synchronously into the iceoryx2 ring as they + are produced, so there is nothing to drain here. The TRACE_END sent below + carries the channel's last sequence number, and the daemon defers + finalization until every frame up to the stop cutoff has been spooled. 
""" if stop_cutoff_sequence_number < 0: raise ValueError("stop_cutoff_sequence_number must be non-negative") - if self._shared_slot_transport is not None: - payload_cutoff_sequence_number = ( - self._shared_slot_transport.get_last_payload_sequence_number() - ) - - if payload_cutoff_sequence_number > 0: - self._shared_slot_transport.wait_until_payload_handed_off( - timeout_s=30.0, - max_sequence_number=payload_cutoff_sequence_number, - ) - - if not self.wait_until_sequence_sent(stop_cutoff_sequence_number): - raise RuntimeError( - "Failed to send queued recording data up to stop cutoff before cleanup" - ) - - if self._shared_slot_transport is not None: - payload_cutoff_sequence_number = ( - self._shared_slot_transport.get_last_payload_sequence_number() - ) - - if payload_cutoff_sequence_number > 0: - self._shared_slot_transport.wait_until_drained( - timeout_s=30.0, - max_sequence_number=payload_cutoff_sequence_number, - ) + # The stop cutoff spans the channel sequence space, so for video channels it + # is typically an iceoryx2 frame sequence that never travels over ZMQ — the + # ZMQ sender would never report it as "sent". Video frames are published + # synchronously into the iceoryx2 ring as they are produced, so only the + # ZMQ control/data messages need flushing here. Wait on the sender's own + # last-enqueued sequence instead of the global cutoff. 
+ flush_sequence_number = self.get_last_enqueued_sequence_number() + if not self.wait_until_sequence_sent(flush_sequence_number): + raise RuntimeError("Failed to send queued recording data before cleanup") self.end_trace() - if self._shared_slot_transport is not None: - self._shared_slot_transport.finish_recording_session() + if self._iox2_transport is not None: + self._iox2_transport.finish_recording_session() def _stop_heartbeat_service(self) -> None: self._heartbeat_service.stop(join_timeout_s=1.0) @@ -715,10 +601,10 @@ def _stop_heartbeat_service(self) -> None: def _stop_message_sender(self) -> None: self._message_sender.close(join_timeout_s=2.0) - def _close_shared_slot_transport(self) -> None: - if self._shared_slot_transport is not None: - self._shared_slot_transport.close() - self._shared_slot_transport = None + def _close_iox2_transport(self) -> None: + if self._iox2_transport is not None: + self._iox2_transport.close() + self._iox2_transport = None def _get_message_sender_error(self) -> Exception | None: sender = getattr(self, "_message_sender", None) diff --git a/neuracore/data_daemon/communications_management/producer/producer_channel_message_sender.py b/neuracore/data_daemon/communications_management/producer/producer_channel_message_sender.py index a37f72d21..ac6c32677 100644 --- a/neuracore/data_daemon/communications_management/producer/producer_channel_message_sender.py +++ b/neuracore/data_daemon/communications_management/producer/producer_channel_message_sender.py @@ -108,9 +108,9 @@ def enqueue_envelope( ) -> None: """Enqueue a prebuilt envelope for ordered socket delivery. - Prebuilt envelopes are used by shared-slot transport. They must still - update sender sequence progress so stop cutoffs and flush waits observe - them correctly. + Prebuilt envelopes may carry pre-reserved sequence numbers. They must + still update sender sequence progress so stop cutoffs and flush waits + observe them correctly. 
""" with self._enqueue_lock: if envelope.sequence_number is None: diff --git a/neuracore/data_daemon/communications_management/shared_transport/__init__.py b/neuracore/data_daemon/communications_management/shared_transport/__init__.py index 0faca7c95..58c0a3683 100644 --- a/neuracore/data_daemon/communications_management/shared_transport/__init__.py +++ b/neuracore/data_daemon/communications_management/shared_transport/__init__.py @@ -1,22 +1,9 @@ -"""Internal shared-slot transport runtime components.""" +"""Zero-copy video transport runtime components (iceoryx2-backed).""" from __future__ import annotations from .communications_manager import CommunicationsManager -from .registry import SharedSlotRegistry, SharedSlotTimeout __all__ = [ - "SharedSlotRegistry", - "SharedSlotTimeout", "CommunicationsManager", - "SharedSlotVideoTransport", ] - - -def __getattr__(name: str) -> object: - """Lazily expose heavier transport types to avoid package import cycles.""" - if name == "SharedSlotVideoTransport": - from .shared_slot_transport import SharedSlotVideoTransport - - return SharedSlotVideoTransport - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/neuracore/data_daemon/communications_management/shared_transport/framing.py b/neuracore/data_daemon/communications_management/shared_transport/framing.py new file mode 100644 index 000000000..b1a40b6aa --- /dev/null +++ b/neuracore/data_daemon/communications_management/shared_transport/framing.py @@ -0,0 +1,85 @@ +"""Binary framing for self-describing video transport packets. + +One packet is ``magic (4B) + struct header (8B) + JSON metadata + raw chunk``. +This framing is shared by the producer (which writes packets into iceoryx2 +slots) and the daemon (which parses packets copied out of those slots). It is +deliberately transport-agnostic so the same bytes could travel over any +zero-copy channel. 
+""" + +from __future__ import annotations + +import json +import struct +from collections.abc import Mapping + +from neuracore.data_daemon.const import ( + VIDEO_TRANSPORT_PACKET_HEADER_FORMAT, + VIDEO_TRANSPORT_PACKET_HEADER_SIZE, + VIDEO_TRANSPORT_PACKET_MAGIC, +) + + +class PacketTooLarge(ValueError): + """Raised when an encoded frame cannot fit in a single transport slot.""" + + +def build_video_transport_packet( + metadata: Mapping[str, object], + chunk: bytes | bytearray | memoryview, +) -> bytes: + """Build one self-describing transport packet.""" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + payload = bytes(chunk) + return ( + struct.pack( + VIDEO_TRANSPORT_PACKET_HEADER_FORMAT, + VIDEO_TRANSPORT_PACKET_MAGIC, + len(metadata_bytes), + len(payload), + ) + + metadata_bytes + + payload + ) + + +def build_video_transport_packet_metadata( + metadata: Mapping[str, object], + chunk: bytes | bytearray | memoryview, +) -> tuple[bytes, int]: + """Return serialized metadata plus total packet length without copying chunk.""" + metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") + chunk_len = len(chunk) + packet_length = VIDEO_TRANSPORT_PACKET_HEADER_SIZE + len(metadata_bytes) + chunk_len + return metadata_bytes, packet_length + + +def parse_video_transport_packet(packet: bytes) -> tuple[dict[str, object], bytes]: + """Parse one self-describing packet, returning its metadata and chunk.""" + metadata, chunk_start, chunk_end = parse_video_transport_packet_view( + memoryview(packet) + ) + return metadata, packet[chunk_start:chunk_end] + + +def parse_video_transport_packet_view( + packet: memoryview, +) -> tuple[dict[str, object], int, int]: + """Parse one packet view without copying the payload chunk.""" + if len(packet) < VIDEO_TRANSPORT_PACKET_HEADER_SIZE: + raise ValueError("Transport packet shorter than record header") + magic, metadata_len, chunk_len = struct.unpack( + VIDEO_TRANSPORT_PACKET_HEADER_FORMAT, + 
packet[:VIDEO_TRANSPORT_PACKET_HEADER_SIZE], + ) + if magic != VIDEO_TRANSPORT_PACKET_MAGIC: + raise ValueError("Transport packet missing video transport magic") + expected = VIDEO_TRANSPORT_PACKET_HEADER_SIZE + metadata_len + chunk_len + if len(packet) < expected: + raise ValueError("Transport packet shorter than declared lengths") + if len(packet) > expected: + raise ValueError("Transport packet contains trailing bytes") + metadata_start = VIDEO_TRANSPORT_PACKET_HEADER_SIZE + chunk_start = metadata_start + metadata_len + metadata = json.loads(packet[metadata_start:chunk_start].tobytes().decode("utf-8")) + return metadata, chunk_start, expected diff --git a/neuracore/data_daemon/communications_management/shared_transport/iox2_daemon_drain.py b/neuracore/data_daemon/communications_management/shared_transport/iox2_daemon_drain.py new file mode 100644 index 000000000..fbe68964a --- /dev/null +++ b/neuracore/data_daemon/communications_management/shared_transport/iox2_daemon_drain.py @@ -0,0 +1,289 @@ +"""Daemon-side iceoryx2 subscriber pool that drains all video channels. + +Replaces the daemon half of the old fixed-slot video transport (allocation, +descriptor handling, and credit returns). One subscriber is created +per video channel on first contact; :meth:`Iox2DaemonDrain.drain_all` copies +each received frame's payload out of the zero-copy ring, releases the sample +back to the ring buffer, and then hands the decoded +``(channel_id, sequence_id, metadata, chunk)`` to a callback. + +Frames dropped under overload (DiscardData) leave gaps in the per-channel +``frame_index`` sequence; those gaps are counted and logged so silent loss is +observable.
+""" + +from __future__ import annotations + +import ctypes +import logging +import threading +from collections.abc import Callable +from typing import Protocol, cast + +import iceoryx2 +from iceoryx2 import Slice + +from neuracore.data_daemon.communications_management.shared_transport.framing import ( + parse_video_transport_packet, +) +from neuracore.data_daemon.communications_management.shared_transport.iox2_video_transport import ( # noqa: E501 + FRAME_INDEX_KEY, + FRAME_META_KEY, + FRAME_SEQUENCE_KEY, +) +from neuracore.data_daemon.const import ( + IOX2_HISTORY_SIZE, + IOX2_SERVICE_PREFIX, + IOX2_SUBSCRIBER_BUFFER_SIZE, +) + +logger = logging.getLogger(__name__) + +# Callback signature: (channel_id, sequence_id, metadata_dict, chunk_bytes). +FrameCallback = Callable[[str, int, dict[str, object], bytes], None] + +# Log a per-channel drop summary at most this often (number of drained frames). +_DROP_LOG_INTERVAL = 64 + + +class _SamplePayload(Protocol): + number_of_elements: int + data_ptr: int + + +class _ReceivedSample(Protocol): + def payload(self) -> _SamplePayload: ... + + def delete(self) -> None: ... + + +class _Subscriber(Protocol): + def receive(self) -> _ReceivedSample | None: ... + + def delete(self) -> None: ... 
+
+
+class _ChannelSubscriber:
+    """One channel's subscriber plus its drop-detection bookkeeping."""
+
+    def __init__(self, subscriber: _Subscriber) -> None:
+        """Wrap ``subscriber`` with zeroed drop counters."""
+        self.subscriber = subscriber
+        # Highest frame index seen so far; ``None`` until the first frame.
+        self.last_frame_index: int | None = None
+        self.dropped_frames = 0
+        self._frames_since_log = 0
+
+    def note_frame_index(self, frame_index: int) -> int:
+        """Update drop tracking for one received frame; return newly dropped count.
+
+        A gap is counted only when ``frame_index`` jumps past the next expected
+        index; repeated or older indices leave the high-water mark untouched.
+        """
+        dropped = 0
+        if (
+            self.last_frame_index is not None
+            and frame_index > self.last_frame_index + 1
+        ):
+            dropped = frame_index - self.last_frame_index - 1
+            self.dropped_frames += dropped
+        if self.last_frame_index is None or frame_index > self.last_frame_index:
+            self.last_frame_index = frame_index
+        self._frames_since_log += 1
+        return dropped
+
+    def should_log_drops(self) -> bool:
+        """Return True roughly once per drop-log interval to bound log spam."""
+        if self._frames_since_log >= _DROP_LOG_INTERVAL:
+            self._frames_since_log = 0
+            return True
+        return False
+
+
+class Iox2DaemonDrain:
+    """Own all daemon-side iceoryx2 subscribers for video channels."""
+
+    def __init__(self) -> None:
+        """Create the daemon iceoryx2 node and empty subscriber pool."""
+        self._lock = threading.Lock()
+        self._node = iceoryx2.NodeBuilder.new().create(iceoryx2.ServiceType.Ipc)
+        self._subscribers: dict[str, _ChannelSubscriber] = {}
+        # Reclaim resources left behind by producer/daemon processes that exited
+        # uncleanly in a previous run (best effort).
+        try:
+            self._node.try_cleanup_dead_nodes(
+                iceoryx2.ServiceType.Ipc, self._node.config
+            )
+        except Exception:
+            logger.debug("Iox2DaemonDrain: dead-node cleanup skipped", exc_info=True)
+
+    def register_channel(self, channel_id: str) -> None:
+        """Create a subscriber for ``channel_id`` if one does not already exist.
+
+        Failures while opening the service or creating the subscriber are
+        logged and swallowed; the channel simply stays unregistered.
+        """
+        with self._lock:
+            if channel_id in self._subscribers:
+                return
+
+        # The service and subscriber are built without holding the lock; the
+        # insert below re-checks for a registration that raced with this one.
+        service_name = f"{IOX2_SERVICE_PREFIX}{channel_id}"
+        try:
+            service = (
+                self._node.service_builder(iceoryx2.ServiceName.new(service_name))
+                .publish_subscribe(Slice[ctypes.c_uint8])
+                .subscriber_max_buffer_size(IOX2_SUBSCRIBER_BUFFER_SIZE)
+                .enable_safe_overflow(True)
+                .history_size(IOX2_HISTORY_SIZE)
+                .open_or_create()
+            )
+            subscriber = (
+                service.subscriber_builder()
+                .buffer_size(IOX2_SUBSCRIBER_BUFFER_SIZE)
+                .create()
+            )
+            typed_subscriber = cast(_Subscriber, subscriber)
+        except Exception:
+            logger.exception(
+                "Iox2DaemonDrain: failed to register channel channel_id=%s",
+                channel_id,
+            )
+            return
+
+        with self._lock:
+            # Re-check under the lock in case of a concurrent registration.
+            if channel_id in self._subscribers:
+                _delete_subscriber(typed_subscriber)
+                return
+            self._subscribers[channel_id] = _ChannelSubscriber(typed_subscriber)
+        logger.info("Iox2DaemonDrain: registered subscriber service=%s", service_name)
+
+    def is_registered(self, channel_id: str) -> bool:
+        """Return whether a subscriber exists for the channel."""
+        with self._lock:
+            return channel_id in self._subscribers
+
+    def drain_all(self, on_frame: FrameCallback) -> int:
+        """Poll every subscriber and invoke ``on_frame`` for each received sample.
+
+        Each sample's payload is copied out and the sample released immediately
+        (returning the slot to the ring buffer). ``on_frame`` runs synchronously
+        on the caller's thread; it should stay fast and offload heavy work.
+        Returns the total number of frames drained.
+        """
+        # Snapshot under the lock, then drain without it so registration and
+        # unregistration are not blocked while frames are being processed.
+        with self._lock:
+            items = list(self._subscribers.items())
+
+        drained = 0
+        for channel_id, channel_sub in items:
+            drained += self._drain_channel(channel_id, channel_sub, on_frame)
+        return drained
+
+    def _drain_channel(
+        self,
+        channel_id: str,
+        channel_sub: _ChannelSubscriber,
+        on_frame: FrameCallback,
+    ) -> int:
+        """Receive every pending sample for one channel and dispatch each frame.
+
+        A receive error ends this channel's pass; a payload-copy or dispatch
+        error is logged and costs only the affected sample. Returns the number
+        of frames dispatched successfully.
+        """
+        drained = 0
+        subscriber = channel_sub.subscriber
+        while True:
+            try:
+                sample = subscriber.receive()
+            except Exception:
+                logger.exception(
+                    "Iox2DaemonDrain: receive error channel_id=%s", channel_id
+                )
+                break
+            if sample is None:
+                break
+            try:
+                payload = sample.payload()
+                # Copy the bytes out before the sample is released below.
+                raw = bytes(
+                    (ctypes.c_uint8 * payload.number_of_elements).from_address(
+                        payload.data_ptr
+                    )
+                )
+            except Exception:
+                # A sample that cannot be copied is dropped on its own; letting
+                # the error escape would abort draining of every other channel.
+                logger.exception(
+                    "Iox2DaemonDrain: failed to copy payload channel_id=%s",
+                    channel_id,
+                )
+                continue
+            finally:
+                sample.delete()
+
+            try:
+                self._dispatch_frame(channel_id, channel_sub, raw, on_frame)
+                drained += 1
+            except Exception:
+                logger.exception(
+                    "Iox2DaemonDrain: failed to process frame channel_id=%s",
+                    channel_id,
+                )
+        return drained
+
+    def _dispatch_frame(
+        self,
+        channel_id: str,
+        channel_sub: _ChannelSubscriber,
+        raw: bytes,
+        on_frame: FrameCallback,
+    ) -> None:
+        """Decode one raw packet, update drop tracking, and invoke ``on_frame``.
+
+        Raises ``KeyError``/``TypeError`` for a malformed envelope, plus any
+        error from the packet parser or from ``on_frame`` itself.
+        """
+        envelope, chunk = parse_video_transport_packet(raw)
+        sequence_id = _parse_int_field(envelope, FRAME_SEQUENCE_KEY)
+        frame_index = _parse_int_field(envelope, FRAME_INDEX_KEY)
+        metadata = _parse_metadata_field(envelope, FRAME_META_KEY)
+
+        dropped = channel_sub.note_frame_index(frame_index)
+        if dropped and channel_sub.should_log_drops():
+            logger.warning(
+                "Iox2DaemonDrain: dropped video frames channel_id=%s "
+                "recent=%d total=%d (daemon overload / DiscardData)",
+                channel_id,
+                dropped,
+                channel_sub.dropped_frames,
+            )
+        on_frame(channel_id, sequence_id, metadata, chunk)
+
+    def dropped_frame_count(self, channel_id: str) -> int:
+        """Return the total frames dropped for one channel (for observability)."""
+        with self._lock:
+            channel_sub = self._subscribers.get(channel_id)
+            return channel_sub.dropped_frames if channel_sub is not None else 0
+
+    def unregister_channel(self, channel_id: str) -> None:
+        """Remove and close the subscriber for one channel."""
+        with self._lock:
+            channel_sub = self._subscribers.pop(channel_id, None)
+        if channel_sub is None:
+            return
+        if channel_sub.dropped_frames:
+            logger.warning(
+                "Iox2DaemonDrain: channel_id=%s dropped %d video frames total",
+                channel_id,
+                channel_sub.dropped_frames,
+            )
+        _delete_subscriber(channel_sub.subscriber)
+        logger.info("Iox2DaemonDrain: unregistered channel channel_id=%s", channel_id)
+
+    def close(self) -> None:
+        """Close all subscribers and the node."""
+        with self._lock:
+            channel_subs = list(self._subscribers.values())
+            self._subscribers.clear()
+        for channel_sub in channel_subs:
+            _delete_subscriber(channel_sub.subscriber)
+        try:
+            del self._node
+        except Exception:
+            logger.warning("Iox2DaemonDrain: error closing node", exc_info=True)
+        logger.info("Iox2DaemonDrain closed")
+
+
+def _parse_int_field(envelope: dict[str, object], key: str) -> int:
+    """Extract one integer field from a decoded frame envelope.
+
+    Raises ``KeyError`` if ``key`` is absent and ``TypeError`` if the value is
+    not an ``int``.
+    """
+    value = envelope[key]
+    if not isinstance(value, int):
+        raise TypeError(f"Frame envelope field {key!r} must be an int")
+    return value
+
+
+def _parse_metadata_field(envelope: dict[str, object], key: str) -> dict[str, object]:
+    """Extract the per-chunk metadata mapping from a decoded frame envelope.
+
+    Raises ``KeyError`` if ``key`` is absent and ``TypeError`` if the value is
+    not a ``dict``.
+    """
+    value = envelope[key]
+    if not isinstance(value, dict):
+        raise TypeError(f"Frame envelope field {key!r} must be a dict")
+    return cast(dict[str, object], value)
+
+
+def _delete_subscriber(subscriber: _Subscriber) -> None:
+    """Best-effort release of one iceoryx2 subscriber."""
+    try:
+        subscriber.delete()
+    except Exception:
+        logger.warning("Iox2DaemonDrain: error closing subscriber", exc_info=True)
diff --git a/neuracore/data_daemon/communications_management/shared_transport/iox2_video_transport.py b/neuracore/data_daemon/communications_management/shared_transport/iox2_video_transport.py
new file mode 100644
index 
000000000..9b051f22a
--- /dev/null
+++ b/neuracore/data_daemon/communications_management/shared_transport/iox2_video_transport.py
@@ -0,0 +1,243 @@
+"""iceoryx2-backed video frame transport for one producer channel.
+
+Replaces the hand-rolled fixed-slot video transport. The producer owns one
+iceoryx2 publisher per channel and publishes self-describing frame packets into
+a lock-free, zero-copy ring buffer. Loan -> write -> send all happen on the
+caller's thread; there is no background worker, queue, credit channel, or
+watchdog. Backpressure is handled by iceoryx2: under daemon overload the oldest
+buffered frames are overwritten (DiscardData semantics).
+
+The channel ``sequence_id`` (reserved from the shared
+:class:`ChannelSequenceAllocator`) travels inside each frame so the daemon can
+preserve the same end-of-recording ordering and drop semantics as the ZMQ
+control path. A separate per-publisher ``frame_index`` lets the daemon detect
+and count frames dropped under overload.
+"""
+
+from __future__ import annotations
+
+import ctypes
+import logging
+import threading
+from collections.abc import Mapping
+
+import iceoryx2
+from iceoryx2 import Slice
+
+from neuracore.data_daemon.communications_management.sequence_allocator import (
+    ChannelSequenceAllocator,
+)
+from neuracore.data_daemon.communications_management.shared_transport.framing import (
+    PacketTooLarge,
+    build_video_transport_packet,
+)
+from neuracore.data_daemon.const import (
+    IOX2_HISTORY_SIZE,
+    IOX2_MAX_FRAME_BYTES,
+    IOX2_SERVICE_PREFIX,
+    IOX2_SUBSCRIBER_BUFFER_SIZE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Metadata-envelope keys carried alongside the per-chunk metadata so the daemon
+# can recover lifecycle ordering (``seq``) and detect dropped frames (``idx``).
+FRAME_SEQUENCE_KEY = "seq"
+FRAME_INDEX_KEY = "idx"
+FRAME_META_KEY = "meta"
+
+
+def build_frame_envelope(
+    sequence_id: int,
+    frame_index: int,
+    metadata: Mapping[str, object],
+) -> dict[str, object]:
+    """Wrap per-chunk metadata with the lifecycle sequence and frame index.
+
+    The metadata is copied into a plain ``dict`` so the envelope matches its
+    declared type and does not alias the caller's mapping.
+    """
+    return {
+        FRAME_SEQUENCE_KEY: sequence_id,
+        FRAME_INDEX_KEY: frame_index,
+        FRAME_META_KEY: dict(metadata),
+    }
+
+
+class Iox2VideoTransport:
+    """Producer-side iceoryx2 publisher for one video channel."""
+
+    def __init__(
+        self,
+        channel_id: str,
+        sequence_allocator: ChannelSequenceAllocator | None = None,
+        max_frame_bytes: int = IOX2_MAX_FRAME_BYTES,
+    ) -> None:
+        """Create the iceoryx2 node, service, and publisher for one channel."""
+        self._channel_id = channel_id
+        self._service_name = f"{IOX2_SERVICE_PREFIX}{channel_id}"
+        self._max_frame_bytes = int(max_frame_bytes)
+        self._sequence_allocator = sequence_allocator or ChannelSequenceAllocator()
+        self._state_lock = threading.Lock()
+        self._healthy = True
+        self._error: Exception | None = None
+        self._frame_index = 0
+        self._last_payload_sequence_number = 0
+
+        # open_or_create lets the producer start before or after the daemon
+        # without a coordination handshake.
+        self._node = iceoryx2.NodeBuilder.new().create(iceoryx2.ServiceType.Ipc)
+        service = (
+            self._node.service_builder(iceoryx2.ServiceName.new(self._service_name))
+            .publish_subscribe(Slice[ctypes.c_uint8])
+            .subscriber_max_buffer_size(IOX2_SUBSCRIBER_BUFFER_SIZE)
+            .enable_safe_overflow(True)
+            .history_size(IOX2_HISTORY_SIZE)
+            .open_or_create()
+        )
+        self._publisher = (
+            service.publisher_builder()
+            .initial_max_slice_len(self._max_frame_bytes)
+            .max_loaned_samples(1)
+            .create()
+        )
+        logger.info("Iox2VideoTransport created service=%s", self._service_name)
+
+    @property
+    def service_name(self) -> str:
+        """Return the iceoryx2 service name for this channel."""
+        return self._service_name
+
+    def send_frame(
+        self,
+        metadata: Mapping[str, object],
+        chunk: bytes | bytearray | memoryview,
+        stop_cutoff_sequence_number: int | None = None,
+    ) -> int | None:
+        """Encode and publish one frame on the caller's thread.
+
+        Reserves a channel sequence number, builds the self-describing packet
+        (magic + header + JSON metadata envelope + raw chunk), loans an
+        iceoryx2 slot, writes into it with a single memmove, and sends.
+
+        Returns the reserved sequence number, or ``None`` if the frame is
+        rejected by the stop cutoff or the transport is already unhealthy.
+        Raises :class:`PacketTooLarge` if the encoded frame exceeds the
+        configured slot size. Returns ``None`` and marks the transport
+        unhealthy if the iceoryx2 send fails.
+
+        A frame that fails to encode (including :class:`PacketTooLarge`) hands
+        its frame index back, so the daemon does not see an index gap for a
+        frame that was never published.
+        """
+        sequence_number = self._sequence_allocator.reserve()
+        if (
+            stop_cutoff_sequence_number is not None
+            and sequence_number > stop_cutoff_sequence_number
+        ):
+            return None
+
+        if not self._healthy:
+            return None
+
+        with self._state_lock:
+            frame_index = self._frame_index
+            self._frame_index += 1
+
+        try:
+            envelope = build_frame_envelope(sequence_number, frame_index, metadata)
+            packet = build_video_transport_packet(envelope, chunk)
+            packet_length = len(packet)
+            if packet_length > self._max_frame_bytes:
+                raise PacketTooLarge(
+                    f"Frame {packet_length} bytes exceeds max {self._max_frame_bytes}"
+                )
+        except Exception:
+            # The frame was never published: hand its index back so the daemon
+            # does not count the gap as an overload drop.
+            self._release_frame_index(frame_index)
+            raise
+
+        try:
+            sample = self._publisher.loan_slice_uninit(packet_length)
+            ctypes.memmove(sample.payload().data_ptr, packet, packet_length)
+            deliveries = sample.assume_init().send()
+            if deliveries == 0:
+                logger.debug(
+                    "Iox2VideoTransport: no daemon subscriber yet service=%s",
+                    self._service_name,
+                )
+        except Exception as exc:
+            logger.exception(
+                "Iox2VideoTransport send failed service=%s", self._service_name
+            )
+            with self._state_lock:
+                self._healthy = False
+                self._error = exc
+            return None
+
+        with self._state_lock:
+            self._last_payload_sequence_number = max(
+                self._last_payload_sequence_number, sequence_number
+            )
+        return sequence_number
+
+    def _release_frame_index(self, frame_index: int) -> None:
+        """Give back ``frame_index`` if no later frame has claimed an index."""
+        with self._state_lock:
+            if self._frame_index == frame_index + 1:
+                self._frame_index = frame_index
+
+    def update_connections(self) -> None:
+        """Refresh publisher connections so late subscribers receive history.
+
+        Called from the producer heartbeat so a daemon subscriber that registers
+        after the producer has gone idle still receives buffered history frames.
+        """
+        if not self._healthy:
+            return
+        try:
+            self._publisher.update_connections()
+        except Exception:
+            logger.debug(
+                "Iox2VideoTransport: update_connections failed service=%s",
+                self._service_name,
+                exc_info=True,
+            )
+
+    def is_healthy(self) -> bool:
+        """Return True while the transport can accept new video writes."""
+        return self._healthy
+
+    def get_last_reserved_sequence_number(self) -> int:
+        """Return the most recently reserved channel sequence number."""
+        return self._sequence_allocator.get_last_reserved_sequence_number()
+
+    def get_last_payload_sequence_number(self) -> int:
+        """Return the latest sequence number reserved for a published frame."""
+        with self._state_lock:
+            return self._last_payload_sequence_number
+
+    def finish_recording_session(self) -> None:
+        """Reset per-recording local state. The service/publisher persist."""
+        with self._state_lock:
+            self._last_payload_sequence_number = 0
+
+    def close(self) -> None:
+        """Drop the publisher and node.
+
+        The transport is marked unhealthy first so later ``send_frame`` and
+        ``update_connections`` calls return early instead of touching the
+        deleted publisher.
+        """
+        with self._state_lock:
+            self._healthy = False
+        try:
+            self._publisher.delete()
+        except Exception:
+            logger.warning("Iox2VideoTransport: error closing publisher", exc_info=True)
+        try:
+            del self._node
+        except Exception:
+            logger.warning("Iox2VideoTransport: error closing node", exc_info=True)
+        logger.info("Iox2VideoTransport closed service=%s", self._service_name)
diff --git a/neuracore/data_daemon/communications_management/shared_transport/models.py b/neuracore/data_daemon/communications_management/shared_transport/models.py
deleted file mode 100644
index c90078109..000000000
--- a/neuracore/data_daemon/communications_management/shared_transport/models.py
+++ /dev/null
@@ -1,165 +0,0 @@
-"""Shared transport data structures used by producer and daemon components."""
-
-from __future__ import annotations
-
-import threading
-from collections import deque
-from collections.abc import Callable
-from dataclasses import dataclass, field
-from enum import Enum
-from multiprocessing.shared_memory import SharedMemory
-from pathlib import 
Path -from typing import TYPE_CHECKING - -import zmq - -from neuracore.data_daemon.models import ( - SharedMemoryChunkMetadata, - SharedSlotDescriptor, - TraceTransportMetadata, -) - -from ..consumer.bridge_chunk_spool import ChunkSpoolRef - -if TYPE_CHECKING: - from ..producer.producer_channel_message_sender import ProducerChannelMessageSender - - -class SharedSlotUnhealthyReason(str, Enum): - """Enumeration of reasons a shared-slot transport can become unhealthy.""" - - OPEN_TIMEOUT = "open_timeout" - OPEN_FAILED = "open_failed" - ATTACH_FAILED = "attach_failed" - SENDER_FAILURE = "sender_failure" - CREDIT_STALL = "credit_stall" - - -@dataclass(frozen=True) -class SharedSlotReservation: - """Shared-memory reservation details returned by the budget manager.""" - - slot_count: int - allocated_bytes: int - - -@dataclass -class QueuedSharedSlotPacket: - """Transport packet queued for background worker.""" - - producer_id: str - sender: ProducerChannelMessageSender - metadata_bytes: bytes - chunk: bytes - packet_length: int - sequence_number: int - - -@dataclass(frozen=True) -class SharedSlotTransportResult: - """Daemon-side result of reading one shared-slot descriptor.""" - - descriptor: SharedSlotDescriptor - chunk_metadata: SharedMemoryChunkMetadata - chunk_spool_ref: ChunkSpoolRef - trace_id: str - trace_metadata: TraceTransportMetadata | None - - -@dataclass(frozen=True) -class SharedSlotRegistryConfig: - """Configuration values for one producer shared-slot registry.""" - - slot_size: int - slot_count: int - ack_timeout_s: float - allocate_timeout_s: float - - -@dataclass(frozen=True) -class InFlightSlot: - """Metadata for one shared-memory slot awaiting credit return.""" - - shm_name: str - slot_id: int - sequence_id: int - reserved_at: float - socket_sent_at: float | None = None - - -@dataclass -class SharedSlotRegistryState: - """Mutable producer-side shared-slot transport state.""" - - shm_name: str | None = None - shm: SharedMemory | None = None - 
free_slots: deque[int] = field(default_factory=deque) - sequence_id: int = 1 - healthy: bool = True - ready: bool = False - in_flight: dict[int, InFlightSlot] = field(default_factory=dict) - max_in_flight_count: int = 0 - acked_sequence_count: int = 0 - ack_timeout_count: int = 0 - last_acked_sequence_id: int | None = None - last_ack_latency_s: float | None = None - max_ack_latency_s: float = 0.0 - last_credit_return_at: float | None = None - closed: bool = False - unhealthy_reason: SharedSlotUnhealthyReason | None = None - unhealthy_reason_detail: str | None = None - failure_message: str | None = None - - -@dataclass -class SharedSlotControlRuntime: - """Threading and socket resources for shared-slot control messages.""" - - control_socket_path: Path - control_endpoint: str - context: zmq.Context - control_socket: zmq.Socket - stop_event: threading.Event - control_thread: threading.Thread - watchdog_thread: threading.Thread - - @classmethod - def build( - cls, - *, - socket_path: Path, - control_listener_target: Callable[[], None], - watchdog_target: Callable[[], None], - ) -> SharedSlotControlRuntime: - """Build a control runtime with bound IPC socket and worker threads.""" - control_endpoint = f"ipc://{socket_path}" - context = zmq.Context() - control_socket = context.socket(zmq.PULL) - control_socket.setsockopt(zmq.LINGER, 0) - control_socket.bind(control_endpoint) - - stop_event = threading.Event() - control_thread = threading.Thread( - target=control_listener_target, - name="shared-slot-control-listener", - daemon=True, - ) - watchdog_thread = threading.Thread( - target=watchdog_target, - name="shared-slot-watchdog", - daemon=True, - ) - return cls( - control_socket_path=socket_path, - control_endpoint=control_endpoint, - context=context, - control_socket=control_socket, - stop_event=stop_event, - control_thread=control_thread, - watchdog_thread=watchdog_thread, - ) - - def start(self) -> None: - """Start the control listener and watchdog threads.""" - 
self.control_thread.start() - self.watchdog_thread.start() diff --git a/neuracore/data_daemon/communications_management/shared_transport/registry.py b/neuracore/data_daemon/communications_management/shared_transport/registry.py deleted file mode 100644 index 48e56386d..000000000 --- a/neuracore/data_daemon/communications_management/shared_transport/registry.py +++ /dev/null @@ -1,709 +0,0 @@ -"""Producer-side registry for daemon-owned shared-slot transport.""" - -from __future__ import annotations - -import logging -import os -import threading -import time -import uuid -from multiprocessing import resource_tracker -from multiprocessing.shared_memory import SharedMemory -from pathlib import Path - -import zmq - -from neuracore.data_daemon.const import ACK_BASE_DIR -from neuracore.data_daemon.models import ( - CommandType, - MessageEnvelope, - OpenFixedSharedSlotsModel, - SharedSlotCreditReturn, - SharedSlotOpenFailedModel, - SharedSlotReadyModel, -) - -from .models import ( - InFlightSlot, - SharedSlotControlRuntime, - SharedSlotRegistryConfig, - SharedSlotRegistryState, - SharedSlotUnhealthyReason, -) - -logger = logging.getLogger(__name__) - - -def create_control_socket_path(base_dir: Path = ACK_BASE_DIR) -> Path: - """Create a unique filesystem path for the shared-slot control socket.""" - base_dir.mkdir(parents=True, exist_ok=True) - socket_path = base_dir / f"slot_control_{os.getpid()}_{uuid.uuid4().hex}.ipc" - try: - socket_path.unlink() - except FileNotFoundError: - pass - return socket_path - - -class SharedSlotUnhealthyError(RuntimeError): - """Raised when the shared-slot transport can no longer accept work.""" - - -class SharedSlotOpenFailedError(SharedSlotUnhealthyError): - """Raised when the daemon explicitly rejected a shared-slot open request.""" - - -class SharedSlotTimeout(TimeoutError): - """Raised when shared-slot setup, allocation, or credit return times out.""" - - -class SharedSlotRegistry: - """Producer-side session state for one daemon-owned 
shared-slot transport.""" - - def __init__( - self, - slot_size: int, - slot_count: int, - ack_timeout_s: float, - allocate_timeout_s: float, - ) -> None: - """Initialize a producer-side shared-slot registry. - - Args: - slot_size: Size of each shared-memory slot in bytes. - slot_count: Number of fixed slots available in the shared-memory - transport. - ack_timeout_s: Maximum time to wait for daemon acknowledgements and - credit-return progress before marking the transport unhealthy. - allocate_timeout_s: Maximum time to wait for shared-slot setup or - slot allocation before timing out. - """ - self._config = SharedSlotRegistryConfig( - slot_size=int(slot_size), - slot_count=int(slot_count), - ack_timeout_s=float(ack_timeout_s), - allocate_timeout_s=float(allocate_timeout_s), - ) - self._state = SharedSlotRegistryState() - self._condition = threading.Condition() - self._runtime = SharedSlotControlRuntime.build( - socket_path=create_control_socket_path(), - control_listener_target=self._control_listener_loop, - watchdog_target=self._watchdog_loop, - ) - self._runtime.start() - - @property - def slot_size(self) -> int: - """Return the configured slot size.""" - return self._config.slot_size - - @property - def slot_count(self) -> int: - """Return the configured slot count.""" - return self._config.slot_count - - @property - def ack_timeout_s(self) -> float: - """Return the configured ACK timeout.""" - return self._config.ack_timeout_s - - @property - def allocate_timeout_s(self) -> float: - """Return the configured allocation timeout.""" - return self._config.allocate_timeout_s - - @property - def shm_name(self) -> str | None: - """Return the current attached shared-memory name, if any.""" - return self._state.shm_name - - @property - def control_endpoint(self) -> str: - """Return the producer-side control endpoint.""" - return self._runtime.control_endpoint - - def request_payload(self) -> OpenFixedSharedSlotsModel: - """Return the setup request payload for 
daemon-owned fixed shared slots.""" - return OpenFixedSharedSlotsModel( - control_endpoint=self.control_endpoint, - slot_size=self.slot_size, - slot_count=self.slot_count, - ) - - def is_ready(self) -> bool: - """Return True when the daemon has opened the shared memory session.""" - with self._condition: - return self._state.ready - - def wait_until_ready(self) -> bool: - """Block until the daemon has opened the shared-slot session.""" - deadline = time.monotonic() + self.allocate_timeout_s - with self._condition: - while not self._state.ready: - if not self._is_healthy_locked(): - return False - remaining = deadline - time.monotonic() - if remaining <= 0: - self._mark_unhealthy_locked(SharedSlotUnhealthyReason.OPEN_TIMEOUT) - return False - self._condition.wait(timeout=min(0.1, remaining)) - return True - - def is_healthy(self) -> bool: - """Return True while the shared-slot transport is still healthy.""" - with self._condition: - return self._is_healthy_locked() - - def ensure_healthy(self) -> None: - """Raise when the shared-slot transport is unhealthy.""" - with self._condition: - if self._is_healthy_locked(): - return - raise self._build_unhealthy_error_locked() - - def allocate_slot(self) -> tuple[int, int]: - """Reserve one free slot or fail when backpressure persists.""" - deadline = time.monotonic() + self.allocate_timeout_s - with self._condition: - while True: - self._check_for_timeouts_locked() - if not self._is_healthy_locked(): - raise self._build_unhealthy_error_locked() - if self._state.ready and self._state.free_slots: - slot_id = self._state.free_slots.popleft() - return int(slot_id), int(slot_id) * self.slot_size - remaining = deadline - time.monotonic() - if remaining <= 0: - if not self._state.ready: - self._mark_unhealthy_locked( - SharedSlotUnhealthyReason.OPEN_TIMEOUT - ) - raise SharedSlotTimeout( - "Timed out waiting for daemon-owned shared slots to open" - ) - raise SharedSlotTimeout("Timed out waiting for a free shared slot") - 
self._condition.wait(timeout=min(0.1, remaining)) - - def mark_in_flight(self, slot_id: int, sequence_id: int) -> int: - """Record that the slot now backs one sent descriptor.""" - with self._condition: - self._check_for_timeouts_locked() - - if not self._is_healthy_locked(): - raise self._build_unhealthy_error_locked() - self._reserve_slot_for_descriptor_locked( - slot_id=slot_id, - sequence_id=sequence_id, - ) - return sequence_id - - def mark_sent(self, sequence_id: int) -> None: - """Start the credit-return timeout clock after socket send.""" - with self._condition: - self._mark_descriptor_sent_locked(sequence_id) - - def next_sequence_number(self) -> int: - """Reserve the next shared-slot sequence number.""" - with self._condition: - sequence_id = self._state.sequence_id - self._state.sequence_id += 1 - self._condition.notify_all() - return sequence_id - - def get_last_reserved_sequence_number(self) -> int: - """Return the most recently reserved shared-slot sequence number.""" - with self._condition: - return self._state.sequence_id - 1 - - def has_in_flight_at_or_before(self, sequence_number: int) -> bool: - """Return True if any in-flight descriptor at or before sequence remains.""" - with self._condition: - return any(seq <= sequence_number for seq in self._state.in_flight) - - def release_slot(self, shm_name: str, slot_id: int, sequence_id: int) -> bool: - """Release one in-flight slot after a matching credit return arrives.""" - with self._condition: - self._check_for_timeouts_locked() - return self._apply_slot_credit_locked( - shm_name=shm_name, - slot_id=slot_id, - sequence_id=sequence_id, - ) - - def rollback_enqueued_slot(self, sequence_id: int) -> None: - """Return a slot immediately when descriptor enqueue fails.""" - with self._condition: - self._release_sequence_locked(sequence_id) - - def notify_sender_failure(self) -> None: - """Fail fast when a descriptor could not be written to ZMQ.""" - with self._condition: - if not 
self._is_healthy_locked(): - return - self._mark_unhealthy_locked(SharedSlotUnhealthyReason.SENDER_FAILURE) - - def get_in_flight_count(self) -> int: - """Return the number of descriptors still awaiting slot credit.""" - with self._condition: - return len(self._state.in_flight) - - def shared_memory_view(self, offset: int, length: int) -> memoryview: - """Return a writable view into one slot-sized shared-memory span.""" - shm = self._state.shm - if shm is None: - raise RuntimeError("Shared-slot transport is not ready") - return shm.buf[offset : offset + length] - - def reset_session(self) -> None: - """Reset per-recording shared-slot session state for reuse.""" - with self._condition: - if self._state.closed: - raise RuntimeError("Cannot reset a closed shared-slot registry") - self._reset_session_locked() - - def close(self) -> None: - """Stop threads and close local handles.""" - with self._condition: - if self._state.closed: - return - in_flight_count = len(self._state.in_flight) - if in_flight_count > 0: - logger.info( - "Closing shared-slot registry with in-flight slots " - "shm_name=%s in_flight=%d free_slots=%d unhealthy_reason=%s", - self._state.shm_name, - in_flight_count, - len(self._state.free_slots), - ( - self._state.unhealthy_reason.value - if self._state.unhealthy_reason is not None - else None - ), - ) - self._mark_closed_locked() - - self._runtime.stop_event.set() - self._runtime.control_thread.join(timeout=1.0) - self._runtime.watchdog_thread.join(timeout=1.0) - self._close_control_resources() - self._close_shared_memory() - self._remove_control_socket_path() - - def _control_listener_loop(self) -> None: - logger.info( - "Shared-slot control receiver started endpoint=%s", - self.control_endpoint, - ) - poller = zmq.Poller() - poller.register(self._runtime.control_socket, zmq.POLLIN) - - try: - while not self._runtime.stop_event.is_set(): - try: - events = dict(poller.poll(100)) - except zmq.ZMQError: - logger.exception( - "Shared-slot control 
receiver poll failed endpoint=%s", - self.control_endpoint, - ) - break - - if self._runtime.control_socket not in events: - continue - - try: - message = MessageEnvelope.from_bytes( - self._runtime.control_socket.recv() - ) - self._process_control_message(message) - except Exception: - logger.exception("Failed to process shared-slot control message") - finally: - with self._condition: - closed = self._state.closed - healthy = self._state.healthy - log_fn = ( - logger.info - if self._runtime.stop_event.is_set() and closed - else logger.warning - ) - log_fn( - "Shared-slot control receiver exiting endpoint=%s " - "stop_event=%s closed=%s healthy=%s", - self.control_endpoint, - self._runtime.stop_event.is_set(), - closed, - healthy, - ) - - def _process_control_message(self, message: MessageEnvelope) -> None: - if message.command == CommandType.SHARED_SLOT_READY: - ready = SharedSlotReadyModel(**message.payload[message.command.value]) - self._apply_ready_message(ready) - return - if message.command == CommandType.SHARED_SLOT_OPEN_FAILED: - failure = SharedSlotOpenFailedModel( - **message.payload[message.command.value] - ) - self._apply_open_failed_message(failure) - return - if message.command == CommandType.SHARED_SLOT_CREDIT_RETURN: - credit = SharedSlotCreditReturn.from_dict( - message.payload[message.command.value] - ) - self._process_slot_credit_return(credit) - return - logger.warning( - "Ignoring unexpected shared-slot control command %s", message.command - ) - - def _apply_ready_message(self, ready: SharedSlotReadyModel) -> None: - try: - shm = SharedMemory(name=ready.shm_name, create=False) - try: - resource_tracker.unregister( - getattr(shm, "_name", shm.name), "shared_memory" - ) - except Exception: - logger.debug( - "Failed to unregister daemon-owned shared-memory handle %s", - ready.shm_name, - exc_info=True, - ) - except Exception: - logger.exception( - "Failed to attach daemon-owned shared memory %s", ready.shm_name - ) - with self._condition: - 
self._mark_unhealthy_locked(SharedSlotUnhealthyReason.ATTACH_FAILED) - return - - with self._condition: - if self._state.closed: - shm.close() - return - self._set_ready_shared_memory_locked(shm=shm, ready=ready) - - def _apply_open_failed_message(self, failure: SharedSlotOpenFailedModel) -> None: - """Record a daemon-reported shared-slot open failure.""" - with self._condition: - if self._state.closed: - return - self._mark_unhealthy_locked( - SharedSlotUnhealthyReason.OPEN_FAILED, - error_message=failure.error_message, - ) - - def _watchdog_loop(self) -> None: - while not self._runtime.stop_event.wait(0.1): - with self._condition: - self._check_for_timeouts_locked() - - def _set_ready_shared_memory_locked( - self, - *, - shm: SharedMemory, - ready: SharedSlotReadyModel, - ) -> None: - """Swap in a newly attached shared-memory region and mark registry ready.""" - if self._state.shm is not None: - self._state.shm.close() - self._config = SharedSlotRegistryConfig( - slot_size=int(ready.slot_size), - slot_count=int(ready.slot_count), - ack_timeout_s=self._config.ack_timeout_s, - allocate_timeout_s=self._config.allocate_timeout_s, - ) - self._state.shm = shm - self._state.shm_name = ready.shm_name - self._state.free_slots = type(self._state.free_slots)(range(self.slot_count)) - self._state.last_credit_return_at = None - self._state.ready = True - self._condition.notify_all() - - def _reset_session_locked(self) -> None: - """Clear daemon-session state while preserving channel sequence IDs.""" - self._close_shared_memory_locked() - self._state.shm_name = None - self._state.free_slots.clear() - self._state.ready = False - self._state.healthy = True - self._state.in_flight.clear() - self._state.max_in_flight_count = 0 - self._state.acked_sequence_count = 0 - self._state.ack_timeout_count = 0 - self._state.last_acked_sequence_id = None - self._state.last_ack_latency_s = None - self._state.max_ack_latency_s = 0.0 - self._state.last_credit_return_at = None - 
self._state.unhealthy_reason = None - self._state.unhealthy_reason_detail = None - self._state.failure_message = None - self._condition.notify_all() - - def _check_for_timeouts_locked(self) -> None: - if not self._is_healthy_locked(): - return - sent_entries = [ - entry - for entry in self._state.in_flight.values() - if entry.socket_sent_at is not None - ] - if not sent_entries: - return - - now = time.monotonic() - oldest_entry = min(sent_entries, key=lambda entry: entry.socket_sent_at or now) - last_progress_at = self._state.last_credit_return_at - stalled_since = ( - oldest_entry.socket_sent_at - if last_progress_at is None - else last_progress_at - ) - if stalled_since is None or now - stalled_since < self.ack_timeout_s: - return - - in_flight_sequences = sorted(self._state.in_flight) - logger.warning( - "Shared-slot credit stalled shm_name=%s slot_id=%s sequence_id=%s " - "in_flight=%d free_slots=%d stalled_for=%.3fs ack_timeout=%.3fs " - "last_acked_sequence_id=%s acked_sequence_count=%d " - "max_ack_latency=%.3fs in_flight_sequences=%s", - oldest_entry.shm_name, - oldest_entry.slot_id, - oldest_entry.sequence_id, - len(self._state.in_flight), - len(self._state.free_slots), - now - stalled_since, - self.ack_timeout_s, - self._state.last_acked_sequence_id, - self._state.acked_sequence_count, - self._state.max_ack_latency_s, - in_flight_sequences[:16], - ) - self._state.ack_timeout_count += 1 - self._mark_unhealthy_locked( - SharedSlotUnhealthyReason.CREDIT_STALL, - reason_detail=( - f"sequence_id={oldest_entry.sequence_id}," - f"slot_id={oldest_entry.slot_id}" - ), - ) - - def _release_sequence_locked(self, sequence_id: int) -> None: - entry = self._state.in_flight.pop(sequence_id, None) - if entry is None: - return - self._state.free_slots.append(entry.slot_id) - self._condition.notify_all() - - def _reserve_slot_for_descriptor_locked( - self, - slot_id: int, - sequence_id: int, - ) -> None: - """Create in-flight tracking for a reserved slot and sequence.""" 
- shm_name = self._state.shm_name - if shm_name is None: - raise RuntimeError("Shared-slot transport is not ready") - - if sequence_id < 0: - raise ValueError("sequence_id must be non-negative") - - if sequence_id in self._state.in_flight: - raise RuntimeError(f"Shared-slot sequence already in flight: {sequence_id}") - - self._state.in_flight[sequence_id] = InFlightSlot( - shm_name=shm_name, - slot_id=int(slot_id), - sequence_id=sequence_id, - reserved_at=time.monotonic(), - ) - self._state.max_in_flight_count = max( - self._state.max_in_flight_count, - len(self._state.in_flight), - ) - self._condition.notify_all() - - def _mark_descriptor_sent_locked(self, sequence_id: int) -> None: - """Start the credit timeout clock for one in-flight descriptor.""" - entry = self._state.in_flight.get(sequence_id) - if entry is None or entry.socket_sent_at is not None: - return - - self._state.in_flight[sequence_id] = InFlightSlot( - shm_name=entry.shm_name, - slot_id=entry.slot_id, - sequence_id=entry.sequence_id, - reserved_at=entry.reserved_at, - socket_sent_at=time.monotonic(), - ) - self._condition.notify_all() - - def _apply_slot_credit_locked( - self, - shm_name: str, - slot_id: int, - sequence_id: int, - ) -> bool: - """Apply one returned slot credit to the in-flight state.""" - entry = self._state.in_flight.get(sequence_id) - if entry is None or entry.shm_name != shm_name or entry.slot_id != slot_id: - logger.debug( - "Ignoring stale or unknown slot credit " - "shm_name=%s slot_id=%s sequence_id=%s", - shm_name, - slot_id, - sequence_id, - ) - return False - - now = time.monotonic() - if entry.socket_sent_at is not None: - ack_latency_s = now - entry.socket_sent_at - self._state.last_ack_latency_s = ack_latency_s - self._state.max_ack_latency_s = max( - self._state.max_ack_latency_s, - ack_latency_s, - ) - - self._state.last_acked_sequence_id = sequence_id - self._state.acked_sequence_count += 1 - self._release_sequence_locked(sequence_id) - 
self._state.last_credit_return_at = now - self._condition.notify_all() - return True - - def _process_slot_credit_return(self, credit: SharedSlotCreditReturn) -> None: - """Apply one returned slot credit.""" - self.release_slot( - credit.shm_name, - credit.slot_id, - credit.sequence_id, - ) - - def _is_healthy_locked(self) -> bool: - """Return True when the registry can still accept work.""" - return self._state.healthy and not self._state.closed - - def _mark_closed_locked(self) -> None: - """Mark the registry closed and wake any waiters.""" - self._state.closed = True - self._state.healthy = False - self._condition.notify_all() - - def _mark_unhealthy_locked( - self, - reason: SharedSlotUnhealthyReason, - *, - error_message: str | None = None, - reason_detail: str | None = None, - sequence_ids: list[int] | None = None, - ) -> None: - """Transition to unhealthy state and release affected slots.""" - was_healthy = self._state.healthy - diagnostics = self._format_unhealthy_diagnostics_locked() - self._state.healthy = False - self._state.unhealthy_reason = reason - self._state.unhealthy_reason_detail = reason_detail - if error_message is not None: - self._state.failure_message = error_message - if was_healthy: - if error_message: - logger.error( - "Shared-slot transport marked unhealthy reason=%s " - "daemon_error=%r %s", - reason, - error_message, - diagnostics, - ) - else: - logger.error( - "Shared-slot transport marked unhealthy reason=%s %s", - reason, - diagnostics, - ) - ids_to_release = ( - list(self._state.in_flight) if sequence_ids is None else sequence_ids - ) - for sequence_id in ids_to_release: - self._release_sequence_locked(sequence_id) - self._condition.notify_all() - - def _build_unhealthy_error_locked(self) -> RuntimeError: - """Build the most specific unhealthy transport error available.""" - base_message = ( - self._state.failure_message or "Shared-slot transport is unhealthy" - ) - reason = self._state.unhealthy_reason - if reason: - reason_label 
= reason.value - if self._state.unhealthy_reason_detail: - reason_label = f"{reason_label}({self._state.unhealthy_reason_detail})" - message = ( - f"{base_message}: reason={reason_label} " - f"{self._format_unhealthy_diagnostics_locked()}" - ) - else: - message = f"{base_message}: {self._format_unhealthy_diagnostics_locked()}" - if reason is SharedSlotUnhealthyReason.OPEN_FAILED: - return SharedSlotOpenFailedError(message) - return SharedSlotUnhealthyError(message) - - def _format_unhealthy_diagnostics_locked(self) -> str: - """Return compact state details for unhealthy transport logs/errors.""" - in_flight_sequences = sorted(self._state.in_flight) - return ( - f"shm_name={self._state.shm_name} ready={self._state.ready} " - f"closed={self._state.closed} in_flight={len(self._state.in_flight)} " - f"free_slots={len(self._state.free_slots)} slot_count={self.slot_count} " - f"last_acked_sequence_id={self._state.last_acked_sequence_id} " - f"acked_sequence_count={self._state.acked_sequence_count} " - f"ack_timeout_count={self._state.ack_timeout_count} " - f"last_ack_latency_s={self._state.last_ack_latency_s} " - f"max_ack_latency_s={self._state.max_ack_latency_s:.3f} " - f"in_flight_sequences={in_flight_sequences[:16]}" - ) - - def _close_control_resources(self) -> None: - """Close the producer-side control socket and its ZMQ context.""" - try: - self._runtime.control_socket.close(0) - except Exception: - logger.warning("Failed to close shared-slot control socket", exc_info=True) - try: - self._runtime.context.term() - except Exception: - logger.warning( - "Failed to terminate shared-slot control context", exc_info=True - ) - - def _close_shared_memory(self) -> None: - """Close the local attachment to the daemon-owned shared-memory region.""" - with self._condition: - self._close_shared_memory_locked() - - def _close_shared_memory_locked(self) -> None: - """Close the local attachment to the daemon-owned shared-memory region.""" - shm = self._state.shm - if shm is None: 
- return - try: - shm.close() - except Exception: - logger.warning("Failed to close shared-memory handle", exc_info=True) - self._state.shm = None - - def _remove_control_socket_path(self) -> None: - """Remove the filesystem entry backing the control IPC endpoint.""" - try: - self._runtime.control_socket_path.unlink() - except FileNotFoundError: - pass - except OSError: - logger.warning( - "Failed to remove shared-slot control socket file %s", - self._runtime.control_socket_path, - exc_info=True, - ) diff --git a/neuracore/data_daemon/communications_management/shared_transport/shared_memory_budget.py b/neuracore/data_daemon/communications_management/shared_transport/shared_memory_budget.py deleted file mode 100644 index dc08153fa..000000000 --- a/neuracore/data_daemon/communications_management/shared_transport/shared_memory_budget.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Budget shared-memory allocations for daemon-owned slot transport.""" - -import logging -import shutil -import threading -from dataclasses import dataclass, field - -from neuracore.data_daemon.communications_management.shared_transport.models import ( - SharedSlotReservation, -) -from neuracore.data_daemon.lifecycle.runtime_recovery import _default_shm_path - -logger = logging.getLogger(__name__) - -BYTES_PER_MIB = 1024**2 - - -@dataclass(frozen=True) -class SHMBytesAllocation: - """One tracked shared-memory allocation.""" - - shm_name: str - allocated_bytes: int - - -@dataclass -class SHMBytesAllocationRegistry: - """Bookkeeping for outstanding shared-memory allocations.""" - - _allocations: dict[str, SHMBytesAllocation] = field(default_factory=dict) - - def add(self, allocation: SHMBytesAllocation) -> None: - """Track a new allocation by shared-memory name.""" - self._allocations[allocation.shm_name] = allocation - - def pop(self, shm_name: str) -> SHMBytesAllocation | None: - """Remove and return one tracked allocation, if present.""" - return self._allocations.pop(shm_name, None) - - -class 
SharedMemoryBudget: - """Track a conservative `/dev/shm` budget for shared-slot segments.""" - - def __init__( - self, - shm_path: str | None = None, - budget_fraction: float = 0.75, - ) -> None: - """Initialize budget state for future shared-memory reservations.""" - self._shm_path = shm_path or _default_shm_path() - self._budget_fraction = budget_fraction - self._lock = threading.Lock() - self._reserved_bytes = 0 - self._allocations = SHMBytesAllocationRegistry() - - def reserve( - self, - *, - shm_name: str, - slot_size: int, - requested_slot_count: int, - ) -> SharedSlotReservation: - """Reserve shared-memory capacity for a fixed-slot segment.""" - usage = shutil.disk_usage(self._shm_path) - total_budget = int(usage.total * self._budget_fraction) - - with self._lock: - remaining_budget = total_budget - self._reserved_bytes - - if remaining_budget < slot_size: - raise RuntimeError( - "Not enough shared-memory for data throughput requirements. " - "Next steps: 1) increase shared memory size, and/or " - "2) reduce volume of data logged. 
" - f"slot_size={slot_size / BYTES_PER_MIB:.2f}MiB, " - f"remaining={remaining_budget / BYTES_PER_MIB:.2f}MiB, " - f"reserved={self._reserved_bytes / BYTES_PER_MIB:.2f}MiB, " - f"budget={total_budget / BYTES_PER_MIB:.2f}MiB, " - f"shm_total={usage.total / BYTES_PER_MIB:.2f}MiB" - ) - - slot_count = min( - requested_slot_count, - remaining_budget // slot_size, - ) - - allocated_bytes = slot_size * slot_count - - allocation = SHMBytesAllocation( - shm_name=shm_name, - allocated_bytes=allocated_bytes, - ) - - self._reserved_bytes += allocated_bytes - self._allocations.add(allocation) - - reserved_bytes = self._reserved_bytes - - logger.debug( - "Reserved shared-memory budget shm_name=%s slot_size=%.2fMiB " - "requested_slot_count=%d actual_slot_count=%d allocated=%.2fMiB " - "reserved_total=%.2fMiB budget=%.2fMiB shm_total=%.2fMiB", - shm_name, - slot_size / BYTES_PER_MIB, - requested_slot_count, - slot_count, - allocated_bytes / BYTES_PER_MIB, - reserved_bytes / BYTES_PER_MIB, - total_budget / BYTES_PER_MIB, - usage.total / BYTES_PER_MIB, - ) - - return SharedSlotReservation( - slot_count=int(slot_count), - allocated_bytes=int(allocated_bytes), - ) - - def release(self, shm_name: str) -> None: - """Release any tracked reservation for the given shared-memory name.""" - with self._lock: - allocation = self._allocations.pop(shm_name) - - if allocation is None: - return - - self._reserved_bytes = max( - 0, - self._reserved_bytes - allocation.allocated_bytes, - ) - reserved_bytes = self._reserved_bytes - - logger.debug( - "Released shared-memory budget shm_name=%s released=%.2fMiB " - "reserved_total=%.2fMiB", - allocation.shm_name, - allocation.allocated_bytes / BYTES_PER_MIB, - reserved_bytes / BYTES_PER_MIB, - ) - - def rollback(self, shm_name: str) -> None: - """Alias `release` for callers handling failed allocation setup.""" - self.release(shm_name) diff --git a/neuracore/data_daemon/communications_management/shared_transport/shared_slot_daemon_handler.py 
b/neuracore/data_daemon/communications_management/shared_transport/shared_slot_daemon_handler.py deleted file mode 100644 index d77065d96..000000000 --- a/neuracore/data_daemon/communications_management/shared_transport/shared_slot_daemon_handler.py +++ /dev/null @@ -1,534 +0,0 @@ -"""Daemon-side shared-slot transport helpers.""" - -from __future__ import annotations - -import logging -import threading -import time -import uuid -from collections.abc import Callable -from multiprocessing.shared_memory import SharedMemory -from typing import Protocol - -import zmq - -from neuracore.data_daemon.const import SHARED_SLOT_SHM_PREFIX -from neuracore.data_daemon.helpers import env_float -from neuracore.data_daemon.models import ( - CommandType, - MessageEnvelope, - OpenFixedSharedSlotsModel, - SharedMemoryChunkMetadata, - SharedSlotCreditReturn, - SharedSlotDescriptor, - SharedSlotOpenFailedModel, - SharedSlotReadyModel, -) - -from ..consumer.bridge_chunk_spool import BridgeChunkSpool, ChunkSpoolRef -from ..consumer.models import ChannelState -from .communications_manager import CommunicationsManager -from .models import SharedSlotTransportResult -from .shared_memory_budget import SharedMemoryBudget -from .shared_slot_transport import parse_shared_frame_packet_view - -logger = logging.getLogger(__name__) - -SHARED_SLOT_REOPEN_DRAIN_TIMEOUT_S = 1 - - -class SharedSlotDescriptorAbandoned(RuntimeError): - """Raised when a queued descriptor belongs to an abandoned slot session.""" - - -class _AckSenderSocket(Protocol): - def close(self, linger: int = 0) -> None: ... - - def connect(self, addr: str) -> None: ... - - def send(self, data: bytes) -> None: ... - - def setsockopt(self, option: int, value: int) -> None: ... 
- - -class SharedSlotDaemonHandler: - """Own daemon-side shared-slot transport mechanics.""" - - def __init__( - self, - comm: CommunicationsManager, - reopen_drain_timeout_s: float = SHARED_SLOT_REOPEN_DRAIN_TIMEOUT_S, - ) -> None: - """Initialize daemon-side caches for shared memory and ACK sockets.""" - self._comm = comm - self._shared_memory_cache: dict[str, SharedMemory] = {} - self._ack_sender_sockets: dict[str, _AckSenderSocket] = {} - self._shared_memory_budget = SharedMemoryBudget() - self._reopen_drain_timeout_s = env_float( - "NCD_SHARED_SLOT_REOPEN_DRAIN_TIMEOUT_S", - float(reopen_drain_timeout_s), - ) - self._descriptor_delay_once_s = env_float( - "NCD_TEST_SHARED_SLOT_DESCRIPTOR_DELAY_ONCE_S", - 0.0, - ) - self._descriptor_delay_lock = threading.Lock() - self._pending_by_shm: dict[str, set[tuple[str, int]]] = {} - self._abandoned_descriptors: set[tuple[str, str, int]] = set() - self._pending_condition = threading.Condition() - - def _cleanup_previous_shared_slots( - self, channel: ChannelState, control_endpoint: str | None = None - ) -> None: - """Clean up a producer's previous shared-slot resources, if any.""" - previous_shm_name = channel.shared_slot.shm_name - previous_endpoint = channel.shared_slot.control_endpoint - - if previous_shm_name: - old = self._shared_memory_cache.pop(previous_shm_name, None) - if old is not None: - try: - old.close() - finally: - try: - old.unlink() - except FileNotFoundError: - pass - - self._shared_memory_budget.release(previous_shm_name) - - if previous_endpoint and previous_endpoint != control_endpoint: - old_socket = self._ack_sender_sockets.pop(previous_endpoint, None) - if old_socket is not None: - old_socket.close(0) - - channel.shared_slot.reset() - - def handle_open( - self, - channel: ChannelState, - payload: dict, - on_abandoned_sequences: Callable[[str, list[int]], None] | None = None, - ) -> None: - """Open daemon-owned fixed shared slots for one channel.""" - request = OpenFixedSharedSlotsModel(**payload) 
- - if channel.shared_slot.shm_name is not None: - abandoned_sequences = self._wait_or_abandon_previous_session(channel) - if abandoned_sequences and on_abandoned_sequences is not None: - on_abandoned_sequences(channel.producer_id, abandoned_sequences) - self._cleanup_previous_shared_slots(channel, request.control_endpoint) - - shm_name = f"{SHARED_SLOT_SHM_PREFIX}{uuid.uuid4().hex[:16]}" - - reservation = None - shm: SharedMemory | None = None - - try: - reservation = self._shared_memory_budget.reserve( - shm_name=shm_name, - slot_size=request.slot_size, - requested_slot_count=request.slot_count, - ) - - shm = SharedMemory( - name=shm_name, - create=True, - size=reservation.allocated_bytes, - ) - - self._shared_memory_cache[shm_name] = shm - - channel.mark_shared_slot_transport_open( - control_endpoint=request.control_endpoint, - shm_name=shm_name, - ) - - self._send_ready_message( - endpoint=request.control_endpoint, - ready=SharedSlotReadyModel( - shm_name=shm_name, - slot_size=request.slot_size, - slot_count=reservation.slot_count, - ), - ) - - except Exception as exc: - error_message = str(exc) or exc.__class__.__name__ - self._shared_memory_cache.pop(shm_name, None) - - if shm is not None: - try: - shm.close() - shm.unlink() - except FileNotFoundError: - pass - except Exception: - logger.warning( - "Failed to clean up shared memory after open failure %s", - shm_name, - exc_info=True, - ) - - self._shared_memory_budget.rollback(shm_name) - channel.shared_slot.reset() - if request.control_endpoint: - try: - self._send_open_failed_message( - endpoint=request.control_endpoint, - failure=SharedSlotOpenFailedModel(error_message=error_message), - ) - except Exception: - logger.warning( - "Failed to send shared-slot open failure " - "producer_id=%s endpoint=%s", - channel.producer_id, - request.control_endpoint, - exc_info=True, - ) - raise - - def handle_descriptor( - self, - channel: ChannelState, - payload: dict, - chunk_spool: BridgeChunkSpool, - ) -> 
SharedSlotTransportResult: - """Spool, credit, and parse one shared-slot descriptor.""" - self._delay_pending_descriptor_processing(channel, payload) - descriptor = SharedSlotDescriptor.from_dict(payload) - self._raise_if_descriptor_abandoned(channel, descriptor) - spool_failed = False - try: - metadata_dict, chunk_spool_ref = self._spool_shared_slot_packet( - descriptor, chunk_spool - ) - except Exception: - spool_failed = True - logger.exception( - "Shared-slot copy failed " "producer_id=%s sequence_id=%s slot_id=%s", - channel.producer_id, - descriptor.sequence_id, - descriptor.slot_id, - ) - raise - finally: - try: - self._send_slot_credit_return(channel, descriptor) - except Exception: - if spool_failed: - logger.exception( - "Failed to return shared-slot credit after copy failure " - "producer_id=%s sequence_id=%s slot_id=%s", - channel.producer_id, - descriptor.sequence_id, - descriptor.slot_id, - ) - else: - raise - - channel.mark_shared_slot_descriptor_seen( - shm_name=descriptor.shm_name, - ) - - chunk_metadata = SharedMemoryChunkMetadata.from_dict(metadata_dict) - return SharedSlotTransportResult( - descriptor=descriptor, - chunk_metadata=chunk_metadata, - chunk_spool_ref=chunk_spool_ref, - trace_id=chunk_metadata.trace_id, - trace_metadata=chunk_metadata.trace_metadata, - ) - - def _delay_pending_descriptor_processing( - self, - channel: ChannelState, - payload: dict, - ) -> None: - """Optional one-shot descriptor delay used by integration tests.""" - with self._descriptor_delay_lock: - delay_s = self._descriptor_delay_once_s - self._descriptor_delay_once_s = 0.0 - if delay_s <= 0.0: - return - - logger.warning( - "Delaying shared-slot descriptor processing for test " - "producer_id=%s shm_name=%s sequence_id=%s delay=%.3fs", - channel.producer_id, - payload.get("shm_name"), - payload.get("sequence_id"), - delay_s, - ) - time.sleep(delay_s) - - def mark_descriptor_pending( - self, - channel: ChannelState, - payload: dict, - ) -> SharedSlotDescriptor: - 
"""Track a shared-slot descriptor until the spool worker has handled it.""" - descriptor = SharedSlotDescriptor.from_dict(payload) - with self._pending_condition: - descriptor_key = self._descriptor_key(channel, descriptor) - if descriptor_key in self._abandoned_descriptors: - raise SharedSlotDescriptorAbandoned( - "Shared-slot descriptor belongs to an abandoned session " - f"producer_id={channel.producer_id} " - f"shm_name={descriptor.shm_name} " - f"sequence_id={descriptor.sequence_id}" - ) - self._pending_by_shm.setdefault(descriptor.shm_name, set()).add( - (channel.producer_id, descriptor.sequence_id) - ) - self._pending_condition.notify_all() - return descriptor - - def mark_descriptor_completed( - self, - producer_id: str, - descriptor: SharedSlotDescriptor, - ) -> None: - """Clear daemon-side pending tracking for one shared-slot descriptor.""" - with self._pending_condition: - pending = self._pending_by_shm.get(descriptor.shm_name) - if pending is not None: - pending.discard((producer_id, descriptor.sequence_id)) - if not pending: - self._pending_by_shm.pop(descriptor.shm_name, None) - self._abandoned_descriptors.discard( - (producer_id, descriptor.shm_name, descriptor.sequence_id) - ) - self._pending_condition.notify_all() - - def _descriptor_key( - self, - channel: ChannelState, - descriptor: SharedSlotDescriptor, - ) -> tuple[str, str, int]: - return (channel.producer_id, descriptor.shm_name, descriptor.sequence_id) - - def _raise_if_descriptor_abandoned( - self, - channel: ChannelState, - descriptor: SharedSlotDescriptor, - ) -> None: - with self._pending_condition: - descriptor_key = self._descriptor_key(channel, descriptor) - if descriptor_key not in self._abandoned_descriptors: - return - raise SharedSlotDescriptorAbandoned( - "Skipping abandoned shared-slot descriptor " - f"producer_id={channel.producer_id} " - f"shm_name={descriptor.shm_name} " - f"sequence_id={descriptor.sequence_id}" - ) - - def _wait_or_abandon_previous_session(self, channel: 
ChannelState) -> list[int]: - """Wait for old descriptors before reusing a producer's shared-slot state.""" - shm_name = channel.shared_slot.shm_name - if shm_name is None: - return [] - - deadline = time.monotonic() + self._reopen_drain_timeout_s - with self._pending_condition: - while True: - pending = self._pending_by_shm.get(shm_name, set()) - producer_pending = { - sequence_id - for producer_id, sequence_id in pending - if producer_id == channel.producer_id - } - if not producer_pending: - return [] - - remaining = deadline - time.monotonic() - if remaining <= 0: - abandoned = sorted(producer_pending) - for sequence_id in abandoned: - pending.discard((channel.producer_id, sequence_id)) - self._abandoned_descriptors.add( - (channel.producer_id, shm_name, sequence_id) - ) - if not pending: - self._pending_by_shm.pop(shm_name, None) - self._pending_condition.notify_all() - break - self._pending_condition.wait(timeout=min(0.1, remaining)) - - logger.warning( - "Abandoning stalled shared-slot session before reopen " - "producer_id=%s shm_name=%s pending_sequences=%s " - "timeout=%.3fs", - channel.producer_id, - shm_name, - abandoned, - self._reopen_drain_timeout_s, - ) - return abandoned - - def cleanup_channel_resources(self, channel: ChannelState) -> None: - """Close daemon-side shared-slot resources associated with one channel.""" - shm_name = channel.shared_slot.shm_name - if shm_name: - shm = self._shared_memory_cache.pop(shm_name, None) - - if shm is not None: - try: - shm.close() - shm.unlink() - except FileNotFoundError: - pass - except Exception: - logger.warning( - "Failed to close cached shared memory %s", - shm_name, - exc_info=True, - ) - finally: - self._shared_memory_budget.release(shm_name) - else: - self._shared_memory_budget.release(shm_name) - - endpoint = channel.shared_slot.control_endpoint - if endpoint: - socket_obj = self._ack_sender_sockets.pop(endpoint, None) - if socket_obj is not None: - try: - socket_obj.close(0) - except Exception: - 
logger.warning( - "Failed to close shared-slot ACK sender %s", - endpoint, - exc_info=True, - ) - - channel.shared_slot.reset() - - def close(self) -> None: - """Close all daemon-side shared-slot handles during shutdown.""" - for socket_obj in self._ack_sender_sockets.values(): - try: - socket_obj.close(0) - except Exception: - logger.warning("Failed to close shared-slot ACK sender", exc_info=True) - self._ack_sender_sockets.clear() - - for shm_name, shm in list(self._shared_memory_cache.items()): - try: - shm.close() - shm.unlink() - except FileNotFoundError: - pass - except Exception: - logger.warning("Failed to close cached shared memory", exc_info=True) - finally: - self._shared_memory_budget.release(shm_name) - - self._shared_memory_cache.clear() - - def _spool_shared_slot_packet( - self, - descriptor: SharedSlotDescriptor, - chunk_spool: BridgeChunkSpool, - ) -> tuple[dict[str, object], ChunkSpoolRef]: - """Copy one payload chunk from shared memory into the disk-backed spool.""" - packet_view = self._shared_slot_packet_view(descriptor) - try: - metadata, chunk_start, chunk_end = parse_shared_frame_packet_view( - packet_view - ) - chunk_view = packet_view[chunk_start:chunk_end] - try: - chunk_spool_ref = chunk_spool.append(chunk_view) - finally: - chunk_view.release() - return metadata, chunk_spool_ref - finally: - packet_view.release() - - def _shared_slot_packet_view(self, descriptor: SharedSlotDescriptor) -> memoryview: - """Return one packet view out of cached shared memory.""" - shm = self._shared_memory_cache.get(descriptor.shm_name) - if shm is None: - raise RuntimeError( - "Shared-slot shared memory handle missing from daemon cache. 
" - "Expected handle to be cached during handle_open() " - f"for shm_name={descriptor.shm_name}" - ) - - return shm.buf[descriptor.offset : descriptor.offset + descriptor.length] - - def _send_ready_message( - self, - endpoint: str, - ready: SharedSlotReadyModel, - ) -> None: - """Send one daemon-owned shared-slot ready message.""" - socket_obj = self._get_or_create_ack_sender_socket(endpoint) - socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_READY, - payload={CommandType.SHARED_SLOT_READY.value: ready.model_dump()}, - ).to_bytes() - ) - - def _send_open_failed_message( - self, - endpoint: str, - failure: SharedSlotOpenFailedModel, - ) -> None: - """Send one daemon-owned shared-slot open failure message.""" - socket_obj = self._get_or_create_ack_sender_socket(endpoint) - socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_OPEN_FAILED, - payload={ - CommandType.SHARED_SLOT_OPEN_FAILED.value: failure.model_dump() - }, - ).to_bytes() - ) - - def _get_or_create_ack_sender_socket(self, endpoint: str) -> _AckSenderSocket: - """Return a cached PUSH socket for one producer control endpoint.""" - socket_obj = self._ack_sender_sockets.get(endpoint) - if socket_obj is None: - socket_obj = self._comm._context.socket(zmq.PUSH) - socket_obj.setsockopt(zmq.LINGER, 0) - socket_obj.connect(endpoint) - self._ack_sender_sockets[endpoint] = socket_obj - return socket_obj - - def _send_slot_credit_return( - self, - channel: ChannelState, - descriptor: SharedSlotDescriptor, - ) -> None: - """Return one writable slot credit immediately after shared-memory copy-out.""" - endpoint = channel.shared_slot.control_endpoint - if not endpoint: - raise RuntimeError("Shared-slot control endpoint is not available") - socket_obj = self._get_or_create_ack_sender_socket(endpoint) - credit = SharedSlotCreditReturn( - shm_name=descriptor.shm_name, - slot_id=descriptor.slot_id, - sequence_id=descriptor.sequence_id, - ) - try: 
- socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_CREDIT_RETURN, - payload={ - CommandType.SHARED_SLOT_CREDIT_RETURN.value: credit.to_dict() - }, - ).to_bytes() - ) - except Exception: - logger.exception( - "Failed to return shared-slot credit producer_id=%s sequence_id=%s", - channel.producer_id, - descriptor.sequence_id, - ) diff --git a/neuracore/data_daemon/communications_management/shared_transport/shared_slot_transport.py b/neuracore/data_daemon/communications_management/shared_transport/shared_slot_transport.py deleted file mode 100644 index cd2fd05af..000000000 --- a/neuracore/data_daemon/communications_management/shared_transport/shared_slot_transport.py +++ /dev/null @@ -1,552 +0,0 @@ -"""Fixed shared-slot transport for producer-side video packets.""" - -from __future__ import annotations - -import json -import logging -import queue -import struct -import threading -import time - -from neuracore.data_daemon.communications_management.sequence_allocator import ( - ChannelSequenceAllocator, -) -from neuracore.data_daemon.const import ( - DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - DEFAULT_VIDEO_SLOT_COUNT, - DEFAULT_VIDEO_SLOT_SIZE, - SHARED_MEMORY_RECORD_HEADER_FORMAT, - SHARED_MEMORY_RECORD_HEADER_SIZE, - SHARED_MEMORY_RECORD_MAGIC, -) -from neuracore.data_daemon.helpers import env_float -from neuracore.data_daemon.models import ( - CommandType, - MessageEnvelope, - OpenFixedSharedSlotsModel, - SharedSlotDescriptor, -) - -from ..producer.producer_channel_message_sender import ProducerChannelMessageSender -from .models import QueuedSharedSlotPacket -from .registry import SharedSlotRegistry - -logger = logging.getLogger(__name__) - - -class PacketTooLarge(ValueError): - """Raised when a packet cannot fit in a single shared slot.""" - - -def build_shared_frame_packet( - metadata: dict[str, str | int | None], - chunk: bytes | bytearray | memoryview, -) -> bytes: - """Build the 
self-describing packet stored in one shared slot.""" - metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") - payload = bytes(chunk) - return ( - struct.pack( - SHARED_MEMORY_RECORD_HEADER_FORMAT, - SHARED_MEMORY_RECORD_MAGIC, - len(metadata_bytes), - len(payload), - ) - + metadata_bytes - + payload - ) - - -def build_shared_frame_packet_metadata( - metadata: dict[str, str | int | None], - chunk: bytes | bytearray | memoryview, -) -> tuple[bytes, int]: - """Return serialized metadata plus total packet length without copying the chunk.""" - metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode("utf-8") - chunk_len = len(chunk) - packet_length = SHARED_MEMORY_RECORD_HEADER_SIZE + len(metadata_bytes) + chunk_len - return metadata_bytes, packet_length - - -def parse_shared_frame_packet(packet: bytes) -> tuple[dict[str, object], bytes]: - """Parse one self-describing packet copied out of a shared slot.""" - metadata, chunk_start, chunk_end = parse_shared_frame_packet_view( - memoryview(packet) - ) - return metadata, packet[chunk_start:chunk_end] - - -def parse_shared_frame_packet_view( - packet: memoryview, -) -> tuple[dict[str, object], int, int]: - """Parse one shared-slot packet view without copying the payload chunk.""" - if len(packet) < SHARED_MEMORY_RECORD_HEADER_SIZE: - raise ValueError("Shared-slot packet shorter than record header") - magic, metadata_len, chunk_len = struct.unpack( - SHARED_MEMORY_RECORD_HEADER_FORMAT, - packet[:SHARED_MEMORY_RECORD_HEADER_SIZE], - ) - if magic != SHARED_MEMORY_RECORD_MAGIC: - raise ValueError("Shared-slot packet missing shared record magic") - expected = SHARED_MEMORY_RECORD_HEADER_SIZE + metadata_len + chunk_len - if len(packet) < expected: - raise ValueError("Shared-slot packet shorter than declared lengths") - if len(packet) > expected: - raise ValueError("Shared-slot packet contains trailing bytes") - metadata_start = SHARED_MEMORY_RECORD_HEADER_SIZE - chunk_start = metadata_start + 
metadata_len - metadata = json.loads(packet[metadata_start:chunk_start].tobytes().decode("utf-8")) - return metadata, chunk_start, expected - - -class SharedSlotVideoWorker: - """Background worker that writes packets into daemon-owned shared-memory slots.""" - - _instance: SharedSlotVideoWorker | None = None - _refcount = 0 - _instance_lock = threading.Lock() - - def __init__(self, registry: SharedSlotRegistry) -> None: - """Start the background worker for one shared-slot registry.""" - self._registry = registry - self._queue: queue.Queue[QueuedSharedSlotPacket | None] = queue.Queue( - maxsize=max(1, registry.slot_count) - ) - self._active_items = 0 - self._active_items_lock = threading.Lock() - self._error: Exception | None = None - self._error_lock = threading.Lock() - self._last_handed_off_sequence_number = 0 - self._handoff_cv = threading.Condition() - self._thread = threading.Thread( - target=self._worker_loop, - name="shared-slot-video-worker", - daemon=True, - ) - self._thread.start() - - @classmethod - def acquire(cls, registry: SharedSlotRegistry) -> SharedSlotVideoWorker: - """Acquire a singleton worker instance for isolated unit tests.""" - with cls._instance_lock: - if cls._instance is None: - cls._instance = cls(registry) - cls._refcount += 1 - return cls._instance - - @classmethod - def release_shared_instance(cls) -> None: - """Release one singleton test worker reference.""" - with cls._instance_lock: - if cls._instance is None: - return - cls._refcount = max(0, cls._refcount - 1) - if cls._refcount > 0: - return - instance = cls._instance - cls._instance = None - instance.close() - - @classmethod - def reset_shared_instance_for_tests(cls) -> None: - """Tear down the singleton test worker, if any.""" - with cls._instance_lock: - instance = cls._instance - cls._instance = None - cls._refcount = 0 - if instance is not None: - instance.close() - - def enqueue_packet( - self, - *, - packet: QueuedSharedSlotPacket, - ) -> None: - """Queue one complete 
packet for shared-memory copy and descriptor send.""" - self._ensure_running() - if packet.packet_length > self._registry.slot_size: - raise PacketTooLarge( - "Packet length " - f"{packet.packet_length} exceeds slot size {self._registry.slot_size}" - ) - while True: - self._ensure_running() - try: - self._queue.put(packet, timeout=0.1) - return - except queue.Full: - continue - - def close(self) -> None: - """Stop the worker thread.""" - try: - self._queue.put(None, timeout=0.1) - except queue.Full: - pass - self._thread.join(timeout=1.0) - - def is_idle(self) -> bool: - """Return True when the worker has no queued packets left.""" - with self._active_items_lock: - return self._queue.qsize() == 0 and self._active_items == 0 - - def last_handed_off_sequence_number(self) -> int: - """Return the latest descriptor sequence handed off to the sender.""" - with self._handoff_cv: - return self._last_handed_off_sequence_number - - def wait_until_handed_off_through( - self, - sequence_number: int, - timeout_s: float, - ) -> None: - """Wait until all packets through sequence_number are descriptor-enqueued.""" - if sequence_number <= 0: - return - - deadline = time.monotonic() + timeout_s - - with self._handoff_cv: - while self._last_handed_off_sequence_number < sequence_number: - worker_error = self.get_error() - if worker_error is not None: - raise RuntimeError( - "Shared-slot video worker failed before payload handoff " - f"completed: {worker_error}" - ) from worker_error - - remaining = deadline - time.monotonic() - if remaining <= 0: - raise RuntimeError( - "Timed out waiting for shared-slot payload handoff before stop" - ) - - self._handoff_cv.wait(timeout=min(0.05, remaining)) - - def get_error(self) -> Exception | None: - """Return the worker error, if the background thread failed.""" - with self._error_lock: - return self._error - - def _ensure_running(self) -> None: - self._registry.ensure_healthy() - with self._error_lock: - if self._error is not None: - raise 
RuntimeError( - f"Shared-slot video worker failed: {self._error}" - ) from self._error - if not self._thread.is_alive(): - raise RuntimeError("Shared-slot video worker is not running") - - def _worker_loop(self) -> None: - while True: - item = self._queue.get() - try: - if item is None: - break - with self._active_items_lock: - self._active_items += 1 - try: - self._process_item(item) - except Exception as exc: - with self._error_lock: - self._error = exc - with self._handoff_cv: - self._handoff_cv.notify_all() - logger.exception("Shared-slot video worker failed") - break - finally: - with self._active_items_lock: - self._active_items = max(0, self._active_items - 1) - finally: - self._queue.task_done() - - with self._handoff_cv: - self._handoff_cv.notify_all() - - def _process_item(self, item: QueuedSharedSlotPacket) -> None: - """Copy a queued packet into a shm and hand off descriptor.""" - slot_id, offset = self._registry.allocate_slot() - try: - shm_view = self._registry.shared_memory_view(offset, item.packet_length) - try: - header = struct.pack( - SHARED_MEMORY_RECORD_HEADER_FORMAT, - SHARED_MEMORY_RECORD_MAGIC, - len(item.metadata_bytes), - len(item.chunk), - ) - header_end = SHARED_MEMORY_RECORD_HEADER_SIZE - metadata_end = header_end + len(item.metadata_bytes) - shm_view[:header_end] = header - shm_view[header_end:metadata_end] = item.metadata_bytes - shm_view[metadata_end : item.packet_length] = item.chunk - finally: - shm_view.release() - - sequence_id = item.sequence_number - - self._registry.mark_in_flight( - slot_id=slot_id, - sequence_id=sequence_id, - ) - - if self._registry.shm_name is None: - raise RuntimeError("Shared-slot transport is not ready") - descriptor = SharedSlotDescriptor( - shm_name=self._registry.shm_name, - slot_id=slot_id, - offset=offset, - length=item.packet_length, - sequence_id=sequence_id, - slot_size=self._registry.slot_size, - ) - envelope = MessageEnvelope( - producer_id=item.producer_id, - 
command=CommandType.SHARED_SLOT_DESCRIPTOR, - payload={ - CommandType.SHARED_SLOT_DESCRIPTOR.value: descriptor.to_dict(), - }, - sequence_number=sequence_id, - ) - try: - item.sender.enqueue_envelope( - envelope, - on_sent=lambda: self._registry.mark_sent(sequence_id), - on_failed_send=self._registry.notify_sender_failure, - ) - except Exception: - self._registry.rollback_enqueued_slot(sequence_id) - raise - - with self._handoff_cv: - self._last_handed_off_sequence_number = max( - self._last_handed_off_sequence_number, - sequence_id, - ) - self._handoff_cv.notify_all() - - finally: - del item - - -class SharedSlotVideoTransport: - """Producer-facing adapter over one daemon-owned shared-slot session.""" - - def __init__( - self, - sequence_allocator: ChannelSequenceAllocator | None = None, - slot_size: int = DEFAULT_VIDEO_SLOT_SIZE, - slot_count: int = DEFAULT_VIDEO_SLOT_COUNT, - ack_timeout_s: float = DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s: float = DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) -> None: - """Initialize a producer-side shared-slot transport. - - Args: - sequence_allocator: Channel sequence allocator to use. - slot_size: Size of each shared-memory slot in bytes. - slot_count: Number of fixed slots available in the shared-memory - transport. - ack_timeout_s: Maximum time to wait for daemon acknowledgements and - credit-return progress before marking the transport unhealthy. - allocate_timeout_s: Maximum time to wait for shared-slot setup or - slot allocation before timing out. 
- """ - self._registry = SharedSlotRegistry( - slot_size=slot_size, - slot_count=slot_count, - ack_timeout_s=env_float( - "NCD_VIDEO_ACK_TIMEOUT_SECONDS", - ack_timeout_s, - ), - allocate_timeout_s=env_float( - "NCD_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS", - allocate_timeout_s, - ), - ) - self._sequence_allocator = ( - sequence_allocator - if sequence_allocator is not None - else ChannelSequenceAllocator() - ) - self._worker = SharedSlotVideoWorker(self._registry) - self._announced = False - self._payload_sequence_lock = threading.Lock() - self._last_payload_sequence_number = 0 - - @property - def slot_size(self) -> int: - """Return the configured fixed slot size.""" - return self._registry.slot_size - - def open_payload(self) -> OpenFixedSharedSlotsModel: - """Return the setup request payload and mark the transport announced.""" - self._announced = True - return self._registry.request_payload() - - def is_announced(self) -> bool: - """Return True when setup has been announced to the daemon.""" - return self._announced - - def is_ready(self) -> bool: - """Return True when the daemon has opened the shared-memory session.""" - return self._registry.is_ready() - - def wait_until_ready(self) -> bool: - """Block until the daemon has opened the shared-memory session.""" - return self._registry.wait_until_ready() - - def enqueue_packet( - self, - producer_id: str, - sender: ProducerChannelMessageSender, - metadata: dict[str, str | int | None], - chunk: bytes | bytearray | memoryview, - stop_cutoff_sequence_number: int | None = None, - ) -> int | None: - """Serialize one transport packet and hand it to the background worker. - - Returns the reserved sequence number, or None if rejected by stop cutoff. 
- """ - sequence_number = self._sequence_allocator.reserve() - - if ( - stop_cutoff_sequence_number is not None - and sequence_number > stop_cutoff_sequence_number - ): - return None - - metadata_bytes, packet_length = build_shared_frame_packet_metadata( - metadata, - chunk, - ) - chunk_bytes = chunk if isinstance(chunk, bytes) else bytes(chunk) - self._worker.enqueue_packet( - packet=QueuedSharedSlotPacket( - producer_id=producer_id, - sender=sender, - metadata_bytes=metadata_bytes, - chunk=chunk_bytes, - packet_length=packet_length, - sequence_number=sequence_number, - ) - ) - - with self._payload_sequence_lock: - self._last_payload_sequence_number = max( - self._last_payload_sequence_number, - sequence_number, - ) - - return sequence_number - - def next_sequence_number(self) -> int: - """Reserve a channel-scoped sequence number for control messages.""" - return self._sequence_allocator.reserve() - - def get_last_reserved_sequence_number(self) -> int: - """Return the most recently reserved shared-slot sequence number.""" - return self._sequence_allocator.get_last_reserved_sequence_number() - - def get_last_payload_sequence_number(self) -> int: - """Return the latest sequence number reserved for a payload packet.""" - with self._payload_sequence_lock: - return self._last_payload_sequence_number - - def is_healthy(self) -> bool: - """Return True while the transport can accept new video writes.""" - return self._registry.is_healthy() - - def notify_sender_failure(self) -> None: - """Mark the shared-slot transport unhealthy after sender failure.""" - self._registry.notify_sender_failure() - - def finish_recording_session(self) -> None: - """Reset transport state so the next recording opens a fresh session.""" - self._registry.reset_session() - self._announced = False - - with self._payload_sequence_lock: - self._last_payload_sequence_number = 0 - - def wait_until_drained( - self, - timeout_s: float = 30.0, - max_sequence_number: int | None = None, - ) -> None: - 
"""Wait until queued packets and in-flight credits are settled.""" - deadline = time.monotonic() + timeout_s - - while time.monotonic() < deadline: - worker_error = self._worker.get_error() - if worker_error is not None: - raise RuntimeError( - "Shared-slot transport worker failed before drain completed: " - f"{worker_error}" - ) from worker_error - - if max_sequence_number is None: - drained = self._is_drained() - elif max_sequence_number <= 0: - drained = True - else: - drained = ( - self._worker.last_handed_off_sequence_number() - >= max_sequence_number - and not self._registry.has_in_flight_at_or_before( - max_sequence_number - ) - ) - - if drained: - return - - time.sleep(0.05) - - raise RuntimeError( - "Timed out waiting for shared-slot transport to drain before close" - ) - - def wait_until_payload_handed_off( - self, - timeout_s: float = 30.0, - max_sequence_number: int | None = None, - ) -> None: - """Wait until queued payloads have been copied and descriptor-enqueued.""" - if max_sequence_number is not None: - self._worker.wait_until_handed_off_through( - sequence_number=max_sequence_number, - timeout_s=timeout_s, - ) - return - - deadline = time.monotonic() + timeout_s - - while time.monotonic() < deadline: - worker_error = self._worker.get_error() - if worker_error is not None: - raise RuntimeError( - "Shared-slot transport worker failed before payload " - f"handoff completed: {worker_error}" - ) from worker_error - - if self._worker.is_idle(): - return - - time.sleep(0.01) - - raise RuntimeError( - "Timed out waiting for shared-slot payload handoff before stop" - ) - - def close(self) -> None: - """Release this channel's shared-slot runtime.""" - self._worker.close() - self._registry.close() - - def _is_drained(self) -> bool: - """Return True when shutdown can proceed without queued local work.""" - return self._worker.is_idle() and self._registry.get_in_flight_count() == 0 diff --git a/neuracore/data_daemon/const.py b/neuracore/data_daemon/const.py 
index 9b5bf7dfb..f3c6d6e42 100644 --- a/neuracore/data_daemon/const.py +++ b/neuracore/data_daemon/const.py @@ -18,33 +18,41 @@ # uint32 total_chunks, uint32 chunk_len CHUNK_HEADER_SIZE = struct.calcsize(CHUNK_HEADER_FORMAT) -SHARED_MEMORY_RECORD_MAGIC = b"NCR1" -SHARED_MEMORY_RECORD_HEADER_FORMAT = "!4sII" -SHARED_MEMORY_RECORD_HEADER_SIZE = struct.calcsize(SHARED_MEMORY_RECORD_HEADER_FORMAT) -SHARED_SLOT_SHM_PREFIX = "ncs-" +VIDEO_TRANSPORT_PACKET_MAGIC = b"NCR1" +VIDEO_TRANSPORT_PACKET_HEADER_FORMAT = "!4sII" +VIDEO_TRANSPORT_PACKET_HEADER_SIZE = struct.calcsize( + VIDEO_TRANSPORT_PACKET_HEADER_FORMAT +) -# Shared transport sizing. +# Transport sizing. # Keep these aligned with frontend/PFE expectations. DEFAULT_CHUNK_SIZE = 64 * 1024 # 64 KiB -DEFAULT_SHARED_MEMORY_SIZE = 8 * 1024 * 1024 # 8 MiB +DEFAULT_TRANSPORT_BUFFER_SIZE = 8 * 1024 * 1024 # 8 MiB # 4K RGB frame: 3840 * 2160 * 3 = 24,883,200 bytes ~= 23.73 MiB. -# A video record must fit in one shared-memory slot, including header + metadata. +# A video chunk must fit in one loaned transport sample, including header + metadata. DEFAULT_VIDEO_CHUNK_SIZE = 4 * 1024 * 1024 # 4 MiB DEFAULT_VIDEO_SEND_QUEUE_MAXSIZE = 0 DEFAULT_VIDEO_SLOT_SIZE = DEFAULT_VIDEO_CHUNK_SIZE + ( 64 * 1024 ) # metadata + header headroom -DEFAULT_VIDEO_SLOT_COUNT = max( - 1, (32 * 1024 * 1024) // DEFAULT_VIDEO_SLOT_SIZE # 32 MiB total budget -) -DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS = 5.0 -DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS = 5.0 + +# iceoryx2 video transport settings. +# One zero-copy publish/subscribe service per producer channel, named +# f"{IOX2_SERVICE_PREFIX}{channel_id}". +IOX2_SERVICE_PREFIX = "neuracore/video/" +# Slots in the daemon subscriber ring buffer per channel. Under overload the +# oldest frames are overwritten (DiscardData semantics). 
+IOX2_SUBSCRIBER_BUFFER_SIZE = 16 +# Historical samples retained so a daemon subscriber that registers slightly +# after the producer starts publishing can still catch up on recent frames. +IOX2_HISTORY_SIZE = 16 +# Maximum encoded frame packet (header + metadata + chunk) per loaned slot. +IOX2_MAX_FRAME_BYTES = DEFAULT_VIDEO_SLOT_SIZE BASE_DIR = Path("/tmp/ndd") SOCKET_PATH = BASE_DIR / "management.sock" -ACK_BASE_DIR = BASE_DIR / "slot_acks" # Uploads Configuration paths and files CONFIG_DIR = Path.home() / ".neuracore" @@ -85,3 +93,6 @@ # default profile name DEFAULT_PROFILE_NAME = "default_profile" + +DEFAULT_UPLOAD_WAIT_TIMEOUT_SECONDS = 180 +DURATION_VARIATION_TOLERANCE_SECONDS = 4 diff --git a/neuracore/data_daemon/lifecycle/runtime_recovery.py b/neuracore/data_daemon/lifecycle/runtime_recovery.py index 7dc6d8015..5bccad073 100644 --- a/neuracore/data_daemon/lifecycle/runtime_recovery.py +++ b/neuracore/data_daemon/lifecycle/runtime_recovery.py @@ -10,7 +10,6 @@ from collections.abc import Iterable, Iterator from pathlib import Path -from neuracore.data_daemon.const import ACK_BASE_DIR, SHARED_SLOT_SHM_PREFIX from neuracore.data_daemon.lifecycle.daemon_os_control import ( DaemonLifecycleError, remove_pid_file, @@ -87,59 +86,6 @@ def shared_memory_required_bytes( return int(total_payload_bytes) + int(metadata_size) -def cleanup_stale_shared_slot_segments( - shm_dir: Path = _SHARED_MEMORY_DIR, -) -> int: - """Remove stale daemon-owned shared-slot segments from /dev/shm.""" - if not shm_dir.exists(): - return 0 - - cleaned = 0 - - for shm_path in shm_dir.iterdir(): - if not shm_path.name.startswith(SHARED_SLOT_SHM_PREFIX): - continue - - try: - shm_path.unlink() - cleaned += 1 - except FileNotFoundError: - continue - except OSError as exc: - logger.warning( - "Failed to remove shared-slot segment %s: %s", - shm_path, - exc, - ) - - return cleaned - - -def cleanup_stale_shared_slot_control_sockets( - ack_dir: Path = ACK_BASE_DIR, -) -> int: - """Remove stale 
shared-slot ACK control socket files.""" - if not ack_dir.exists(): - return 0 - - cleaned = 0 - - for socket_path in ack_dir.glob("slot_control_*.ipc"): - try: - socket_path.unlink() - cleaned += 1 - except FileNotFoundError: - continue - except OSError as exc: - logger.warning( - "Failed to remove shared-slot ACK socket %s: %s", - socket_path, - exc, - ) - - return cleaned - - def cleanup_socket_files(paths: Iterable[Path]) -> None: """Remove socket files that exist on disk.""" for socket_path in paths: @@ -158,8 +104,6 @@ def cleanup_stale_runtime_state( socket_paths: Iterable[Path], ) -> None: """Clean stale daemon-owned runtime files from previous processes.""" - cleanup_stale_shared_slot_segments() - cleanup_stale_shared_slot_control_sockets() cleanup_socket_files(socket_paths) @@ -285,8 +229,6 @@ def shutdown( __all__ = [ "checkpoint_sqlite", "cleanup_stale_runtime_state", - "cleanup_stale_shared_slot_control_sockets", - "cleanup_stale_shared_slot_segments", "cleanup_socket_files", "ensure_shared_memory_capacity", "reconcile_state_with_filesystem", diff --git a/neuracore/data_daemon/models.py b/neuracore/data_daemon/models.py index f767c0c66..4c31a8ae8 100644 --- a/neuracore/data_daemon/models.py +++ b/neuracore/data_daemon/models.py @@ -27,11 +27,6 @@ def get_content_type(data_type: DataType) -> str: class CommandType(Enum): """Commands sent from the producer to the daemon.""" - OPEN_FIXED_SHARED_SLOTS = "open_fixed_shared_slots" - SHARED_SLOT_DESCRIPTOR = "shared_slot_descriptor" - SHARED_SLOT_READY = "shared_slot_ready" - SHARED_SLOT_OPEN_FAILED = "shared_slot_open_failed" - SHARED_SLOT_CREDIT_RETURN = "shared_slot_credit_return" HEARTBEAT = "heartbeat" DATA_CHUNK = "data_chunk" BATCHED_JOINT_DATA = "batched_joint_data" @@ -255,35 +250,11 @@ def from_row(cls, row: dict[str, Any]) -> "TraceRecord": ) -class OpenFixedSharedSlotsModel(BaseModel): - """Producer request to open daemon-owned fixed shared slots.""" - - transport_mode: str = 
"FIXED_SHARED_SLOTS_DAEMON_OWNED" - control_endpoint: str - slot_size: int - slot_count: int - - -class SharedSlotReadyModel(BaseModel): - """Daemon response describing one opened shared-slot transport.""" - - shm_name: str - slot_size: int - slot_count: int - - -class SharedSlotOpenFailedModel(BaseModel): - """Daemon response describing why a shared-slot open request failed.""" - - error_message: str - - class ManagementModel(BaseModel): """Model for management commands from the producer to the daemon.""" producer_id: str command: CommandType - open_fixed_shared_slots: OpenFixedSharedSlotsModel | None = None @dataclass(frozen=True) @@ -385,8 +356,8 @@ def merged_with( @dataclass(frozen=True) -class SharedMemoryChunkMetadata: - """Per-chunk metadata written into shared memory.""" +class VideoTransportChunkMetadata: + """Per-chunk metadata carried by the video transport.""" trace_id: str chunk_index: int @@ -394,8 +365,8 @@ class SharedMemoryChunkMetadata: trace_metadata: TraceTransportMetadata | None = None @classmethod - def from_dict(cls, data: dict[str, Any]) -> "SharedMemoryChunkMetadata": - """Parse a shared-memory chunk metadata record from JSON.""" + def from_dict(cls, data: dict[str, Any]) -> "VideoTransportChunkMetadata": + """Parse a video-transport chunk metadata record from JSON.""" return cls( trace_id=str(data["trace_id"]), chunk_index=int(data["chunk_index"]), @@ -404,7 +375,7 @@ def from_dict(cls, data: dict[str, Any]) -> "SharedMemoryChunkMetadata": ) def to_dict(self) -> dict[str, str | int | None]: - """Serialize the shared-memory chunk metadata to a JSON-friendly dict.""" + """Serialize the video-transport chunk metadata to a JSON-friendly dict.""" payload: dict[str, str | int | None] = { "trace_id": self.trace_id, "chunk_index": self.chunk_index, @@ -415,72 +386,6 @@ def to_dict(self) -> dict[str, str | int | None]: return payload -@dataclass(frozen=True) -class SharedSlotDescriptor: - """Descriptor for one packet stored in shared memory.""" - 
- shm_name: str - slot_id: int - offset: int - length: int - sequence_id: int - slot_size: int - ack_endpoint: str | None = None - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "SharedSlotDescriptor": - """Parse a shared-slot descriptor from a dict payload.""" - return cls( - shm_name=str(data["shm_name"]), - slot_id=int(data["slot_id"]), - offset=int(data["offset"]), - length=int(data["length"]), - sequence_id=int(data["sequence_id"]), - slot_size=int(data["slot_size"]), - ack_endpoint=( - None if data.get("ack_endpoint") is None else str(data["ack_endpoint"]) - ), - ) - - def to_dict(self) -> dict[str, str | int | None]: - """Serialize the descriptor to a JSON-friendly dict.""" - return { - "shm_name": self.shm_name, - "slot_id": self.slot_id, - "offset": self.offset, - "length": self.length, - "sequence_id": self.sequence_id, - "slot_size": self.slot_size, - "ack_endpoint": self.ack_endpoint, - } - - -@dataclass(frozen=True) -class SharedSlotCreditReturn: - """Credit return for one daemon-owned shared-memory slot.""" - - shm_name: str - slot_id: int - sequence_id: int - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "SharedSlotCreditReturn": - """Parse a slot credit return from a dict payload.""" - return cls( - shm_name=str(data["shm_name"]), - slot_id=int(data["slot_id"]), - sequence_id=int(data["sequence_id"]), - ) - - def to_dict(self) -> dict[str, str | int]: - """Serialize the credit return to a JSON-friendly dict.""" - return { - "shm_name": self.shm_name, - "slot_id": self.slot_id, - "sequence_id": self.sequence_id, - } - - @dataclass class DataChunkPayload: """Payload for the DATA_CHUNK command.""" diff --git a/neuracore/data_daemon/runtime.py b/neuracore/data_daemon/runtime.py index 96ae08251..16cc7f6e0 100644 --- a/neuracore/data_daemon/runtime.py +++ b/neuracore/data_daemon/runtime.py @@ -22,9 +22,10 @@ ) from neuracore.data_daemon.const import ( DEFAULT_PROFILE_NAME, - DEFAULT_SHARED_MEMORY_SIZE, - 
DEFAULT_VIDEO_SLOT_COUNT, - DEFAULT_VIDEO_SLOT_SIZE, + DEFAULT_TRANSPORT_BUFFER_SIZE, + IOX2_HISTORY_SIZE, + IOX2_MAX_FRAME_BYTES, + IOX2_SUBSCRIBER_BUFFER_SIZE, ) from neuracore.data_daemon.event_emitter import Emitter from neuracore.data_daemon.event_loop_manager import EventLoopManager @@ -94,11 +95,12 @@ def _prepare_runtime_state(self, config: DaemonConfig) -> Path: try: free_shared_bytes = shared_memory_free_bytes() min_required_bytes = shared_memory_required_bytes( - DEFAULT_SHARED_MEMORY_SIZE, + DEFAULT_TRANSPORT_BUFFER_SIZE, metadata_size=4096, ) video_required_bytes = shared_memory_required_bytes( - DEFAULT_VIDEO_SLOT_SIZE * DEFAULT_VIDEO_SLOT_COUNT, + IOX2_MAX_FRAME_BYTES + * (IOX2_SUBSCRIBER_BUFFER_SIZE + IOX2_HISTORY_SIZE), metadata_size=4096, ) if free_shared_bytes < min_required_bytes: diff --git a/neuracore/data_daemon/upload_management/upload_manager.py b/neuracore/data_daemon/upload_management/upload_manager.py index e8133e16c..621d146e0 100644 --- a/neuracore/data_daemon/upload_management/upload_manager.py +++ b/neuracore/data_daemon/upload_management/upload_manager.py @@ -353,7 +353,10 @@ async def upload_files() -> bool: return False cumulative_bytes += file.stat().st_size - + logger.info( + f"Marking trace complete on backend: trace-id: {trace_id}, " + f"recording_id: {recording_id}" + ) updated_trace = await self._trace_status_updater.update_trace_completed( recording_id=recording_id, trace_id=trace_id, @@ -374,6 +377,10 @@ async def upload_files() -> bool: return False self._emitter.emit(Emitter.UPLOAD_COMPLETE, trace_id) + logger.info( + f"Trace marked complete on backend: trace-id: {trace_id}, " + f"recording_id: {recording_id}" + ) return True except FileNotFoundError as e: diff --git a/setup.py b/setup.py index 6be125607..e7d9c469e 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ "neuracore_types>=7.1.0", "ordered_set", "pyzmq==27.1.0", + "iceoryx2>=0.9.0", "sqlalchemy>=2.0.0", "aiosqlite>=0.19.0", "aiohttp>=3.9.0", diff --git 
a/tests/integration/platform/data_daemon/behavioural_correctness/test_shared_slot_reopen.py b/tests/integration/platform/data_daemon/behavioural_correctness/test_shared_slot_reopen.py deleted file mode 100644 index 83c850024..000000000 --- a/tests/integration/platform/data_daemon/behavioural_correctness/test_shared_slot_reopen.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Behavioural tests for shared-slot reopen recovery through real daemon flows.""" - -from __future__ import annotations - -import logging -import time -from collections.abc import Callable - -import pytest - -import neuracore as nc -from tests.integration.platform.data_daemon.shared.assertions import ( - assert_exactly_one_daemon_pid, - assert_post_test_storage_state, - verify_cloud_results, -) -from tests.integration.platform.data_daemon.shared.process_control import Timer -from tests.integration.platform.data_daemon.shared.runners import online_daemon_running -from tests.integration.platform.data_daemon.shared.test_case.build_test_case import ( - DataDaemonTestCase, - camera_names, - has_configured_org, - joint_names_for_count, -) -from tests.integration.platform.data_daemon.shared.test_case.build_test_case_context import ( # noqa: E501 - ContextResult, - build_context_specs, - create_testing_dataset_name, - encode_frame_number, - log_frames, -) -from tests.integration.platform.data_daemon.shared.test_case.constants import ( - MAX_TIME_TO_START_S, - STOP_RECORDING_OVERHEAD_PER_SEC, -) -from tests.integration.platform.data_daemon.shared.test_infrastructure import ( - scoped_storage_state, - set_case_analysis_report, -) - -logger = logging.getLogger(__name__) - - -_CASE = DataDaemonTestCase( - duration_sec=5, - joint_count=4, - video_count=1, - image_width=64, - image_height=64, -) - - -def test_shared_slot_reopen_after_stalled_descriptor_uploads_next_recording( - monkeypatch: pytest.MonkeyPatch, - clear_daemon_timer_stats, - request: pytest.FixtureRequest, - test_wall_timer: Callable[[], float], -) -> 
None: - """Force the shared-slot reopen race, then verify uploaded cloud data. - - The daemon subprocess delays the first shared-slot descriptor long enough for - the same producer to cancel and immediately reopen shared slots. The reopen - path must abandon the old pending descriptor, avoid getting stuck on the old - sequence number, and still upload the following valid recording through the - normal cloud verification path. - """ - if not has_configured_org(): - pytest.skip( - "Shared-slot reopen behavioural test requires NEURACORE_ORG_ID" - " or a saved current organization." - ) - - monkeypatch.setenv("NCD_SHARED_SLOT_REOPEN_DRAIN_TIMEOUT_S", "0.05") - monkeypatch.setenv("NCD_TEST_SHARED_SLOT_DESCRIPTOR_DELAY_ONCE_S", "2.0") - - case = _CASE - dataset_name = create_testing_dataset_name(case) - specs = build_context_specs(case, dataset_name=dataset_name) - spec = specs[0] - robot_name = spec.robot_name - camera_name = camera_names(spec.case.video_count)[0] - results: list[ContextResult] = [] - - try: - with scoped_storage_state(case, dataset_name=dataset_name): - with online_daemon_running(): - assert_exactly_one_daemon_pid() - - with Timer( - MAX_TIME_TO_START_S, label="nc.create_dataset", always_log=True - ): - nc.create_dataset( - dataset_name, - description="Shared-slot reopen recovery upload test", - ) - with Timer( - MAX_TIME_TO_START_S, label="nc.connect_robot", always_log=True - ): - robot = nc.connect_robot(robot_name, overwrite=False) - - with Timer( - MAX_TIME_TO_START_S, label="nc.start_recording", always_log=True - ): - nc.start_recording(robot_name=robot_name) - cancelled_recording_id = robot.get_current_recording_id() - assert cancelled_recording_id is not None - - nc.log_rgb( - camera_name, - encode_frame_number( - 0, - spec.case.image_width, - spec.case.image_height, - ), - robot_name=robot_name, - timestamp=0.0, - ) - time.sleep(0.2) - - with Timer( - case.duration_sec * STOP_RECORDING_OVERHEAD_PER_SEC, - label="nc.cancel_recording", - 
always_log=True, - assert_deadline=False, - ): - nc.cancel_recording(robot_name=robot_name) - - logger.info( - "Starting replacement recording immediately after forced " - "shared-slot descriptor stall" - ) - wall_started_at = time.time() - with Timer( - MAX_TIME_TO_START_S, label="nc.start_recording", always_log=True - ): - nc.start_recording(robot_name=robot_name) - resumed_recording_id = robot.get_current_recording_id() - assert resumed_recording_id is not None - - log_frames(spec, recording_index=0, marker_name="marker_reopen") - - with Timer( - case.duration_sec * STOP_RECORDING_OVERHEAD_PER_SEC, - label="nc.stop_recording", - always_log=True, - assert_deadline=False, - ): - nc.stop_recording(robot_name=robot_name, wait=True) - wall_stopped_at = time.time() - - results = [ - ContextResult( - dataset_name=dataset_name, - recording_ids=[resumed_recording_id], - robot_name=robot_name, - joint_names=joint_names_for_count(spec.case.joint_count), - camera_names=camera_names(spec.case.video_count), - joint_frame_count=spec.expected_joint_frames, - video_frame_count=spec.expected_video_frames, - joint_fps=spec.case.joint_fps, - video_fps=spec.case.video_fps, - duration_sec=case.duration_sec + 1, - timestamp_start_s=spec.timestamp_start_s, - timestamp_end_s=spec.timestamp_start_s + case.duration_sec, - marker_names=["marker_reopen"], - has_video=bool(spec.case.video_count), - context_index=0, - wall_started_at=wall_started_at, - wall_stopped_at=wall_stopped_at, - timestamp_mode=case.timestamp_mode, - ) - ] - verify_cloud_results(results=results, case=case) - finally: - set_case_analysis_report( - request=request, - case=case, - results=results, - label_prefix="shared_slot_reopen", - test_wall_s=test_wall_timer(), - ) - - assert_post_test_storage_state(case.storage_state_action) diff --git a/tests/integration/platform/data_daemon/daemon_test_cases.py b/tests/integration/platform/data_daemon/daemon_test_cases.py index f3cf34a31..594e745ee 100644 --- 
a/tests/integration/platform/data_daemon/daemon_test_cases.py +++ b/tests/integration/platform/data_daemon/daemon_test_cases.py @@ -146,38 +146,41 @@ mode=MODE_STAGGERED, timestamp_mode=TIMESTAMP_MODE_REAL, ), - # DataDaemonTestCase( - # duration_sec=10, - # recording_count=1, - # video_count=1, - # image_height=120, - # image_width=120, - # video_fps=120, - # joint_fps=1000, - # producer_channels=PRODUCER_PER_THREAD, - # timestamp_mode=TIMESTAMP_MODE_REAL, - # wait=False, - # ), - # DataDaemonTestCase( - # duration_sec=10, - # recording_count=4, - # video_count=1, - # image_height=120, - # image_width=120, - # video_fps=120, - # joint_fps=500, - # producer_channels=PRODUCER_PER_THREAD, - # timestamp_mode=TIMESTAMP_MODE_STOCHASTIC, - # wait=False, - # ), - # DataDaemonTestCase( - # duration_sec=5, - # joint_count=7, - # parallel_contexts=1, - # recording_count=1, - # joint_fps=600, - # wait=True, - # ), + DataDaemonTestCase( + duration_sec=10, + recording_count=1, + video_count=1, + image_height=120, + image_width=120, + video_fps=120, + joint_fps=1000, + producer_channels=PRODUCER_PER_THREAD, + timestamp_mode=TIMESTAMP_MODE_REAL, + wait=True, + skip=True, + ), + DataDaemonTestCase( + duration_sec=10, + recording_count=4, + video_count=1, + image_height=120, + image_width=120, + video_fps=120, + joint_fps=500, + producer_channels=PRODUCER_PER_THREAD, + timestamp_mode=TIMESTAMP_MODE_STOCHASTIC, + wait=False, + skip=True, + ), + DataDaemonTestCase( + duration_sec=5, + joint_count=7, + parallel_contexts=1, + recording_count=1, + joint_fps=600, + wait=True, + skip=True, + ), DataDaemonTestCase( duration_sec=10, joint_count=7, @@ -325,6 +328,7 @@ context_duration_mode=DURATION_MODE_FIXED, joint_fps=250, wait=True, + # skip=True, ), # High number of medium-throughput robots with synchronized # recordings. 
Tests: multi-robot contention, mixed data types, diff --git a/tests/integration/platform/data_daemon/data_integrity/test_network.py b/tests/integration/platform/data_daemon/data_integrity/test_network.py index 0f8937e7d..1e0b10a67 100644 --- a/tests/integration/platform/data_daemon/data_integrity/test_network.py +++ b/tests/integration/platform/data_daemon/data_integrity/test_network.py @@ -4,6 +4,7 @@ import pytest +from neuracore.data_daemon.const import DEFAULT_UPLOAD_WAIT_TIMEOUT_SECONDS from tests.integration.platform.data_daemon.daemon_test_cases import ( NETWORK_INTEGRITY_CASES, ) @@ -46,7 +47,7 @@ def _assert_online_verification_invariants( results: list[ContextResult], *, - timeout_seconds: float = 30.0, + timeout_seconds: float = DEFAULT_UPLOAD_WAIT_TIMEOUT_SECONDS, ) -> None: """Block until every recording in *results* has reached ``upload_complete`` in the platform DB. Must be called before cloud frame verification so diff --git a/tests/integration/platform/data_daemon/shared/assertions.py b/tests/integration/platform/data_daemon/shared/assertions.py index c1211ff78..2f108251f 100644 --- a/tests/integration/platform/data_daemon/shared/assertions.py +++ b/tests/integration/platform/data_daemon/shared/assertions.py @@ -46,7 +46,10 @@ import neuracore as nc from neuracore.core.data.recording import Recording -from neuracore.data_daemon.const import SOCKET_PATH +from neuracore.data_daemon.const import ( + DURATION_VARIATION_TOLERANCE_SECONDS, + SOCKET_PATH, +) from neuracore.data_daemon.helpers import get_daemon_pid_path from tests.integration.platform.data_daemon.shared.db_helpers import ( wait_for_dataset_ready, @@ -646,7 +649,7 @@ def _verify_recording_structure( # Duration bounds: variable mode allows 0.75–1.25× base; fixed mode uses exact # base duration. 
base_duration_s = float(result.duration_sec) - clock_tolerance_s = 2.5 + clock_tolerance_s = DURATION_VARIATION_TOLERANCE_SECONDS if case.context_duration_mode == DURATION_MODE_VARIABLE: min_duration_s = ( base_duration_s * DURATION_VARIABLE_MIN_FACTOR - clock_tolerance_s diff --git a/tests/integration/platform/data_daemon/shared/db_helpers.py b/tests/integration/platform/data_daemon/shared/db_helpers.py index 58131e06c..22a96fb95 100644 --- a/tests/integration/platform/data_daemon/shared/db_helpers.py +++ b/tests/integration/platform/data_daemon/shared/db_helpers.py @@ -33,6 +33,7 @@ import pytest import neuracore as nc +from neuracore.data_daemon.const import DEFAULT_UPLOAD_WAIT_TIMEOUT_SECONDS from neuracore.data_daemon.helpers import get_daemon_db_path from tests.integration.platform.data_daemon.shared.db_constants import ( COLUMN_EXPECTED_TRACE_COUNT, @@ -914,7 +915,7 @@ def assert_recording_db_statuses( def wait_for_upload_complete_in_db( recording_id: str, - timeout_s: float = 90.0, + timeout_s: float = DEFAULT_UPLOAD_WAIT_TIMEOUT_SECONDS, ) -> None: """Block until all known traces for a recording are uploaded per the daemon DB. 
diff --git a/tests/unit/core/test_data_stream.py b/tests/unit/core/test_data_stream.py index 87c332c1b..0f5e18bd2 100644 --- a/tests/unit/core/test_data_stream.py +++ b/tests/unit/core/test_data_stream.py @@ -28,10 +28,10 @@ def __init__( recording_id: str | None = None, chunk_size: int | None = None, send_queue_maxsize: int | None = None, - shared_memory_size: int | None = None, + max_frame_bytes: int | None = None, **_: object, ) -> None: - default_chunk_size, default_shared_memory_size, default_send_queue_maxsize = ( + default_chunk_size, default_max_frame_bytes, default_send_queue_maxsize = ( producer_transport_args_for_data_type(data_type) ) self.id = id @@ -43,16 +43,14 @@ def __init__( if send_queue_maxsize is None else send_queue_maxsize ) - self.init_shared_memory_size: int | None = None - self.default_shared_memory_size = ( - default_shared_memory_size - if shared_memory_size is None - else shared_memory_size + self.init_max_frame_bytes: int | None = None + self.default_max_frame_bytes = ( + default_max_frame_bytes if max_frame_bytes is None else max_frame_bytes ) - self.opened_shared_memory_sizes: list[int] = [] + self.opened_max_frame_bytes: list[int] = [] self.send_data_parts_calls: list[dict[str, object]] = [] - self.cleanup_wait_for_slot_drain_calls: list[bool] = [] - self.stop_wait_for_slot_drain_calls: list[bool] = [] + self.cleanup_wait_for_transport_drain_calls: list[bool] = [] + self.stop_wait_for_transport_drain_calls: list[bool] = [] self.trace_id = None _FakeProducerChannel.instances.append(self) @@ -60,24 +58,20 @@ def start_recording_session( self, *, recording_id: str | None = None, - shared_memory_size: int | None = None, + max_frame_bytes: int | None = None, ) -> None: if recording_id is not None: self.recording_id = recording_id self.trace_id = "trace-1" - self.opened_shared_memory_sizes.append( - self.default_shared_memory_size - if shared_memory_size is None - else shared_memory_size + self.opened_max_frame_bytes.append( + 
self.default_max_frame_bytes if max_frame_bytes is None else max_frame_bytes ) def initialize_new_producer_channel( - self, shared_memory_size: int | None = None + self, max_frame_bytes: int | None = None ) -> None: - self.init_shared_memory_size = ( - self.default_shared_memory_size - if shared_memory_size is None - else shared_memory_size + self.init_max_frame_bytes = ( + self.default_max_frame_bytes if max_frame_bytes is None else max_frame_bytes ) def set_recording_id(self, recording_id: str | None) -> None: @@ -89,16 +83,11 @@ def start_producer_channel(self) -> None: def stop_producer_channel( self, *, - wait_for_slot_drain: bool = True, + wait_for_transport_drain: bool = True, ) -> None: - self.stop_wait_for_slot_drain_calls.append(wait_for_slot_drain) + self.stop_wait_for_transport_drain_calls.append(wait_for_transport_drain) return - def open_fixed_shared_slots(self, slot_size: int | None = None) -> None: - self.opened_shared_memory_sizes.append( - self.default_shared_memory_size if slot_size is None else slot_size - ) - def start_new_trace(self) -> None: self.trace_id = "trace-1" @@ -106,10 +95,10 @@ def cleanup_producer_channel( self, *, stop_cutoff_sequence_number: int | None = None, - wait_for_slot_drain: bool = True, + wait_for_transport_drain: bool = True, ) -> None: del stop_cutoff_sequence_number - self.cleanup_wait_for_slot_drain_calls.append(wait_for_slot_drain) + self.cleanup_wait_for_transport_drain_calls.append(wait_for_transport_drain) return def send_data_parts(self, **kwargs: object) -> None: @@ -157,7 +146,7 @@ def test_rgb_stream_uses_video_specific_producer_settings(monkeypatch) -> None: assert producer.data_type == DataType.RGB_IMAGES assert producer.chunk_size == DEFAULT_VIDEO_CHUNK_SIZE assert producer.send_queue_maxsize == DEFAULT_VIDEO_SEND_QUEUE_MAXSIZE - assert producer.opened_shared_memory_sizes == [DEFAULT_VIDEO_SLOT_SIZE] + assert producer.opened_max_frame_bytes == [DEFAULT_VIDEO_SLOT_SIZE] def 
test_rgb_stream_sends_frame_as_multipart_payload(monkeypatch) -> None: @@ -201,7 +190,7 @@ def test_rgb_stream_sends_frame_as_multipart_payload(monkeypatch) -> None: assert total_bytes == len(header) + len(metadata_json) + frame.nbytes -def test_stream_stop_recording_wait_false_skips_slot_drain(monkeypatch) -> None: +def test_stream_stop_recording_wait_false_skips_transport_drain(monkeypatch) -> None: _FakeProducerChannel.instances.clear() monkeypatch.setattr( "neuracore.core.streaming.data_stream.ProducerChannel", @@ -217,8 +206,8 @@ def test_stream_stop_recording_wait_false_skips_slot_drain(monkeypatch) -> None: wait_for_producer_drain=False, ) - assert producer.cleanup_wait_for_slot_drain_calls == [False] - assert producer.stop_wait_for_slot_drain_calls == [False] + assert producer.cleanup_wait_for_transport_drain_calls == [False] + assert producer.stop_wait_for_transport_drain_calls == [False] assert stream.get_recording_context() is None assert stream.get_producer_channel() is None assert stream.is_recording() is False diff --git a/tests/unit/data_daemon/communications_management/test_iox2_daemon_drain.py b/tests/unit/data_daemon/communications_management/test_iox2_daemon_drain.py new file mode 100644 index 000000000..d66953f93 --- /dev/null +++ b/tests/unit/data_daemon/communications_management/test_iox2_daemon_drain.py @@ -0,0 +1,107 @@ +"""Unit tests for the iceoryx2 daemon-side subscriber drain pool.""" + +from __future__ import annotations + +import uuid + +from neuracore.data_daemon.communications_management.shared_transport.iox2_daemon_drain import ( # noqa: E501 + Iox2DaemonDrain, +) +from neuracore.data_daemon.communications_management.shared_transport.iox2_video_transport import ( # noqa: E501 + Iox2VideoTransport, +) + + +def _channel_id() -> str: + return f"test-{uuid.uuid4().hex[:12]}" + + +def test_register_and_drain() -> None: + """A registered channel drains frames sent by its publisher.""" + channel_id = _channel_id() + transport = 
Iox2VideoTransport(channel_id) + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_id) + transport.update_connections() + transport.send_frame({"trace_id": "t", "chunk_index": 0}, b"frame-0") + + seen: list[tuple[str, int, dict, bytes]] = [] + count = drain.drain_all(lambda *args: seen.append(args)) + assert count == 1 + assert seen[0][0] == channel_id + assert seen[0][3] == b"frame-0" + finally: + drain.close() + transport.close() + + +def test_drain_multiple_channels() -> None: + """Two channels are drained independently.""" + channel_a = _channel_id() + channel_b = _channel_id() + transport_a = Iox2VideoTransport(channel_a) + transport_b = Iox2VideoTransport(channel_b) + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_a) + drain.register_channel(channel_b) + transport_a.update_connections() + transport_b.update_connections() + transport_a.send_frame({"trace_id": "a"}, b"aaa") + transport_b.send_frame({"trace_id": "b"}, b"bbb") + + by_channel: dict[str, bytes] = {} + drain.drain_all(lambda ch, seq, meta, chunk: by_channel.__setitem__(ch, chunk)) + assert by_channel == {channel_a: b"aaa", channel_b: b"bbb"} + finally: + drain.close() + transport_a.close() + transport_b.close() + + +def test_unregister_channel_stops_draining() -> None: + """After unregister, frames on that channel are not drained.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_id) + assert drain.is_registered(channel_id) + drain.unregister_channel(channel_id) + assert not drain.is_registered(channel_id) + + transport.send_frame({"trace_id": "t"}, b"x") + count = drain.drain_all(lambda *args: None) + assert count == 0 + finally: + drain.close() + transport.close() + + +def test_drain_empty_returns_zero() -> None: + """Draining with no frames returns 0 and does not error.""" + channel_id = _channel_id() + drain = Iox2DaemonDrain() + try: + 
drain.register_channel(channel_id) + assert drain.drain_all(lambda *args: None) == 0 + finally: + drain.close() + + +def test_drop_counter_detects_frame_index_gaps() -> None: + """Gaps in the per-channel frame index are counted as drops.""" + channel_id = _channel_id() + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_id) + channel_sub = drain._subscribers[channel_id] + channel_sub.note_frame_index(0) + channel_sub.note_frame_index(1) + # Frames 2 and 3 were dropped under overload. + newly_dropped = channel_sub.note_frame_index(4) + assert newly_dropped == 2 + assert drain.dropped_frame_count(channel_id) == 2 + finally: + drain.close() diff --git a/tests/unit/data_daemon/communications_management/test_iox2_video_transport.py b/tests/unit/data_daemon/communications_management/test_iox2_video_transport.py new file mode 100644 index 000000000..1e25a719f --- /dev/null +++ b/tests/unit/data_daemon/communications_management/test_iox2_video_transport.py @@ -0,0 +1,147 @@ +"""Unit tests for the iceoryx2 producer-side video transport.""" + +from __future__ import annotations + +import uuid + +import pytest + +from neuracore.data_daemon.communications_management.shared_transport.framing import ( + PacketTooLarge, + parse_video_transport_packet, +) +from neuracore.data_daemon.communications_management.shared_transport.iox2_daemon_drain import ( # noqa: E501 + Iox2DaemonDrain, +) +from neuracore.data_daemon.communications_management.shared_transport.iox2_video_transport import ( # noqa: E501 + FRAME_INDEX_KEY, + FRAME_META_KEY, + FRAME_SEQUENCE_KEY, + Iox2VideoTransport, +) + + +def _channel_id() -> str: + return f"test-{uuid.uuid4().hex[:12]}" + + +def test_send_frame_round_trip() -> None: + """A published frame is received and parses back to the original payload.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_id) + transport.update_connections() + + metadata = 
{"trace_id": "t1", "chunk_index": 0, "total_chunks": 1} + chunk = b"hello-iceoryx2-payload" + seq = transport.send_frame(metadata, chunk) + assert seq is not None + + received: list[tuple[str, int, dict, bytes]] = [] + drain.drain_all(lambda *args: received.append(args)) + + assert len(received) == 1 + got_channel, got_seq, got_meta, got_chunk = received[0] + assert got_channel == channel_id + assert got_seq == seq + assert got_meta == metadata + assert got_chunk == chunk + finally: + drain.close() + transport.close() + + +def test_send_frame_too_large() -> None: + """An oversized frame raises rather than being silently discarded.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id, max_frame_bytes=128) + try: + with pytest.raises(PacketTooLarge): + transport.send_frame({"trace_id": "t"}, b"x" * 256) + finally: + transport.close() + + +def test_send_frame_no_subscriber_is_not_an_error() -> None: + """Publishing with no daemon subscriber succeeds (frame just goes nowhere).""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + try: + seq = transport.send_frame({"trace_id": "t", "chunk_index": 0}, b"data") + assert seq is not None + assert transport.is_healthy() + finally: + transport.close() + + +def test_send_frame_respects_stop_cutoff() -> None: + """Frames past the stop cutoff are rejected and return None.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + try: + first = transport.send_frame({"trace_id": "t"}, b"a") + assert first is not None + # The next reserved sequence will be first + 1, which is past the cutoff. 
+ rejected = transport.send_frame( + {"trace_id": "t"}, b"b", stop_cutoff_sequence_number=first + ) + assert rejected is None + finally: + transport.close() + + +def test_finish_recording_session_is_noop_for_service() -> None: + """finish_recording_session keeps the publisher/service usable.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + try: + transport.send_frame({"trace_id": "t"}, b"a") + transport.finish_recording_session() + assert transport.is_healthy() + assert transport.send_frame({"trace_id": "t2"}, b"b") is not None + finally: + transport.close() + + +def test_envelope_carries_sequence_and_index() -> None: + """Each frame embeds its sequence id and a monotonic frame index.""" + channel_id = _channel_id() + transport = Iox2VideoTransport(channel_id) + drain = Iox2DaemonDrain() + try: + drain.register_channel(channel_id) + transport.update_connections() + transport.send_frame({"trace_id": "t", "chunk_index": 0}, b"a") + transport.send_frame({"trace_id": "t", "chunk_index": 1}, b"b") + + indices: list[int] = [] + + def collect(channel: str, seq: int, meta: dict, chunk: bytes) -> None: + indices.append(seq) + + drain.drain_all(collect) + assert indices == sorted(indices) + assert len(indices) == 2 + finally: + drain.close() + transport.close() + + +def test_raw_packet_envelope_structure() -> None: + """The on-wire packet wraps metadata with seq/idx/meta keys.""" + from neuracore.data_daemon.communications_management.shared_transport.framing import ( # noqa: E501 + build_video_transport_packet, + ) + from neuracore.data_daemon.communications_management.shared_transport.iox2_video_transport import ( # noqa: E501 + build_frame_envelope, + ) + + envelope = build_frame_envelope(7, 3, {"trace_id": "t", "chunk_index": 0}) + packet = build_video_transport_packet(envelope, b"payload") + parsed, chunk = parse_video_transport_packet(packet) + assert parsed[FRAME_SEQUENCE_KEY] == 7 + assert parsed[FRAME_INDEX_KEY] == 3 + assert 
parsed[FRAME_META_KEY] == {"trace_id": "t", "chunk_index": 0} + assert chunk == b"payload" diff --git a/tests/unit/data_daemon/communications_management/test_producer_channel.py b/tests/unit/data_daemon/communications_management/test_producer_channel.py index dc42159ef..ab2a8d96b 100644 --- a/tests/unit/data_daemon/communications_management/test_producer_channel.py +++ b/tests/unit/data_daemon/communications_management/test_producer_channel.py @@ -1,184 +1,128 @@ +import threading + import pytest from neuracore.data_daemon.communications_management.producer.producer_channel import ( ProducerChannel, ) -from neuracore.data_daemon.communications_management.shared_transport.registry import ( - SharedSlotUnhealthyError, -) -from neuracore.data_daemon.models import DataType +from neuracore.data_daemon.models import DataType, TraceTransportMetadata -class _FakeSharedSlotTransport: - def __init__(self) -> None: - self.wait_until_payload_handed_off_calls: list[float] = [] - self.wait_until_drained_calls: list[float] = [] - self.finish_recording_session_calls = 0 - self.last_payload_sequence_number = 0 - - def get_last_payload_sequence_number(self) -> int: - return self.last_payload_sequence_number - - def wait_until_payload_handed_off( - self, - timeout_s: float = 30.0, - max_sequence_number: int | None = None, +class _FakeIox2Transport: + def __init__( + self, *, healthy: bool = True, seqs: list[int | None] | None = None ) -> None: - self.wait_until_payload_handed_off_calls.append(timeout_s) - self.last_payload_sequence_number = ( - 0 if max_sequence_number is None else max_sequence_number - ) + self.healthy = healthy + self.sent: list[tuple[dict, bytes]] = [] + self._seqs = seqs + self.finish_recording_session_calls = 0 + self.close_calls = 0 - def wait_until_drained( + def send_frame( self, - timeout_s: float = 30.0, - max_sequence_number: int | None = None, - ) -> None: - self.wait_until_drained_calls.append(timeout_s) - self.last_payload_sequence_number = ( - 0 if 
max_sequence_number is None else max_sequence_number - ) + metadata: dict, + chunk: bytes, + stop_cutoff_sequence_number: int | None = None, + ) -> int | None: + self.sent.append((metadata, bytes(chunk))) + if self._seqs is not None: + return self._seqs.pop(0) + return len(self.sent) + + def is_healthy(self) -> bool: + return self.healthy def finish_recording_session(self) -> None: self.finish_recording_session_calls += 1 + def close(self) -> None: + self.close_calls += 1 + -def test_cleanup_producer_channel_wait_false_still_drains_shared_slots() -> None: +def test_cleanup_producer_channel_finishes_recording_session() -> None: channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() + transport = _FakeIox2Transport() wait_calls: list[int] = [] end_trace_calls: list[str] = [] - channel._shared_slot_transport = transport - channel.get_last_enqueued_sequence_number = lambda: 41 + channel._iox2_transport = transport + # cleanup must flush the ZMQ sender's own last-enqueued sequence, NOT the + # global stop cutoff (which is an iceoryx2 frame sequence for video channels + # and would never be reported as sent over ZMQ). 
+ channel.get_last_enqueued_sequence_number = lambda: 12 channel.wait_until_sequence_sent = ( lambda sequence_number: wait_calls.append(sequence_number) or True ) channel.end_trace = lambda: end_trace_calls.append("end") - transport.last_payload_sequence_number = 41 ProducerChannel.cleanup_producer_channel( channel, stop_cutoff_sequence_number=41, - wait_for_slot_drain=False, + wait_for_transport_drain=True, ) - assert transport.wait_until_payload_handed_off_calls == [30.0] - assert transport.wait_until_drained_calls == [30.0] - assert transport.finish_recording_session_calls == 1 + assert wait_calls == [12] assert end_trace_calls == ["end"] - assert wait_calls == [41] - - -def test_cleanup_producer_channel_wait_true_drains_shared_slots() -> None: - channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() - wait_calls: list[int] = [] - channel._shared_slot_transport = transport - channel.get_last_enqueued_sequence_number = lambda: 99 - channel.wait_until_sequence_sent = ( - lambda sequence_number: wait_calls.append(sequence_number) or True - ) - channel.end_trace = lambda: None - transport.last_payload_sequence_number = 99 - - ProducerChannel.cleanup_producer_channel( - channel, - stop_cutoff_sequence_number=99, - wait_for_slot_drain=True, - ) - - assert transport.wait_until_payload_handed_off_calls == [30.0] - assert transport.wait_until_drained_calls == [30.0] assert transport.finish_recording_session_calls == 1 - assert wait_calls == [99] -def test_cleanup_producer_channel_raises_when_descriptor_cutoff_not_sent() -> None: +def test_cleanup_producer_channel_raises_when_flush_not_sent() -> None: channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() + transport = _FakeIox2Transport() end_trace_calls: list[str] = [] - channel._shared_slot_transport = transport + channel._iox2_transport = transport channel.get_last_enqueued_sequence_number = lambda: 44 channel.wait_until_sequence_sent = lambda sequence_number: 
False channel.end_trace = lambda: end_trace_calls.append("end") - transport.last_payload_sequence_number = 44 with pytest.raises( RuntimeError, - match="Failed to send queued recording data up to stop cutoff before cleanup", + match="Failed to send queued recording data before cleanup", ): ProducerChannel.cleanup_producer_channel( channel, stop_cutoff_sequence_number=44, - wait_for_slot_drain=True, + wait_for_transport_drain=True, ) - assert transport.wait_until_payload_handed_off_calls == [30.0] - assert transport.wait_until_drained_calls == [] - assert transport.finish_recording_session_calls == 0 assert end_trace_calls == [] + assert transport.finish_recording_session_calls == 0 -def test_stop_producer_channel_wait_false_still_drains_shared_slots() -> None: +def test_stop_producer_channel_closes_resources_in_order() -> None: channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() wait_calls: list[int] = [] close_calls: list[str] = [] - channel._shared_slot_transport = transport channel._stop_heartbeat_service = lambda: close_calls.append("heartbeat") channel.get_last_enqueued_sequence_number = lambda: 12 channel.wait_until_sequence_sent = ( lambda sequence_number: wait_calls.append(sequence_number) or True ) - channel._close_shared_slot_transport = lambda: close_calls.append("transport") + channel._close_iox2_transport = lambda: close_calls.append("transport") channel._stop_message_sender = lambda: close_calls.append("sender") channel._comm = type( "_Comm", (), {"cleanup_producer": lambda self: close_calls.append("comm")} )() - ProducerChannel.stop_producer_channel(channel, wait_for_slot_drain=False) + ProducerChannel.stop_producer_channel(channel) assert wait_calls == [12] - assert transport.wait_until_drained_calls == [30.0] assert close_calls == ["heartbeat", "transport", "sender", "comm"] -def test_stop_producer_channel_wait_true_drains_shared_slots() -> None: - channel = object.__new__(ProducerChannel) - transport = 
_FakeSharedSlotTransport() - wait_calls: list[int] = [] - - channel._shared_slot_transport = transport - channel._stop_heartbeat_service = lambda: None - channel.get_last_enqueued_sequence_number = lambda: 13 - channel.wait_until_sequence_sent = ( - lambda sequence_number: wait_calls.append(sequence_number) or True - ) - channel._close_shared_slot_transport = lambda: None - channel._stop_message_sender = lambda: None - channel._comm = type("_Comm", (), {"cleanup_producer": lambda self: None})() - - ProducerChannel.stop_producer_channel(channel, wait_for_slot_drain=True) - - assert wait_calls == [13] - assert transport.wait_until_drained_calls == [30.0] - - def test_stop_producer_channel_raises_when_cutoff_not_sent_and_still_cleans_up() -> ( None ): channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() close_calls: list[str] = [] - channel._shared_slot_transport = transport channel._stop_heartbeat_service = lambda: close_calls.append("heartbeat") channel.get_last_enqueued_sequence_number = lambda: 21 channel.wait_until_sequence_sent = lambda sequence_number: False - channel._close_shared_slot_transport = lambda: close_calls.append("transport") + channel._get_message_sender_error = lambda: None + channel._close_iox2_transport = lambda: close_calls.append("transport") channel._stop_message_sender = lambda: close_calls.append("sender") channel._comm = type( "_Comm", (), {"cleanup_producer": lambda self: close_calls.append("comm")} @@ -188,35 +132,27 @@ def test_stop_producer_channel_raises_when_cutoff_not_sent_and_still_cleans_up() RuntimeError, match="Failed to send all enqueued messages before stopping producer channel", ): - ProducerChannel.stop_producer_channel(channel, wait_for_slot_drain=True) + ProducerChannel.stop_producer_channel(channel) - assert transport.wait_until_drained_calls == [] assert close_calls == ["heartbeat", "transport", "sender", "comm"] def test_stop_producer_channel_swallows_cutoff_failure_after_sender_error() -> 
None: channel = object.__new__(ProducerChannel) - transport = _FakeSharedSlotTransport() close_calls: list[str] = [] - channel._shared_slot_transport = transport channel._stop_heartbeat_service = lambda: close_calls.append("heartbeat") channel.get_last_enqueued_sequence_number = lambda: 22 channel.wait_until_sequence_sent = lambda sequence_number: False - channel._close_shared_slot_transport = lambda: close_calls.append("transport") + channel._get_message_sender_error = lambda: RuntimeError("boom") + channel._close_iox2_transport = lambda: close_calls.append("transport") channel._stop_message_sender = lambda: close_calls.append("sender") channel._comm = type( "_Comm", (), {"cleanup_producer": lambda self: close_calls.append("comm")} )() - channel._message_sender = type( - "_Sender", - (), - {"get_error": lambda self: RuntimeError("boom")}, - )() - ProducerChannel.stop_producer_channel(channel, wait_for_slot_drain=True) + ProducerChannel.stop_producer_channel(channel) - assert transport.wait_until_drained_calls == [] assert close_calls == ["heartbeat", "transport", "sender", "comm"] @@ -260,99 +196,78 @@ def test_end_trace_keeps_trace_state_when_trace_end_not_sent() -> None: assert channel.recording_id == "recording-1" -def _shared_slot_channel_for_send_tests() -> ProducerChannel: +def _video_channel_for_send_tests( + transport: _FakeIox2Transport, +) -> ProducerChannel: channel = object.__new__(ProducerChannel) channel._stop_cutoff_sequence_number = None channel.trace_id = "trace-1" channel.recording_id = "recording-1" channel.chunk_size = 1024 - channel._use_shared_slot_transport = True + channel._use_video_transport = True + channel._iox2_transport = transport + channel._recording_send_lock = threading.RLock() return channel -def test_send_data_parts_resets_shared_slot_transport_and_retries_once() -> None: - channel = _shared_slot_channel_for_send_tests() - send_calls: list[int] = [] - reset_calls: list[str] = [] - - def fake_send(*_args, **_kwargs) -> None: - 
send_calls.append(len(send_calls)) - if len(send_calls) == 1: - raise SharedSlotUnhealthyError("stale shared slot") - - channel._send_data_parts_shared_slots = fake_send - channel._ping_daemon_for_shared_slot_recovery = lambda timeout_s=2.0: True - channel._reset_shared_slot_transport_for_recovery = lambda: reset_calls.append( - "reset" - ) - - ProducerChannel.send_data_parts( - channel, - (b"frame",), +def _trace_metadata() -> TraceTransportMetadata: + return TraceTransportMetadata( + recording_id="recording-1", data_type=DataType.RGB_IMAGES, - robot_instance=0, data_type_name="camera", robot_name="robot", dataset_name="dataset", + robot_instance=0, ) - assert send_calls == [0, 1] - assert reset_calls == ["reset"] +def test_send_data_parts_iox2_publishes_each_chunk() -> None: + transport = _FakeIox2Transport() + channel = _video_channel_for_send_tests(transport) -def _raise_stale_shared_slot(*_args, **_kwargs) -> None: - raise SharedSlotUnhealthyError("stale shared slot") + ProducerChannel._send_data_parts_iox2( + channel, + [memoryview(b"frame-bytes")], + total_chunks=1, + trace_metadata=_trace_metadata(), + ) + assert len(transport.sent) == 1 + metadata, chunk = transport.sent[0] + assert chunk == b"frame-bytes" + assert metadata["trace_id"] == "trace-1" -def test_send_data_parts_stops_shared_slot_logging_when_ping_fails() -> None: - channel = _shared_slot_channel_for_send_tests() - stop_calls: list[str] = [] - reset_calls: list[str] = [] - channel._send_data_parts_shared_slots = _raise_stale_shared_slot - channel._ping_daemon_for_shared_slot_recovery = lambda timeout_s=2.0: False - channel._reset_shared_slot_transport_for_recovery = lambda: reset_calls.append( - "reset" - ) - channel._stop_shared_slot_logging_after_failure = lambda: stop_calls.append("stop") +def test_send_data_parts_iox2_stops_logging_when_transport_unhealthy() -> None: + transport = _FakeIox2Transport(healthy=False, seqs=[None]) + channel = _video_channel_for_send_tests(transport) + 
stop_calls: list[str] = [] + channel._stop_video_logging_after_failure = lambda: stop_calls.append("stop") - with pytest.raises(RuntimeError, match="daemon did not respond"): - ProducerChannel.send_data_parts( + with pytest.raises(RuntimeError, match="became unhealthy"): + ProducerChannel._send_data_parts_iox2( channel, - (b"frame",), - data_type=DataType.RGB_IMAGES, - robot_instance=0, - data_type_name="camera", - robot_name="robot", - dataset_name="dataset", + [memoryview(b"frame-bytes")], + total_chunks=1, + trace_metadata=_trace_metadata(), ) assert stop_calls == ["stop"] - assert reset_calls == [] -def test_send_data_parts_stops_shared_slot_logging_when_retry_fails() -> None: - channel = _shared_slot_channel_for_send_tests() +def test_send_data_parts_iox2_stops_on_cutoff_without_failure() -> None: + # send_frame returns None but the transport is healthy => stop cutoff hit; + # this must not be treated as a failure. + transport = _FakeIox2Transport(healthy=True, seqs=[None]) + channel = _video_channel_for_send_tests(transport) stop_calls: list[str] = [] - reset_calls: list[str] = [] + channel._stop_video_logging_after_failure = lambda: stop_calls.append("stop") - channel._send_data_parts_shared_slots = _raise_stale_shared_slot - channel._ping_daemon_for_shared_slot_recovery = lambda timeout_s=2.0: True - channel._reset_shared_slot_transport_for_recovery = lambda: reset_calls.append( - "reset" + ProducerChannel._send_data_parts_iox2( + channel, + [memoryview(b"frame-bytes")], + total_chunks=1, + trace_metadata=_trace_metadata(), ) - channel._stop_shared_slot_logging_after_failure = lambda: stop_calls.append("stop") - with pytest.raises(RuntimeError, match="remained unhealthy"): - ProducerChannel.send_data_parts( - channel, - (b"frame",), - data_type=DataType.RGB_IMAGES, - robot_instance=0, - data_type_name="camera", - robot_name="robot", - dataset_name="dataset", - ) - - assert reset_calls == ["reset"] - assert stop_calls == ["stop"] + assert stop_calls == [] 
diff --git a/tests/unit/data_daemon/communications_management/test_shared_slot_daemon_handler.py b/tests/unit/data_daemon/communications_management/test_shared_slot_daemon_handler.py deleted file mode 100644 index 367ef6b7c..000000000 --- a/tests/unit/data_daemon/communications_management/test_shared_slot_daemon_handler.py +++ /dev/null @@ -1,207 +0,0 @@ -import threading -import time -from collections import namedtuple - -import pytest - -from neuracore.data_daemon.communications_management.consumer import ( - models as consumer_models, -) -from neuracore.data_daemon.communications_management.shared_transport import ( - shared_memory_budget as shared_memory_budget_module, -) -from neuracore.data_daemon.communications_management.shared_transport import ( - shared_slot_daemon_handler as shared_slot_daemon_handler_module, -) - -SharedMemoryBudget = shared_memory_budget_module.SharedMemoryBudget -SharedSlotDaemonHandler = shared_slot_daemon_handler_module.SharedSlotDaemonHandler - - -def test_shared_memory_budget_caps_slot_count_to_remaining_budget( - monkeypatch, -) -> None: - budget = SharedMemoryBudget() - usage = namedtuple("usage", ["total", "used", "free"])( - 128 * 1024**2, - 88 * 1024**2, - 40 * 1024**2, - ) - slot_size = 31 * 1024**2 - - monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport" - ".shared_memory_budget.shutil.disk_usage", - lambda _path: usage, - ) - - reservation = budget.reserve( - shm_name="test-shm", - slot_size=slot_size, - requested_slot_count=4, - ) - - assert reservation.slot_count == 3 - assert reservation.allocated_bytes == slot_size * 3 - - -def test_handle_descriptor_returns_slot_credit_when_spool_fails(monkeypatch) -> None: - handler = SharedSlotDaemonHandler(comm=object()) # type: ignore[arg-type] - channel = consumer_models.ChannelState(producer_id="producer-1") - channel.mark_shared_slot_transport_open( - control_endpoint="ipc://credits", - shm_name="existing-shm", - ) - descriptor_payload = { - 
"shm_name": "descriptor-shm", - "slot_id": 7, - "offset": 0, - "length": 128, - "sequence_id": 42, - "slot_size": 256, - } - returned_credits: list[tuple[str, int, int]] = [] - - def fake_spool(*_args, **_kwargs): - raise RuntimeError("spool failed") - - def fake_return_credit(channel_arg, descriptor_arg) -> None: - returned_credits.append(( - channel_arg.producer_id, - descriptor_arg.slot_id, - descriptor_arg.sequence_id, - )) - - monkeypatch.setattr(handler, "_spool_shared_slot_packet", fake_spool) - monkeypatch.setattr(handler, "_send_slot_credit_return", fake_return_credit) - - with pytest.raises(RuntimeError, match="spool failed"): - handler.handle_descriptor(channel, descriptor_payload, chunk_spool=object()) - - assert returned_credits == [("producer-1", 7, 42)] - assert channel.shared_slot.shm_name == "existing-shm" - - -def test_handle_descriptor_preserves_spool_error_when_credit_return_fails( - monkeypatch, -) -> None: - handler = SharedSlotDaemonHandler(comm=object()) # type: ignore[arg-type] - channel = consumer_models.ChannelState(producer_id="producer-1") - channel.mark_shared_slot_transport_open( - control_endpoint="ipc://credits", - shm_name="existing-shm", - ) - descriptor_payload = { - "shm_name": "descriptor-shm", - "slot_id": 7, - "offset": 0, - "length": 128, - "sequence_id": 42, - "slot_size": 256, - } - - def fake_spool(*_args, **_kwargs): - raise RuntimeError("spool failed") - - def fake_return_credit(*_args, **_kwargs) -> None: - raise RuntimeError("credit failed") - - monkeypatch.setattr(handler, "_spool_shared_slot_packet", fake_spool) - monkeypatch.setattr(handler, "_send_slot_credit_return", fake_return_credit) - - with pytest.raises(RuntimeError, match="spool failed"): - handler.handle_descriptor(channel, descriptor_payload, chunk_spool=object()) - - -def _descriptor_payload(shm_name: str, sequence_id: int = 42) -> dict[str, int | str]: - return { - "shm_name": shm_name, - "slot_id": 1, - "offset": 0, - "length": 16, - "sequence_id": 
sequence_id, - "slot_size": 256, - } - - -def _open_payload(endpoint: str = "ipc://new-credits") -> dict[str, int | str]: - return { - "control_endpoint": endpoint, - "slot_size": 256, - "slot_count": 1, - } - - -def test_handle_open_waits_for_previous_shared_slot_descriptors(monkeypatch) -> None: - handler = SharedSlotDaemonHandler( - comm=object(), # type: ignore[arg-type] - reopen_drain_timeout_s=0.5, - ) - channel = consumer_models.ChannelState(producer_id="producer-1") - channel.mark_shared_slot_transport_open( - control_endpoint="ipc://old-credits", - shm_name="old-shm", - ) - descriptor = handler.mark_descriptor_pending( - channel, _descriptor_payload("old-shm") - ) - abandoned: list[int] = [] - - monkeypatch.setattr(handler, "_send_ready_message", lambda **_kwargs: None) - - def complete_old_descriptor() -> None: - time.sleep(0.05) - handler.mark_descriptor_completed(channel.producer_id, descriptor) - - thread = threading.Thread(target=complete_old_descriptor) - thread.start() - try: - handler.handle_open( - channel, - _open_payload(), - on_abandoned_sequences=lambda _producer_id, seqs: abandoned.extend(seqs), - ) - finally: - thread.join(timeout=1.0) - handler.close() - - assert abandoned == [] - assert channel.shared_slot.shm_name != "old-shm" - - -def test_handle_open_abandons_stalled_previous_shared_slot_descriptors( - monkeypatch, -) -> None: - handler = SharedSlotDaemonHandler( - comm=object(), # type: ignore[arg-type] - reopen_drain_timeout_s=0.01, - ) - channel = consumer_models.ChannelState(producer_id="producer-1") - channel.mark_shared_slot_transport_open( - control_endpoint="ipc://old-credits", - shm_name="old-shm", - ) - handler.mark_descriptor_pending(channel, _descriptor_payload("old-shm")) - abandoned: list[int] = [] - - monkeypatch.setattr(handler, "_send_ready_message", lambda **_kwargs: None) - - try: - handler.handle_open( - channel, - _open_payload(), - on_abandoned_sequences=lambda _producer_id, seqs: abandoned.extend(seqs), - ) - - 
assert abandoned == [42] - assert channel.shared_slot.shm_name != "old-shm" - with pytest.raises( - shared_slot_daemon_handler_module.SharedSlotDescriptorAbandoned - ): - handler.handle_descriptor( - channel, - _descriptor_payload("old-shm"), - chunk_spool=object(), - ) - finally: - handler.close() diff --git a/tests/unit/data_daemon/communications_management/test_spool_worker.py b/tests/unit/data_daemon/communications_management/test_spool_worker.py index eb08f8ff8..64aa778b6 100644 --- a/tests/unit/data_daemon/communications_management/test_spool_worker.py +++ b/tests/unit/data_daemon/communications_management/test_spool_worker.py @@ -1,28 +1,29 @@ import time -from collections.abc import Callable from unittest.mock import Mock import pytest -from neuracore.data_daemon.communications_management.consumer.models import ChannelState +from neuracore.data_daemon.communications_management.consumer.models import ( + ChannelState, + DecodedFrameWork, +) from neuracore.data_daemon.communications_management.consumer.spool_worker import ( _SpoolShard, ) def _build_shard( - *, handle_descriptor: Callable[..., object] | Mock | None = None + *, + chunk_spool: Mock | None = None, + should_drop_recording_data=lambda _: False, ) -> _SpoolShard: - shared_slot_handler = Mock() - shared_slot_handler.handle_descriptor = handle_descriptor or Mock() completion_worker = Mock() return _SpoolShard( - chunk_spool=Mock(), - shared_slot_handler=shared_slot_handler, + chunk_spool=chunk_spool or Mock(), completion_worker=completion_worker, acquire_spool_admission=lambda: None, release_spool_admission=lambda: None, - should_drop_recording_data=lambda _: False, + should_drop_recording_data=should_drop_recording_data, mark_sequence_completed=lambda _: None, register_trace=lambda *_: None, register_trace_metadata=lambda *_: None, @@ -32,6 +33,15 @@ def _build_shard( ) +def _frame_work(channel: ChannelState, metadata: dict) -> DecodedFrameWork: + return DecodedFrameWork( + channel=channel, + 
sequence_id=1, + metadata=metadata, + chunk=b"chunk", + ) + + def test_enqueue_raises_when_shard_thread_is_not_running() -> None: shard = _build_shard() channel = ChannelState(producer_id="producer-1") @@ -39,16 +49,22 @@ def test_enqueue_raises_when_shard_thread_is_not_running() -> None: shard.close() with pytest.raises(RuntimeError, match="Daemon spool shard is not running"): - shard.enqueue(channel, {}) + shard.enqueue(_frame_work(channel, {"trace_id": "t"})) def test_enqueue_raises_wrapped_error_after_worker_failure() -> None: - shard = _build_shard( - handle_descriptor=Mock(side_effect=RuntimeError("boom")), - ) + # chunk_spool.append raises so the worker loop records the error and stops. + chunk_spool = Mock() + chunk_spool.append = Mock(side_effect=RuntimeError("boom")) + shard = _build_shard(chunk_spool=chunk_spool) channel = ChannelState(producer_id="producer-1") - shard.enqueue(channel, {}) + shard.enqueue( + _frame_work( + channel, + {"trace_id": "t", "chunk_index": 0, "total_chunks": 1}, + ) + ) deadline = time.monotonic() + 1.0 while shard._thread.is_alive() and time.monotonic() < deadline: @@ -57,7 +73,7 @@ def test_enqueue_raises_wrapped_error_after_worker_failure() -> None: assert not shard._thread.is_alive() with pytest.raises(RuntimeError, match="Daemon spool shard failed") as excinfo: - shard.enqueue(channel, {}) + shard.enqueue(_frame_work(channel, {"trace_id": "t"})) assert isinstance(excinfo.value.__cause__, RuntimeError) assert str(excinfo.value.__cause__) == "boom" diff --git a/tests/unit/data_daemon/lifecycle/test_runtime_recovery.py b/tests/unit/data_daemon/lifecycle/test_runtime_recovery.py index 4650029af..89860ecdf 100644 --- a/tests/unit/data_daemon/lifecycle/test_runtime_recovery.py +++ b/tests/unit/data_daemon/lifecycle/test_runtime_recovery.py @@ -8,10 +8,7 @@ import pytest -from neuracore.data_daemon.const import ( - DEFAULT_SHARED_MEMORY_SIZE, - SHARED_SLOT_SHM_PREFIX, -) +from neuracore.data_daemon.const import 
DEFAULT_TRANSPORT_BUFFER_SIZE from neuracore.data_daemon.lifecycle import runtime_recovery from neuracore.data_daemon.lifecycle.daemon_os_control import ( acquire_pid_file, @@ -21,8 +18,6 @@ from neuracore.data_daemon.lifecycle.runtime_recovery import ( SharedMemoryCapacityError, cleanup_socket_files, - cleanup_stale_shared_slot_control_sockets, - cleanup_stale_shared_slot_segments, ensure_shared_memory_capacity, reconcile_state_with_filesystem, shared_memory_required_bytes, @@ -93,60 +88,6 @@ def test_validate_or_recover_sqlite_rotates_corrupt_db(tmp_path: Path) -> None: assert any(path.name.startswith("state.db.corrupt-") for path in tmp_path.iterdir()) -def test_cleanup_stale_shared_slot_segments_removes_stale_shared_slot_segments( - tmp_path: Path, -) -> None: - shm_dir = tmp_path / "dev-shm" - shm_dir.mkdir() - - stale_names = ( - f"{SHARED_SLOT_SHM_PREFIX}stale-1", - f"{SHARED_SLOT_SHM_PREFIX}stale-2", - ) - for buffer_name in stale_names: - (shm_dir / buffer_name).write_bytes(b"shm") - - live_name = "neuracore-keep-live" - (shm_dir / live_name).write_bytes(b"shm") - - cleaned = cleanup_stale_shared_slot_segments(shm_dir=shm_dir) - - assert cleaned == len(stale_names) - for stale_name in stale_names: - assert not (shm_dir / stale_name).exists() - assert (shm_dir / live_name).exists() - - -def test_cleanup_stale_shared_slot_control_sockets_removes_ack_sockets( - tmp_path: Path, -) -> None: - ack_dir = tmp_path / "slot_acks" - ack_dir.mkdir() - - stale_names = ( - "slot_control_123_first.ipc", - "slot_control_456_second.ipc", - ) - for socket_name in stale_names: - (ack_dir / socket_name).write_text("stale", encoding="utf-8") - - keep_names = ( - "management.sock", - "slot_control_789_missing_suffix", - "other_control_123.ipc", - ) - for socket_name in keep_names: - (ack_dir / socket_name).write_text("keep", encoding="utf-8") - - cleaned = cleanup_stale_shared_slot_control_sockets(ack_dir=ack_dir) - - assert cleaned == len(stale_names) - for stale_name in 
stale_names: - assert not (ack_dir / stale_name).exists() - for keep_name in keep_names: - assert (ack_dir / keep_name).exists() - - def test_ensure_shared_memory_capacity_raises_when_tmpfs_is_full( tmp_path: Path, monkeypatch, @@ -165,8 +106,8 @@ def test_ensure_shared_memory_capacity_raises_when_tmpfs_is_full( def test_shared_memory_required_bytes_matches_default_allocation() -> None: assert ( - shared_memory_required_bytes(DEFAULT_SHARED_MEMORY_SIZE, metadata_size=4096) - == DEFAULT_SHARED_MEMORY_SIZE + 4096 + shared_memory_required_bytes(DEFAULT_TRANSPORT_BUFFER_SIZE, metadata_size=4096) + == DEFAULT_TRANSPORT_BUFFER_SIZE + 4096 ) diff --git a/tests/unit/data_daemon/test_ipc_communications.py b/tests/unit/data_daemon/test_ipc_communications.py index 12a1aaead..023b0c79c 100644 --- a/tests/unit/data_daemon/test_ipc_communications.py +++ b/tests/unit/data_daemon/test_ipc_communications.py @@ -134,15 +134,9 @@ def test_create_producer_socket_returns_continues_without_daemon( def test_message_envelope_round_trip_bytes() -> None: envelope = MessageEnvelope( producer_id="producer-abc", - command=CommandType.OPEN_FIXED_SHARED_SLOTS, - payload={ - "open_fixed_shared_slots": { - "transport_mode": "FIXED_SHARED_SLOTS_DAEMON_OWNED", - "control_endpoint": "ipc://test-envelope-round-trip", - "slot_size": 4096, - "slot_count": 16, - } - }, + command=CommandType.HEARTBEAT, + payload={"data_type": "rgb_image"}, + sequence_number=7, ) parsed = MessageEnvelope.from_bytes(envelope.to_bytes()) @@ -384,7 +378,7 @@ def send_open(comm: CommunicationsManager, producer_id: str) -> None: MessageEnvelope( producer_id=producer_id, command=CommandType.HEARTBEAT, - payload={CommandType.HEARTBEAT.value: {}}, + payload={"data_type": DataType.CUSTOM_1D.value}, ), ) @@ -508,7 +502,7 @@ class FakeCommand(Enum): MessageEnvelope( producer_id="producer-1", command=CommandType.HEARTBEAT, - payload={CommandType.HEARTBEAT.value: {}}, + payload={"data_type": DataType.CUSTOM_1D.value}, ) ) assert 
daemon.channels.get("producer-1").is_open() @@ -560,7 +554,7 @@ def _capture_handle_message(message: MessageEnvelope) -> None: MessageEnvelope( producer_id="prod", command=CommandType.HEARTBEAT, - payload={CommandType.HEARTBEAT.value: {}}, + payload={"data_type": DataType.CUSTOM_1D.value}, ).to_bytes() ) raw = daemon_comm._consumer_socket.recv() diff --git a/tests/unit/data_daemon/test_messaging.py b/tests/unit/data_daemon/test_messaging.py index ff5670636..4d4b42f8f 100644 --- a/tests/unit/data_daemon/test_messaging.py +++ b/tests/unit/data_daemon/test_messaging.py @@ -1,13 +1,10 @@ -import json -import struct +import logging import threading import time from datetime import datetime, timedelta, timezone -from multiprocessing.shared_memory import SharedMemory from uuid import uuid4 import pytest -import zmq from neuracore_types import DataType from neuracore.data_daemon.communications_management.consumer import ( @@ -20,30 +17,21 @@ from neuracore.data_daemon.communications_management.consumer.models import ( ChannelState, CompletionChunkWork, - SharedSlotSequenceProgressRequest, TraceMetadataRegistrationRequest, TraceMetadataSnapshot, + TransportMode, + VideoFrameSequenceProgressRequest, ) from neuracore.data_daemon.communications_management.producer.producer_channel import ( ProducerChannel, ) -from neuracore.data_daemon.communications_management.shared_transport import ( - shared_slot_transport as shared_slot_transport_module, +from neuracore.data_daemon.communications_management.shared_transport.framing import ( + PacketTooLarge, ) -from neuracore.data_daemon.communications_management.shared_transport.models import ( - QueuedSharedSlotPacket, -) -from neuracore.data_daemon.communications_management.shared_transport.registry import ( - SharedSlotRegistry, -) -from neuracore.data_daemon.const import ( - DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - DEFAULT_VIDEO_SLOT_COUNT, - HEARTBEAT_TIMEOUT_SECS, - 
SHARED_MEMORY_RECORD_HEADER_FORMAT, - SHARED_SLOT_SHM_PREFIX, +from neuracore.data_daemon.communications_management.shared_transport.iox2_daemon_drain import ( # noqa: E501 + Iox2DaemonDrain, ) +from neuracore.data_daemon.const import HEARTBEAT_TIMEOUT_SECS from neuracore.data_daemon.models import ( BatchedJointDataItemPayload, BatchedJointDataPayload, @@ -51,60 +39,27 @@ CompleteMessage, DataChunkPayload, MessageEnvelope, - SharedSlotCreditReturn, - SharedSlotDescriptor, - SharedSlotOpenFailedModel, - SharedSlotReadyModel, TraceTransportMetadata, ) - -def _decode_shared_memory_write(packet: bytes) -> tuple[dict, bytes]: - _magic, metadata_len, chunk_len = struct.unpack( - SHARED_MEMORY_RECORD_HEADER_FORMAT, - packet[: struct.calcsize(SHARED_MEMORY_RECORD_HEADER_FORMAT)], - ) - metadata_start = struct.calcsize(SHARED_MEMORY_RECORD_HEADER_FORMAT) - metadata_end = metadata_start + metadata_len - metadata = json.loads(packet[metadata_start:metadata_end].decode("utf-8")) - chunk = packet[metadata_end : metadata_end + chunk_len] - return metadata, chunk - - -def _read_shared_slot_packet(envelope: MessageEnvelope) -> tuple[dict, bytes]: - descriptor = SharedSlotDescriptor.from_dict( - envelope.payload[CommandType.SHARED_SLOT_DESCRIPTOR.value] - ) - shm = SharedMemory(name=descriptor.shm_name) - try: - packet = bytes( - shm.buf[descriptor.offset : descriptor.offset + descriptor.length] - ) - finally: - shm.close() - return parse_shared_frame_packet(packet) +BridgeChunkSpool = bridge_chunk_spool_module.BridgeChunkSpool def test_message_envelope_round_trip() -> None: - payload = { - "open_fixed_shared_slots": { - "transport_mode": "FIXED_SHARED_SLOTS_DAEMON_OWNED", - "control_endpoint": "ipc://test-message-round-trip", - "slot_size": 2048, - "slot_count": 16, - } - } + payload = {"data_type": "rgb_image"} envelope = MessageEnvelope( producer_id="producer-123", - command=CommandType.OPEN_FIXED_SHARED_SLOTS, + command=CommandType.HEARTBEAT, payload=payload, + 
sequence_number=3, ) parsed = MessageEnvelope.from_bytes(envelope.to_bytes()) assert parsed.producer_id == "producer-123" - assert parsed.command == CommandType.OPEN_FIXED_SHARED_SLOTS + assert parsed.command == CommandType.HEARTBEAT assert parsed.payload == payload + assert parsed.sequence_number == 3 def test_data_chunk_payload_round_trip() -> None: @@ -183,10 +138,6 @@ def test_complete_message_batch_record_round_trip() -> None: assert parsed.producer_id == "prod" assert parsed.trace_id == "trace" assert parsed.recording_id == "rec-1" - assert parsed.dataset_id is None - assert parsed.dataset_name is None - assert parsed.robot_name is None - assert parsed.robot_id is None assert parsed.data_type == DataType.CUSTOM_1D assert parsed.data_type_name == "custom_data" assert parsed.robot_instance == 0 @@ -215,20 +166,10 @@ def cleanup_producer(self): self.cleaned = True -def _wait_for_messages(comm: DummyComm, expected: int, timeout: float = 1.0) -> None: - """Wait for ProducerChannel's sender thread to flush messages to DummyComm.""" - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - if len(comm.messages) >= expected: - return - time.sleep(0.02) - return - - def _wait_for_envelopes( messages: list[MessageEnvelope], expected: int, timeout: float = 1.0 ) -> None: - """Wait for a stubbed producer transport to capture messages.""" + """Wait for the producer sender thread to flush messages to the stub.""" deadline = time.monotonic() + timeout while time.monotonic() < deadline: if len(messages) >= expected: @@ -238,101 +179,18 @@ def _wait_for_envelopes( def _stub_producer_transport(monkeypatch) -> list[MessageEnvelope]: + """Patch the producer ZMQ control channel to capture sent envelopes.""" messages: list[MessageEnvelope] = [] - control_context = zmq.Context() - control_sockets: dict[str, zmq.Socket] = {} - control_endpoints: dict[str, str] = {} - shared_memories: dict[str, SharedMemory] = {} - - monkeypatch.setattr( - 
"neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.create_producer_socket", - lambda self: None, + base = ( + "neuracore.data_daemon.communications_management.shared_transport." + "communications_manager.CommunicationsManager" ) - def _send_message(_self, message): - messages.append(message) - - if message.command == CommandType.OPEN_FIXED_SHARED_SLOTS: - payload = message.payload["open_fixed_shared_slots"] - control_endpoint = payload["control_endpoint"] - shm_name = f"neuracore-slots-test-{len(shared_memories)}" - shm = SharedMemory( - name=shm_name, - create=True, - size=int(payload["slot_size"]) * int(payload["slot_count"]), - ) - shared_memories[shm_name] = shm - control_endpoints[str(message.producer_id)] = control_endpoint - socket_obj = control_sockets.get(control_endpoint) - if socket_obj is None: - socket_obj = control_context.socket(zmq.PUSH) - socket_obj.setsockopt(zmq.LINGER, 0) - socket_obj.connect(control_endpoint) - control_sockets[control_endpoint] = socket_obj - socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_READY, - payload={ - CommandType.SHARED_SLOT_READY.value: SharedSlotReadyModel( - shm_name=shm_name, - slot_size=int(payload["slot_size"]), - slot_count=int(payload["slot_count"]), - ).model_dump() - }, - ).to_bytes() - ) - return - - if message.command == CommandType.SHARED_SLOT_DESCRIPTOR: - descriptor = SharedSlotDescriptor.from_dict( - message.payload[CommandType.SHARED_SLOT_DESCRIPTOR.value] - ) - control_endpoint = control_endpoints.get(str(message.producer_id)) - if control_endpoint is None: - raise RuntimeError( - "Missing control endpoint for shared-slot descriptor" - ) - socket_obj = control_sockets[control_endpoint] - socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_CREDIT_RETURN, - payload={ - CommandType.SHARED_SLOT_CREDIT_RETURN.value: ( - SharedSlotCreditReturn( - 
shm_name=descriptor.shm_name, - slot_id=descriptor.slot_id, - sequence_id=descriptor.sequence_id, - ).to_dict() - ) - }, - ).to_bytes() - ) - - monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.send_message", - _send_message, - ) - - def _cleanup_producer(_self) -> None: - for socket_obj in control_sockets.values(): - socket_obj.close(0) - control_sockets.clear() - for shm in shared_memories.values(): - shm.close() - try: - shm.unlink() - except FileNotFoundError: - pass - shared_memories.clear() - control_context.term() - + monkeypatch.setattr(f"{base}.create_producer_socket", lambda self: None) monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.cleanup_producer", - _cleanup_producer, + f"{base}.send_message", lambda self, message: messages.append(message) ) - + monkeypatch.setattr(f"{base}.cleanup_producer", lambda self: None) return messages @@ -374,42 +232,42 @@ def test_send_batched_joint_data_enqueues_expected_command(monkeypatch) -> None: assert envelope.payload[CommandType.BATCHED_JOINT_DATA.value] == payload.to_dict() -def test_producer_open_fixed_shared_slots_sends_payload(monkeypatch) -> None: - messages = _stub_producer_transport(monkeypatch) - producer = ProducerChannel(data_type=DataType.RGB_IMAGES) - - try: - producer.open_fixed_shared_slots(slot_size=2048) - _wait_for_envelopes(messages, 1) - finally: - producer.stop_producer_channel() - - assert len(messages) == 1 - envelope = messages[0] - assert envelope.command == CommandType.OPEN_FIXED_SHARED_SLOTS - payload = envelope.payload["open_fixed_shared_slots"] - assert payload["slot_size"] == 2048 - assert payload["slot_count"] == DEFAULT_VIDEO_SLOT_COUNT - assert payload["control_endpoint"].startswith("ipc://") +def _drain_until( + drain: Iox2DaemonDrain, + received: list[tuple[dict, bytes]], + expected: int, + timeout: float = 1.0, +) 
-> None: + deadline = time.monotonic() + timeout + while len(received) < expected and time.monotonic() < deadline: + drain.drain_all( + lambda channel, seq, meta, chunk: received.append((meta, chunk)) + ) + time.sleep(0.02) -def test_producer_send_data_parts_lazily_opens_shared_memory( - monkeypatch, -) -> None: +@pytest.mark.parametrize( + "data_type", + [DataType.RGB_IMAGES, DataType.DEPTH_IMAGES, DataType.POINT_CLOUDS], +) +def test_producer_routes_video_frames_over_iox2(monkeypatch, data_type) -> None: messages = _stub_producer_transport(monkeypatch) - + channel_id = f"vid-{uuid4().hex[:8]}" producer = ProducerChannel( + id=channel_id, chunk_size=2, recording_id="rec-1", - shared_memory_size=2048, - data_type=DataType.RGB_IMAGES, + data_type=data_type, ) + drain = Iox2DaemonDrain() + received: list[tuple[dict, bytes]] = [] try: + drain.register_channel(channel_id) producer.start_new_trace() producer.send_data( b"abcd", - data_type=DataType.RGB_IMAGES, + data_type=data_type, data_type_name="custom", robot_instance=2, robot_id="robot-1", @@ -417,41 +275,39 @@ def test_producer_send_data_parts_lazily_opens_shared_memory( dataset_id="dataset-1", dataset_name="dataset", ) - _wait_for_envelopes(messages, 3) - first_metadata, first_chunk = _read_shared_slot_packet(messages[1]) - second_metadata, second_chunk = _read_shared_slot_packet(messages[2]) + _drain_until(drain, received, expected=2) finally: + drain.close() producer.stop_producer_channel() - assert len(messages) == 3 - assert messages[0].command == CommandType.OPEN_FIXED_SHARED_SLOTS - assert messages[1].command == CommandType.SHARED_SLOT_DESCRIPTOR - assert messages[2].command == CommandType.SHARED_SLOT_DESCRIPTOR - assert first_chunk == b"ab" - assert second_chunk == b"cd" + assert [chunk for _, chunk in received] == [b"ab", b"cd"] + # First frame carries trace metadata, subsequent frames do not. 
+ assert received[0][0]["data_type"] == data_type.value + assert "data_type" not in received[1][0] + # Video frames must not travel over the ZMQ control channel. + assert all(message.command != CommandType.DATA_CHUNK for message in messages) -@pytest.mark.parametrize( - "data_type", - [DataType.DEPTH_IMAGES, DataType.POINT_CLOUDS], -) -def test_producer_send_data_parts_uses_shared_slots_for_depth_and_point_clouds( - monkeypatch, data_type: DataType -) -> None: - messages = _stub_producer_transport(monkeypatch) - +def test_producer_video_chunks_across_multiple_frames(monkeypatch) -> None: + _stub_producer_transport(monkeypatch) + channel_id = f"vid-{uuid4().hex[:8]}" producer = ProducerChannel( - chunk_size=2, + id=channel_id, + chunk_size=3, recording_id="rec-1", - shared_memory_size=2048, - data_type=data_type, + data_type=DataType.RGB_IMAGES, ) + drain = Iox2DaemonDrain() + received: list[tuple[dict, bytes]] = [] try: + drain.register_channel(channel_id) producer.start_new_trace() - producer.send_data( - b"abcd", - data_type=data_type, + # cspell:ignore cdef + producer.send_data_parts( + (b"ab", memoryview(b"cdef"), b"gh"), + total_bytes=8, + data_type=DataType.RGB_IMAGES, data_type_name="custom", robot_instance=2, robot_id="robot-1", @@ -459,20 +315,15 @@ def test_producer_send_data_parts_uses_shared_slots_for_depth_and_point_clouds( dataset_id="dataset-1", dataset_name="dataset", ) - _wait_for_envelopes(messages, 3) - first_metadata, first_chunk = _read_shared_slot_packet(messages[1]) - second_metadata, second_chunk = _read_shared_slot_packet(messages[2]) + _drain_until(drain, received, expected=3) finally: + drain.close() producer.stop_producer_channel() - assert len(messages) == 3 - assert messages[0].command == CommandType.OPEN_FIXED_SHARED_SLOTS - assert messages[1].command == CommandType.SHARED_SLOT_DESCRIPTOR - assert messages[2].command == CommandType.SHARED_SLOT_DESCRIPTOR - assert first_metadata["data_type"] == data_type.value - assert "data_type" not 
in second_metadata - assert first_chunk == b"ab" - assert second_chunk == b"cd" + assert [chunk for _, chunk in received] == [b"abc", b"def", b"gh"] + assert received[0][0]["recording_id"] == "rec-1" + assert "recording_id" not in received[1][0] + assert "recording_id" not in received[2][0] def test_producer_send_data_parts_uses_socket_for_non_video(monkeypatch) -> None: @@ -511,104 +362,18 @@ def test_producer_send_data_parts_uses_socket_for_non_video(monkeypatch) -> None assert payload.data == b"abcd" -def test_producer_ensure_shared_memory_does_not_reannounce( - monkeypatch, -) -> None: - messages = _stub_producer_transport(monkeypatch) - - producer = ProducerChannel( - recording_id="rec-1", - shared_memory_size=2048, - data_type=DataType.RGB_IMAGES, - ) - - try: - producer.start_new_trace() - producer.open_fixed_shared_slots(slot_size=2048) - _wait_for_envelopes(messages, 1) - control_endpoint = messages[0].payload["open_fixed_shared_slots"][ - "control_endpoint" - ] - trace_id = producer.trace_id - - producer.send_data( - b"ab", - data_type=DataType.RGB_IMAGES, - data_type_name="custom", - robot_instance=2, - robot_id="robot-1", - robot_name="robot", - dataset_id="dataset-1", - dataset_name="dataset", - ) - _wait_for_envelopes(messages, 2) - metadata, _chunk = _read_shared_slot_packet(messages[1]) - finally: - producer.stop_producer_channel() - - assert len(messages) == 2 - assert ( - messages[0].payload["open_fixed_shared_slots"]["control_endpoint"] - == control_endpoint - ) - assert metadata["trace_id"] == trace_id - - -def test_producer_send_data_parts_chunks_across_multiple_buffers(monkeypatch) -> None: - messages = _stub_producer_transport(monkeypatch) - - producer = ProducerChannel( - chunk_size=3, - recording_id="rec-1", - shared_memory_size=2048, - data_type=DataType.RGB_IMAGES, - ) - - try: - producer.start_new_trace() - # cspell:ignore cdef - producer.send_data_parts( - (b"ab", memoryview(b"cdef"), b"gh"), - total_bytes=8, - 
data_type=DataType.RGB_IMAGES, - data_type_name="custom", - robot_instance=2, - robot_id="robot-1", - robot_name="robot", - dataset_id="dataset-1", - dataset_name="dataset", - ) - _wait_for_envelopes(messages, 4) - packets = [_read_shared_slot_packet(packet) for packet in messages[1:]] - finally: - producer.stop_producer_channel() - - assert len(messages) == 4 - envelope = messages[0] - assert envelope.command == CommandType.OPEN_FIXED_SHARED_SLOTS - payload = envelope.payload["open_fixed_shared_slots"] - assert payload["slot_size"] == 2048 - assert payload["control_endpoint"].startswith("ipc://") - assert [chunk for _, chunk in packets] == [b"abc", b"def", b"gh"] - assert packets[0][0]["recording_id"] == "rec-1" - assert "recording_id" not in packets[1][0] - assert "recording_id" not in packets[2][0] - - -def test_producer_shared_memory_rejects_oversized_packet(monkeypatch) -> None: - monkeypatch.setenv("NDD_DEBUG", "true") +def test_producer_video_rejects_oversized_frame(monkeypatch) -> None: _stub_producer_transport(monkeypatch) - producer = ProducerChannel( chunk_size=16, recording_id="rec-1", - shared_memory_size=8, + max_frame_bytes=8, data_type=DataType.RGB_IMAGES, ) try: producer.start_new_trace() - try: + with pytest.raises(PacketTooLarge): producer.send_data( b"abcdefgh", data_type=DataType.RGB_IMAGES, @@ -617,378 +382,9 @@ def test_producer_shared_memory_rejects_oversized_packet(monkeypatch) -> None: robot_id="robot-1", dataset_id="dataset-1", ) - except PacketTooLarge: - oversized = True - else: - oversized = False finally: producer.stop_producer_channel() - assert oversized is True - - -def test_producer_sender_failure_stops_waiters(monkeypatch) -> None: - monkeypatch.setenv("NDD_DEBUG", "true") - sent = {"count": 0} - control_context = zmq.Context() - control_sockets: dict[str, zmq.Socket] = {} - shared_memories: dict[str, SharedMemory] = {} - - def flaky_send(_self, message): - sent["count"] += 1 - if message.command == 
CommandType.OPEN_FIXED_SHARED_SLOTS: - payload = message.payload["open_fixed_shared_slots"] - control_endpoint = payload["control_endpoint"] - shm_name = "neuracore-slots-test-failure" - shm = SharedMemory( - name=shm_name, - create=True, - size=int(payload["slot_size"]) * int(payload["slot_count"]), - ) - shared_memories[shm_name] = shm - socket_obj = control_sockets.get(control_endpoint) - if socket_obj is None: - socket_obj = control_context.socket(zmq.PUSH) - socket_obj.setsockopt(zmq.LINGER, 0) - socket_obj.connect(control_endpoint) - control_sockets[control_endpoint] = socket_obj - socket_obj.send( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_READY, - payload={ - CommandType.SHARED_SLOT_READY.value: SharedSlotReadyModel( - shm_name=shm_name, - slot_size=int(payload["slot_size"]), - slot_count=int(payload["slot_count"]), - ).model_dump() - }, - ).to_bytes() - ) - return - if message.command == CommandType.SHARED_SLOT_DESCRIPTOR: - raise RuntimeError("boom") - - monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.create_producer_socket", - lambda self: None, - ) - monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.send_message", - flaky_send, - ) - - def _cleanup_producer(_self) -> None: - for socket_obj in control_sockets.values(): - socket_obj.close(0) - control_sockets.clear() - for shm in shared_memories.values(): - shm.close() - try: - shm.unlink() - except FileNotFoundError: - pass - shared_memories.clear() - control_context.term() - - monkeypatch.setattr( - "neuracore.data_daemon.communications_management.shared_transport.communications_manager.CommunicationsManager.cleanup_producer", - _cleanup_producer, - ) - - producer = ProducerChannel( - chunk_size=2, - recording_id="rec-1", - shared_memory_size=2048, - data_type=DataType.RGB_IMAGES, - ) - - try: - 
producer.start_new_trace() - producer.send_data( - b"abcd", - data_type=DataType.RGB_IMAGES, - data_type_name="custom", - robot_instance=2, - robot_id="robot-1", - dataset_id="dataset-1", - ) - deadline = time.monotonic() + 1.0 - wait_result = True - try: - while time.monotonic() < deadline: - try: - producer._send(CommandType.HEARTBEAT, {}) - except RuntimeError: - wait_result = False - break - time.sleep(0.02) - finally: - if wait_result and time.monotonic() >= deadline: - wait_result = True - finally: - producer.stop_producer_channel() - - assert sent["count"] >= 1 - assert wait_result is False - - -def test_shared_slot_video_worker_surfaces_background_failure() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=2, - ack_timeout_s=DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s=DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) - worker = SharedSlotVideoWorker.acquire(registry) - - def raise_worker_error(_item) -> None: - raise RuntimeError("boom") - - worker._process_item = raise_worker_error # type: ignore[method-assign] - - packet = QueuedSharedSlotPacket( - producer_id="producer-1", - sender=None, # type: ignore[arg-type] - metadata_bytes=b"{}", - chunk=b"x", - packet_length=1, - sequence_number=1, - ) - - try: - worker.enqueue_packet(packet=packet) - deadline = time.monotonic() + 1.0 - while worker._thread.is_alive() and time.monotonic() < deadline: - time.sleep(0.02) - - with pytest.raises(RuntimeError, match="Shared-slot video worker failed"): - worker.enqueue_packet(packet=packet) - finally: - SharedSlotVideoWorker.reset_shared_instance_for_tests() - registry.close() - - -def test_shared_slot_transport_snapshots_chunk_before_background_handoff() -> None: - transport = shared_slot_transport_module.SharedSlotVideoTransport() - captured_packets: list[QueuedSharedSlotPacket] = [] - - try: - transport._worker.enqueue_packet = ( # type: ignore[method-assign] - lambda *, packet: captured_packets.append(packet) - ) - - source = 
bytearray(b"frame-000") - transport.enqueue_packet( - producer_id="producer-1", - sender=None, # type: ignore[arg-type] - metadata={"trace_id": "trace-1", "chunk_index": 0, "total_chunks": 1}, - chunk=memoryview(source), - ) - - source[:] = b"frame-999" - - assert len(captured_packets) == 1 - assert isinstance(captured_packets[0].chunk, bytes) - assert captured_packets[0].chunk == b"frame-000" - finally: - transport.close() - - -def test_shared_slot_timeout_clock_starts_after_socket_send() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=2, - ack_timeout_s=0.01, - allocate_timeout_s=0.01, - ) - shm_name = f"{SHARED_SLOT_SHM_PREFIX}{uuid4().hex[:16]}" - shm = SharedMemory(name=shm_name, create=True, size=2048 * 2) - - try: - registry._apply_ready_message( - SharedSlotReadyModel( - shm_name=shm_name, - slot_size=2048, - slot_count=2, - ) - ) - slot_id, _offset = registry.allocate_slot() - sequence_id = registry.mark_in_flight(slot_id=slot_id, sequence_id=1) - - time.sleep(0.03) - with registry._condition: - registry._check_for_timeouts_locked() - assert registry.is_healthy() is True - assert sequence_id in registry._state.in_flight - - registry.mark_sent(sequence_id) - time.sleep(0.03) - with registry._condition: - registry._check_for_timeouts_locked() - assert registry.is_healthy() is False - finally: - registry.close() - shm.close() - shm.unlink() - - -def test_shared_slot_unhealthy_error_includes_reason_and_state() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=2, - ack_timeout_s=0.01, - allocate_timeout_s=0.01, - ) - shm_name = f"test-{uuid4().hex[:6]}" - shm = SharedMemory(name=shm_name, create=True, size=2048 * 2) - - try: - registry._apply_ready_message( - SharedSlotReadyModel( - shm_name=shm_name, - slot_size=2048, - slot_count=2, - ) - ) - slot_id, _offset = registry.allocate_slot() - sequence_id = registry.mark_in_flight(slot_id=slot_id, sequence_id=1) - registry.mark_sent(sequence_id) - time.sleep(0.03) - - 
with registry._condition: - registry._check_for_timeouts_locked() - - with pytest.raises(RuntimeError) as exc_info: - registry.ensure_healthy() - - message = str(exc_info.value) - assert "reason=credit_stall(sequence_id=1,slot_id=0)" in message - assert f"shm_name={shm_name}" in message - assert "ack_timeout_count=1" in message - finally: - registry.close() - shm.close() - shm.unlink() - - -def test_shared_slot_registry_runtime_starts_and_stops_cleanly() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=2, - ack_timeout_s=DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s=DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) - - try: - assert registry.control_endpoint.startswith("ipc://") - assert registry._runtime.control_thread.is_alive() - assert registry._runtime.watchdog_thread.is_alive() - finally: - registry.close() - - assert not registry._runtime.control_thread.is_alive() - assert not registry._runtime.watchdog_thread.is_alive() - - -def test_shared_slot_ready_message_populates_free_slots() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=3, - ack_timeout_s=DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s=DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) - shm_name = f"{SHARED_SLOT_SHM_PREFIX}{uuid4().hex[:16]}" - shm = SharedMemory(name=shm_name, create=True, size=2048 * 3) - - try: - registry._apply_ready_message( - SharedSlotReadyModel( - shm_name=shm_name, - slot_size=2048, - slot_count=3, - ) - ) - - assert registry.is_ready() is True - assert len(registry._state.free_slots) == 3 - assert registry.slot_size == 2048 - assert registry.slot_count == 3 - finally: - registry.close() - shm.close() - shm.unlink() - - -def test_shared_slot_ready_message_adopts_daemon_slot_dimensions() -> None: - registry = SharedSlotRegistry( - slot_size=1024, - slot_count=1, - ack_timeout_s=DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s=DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) - shm_name = 
f"{SHARED_SLOT_SHM_PREFIX}{uuid4().hex[:16]}" - shm = SharedMemory(name=shm_name, create=True, size=4096 * 4) - - try: - registry._apply_ready_message( - SharedSlotReadyModel( - shm_name=shm_name, - slot_size=4096, - slot_count=4, - ) - ) - - assert registry.slot_size == 4096 - assert registry.slot_count == 4 - assert registry.slot_size * registry.slot_count == 4096 * 4 - assert len(registry._state.free_slots) == 4 - finally: - registry.close() - shm.close() - shm.unlink() - - -def test_shared_slot_open_failure_message_surfaces_daemon_error() -> None: - registry = SharedSlotRegistry( - slot_size=2048, - slot_count=3, - ack_timeout_s=DEFAULT_VIDEO_ACK_TIMEOUT_SECONDS, - allocate_timeout_s=DEFAULT_VIDEO_SLOT_ALLOCATE_TIMEOUT_SECONDS, - ) - error_message = ( - "Not enough shared-memory for data throughput requirements. " - "remaining=3.31MiB" - ) - - try: - registry._process_control_message( - MessageEnvelope( - producer_id=None, - command=CommandType.SHARED_SLOT_OPEN_FAILED, - payload={ - CommandType.SHARED_SLOT_OPEN_FAILED.value: ( - SharedSlotOpenFailedModel( - error_message=error_message - ).model_dump() - ) - }, - ) - ) - - assert registry.is_healthy() is False - assert registry.is_ready() is False - assert registry._state.unhealthy_reason == "open_failed" - assert registry._state.failure_message == error_message - - with pytest.raises(RuntimeError, match="Not enough shared-memory"): - registry.ensure_healthy() - - with pytest.raises(RuntimeError, match="Not enough shared-memory"): - registry.allocate_slot() - finally: - registry.close() - def test_producer_sequences_follow_enqueue_order_under_concurrent_senders( monkeypatch, @@ -1191,15 +587,65 @@ def test_cleanup_keeps_recent_channel(emitter) -> None: assert daemon.channels.get("active") is channel -def test_cleanup_keeps_stale_shared_memory_channel_with_pending_descriptor( - emitter, monkeypatch +def test_heartbeat_missing_data_type_does_not_open_transport( + caplog: pytest.LogCaptureFixture, + emitter, ) -> 
None: daemon = Daemon( comm_manager=DummyComm(), recording_disk_manager=DummyRecordingDiskManager(), emitter=emitter, ) - producer_id = "stale-shm-producer" + + with caplog.at_level(logging.WARNING): + daemon.handle_message( + MessageEnvelope( + producer_id="missing-data-type", + command=CommandType.HEARTBEAT, + payload={}, + ) + ) + + channel = daemon.channels.get("missing-data-type") + assert channel is not None + assert channel.transport_mode is TransportMode.NONE + assert channel.data_type is None + assert "missing required data_type" in caplog.text + + +def test_heartbeat_unknown_data_type_does_not_open_transport( + caplog: pytest.LogCaptureFixture, + emitter, +) -> None: + daemon = Daemon( + comm_manager=DummyComm(), + recording_disk_manager=DummyRecordingDiskManager(), + emitter=emitter, + ) + + with caplog.at_level(logging.WARNING): + daemon.handle_message( + MessageEnvelope( + producer_id="unknown-data-type", + command=CommandType.HEARTBEAT, + payload={"data_type": "not-a-real-type"}, + ) + ) + + channel = daemon.channels.get("unknown-data-type") + assert channel is not None + assert channel.transport_mode is TransportMode.NONE + assert channel.data_type is None + assert "carried unknown data_type" in caplog.text + + +def test_cleanup_keeps_stale_video_channel_with_pending_sequence(emitter) -> None: + daemon = Daemon( + comm_manager=DummyComm(), + recording_disk_manager=DummyRecordingDiskManager(), + emitter=emitter, + ) + producer_id = "stale-video-producer" recording_id = "rec-1" trace_id = "trace-1" cutoff_sequence_number = 5 @@ -1211,10 +657,8 @@ def test_cleanup_keeps_stale_shared_memory_channel_with_pending_descriptor( trace_id=trace_id, last_sequence_number=cutoff_sequence_number, ) - channel.mark_shared_slot_transport_open( - control_endpoint="ipc://test-shared-slot-control", - shm_name="neuracore-slots-test", - ) + channel.data_type = DataType.RGB_IMAGES + channel.mark_video_transport_open() daemon.channels.add(channel) 
daemon._trace_lifecycle.register_trace(recording_id, trace_id) @@ -1241,61 +685,21 @@ def test_cleanup_keeps_stale_shared_memory_channel_with_pending_descriptor( }, ) ) - daemon._trace_lifecycle.mark_shared_slot_sequence_pending( - SharedSlotSequenceProgressRequest( + # A video frame at the cutoff is still pending spool processing. + daemon._trace_lifecycle.mark_video_frame_sequence_pending( + VideoFrameSequenceProgressRequest( producer_id=producer_id, sequence_number=cutoff_sequence_number, ) ) - cleanup_calls: list[str] = [] - monkeypatch.setattr( - daemon._shared_slot_handler, - "cleanup_channel_resources", - lambda cleanup_channel: cleanup_calls.append(cleanup_channel.producer_id), - ) - daemon._cleanup_expired_channels() assert daemon.channels.get(producer_id) is channel - assert cleanup_calls == [] assert daemon._closed_producers.contains(producer_id) is False -def test_abandoned_shared_slot_sequences_unblock_pending_recording_state( - emitter, -) -> None: - daemon = Daemon( - comm_manager=DummyComm(), - recording_disk_manager=DummyRecordingDiskManager(), - emitter=emitter, - ) - producer_id = "producer-1" - sequence_number = 5 - - daemon._trace_lifecycle.mark_shared_slot_sequence_pending( - SharedSlotSequenceProgressRequest( - producer_id=producer_id, - sequence_number=sequence_number, - ) - ) - - assert daemon._trace_lifecycle.has_pending_shared_slot_sequences_at_or_before( - producer_id, - sequence_number, - ) - - daemon._handle_abandoned_shared_slot_sequences(producer_id, [sequence_number]) - - assert not daemon._trace_lifecycle.has_pending_shared_slot_sequences_at_or_before( - producer_id, - sequence_number, - ) - - -def test_closed_producer_drops_stale_messages_until_reopened( - emitter, monkeypatch -) -> None: +def test_closed_producer_revived_on_heartbeat(emitter) -> None: daemon = Daemon( comm_manager=DummyComm(), recording_disk_manager=DummyRecordingDiskManager(), @@ -1304,22 +708,17 @@ def test_closed_producer_drops_stale_messages_until_reopened( 
producer_id = "reopened-producer" daemon._closed_producers.add(producer_id) - open_calls: list[str] = [] heartbeat_calls: list[str] = [] - monkeypatch.setattr( - daemon._shared_slot_handler, - "handle_open", - lambda channel, _payload, **_kwargs: open_calls.append(channel.producer_id), - ) daemon._command_handlers[CommandType.HEARTBEAT] = ( lambda channel, _message: heartbeat_calls.append(channel.producer_id) ) + # A non-heartbeat command from a closed producer is dropped. daemon.handle_message( MessageEnvelope( producer_id=producer_id, - command=CommandType.HEARTBEAT, - payload={}, + command=CommandType.TRACE_END, + payload={"trace_end": {"trace_id": "t", "recording_id": "rec-1"}}, ) ) @@ -1327,30 +726,15 @@ def test_closed_producer_drops_stale_messages_until_reopened( assert heartbeat_calls == [] assert daemon._closed_producers.contains(producer_id) is True + # A heartbeat revives the channel. daemon.handle_message( MessageEnvelope( producer_id=producer_id, - command=CommandType.OPEN_FIXED_SHARED_SLOTS, - payload={"open_fixed_shared_slots": {"control_endpoint": "ipc://test"}}, + command=CommandType.HEARTBEAT, + payload={"data_type": DataType.RGB_IMAGES.value}, ) ) assert daemon.channels.get(producer_id) is not None - assert open_calls == [producer_id] assert daemon._closed_producers.contains(producer_id) is False - - daemon.handle_message( - MessageEnvelope( - producer_id=producer_id, - command=CommandType.HEARTBEAT, - payload={}, - ) - ) - assert heartbeat_calls == [producer_id] - - -BridgeChunkSpool = bridge_chunk_spool_module.BridgeChunkSpool -PacketTooLarge = shared_slot_transport_module.PacketTooLarge -SharedSlotVideoWorker = shared_slot_transport_module.SharedSlotVideoWorker -parse_shared_frame_packet = shared_slot_transport_module.parse_shared_frame_packet