From 075d09cd9747389464792c11f152da8a94e6328c Mon Sep 17 00:00:00 2001 From: Ankith-Confluent Date: Tue, 14 Apr 2026 10:33:51 +0530 Subject: [PATCH 01/24] Add support for ShareConsumer in soak client and update usage instructionsx --- tests/soak/run.sh | 8 +- tests/soak/soakclient.py | 237 +++++++++++++++++++++++++++++++++++---- 2 files changed, 221 insertions(+), 24 deletions(-) diff --git a/tests/soak/run.sh b/tests/soak/run.sh index 0da531b27..b625c3589 100755 --- a/tests/soak/run.sh +++ b/tests/soak/run.sh @@ -17,12 +17,18 @@ topic="pysoak-$TESTID-$librdkafka_version" logfile="${TESTID}.log.bz2" limit=$((50 * 1024 * 1024)) # 50MB export HOSTNAME=$(hostname) + +share_flag="" +if [[ "${SHARE:-}" == "true" ]]; then + share_flag="--share" +fi + echo "Starting soak client using topic $topic. Logging to $logfile." set +x while [ "$run" = true ]; do # Ignore SIGINT in children (inherited) trap "" SIGINT - time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 |& tee /dev/tty | bzip2 > $logfile & + time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 $share_flag |& tee /dev/tty | bzip2 > $logfile & PID=$! terminate_last() { # List children of $PID only diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 3ae7e22b6..c82fdc5a0 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -20,7 +20,7 @@ # long term validation testing. # # Usage: -# tests/soak/soakclient.py -i -t -r -f +# tests/soak/soakclient.py -i -t -r -f [--share] # # A unique topic should be used for each soakclient instance. # @@ -43,6 +43,12 @@ from confluent_kafka import Consumer, KafkaError, KafkaException, Producer, version from confluent_kafka.admin import AdminClient, NewTopic +try: + from confluent_kafka import ShareConsumer + HAS_SHARE_CONSUMER = True +except ImportError: + HAS_SHARE_CONSUMER = False + class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" @@ -68,7 +74,7 @@ def deserialize(cls, binstr): class SoakClient(object): """The SoakClient consists of a Producer sending messages at - the given rate, and a Consumer consuming the messages. + the given rate, and a Consumer or ShareConsumer consuming the messages. Both clients print their message and error counters every 10 seconds. The producer and consumer run in separate background threads. """ @@ -329,6 +335,147 @@ def producer_error_cb(self, err): self.producer_error_cb_cnt += 1 self.incr_counter("producer.errorcb", 1) + def share_error_cb(self, err): + """Share consumer error callback""" + self.logger.error("share: error_cb: {}".format(err)) + self.share_error_cb_cnt += 1 + self.incr_counter("share.errorcb", 1) + + def share_status(self): + """Print share consumer status""" + self.logger.info( + "share: {} messages consumed, {} duplicates, " + "{} missed, {} message errors, {} consumer errors, {} error_cbs".format( + self.share_msg_cnt, + self.share_msg_dup_cnt, + self.share_msg_miss_cnt, + self.share_msg_err_cnt, + self.share_err_cnt, + self.share_error_cb_cnt, + ) + ) + + def share_run(self): + """Share consumer main loop""" + self.share_consumer.subscribe([self.topic]) + + self.share_msg_cnt = 0 + self.share_msg_dup_cnt = 0 + self.share_msg_miss_cnt = 0 + self.share_msg_err_cnt = 0 + self.share_err_cnt = 0 + self.share_error_cb_cnt = 0 + + # Track highest offset seen per partition for duplicate/gap detection. + # With implicit ack and a single share consumer, offsets should + # progress sequentially per partition. + hwmarks = defaultdict(int) + + next_status = time.time() + self.disprate + + while self.run: + now = time.time() + if now > next_status: + self.share_status() + next_status = now + self.disprate + + try: + messages = self.share_consumer.consume_batch(timeout=1.0) + except Exception as ex: + self.logger.error("share: consume_batch exception: {}".format(ex)) + self.share_err_cnt += 1 + self.incr_counter("share.error", 1) + continue + + if not messages: + continue + + for msg in messages: + if msg.error() is not None: + self.logger.error("share: error: {}".format(msg.error())) + self.share_err_cnt += 1 + self.incr_counter("share.error", 1) + continue + + try: + record = SoakRecord.deserialize(msg.value()) # noqa unused variable + except ValueError as ex: + self.logger.info( + "share: Failed to deserialize message in " + "{} [{}] at offset {} (headers {}): {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), ex + ) + ) + self.share_msg_err_cnt += 1 + self.incr_counter("share.msgerr", 1) + + self.share_msg_cnt += 1 + self.incr_counter("share.msg", 1) + + # end-to-end latency + headers = dict(msg.headers()) + txtime = headers.get('time', None) + if txtime is not None: + latency = time.time() - float(txtime) + self.set_gauge( + "share.e2e_latency", latency, + tags={"partition": "{}".format(msg.partition())} + ) + + if (self.share_msg_cnt % self.disprate) == 0: + self.logger.info( + "share: {} messages consumed: Message {} " + "[{}] at offset {} (headers {})".format( + self.share_msg_cnt, msg.topic(), + msg.partition(), msg.offset(), msg.headers() + ) + ) + + # Track per-partition high-water mark for duplicate/gap detection + hwkey = "{}-{}".format(msg.topic(), msg.partition()) + hw = hwmarks[hwkey] + + if hw > 0: + if msg.offset() <= hw: + self.logger.warning( + "share: Old or duplicate message {} " + "[{}] at offset {} (headers {}): wanted offset > {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) + ) + self.share_msg_dup_cnt += (hw + 1) - msg.offset() + self.incr_counter("share.msgdup", 1) + elif msg.offset() > hw + 1: + self.logger.warning( + "share: Lost messages, now at {} " + "[{}] at offset {} (headers {}): " + "expected offset {}+1".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) + ) + self.share_msg_miss_cnt += msg.offset() - (hw + 1) + self.incr_counter("share.missedmsg", 1) + + hwmarks[hwkey] = msg.offset() + + self.share_consumer.close() + self.share_status() + + def share_thread_main(self): + """Share consumer thread main function""" + try: + self.share_run() + except KeyboardInterrupt: + self.logger.info("share: aborted by user") + self.run = False + except Exception as ex: + self.logger.fatal("share: fatal exception: {}\n{}".format( + ex, traceback.print_exc())) + self.run = False + def rtt_stats(self, d): """Extract broker rtt statistics from the stats dict in @param d""" @@ -394,7 +541,7 @@ def create_topic(self, topic, conf): else: raise - def __init__(self, testid, topic, rate, conf): + def __init__(self, testid, topic, rate, conf, enable_share=False): """SoakClient constructor. conf is the client configuration""" self.topic = topic self.rate = rate @@ -446,50 +593,90 @@ def filter_config(conf, filter_out, strip_prefix): return out # Create topic (might already exist) - aconf = filter_config(conf, ["consumer.", "producer."], "admin.") + aconf = filter_config(conf, ["consumer.", "producer.", "share."], "admin.") aconf['client.id'] = self.testid self.create_topic(self.topic, aconf) # - # Create Producer and Consumer, each running in its own thread. + # Create Producer and Consumer/ShareConsumer, each in its own thread. # conf['stats_cb'] = self.stats_cb conf['statistics.interval.ms'] = 120000 # Producer - pconf = filter_config(conf, ["consumer.", "admin."], "producer.") + pconf = filter_config(conf, ["consumer.", "admin.", "share."], "producer.") pconf['error_cb'] = self.producer_error_cb pconf['client.id'] = self.testid self.producer = Producer(pconf) - # Consumer - cconf = filter_config(conf, ["producer.", "admin."], "consumer.") - cconf['error_cb'] = self.consumer_error_cb - cconf['on_commit'] = self.consumer_commit_cb - self.logger.info("consumer: using group.id {}".format(cconf['group.id'])) - cconf['client.id'] = self.testid - self.consumer = Consumer(cconf) - - # Initialize some counters to zero to make them appear in the metrics - self.incr_counter("consumer.error", 0) - self.incr_counter("consumer.msgdup", 0) self.incr_counter("producer.errorcb", 0) # Create and start producer thread self.producer_thread = threading.Thread(target=self.producer_thread_main) self.producer_thread.start() - # Create and start consumer thread - self.consumer_thread = threading.Thread(target=self.consumer_thread_main) - self.consumer_thread.start() + self.consumer = None + self.consumer_thread = None + self.share_consumer = None + self.share_thread = None + + if enable_share: + if not HAS_SHARE_CONSUMER: + raise RuntimeError( + "ShareConsumer requested but not available in this " + "confluent_kafka build." + ) + + sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") + sconf['error_cb'] = self.share_error_cb + sconf['stats_cb'] = self.stats_cb + sconf['statistics.interval.ms'] = 120000 + sconf['client.id'] = self.testid + + # Always set a share-specific group.id. + sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( + self.hostname, version(), sys.version.split(' ')[0] + ) + + self.logger.info("share: using group.id {}".format(sconf['group.id'])) + self.share_consumer = ShareConsumer(sconf) + + # Initialize counters to zero + self.incr_counter("share.error", 0) + self.incr_counter("share.msgdup", 0) + self.incr_counter("share.msgerr", 0) + self.incr_counter("share.errorcb", 0) + + # Create and start share consumer thread + self.share_thread = threading.Thread(target=self.share_thread_main) + self.share_thread.start() + else: + # Consumer + cconf = filter_config(conf, ["producer.", "admin.", "share."], "consumer.") + cconf['error_cb'] = self.consumer_error_cb + cconf['on_commit'] = self.consumer_commit_cb + self.logger.info("consumer: using group.id {}".format(cconf['group.id'])) + cconf['client.id'] = self.testid + self.consumer = Consumer(cconf) + + # Initialize some counters to zero to make them appear in the metrics + self.incr_counter("consumer.error", 0) + self.incr_counter("consumer.msgdup", 0) + + # Create and start consumer thread + self.consumer_thread = threading.Thread(target=self.consumer_thread_main) + self.consumer_thread.start() def terminate(self): - """Terminate Producer and Consumer""" + """Terminate Producer and Consumer/Share Consumer""" soak.logger.info("Terminating (ran for {}s)".format(time.time() - self.start_time)) self.run = False # Wait for background threads to finish. self.producer_thread.join() - self.consumer_thread.join() + if self.share_thread is not None: + self.share_thread.join() + else: + self.consumer_thread.join() # Final resource usage soak.get_rusage() @@ -576,6 +763,10 @@ def get_rusage(self): parser.add_argument( '-f', dest='conffile', type=argparse.FileType('r'), help='Configuration file (configprop=value format)' ) + parser.add_argument( + '--share', dest='share', action='store_true', default=False, + help='Enable share consumer thread' + ) args = parser.parse_args() @@ -606,7 +797,7 @@ def get_rusage(self): conf['enable.partition.eof'] = False # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf) + soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share) # Get initial resource usage soak.get_rusage() From 18ca7efbc266b1212c307c468dce0510501767c8 Mon Sep 17 00:00:00 2001 From: Ankith-Confluent Date: Tue, 14 Apr 2026 21:34:52 +0530 Subject: [PATCH 02/24] Refactor share consumer metrics to use generic consumer counters in soak client --- tests/soak/README.md | 10 +++++++++- tests/soak/soakclient.py | 24 ++++++++++++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/soak/README.md b/tests/soak/README.md index e8d89aef9..457083f6c 100644 --- a/tests/soak/README.md +++ b/tests/soak/README.md @@ -30,4 +30,12 @@ OpenTelemetry reporting supported through OTLP. 5. Run some tests ```bash TESTID= ./run.sh ccloud.config -``` \ No newline at end of file +``` + +## Share Consumer + +To run with a share consumer instead of the regular consumer: +```bash +SHARE=true TESTID= ./run.sh ccloud.config +``` +Requires KIP-932 compatible librdkafka and broker. \ No newline at end of file diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index c82fdc5a0..bf61ff0bb 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -339,7 +339,7 @@ def share_error_cb(self, err): """Share consumer error callback""" self.logger.error("share: error_cb: {}".format(err)) self.share_error_cb_cnt += 1 - self.incr_counter("share.errorcb", 1) + self.incr_counter("consumer.errorcb", 1) def share_status(self): """Print share consumer status""" @@ -384,7 +384,7 @@ def share_run(self): except Exception as ex: self.logger.error("share: consume_batch exception: {}".format(ex)) self.share_err_cnt += 1 - self.incr_counter("share.error", 1) + self.incr_counter("consumer.error", 1) continue if not messages: @@ -394,7 +394,7 @@ def share_run(self): if msg.error() is not None: self.logger.error("share: error: {}".format(msg.error())) self.share_err_cnt += 1 - self.incr_counter("share.error", 1) + self.incr_counter("consumer.error", 1) continue try: @@ -408,10 +408,10 @@ def share_run(self): ) ) self.share_msg_err_cnt += 1 - self.incr_counter("share.msgerr", 1) + self.incr_counter("consumer.msgerr", 1) self.share_msg_cnt += 1 - self.incr_counter("share.msg", 1) + self.incr_counter("consumer.msg", 1) # end-to-end latency headers = dict(msg.headers()) @@ -419,7 +419,7 @@ def share_run(self): if txtime is not None: latency = time.time() - float(txtime) self.set_gauge( - "share.e2e_latency", latency, + "consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())} ) @@ -446,7 +446,7 @@ def share_run(self): ) ) self.share_msg_dup_cnt += (hw + 1) - msg.offset() - self.incr_counter("share.msgdup", 1) + self.incr_counter("consumer.msgdup", 1) elif msg.offset() > hw + 1: self.logger.warning( "share: Lost messages, now at {} " @@ -457,7 +457,7 @@ def share_run(self): ) ) self.share_msg_miss_cnt += msg.offset() - (hw + 1) - self.incr_counter("share.missedmsg", 1) + self.incr_counter("consumer.missedmsg", 1) hwmarks[hwkey] = msg.offset() @@ -642,10 +642,10 @@ def filter_config(conf, filter_out, strip_prefix): self.share_consumer = ShareConsumer(sconf) # Initialize counters to zero - self.incr_counter("share.error", 0) - self.incr_counter("share.msgdup", 0) - self.incr_counter("share.msgerr", 0) - self.incr_counter("share.errorcb", 0) + self.incr_counter("consumer.error", 0) + self.incr_counter("consumer.msgdup", 0) + self.incr_counter("consumer.msgerr", 0) + self.incr_counter("consumer.errorcb", 0) # Create and start share consumer thread self.share_thread = threading.Thread(target=self.share_thread_main) From b816d847454bb3d769f03b80233515cd1bdda46c Mon Sep 17 00:00:00 2001 From: Ankith L Date: Wed, 20 May 2026 18:42:25 +0530 Subject: [PATCH 03/24] Update share consumer method from consume_batch to poll in SoakClient --- tests/soak/soakclient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index bf61ff0bb..329af74e4 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -380,9 +380,9 @@ def share_run(self): next_status = now + self.disprate try: - messages = self.share_consumer.consume_batch(timeout=1.0) + messages = self.share_consumer.poll(timeout=1.0) except Exception as ex: - self.logger.error("share: consume_batch exception: {}".format(ex)) + self.logger.error("share: poll exception: {}".format(ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) continue From 333fdf17098abb50021bdef5f3b69fc2513645f0 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Tue, 2 Jun 2026 17:16:17 +0530 Subject: [PATCH 04/24] Remove statistics configuration for ShareConsumer to prevent crashes --- tests/soak/soakclient.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 329af74e4..cb38b92f3 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -629,8 +629,12 @@ def filter_config(conf, filter_out, strip_prefix): sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb - sconf['stats_cb'] = self.stats_cb - sconf['statistics.interval.ms'] = 120000 + # Share consumer rejects `statistics.interval.ms` (librdkafka + # `dev_kip-932_queues-for-kafka` PR #5469). Strip the stats + # wiring inherited from the parent conf so ShareConsumer() + # doesn't crash with `_INVALID_ARG`. + sconf.pop('statistics.interval.ms', None) + sconf.pop('stats_cb', None) sconf['client.id'] = self.testid # Always set a share-specific group.id. From d59a21a5f385e48f7e1df8e83305e2744e5cef98 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Thu, 4 Jun 2026 22:10:11 +0530 Subject: [PATCH 05/24] Add explicit mode support for ShareConsumer with alternating commit strategy --- tests/soak/run.sh | 7 +++- tests/soak/soakclient.py | 72 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/tests/soak/run.sh b/tests/soak/run.sh index b625c3589..719d89fd4 100755 --- a/tests/soak/run.sh +++ b/tests/soak/run.sh @@ -23,12 +23,17 @@ if [[ "${SHARE:-}" == "true" ]]; then share_flag="--share" fi +explicit_flag="" +if [[ "${EXPLICIT:-}" == "true" ]]; then + explicit_flag="--explicit" +fi + echo "Starting soak client using topic $topic. Logging to $logfile." set +x while [ "$run" = true ]; do # Ignore SIGINT in children (inherited) trap "" SIGINT - time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 $share_flag |& tee /dev/tty | bzip2 > $logfile & + time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 $share_flag $explicit_flag |& tee /dev/tty | bzip2 > $logfile & PID=$! terminate_last() { # List children of $PID only diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index cb38b92f3..2f71b9e9a 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -368,9 +368,17 @@ def share_run(self): # Track highest offset seen per partition for duplicate/gap detection. # With implicit ack and a single share consumer, offsets should - # progress sequentially per partition. + # progress sequentially per partition. Skipped in Explict mode. hwmarks = defaultdict(int) + # In explicit mode, commit every N messages and alternate between + # commit_async and commit_sync. + COMMIT_EVERY_MSGS = 1000 + msgs_since_commit = 0 + commit_use_sync = False + + self.logger.info("share: running in mode={}".format(self.share_mode)) + next_status = time.time() + self.disprate while self.run: @@ -461,6 +469,46 @@ def share_run(self): hwmarks[hwkey] = msg.offset() + # Explicit mode: ack each message with ACCEPT and flush acks + # every N messages via a standalone ShareAcknowledge RPC, + # alternating commit_async and commit_sync. + if self.share_mode == 'explicit': + try: + self.share_consumer.acknowledge(msg) + msgs_since_commit += 1 + except KafkaException as ex: + self.logger.error( + "share: acknowledge failed: {}".format(ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + + if msgs_since_commit >= COMMIT_EVERY_MSGS: + try: + if commit_use_sync: + result = self.share_consumer.commit_sync( + timeout=10.0) + partition_errs = sum( + 1 for err in result.values() + if err is not None + ) + if partition_errs > 0: + self.logger.warning( + "share: commit_sync had {} partition " + "error(s)".format(partition_errs) + ) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + else: + self.share_consumer.commit_async() + except KafkaException as ex: + self.logger.error( + "share: commit_{} exception: {}".format( + "sync" if commit_use_sync else "async", ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + commit_use_sync = not commit_use_sync + msgs_since_commit = 0 + self.share_consumer.close() self.share_status() @@ -541,7 +589,8 @@ def create_topic(self, topic, conf): else: raise - def __init__(self, testid, topic, rate, conf, enable_share=False): + def __init__(self, testid, topic, rate, conf, enable_share=False, + share_mode='implicit'): """SoakClient constructor. conf is the client configuration""" self.topic = topic self.rate = rate @@ -549,6 +598,7 @@ def __init__(self, testid, topic, rate, conf, enable_share=False): self.run = True self.stats_cnt = {'producer': 0, 'consumer': 0} self.start_time = time.time() + self.share_mode = share_mode # OTEL instruments self.counters = {} @@ -637,6 +687,12 @@ def filter_config(conf, filter_out, strip_prefix): sconf.pop('stats_cb', None) sconf['client.id'] = self.testid + # In explicit mode, switch the consumer's ack policy. Default is + # implicit (next poll auto-acks); without this flip, calls to + # acknowledge() return _STATE because the message is already acked. + if self.share_mode == 'explicit': + sconf['share.acknowledgement.mode'] = 'explicit' + # Always set a share-specific group.id. sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( self.hostname, version(), sys.version.split(' ')[0] @@ -771,9 +827,18 @@ def get_rusage(self): '--share', dest='share', action='store_true', default=False, help='Enable share consumer thread' ) + parser.add_argument( + '--explicit', dest='explicit', action='store_true', default=False, + help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)' + ) args = parser.parse_args() + share_mode = 'explicit' if args.explicit else 'implicit' + + if share_mode == 'explicit' and not args.share: + parser.error('--explicit requires --share') + conf = dict() if args.conffile is not None: # Parse client configuration file. @@ -801,7 +866,8 @@ def get_rusage(self): conf['enable.partition.eof'] = False # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share) + soak = SoakClient(args.testid, args.topic, args.rate, conf, + enable_share=args.share, share_mode=share_mode) # Get initial resource usage soak.get_rusage() From f2e37c8e1805dd360132a3bfbd4ee180bbbb6ab5 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Sat, 6 Jun 2026 13:54:35 +0530 Subject: [PATCH 06/24] Refactor explicit mode handling in ShareConsumer to alternate between sync and async commits --- tests/soak/soakclient.py | 58 +++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 2f71b9e9a..570e25e2a 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -29,6 +29,7 @@ import json import logging import os +import random import resource import sys import threading @@ -371,12 +372,6 @@ def share_run(self): # progress sequentially per partition. Skipped in Explict mode. hwmarks = defaultdict(int) - # In explicit mode, commit every N messages and alternate between - # commit_async and commit_sync. - COMMIT_EVERY_MSGS = 1000 - msgs_since_commit = 0 - commit_use_sync = False - self.logger.info("share: running in mode={}".format(self.share_mode)) next_status = time.time() + self.disprate @@ -469,45 +464,40 @@ def share_run(self): hwmarks[hwkey] = msg.offset() - # Explicit mode: ack each message with ACCEPT and flush acks - # every N messages via a standalone ShareAcknowledge RPC, - # alternating commit_async and commit_sync. + # Explicit mode: ack each message with ACCEPT. Commit fires + # once per batch after the for-msg loop below. if self.share_mode == 'explicit': try: self.share_consumer.acknowledge(msg) - msgs_since_commit += 1 except KafkaException as ex: self.logger.error( "share: acknowledge failed: {}".format(ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) - if msgs_since_commit >= COMMIT_EVERY_MSGS: - try: - if commit_use_sync: - result = self.share_consumer.commit_sync( - timeout=10.0) - partition_errs = sum( - 1 for err in result.values() - if err is not None - ) - if partition_errs > 0: - self.logger.warning( - "share: commit_sync had {} partition " - "error(s)".format(partition_errs) - ) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - else: - self.share_consumer.commit_async() - except KafkaException as ex: - self.logger.error( - "share: commit_{} exception: {}".format( - "sync" if commit_use_sync else "async", ex)) + if self.share_mode == 'explicit': + use_sync = random.random() < 0.5 + try: + if use_sync: + result = self.share_consumer.commit_sync(timeout=10.0) + partition_errs = sum( + 1 for err in result.values() if err is not None + ) + if partition_errs > 0: + self.logger.warning( + "share: commit_sync had {} partition error(s)" + .format(partition_errs) + ) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) - commit_use_sync = not commit_use_sync - msgs_since_commit = 0 + else: + self.share_consumer.commit_async() + except KafkaException as ex: + self.logger.error( + "share: commit_{} exception: {}".format( + "sync" if use_sync else "async", ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) self.share_consumer.close() self.share_status() From f99d4694ba256d5a431699f345af547aac00b692 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Thu, 11 Jun 2026 21:43:35 +0530 Subject: [PATCH 07/24] Improve error logging for ShareConsumer commit_sync by detailing partition errors --- tests/soak/soakclient.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 570e25e2a..715d4aa36 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -480,13 +480,14 @@ def share_run(self): try: if use_sync: result = self.share_consumer.commit_sync(timeout=10.0) - partition_errs = sum( - 1 for err in result.values() if err is not None - ) - if partition_errs > 0: + err_details = [ + "{}/{}={}".format(tp.topic, tp.partition, err) + for tp, err in result.items() if err is not None + ] + if err_details: self.logger.warning( - "share: commit_sync had {} partition error(s)" - .format(partition_errs) + "share: commit_sync had {} partition error(s): {}" + .format(len(err_details), "; ".join(err_details)) ) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) From 32fcfa8065a6defae27063850e912030e9d6847d Mon Sep 17 00:00:00 2001 From: Ojasva Jain Date: Fri, 12 Jun 2026 15:27:41 +0530 Subject: [PATCH 08/24] Add script for perf testing with some metrics removed --- tests/soak/soakclient_perf.py | 886 ++++++++++++++++++++++++++++++++++ 1 file changed, 886 insertions(+) create mode 100644 tests/soak/soakclient_perf.py diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py new file mode 100644 index 000000000..3547760b2 --- /dev/null +++ b/tests/soak/soakclient_perf.py @@ -0,0 +1,886 @@ +#!/usr/bin/env python +# +# Copyright 2018 Confluent Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Soak test producer-consumer end-to-end client for +# long term validation testing. +# +# Usage: +# tests/soak/soakclient.py -i -t -r -f [--share] +# +# A unique topic should be used for each soakclient instance. +# + +import argparse +import json +import logging +import os +import random +import resource +import sys +import threading +import time +import traceback +from builtins import int +from collections import defaultdict + +import psutil +from opentelemetry import metrics + +from confluent_kafka import Consumer, KafkaError, KafkaException, Producer, version +from confluent_kafka.admin import AdminClient, NewTopic + +try: + from confluent_kafka import ShareConsumer + HAS_SHARE_CONSUMER = True +except ImportError: + HAS_SHARE_CONSUMER = False + + +class SoakRecord(object): + """A private record type, with JSON serializer and deserializer""" + + def __init__(self, msgid, name=None): + self.msgid = msgid + if name is None: + self.name = "SoakRecord nr #{}".format(self.msgid) + else: + self.name = name + + def serialize(self): + return json.dumps(self, default=lambda o: o.__dict__) + + def __str__(self): + return "SoakRecord({})".format(self.name) + + @classmethod + def deserialize(cls, binstr): + d = json.loads(binstr) + return SoakRecord(d['msgid'], d['name']) + + +class SoakClient(object): + """The SoakClient consists of a Producer sending messages at + the given rate, and a Consumer or ShareConsumer consuming the messages. + Both clients print their message and error counters every 10 seconds. + The producer and consumer run in separate background threads. + """ + + # metric name prefix + METRIC_PFX = "kafka.client.soak.python." + + def dr_cb(self, err, msg): + """Producer delivery report callback""" + if err is not None: + self.logger.warning("producer: delivery failed: {} [{}]: {}".format(msg.topic(), msg.partition(), err)) + self.dr_err_cnt += 1 + self.incr_counter("producer.drerr", 1) + self.incr_counter( + "producer.delivery.failure", + 1, + {"topic": msg.topic(), "partition": str(msg.partition()), "err": str(err)}, + ) + + else: + self.dr_cnt += 1 + self.incr_counter("producer.drok", 1) + # perf: per-message gauge disabled to reach high throughput. + # self.set_gauge("producer.latency", msg.latency(), tags={"partition": "{}".format(msg.partition())}) + if (self.dr_cnt % self.disprate) == 0: + self.logger.debug( + "producer: delivered message to {} [{}] at offset {} in {}s".format( + msg.topic(), msg.partition(), msg.offset(), msg.latency() + ) + ) + + def produce_record(self): + """Asynchronously produce a single record, but block and + and retry if buffer is full.""" + record = SoakRecord(self.producer_msgid) + + txcnt = 0 + while True: + txcnt += 1 + + try: + self.producer.produce( + self.topic, + value=record.serialize(), + # perf: "time" header dropped — it only fed the per-message + # e2e latency gauge, which is now disabled. str(time.time()) + # was a syscall + float->str on every produced message. + headers={"msgid": str(record.msgid), "txcnt": str(txcnt)}, + on_delivery=self.dr_cb, + ) + break + + except BufferError: + self.producer.poll(1) + continue + + self.producer_msgid += 1 + self.incr_counter("producer.send", 1) + + def producer_status(self): + """Print producer status""" + self.logger.info( + "producer: {} messages produced, {} delivered, {} failed, {} error_cbs".format( + self.producer_msgid, self.dr_cnt, self.dr_err_cnt, self.producer_error_cb_cnt + ) + ) + + def producer_run(self): + """Producer main loop""" + sleep_intvl = 1.0 / self.rate + + self.producer_msgid = 0 + self.dr_cnt = 0 + self.dr_err_cnt = 0 + self.producer_error_cb_cnt = 0 + + next_status = time.time() + self.disprate + + while self.run: + + # Produce a single record + self.produce_record() + + # Enforce message rate by polling until interval is exceeded. + now = time.time() + t_end = now + sleep_intvl + while True: + if now > next_status: + # Print status + self.producer_status() + next_status = now + self.disprate + + remaining_time = t_end - now + if remaining_time < 0: + remaining_time = 0 + self.producer.poll(remaining_time) + if remaining_time <= 0: + break + now = time.time() + + # Wait for outstanding messages to be delivered. + remaining = self.producer.flush(30) + self.logger.warning("producer: {} message(s) remaining in queue after flush()".format(remaining)) + self.producer_status() + + def producer_thread_main(self): + """Producer thread main function""" + try: + self.producer_run() + except KeyboardInterrupt: + self.logger.info("producer: aborted by user") + self.run = False + except Exception as ex: + self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.print_exc())) + self.run = False + + def consumer_status(self): + """Print consumer status""" + self.logger.info( + "consumer: {} messages consumed, {} duplicates, " + "{} missed, {} message errors, {} consumer errors, {} error_cbs".format( + self.msg_cnt, + self.msg_dup_cnt, + self.msg_miss_cnt, + self.msg_err_cnt, + self.consumer_err_cnt, + self.consumer_error_cb_cnt, + ) + ) + + def consumer_run(self): + """Consumer main loop""" + self.consumer_msgid_next = 0 + + self.consumer.subscribe([self.topic]) + + self.msg_cnt = 0 + self.msg_dup_cnt = 0 + self.msg_miss_cnt = 0 + self.msg_err_cnt = 0 + self.consumer_err_cnt = 0 + self.consumer_error_cb_cnt = 0 + self.last_commited = None + + # Keep track of high-watermarks to make sure we don't go backwards + hwmarks = defaultdict(int) + + next_status = time.time() + self.disprate + + while self.run: + + now = time.time() + if now > next_status: + # Print status + self.consumer_status() + next_status = now + self.disprate + + msg = self.consumer.poll(1) + if msg is None: + continue + + if msg.error() is not None: + self.logger.error("consumer: error: {}".format(msg.error())) + self.consumer_err_cnt += 1 + self.incr_counter("consumer.error", 1) + continue + + try: + # Deserialize message + record = SoakRecord.deserialize(msg.value()) # noqa unused variable + except ValueError as ex: + self.logger.info( + "consumer: Failed to deserialize message in " + "{} [{}] at offset {} (headers {}): {}".format( + msg.topic(), msg.partition(), msg.offset(), msg.headers(), ex + ) + ) + self.msg_err_cnt += 1 + self.incr_counter("consumer.msgerr", 1) + + self.msg_cnt += 1 + self.incr_counter("consumer.msg", 1) + + # perf: per-message end-to-end latency gauge disabled to reach + # high throughput (rebuilt a dict + gauge per message). + # headers = dict(msg.headers()) + # txtime = headers.get('time', None) + # if txtime is not None: + # latency = time.time() - float(txtime) + # self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) + # else: + # latency = None + latency = None + + if (self.msg_cnt % self.disprate) == 0: + # Show a sample message every #disprate messages + self.logger.info( + "consumer: {} messages consumed: Message {} " + "[{}] at offset {} (headers {}, latency {})".format( + self.msg_cnt, msg.topic(), msg.partition(), msg.offset(), msg.headers(), latency + ) + ) + + # Keep track of consumer's highwater mark for each partition, + # to identify duplicates and lost messages. + hwkey = "{}-{}".format(msg.topic(), msg.partition()) + hw = hwmarks[hwkey] + + if hw > 0: + if msg.offset() <= hw: + self.logger.warning( + "consumer: Old or duplicate message {} " + "[{}] at offset {} (headers {}): wanted offset > {} (last commited {})".format( + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw, self.last_committed + ) + ) + self.msg_dup_cnt += (hw + 1) - msg.offset() + self.incr_counter("consumer.msgdup", 1) + elif msg.offset() > hw + 1: + self.logger.warning( + "consumer: Lost messages, now at {} " + "[{}] at offset {} (headers {}): " + "expected offset {}+1 (last committed {})".format( + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw, self.last_committed + ) + ) + self.msg_miss_cnt += msg.offset() - (hw + 1) + self.incr_counter("consumer.missedmsg", 1) + + hwmarks[hwkey] = msg.offset() + + self.consumer.close() + self.consumer_status() + + def consumer_thread_main(self): + """Consumer thread main function""" + try: + self.consumer_run() + except KeyboardInterrupt: + self.logger.info("consumer: aborted by user") + self.run = False + except Exception as ex: + self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.print_exc())) + self.run = False + + def consumer_error_cb(self, err): + """Consumer error callback""" + self.logger.error("consumer: error_cb: {}".format(err)) + self.consumer_error_cb_cnt += 1 + self.incr_counter("consumer.errorcb", 1) + + def consumer_commit_cb(self, err, partitions): + """Auto commit result callback""" + if err is not None: + self.logger.error("consumer: offset commit failed for {}: {}".format(partitions, err)) + self.consumer_err_cnt += 1 + self.incr_counter("consumer.error", 1) + else: + self.last_committed = partitions + + def producer_error_cb(self, err): + """Producer error callback""" + self.logger.error("producer: error_cb: {}".format(err)) + self.producer_error_cb_cnt += 1 + self.incr_counter("producer.errorcb", 1) + + def share_error_cb(self, err): + """Share consumer error callback""" + self.logger.error("share: error_cb: {}".format(err)) + self.share_error_cb_cnt += 1 + self.incr_counter("consumer.errorcb", 1) + + def share_status(self): + """Print share consumer status""" + self.logger.info( + "share: {} messages consumed, {} duplicates, " + "{} missed, {} message errors, {} consumer errors, {} error_cbs".format( + self.share_msg_cnt, + self.share_msg_dup_cnt, + self.share_msg_miss_cnt, + self.share_msg_err_cnt, + self.share_err_cnt, + self.share_error_cb_cnt, + ) + ) + + def share_run(self): + """Share consumer main loop""" + self.share_consumer.subscribe([self.topic]) + + self.share_msg_cnt = 0 + self.share_msg_dup_cnt = 0 + self.share_msg_miss_cnt = 0 + self.share_msg_err_cnt = 0 + self.share_err_cnt = 0 + self.share_error_cb_cnt = 0 + + # Track highest offset seen per partition for duplicate/gap detection. + # With implicit ack and a single share consumer, offsets should + # progress sequentially per partition. Skipped in Explict mode. + hwmarks = defaultdict(int) + + self.logger.info("share: running in mode={}".format(self.share_mode)) + + next_status = time.time() + self.disprate + + while self.run: + now = time.time() + if now > next_status: + self.share_status() + next_status = now + self.disprate + + try: + messages = self.share_consumer.poll(timeout=1.0) + except Exception as ex: + self.logger.error("share: poll exception: {}".format(ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + continue + + if not messages: + continue + + for msg in messages: + if msg.error() is not None: + self.logger.error("share: error: {}".format(msg.error())) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + continue + + try: + record = SoakRecord.deserialize(msg.value()) # noqa unused variable + except ValueError as ex: + self.logger.info( + "share: Failed to deserialize message in " + "{} [{}] at offset {} (headers {}): {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), ex + ) + ) + self.share_msg_err_cnt += 1 + self.incr_counter("consumer.msgerr", 1) + + self.share_msg_cnt += 1 + self.incr_counter("consumer.msg", 1) + + # perf: per-message end-to-end latency gauge disabled to reach + # high throughput (rebuilt a dict + gauge per message). + # headers = dict(msg.headers()) + # txtime = headers.get('time', None) + # if txtime is not None: + # latency = time.time() - float(txtime) + # self.set_gauge( + # "consumer.e2e_latency", latency, + # tags={"partition": "{}".format(msg.partition())} + # ) + + if (self.share_msg_cnt % self.disprate) == 0: + self.logger.info( + "share: {} messages consumed: Message {} " + "[{}] at offset {} (headers {})".format( + self.share_msg_cnt, msg.topic(), + msg.partition(), msg.offset(), msg.headers() + ) + ) + + # Track per-partition high-water mark for duplicate/gap detection + hwkey = "{}-{}".format(msg.topic(), msg.partition()) + hw = hwmarks[hwkey] + + if hw > 0: + if msg.offset() <= hw: + self.logger.warning( + "share: Old or duplicate message {} " + "[{}] at offset {} (headers {}): wanted offset > {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) + ) + self.share_msg_dup_cnt += (hw + 1) - msg.offset() + self.incr_counter("consumer.msgdup", 1) + elif msg.offset() > hw + 1: + self.logger.warning( + "share: Lost messages, now at {} " + "[{}] at offset {} (headers {}): " + "expected offset {}+1".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) + ) + self.share_msg_miss_cnt += msg.offset() - (hw + 1) + self.incr_counter("consumer.missedmsg", 1) + + hwmarks[hwkey] = msg.offset() + + # Explicit mode: ack each message with ACCEPT. Commit fires + # once per batch after the for-msg loop below. + if self.share_mode == 'explicit': + try: + self.share_consumer.acknowledge(msg) + except KafkaException as ex: + self.logger.error( + "share: acknowledge failed: {}".format(ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + + if self.share_mode == 'explicit': + use_sync = random.random() < 0.5 + try: + if use_sync: + result = self.share_consumer.commit_sync(timeout=10.0) + partition_errs = sum( + 1 for err in result.values() if err is not None + ) + if partition_errs > 0: + self.logger.warning( + "share: commit_sync had {} partition error(s)" + .format(partition_errs) + ) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + else: + self.share_consumer.commit_async() + except KafkaException as ex: + self.logger.error( + "share: commit_{} exception: {}".format( + "sync" if use_sync else "async", ex)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + + self.share_consumer.close() + self.share_status() + + def share_thread_main(self): + """Share consumer thread main function""" + try: + self.share_run() + except KeyboardInterrupt: + self.logger.info("share: aborted by user") + self.run = False + except Exception as ex: + self.logger.fatal("share: fatal exception: {}\n{}".format( + ex, traceback.print_exc())) + self.run = False + + def rtt_stats(self, d): + """Extract broker rtt statistics from the stats dict in @param d""" + + # Get leader RTT stats + for broker in d['brokers'].values(): + if broker['toppars'] is None: + continue + + parts = ','.join([str(x['partition']) for x in broker['toppars'].values()]) + + tags = { + "broker": "{}".format(broker['nodeid']), + "partitions": "{}".format(parts), + "type": "{}".format(d['type']), + } + + self.set_gauge("broker.rtt.p99", float(broker['rtt']['p99']) / 1000000.0, tags=tags) + self.set_gauge("broker.rtt.avg", float(broker['rtt']['avg']) / 1000000.0, tags=tags) + + def stats_cb(self, json_str): + """Common statistics callback.""" + d = json.loads(json_str) + + # Print number of connected brokers to monitor + # the sparse connection functionality. + brokers = d['brokers'] + broker_cnt = len(brokers) + up_brokers = [brokers[x]['name'] for x in brokers if brokers[x]['state'] == 'UP'] + if self.topic in d['topics']: + leaders = [ + '{}={}'.format(p['partition'], p['leader']) + for p in d['topics'][self.topic]['partitions'].values() + if p['partition'] != -1 + ] + else: + leaders = [] + self.logger.info( + "{} stats: {}/{} brokers UP, {} partition leaders: {}".format( + d['name'], len(up_brokers), broker_cnt, self.topic, leaders + ) + ) + + # Emit the full raw stats for troubleshooting. + self.stats_cnt[d['type']] += 1 + self.logger.info("{} raw stats: {}".format(d['name'], json_str)) + + self.rtt_stats(d) + + # Sample the producer queue length + if d['type'] == 'producer': + self.set_gauge("producer.outq", len(self.producer)) + + def create_topic(self, topic, conf): + """Create the topic if it doesn't already exist""" + admin = AdminClient(conf) + fs = admin.create_topics([NewTopic(topic, num_partitions=2, replication_factor=3)]) + f = fs[topic] + try: + res = f.result() # noqa unused variable + except KafkaException as ex: + if ex.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: + self.logger.info("Topic {} already exists: good".format(topic)) + else: + raise + + def __init__(self, testid, topic, rate, conf, enable_share=False, + share_mode='implicit'): + """SoakClient constructor. conf is the client configuration""" + self.topic = topic + self.rate = rate + self.disprate = int(rate * 10) + self.run = True + self.stats_cnt = {'producer': 0, 'consumer': 0} + self.start_time = time.time() + self.share_mode = share_mode + + # OTEL instruments + self.counters = {} + self.gauges = {} + self.gauge_cbs = {} + self.gauge_values = {} + + self.last_rusage = None + self.last_rusage_time = None + self.proc = psutil.Process(os.getpid()) + + self.logger = logging.getLogger('soakclient') + self.logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s')) + self.logger.addHandler(handler) + + # Construct a unique id to use for metrics hostname so that + # multiple instances of the SoakClient can run on the same machine. + hostname = os.environ["HOSTNAME"] + self.hostname = "py-{}-{}".format(hostname, self.topic) + self.testid = testid + self.meter = metrics.get_meter("njc.python.soak.tests") + + self.logger.info("SoakClient id {}".format(self.hostname)) + + if 'group.id' not in conf: + # Generate a group.id bound to this client and python version + conf['group.id'] = 'soakclient-{}-{}-{}'.format(self.hostname, version(), sys.version.split(' ')[0]) + + conf = {k: v for k, v in conf.items()} + + def filter_config(conf, filter_out, strip_prefix): + len_sp = len(strip_prefix) + out = {} + for k, v in conf.items(): + if len([x for x in filter_out if k.startswith(x)]) > 0: + continue + if k.startswith(strip_prefix): + k = k[len_sp:] + out[k] = v + return out + + # Create topic (might already exist) + aconf = filter_config(conf, ["consumer.", "producer.", "share."], "admin.") + aconf['client.id'] = self.testid + self.create_topic(self.topic, aconf) + + # + # Create Producer and Consumer/ShareConsumer, each in its own thread. + # + conf['stats_cb'] = self.stats_cb + conf['statistics.interval.ms'] = 120000 + + # Producer + pconf = filter_config(conf, ["consumer.", "admin.", "share."], "producer.") + pconf['error_cb'] = self.producer_error_cb + pconf['client.id'] = self.testid + self.producer = Producer(pconf) + + self.incr_counter("producer.errorcb", 0) + + # Create and start producer thread + self.producer_thread = threading.Thread(target=self.producer_thread_main) + self.producer_thread.start() + + self.consumer = None + self.consumer_thread = None + self.share_consumer = None + self.share_thread = None + + if enable_share: + if not HAS_SHARE_CONSUMER: + raise RuntimeError( + "ShareConsumer requested but not available in this " + "confluent_kafka build." + ) + + sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") + sconf['error_cb'] = self.share_error_cb + # Share consumer rejects `statistics.interval.ms` (librdkafka + # `dev_kip-932_queues-for-kafka` PR #5469). Strip the stats + # wiring inherited from the parent conf so ShareConsumer() + # doesn't crash with `_INVALID_ARG`. + sconf.pop('statistics.interval.ms', None) + sconf.pop('stats_cb', None) + sconf['client.id'] = self.testid + + # In explicit mode, switch the consumer's ack policy. Default is + # implicit (next poll auto-acks); without this flip, calls to + # acknowledge() return _STATE because the message is already acked. + if self.share_mode == 'explicit': + sconf['share.acknowledgement.mode'] = 'explicit' + + # Always set a share-specific group.id. + sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( + self.hostname, version(), sys.version.split(' ')[0] + ) + + self.logger.info("share: using group.id {}".format(sconf['group.id'])) + self.share_consumer = ShareConsumer(sconf) + + # Initialize counters to zero + self.incr_counter("consumer.error", 0) + self.incr_counter("consumer.msgdup", 0) + self.incr_counter("consumer.msgerr", 0) + self.incr_counter("consumer.errorcb", 0) + + # Create and start share consumer thread + self.share_thread = threading.Thread(target=self.share_thread_main) + self.share_thread.start() + else: + # Consumer + cconf = filter_config(conf, ["producer.", "admin.", "share."], "consumer.") + cconf['error_cb'] = self.consumer_error_cb + cconf['on_commit'] = self.consumer_commit_cb + self.logger.info("consumer: using group.id {}".format(cconf['group.id'])) + cconf['client.id'] = self.testid + self.consumer = Consumer(cconf) + + # Initialize some counters to zero to make them appear in the metrics + self.incr_counter("consumer.error", 0) + self.incr_counter("consumer.msgdup", 0) + + # Create and start consumer thread + self.consumer_thread = threading.Thread(target=self.consumer_thread_main) + self.consumer_thread.start() + + def terminate(self): + """Terminate Producer and Consumer/Share Consumer""" + soak.logger.info("Terminating (ran for {}s)".format(time.time() - self.start_time)) + self.run = False + # Wait for background threads to finish. + self.producer_thread.join() + if self.share_thread is not None: + self.share_thread.join() + else: + self.consumer_thread.join() + + # Final resource usage + soak.get_rusage() + + def incr_counter(self, metric_name, incrval, tags=None): + """Increment metric counter by incrval""" + if not tags: + tags = {} + tags.update({"host": self.hostname, "testid": self.testid}) + + full_metric_name = self.METRIC_PFX + metric_name + if full_metric_name not in self.counters: + self.counters[full_metric_name] = self.meter.create_counter( + full_metric_name, + description=full_metric_name, + ) + counter = self.counters[full_metric_name] + counter.add(incrval, tags) + + def set_gauge(self, metric_name, val, tags=None): + """Set metric gauge to val""" + if not tags: + tags = {} + tags.update({"host": self.hostname, "testid": self.testid}) + + full_metric_name = self.METRIC_PFX + metric_name + if full_metric_name not in self.gauge_values: + self.gauge_values[full_metric_name] = [] + + self.gauge_values[full_metric_name].append([val, tags]) + + if full_metric_name not in self.gauges: + + def cb(_): + for value in self.gauge_values[full_metric_name]: + yield metrics.Observation(value[0], value[1]) + self.gauge_values[full_metric_name] = [] + + self.gauge_cbs[full_metric_name] = cb + self.gauges[full_metric_name] = self.meter.create_observable_gauge( + callbacks=[self.gauge_cbs[full_metric_name]], name=full_metric_name, description=full_metric_name + ) + + def calc_rusage_deltas(self, curr, prev, elapsed): + """Calculate deltas between previous and current resource usage""" + + # User CPU % + user_cpu = ((curr.ru_utime - prev.ru_utime) / elapsed) * 100.0 + self.set_gauge("cpu.user", user_cpu) + + # System CPU % + sys_cpu = ((curr.ru_stime - prev.ru_stime) / elapsed) * 100.0 + self.set_gauge("cpu.system", sys_cpu) + + # Max RSS memory (monotonic) + max_rss = curr.ru_maxrss / 1024.0 + self.set_gauge("memory.rss.max", max_rss) + + self.logger.info("User CPU: {:.1f}%, System CPU: {:.1f}%, MaxRSS {:.3f}MiB".format(user_cpu, sys_cpu, max_rss)) + + def get_rusage(self): + """Get resource usage and calculate CPU load, etc""" + ru = resource.getrusage(resource.RUSAGE_SELF) + now = time.time() + + if self.last_rusage is not None: + self.calc_rusage_deltas(ru, self.last_rusage, now - self.last_rusage_time) + + self.last_rusage = ru + self.last_rusage_time = now + + # Current RSS memory + rss = float(self.proc.memory_info().rss) / (1024.0 * 1024.0) + self.set_gauge("memory.rss", rss) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Kafka client soak test') + parser.add_argument('-i', dest='testid', type=str, required=True, help='Test id') + parser.add_argument('-b', dest='brokers', type=str, default=None, help='Bootstrap servers') + parser.add_argument('-t', dest='topic', type=str, required=True, help='Topic to use') + parser.add_argument('-r', dest='rate', type=float, default=10, help='Message produce rate per second') + parser.add_argument( + '-f', dest='conffile', type=argparse.FileType('r'), help='Configuration file (configprop=value format)' + ) + parser.add_argument( + '--share', dest='share', action='store_true', default=False, + help='Enable share consumer thread' + ) + parser.add_argument( + '--explicit', dest='explicit', action='store_true', default=False, + help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)' + ) + + args = parser.parse_args() + + share_mode = 'explicit' if args.explicit else 'implicit' + + if share_mode == 'explicit' and not args.share: + parser.error('--explicit requires --share') + + conf = dict() + if args.conffile is not None: + # Parse client configuration file. + # Standard "key=value" format. + for line in args.conffile: + line = line.strip() + if len(line) == 0 or line[0] == '#': + continue + + i = line.find('=') + if i <= 0: + raise ValueError("Configuration lines must be `name=value..`, not {}".format(line)) + + name = line[:i] + value = line[i + 1 :] + + conf[name] = value + + if args.brokers is not None: + # Overwrite any brokers specified in configuration file with + # brokers from -b command line argument + conf['bootstrap.servers'] = args.brokers + + # We don't care about partition EOFs + conf['enable.partition.eof'] = False + + # Create SoakClient + soak = SoakClient(args.testid, args.topic, args.rate, conf, + enable_share=args.share, share_mode=share_mode) + + # Get initial resource usage + soak.get_rusage() + + # Run until interrupted + try: + while soak.run: + time.sleep(10) + soak.get_rusage() + + soak.logger.info("Soak client aborted") + + except KeyboardInterrupt: + soak.logger.info("Interrupted by user") + except Exception as ex: + soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.print_exc())) + + # Terminate + soak.terminate() From 613c318fc5ff9e4bff24bf92ded0e8a7da87780a Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 12 Jun 2026 16:21:45 +0530 Subject: [PATCH 09/24] Enhance SoakRecord name generation to pad with base label for consistent serialized value size --- tests/soak/soakclient_perf.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 3547760b2..6b12f3fea 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -54,10 +54,17 @@ class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" + # Target serialized value size in bytes. The `name` field is padded by + # repeating the base label until the JSON payload reaches roughly this + # size, so 10k msg/s ~= 10 MB/s of realistic throughput. + TARGET_VALUE_SIZE = 1024 + def __init__(self, msgid, name=None): self.msgid = msgid if name is None: - self.name = "SoakRecord nr #{}".format(self.msgid) + base = "SoakRecord nr #{}".format(self.msgid) + repeat = max(1, self.TARGET_VALUE_SIZE // (len(base) + 1)) + self.name = " ".join([base] * repeat) else: self.name = name From 740df42b087b8aaec7e1ef6e63346027869b688b Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 12 Jun 2026 16:46:12 +0530 Subject: [PATCH 10/24] Refactor SoakRecord name generation to use static padding for consistent serialized value size --- tests/soak/soakclient_perf.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 6b12f3fea..f654e997e 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -54,22 +54,23 @@ class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" - # Target serialized value size in bytes. The `name` field is padded by - # repeating the base label until the JSON payload reaches roughly this - # size, so 10k msg/s ~= 10 MB/s of realistic throughput. - TARGET_VALUE_SIZE = 1024 + # Static padding built once at class load. Repeats a JSON-safe base label + # (letters, digits, spaces, '#') so the final value lands at ~1KB without + # rebuilding the string on every produce_record() call. + _PAD = (" SoakRecord nr #0" * 60)[:1000] def __init__(self, msgid, name=None): self.msgid = msgid if name is None: - base = "SoakRecord nr #{}".format(self.msgid) - repeat = max(1, self.TARGET_VALUE_SIZE // (len(base) + 1)) - self.name = " ".join([base] * repeat) + self.name = "SoakRecord nr #{}{}".format(msgid, self._PAD) else: self.name = name def serialize(self): - return json.dumps(self, default=lambda o: o.__dict__) + # Bytes formatting is ~3-5x faster than json.dumps for this shape. + # name only contains JSON-safe chars so no escaping is needed. + return b'{"msgid":%d,"name":"%s"}' % ( + self.msgid, self.name.encode('ascii')) def __str__(self): return "SoakRecord({})".format(self.name) From ce03c953b4b60168267659bcd4bc5cd6559b8563 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 12 Jun 2026 17:15:14 +0530 Subject: [PATCH 11/24] Optimize performance by batching OTEL counter updates every 1000 messages in SoakClient --- tests/soak/soakclient_perf.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index f654e997e..00f1d2841 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -105,7 +105,11 @@ def dr_cb(self, err, msg): else: self.dr_cnt += 1 - self.incr_counter("producer.drok", 1) + # perf: batch OTEL counter updates every 1000 messages to avoid + # per-msg SDK overhead (~20-50us/call). Grafana rate calcs are + # unaffected since window >>0.5s. + if self.dr_cnt % 1000 == 0: + self.incr_counter("producer.drok", 1000) # perf: per-message gauge disabled to reach high throughput. # self.set_gauge("producer.latency", msg.latency(), tags={"partition": "{}".format(msg.partition())}) if (self.dr_cnt % self.disprate) == 0: @@ -141,7 +145,9 @@ def produce_record(self): continue self.producer_msgid += 1 - self.incr_counter("producer.send", 1) + # perf: batch OTEL counter updates every 1000 messages. + if self.producer_msgid % 1000 == 0: + self.incr_counter("producer.send", 1000) def producer_status(self): """Print producer status""" @@ -265,7 +271,9 @@ def consumer_run(self): self.incr_counter("consumer.msgerr", 1) self.msg_cnt += 1 - self.incr_counter("consumer.msg", 1) + # perf: batch OTEL counter updates every 1000 messages. + if self.msg_cnt % 1000 == 0: + self.incr_counter("consumer.msg", 1000) # perf: per-message end-to-end latency gauge disabled to reach # high throughput (rebuilt a dict + gauge per message). @@ -428,7 +436,9 @@ def share_run(self): self.incr_counter("consumer.msgerr", 1) self.share_msg_cnt += 1 - self.incr_counter("consumer.msg", 1) + # perf: batch OTEL counter updates every 1000 messages. + if self.share_msg_cnt % 1000 == 0: + self.incr_counter("consumer.msg", 1000) # perf: per-message end-to-end latency gauge disabled to reach # high throughput (rebuilt a dict + gauge per message). From 6e384c998874df941e2cf5fea6877277de42f1ba Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 12 Jun 2026 17:50:11 +0530 Subject: [PATCH 12/24] Disable on_delivery callback in SoakClient to reduce overhead during message production --- tests/soak/soakclient_perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 00f1d2841..91dc215ea 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -136,7 +136,7 @@ def produce_record(self): # e2e latency gauge, which is now disabled. str(time.time()) # was a syscall + float->str on every produced message. headers={"msgid": str(record.msgid), "txcnt": str(txcnt)}, - on_delivery=self.dr_cb, + # on_delivery=self.dr_cb, ) break From 604789ffbd7532598ee09a795313ae6f73326c96 Mon Sep 17 00:00:00 2001 From: Ojasva Jain Date: Fri, 12 Jun 2026 18:18:37 +0530 Subject: [PATCH 13/24] Make poll non blocking in producer + batch multiple produce calls --- tests/soak/soakclient_perf.py | 43 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 91dc215ea..90ed34b53 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -159,7 +159,14 @@ def producer_status(self): def producer_run(self): """Producer main loop""" - sleep_intvl = 1.0 / self.rate + # perf: produce in batches and pace per-batch instead of one + # produce()+blocking poll() per message. At high rates the per-message + # poll() floor (~hundreds of us) capped throughput well below -r; pacing + # a batch lets us sleep a duration the OS can actually honor. -r + # (self.rate) remains the target rate (an upper bound: if a batch takes + # longer than its time budget, we run as fast as we can, below -r). + batch = max(1, int(self.rate / 100)) # ~100 batches/sec + batch_intvl = batch / self.rate # seconds budgeted per batch self.producer_msgid = 0 self.dr_cnt = 0 @@ -169,26 +176,24 @@ def producer_run(self): next_status = time.time() + self.disprate while self.run: + t_start = time.time() - # Produce a single record - self.produce_record() + # Produce a batch of records. + for _ in range(batch): + self.produce_record() - # Enforce message rate by polling until interval is exceeded. - now = time.time() - t_end = now + sleep_intvl - while True: - if now > next_status: - # Print status - self.producer_status() - next_status = now + self.disprate - - remaining_time = t_end - now - if remaining_time < 0: - remaining_time = 0 - self.producer.poll(remaining_time) - if remaining_time <= 0: - break - now = time.time() + # Serve the producer queue (error_cb, any reports) without blocking. + self.producer.poll(0) + + if t_start > next_status: + # Print status + self.producer_status() + next_status = t_start + self.disprate + + # Sleep off the remainder of this batch's time budget to honor -r. + remaining_time = batch_intvl - (time.time() - t_start) + if remaining_time > 0: + time.sleep(remaining_time) # Wait for outstanding messages to be delivered. remaining = self.producer.flush(30) From d3dfea7c81a5ad5ab4b5abf29a8d724732b6c97a Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 19 Jun 2026 20:57:46 +0530 Subject: [PATCH 14/24] Optimize performance by updating OTEL counters per message and re-enabling per-message latency gauge in SoakClient --- tests/soak/soakclient_perf.py | 73 ++++++++++++++--------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 90ed34b53..2b43afa9f 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -105,13 +105,9 @@ def dr_cb(self, err, msg): else: self.dr_cnt += 1 - # perf: batch OTEL counter updates every 1000 messages to avoid - # per-msg SDK overhead (~20-50us/call). Grafana rate calcs are - # unaffected since window >>0.5s. - if self.dr_cnt % 1000 == 0: - self.incr_counter("producer.drok", 1000) - # perf: per-message gauge disabled to reach high throughput. - # self.set_gauge("producer.latency", msg.latency(), tags={"partition": "{}".format(msg.partition())}) + self.incr_counter("producer.drok", 1) + self.set_gauge("producer.latency", msg.latency(), + tags={"partition": "{}".format(msg.partition())}) if (self.dr_cnt % self.disprate) == 0: self.logger.debug( "producer: delivered message to {} [{}] at offset {} in {}s".format( @@ -132,11 +128,12 @@ def produce_record(self): self.producer.produce( self.topic, value=record.serialize(), - # perf: "time" header dropped — it only fed the per-message - # e2e latency gauge, which is now disabled. str(time.time()) - # was a syscall + float->str on every produced message. - headers={"msgid": str(record.msgid), "txcnt": str(txcnt)}, - # on_delivery=self.dr_cb, + headers={ + "msgid": str(record.msgid), + "txcnt": str(txcnt), + "time": str(time.time()), + }, + on_delivery=self.dr_cb, ) break @@ -145,9 +142,7 @@ def produce_record(self): continue self.producer_msgid += 1 - # perf: batch OTEL counter updates every 1000 messages. - if self.producer_msgid % 1000 == 0: - self.incr_counter("producer.send", 1000) + self.incr_counter("producer.send", 1) def producer_status(self): """Print producer status""" @@ -276,20 +271,16 @@ def consumer_run(self): self.incr_counter("consumer.msgerr", 1) self.msg_cnt += 1 - # perf: batch OTEL counter updates every 1000 messages. - if self.msg_cnt % 1000 == 0: - self.incr_counter("consumer.msg", 1000) - - # perf: per-message end-to-end latency gauge disabled to reach - # high throughput (rebuilt a dict + gauge per message). - # headers = dict(msg.headers()) - # txtime = headers.get('time', None) - # if txtime is not None: - # latency = time.time() - float(txtime) - # self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) - # else: - # latency = None - latency = None + self.incr_counter("consumer.msg", 1) + + headers = dict(msg.headers()) + txtime = headers.get('time', None) + if txtime is not None: + latency = time.time() - float(txtime) + self.set_gauge("consumer.e2e_latency", latency, + tags={"partition": "{}".format(msg.partition())}) + else: + latency = None if (self.msg_cnt % self.disprate) == 0: # Show a sample message every #disprate messages @@ -441,20 +432,16 @@ def share_run(self): self.incr_counter("consumer.msgerr", 1) self.share_msg_cnt += 1 - # perf: batch OTEL counter updates every 1000 messages. - if self.share_msg_cnt % 1000 == 0: - self.incr_counter("consumer.msg", 1000) - - # perf: per-message end-to-end latency gauge disabled to reach - # high throughput (rebuilt a dict + gauge per message). - # headers = dict(msg.headers()) - # txtime = headers.get('time', None) - # if txtime is not None: - # latency = time.time() - float(txtime) - # self.set_gauge( - # "consumer.e2e_latency", latency, - # tags={"partition": "{}".format(msg.partition())} - # ) + self.incr_counter("consumer.msg", 1) + + headers = dict(msg.headers()) + txtime = headers.get('time', None) + if txtime is not None: + latency = time.time() - float(txtime) + self.set_gauge( + "consumer.e2e_latency", latency, + tags={"partition": "{}".format(msg.partition())} + ) if (self.share_msg_cnt % self.disprate) == 0: self.logger.info( From 6c92bcc1a6898c56e0bf8fd9de626b05648f50b9 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Fri, 19 Jun 2026 22:27:27 +0530 Subject: [PATCH 15/24] Increase static padding in SoakRecord for consistent serialized value size --- tests/soak/soakclient_perf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 2b43afa9f..aa652664a 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -55,9 +55,9 @@ class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" # Static padding built once at class load. Repeats a JSON-safe base label - # (letters, digits, spaces, '#') so the final value lands at ~1KB without + # (letters, digits, spaces, '#') so the final value lands at ~4KB without # rebuilding the string on every produce_record() call. - _PAD = (" SoakRecord nr #0" * 60)[:1000] + _PAD = (" SoakRecord nr #0" * 240)[:4050] def __init__(self, msgid, name=None): self.msgid = msgid From 03530134983cb50459c44e9fba1aa910532e39ab Mon Sep 17 00:00:00 2001 From: Ankith L Date: Sat, 20 Jun 2026 00:09:05 +0530 Subject: [PATCH 16/24] Increase static padding in SoakRecord for improved serialized value size --- tests/soak/soakclient_perf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index aa652664a..7bc64af15 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -55,9 +55,9 @@ class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" # Static padding built once at class load. Repeats a JSON-safe base label - # (letters, digits, spaces, '#') so the final value lands at ~4KB without + # (letters, digits, spaces, '#') so the final value lands at ~10KB without # rebuilding the string on every produce_record() call. - _PAD = (" SoakRecord nr #0" * 240)[:4050] + _PAD = (" SoakRecord nr #0" * 600)[:10200] def __init__(self, msgid, name=None): self.msgid = msgid From 4a32ba17438c2d75a2556db966d2e4ce5826e210 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Tue, 23 Jun 2026 22:08:36 +0530 Subject: [PATCH 17/24] Remove unused statistics configuration from SoakClient to prevent crashes --- tests/soak/soakclient.py | 6 ------ tests/soak/soakclient_perf.py | 6 ------ 2 files changed, 12 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 715d4aa36..b72e452fa 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -670,12 +670,6 @@ def filter_config(conf, filter_out, strip_prefix): sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb - # Share consumer rejects `statistics.interval.ms` (librdkafka - # `dev_kip-932_queues-for-kafka` PR #5469). Strip the stats - # wiring inherited from the parent conf so ShareConsumer() - # doesn't crash with `_INVALID_ARG`. - sconf.pop('statistics.interval.ms', None) - sconf.pop('stats_cb', None) sconf['client.id'] = self.testid # In explicit mode, switch the consumer's ack policy. Default is diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 7bc64af15..e09f22d27 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -686,12 +686,6 @@ def filter_config(conf, filter_out, strip_prefix): sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb - # Share consumer rejects `statistics.interval.ms` (librdkafka - # `dev_kip-932_queues-for-kafka` PR #5469). Strip the stats - # wiring inherited from the parent conf so ShareConsumer() - # doesn't crash with `_INVALID_ARG`. - sconf.pop('statistics.interval.ms', None) - sconf.pop('stats_cb', None) sconf['client.id'] = self.testid # In explicit mode, switch the consumer's ack policy. Default is From b21945783585398f130a1822e6a7d4da5383fe76 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Tue, 23 Jun 2026 22:28:05 +0530 Subject: [PATCH 18/24] Remove 'enable.partition.eof' from ShareConsumer configuration to ensure compatibility with librdkafka v2.15.0-RC1 --- tests/soak/soakclient.py | 4 ++++ tests/soak/soakclient_perf.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index b72e452fa..7e46ee8d9 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -671,6 +671,10 @@ def filter_config(conf, filter_out, strip_prefix): sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb sconf['client.id'] = self.testid + # Share consumer rejects `enable.partition.eof` (added in librdkafka + # v2.15.0-RC1). The flag was set on the parent conf for the regular + # Consumer; strip it before creating ShareConsumer. + sconf.pop('enable.partition.eof', None) # In explicit mode, switch the consumer's ack policy. Default is # implicit (next poll auto-acks); without this flip, calls to diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index e09f22d27..ea791d665 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -687,6 +687,10 @@ def filter_config(conf, filter_out, strip_prefix): sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb sconf['client.id'] = self.testid + # Share consumer rejects `enable.partition.eof` (added in librdkafka + # v2.15.0-RC1). The flag was set on the parent conf for the regular + # Consumer; strip it before creating ShareConsumer. + sconf.pop('enable.partition.eof', None) # In explicit mode, switch the consumer's ack policy. Default is # implicit (next poll auto-acks); without this flip, calls to From 653326870f4952d5143b59a1d7fae69a0f35fc40 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Wed, 1 Jul 2026 10:34:54 +0530 Subject: [PATCH 19/24] Refactor SoakClient to streamline gauge setting and improve error logging in commit_sync --- tests/soak/soakclient_perf.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index ea791d665..000c9a169 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -106,8 +106,7 @@ def dr_cb(self, err, msg): else: self.dr_cnt += 1 self.incr_counter("producer.drok", 1) - self.set_gauge("producer.latency", msg.latency(), - tags={"partition": "{}".format(msg.partition())}) + self.set_gauge("producer.latency", msg.latency(), tags={"partition": "{}".format(msg.partition())}) if (self.dr_cnt % self.disprate) == 0: self.logger.debug( "producer: delivered message to {} [{}] at offset {} in {}s".format( @@ -128,11 +127,7 @@ def produce_record(self): self.producer.produce( self.topic, value=record.serialize(), - headers={ - "msgid": str(record.msgid), - "txcnt": str(txcnt), - "time": str(time.time()), - }, + headers={"msgid": str(record.msgid), "time": str(time.time()), "txcnt": str(txcnt)}, on_delivery=self.dr_cb, ) break @@ -273,12 +268,12 @@ def consumer_run(self): self.msg_cnt += 1 self.incr_counter("consumer.msg", 1) + # end-to-end latency headers = dict(msg.headers()) txtime = headers.get('time', None) if txtime is not None: latency = time.time() - float(txtime) - self.set_gauge("consumer.e2e_latency", latency, - tags={"partition": "{}".format(msg.partition())}) + self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) else: latency = None @@ -434,6 +429,7 @@ def share_run(self): self.share_msg_cnt += 1 self.incr_counter("consumer.msg", 1) + # end-to-end latency headers = dict(msg.headers()) txtime = headers.get('time', None) if txtime is not None: @@ -497,13 +493,14 @@ def share_run(self): try: if use_sync: result = self.share_consumer.commit_sync(timeout=10.0) - partition_errs = sum( - 1 for err in result.values() if err is not None - ) - if partition_errs > 0: + err_details = [ + "{}/{}={}".format(tp.topic, tp.partition, err) + for tp, err in result.items() if err is not None + ] + if err_details: self.logger.warning( - "share: commit_sync had {} partition error(s)" - .format(partition_errs) + "share: commit_sync had {} partition error(s): {}" + .format(len(err_details), "; ".join(err_details)) ) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) From 71d53f67601bac30178c102343c84df510f7d6c2 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Wed, 1 Jul 2026 10:50:09 +0530 Subject: [PATCH 20/24] Validate SHARE flag when EXPLICIT mode is enabled in soak tests --- tests/soak/run.sh | 4 ++ tests/soak/soakclient.py | 103 +++++++++++++++++++++------------- tests/soak/soakclient_perf.py | 103 +++++++++++++++++++++------------- 3 files changed, 132 insertions(+), 78 deletions(-) diff --git a/tests/soak/run.sh b/tests/soak/run.sh index 719d89fd4..29398fdb4 100755 --- a/tests/soak/run.sh +++ b/tests/soak/run.sh @@ -25,6 +25,10 @@ fi explicit_flag="" if [[ "${EXPLICIT:-}" == "true" ]]; then + if [[ "${SHARE:-}" != "true" ]]; then + echo "ERROR: EXPLICIT=true requires SHARE=true" >&2 + exit 1 + fi explicit_flag="--explicit" fi diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 7e46ee8d9..b2a09fbb2 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -360,16 +360,12 @@ def share_run(self): """Share consumer main loop""" self.share_consumer.subscribe([self.topic]) - self.share_msg_cnt = 0 - self.share_msg_dup_cnt = 0 - self.share_msg_miss_cnt = 0 - self.share_msg_err_cnt = 0 - self.share_err_cnt = 0 - self.share_error_cb_cnt = 0 - - # Track highest offset seen per partition for duplicate/gap detection. - # With implicit ack and a single share consumer, offsets should - # progress sequentially per partition. Skipped in Explict mode. + # Counters are initialized in __init__ so error_cb can't hit an + # AttributeError if it fires before this thread starts. + + # Track highest offset seen per partition. Only used in implicit + # mode; in explicit mode released records get redelivered + # non-monotonically, so dup/gap detection isn't valid. hwmarks = defaultdict(int) self.logger.info("share: running in mode={}".format(self.share_mode)) @@ -412,6 +408,9 @@ def share_run(self): ) self.share_msg_err_cnt += 1 self.incr_counter("consumer.msgerr", 1) + # Corrupt payload: don't count it as consumed and don't let + # it drive hwmark/dup logic. + continue self.share_msg_cnt += 1 self.incr_counter("consumer.msg", 1) @@ -435,34 +434,42 @@ def share_run(self): ) ) - # Track per-partition high-water mark for duplicate/gap detection - hwkey = "{}-{}".format(msg.topic(), msg.partition()) - hw = hwmarks[hwkey] - - if hw > 0: - if msg.offset() <= hw: - self.logger.warning( - "share: Old or duplicate message {} " - "[{}] at offset {} (headers {}): wanted offset > {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + # Track per-partition high-water mark for duplicate/gap + # detection. Only meaningful with implicit ack + a single share + # consumer: in explicit mode, released records get redelivered + # non-monotonically, which would drive false dup/gap counters. + if self.share_mode == 'implicit': + hwkey = "{}-{}".format(msg.topic(), msg.partition()) + hw = hwmarks[hwkey] + + if hw > 0: + if msg.offset() <= hw: + self.logger.warning( + "share: Old or duplicate message {} " + "[{}] at offset {} (headers {}): wanted offset > {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) ) - ) - self.share_msg_dup_cnt += (hw + 1) - msg.offset() - self.incr_counter("consumer.msgdup", 1) - elif msg.offset() > hw + 1: - self.logger.warning( - "share: Lost messages, now at {} " - "[{}] at offset {} (headers {}): " - "expected offset {}+1".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + self.share_msg_dup_cnt += (hw + 1) - msg.offset() + self.incr_counter("consumer.msgdup", 1) + elif msg.offset() > hw + 1: + self.logger.warning( + "share: Lost messages, now at {} " + "[{}] at offset {} (headers {}): " + "expected offset {}+1".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) ) - ) - self.share_msg_miss_cnt += msg.offset() - (hw + 1) - self.incr_counter("consumer.missedmsg", 1) + self.share_msg_miss_cnt += msg.offset() - (hw + 1) + self.incr_counter("consumer.missedmsg", 1) - hwmarks[hwkey] = msg.offset() + # Only advance the mark; don't regress on an out-of-order + # or duplicate delivery, otherwise the next in-order + # message would be misreported as a large gap. + if msg.offset() > hw: + hwmarks[hwkey] = msg.offset() # Explicit mode: ack each message with ACCEPT. Commit fires # once per batch after the for-msg loop below. @@ -476,6 +483,11 @@ def share_run(self): self.incr_counter("consumer.error", 1) if self.share_mode == 'explicit': + # commit_sync returns a per-partition result dict we can + # inspect; commit_async surfaces errors through error_cb + # (share_error_cb_cnt / consumer.errorcb), not here — so + # async-branch failures still show up in metrics, just under + # a different counter. use_sync = random.random() < 0.5 try: if use_sync: @@ -682,10 +694,23 @@ def filter_config(conf, filter_out, strip_prefix): if self.share_mode == 'explicit': sconf['share.acknowledgement.mode'] = 'explicit' - # Always set a share-specific group.id. - sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( - self.hostname, version(), sys.version.split(' ')[0] - ) + # Set a share-specific group.id unless the operator supplied one + # via `share.group.id` in the config (filter_config strips the + # prefix, so it lands in sconf['group.id']). + if 'group.id' not in sconf: + sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( + self.hostname, version(), sys.version.split(' ')[0] + ) + + # Zero share counters *before* constructing the consumer so + # error_cb (which touches share_error_cb_cnt) can't hit + # AttributeError if it fires during construction. + self.share_msg_cnt = 0 + self.share_msg_dup_cnt = 0 + self.share_msg_miss_cnt = 0 + self.share_msg_err_cnt = 0 + self.share_err_cnt = 0 + self.share_error_cb_cnt = 0 self.logger.info("share: using group.id {}".format(sconf['group.id'])) self.share_consumer = ShareConsumer(sconf) diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 000c9a169..4ab36b2ae 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -373,16 +373,12 @@ def share_run(self): """Share consumer main loop""" self.share_consumer.subscribe([self.topic]) - self.share_msg_cnt = 0 - self.share_msg_dup_cnt = 0 - self.share_msg_miss_cnt = 0 - self.share_msg_err_cnt = 0 - self.share_err_cnt = 0 - self.share_error_cb_cnt = 0 - - # Track highest offset seen per partition for duplicate/gap detection. - # With implicit ack and a single share consumer, offsets should - # progress sequentially per partition. Skipped in Explict mode. + # Counters are initialized in __init__ so error_cb can't hit an + # AttributeError if it fires before this thread starts. + + # Track highest offset seen per partition. Only used in implicit + # mode; in explicit mode released records get redelivered + # non-monotonically, so dup/gap detection isn't valid. hwmarks = defaultdict(int) self.logger.info("share: running in mode={}".format(self.share_mode)) @@ -425,6 +421,9 @@ def share_run(self): ) self.share_msg_err_cnt += 1 self.incr_counter("consumer.msgerr", 1) + # Corrupt payload: don't count it as consumed and don't let + # it drive hwmark/dup logic. + continue self.share_msg_cnt += 1 self.incr_counter("consumer.msg", 1) @@ -448,34 +447,42 @@ def share_run(self): ) ) - # Track per-partition high-water mark for duplicate/gap detection - hwkey = "{}-{}".format(msg.topic(), msg.partition()) - hw = hwmarks[hwkey] - - if hw > 0: - if msg.offset() <= hw: - self.logger.warning( - "share: Old or duplicate message {} " - "[{}] at offset {} (headers {}): wanted offset > {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + # Track per-partition high-water mark for duplicate/gap + # detection. Only meaningful with implicit ack + a single share + # consumer: in explicit mode, released records get redelivered + # non-monotonically, which would drive false dup/gap counters. + if self.share_mode == 'implicit': + hwkey = "{}-{}".format(msg.topic(), msg.partition()) + hw = hwmarks[hwkey] + + if hw > 0: + if msg.offset() <= hw: + self.logger.warning( + "share: Old or duplicate message {} " + "[{}] at offset {} (headers {}): wanted offset > {}".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) ) - ) - self.share_msg_dup_cnt += (hw + 1) - msg.offset() - self.incr_counter("consumer.msgdup", 1) - elif msg.offset() > hw + 1: - self.logger.warning( - "share: Lost messages, now at {} " - "[{}] at offset {} (headers {}): " - "expected offset {}+1".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + self.share_msg_dup_cnt += (hw + 1) - msg.offset() + self.incr_counter("consumer.msgdup", 1) + elif msg.offset() > hw + 1: + self.logger.warning( + "share: Lost messages, now at {} " + "[{}] at offset {} (headers {}): " + "expected offset {}+1".format( + msg.topic(), msg.partition(), msg.offset(), + msg.headers(), hw + ) ) - ) - self.share_msg_miss_cnt += msg.offset() - (hw + 1) - self.incr_counter("consumer.missedmsg", 1) + self.share_msg_miss_cnt += msg.offset() - (hw + 1) + self.incr_counter("consumer.missedmsg", 1) - hwmarks[hwkey] = msg.offset() + # Only advance the mark; don't regress on an out-of-order + # or duplicate delivery, otherwise the next in-order + # message would be misreported as a large gap. + if msg.offset() > hw: + hwmarks[hwkey] = msg.offset() # Explicit mode: ack each message with ACCEPT. Commit fires # once per batch after the for-msg loop below. @@ -489,6 +496,11 @@ def share_run(self): self.incr_counter("consumer.error", 1) if self.share_mode == 'explicit': + # commit_sync returns a per-partition result dict we can + # inspect; commit_async surfaces errors through error_cb + # (share_error_cb_cnt / consumer.errorcb), not here — so + # async-branch failures still show up in metrics, just under + # a different counter. use_sync = random.random() < 0.5 try: if use_sync: @@ -695,10 +707,23 @@ def filter_config(conf, filter_out, strip_prefix): if self.share_mode == 'explicit': sconf['share.acknowledgement.mode'] = 'explicit' - # Always set a share-specific group.id. - sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( - self.hostname, version(), sys.version.split(' ')[0] - ) + # Set a share-specific group.id unless the operator supplied one + # via `share.group.id` in the config (filter_config strips the + # prefix, so it lands in sconf['group.id']). + if 'group.id' not in sconf: + sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( + self.hostname, version(), sys.version.split(' ')[0] + ) + + # Zero share counters *before* constructing the consumer so + # error_cb (which touches share_error_cb_cnt) can't hit + # AttributeError if it fires during construction. + self.share_msg_cnt = 0 + self.share_msg_dup_cnt = 0 + self.share_msg_miss_cnt = 0 + self.share_msg_err_cnt = 0 + self.share_err_cnt = 0 + self.share_error_cb_cnt = 0 self.logger.info("share: using group.id {}".format(sconf['group.id'])) self.share_consumer = ShareConsumer(sconf) From f2c1d2830cb0ae892e061c390b75bfac647913e1 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Wed, 1 Jul 2026 13:52:53 +0530 Subject: [PATCH 21/24] Refactor soakclient and soakclient_perf for improved readability and consistency in logging --- tests/soak/soakclient.py | 57 +++++++++++++------------------ tests/soak/soakclient_perf.py | 64 ++++++++++++++--------------------- 2 files changed, 49 insertions(+), 72 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index b2a09fbb2..325fd66a8 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -46,6 +46,7 @@ try: from confluent_kafka import ShareConsumer + HAS_SHARE_CONSUMER = True except ImportError: HAS_SHARE_CONSUMER = False @@ -402,8 +403,7 @@ def share_run(self): self.logger.info( "share: Failed to deserialize message in " "{} [{}] at offset {} (headers {}): {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), ex + msg.topic(), msg.partition(), msg.offset(), msg.headers(), ex ) ) self.share_msg_err_cnt += 1 @@ -420,17 +420,13 @@ def share_run(self): txtime = headers.get('time', None) if txtime is not None: latency = time.time() - float(txtime) - self.set_gauge( - "consumer.e2e_latency", latency, - tags={"partition": "{}".format(msg.partition())} - ) + self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) if (self.share_msg_cnt % self.disprate) == 0: self.logger.info( "share: {} messages consumed: Message {} " "[{}] at offset {} (headers {})".format( - self.share_msg_cnt, msg.topic(), - msg.partition(), msg.offset(), msg.headers() + self.share_msg_cnt, msg.topic(), msg.partition(), msg.offset(), msg.headers() ) ) @@ -447,8 +443,7 @@ def share_run(self): self.logger.warning( "share: Old or duplicate message {} " "[{}] at offset {} (headers {}): wanted offset > {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw ) ) self.share_msg_dup_cnt += (hw + 1) - msg.offset() @@ -458,8 +453,7 @@ def share_run(self): "share: Lost messages, now at {} " "[{}] at offset {} (headers {}): " "expected offset {}+1".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw ) ) self.share_msg_miss_cnt += msg.offset() - (hw + 1) @@ -477,8 +471,7 @@ def share_run(self): try: self.share_consumer.acknowledge(msg) except KafkaException as ex: - self.logger.error( - "share: acknowledge failed: {}".format(ex)) + self.logger.error("share: acknowledge failed: {}".format(ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) @@ -494,21 +487,21 @@ def share_run(self): result = self.share_consumer.commit_sync(timeout=10.0) err_details = [ "{}/{}={}".format(tp.topic, tp.partition, err) - for tp, err in result.items() if err is not None + for tp, err in result.items() + if err is not None ] if err_details: self.logger.warning( - "share: commit_sync had {} partition error(s): {}" - .format(len(err_details), "; ".join(err_details)) + "share: commit_sync had {} partition error(s): {}".format( + len(err_details), "; ".join(err_details) + ) ) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) else: self.share_consumer.commit_async() except KafkaException as ex: - self.logger.error( - "share: commit_{} exception: {}".format( - "sync" if use_sync else "async", ex)) + self.logger.error("share: commit_{} exception: {}".format("sync" if use_sync else "async", ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) @@ -523,8 +516,7 @@ def share_thread_main(self): self.logger.info("share: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("share: fatal exception: {}\n{}".format( - ex, traceback.print_exc())) + self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.print_exc())) self.run = False def rtt_stats(self, d): @@ -592,8 +584,7 @@ def create_topic(self, topic, conf): else: raise - def __init__(self, testid, topic, rate, conf, enable_share=False, - share_mode='implicit'): + def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='implicit'): """SoakClient constructor. conf is the client configuration""" self.topic = topic self.rate = rate @@ -675,10 +666,7 @@ def filter_config(conf, filter_out, strip_prefix): if enable_share: if not HAS_SHARE_CONSUMER: - raise RuntimeError( - "ShareConsumer requested but not available in this " - "confluent_kafka build." - ) + raise RuntimeError("ShareConsumer requested but not available in this " "confluent_kafka build.") sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb @@ -838,12 +826,14 @@ def get_rusage(self): '-f', dest='conffile', type=argparse.FileType('r'), help='Configuration file (configprop=value format)' ) parser.add_argument( - '--share', dest='share', action='store_true', default=False, - help='Enable share consumer thread' + '--share', dest='share', action='store_true', default=False, help='Enable share consumer thread' ) parser.add_argument( - '--explicit', dest='explicit', action='store_true', default=False, - help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)' + '--explicit', + dest='explicit', + action='store_true', + default=False, + help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)', ) args = parser.parse_args() @@ -880,8 +870,7 @@ def get_rusage(self): conf['enable.partition.eof'] = False # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf, - enable_share=args.share, share_mode=share_mode) + soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share, share_mode=share_mode) # Get initial resource usage soak.get_rusage() diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 4ab36b2ae..3a3b95aa0 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -46,6 +46,7 @@ try: from confluent_kafka import ShareConsumer + HAS_SHARE_CONSUMER = True except ImportError: HAS_SHARE_CONSUMER = False @@ -69,8 +70,7 @@ def __init__(self, msgid, name=None): def serialize(self): # Bytes formatting is ~3-5x faster than json.dumps for this shape. # name only contains JSON-safe chars so no escaping is needed. - return b'{"msgid":%d,"name":"%s"}' % ( - self.msgid, self.name.encode('ascii')) + return b'{"msgid":%d,"name":"%s"}' % (self.msgid, self.name.encode('ascii')) def __str__(self): return "SoakRecord({})".format(self.name) @@ -155,8 +155,8 @@ def producer_run(self): # a batch lets us sleep a duration the OS can actually honor. -r # (self.rate) remains the target rate (an upper bound: if a batch takes # longer than its time budget, we run as fast as we can, below -r). - batch = max(1, int(self.rate / 100)) # ~100 batches/sec - batch_intvl = batch / self.rate # seconds budgeted per batch + batch = max(1, int(self.rate / 100)) # ~100 batches/sec + batch_intvl = batch / self.rate # seconds budgeted per batch self.producer_msgid = 0 self.dr_cnt = 0 @@ -415,8 +415,7 @@ def share_run(self): self.logger.info( "share: Failed to deserialize message in " "{} [{}] at offset {} (headers {}): {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), ex + msg.topic(), msg.partition(), msg.offset(), msg.headers(), ex ) ) self.share_msg_err_cnt += 1 @@ -433,17 +432,13 @@ def share_run(self): txtime = headers.get('time', None) if txtime is not None: latency = time.time() - float(txtime) - self.set_gauge( - "consumer.e2e_latency", latency, - tags={"partition": "{}".format(msg.partition())} - ) + self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) if (self.share_msg_cnt % self.disprate) == 0: self.logger.info( "share: {} messages consumed: Message {} " "[{}] at offset {} (headers {})".format( - self.share_msg_cnt, msg.topic(), - msg.partition(), msg.offset(), msg.headers() + self.share_msg_cnt, msg.topic(), msg.partition(), msg.offset(), msg.headers() ) ) @@ -460,8 +455,7 @@ def share_run(self): self.logger.warning( "share: Old or duplicate message {} " "[{}] at offset {} (headers {}): wanted offset > {}".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw ) ) self.share_msg_dup_cnt += (hw + 1) - msg.offset() @@ -471,8 +465,7 @@ def share_run(self): "share: Lost messages, now at {} " "[{}] at offset {} (headers {}): " "expected offset {}+1".format( - msg.topic(), msg.partition(), msg.offset(), - msg.headers(), hw + msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw ) ) self.share_msg_miss_cnt += msg.offset() - (hw + 1) @@ -490,8 +483,7 @@ def share_run(self): try: self.share_consumer.acknowledge(msg) except KafkaException as ex: - self.logger.error( - "share: acknowledge failed: {}".format(ex)) + self.logger.error("share: acknowledge failed: {}".format(ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) @@ -507,21 +499,21 @@ def share_run(self): result = self.share_consumer.commit_sync(timeout=10.0) err_details = [ "{}/{}={}".format(tp.topic, tp.partition, err) - for tp, err in result.items() if err is not None + for tp, err in result.items() + if err is not None ] if err_details: self.logger.warning( - "share: commit_sync had {} partition error(s): {}" - .format(len(err_details), "; ".join(err_details)) + "share: commit_sync had {} partition error(s): {}".format( + len(err_details), "; ".join(err_details) + ) ) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) else: self.share_consumer.commit_async() except KafkaException as ex: - self.logger.error( - "share: commit_{} exception: {}".format( - "sync" if use_sync else "async", ex)) + self.logger.error("share: commit_{} exception: {}".format("sync" if use_sync else "async", ex)) self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) @@ -536,8 +528,7 @@ def share_thread_main(self): self.logger.info("share: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("share: fatal exception: {}\n{}".format( - ex, traceback.print_exc())) + self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.print_exc())) self.run = False def rtt_stats(self, d): @@ -605,8 +596,7 @@ def create_topic(self, topic, conf): else: raise - def __init__(self, testid, topic, rate, conf, enable_share=False, - share_mode='implicit'): + def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='implicit'): """SoakClient constructor. conf is the client configuration""" self.topic = topic self.rate = rate @@ -688,10 +678,7 @@ def filter_config(conf, filter_out, strip_prefix): if enable_share: if not HAS_SHARE_CONSUMER: - raise RuntimeError( - "ShareConsumer requested but not available in this " - "confluent_kafka build." - ) + raise RuntimeError("ShareConsumer requested but not available in this " "confluent_kafka build.") sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb @@ -851,12 +838,14 @@ def get_rusage(self): '-f', dest='conffile', type=argparse.FileType('r'), help='Configuration file (configprop=value format)' ) parser.add_argument( - '--share', dest='share', action='store_true', default=False, - help='Enable share consumer thread' + '--share', dest='share', action='store_true', default=False, help='Enable share consumer thread' ) parser.add_argument( - '--explicit', dest='explicit', action='store_true', default=False, - help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)' + '--explicit', + dest='explicit', + action='store_true', + default=False, + help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)', ) args = parser.parse_args() @@ -893,8 +882,7 @@ def get_rusage(self): conf['enable.partition.eof'] = False # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf, - enable_share=args.share, share_mode=share_mode) + soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share, share_mode=share_mode) # Get initial resource usage soak.get_rusage() From ab895435b1fbc3aeb452e7fd8ccd99f104b88b43 Mon Sep 17 00:00:00 2001 From: Ankith L Date: Wed, 1 Jul 2026 15:56:21 +0530 Subject: [PATCH 22/24] Adress the comments --- tests/soak/soakclient.py | 100 ++++++++++++++++++++-------------- tests/soak/soakclient_perf.py | 100 ++++++++++++++++++++-------------- 2 files changed, 116 insertions(+), 84 deletions(-) diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 325fd66a8..b4c262adc 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -186,7 +186,7 @@ def producer_thread_main(self): self.logger.info("producer: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.format_exc())) self.run = False def consumer_status(self): @@ -252,6 +252,9 @@ def consumer_run(self): ) self.msg_err_cnt += 1 self.incr_counter("consumer.msgerr", 1) + # Corrupt payload: don't count it as consumed and don't let + # it drive hwmark/dup logic. + continue self.msg_cnt += 1 self.incr_counter("consumer.msg", 1) @@ -302,9 +305,6 @@ def consumer_run(self): hwmarks[hwkey] = msg.offset() - self.consumer.close() - self.consumer_status() - def consumer_thread_main(self): """Consumer thread main function""" try: @@ -313,8 +313,15 @@ def consumer_thread_main(self): self.logger.info("consumer: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.format_exc())) self.run = False + finally: + if self.consumer is not None: + try: + self.consumer.close() + except Exception as ex: + self.logger.warning("consumer: close failed: {}".format(ex)) + self.consumer_status() def consumer_error_cb(self, err): """Consumer error callback""" @@ -343,6 +350,20 @@ def share_error_cb(self, err): self.share_error_cb_cnt += 1 self.incr_counter("consumer.errorcb", 1) + def share_acknowledgement_commit_cb(self, offsets, exception): + """Ack-commit callback registered via + ShareConsumer.set_acknowledgement_commit_callback. + + Fires from within commit_sync / commit_async / poll / close when the + broker returns a ShareAcknowledge response, so this is the single + source of truth for per-commit error visibility on both the sync and + async paths. + """ + if exception is not None: + self.logger.error("share: ack commit failed for {} partition(s): {}".format(len(offsets), exception)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + def share_status(self): """Print share consumer status""" self.logger.info( @@ -476,28 +497,13 @@ def share_run(self): self.incr_counter("consumer.error", 1) if self.share_mode == 'explicit': - # commit_sync returns a per-partition result dict we can - # inspect; commit_async surfaces errors through error_cb - # (share_error_cb_cnt / consumer.errorcb), not here — so - # async-branch failures still show up in metrics, just under - # a different counter. + # Per-partition ack outcomes land in + # share_acknowledgement_commit_cb for both branches; only + # catch immediate call-level exceptions here. use_sync = random.random() < 0.5 try: if use_sync: - result = self.share_consumer.commit_sync(timeout=10.0) - err_details = [ - "{}/{}={}".format(tp.topic, tp.partition, err) - for tp, err in result.items() - if err is not None - ] - if err_details: - self.logger.warning( - "share: commit_sync had {} partition error(s): {}".format( - len(err_details), "; ".join(err_details) - ) - ) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) + self.share_consumer.commit_sync(timeout=10.0) else: self.share_consumer.commit_async() except KafkaException as ex: @@ -505,9 +511,6 @@ def share_run(self): self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) - self.share_consumer.close() - self.share_status() - def share_thread_main(self): """Share consumer thread main function""" try: @@ -516,8 +519,15 @@ def share_thread_main(self): self.logger.info("share: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.format_exc())) self.run = False + finally: + if self.share_consumer is not None: + try: + self.share_consumer.close() + except Exception as ex: + self.logger.warning("share: close failed: {}".format(ex)) + self.share_status() def rtt_stats(self, d): """Extract broker rtt statistics from the stats dict in @param d""" @@ -643,7 +653,14 @@ def filter_config(conf, filter_out, strip_prefix): # # Create Producer and Consumer/ShareConsumer, each in its own thread. + # All clients are constructed BEFORE any thread starts so a failure + # here can't leave the producer running with no consumer draining. # + + # Fail fast if share requested but not available in this build. + if enable_share and not HAS_SHARE_CONSUMER: + raise RuntimeError("ShareConsumer requested but not available in this confluent_kafka build.") + conf['stats_cb'] = self.stats_cb conf['statistics.interval.ms'] = 120000 @@ -652,22 +669,14 @@ def filter_config(conf, filter_out, strip_prefix): pconf['error_cb'] = self.producer_error_cb pconf['client.id'] = self.testid self.producer = Producer(pconf) - self.incr_counter("producer.errorcb", 0) - # Create and start producer thread - self.producer_thread = threading.Thread(target=self.producer_thread_main) - self.producer_thread.start() - self.consumer = None self.consumer_thread = None self.share_consumer = None self.share_thread = None if enable_share: - if not HAS_SHARE_CONSUMER: - raise RuntimeError("ShareConsumer requested but not available in this " "confluent_kafka build.") - sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb sconf['client.id'] = self.testid @@ -702,16 +711,16 @@ def filter_config(conf, filter_out, strip_prefix): self.logger.info("share: using group.id {}".format(sconf['group.id'])) self.share_consumer = ShareConsumer(sconf) + # Register the ack-commit callback: covers per-partition results + # for both commit_sync and commit_async, replacing manual result + # inspection on the sync path. + self.share_consumer.set_acknowledgement_commit_callback(self.share_acknowledgement_commit_cb) # Initialize counters to zero self.incr_counter("consumer.error", 0) self.incr_counter("consumer.msgdup", 0) self.incr_counter("consumer.msgerr", 0) self.incr_counter("consumer.errorcb", 0) - - # Create and start share consumer thread - self.share_thread = threading.Thread(target=self.share_thread_main) - self.share_thread.start() else: # Consumer cconf = filter_config(conf, ["producer.", "admin.", "share."], "consumer.") @@ -725,7 +734,14 @@ def filter_config(conf, filter_out, strip_prefix): self.incr_counter("consumer.error", 0) self.incr_counter("consumer.msgdup", 0) - # Create and start consumer thread + # All clients constructed. Start threads together. + self.producer_thread = threading.Thread(target=self.producer_thread_main) + self.producer_thread.start() + + if enable_share: + self.share_thread = threading.Thread(target=self.share_thread_main) + self.share_thread.start() + else: self.consumer_thread = threading.Thread(target=self.consumer_thread_main) self.consumer_thread.start() @@ -886,7 +902,7 @@ def get_rusage(self): except KeyboardInterrupt: soak.logger.info("Interrupted by user") except Exception as ex: - soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.print_exc())) + soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.format_exc())) # Terminate soak.terminate() diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py index 3a3b95aa0..7a5fc72a1 100644 --- a/tests/soak/soakclient_perf.py +++ b/tests/soak/soakclient_perf.py @@ -198,7 +198,7 @@ def producer_thread_main(self): self.logger.info("producer: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.format_exc())) self.run = False def consumer_status(self): @@ -264,6 +264,9 @@ def consumer_run(self): ) self.msg_err_cnt += 1 self.incr_counter("consumer.msgerr", 1) + # Corrupt payload: don't count it as consumed and don't let + # it drive hwmark/dup logic. + continue self.msg_cnt += 1 self.incr_counter("consumer.msg", 1) @@ -314,9 +317,6 @@ def consumer_run(self): hwmarks[hwkey] = msg.offset() - self.consumer.close() - self.consumer_status() - def consumer_thread_main(self): """Consumer thread main function""" try: @@ -325,8 +325,15 @@ def consumer_thread_main(self): self.logger.info("consumer: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.format_exc())) self.run = False + finally: + if self.consumer is not None: + try: + self.consumer.close() + except Exception as ex: + self.logger.warning("consumer: close failed: {}".format(ex)) + self.consumer_status() def consumer_error_cb(self, err): """Consumer error callback""" @@ -355,6 +362,20 @@ def share_error_cb(self, err): self.share_error_cb_cnt += 1 self.incr_counter("consumer.errorcb", 1) + def share_acknowledgement_commit_cb(self, offsets, exception): + """Ack-commit callback registered via + ShareConsumer.set_acknowledgement_commit_callback. + + Fires from within commit_sync / commit_async / poll / close when the + broker returns a ShareAcknowledge response, so this is the single + source of truth for per-commit error visibility on both the sync and + async paths. + """ + if exception is not None: + self.logger.error("share: ack commit failed for {} partition(s): {}".format(len(offsets), exception)) + self.share_err_cnt += 1 + self.incr_counter("consumer.error", 1) + def share_status(self): """Print share consumer status""" self.logger.info( @@ -488,28 +509,13 @@ def share_run(self): self.incr_counter("consumer.error", 1) if self.share_mode == 'explicit': - # commit_sync returns a per-partition result dict we can - # inspect; commit_async surfaces errors through error_cb - # (share_error_cb_cnt / consumer.errorcb), not here — so - # async-branch failures still show up in metrics, just under - # a different counter. + # Per-partition ack outcomes land in + # share_acknowledgement_commit_cb for both branches; only + # catch immediate call-level exceptions here. use_sync = random.random() < 0.5 try: if use_sync: - result = self.share_consumer.commit_sync(timeout=10.0) - err_details = [ - "{}/{}={}".format(tp.topic, tp.partition, err) - for tp, err in result.items() - if err is not None - ] - if err_details: - self.logger.warning( - "share: commit_sync had {} partition error(s): {}".format( - len(err_details), "; ".join(err_details) - ) - ) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) + self.share_consumer.commit_sync(timeout=10.0) else: self.share_consumer.commit_async() except KafkaException as ex: @@ -517,9 +523,6 @@ def share_run(self): self.share_err_cnt += 1 self.incr_counter("consumer.error", 1) - self.share_consumer.close() - self.share_status() - def share_thread_main(self): """Share consumer thread main function""" try: @@ -528,8 +531,15 @@ def share_thread_main(self): self.logger.info("share: aborted by user") self.run = False except Exception as ex: - self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.print_exc())) + self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.format_exc())) self.run = False + finally: + if self.share_consumer is not None: + try: + self.share_consumer.close() + except Exception as ex: + self.logger.warning("share: close failed: {}".format(ex)) + self.share_status() def rtt_stats(self, d): """Extract broker rtt statistics from the stats dict in @param d""" @@ -655,7 +665,14 @@ def filter_config(conf, filter_out, strip_prefix): # # Create Producer and Consumer/ShareConsumer, each in its own thread. + # All clients are constructed BEFORE any thread starts so a failure + # here can't leave the producer running with no consumer draining. # + + # Fail fast if share requested but not available in this build. + if enable_share and not HAS_SHARE_CONSUMER: + raise RuntimeError("ShareConsumer requested but not available in this confluent_kafka build.") + conf['stats_cb'] = self.stats_cb conf['statistics.interval.ms'] = 120000 @@ -664,22 +681,14 @@ def filter_config(conf, filter_out, strip_prefix): pconf['error_cb'] = self.producer_error_cb pconf['client.id'] = self.testid self.producer = Producer(pconf) - self.incr_counter("producer.errorcb", 0) - # Create and start producer thread - self.producer_thread = threading.Thread(target=self.producer_thread_main) - self.producer_thread.start() - self.consumer = None self.consumer_thread = None self.share_consumer = None self.share_thread = None if enable_share: - if not HAS_SHARE_CONSUMER: - raise RuntimeError("ShareConsumer requested but not available in this " "confluent_kafka build.") - sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") sconf['error_cb'] = self.share_error_cb sconf['client.id'] = self.testid @@ -714,16 +723,16 @@ def filter_config(conf, filter_out, strip_prefix): self.logger.info("share: using group.id {}".format(sconf['group.id'])) self.share_consumer = ShareConsumer(sconf) + # Register the ack-commit callback: covers per-partition results + # for both commit_sync and commit_async, replacing manual result + # inspection on the sync path. + self.share_consumer.set_acknowledgement_commit_callback(self.share_acknowledgement_commit_cb) # Initialize counters to zero self.incr_counter("consumer.error", 0) self.incr_counter("consumer.msgdup", 0) self.incr_counter("consumer.msgerr", 0) self.incr_counter("consumer.errorcb", 0) - - # Create and start share consumer thread - self.share_thread = threading.Thread(target=self.share_thread_main) - self.share_thread.start() else: # Consumer cconf = filter_config(conf, ["producer.", "admin.", "share."], "consumer.") @@ -737,7 +746,14 @@ def filter_config(conf, filter_out, strip_prefix): self.incr_counter("consumer.error", 0) self.incr_counter("consumer.msgdup", 0) - # Create and start consumer thread + # All clients constructed. Start threads together. + self.producer_thread = threading.Thread(target=self.producer_thread_main) + self.producer_thread.start() + + if enable_share: + self.share_thread = threading.Thread(target=self.share_thread_main) + self.share_thread.start() + else: self.consumer_thread = threading.Thread(target=self.consumer_thread_main) self.consumer_thread.start() @@ -898,7 +914,7 @@ def get_rusage(self): except KeyboardInterrupt: soak.logger.info("Interrupted by user") except Exception as ex: - soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.print_exc())) + soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.format_exc())) # Terminate soak.terminate() From 87bc749ab04a64df5ce8340ea8a562e0b66d8a1a Mon Sep 17 00:00:00 2001 From: Ankith L Date: Thu, 2 Jul 2026 09:49:03 +0530 Subject: [PATCH 23/24] Add --perf flag for high-throughput mode in soak tests and remove soakclient_perf.py --- tests/soak/run.sh | 7 +- tests/soak/soakclient.py | 107 +++- tests/soak/soakclient_perf.py | 920 ---------------------------------- 3 files changed, 89 insertions(+), 945 deletions(-) delete mode 100644 tests/soak/soakclient_perf.py diff --git a/tests/soak/run.sh b/tests/soak/run.sh index 29398fdb4..b71a3845f 100755 --- a/tests/soak/run.sh +++ b/tests/soak/run.sh @@ -32,12 +32,17 @@ if [[ "${EXPLICIT:-}" == "true" ]]; then explicit_flag="--explicit" fi +perf_flag="" +if [[ "${PERF:-}" == "true" ]]; then + perf_flag="--perf" +fi + echo "Starting soak client using topic $topic. Logging to $logfile." set +x while [ "$run" = true ]; do # Ignore SIGINT in children (inherited) trap "" SIGINT - time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 $share_flag $explicit_flag |& tee /dev/tty | bzip2 > $logfile & + time opentelemetry-instrument $testdir/soakclient.py -i $TESTID -t $topic -r 80 -f $1 $share_flag $explicit_flag $perf_flag |& tee /dev/tty | bzip2 > $logfile & PID=$! terminate_last() { # List children of $PID only diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index b4c262adc..757aebaf0 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -20,7 +20,8 @@ # long term validation testing. # # Usage: -# tests/soak/soakclient.py -i -t -r -f [--share] +# tests/soak/soakclient.py -i -t -r -f +# [--share] [--explicit] [--perf] # # A unique topic should be used for each soakclient instance. # @@ -55,15 +56,23 @@ class SoakRecord(object): """A private record type, with JSON serializer and deserializer""" + # Optional padding appended to the auto-generated name. Empty by default; + # __main__ sets it to ~10KB when --perf is passed so higher-throughput + # runs exercise realistic payload sizes without rebuilding the string on + # every produce_record() call. + _PAD: str = "" + def __init__(self, msgid, name=None): self.msgid = msgid if name is None: - self.name = "SoakRecord nr #{}".format(self.msgid) + self.name = "SoakRecord nr #{}{}".format(msgid, self._PAD) else: self.name = name def serialize(self): - return json.dumps(self, default=lambda o: o.__dict__) + # Bytes formatting is ~3-5x faster than json.dumps for this shape. + # name only contains JSON-safe chars so no escaping is needed. + return b'{"msgid":%d,"name":"%s"}' % (self.msgid, self.name.encode('ascii')) def __str__(self): return "SoakRecord({})".format(self.name) @@ -142,8 +151,6 @@ def producer_status(self): def producer_run(self): """Producer main loop""" - sleep_intvl = 1.0 / self.rate - self.producer_msgid = 0 self.dr_cnt = 0 self.dr_err_cnt = 0 @@ -151,27 +158,59 @@ def producer_run(self): next_status = time.time() + self.disprate - while self.run: + if self.perf: + # perf: produce in batches and pace per-batch with time.sleep + # instead of one produce()+blocking poll() per message. At high + # rates the per-message poll() floor (~hundreds of us) caps + # throughput well below -r; pacing a batch lets us sleep a + # duration the OS can actually honor. -r remains the target rate + # (an upper bound: if a batch takes longer than its time budget, + # we run as fast as we can, below -r). + batch = max(1, int(self.rate / 100)) # ~100 batches/sec + batch_intvl = batch / self.rate # seconds budgeted per batch - # Produce a single record - self.produce_record() + while self.run: + t_start = time.time() - # Enforce message rate by polling until interval is exceeded. - now = time.time() - t_end = now + sleep_intvl - while True: - if now > next_status: - # Print status + # Produce a batch of records. + for _ in range(batch): + self.produce_record() + + # Serve the producer queue (error_cb, reports) without blocking. + self.producer.poll(0) + + if t_start > next_status: self.producer_status() - next_status = now + self.disprate - - remaining_time = t_end - now - if remaining_time < 0: - remaining_time = 0 - self.producer.poll(remaining_time) - if remaining_time <= 0: - break + next_status = t_start + self.disprate + + # Sleep off the remainder of this batch's time budget. + remaining_time = batch_intvl - (time.time() - t_start) + if remaining_time > 0: + time.sleep(remaining_time) + else: + sleep_intvl = 1.0 / self.rate + + while self.run: + + # Produce a single record + self.produce_record() + + # Enforce message rate by polling until interval is exceeded. now = time.time() + t_end = now + sleep_intvl + while True: + if now > next_status: + # Print status + self.producer_status() + next_status = now + self.disprate + + remaining_time = t_end - now + if remaining_time < 0: + remaining_time = 0 + self.producer.poll(remaining_time) + if remaining_time <= 0: + break + now = time.time() # Wait for outstanding messages to be delivered. remaining = self.producer.flush(30) @@ -594,7 +633,7 @@ def create_topic(self, topic, conf): else: raise - def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='implicit'): + def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='implicit', perf=False): """SoakClient constructor. conf is the client configuration""" self.topic = topic self.rate = rate @@ -603,6 +642,7 @@ def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='im self.stats_cnt = {'producer': 0, 'consumer': 0} self.start_time = time.time() self.share_mode = share_mode + self.perf = perf # OTEL instruments self.counters = {} @@ -851,6 +891,13 @@ def get_rusage(self): default=False, help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)', ) + parser.add_argument( + '--perf', + dest='perf', + action='store_true', + default=False, + help='High-throughput mode: ~10KB payloads and batched producer pacing', + ) args = parser.parse_args() @@ -859,6 +906,10 @@ def get_rusage(self): if share_mode == 'explicit' and not args.share: parser.error('--explicit requires --share') + # Enable ~10KB payload padding for perf runs. Built once at class load. + if args.perf: + SoakRecord._PAD = (" SoakRecord nr #0" * 600)[:10200] + conf = dict() if args.conffile is not None: # Parse client configuration file. @@ -886,7 +937,15 @@ def get_rusage(self): conf['enable.partition.eof'] = False # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share, share_mode=share_mode) + soak = SoakClient( + args.testid, + args.topic, + args.rate, + conf, + enable_share=args.share, + share_mode=share_mode, + perf=args.perf, + ) # Get initial resource usage soak.get_rusage() diff --git a/tests/soak/soakclient_perf.py b/tests/soak/soakclient_perf.py deleted file mode 100644 index 7a5fc72a1..000000000 --- a/tests/soak/soakclient_perf.py +++ /dev/null @@ -1,920 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2018 Confluent Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Soak test producer-consumer end-to-end client for -# long term validation testing. -# -# Usage: -# tests/soak/soakclient.py -i -t -r -f [--share] -# -# A unique topic should be used for each soakclient instance. -# - -import argparse -import json -import logging -import os -import random -import resource -import sys -import threading -import time -import traceback -from builtins import int -from collections import defaultdict - -import psutil -from opentelemetry import metrics - -from confluent_kafka import Consumer, KafkaError, KafkaException, Producer, version -from confluent_kafka.admin import AdminClient, NewTopic - -try: - from confluent_kafka import ShareConsumer - - HAS_SHARE_CONSUMER = True -except ImportError: - HAS_SHARE_CONSUMER = False - - -class SoakRecord(object): - """A private record type, with JSON serializer and deserializer""" - - # Static padding built once at class load. Repeats a JSON-safe base label - # (letters, digits, spaces, '#') so the final value lands at ~10KB without - # rebuilding the string on every produce_record() call. - _PAD = (" SoakRecord nr #0" * 600)[:10200] - - def __init__(self, msgid, name=None): - self.msgid = msgid - if name is None: - self.name = "SoakRecord nr #{}{}".format(msgid, self._PAD) - else: - self.name = name - - def serialize(self): - # Bytes formatting is ~3-5x faster than json.dumps for this shape. - # name only contains JSON-safe chars so no escaping is needed. - return b'{"msgid":%d,"name":"%s"}' % (self.msgid, self.name.encode('ascii')) - - def __str__(self): - return "SoakRecord({})".format(self.name) - - @classmethod - def deserialize(cls, binstr): - d = json.loads(binstr) - return SoakRecord(d['msgid'], d['name']) - - -class SoakClient(object): - """The SoakClient consists of a Producer sending messages at - the given rate, and a Consumer or ShareConsumer consuming the messages. - Both clients print their message and error counters every 10 seconds. - The producer and consumer run in separate background threads. - """ - - # metric name prefix - METRIC_PFX = "kafka.client.soak.python." - - def dr_cb(self, err, msg): - """Producer delivery report callback""" - if err is not None: - self.logger.warning("producer: delivery failed: {} [{}]: {}".format(msg.topic(), msg.partition(), err)) - self.dr_err_cnt += 1 - self.incr_counter("producer.drerr", 1) - self.incr_counter( - "producer.delivery.failure", - 1, - {"topic": msg.topic(), "partition": str(msg.partition()), "err": str(err)}, - ) - - else: - self.dr_cnt += 1 - self.incr_counter("producer.drok", 1) - self.set_gauge("producer.latency", msg.latency(), tags={"partition": "{}".format(msg.partition())}) - if (self.dr_cnt % self.disprate) == 0: - self.logger.debug( - "producer: delivered message to {} [{}] at offset {} in {}s".format( - msg.topic(), msg.partition(), msg.offset(), msg.latency() - ) - ) - - def produce_record(self): - """Asynchronously produce a single record, but block and - and retry if buffer is full.""" - record = SoakRecord(self.producer_msgid) - - txcnt = 0 - while True: - txcnt += 1 - - try: - self.producer.produce( - self.topic, - value=record.serialize(), - headers={"msgid": str(record.msgid), "time": str(time.time()), "txcnt": str(txcnt)}, - on_delivery=self.dr_cb, - ) - break - - except BufferError: - self.producer.poll(1) - continue - - self.producer_msgid += 1 - self.incr_counter("producer.send", 1) - - def producer_status(self): - """Print producer status""" - self.logger.info( - "producer: {} messages produced, {} delivered, {} failed, {} error_cbs".format( - self.producer_msgid, self.dr_cnt, self.dr_err_cnt, self.producer_error_cb_cnt - ) - ) - - def producer_run(self): - """Producer main loop""" - # perf: produce in batches and pace per-batch instead of one - # produce()+blocking poll() per message. At high rates the per-message - # poll() floor (~hundreds of us) capped throughput well below -r; pacing - # a batch lets us sleep a duration the OS can actually honor. -r - # (self.rate) remains the target rate (an upper bound: if a batch takes - # longer than its time budget, we run as fast as we can, below -r). - batch = max(1, int(self.rate / 100)) # ~100 batches/sec - batch_intvl = batch / self.rate # seconds budgeted per batch - - self.producer_msgid = 0 - self.dr_cnt = 0 - self.dr_err_cnt = 0 - self.producer_error_cb_cnt = 0 - - next_status = time.time() + self.disprate - - while self.run: - t_start = time.time() - - # Produce a batch of records. - for _ in range(batch): - self.produce_record() - - # Serve the producer queue (error_cb, any reports) without blocking. - self.producer.poll(0) - - if t_start > next_status: - # Print status - self.producer_status() - next_status = t_start + self.disprate - - # Sleep off the remainder of this batch's time budget to honor -r. - remaining_time = batch_intvl - (time.time() - t_start) - if remaining_time > 0: - time.sleep(remaining_time) - - # Wait for outstanding messages to be delivered. - remaining = self.producer.flush(30) - self.logger.warning("producer: {} message(s) remaining in queue after flush()".format(remaining)) - self.producer_status() - - def producer_thread_main(self): - """Producer thread main function""" - try: - self.producer_run() - except KeyboardInterrupt: - self.logger.info("producer: aborted by user") - self.run = False - except Exception as ex: - self.logger.fatal("producer: fatal exception: {}:\n{}".format(ex, traceback.format_exc())) - self.run = False - - def consumer_status(self): - """Print consumer status""" - self.logger.info( - "consumer: {} messages consumed, {} duplicates, " - "{} missed, {} message errors, {} consumer errors, {} error_cbs".format( - self.msg_cnt, - self.msg_dup_cnt, - self.msg_miss_cnt, - self.msg_err_cnt, - self.consumer_err_cnt, - self.consumer_error_cb_cnt, - ) - ) - - def consumer_run(self): - """Consumer main loop""" - self.consumer_msgid_next = 0 - - self.consumer.subscribe([self.topic]) - - self.msg_cnt = 0 - self.msg_dup_cnt = 0 - self.msg_miss_cnt = 0 - self.msg_err_cnt = 0 - self.consumer_err_cnt = 0 - self.consumer_error_cb_cnt = 0 - self.last_commited = None - - # Keep track of high-watermarks to make sure we don't go backwards - hwmarks = defaultdict(int) - - next_status = time.time() + self.disprate - - while self.run: - - now = time.time() - if now > next_status: - # Print status - self.consumer_status() - next_status = now + self.disprate - - msg = self.consumer.poll(1) - if msg is None: - continue - - if msg.error() is not None: - self.logger.error("consumer: error: {}".format(msg.error())) - self.consumer_err_cnt += 1 - self.incr_counter("consumer.error", 1) - continue - - try: - # Deserialize message - record = SoakRecord.deserialize(msg.value()) # noqa unused variable - except ValueError as ex: - self.logger.info( - "consumer: Failed to deserialize message in " - "{} [{}] at offset {} (headers {}): {}".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), ex - ) - ) - self.msg_err_cnt += 1 - self.incr_counter("consumer.msgerr", 1) - # Corrupt payload: don't count it as consumed and don't let - # it drive hwmark/dup logic. - continue - - self.msg_cnt += 1 - self.incr_counter("consumer.msg", 1) - - # end-to-end latency - headers = dict(msg.headers()) - txtime = headers.get('time', None) - if txtime is not None: - latency = time.time() - float(txtime) - self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) - else: - latency = None - - if (self.msg_cnt % self.disprate) == 0: - # Show a sample message every #disprate messages - self.logger.info( - "consumer: {} messages consumed: Message {} " - "[{}] at offset {} (headers {}, latency {})".format( - self.msg_cnt, msg.topic(), msg.partition(), msg.offset(), msg.headers(), latency - ) - ) - - # Keep track of consumer's highwater mark for each partition, - # to identify duplicates and lost messages. - hwkey = "{}-{}".format(msg.topic(), msg.partition()) - hw = hwmarks[hwkey] - - if hw > 0: - if msg.offset() <= hw: - self.logger.warning( - "consumer: Old or duplicate message {} " - "[{}] at offset {} (headers {}): wanted offset > {} (last commited {})".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw, self.last_committed - ) - ) - self.msg_dup_cnt += (hw + 1) - msg.offset() - self.incr_counter("consumer.msgdup", 1) - elif msg.offset() > hw + 1: - self.logger.warning( - "consumer: Lost messages, now at {} " - "[{}] at offset {} (headers {}): " - "expected offset {}+1 (last committed {})".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw, self.last_committed - ) - ) - self.msg_miss_cnt += msg.offset() - (hw + 1) - self.incr_counter("consumer.missedmsg", 1) - - hwmarks[hwkey] = msg.offset() - - def consumer_thread_main(self): - """Consumer thread main function""" - try: - self.consumer_run() - except KeyboardInterrupt: - self.logger.info("consumer: aborted by user") - self.run = False - except Exception as ex: - self.logger.fatal("consumer: fatal exception: {}\n{}".format(ex, traceback.format_exc())) - self.run = False - finally: - if self.consumer is not None: - try: - self.consumer.close() - except Exception as ex: - self.logger.warning("consumer: close failed: {}".format(ex)) - self.consumer_status() - - def consumer_error_cb(self, err): - """Consumer error callback""" - self.logger.error("consumer: error_cb: {}".format(err)) - self.consumer_error_cb_cnt += 1 - self.incr_counter("consumer.errorcb", 1) - - def consumer_commit_cb(self, err, partitions): - """Auto commit result callback""" - if err is not None: - self.logger.error("consumer: offset commit failed for {}: {}".format(partitions, err)) - self.consumer_err_cnt += 1 - self.incr_counter("consumer.error", 1) - else: - self.last_committed = partitions - - def producer_error_cb(self, err): - """Producer error callback""" - self.logger.error("producer: error_cb: {}".format(err)) - self.producer_error_cb_cnt += 1 - self.incr_counter("producer.errorcb", 1) - - def share_error_cb(self, err): - """Share consumer error callback""" - self.logger.error("share: error_cb: {}".format(err)) - self.share_error_cb_cnt += 1 - self.incr_counter("consumer.errorcb", 1) - - def share_acknowledgement_commit_cb(self, offsets, exception): - """Ack-commit callback registered via - ShareConsumer.set_acknowledgement_commit_callback. - - Fires from within commit_sync / commit_async / poll / close when the - broker returns a ShareAcknowledge response, so this is the single - source of truth for per-commit error visibility on both the sync and - async paths. - """ - if exception is not None: - self.logger.error("share: ack commit failed for {} partition(s): {}".format(len(offsets), exception)) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - - def share_status(self): - """Print share consumer status""" - self.logger.info( - "share: {} messages consumed, {} duplicates, " - "{} missed, {} message errors, {} consumer errors, {} error_cbs".format( - self.share_msg_cnt, - self.share_msg_dup_cnt, - self.share_msg_miss_cnt, - self.share_msg_err_cnt, - self.share_err_cnt, - self.share_error_cb_cnt, - ) - ) - - def share_run(self): - """Share consumer main loop""" - self.share_consumer.subscribe([self.topic]) - - # Counters are initialized in __init__ so error_cb can't hit an - # AttributeError if it fires before this thread starts. - - # Track highest offset seen per partition. Only used in implicit - # mode; in explicit mode released records get redelivered - # non-monotonically, so dup/gap detection isn't valid. - hwmarks = defaultdict(int) - - self.logger.info("share: running in mode={}".format(self.share_mode)) - - next_status = time.time() + self.disprate - - while self.run: - now = time.time() - if now > next_status: - self.share_status() - next_status = now + self.disprate - - try: - messages = self.share_consumer.poll(timeout=1.0) - except Exception as ex: - self.logger.error("share: poll exception: {}".format(ex)) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - continue - - if not messages: - continue - - for msg in messages: - if msg.error() is not None: - self.logger.error("share: error: {}".format(msg.error())) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - continue - - try: - record = SoakRecord.deserialize(msg.value()) # noqa unused variable - except ValueError as ex: - self.logger.info( - "share: Failed to deserialize message in " - "{} [{}] at offset {} (headers {}): {}".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), ex - ) - ) - self.share_msg_err_cnt += 1 - self.incr_counter("consumer.msgerr", 1) - # Corrupt payload: don't count it as consumed and don't let - # it drive hwmark/dup logic. - continue - - self.share_msg_cnt += 1 - self.incr_counter("consumer.msg", 1) - - # end-to-end latency - headers = dict(msg.headers()) - txtime = headers.get('time', None) - if txtime is not None: - latency = time.time() - float(txtime) - self.set_gauge("consumer.e2e_latency", latency, tags={"partition": "{}".format(msg.partition())}) - - if (self.share_msg_cnt % self.disprate) == 0: - self.logger.info( - "share: {} messages consumed: Message {} " - "[{}] at offset {} (headers {})".format( - self.share_msg_cnt, msg.topic(), msg.partition(), msg.offset(), msg.headers() - ) - ) - - # Track per-partition high-water mark for duplicate/gap - # detection. Only meaningful with implicit ack + a single share - # consumer: in explicit mode, released records get redelivered - # non-monotonically, which would drive false dup/gap counters. - if self.share_mode == 'implicit': - hwkey = "{}-{}".format(msg.topic(), msg.partition()) - hw = hwmarks[hwkey] - - if hw > 0: - if msg.offset() <= hw: - self.logger.warning( - "share: Old or duplicate message {} " - "[{}] at offset {} (headers {}): wanted offset > {}".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw - ) - ) - self.share_msg_dup_cnt += (hw + 1) - msg.offset() - self.incr_counter("consumer.msgdup", 1) - elif msg.offset() > hw + 1: - self.logger.warning( - "share: Lost messages, now at {} " - "[{}] at offset {} (headers {}): " - "expected offset {}+1".format( - msg.topic(), msg.partition(), msg.offset(), msg.headers(), hw - ) - ) - self.share_msg_miss_cnt += msg.offset() - (hw + 1) - self.incr_counter("consumer.missedmsg", 1) - - # Only advance the mark; don't regress on an out-of-order - # or duplicate delivery, otherwise the next in-order - # message would be misreported as a large gap. - if msg.offset() > hw: - hwmarks[hwkey] = msg.offset() - - # Explicit mode: ack each message with ACCEPT. Commit fires - # once per batch after the for-msg loop below. - if self.share_mode == 'explicit': - try: - self.share_consumer.acknowledge(msg) - except KafkaException as ex: - self.logger.error("share: acknowledge failed: {}".format(ex)) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - - if self.share_mode == 'explicit': - # Per-partition ack outcomes land in - # share_acknowledgement_commit_cb for both branches; only - # catch immediate call-level exceptions here. - use_sync = random.random() < 0.5 - try: - if use_sync: - self.share_consumer.commit_sync(timeout=10.0) - else: - self.share_consumer.commit_async() - except KafkaException as ex: - self.logger.error("share: commit_{} exception: {}".format("sync" if use_sync else "async", ex)) - self.share_err_cnt += 1 - self.incr_counter("consumer.error", 1) - - def share_thread_main(self): - """Share consumer thread main function""" - try: - self.share_run() - except KeyboardInterrupt: - self.logger.info("share: aborted by user") - self.run = False - except Exception as ex: - self.logger.fatal("share: fatal exception: {}\n{}".format(ex, traceback.format_exc())) - self.run = False - finally: - if self.share_consumer is not None: - try: - self.share_consumer.close() - except Exception as ex: - self.logger.warning("share: close failed: {}".format(ex)) - self.share_status() - - def rtt_stats(self, d): - """Extract broker rtt statistics from the stats dict in @param d""" - - # Get leader RTT stats - for broker in d['brokers'].values(): - if broker['toppars'] is None: - continue - - parts = ','.join([str(x['partition']) for x in broker['toppars'].values()]) - - tags = { - "broker": "{}".format(broker['nodeid']), - "partitions": "{}".format(parts), - "type": "{}".format(d['type']), - } - - self.set_gauge("broker.rtt.p99", float(broker['rtt']['p99']) / 1000000.0, tags=tags) - self.set_gauge("broker.rtt.avg", float(broker['rtt']['avg']) / 1000000.0, tags=tags) - - def stats_cb(self, json_str): - """Common statistics callback.""" - d = json.loads(json_str) - - # Print number of connected brokers to monitor - # the sparse connection functionality. - brokers = d['brokers'] - broker_cnt = len(brokers) - up_brokers = [brokers[x]['name'] for x in brokers if brokers[x]['state'] == 'UP'] - if self.topic in d['topics']: - leaders = [ - '{}={}'.format(p['partition'], p['leader']) - for p in d['topics'][self.topic]['partitions'].values() - if p['partition'] != -1 - ] - else: - leaders = [] - self.logger.info( - "{} stats: {}/{} brokers UP, {} partition leaders: {}".format( - d['name'], len(up_brokers), broker_cnt, self.topic, leaders - ) - ) - - # Emit the full raw stats for troubleshooting. - self.stats_cnt[d['type']] += 1 - self.logger.info("{} raw stats: {}".format(d['name'], json_str)) - - self.rtt_stats(d) - - # Sample the producer queue length - if d['type'] == 'producer': - self.set_gauge("producer.outq", len(self.producer)) - - def create_topic(self, topic, conf): - """Create the topic if it doesn't already exist""" - admin = AdminClient(conf) - fs = admin.create_topics([NewTopic(topic, num_partitions=2, replication_factor=3)]) - f = fs[topic] - try: - res = f.result() # noqa unused variable - except KafkaException as ex: - if ex.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: - self.logger.info("Topic {} already exists: good".format(topic)) - else: - raise - - def __init__(self, testid, topic, rate, conf, enable_share=False, share_mode='implicit'): - """SoakClient constructor. conf is the client configuration""" - self.topic = topic - self.rate = rate - self.disprate = int(rate * 10) - self.run = True - self.stats_cnt = {'producer': 0, 'consumer': 0} - self.start_time = time.time() - self.share_mode = share_mode - - # OTEL instruments - self.counters = {} - self.gauges = {} - self.gauge_cbs = {} - self.gauge_values = {} - - self.last_rusage = None - self.last_rusage_time = None - self.proc = psutil.Process(os.getpid()) - - self.logger = logging.getLogger('soakclient') - self.logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s')) - self.logger.addHandler(handler) - - # Construct a unique id to use for metrics hostname so that - # multiple instances of the SoakClient can run on the same machine. - hostname = os.environ["HOSTNAME"] - self.hostname = "py-{}-{}".format(hostname, self.topic) - self.testid = testid - self.meter = metrics.get_meter("njc.python.soak.tests") - - self.logger.info("SoakClient id {}".format(self.hostname)) - - if 'group.id' not in conf: - # Generate a group.id bound to this client and python version - conf['group.id'] = 'soakclient-{}-{}-{}'.format(self.hostname, version(), sys.version.split(' ')[0]) - - conf = {k: v for k, v in conf.items()} - - def filter_config(conf, filter_out, strip_prefix): - len_sp = len(strip_prefix) - out = {} - for k, v in conf.items(): - if len([x for x in filter_out if k.startswith(x)]) > 0: - continue - if k.startswith(strip_prefix): - k = k[len_sp:] - out[k] = v - return out - - # Create topic (might already exist) - aconf = filter_config(conf, ["consumer.", "producer.", "share."], "admin.") - aconf['client.id'] = self.testid - self.create_topic(self.topic, aconf) - - # - # Create Producer and Consumer/ShareConsumer, each in its own thread. - # All clients are constructed BEFORE any thread starts so a failure - # here can't leave the producer running with no consumer draining. - # - - # Fail fast if share requested but not available in this build. - if enable_share and not HAS_SHARE_CONSUMER: - raise RuntimeError("ShareConsumer requested but not available in this confluent_kafka build.") - - conf['stats_cb'] = self.stats_cb - conf['statistics.interval.ms'] = 120000 - - # Producer - pconf = filter_config(conf, ["consumer.", "admin.", "share."], "producer.") - pconf['error_cb'] = self.producer_error_cb - pconf['client.id'] = self.testid - self.producer = Producer(pconf) - self.incr_counter("producer.errorcb", 0) - - self.consumer = None - self.consumer_thread = None - self.share_consumer = None - self.share_thread = None - - if enable_share: - sconf = filter_config(conf, ["consumer.", "producer.", "admin."], "share.") - sconf['error_cb'] = self.share_error_cb - sconf['client.id'] = self.testid - # Share consumer rejects `enable.partition.eof` (added in librdkafka - # v2.15.0-RC1). The flag was set on the parent conf for the regular - # Consumer; strip it before creating ShareConsumer. - sconf.pop('enable.partition.eof', None) - - # In explicit mode, switch the consumer's ack policy. Default is - # implicit (next poll auto-acks); without this flip, calls to - # acknowledge() return _STATE because the message is already acked. - if self.share_mode == 'explicit': - sconf['share.acknowledgement.mode'] = 'explicit' - - # Set a share-specific group.id unless the operator supplied one - # via `share.group.id` in the config (filter_config strips the - # prefix, so it lands in sconf['group.id']). - if 'group.id' not in sconf: - sconf['group.id'] = 'soakclient-share-{}-{}-{}'.format( - self.hostname, version(), sys.version.split(' ')[0] - ) - - # Zero share counters *before* constructing the consumer so - # error_cb (which touches share_error_cb_cnt) can't hit - # AttributeError if it fires during construction. - self.share_msg_cnt = 0 - self.share_msg_dup_cnt = 0 - self.share_msg_miss_cnt = 0 - self.share_msg_err_cnt = 0 - self.share_err_cnt = 0 - self.share_error_cb_cnt = 0 - - self.logger.info("share: using group.id {}".format(sconf['group.id'])) - self.share_consumer = ShareConsumer(sconf) - # Register the ack-commit callback: covers per-partition results - # for both commit_sync and commit_async, replacing manual result - # inspection on the sync path. - self.share_consumer.set_acknowledgement_commit_callback(self.share_acknowledgement_commit_cb) - - # Initialize counters to zero - self.incr_counter("consumer.error", 0) - self.incr_counter("consumer.msgdup", 0) - self.incr_counter("consumer.msgerr", 0) - self.incr_counter("consumer.errorcb", 0) - else: - # Consumer - cconf = filter_config(conf, ["producer.", "admin.", "share."], "consumer.") - cconf['error_cb'] = self.consumer_error_cb - cconf['on_commit'] = self.consumer_commit_cb - self.logger.info("consumer: using group.id {}".format(cconf['group.id'])) - cconf['client.id'] = self.testid - self.consumer = Consumer(cconf) - - # Initialize some counters to zero to make them appear in the metrics - self.incr_counter("consumer.error", 0) - self.incr_counter("consumer.msgdup", 0) - - # All clients constructed. Start threads together. - self.producer_thread = threading.Thread(target=self.producer_thread_main) - self.producer_thread.start() - - if enable_share: - self.share_thread = threading.Thread(target=self.share_thread_main) - self.share_thread.start() - else: - self.consumer_thread = threading.Thread(target=self.consumer_thread_main) - self.consumer_thread.start() - - def terminate(self): - """Terminate Producer and Consumer/Share Consumer""" - soak.logger.info("Terminating (ran for {}s)".format(time.time() - self.start_time)) - self.run = False - # Wait for background threads to finish. - self.producer_thread.join() - if self.share_thread is not None: - self.share_thread.join() - else: - self.consumer_thread.join() - - # Final resource usage - soak.get_rusage() - - def incr_counter(self, metric_name, incrval, tags=None): - """Increment metric counter by incrval""" - if not tags: - tags = {} - tags.update({"host": self.hostname, "testid": self.testid}) - - full_metric_name = self.METRIC_PFX + metric_name - if full_metric_name not in self.counters: - self.counters[full_metric_name] = self.meter.create_counter( - full_metric_name, - description=full_metric_name, - ) - counter = self.counters[full_metric_name] - counter.add(incrval, tags) - - def set_gauge(self, metric_name, val, tags=None): - """Set metric gauge to val""" - if not tags: - tags = {} - tags.update({"host": self.hostname, "testid": self.testid}) - - full_metric_name = self.METRIC_PFX + metric_name - if full_metric_name not in self.gauge_values: - self.gauge_values[full_metric_name] = [] - - self.gauge_values[full_metric_name].append([val, tags]) - - if full_metric_name not in self.gauges: - - def cb(_): - for value in self.gauge_values[full_metric_name]: - yield metrics.Observation(value[0], value[1]) - self.gauge_values[full_metric_name] = [] - - self.gauge_cbs[full_metric_name] = cb - self.gauges[full_metric_name] = self.meter.create_observable_gauge( - callbacks=[self.gauge_cbs[full_metric_name]], name=full_metric_name, description=full_metric_name - ) - - def calc_rusage_deltas(self, curr, prev, elapsed): - """Calculate deltas between previous and current resource usage""" - - # User CPU % - user_cpu = ((curr.ru_utime - prev.ru_utime) / elapsed) * 100.0 - self.set_gauge("cpu.user", user_cpu) - - # System CPU % - sys_cpu = ((curr.ru_stime - prev.ru_stime) / elapsed) * 100.0 - self.set_gauge("cpu.system", sys_cpu) - - # Max RSS memory (monotonic) - max_rss = curr.ru_maxrss / 1024.0 - self.set_gauge("memory.rss.max", max_rss) - - self.logger.info("User CPU: {:.1f}%, System CPU: {:.1f}%, MaxRSS {:.3f}MiB".format(user_cpu, sys_cpu, max_rss)) - - def get_rusage(self): - """Get resource usage and calculate CPU load, etc""" - ru = resource.getrusage(resource.RUSAGE_SELF) - now = time.time() - - if self.last_rusage is not None: - self.calc_rusage_deltas(ru, self.last_rusage, now - self.last_rusage_time) - - self.last_rusage = ru - self.last_rusage_time = now - - # Current RSS memory - rss = float(self.proc.memory_info().rss) / (1024.0 * 1024.0) - self.set_gauge("memory.rss", rss) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Kafka client soak test') - parser.add_argument('-i', dest='testid', type=str, required=True, help='Test id') - parser.add_argument('-b', dest='brokers', type=str, default=None, help='Bootstrap servers') - parser.add_argument('-t', dest='topic', type=str, required=True, help='Topic to use') - parser.add_argument('-r', dest='rate', type=float, default=10, help='Message produce rate per second') - parser.add_argument( - '-f', dest='conffile', type=argparse.FileType('r'), help='Configuration file (configprop=value format)' - ) - parser.add_argument( - '--share', dest='share', action='store_true', default=False, help='Enable share consumer thread' - ) - parser.add_argument( - '--explicit', - dest='explicit', - action='store_true', - default=False, - help='Share consumer: per-msg ACCEPT + alternating commit_async/sync (requires --share)', - ) - - args = parser.parse_args() - - share_mode = 'explicit' if args.explicit else 'implicit' - - if share_mode == 'explicit' and not args.share: - parser.error('--explicit requires --share') - - conf = dict() - if args.conffile is not None: - # Parse client configuration file. - # Standard "key=value" format. - for line in args.conffile: - line = line.strip() - if len(line) == 0 or line[0] == '#': - continue - - i = line.find('=') - if i <= 0: - raise ValueError("Configuration lines must be `name=value..`, not {}".format(line)) - - name = line[:i] - value = line[i + 1 :] - - conf[name] = value - - if args.brokers is not None: - # Overwrite any brokers specified in configuration file with - # brokers from -b command line argument - conf['bootstrap.servers'] = args.brokers - - # We don't care about partition EOFs - conf['enable.partition.eof'] = False - - # Create SoakClient - soak = SoakClient(args.testid, args.topic, args.rate, conf, enable_share=args.share, share_mode=share_mode) - - # Get initial resource usage - soak.get_rusage() - - # Run until interrupted - try: - while soak.run: - time.sleep(10) - soak.get_rusage() - - soak.logger.info("Soak client aborted") - - except KeyboardInterrupt: - soak.logger.info("Interrupted by user") - except Exception as ex: - soak.logger.error("Fatal exception {}\n{}".format(ex, traceback.format_exc())) - - # Terminate - soak.terminate() From 2f6e5abbe6ce3d96e59d0dbc2e1ed44b0c08a1ad Mon Sep 17 00:00:00 2001 From: Ankith L Date: Thu, 2 Jul 2026 10:29:27 +0530 Subject: [PATCH 24/24] Enforce SHARE flag requirement for PERF mode in soak tests --- tests/soak/run.sh | 4 ++++ tests/soak/soakclient.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/tests/soak/run.sh b/tests/soak/run.sh index b71a3845f..701b3491a 100755 --- a/tests/soak/run.sh +++ b/tests/soak/run.sh @@ -34,6 +34,10 @@ fi perf_flag="" if [[ "${PERF:-}" == "true" ]]; then + if [[ "${SHARE:-}" != "true" ]]; then + echo "ERROR: PERF=true requires SHARE=true" >&2 + exit 1 + fi perf_flag="--perf" fi diff --git a/tests/soak/soakclient.py b/tests/soak/soakclient.py index 757aebaf0..3e0d11a38 100755 --- a/tests/soak/soakclient.py +++ b/tests/soak/soakclient.py @@ -906,6 +906,9 @@ def get_rusage(self): if share_mode == 'explicit' and not args.share: parser.error('--explicit requires --share') + if args.perf and not args.share: + parser.error('--perf requires --share') + # Enable ~10KB payload padding for perf runs. Built once at class load. if args.perf: SoakRecord._PAD = (" SoakRecord nr #0" * 600)[:10200]