-
Notifications
You must be signed in to change notification settings - Fork 736
Expand file tree
/
Copy pathdebug_bundle_service.cc
More file actions
996 lines (888 loc) · 33.9 KB
/
debug_bundle_service.cc
File metadata and controls
996 lines (888 loc) · 33.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
/*
* Copyright 2024 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/
#include "debug_bundle_service.h"
#include "config/configuration.h"
#include "config/node_config.h"
#include "container/chunked_vector.h"
#include "debug_bundle/error.h"
#include "debug_bundle/metadata.h"
#include "debug_bundle/probe.h"
#include "debug_bundle/types.h"
#include "debug_bundle/utils.h"
#include "ssx/future-util.h"
#include "storage/kvstore.h"
#include "utils/external_process.h"
#include "utils/file_io.h"
#include <seastar/core/lowres_clock.hh>
#include <seastar/core/seastar.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/sstring.hh>
#include <seastar/coroutine/exception.hh>
#include <seastar/util/defer.hh>
#include <seastar/util/process.hh>
#include <seastar/util/variant_utils.hh>
#include <boost/algorithm/string/join.hpp>
#include <fmt/format.h>
#include <variant>
using namespace std::chrono_literals;
namespace debug_bundle {
// Logger used for every diagnostic message emitted from this file.
static ss::logger lg{"debug-bundle-service"};
namespace {
// Flag and option names placed on the `rpk debug bundle` command line by
// service::build_rpk_arguments. Names beginning with "-X" are emitted as a
// single "name=value" argument; the "--" names (except --verbose, which
// takes no value) are followed by their value as a separate argument.
constexpr std::string_view output_variable = "--output";
constexpr std::string_view verbose_variable = "--verbose";
constexpr std::string_view username_variable = "-Xuser";
// Also used by contains_sensitive_info() to keep credentials out of the logs.
constexpr std::string_view password_variable = "-Xpass";
constexpr std::string_view sasl_mechanism_variable = "-Xsasl.mechanism";
constexpr std::string_view controller_logs_size_limit_variable
= "--controller-logs-size-limit";
constexpr std::string_view cpu_profiler_wait_variable = "--cpu-profiler-wait";
constexpr std::string_view logs_since_variable = "--logs-since";
constexpr std::string_view logs_size_limit_variable = "--logs-size-limit";
constexpr std::string_view logs_until_variable = "--logs-until";
constexpr std::string_view metrics_interval_variable = "--metrics-interval";
constexpr std::string_view metrics_samples_variable = "--metrics-samples";
constexpr std::string_view partition_variable = "--partition";
constexpr std::string_view tls_enabled_variable = "-Xtls.enabled";
constexpr std::string_view tls_insecure_skip_verify_variable
= "-Xtls.insecure_skip_verify";
constexpr std::string_view k8s_namespace_variable = "--namespace";
constexpr std::string_view k8s_label_selector = "--label-selector";
/// Reports whether \p arg mentions the password flag anywhere in its text,
/// in which case it must not be written to the log.
bool contains_sensitive_info(const ss::sstring& arg) {
    return arg.find(password_variable) != ss::sstring::npos;
}
/// Logs, at debug level, the command that is about to be executed.
///
/// The environment entries (if any) are printed first, separated by spaces,
/// followed by every argument that contains_sensitive_info() does not reject.
void print_arguments(
  const std::vector<ss::sstring>& args, const std::vector<ss::sstring>& env) {
    const auto is_loggable = [](const ss::sstring& candidate) {
        return !contains_sensitive_info(candidate);
    };
    ss::sstring rendered{};
    if (!env.empty()) {
        rendered += ss::format("{} ", fmt::join(env, " "));
    }
    rendered += ss::format(
      "{}", boost::algorithm::join_if(args, " ", is_loggable));
    vlog(lg.debug, "Starting RPK debug bundle: {}", rendered);
}
/// Name of the bundle archive produced for \p job_id: the formatted job id
/// with a ".zip" suffix.
std::string form_debug_bundle_file_name(job_id_t job_id) {
    auto archive_name = fmt::format("{}.zip", job_id);
    return archive_name;
}
/// Name of the file holding the captured process output for \p job_id: the
/// formatted job id with a ".out" suffix.
std::string form_process_output_file_name(job_id_t job_id) {
    auto output_name = fmt::format("{}.out", job_id);
    return output_name;
}
/// Location of the bundle archive for \p job_id inside \p base_path.
std::filesystem::path form_debug_bundle_file_path(
  const std::filesystem::path& base_path, job_id_t job_id) {
    std::filesystem::path bundle_path{base_path};
    bundle_path /= form_debug_bundle_file_name(job_id);
    return bundle_path;
}
/// Location of the captured-process-output file for \p job_id inside
/// \p base_path.
std::filesystem::path form_process_output_file_path(
  const std::filesystem::path& base_path, job_id_t job_id) {
    std::filesystem::path output_path{base_path};
    output_path /= form_process_output_file_name(job_id);
    return output_path;
}
/// Directory in which bundles are stored: the configured
/// `debug_bundle_storage_dir` when it is set, otherwise the node's data
/// directory with service::debug_bundle_dir_name appended.
std::filesystem::path form_debug_bundle_storage_directory() {
    const auto& configured_dir
      = config::shard_local_cfg().debug_bundle_storage_dir();
    // Computed unconditionally, exactly as a value_or() argument would be.
    const auto fallback_dir = config::node().data_directory.value().path
                              / service::debug_bundle_dir_name;
    return configured_dir.value_or(fallback_dir);
}
/// True only when the process exited on its own (was not signaled) and its
/// exit code is zero.
bool was_run_successful(ss::experimental::process::wait_status wait_status) {
    using exited_t = ss::experimental::process::wait_exited;
    if (!std::holds_alternative<exited_t>(wait_status)) {
        return false;
    }
    return std::get<exited_t>(wait_status).exit_code == 0;
}
/// Computes the SHA-256 sum of the file at \p path via calculate_sha256_sum()
/// and resolves to whether it equals \p checksum.
ss::future<bool>
validate_sha256_checksum(std::string_view path, bytes_view checksum) {
    co_return (co_await calculate_sha256_sum(path)) == checksum;
}
} // namespace
/// Stream consumer that appends every buffer it receives to a caller-owned
/// vector of strings and always asks the stream to keep going.
struct service::output_handler {
    using consumption_result_type =
      typename ss::input_stream<char>::consumption_result_type;
    using stop_consuming_type =
      typename consumption_result_type::stop_consuming_type;
    using tmp_buf = stop_consuming_type::tmp_buf;

    /// Destination for the consumed chunks; held by reference, not owned.
    chunked_vector<ss::sstring>& output_buffer;

    /// Copies \p buf into a new string, stores it, and continues consuming.
    ss::future<consumption_result_type> operator()(tmp_buf buf) {
        ss::sstring chunk{buf.begin(), buf.end()};
        output_buffer.emplace_back(std::move(chunk));
        co_return ss::continue_consuming{};
    }
};
/// Tracks one `rpk debug bundle` job.
///
/// An instance is either backed by a live external process (first
/// constructor) or rebuilt from persisted metadata describing a job that
/// already finished (second constructor); in the latter case no process
/// handle is held and only the recorded results are available.
class service::debug_bundle_process {
public:
    /// Wraps a freshly created external process and routes its stdout and
    /// stderr into this object's buffers. The creation time is taken now.
    ///
    /// NOTE(review): the installed handlers hold references to _cout/_cerr
    /// and the defaulted move operations below do not rebind them - confirm
    /// that instances are never moved while a process is attached.
    debug_bundle_process(
      job_id_t job_id,
      std::unique_ptr<external_process::external_process> rpk_process,
      std::filesystem::path output_file_path,
      std::filesystem::path process_output_file_path)
      : _job_id(job_id)
      , _rpk_process(std::move(rpk_process))
      , _output_file_path(std::move(output_file_path))
      , _process_output_file_path(std::move(process_output_file_path))
      , _created_time(clock::now()) {
        _rpk_process->set_stdout_consumer(
          output_handler{.output_buffer = _cout});
        _rpk_process->set_stderr_consumer(
          output_handler{.output_buffer = _cerr});
    }

    /// Rebuilds the record of an already finished job from its metadata and,
    /// when available, its saved process output. No process is attached.
    explicit debug_bundle_process(metadata md, std::optional<process_output> po)
      : _job_id(md.job_id)
      , _wait_result(md.get_wait_status())
      , _output_file_path(md.debug_bundle_file_path)
      , _process_output_file_path(md.process_output_file_path)
      , _cout(
          po.has_value() ? std::move(po.value().cout)
                         : chunked_vector<ss::sstring>{})
      , _cerr(
          po.has_value() ? std::move(po.value().cerr)
                         : chunked_vector<ss::sstring>{})
      , _created_time(md.get_created_at())
      , _finished_time(md.get_finished_at()) {}

    debug_bundle_process() = delete;
    debug_bundle_process(debug_bundle_process&&) = default;
    debug_bundle_process& operator=(debug_bundle_process&&) = default;
    debug_bundle_process(const debug_bundle_process&) = delete;
    debug_bundle_process& operator=(const debug_bundle_process&) = delete;

    /// Asserts that an attached process is no longer running.
    ~debug_bundle_process() {
        if (_rpk_process) {
            vassert(
              !_rpk_process->is_running(),
              "Destroying process struct without waiting for process to "
              "finish");
        }
    }

    /// Asks the attached process to terminate, forwarding \p timeout to it.
    /// Resolves immediately when no process is attached.
    ss::future<> terminate(std::chrono::milliseconds timeout) {
        if (_rpk_process) {
            return _rpk_process->terminate(timeout);
        }
        return ss::now();
    }

    /// Waits for the attached process to finish and records its wait status.
    ///
    /// The finish time is stamped when this coroutine's body exits, on both
    /// the success and the failure path. If waiting throws, the job is
    /// recorded as having exited with code 1 and the original exception is
    /// propagated to the caller.
    ss::future<ss::experimental::process::wait_status> wait() {
        vassert(
          _rpk_process != nullptr,
          "RPK process should be created if calling wait()");
        auto set_finish_time = ss::defer(
          [this] { _finished_time = clock::now(); });
        try {
            co_return _wait_result.emplace(co_await _rpk_process->wait());
        } catch (const std::exception&) {
            _wait_result.emplace(ss::experimental::process::wait_exited{1});
            // Forward the in-flight exception itself. Copying it through a
            // `const std::exception&` would slice it down to the base class
            // and drop both its dynamic type and its message.
            co_return ss::coroutine::return_exception_ptr(
              std::current_exception());
        }
    }

    /// Current state of the job: expired takes precedence; otherwise a
    /// recorded exit code of 0 is success, any other exit code or a signal
    /// is error, and the absence of a wait result means still running.
    debug_bundle_status process_status() const {
        if (_expired) {
            return debug_bundle_status::expired;
        }
        if (_wait_result.has_value()) {
            return ss::visit(
              _wait_result.value(),
              [](ss::experimental::process::wait_exited e) {
                  if (e.exit_code == 0) {
                      return debug_bundle_status::success;
                  } else {
                      return debug_bundle_status::error;
                  }
              },
              [](ss::experimental::process::wait_signaled) {
                  return debug_bundle_status::error;
              });
        }
        return debug_bundle_status::running;
    }

    job_id_t job_id() const { return _job_id; }
    const std::filesystem::path& output_file_path() const {
        return _output_file_path;
    }
    const std::filesystem::path& process_output_file_path() const {
        return _process_output_file_path;
    }
    const chunked_vector<ss::sstring>& cout() const { return _cout; }
    const chunked_vector<ss::sstring>& cerr() const { return _cerr; }
    clock::time_point created_time() const { return _created_time; }
    clock::time_point finished_time() const { return _finished_time; }

    /// Recorded wait status; asserts that one has been set.
    ss::experimental::process::wait_status get_wait_result() const {
        vassert(_wait_result.has_value(), "wait_result must have been set");
        return _wait_result.value();
    }

    void set_expired() { _expired = true; }
    bool is_expired() const { return _expired; }

private:
    job_id_t _job_id;
    // Null when the instance was rebuilt from persisted metadata.
    std::unique_ptr<external_process::external_process> _rpk_process;
    // Empty until the process has been waited on (or restored from metadata).
    std::optional<ss::experimental::process::wait_status> _wait_result;
    std::filesystem::path _output_file_path;
    std::filesystem::path _process_output_file_path;
    // Captured stdout / stderr chunks, in arrival order.
    chunked_vector<ss::sstring> _cout;
    chunked_vector<ss::sstring> _cerr;
    clock::time_point _created_time;
    clock::time_point _finished_time;
    bool _expired{false};
};
// Builds the service around the given kvstore and binds to the configuration
// properties it reads: the bundle storage directory, the rpk path and the
// automatic-removal interval. The output directory is computed once here and
// recomputed whenever the storage-directory property changes.
service::service(storage::kvstore* kvstore)
: _kvstore(kvstore)
, _debug_bundle_dir(form_debug_bundle_storage_directory())
, _debug_bundle_storage_dir_binding(
config::shard_local_cfg().debug_bundle_storage_dir.bind())
, _rpk_path_binding(config::shard_local_cfg().rpk_path.bind())
, _debug_bundle_cleanup_binding(
config::shard_local_cfg().debug_bundle_auto_removal_seconds.bind())
, _process_control_mutex("debug_bundle_service::process_control") {
// Keep the cached directory in sync with the configuration.
_debug_bundle_storage_dir_binding.watch([this] {
_debug_bundle_dir = form_debug_bundle_storage_directory();
lg.debug("Changed debug bundle directory to {}", _debug_bundle_dir);
});
}
// Defaulted destructor, defined out of line in this file.
service::~service() noexcept = default;
// Starts the service. All work happens on service_shard; every other shard
// returns immediately.
ss::future<> service::start() {
if (ss::this_shard_id() != service_shard) {
co_return;
}
// A missing rpk binary is only logged here; startup continues regardless.
if (!co_await ss::file_exists(_rpk_path_binding().native())) {
lg.error(
"Current specified RPK location {} does not exist! Debug "
"bundle creation is not available until this is fixed!",
_rpk_path_binding().native());
}
_probe = std::make_unique<probe>();
_probe->setup_metrics();
// Restore the record of a previous job if the kvstore holds one. This runs
// after the probe is created because the reload reports to it.
co_await maybe_reload_previous_run();
// When the cleanup timer fires, run tick() as a gate-tracked background
// fiber.
_cleanup_timer.set_callback(
[this] { ssx::spawn_with_gate(_gate, [this] { return tick(); }); });
// Re-evaluate the timer whenever the auto-removal setting changes.
_debug_bundle_cleanup_binding.watch([this] {
vlog(
lg.debug,
"debug bundle cleanup timer changed to {}",
_debug_bundle_cleanup_binding());
maybe_rearm_timer();
});
// Arm the timer for a bundle that may have just been reloaded.
maybe_rearm_timer();
lg.debug("Service started");
}
// Stops the service: cancels the cleanup timer, asks a still-running rpk
// process to terminate (on service_shard only), waits for the gate to close,
// and finally releases the process record and the probe.
ss::future<> service::stop() {
lg.debug("Service stopping");
_cleanup_timer.cancel();
if (ss::this_shard_id() == service_shard) {
// The mutex units are scoped to this block, so they are released before the
// gate is closed below.
auto units = co_await _process_control_mutex.get_units();
if (is_running()) {
try {
co_await _rpk_process->terminate(1s);
} catch (const std::exception& e) {
// Best effort: a failed termination is logged and shutdown continues.
lg.warn(
"Failed to terminate running process while stopping service: "
"{}",
e.what());
}
}
}
// Wait for every gate-tracked operation and background fiber to finish
// before the state they use is destroyed.
co_await _gate.close();
_rpk_process.reset(nullptr);
_probe.reset(nullptr);
}
// Launches `rpk debug bundle` for job_id with the given parameters and
// environment. Calls made on another shard are forwarded to service_shard.
//
// Returns an error when the rpk binary is missing, a bundle process is
// already running, the previous run cannot be cleaned up, the output
// directory cannot be created, argument construction fails, or the process
// cannot be started. On success the process has been started and a background
// fiber is waiting for it; this call does not wait for the bundle to finish.
ss::future<result<void>> service::initiate_rpk_debug_bundle_collection(
job_id_t job_id,
debug_bundle_parameters params,
std::vector<ss::sstring> env) {
auto hold = _gate.hold();
if (ss::this_shard_id() != service_shard) {
co_return co_await container().invoke_on(
service_shard,
[job_id, params = std::move(params), env = std::move(env)](
service& s) mutable {
return s.initiate_rpk_debug_bundle_collection(
job_id, std::move(params), std::move(env));
});
}
// Held for the rest of the call so no other operation can observe or change
// _rpk_process while it is being replaced.
auto units = co_await _process_control_mutex.get_units();
if (!co_await ss::file_exists(_rpk_path_binding().native())) {
co_return error_info(
error_code::rpk_binary_not_present,
fmt::format("{} not present", _rpk_path_binding().native()));
}
if (_rpk_process) {
// Must wait for both the process to no longer be running and for the
// wait result to be populated
if (is_running()) {
co_return error_info(
error_code::debug_bundle_process_running,
"Debug process already running");
}
}
// Remove the files and kvstore entry left by the previous job, if any.
try {
co_await cleanup_previous_run();
} catch (const std::exception& e) {
co_return error_info(
error_code::internal_error,
fmt::format("Failed to clean up previous run: {}", e.what()));
}
// Make a copy of it now and use it throughout the initialize process
// Protects against a situation where the config gets changed while setting
// up the initialization parameters
auto output_dir = _debug_bundle_dir;
if (!co_await ss::file_exists(output_dir.native())) {
try {
co_await ss::recursive_touch_directory(output_dir.native());
} catch (const std::exception& e) {
co_return error_info(
error_code::internal_error,
fmt::format(
"Failed to create debug bundle directory {}: {}",
output_dir,
e.what()));
}
}
auto debug_bundle_file_path = form_debug_bundle_file_path(
output_dir, job_id);
auto process_output_path = form_process_output_file_path(
output_dir, job_id);
auto args_res = build_rpk_arguments(
debug_bundle_file_path.native(), std::move(params));
if (!args_res.has_value()) {
co_return args_res.assume_error();
}
auto args = std::move(args_res.assume_value());
// Only pay for building the log line when debug logging is enabled.
if (lg.is_enabled(ss::log_level::debug)) {
print_arguments(args, env);
}
try {
_rpk_process = std::make_unique<debug_bundle_process>(
job_id,
co_await external_process::external_process::create_external_process(
std::move(args), std::move(env)),
std::move(debug_bundle_file_path),
std::move(process_output_path));
} catch (const std::exception& e) {
_rpk_process.reset();
co_return error_info(
error_code::internal_error,
fmt::format("Starting rpk debug bundle failed: {}", e.what()));
}
// Kick off the wait by waiting for the process to finish and then emplacing
// the result
// The background fiber re-acquires the mutex before recording the result;
// a failure of wait() itself is only logged.
ssx::spawn_with_gate(_gate, [this, job_id]() {
return _rpk_process->wait()
.then([this, job_id](auto) {
return _process_control_mutex.get_units().then(
[this, job_id](auto units) {
return handle_wait_result(job_id).finally(
[units = std::move(units)] {});
});
})
.handle_exception([](const std::exception_ptr& e) {
lg.error("wait() failed while running rpk debug bundle: {}", e);
});
});
co_return outcome::success();
}
/// Asks the running bundle process for \p job_id to terminate.
///
/// Calls made on another shard are forwarded to service_shard. Errors are
/// reported when no job was ever started, when the job is not running, when
/// \p job_id does not match the tracked job, or when termination fails.
ss::future<result<void>> service::cancel_rpk_debug_bundle(job_id_t job_id) {
    auto hold = _gate.hold();
    if (ss::this_shard_id() != service_shard) {
        co_return co_await container().invoke_on(
          service_shard, [job_id](service& svc) {
              return svc.cancel_rpk_debug_bundle(job_id);
          });
    }

    auto units = co_await _process_control_mutex.get_units();

    const auto current_status = process_status();
    if (!current_status.has_value()) {
        co_return error_info(error_code::debug_bundle_process_never_started);
    }
    if (!is_running()) {
        co_return error_info(error_code::debug_bundle_process_not_running);
    }

    vassert(
      _rpk_process,
      "_rpk_process should be populated if the process has been executed");

    if (job_id != _rpk_process->job_id()) {
        co_return error_info(error_code::job_id_not_recognized);
    }

    try {
        co_await _rpk_process->terminate(1s);
    } catch (const std::system_error& sys_err) {
        // The process finishing on its own just before the request is
        // reported as "not running" rather than as an internal error.
        const bool already_completed
          = sys_err.code()
            == external_process::error_code::process_already_completed;
        if (already_completed) {
            co_return error_info(error_code::debug_bundle_process_not_running);
        }
        co_return error_info(error_code::internal_error, sys_err.what());
    } catch (const std::exception& err) {
        co_return error_info(error_code::internal_error, err.what());
    }

    co_return outcome::success();
}
/// Reports the state of the tracked bundle job.
///
/// Calls made on another shard are forwarded to service_shard. Returns
/// debug_bundle_process_never_started when no job is tracked, and an
/// internal_error when the size of a successfully produced bundle cannot be
/// read. Otherwise returns the job id, status, creation time, bundle file
/// name, bundle size (successful jobs only) and copies of the captured
/// stdout/stderr.
ss::future<result<debug_bundle_status_data>>
service::rpk_debug_bundle_status() {
    auto hold = _gate.hold();
    if (ss::this_shard_id() != service_shard) {
        co_return co_await container().invoke_on(service_shard, [](service& s) {
            return s.rpk_debug_bundle_status();
        });
    }
    // Serialize with the operations that replace or reset _rpk_process
    // (initiate, delete, tick). This function suspends on ss::file_size()
    // and dereferences _rpk_process afterwards, so without the mutex a
    // concurrent delete could reset the pointer in between.
    auto units = co_await _process_control_mutex.get_units();
    auto status = process_status();
    if (!status.has_value()) {
        co_return error_info(error_code::debug_bundle_process_never_started);
    }
    vassert(
      _rpk_process,
      "_rpk_process should be populated if the process has been executed");
    auto& output_file = _rpk_process->output_file_path().native();
    // Only a successful job is expected to have produced a bundle file.
    std::optional<size_t> file_size;
    if (status.value() == debug_bundle_status::success) {
        try {
            file_size.emplace(co_await ss::file_size(output_file));
        } catch (const std::exception& e) {
            co_return error_info(
              error_code::internal_error,
              fmt::format(
                "Failed to get file size for debug bundle file {}: {}",
                output_file,
                e.what()));
        }
    }
    co_return debug_bundle_status_data{
      .job_id = _rpk_process->job_id(),
      .status = status.value(),
      .created_timestamp = _rpk_process->created_time(),
      .file_name = _rpk_process->output_file_path().filename().native(),
      .file_size = file_size,
      .cout = _rpk_process->cout().copy(),
      .cerr = _rpk_process->cerr().copy()};
}
/// Resolves the on-disk path of the bundle produced by \p job_id.
///
/// Calls made on another shard are forwarded to service_shard. An error is
/// returned when no job is tracked, when the tracked job is still running,
/// failed or has expired, when \p job_id does not match the tracked job, or
/// when the bundle file is no longer present on disk.
ss::future<result<std::filesystem::path>>
service::rpk_debug_bundle_path(job_id_t job_id) {
    // Hold the gate like the other public operations do, so that stop()
    // cannot reset _rpk_process while this call is suspended.
    auto hold = _gate.hold();
    if (ss::this_shard_id() != service_shard) {
        co_return co_await container().invoke_on(
          service_shard,
          [job_id](service& s) { return s.rpk_debug_bundle_path(job_id); });
    }
    auto units = co_await _process_control_mutex.get_units();
    auto status = process_status();
    if (!status.has_value()) {
        co_return error_info(error_code::debug_bundle_process_never_started);
    }
    // Only a successful job has a bundle to hand out.
    switch (status.value()) {
    case debug_bundle_status::running:
        co_return error_info(error_code::debug_bundle_process_running);
    case debug_bundle_status::success:
        break;
    case debug_bundle_status::error:
        co_return error_info(error_code::process_failed);
    case debug_bundle_status::expired:
        co_return error_info(error_code::debug_bundle_expired);
    }
    if (job_id != _rpk_process->job_id()) {
        co_return error_info(error_code::job_id_not_recognized);
    }
    if (!co_await ss::file_exists(_rpk_process->output_file_path().native())) {
        co_return error_info(
          error_code::internal_error,
          fmt::format(
            "Debug bundle file {} not found",
            _rpk_process->output_file_path().native()));
    }
    co_return _rpk_process->output_file_path();
}
/// Deletes the files and stored metadata of the finished job \p job_id and
/// forgets the job.
///
/// Calls made on another shard are forwarded to service_shard. An error is
/// returned when no job is tracked, when the tracked job is still running,
/// or when \p job_id does not match the tracked job. An exception thrown
/// while removing the files propagates to the caller, in which case the job
/// record is kept.
ss::future<result<void>> service::delete_rpk_debug_bundle(job_id_t job_id) {
    // Hold the gate like the other public operations do, so that stop()
    // cannot reset _rpk_process while this call is suspended.
    auto hold = _gate.hold();
    if (ss::this_shard_id() != service_shard) {
        co_return co_await container().invoke_on(
          service_shard,
          [job_id](service& s) { return s.delete_rpk_debug_bundle(job_id); });
    }
    auto units = co_await _process_control_mutex.get_units();
    auto status = process_status();
    if (!status.has_value()) {
        co_return error_info(error_code::debug_bundle_process_never_started);
    }
    switch (status.value()) {
    case debug_bundle_status::running:
        co_return error_info(error_code::debug_bundle_process_running);
    case debug_bundle_status::success:
    case debug_bundle_status::error:
    case debug_bundle_status::expired:
        // Attempt the removal of the file even if the process errored out just
        // in case the file was created
        break;
    }
    if (_rpk_process->job_id() != job_id) {
        co_return error_info(error_code::job_id_not_recognized);
    }
    co_await cleanup_previous_run();
    _rpk_process.reset();
    co_return outcome::success();
}
/// Assembles the argument vector used to launch `rpk debug bundle`.
///
/// The vector starts with the configured rpk path followed by "debug" and
/// "bundle", then the output file and the verbose flag. Each optional field
/// of \p params that is set contributes its flag afterwards, in the fixed
/// order below. This implementation always returns a value.
result<std::vector<ss::sstring>> service::build_rpk_arguments(
  std::string_view debug_bundle_file_path, debug_bundle_parameters params) {
    std::vector<ss::sstring> argv{
      _rpk_path_binding().native(), "debug", "bundle"};
    argv.emplace_back(output_variable);
    argv.emplace_back(debug_bundle_file_path);
    argv.emplace_back(verbose_variable);

    // Authentication: SCRAM passes user, password and mechanism; bearer
    // passes the token through the password option plus the mechanism.
    if (const auto& authn = params.authn_options; authn.has_value()) {
        ss::visit(
          *authn,
          [&argv](const scram_creds& scram) mutable {
              argv.emplace_back(
                ssx::sformat("{}={}", username_variable, scram.username));
              argv.emplace_back(
                ssx::sformat("{}={}", password_variable, scram.password));
              argv.emplace_back(
                ssx::sformat(
                  "{}={}", sasl_mechanism_variable, scram.mechanism));
          },
          [&argv](const bearer_creds& bearer) mutable {
              // rpk accepts -Xpass=token:<TOKEN> for OAUTHBEARER
              argv.emplace_back(
                ssx::sformat("{}=token:{}", password_variable, bearer.token));
              argv.emplace_back(
                ssx::sformat(
                  "{}={}", sasl_mechanism_variable, bearer.mechanism));
          });
    }

    if (const auto& limit = params.controller_logs_size_limit_bytes;
        limit.has_value()) {
        argv.emplace_back(controller_logs_size_limit_variable);
        argv.emplace_back(ssx::sformat("{}B", *limit));
    }

    if (const auto& wait = params.cpu_profiler_wait_seconds;
        wait.has_value()) {
        argv.emplace_back(cpu_profiler_wait_variable);
        argv.emplace_back(ssx::sformat("{}s", wait->count()));
    }

    if (const auto& since = params.logs_since; since.has_value()) {
        argv.emplace_back(logs_since_variable);
        argv.emplace_back(ssx::sformat("{}", *since));
    }

    if (const auto& limit = params.logs_size_limit_bytes; limit.has_value()) {
        argv.emplace_back(logs_size_limit_variable);
        argv.emplace_back(ssx::sformat("{}B", *limit));
    }

    if (const auto& until = params.logs_until; until.has_value()) {
        argv.emplace_back(logs_until_variable);
        argv.emplace_back(ssx::sformat("{}", *until));
    }

    if (const auto& interval = params.metrics_interval_seconds;
        interval.has_value()) {
        argv.emplace_back(metrics_interval_variable);
        argv.emplace_back(ssx::sformat("{}s", interval->count()));
    }

    if (const auto& samples = params.metrics_samples; samples.has_value()) {
        argv.emplace_back(metrics_samples_variable);
        argv.emplace_back(ssx::sformat("{}", *samples));
    }

    // All partitions go into one argument, separated by spaces.
    if (const auto& partitions = params.partition; partitions.has_value()) {
        argv.emplace_back(partition_variable);
        argv.emplace_back(ssx::sformat("{}", fmt::join(*partitions, " ")));
    }

    if (params.tls_enabled.has_value() && *params.tls_enabled) {
        // Only add `-Xtls.enabled=true` if it's selected. RPK ignores
        // the boolean value and will enable TLS if the option is present
        argv.emplace_back(
          ssx::sformat("{}={}", tls_enabled_variable, *params.tls_enabled));
    }

    // Unlike tls.enabled, this option is emitted with whatever value was
    // supplied.
    if (const auto& skip_verify = params.tls_insecure_skip_verify;
        skip_verify.has_value()) {
        argv.emplace_back(
          ssx::sformat(
            "{}={}", tls_insecure_skip_verify_variable, *skip_verify));
    }

    if (const auto& k8s_ns = params.k8s_namespace; k8s_ns.has_value()) {
        argv.emplace_back(k8s_namespace_variable);
        argv.emplace_back(*k8s_ns);
    }

    // An empty selector list produces no flag at all.
    if (const auto& selectors = params.label_selector;
        selectors.has_value() && !selectors->empty()) {
        argv.emplace_back(k8s_label_selector);
        argv.emplace_back(ssx::sformat("{}", fmt::join(*selectors, ",")));
    }

    return argv;
}
/// Removes what the tracked job left behind: its bundle file and its
/// process-output file (each only if present) and then the kvstore entry.
/// Does nothing when no job is tracked.
ss::future<> service::cleanup_previous_run() const {
    if (!_rpk_process) {
        co_return;
    }

    const auto& bundle_path = _rpk_process->output_file_path().native();
    const auto& captured_output_path
      = _rpk_process->process_output_file_path().native();

    const bool bundle_present = co_await ss::file_exists(bundle_path);
    if (bundle_present) {
        vlog(
          lg.debug, "Cleaning up previous debug bundle run {}", bundle_path);
        co_await ss::remove_file(bundle_path);
    }

    const bool output_present = co_await ss::file_exists(
      captured_output_path);
    if (output_present) {
        vlog(
          lg.debug,
          "Cleaning up previous process output run {}",
          captured_output_path);
        co_await ss::remove_file(captured_output_path);
    }

    co_await remove_kvstore_entry();
}
/// Removes the debug bundle metadata entry from the kvstore.
ss::future<> service::remove_kvstore_entry() const {
    auto metadata_key = bytes::from_string(debug_bundle_metadata_key);
    co_await _kvstore->remove(
      storage::kvstore::key_space::debug_bundle, std::move(metadata_key));
}
// Records the outcome of the finished job identified by job_id: updates the
// probe, writes the captured process output to the job's process-output file
// and stores the job metadata in the kvstore.
//
// Returns early, without storing metadata, when a successful run left no
// bundle file behind or when the process output cannot be written.
// Requires _rpk_process to be set and to already hold a wait result.
ss::future<> service::set_metadata(job_id_t job_id) {
auto& debug_bundle_file = _rpk_process->output_file_path();
auto& process_output_file = _rpk_process->process_output_file_path();
auto run_successful = was_run_successful(_rpk_process->get_wait_result());
// Stays empty for failed runs; only a successful run has a file to hash.
bytes sha256_checksum{};
if (run_successful) {
if (!co_await ss::file_exists(debug_bundle_file.native())) {
vlog(
lg.warn,
"Debug bundle file {} does not exist post successful run, cannot "
"set "
"metadata",
debug_bundle_file);
co_return;
}
sha256_checksum = co_await calculate_sha256_sum(
debug_bundle_file.native());
_probe->successful_bundle_generation(_rpk_process->finished_time());
} else {
_probe->failed_bundle_generation(_rpk_process->finished_time());
}
// Serialize the metadata now; it is only written to the kvstore after the
// process output has been saved successfully.
metadata md(
_rpk_process->created_time(),
_rpk_process->finished_time(),
job_id,
debug_bundle_file,
process_output_file,
std::move(sha256_checksum),
_rpk_process->get_wait_result());
iobuf buf;
serde::write(buf, std::move(md));
vlog(lg.debug, "Emplacing metadata into keystore for job {}", job_id);
// The captured stdout/stderr are copied, so the in-memory buffers stay
// available to later status requests.
process_output po{
.cout = _rpk_process->cout().copy(), .cerr = _rpk_process->cerr().copy()};
iobuf po_buf;
serde::write(po_buf, std::move(po));
vlog(
lg.debug,
"Writing process output to {} for job {}",
process_output_file,
job_id);
try {
co_await write_fully(process_output_file, std::move(po_buf));
vlog(
lg.debug,
"Successfully wrote process output to file to {}",
process_output_file.native());
} catch (const std::exception& e) {
// Without the process-output file the metadata is not stored either.
vlog(
lg.warn,
"Failed to write process output to file {} for job {}: {}",
process_output_file,
job_id,
e.what());
co_return;
}
co_await _kvstore->put(
storage::kvstore::key_space::debug_bundle,
bytes::from_string(debug_bundle_metadata_key),
std::move(buf));
}
/// Status of the tracked job, or std::nullopt when no job is tracked.
std::optional<debug_bundle_status> service::process_status() const {
    if (_rpk_process) {
        return _rpk_process->process_status();
    }
    return std::nullopt;
}
/// True when a job is tracked and its status is still `running`.
bool service::is_running() const {
    return _rpk_process != nullptr
           && _rpk_process->process_status() == debug_bundle_status::running;
}
/// Runs after the process for \p job_id has been waited on: persists the
/// job's metadata and re-evaluates the cleanup timer.
///
/// Does nothing beyond logging when the tracked job is no longer \p job_id.
/// A failure to persist the metadata is logged and otherwise ignored.
ss::future<> service::handle_wait_result(job_id_t job_id) {
    vlog(lg.debug, "Wait completed for job {}", job_id);
    // This ensures in the extremely unlikely case where this
    // continuation is called after another debug bundle has been
    // initiated, that we are accessing a present and valid
    // _rpk_process with the same job id
    if (!_rpk_process || _rpk_process->job_id() != job_id) {
        // Log the id the same way as the other messages in this function.
        vlog(
          lg.debug,
          "Unable to enqueue metadata for job {}, "
          "another process already started",
          job_id);
        co_return;
    }
    try {
        co_await set_metadata(job_id);
    } catch (const std::exception& e) {
        vlog(
          lg.warn, "Failed to set metadata for job {}: {}", job_id, e.what());
    }
    // A newly finished bundle may need its expiry timer armed.
    maybe_rearm_timer();
}
// Restores the record of a previously finished job from the kvstore, if one
// is stored there.
//
// The stored job is discarded (its files and the kvstore entry are removed)
// when it is older than the configured auto-removal interval, or when it was
// successful but its bundle file is missing or fails checksum validation.
// Otherwise _rpk_process is rebuilt from the metadata, together with the
// saved process output when that can be read, and the probe is updated.
ss::future<> service::maybe_reload_previous_run() {
auto md_buf = _kvstore->get(
storage::kvstore::key_space::debug_bundle,
bytes::from_string(debug_bundle_metadata_key));
if (!md_buf) {
vlog(lg.debug, "No previous run detected");
co_return;
}
iobuf_parser p(std::move(*md_buf));
auto md = serde::read<metadata>(p);
auto run_was_successful = was_run_successful(md.get_wait_status());
auto& debug_bundle_file_path = md.debug_bundle_file_path;
auto& process_output_file_path = md.process_output_file_path;
// Removes both files recorded in the metadata, each only if it exists.
const auto cleanup_files = [](const metadata& md) -> ss::future<> {
if (co_await ss::file_exists(md.debug_bundle_file_path)) {
co_await ss::remove_file(md.debug_bundle_file_path);
}
if (co_await ss::file_exists(md.process_output_file_path)) {
co_await ss::remove_file(md.process_output_file_path);
}
};
// Evaluated immediately: true when auto-removal is configured and more time
// than that has passed since the job finished.
const auto has_timed_out = [this, &md] {
if (_debug_bundle_cleanup_binding().has_value()) {
auto cleanup_seconds = _debug_bundle_cleanup_binding().value();
auto elapsed = clock::now() - md.get_finished_at();
return elapsed > cleanup_seconds;
}
return false;
}();
if (has_timed_out) {
vlog(
lg.info,
"Previous run for job {} has timed out, will not reload its metadata",
md.job_id);
co_await cleanup_files(md);
co_return co_await remove_kvstore_entry();
}
// A successful run must still have its bundle file on disk.
if (
run_was_successful && !co_await ss::file_exists(debug_bundle_file_path)) {
vlog(
lg.info,
"Debug bundle file {} does not exist, cannot reload metadata",
debug_bundle_file_path);
co_await cleanup_files(md);
co_return co_await remove_kvstore_entry();
}
// ... and that file must match the checksum recorded with the metadata.
if (
run_was_successful
&& !co_await validate_sha256_checksum(
debug_bundle_file_path, md.sha256_checksum)) {
vlog(
lg.info,
"Debug bundle file {} checksum mismatch",
debug_bundle_file_path);
co_await cleanup_files(md);
co_return co_await remove_kvstore_entry();
}
vlog(
lg.info,
"Detected a valid previous run that was {}successful",
run_was_successful ? "" : "un");
// The saved process output is optional: failing to read it only results in
// a job record without captured output.
std::optional<process_output> po;
auto process_output_file_exists = co_await ss::file_exists(
process_output_file_path);
if (process_output_file_exists) {
try {
auto buf = co_await read_fully(
std::filesystem::path(process_output_file_path));
iobuf_parser p(std::move(buf));
po.emplace(serde::read<process_output>(p));
} catch (const std::exception& e) {
vlog(
lg.warn,
"Failed to read process output from {}: {}",
process_output_file_path,
e.what());
}
}
if (!po) {
vlog(
lg.warn,
"Failed to reload process output for job {} from {}. Incomplete "
"metadata reload",
md.job_id,
process_output_file_path);
// An existing but unreadable output file is removed.
if (process_output_file_exists) {
co_await ss::remove_file(process_output_file_path);
}
}
// md is moved from here on; only run_was_successful (a copy) is used below.
_rpk_process = std::make_unique<debug_bundle_process>(
std::move(md), std::move(po));
if (run_was_successful) {
_probe->successful_bundle_generation(_rpk_process->finished_time());
} else {
_probe->failed_bundle_generation(_rpk_process->finished_time());
}
}
/// Cleanup fiber: once a successful bundle has outlived the configured
/// auto-removal interval, removes its files and stored metadata and marks
/// the job as expired.
///
/// Does nothing when auto-removal is not configured, when no job is tracked,
/// when the tracked job is not in the `success` state, or when the interval
/// has not elapsed yet. A failed cleanup is logged and the job is still
/// marked as expired.
ss::future<> service::tick() {
    auto units = co_await _process_control_mutex.get_units();

    if (!_debug_bundle_cleanup_binding().has_value()) {
        co_return;
    }
    if (_rpk_process == nullptr) {
        co_return;
    }
    if (_rpk_process->process_status() != debug_bundle_status::success) {
        co_return;
    }

    const auto retention = _debug_bundle_cleanup_binding().value();
    const auto expires_at = _rpk_process->finished_time() + retention;
    if (clock::now() < expires_at) {
        co_return;
    }

    vlog(
      lg.info, "Cleaning up debug bundle for job {}", _rpk_process->job_id());
    try {
        co_await cleanup_previous_run();
    } catch (const std::exception& err) {
        vlog(
          lg.warn,
          "Failed to clean up previous run for job {}: {}",
          _rpk_process->job_id(),
          err.what());
    }
    _rpk_process->set_expired();
}
/// Brings the cleanup timer in line with the current state.
///
/// The timer is cancelled when auto-removal is not configured or there is no
/// successful bundle to expire. If the bundle's expiry time has already
/// passed, the cleanup fiber is started right away; otherwise the timer is
/// re-armed for the expiry time.
void service::maybe_rearm_timer() {
    const bool cleanup_enabled = _debug_bundle_cleanup_binding().has_value();
    const bool has_successful_bundle
      = cleanup_enabled && _rpk_process != nullptr
        && _rpk_process->process_status() == debug_bundle_status::success;
    if (!has_successful_bundle) {
        // Nothing to expire: make sure no stale timer is left pending.
        _cleanup_timer.cancel();
        return;
    }

    const auto retention = _debug_bundle_cleanup_binding().value();
    // Moment at which the bundle should be removed.
    const auto expires_at = _rpk_process->finished_time() + retention;

    if (expires_at <= clock::now()) {
        // Already overdue, so do not wait for the timer.
        vlog(
          lg.info,
          "Debug bundle has already expired, firing clean up fiber now");
        ssx::spawn_with_gate(_gate, [this] { return tick(); });
        return;
    }

    // The timer runs on the low-resolution clock, so translate the remaining
    // time into that clock's time line.
    const auto remaining = expires_at - clock::now();
    const auto deadline = ss::lowres_clock::now() + remaining;
    vlog(
      lg.debug,
      "Rearming cleanup timer for {}, which is in {} seconds",
      deadline.time_since_epoch(),
      (deadline - ss::lowres_clock::now()) / 1s);
    _cleanup_timer.rearm(deadline);
}
} // namespace debug_bundle