Skip to content

Commit b80901f

Browse files
committed
feat: support /v1/audio/speech api request.
1 parent effaada commit b80901f

12 files changed

Lines changed: 356 additions & 6 deletions

xllm/api_service/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ cc_library(
2121
non_stream_call.h
2222
service_impl_factory.h
2323
serving_mode.h
24+
speech_service_impl.h
2425
stream_call.h
2526
models_service_impl.h
2627
stream_output_parser.h
@@ -41,6 +42,7 @@ cc_library(
4142
image_generation_service_impl.cpp
4243
models_service_impl.cpp
4344
rerank_service_impl.cpp
45+
speech_service_impl.cpp
4446
stream_output_parser.cpp
4547
qwen3_rerank_service_impl.cpp
4648
embedding_output_builder.cpp
@@ -124,3 +126,4 @@ cc_test(
124126
)
125127
target_link_libraries(openai_service_test PRIVATE brpc leveldb::leveldb OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf)
126128
add_dependencies(openai_service_test brpc-static)
129+

xllm/api_service/api_service.cpp

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,14 @@ limitations under the License.
3030
#include "common.pb.h"
3131
#include "completion.pb.h"
3232
#include "core/common/constants.h"
33+
#include "core/common/global_flags.h"
3334
#include "core/common/metrics.h"
3435
#include "core/common/types.h"
35-
#include "core/distributed_runtime/dit_master.h"
3636
#include "core/distributed_runtime/llm_master.h"
37-
#include "core/distributed_runtime/rec_master.h"
38-
#include "core/distributed_runtime/vlm_master.h"
3937
#include "core/util/closure_guard.h"
4038
#include "embedding.pb.h"
4139
#include "image_generation.pb.h"
4240
#include "models.pb.h"
43-
#include "service_impl_factory.h"
4441
#include "xllm_metrics.h"
4542
namespace xllm {
4643

@@ -56,7 +53,8 @@ google::protobuf::Arena* GetArenaWithCheck(
5653
}
5754

5855
// Failure text for requests to /v1/sample.
constexpr const char* kSampleNotSupportedError =
    "/v1/sample is only supported for LLM";
// Failure text that Speech()/SpeechHttp() set on the controller when no
// speech service implementation has been created.
// NOTE(review): this name lacks the `k` prefix used by
// kSampleNotSupportedError; renaming it means updating both call sites.
constexpr const char* speechNotSupportedError =
    "/v1/audio/speech is only supported for VLM";
6058
} // namespace
6159

6260
APIService::APIService(Master* master,
@@ -440,6 +438,75 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
440438
}
441439
}
442440

441+
// Protobuf-typed entry point for the speech endpoint.
//
// Wraps the incoming request/response pair in a SpeechCall and hands it to
// speech_service_impl_->process_async(). When no speech service implementation
// exists, the controller is failed with speechNotSupportedError instead.
//
// controller: RPC controller; downcast to brpc::Controller below.
// request:    parsed speech request.
// response:   response message handed to the SpeechCall.
// done:       completion closure; held by `done_guard` until it is released to
//             the SpeechCall.
void APIService::Speech(::google::protobuf::RpcController* controller,
                        const proto::SpeechRequest* request,
                        proto::SpeechResponse* response,
                        ::google::protobuf::Closure* done) {
  xllm::ClosureGuard done_guard(
      done,
      std::bind(request_in_metric, nullptr),
      std::bind(request_out_metric, static_cast<void*>(controller)));
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null.";
    return;
  }

  // brpc::Controller derives from google::protobuf::RpcController, so a
  // static_cast is the correct (and checked-at-compile-time) downcast.
  auto* ctrl = static_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(speechNotSupportedError);
    return;
  }

  // SpeechCall takes a mutable request pointer, hence the const_cast.
  // NOTE(review): the meaning of the trailing `true` is defined by SpeechCall's
  // constructor (not visible here); SpeechHttp passes `arena != nullptr` in the
  // same position - confirm.
  auto call =
      std::make_shared<SpeechCall>(ctrl,
                                   done_guard.release(),
                                   const_cast<proto::SpeechRequest*>(request),
                                   response,
                                   true);
  speech_service_impl_->process_async(call);
}
468+
469+
// HTTP/JSON entry point for the speech endpoint.
//
// Parses the JSON body found in the controller's request attachment into a
// proto::SpeechRequest, wraps it together with a freshly created
// proto::SpeechResponse in a SpeechCall, and hands the call to
// speech_service_impl_->process_async(). Fails the controller when no speech
// service implementation exists or when the JSON cannot be converted.
//
// controller: RPC controller; downcast to brpc::Controller below.
// request:    generic HTTP request message (only null-checked here).
// response:   generic HTTP response message; also the source of the arena.
// done:       completion closure; held by `done_guard` until it is released to
//             the SpeechCall.
void APIService::SpeechHttp(::google::protobuf::RpcController* controller,
                            const proto::HttpRequest* request,
                            proto::HttpResponse* response,
                            ::google::protobuf::Closure* done) {
  xllm::ClosureGuard done_guard(
      done,
      std::bind(request_in_metric, nullptr),
      std::bind(request_out_metric, static_cast<void*>(controller)));
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null";
    return;
  }

  // brpc::Controller derives from google::protobuf::RpcController, so a
  // static_cast is the correct (and checked-at-compile-time) downcast.
  auto* ctrl = static_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(speechNotSupportedError);
    return;
  }

  // NOTE(review): if `arena` is null these messages are presumably
  // heap-allocated; who frees them on the parse-failure return below is not
  // visible from here - confirm against the other *Http handlers.
  auto arena = GetArenaWithCheck<SpeechCall>(response);
  auto req_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechRequest>(arena);
  auto resp_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechResponse>(arena);

  // Convert the JSON request body into the typed protobuf request.
  std::string error;
  json2pb::Json2PbOptions options;
  butil::IOBuf& buf = ctrl->request_attachment();
  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
  auto st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
  if (!st) {
    ctrl->SetFailed(error);
    LOG(ERROR) << "parse json to proto failed: " << error;
    return;
  }

  auto call = std::make_shared<SpeechCall>(
      ctrl, done_guard.release(), req_pb, resp_pb, arena != nullptr);
  speech_service_impl_->process_async(call);
}
509+
443510
void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
444511
const proto::ImageGenerationRequest* request,
445512
proto::ImageGenerationResponse* response,
@@ -927,7 +994,7 @@ void APIService::WakeupHttp(::google::protobuf::RpcController* controller,
927994
std::vector<WeightSegment> segments;
928995
segments.reserve(seg_list.segments_size());
929996
for (const auto& proto_seg : seg_list.segments()) {
930-
segments.emplace_back(proto_seg.offset(), proto_seg.size());
997+
segments.push_back({proto_seg.offset(), proto_seg.size()});
931998
}
932999
wakeup_options.src_weight_segments.push_back(std::move(segments));
9331000
}

xllm/api_service/api_service.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ limitations under the License.
3030
#include "rec_completion_service_impl.h"
3131
#include "rerank_service_impl.h"
3232
#include "sample_service_impl.h"
33+
#include "speech_service_impl.h"
3334
#include "xllm_service.pb.h"
3435

3536
namespace xllm {
@@ -86,6 +87,16 @@ class APIService : public proto::XllmAPIService {
8687
proto::HttpResponse* response,
8788
::google::protobuf::Closure* done) override;
8889

90+
void Speech(::google::protobuf::RpcController* controller,
91+
const proto::SpeechRequest* request,
92+
proto::SpeechResponse* response,
93+
::google::protobuf::Closure* done) override;
94+
95+
void SpeechHttp(::google::protobuf::RpcController* controller,
96+
const proto::HttpRequest* request,
97+
proto::HttpResponse* response,
98+
::google::protobuf::Closure* done) override;
99+
89100
void ImageGeneration(::google::protobuf::RpcController* controller,
90101
const proto::ImageGenerationRequest* request,
91102
proto::ImageGenerationResponse* response,
@@ -202,6 +213,7 @@ class APIService : public proto::XllmAPIService {
202213
std::unique_ptr<MMChatServiceImpl> mm_chat_service_impl_;
203214
std::unique_ptr<EmbeddingServiceImpl> embedding_service_impl_;
204215
std::unique_ptr<MMEmbeddingServiceImpl> mm_embedding_service_impl_;
216+
std::unique_ptr<SpeechServiceImpl> speech_service_impl_;
205217
std::unique_ptr<ModelsServiceImpl> models_service_impl_;
206218
std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
207219
std::unique_ptr<RerankServiceImpl> rerank_service_impl_;

xllm/api_service/service_impl_factory.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ void ServiceImplFactory::create(
8282
std::make_unique<MMChatServiceImpl>(vlm_master, models);
8383
self->mm_embedding_service_impl_ =
8484
std::make_unique<MMEmbeddingServiceImpl>(vlm_master, models);
85+
self->speech_service_impl_ =
86+
create_service_impl<SpeechServiceImpl>(vlm_master, models);
8587
}},
8688
{static_cast<int8_t>(ServingMode::DIT),
8789
[](APIService* self,
xllm/api_service/speech_service_impl.cpp

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/* Copyright 2026 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "speech_service_impl.h"
17+
18+
#include <algorithm>
19+
#include <cctype>
20+
#include <string>
21+
#include <utility>
22+
23+
#include "core/common/global_flags.h"
24+
#include "core/framework/request/mm_data.h"
25+
#include "core/framework/request/request_output.h"
26+
#include "core/framework/request/request_params.h"
27+
28+
namespace xllm {
29+
namespace {
30+
31+
constexpr const char* kRuntimeUnavailableMessage =
32+
"/v1/audio/speech decoder is not integrated in this build";
33+
34+
// Returns `value` with the ASCII letters 'A'-'Z' mapped to 'a'-'z'.
// Every other byte is left untouched, so the result does not depend on the
// process locale (unlike std::tolower).
std::string to_lower_ascii(std::string value) {
  for (char& ch : value) {
    if (ch >= 'A' && ch <= 'Z') {
      ch = static_cast<char>(ch - 'A' + 'a');
    }
  }
  return value;
}
41+
42+
// Returns true when `value` is empty or consists solely of ASCII whitespace
// (space, tab, carriage return, line feed, form feed, vertical tab).
bool is_blank(const std::string& value) {
  static constexpr char kAsciiWhitespace[] = " \t\r\n\f\v";
  return value.find_first_not_of(kAsciiWhitespace) == std::string::npos;
}
45+
46+
// Returns true when `format` exactly matches one of the accepted audio
// container names. The comparison is case-sensitive; callers lower-case the
// value first.
bool is_supported_response_format(const std::string& format) {
  static constexpr const char* kSupportedFormats[] = {
      "wav", "pcm", "flac", "mp3", "aac", "opus"};
  for (const char* candidate : kSupportedFormats) {
    if (format == candidate) {
      return true;
    }
  }
  return false;
}
50+
51+
// Validates a speech request and fills in defaults for unset optional fields.
//
// request:       request to validate; mutated in place (model, response_format,
//                speed, stream_format, stream, task_type, language and
//                max_new_tokens may be written).
// models:        set of model names this service accepts.
// default_model: model name used when the request does not name one.
//
// Returns an OK Status on success. Returns INVALID_ARGUMENT for a null request,
// blank input, an unsupported response_format or an out-of-range speed, and
// UNKNOWN when the (possibly defaulted) model is not in `models`.
Status prepare_speech_request(proto::SpeechRequest* request,
                              const absl::flat_hash_set<std::string>& models,
                              const std::string& default_model) {
  // Speed bounds documented for the OpenAI /v1/audio/speech endpoint.
  constexpr double kMinSpeed = 0.25;
  constexpr double kMaxSpeed = 4.0;

  if (request == nullptr) {
    return Status(StatusCode::INVALID_ARGUMENT, "speech request is null");
  }
  if (is_blank(request->input())) {
    return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty");
  }

  if (!request->has_model() || request->model().empty()) {
    request->set_model(default_model);
  }
  if (!models.contains(request->model())) {
    return Status(StatusCode::UNKNOWN, "Model not supported");
  }

  // response_format is matched case-insensitively and stored lower-cased;
  // an absent or empty value falls back to "wav".
  std::string response_format = request->has_response_format()
                                    ? to_lower_ascii(request->response_format())
                                    : "wav";
  if (response_format.empty()) {
    response_format = "wav";
  }
  if (!is_supported_response_format(response_format)) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "response_format must be one of wav, pcm, flac, mp3, aac, "
                  "opus");
  }
  request->set_response_format(response_format);

  if (!request->has_speed()) {
    request->set_speed(1.0);
  } else {
    // Written as a negated range test so that NaN is rejected as well.
    const double speed = request->speed();
    if (!(speed >= kMinSpeed && speed <= kMaxSpeed)) {
      return Status(StatusCode::INVALID_ARGUMENT,
                    "speed must be between 0.25 and 4.0");
    }
  }
  if (!request->has_stream_format() || request->stream_format().empty()) {
    request->set_stream_format("audio");
  }
  if (!request->has_stream()) {
    request->set_stream(false);
  }
  if (!request->has_task_type() || request->task_type().empty()) {
    // A reference audio clip or a speaker embedding selects the "Base" task;
    // otherwise the request defaults to "CustomVoice".
    const bool is_base =
        request->has_ref_audio() || request->speaker_embedding_size() > 0;
    request->set_task_type(is_base ? "Base" : "CustomVoice");
  }
  if (!request->has_language() || request->language().empty()) {
    request->set_language("Auto");
  }
  // Non-positive token budgets are replaced by the default rather than
  // rejected.
  if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) {
    request->set_max_new_tokens(2048);
  }

  return Status();
}
104+
105+
} // namespace
106+
107+
// Builds the speech service on top of `master`, which must be non-null.
// The first entry of `models` becomes the default model; when `models` is
// empty, FLAGS_model_id is used instead.
SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master,
                                     const std::vector<std::string>& models)
    : APIServiceImpl(models),
      master_(master),
      default_model_(!models.empty() ? models.front() : FLAGS_model_id) {
  // Abort early on a missing master rather than failing on first request.
  CHECK(master_ != nullptr);
}
114+
115+
void SpeechServiceImpl::process_async_impl(std::shared_ptr<SpeechCall> call) {
116+
proto::SpeechRequest request = call->request();
117+
auto status = prepare_speech_request(&request, models_, default_model_);
118+
if (!status.ok()) {
119+
call->finish_with_error(status.code(), status.message());
120+
return;
121+
}
122+
123+
RequestParams request_params(
124+
request, call->get_x_request_id(), call->get_x_request_time());
125+
126+
auto prompt = request.input();
127+
if (request.has_instructions() && !is_blank(request.instructions())) {
128+
prompt = request.instructions() + "\n" + request.input();
129+
}
130+
131+
master_->handle_request(
132+
std::move(prompt),
133+
MMData(),
134+
std::move(request_params),
135+
[call](const RequestOutput& req_output) -> bool {
136+
if (req_output.status.has_value()) {
137+
const auto& status = req_output.status.value();
138+
if (!status.ok()) {
139+
return call->finish_with_error(status.code(), status.message());
140+
}
141+
}
142+
143+
if (req_output.finished || req_output.cancelled) {
144+
return call->finish_with_error(StatusCode::UNAVAILABLE,
145+
kRuntimeUnavailableMessage);
146+
}
147+
return true;
148+
});
149+
}
150+
151+
} // namespace xllm
xllm/api_service/speech_service_impl.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/* Copyright 2026 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#pragma once
17+
18+
#include <string>
19+
#include <vector>
20+
21+
#include "api_service/api_service_impl.h"
22+
#include "api_service/non_stream_call.h"
23+
#include "core/distributed_runtime/vlm_master.h"
24+
#include "speech.pb.h"
25+
26+
namespace xllm {
27+
28+
using SpeechCall = NonStreamCall<proto::SpeechRequest, proto::SpeechResponse>;
29+
30+
// Service implementation for speech requests, driven by a VLMMaster.
// Calls are carried as SpeechCall objects (a NonStreamCall over
// proto::SpeechRequest / proto::SpeechResponse).
class SpeechServiceImpl final : public APIServiceImpl<SpeechCall> {
 public:
  // `master` is stored as a raw, non-owning pointer; `models` is forwarded to
  // the APIServiceImpl base.
  SpeechServiceImpl(VLMMaster* master, const std::vector<std::string>& models);

  // Handles one speech call; see the definition for the processing steps.
  void process_async_impl(std::shared_ptr<SpeechCall> call) override;

 private:
  DISALLOW_COPY_AND_ASSIGN(SpeechServiceImpl);

  // Master that requests are submitted to (not owned by this class).
  VLMMaster* master_ = nullptr;
  // Model name applied to requests that do not specify one.
  std::string default_model_;
};
42+
43+
} // namespace xllm

0 commit comments

Comments
 (0)