Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions xllm/api_service/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ cc_library(
non_stream_call.h
service_impl_factory.h
serving_mode.h
speech_service_impl.h
stream_call.h
models_service_impl.h
stream_output_parser.h
Expand All @@ -41,6 +42,7 @@ cc_library(
image_generation_service_impl.cpp
models_service_impl.cpp
rerank_service_impl.cpp
speech_service_impl.cpp
stream_output_parser.cpp
qwen3_rerank_service_impl.cpp
embedding_output_builder.cpp
Expand Down Expand Up @@ -124,3 +126,4 @@ cc_test(
)
target_link_libraries(openai_service_test PRIVATE brpc leveldb::leveldb OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf)
add_dependencies(openai_service_test brpc-static)

79 changes: 73 additions & 6 deletions xllm/api_service/api_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,14 @@ limitations under the License.
#include "common.pb.h"
#include "completion.pb.h"
#include "core/common/constants.h"
#include "core/common/global_flags.h"
#include "core/common/metrics.h"
#include "core/common/types.h"
#include "core/distributed_runtime/dit_master.h"
#include "core/distributed_runtime/llm_master.h"
#include "core/distributed_runtime/rec_master.h"
#include "core/distributed_runtime/vlm_master.h"
#include "core/util/closure_guard.h"
#include "embedding.pb.h"
#include "image_generation.pb.h"
#include "models.pb.h"
#include "service_impl_factory.h"
#include "xllm_metrics.h"
namespace xllm {

Expand All @@ -56,7 +53,8 @@ google::protobuf::Arena* GetArenaWithCheck(
}

const char* kSampleNotSupportedError = "/v1/sample is only supported for LLM";

const char* kSpeechNotSupportedError =
"/v1/audio/speech is only supported for VLM";
} // namespace

APIService::APIService(Master* master,
Expand Down Expand Up @@ -440,6 +438,75 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
}
}

// Handles the protobuf-native /v1/audio/speech RPC.
//
// Validates the brpc plumbing, then hands the call off to the speech
// service for asynchronous processing. Fails fast with
// kSpeechNotSupportedError when no speech backend is configured (speech
// is only wired up for VLM serving mode).
void APIService::Speech(::google::protobuf::RpcController* controller,
                        const proto::SpeechRequest* request,
                        proto::SpeechResponse* response,
                        ::google::protobuf::Closure* done) {
  // Lambdas instead of std::bind: explicit captures, no placeholder
  // machinery, easier to read.
  xllm::ClosureGuard done_guard(
      done,
      []() { request_in_metric(nullptr); },
      [controller]() { request_out_metric((void*)controller); });
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null.";
    return;
  }

  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(kSpeechNotSupportedError);
    return;
  }

  // NOTE(review): const_cast assumes SpeechCall does not mutate the
  // caller-owned request — confirm against NonStreamCall's contract.
  auto call =
      std::make_shared<SpeechCall>(ctrl,
                                   done_guard.release(),
                                   const_cast<proto::SpeechRequest*>(request),
                                   response,
                                   /*use_arena=*/true);
  speech_service_impl_->process_async(call);
}

// Handles the HTTP/JSON flavor of /v1/audio/speech.
//
// Parses the JSON body from the brpc request attachment into an
// arena-allocated SpeechRequest, then forwards it to the speech service.
// Fails the RPC when JSON parsing fails or when no speech backend is
// configured (speech is only wired up for VLM serving mode).
void APIService::SpeechHttp(::google::protobuf::RpcController* controller,
                            const proto::HttpRequest* request,
                            proto::HttpResponse* response,
                            ::google::protobuf::Closure* done) {
  // Lambdas instead of std::bind, matching Speech() above.
  xllm::ClosureGuard done_guard(
      done,
      []() { request_in_metric(nullptr); },
      [controller]() { request_out_metric((void*)controller); });
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null";
    return;
  }

  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(kSpeechNotSupportedError);
    return;
  }

  // Allocate the proto messages on the response's arena (when available)
  // so their lifetime is tied to the RPC instead of manual deletion.
  auto arena = GetArenaWithCheck<SpeechCall>(response);
  auto req_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechRequest>(arena);
  auto resp_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechResponse>(arena);

  std::string error;
  json2pb::Json2PbOptions options;
  butil::IOBuf& buf = ctrl->request_attachment();
  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
  // Explicit bool (not auto) for a primitive return type, per style.
  bool st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
  if (!st) {
    ctrl->SetFailed(error);
    LOG(ERROR) << "parse json to proto failed: " << error;
    return;
  }

  auto call = std::make_shared<SpeechCall>(ctrl,
                                           done_guard.release(),
                                           req_pb,
                                           resp_pb,
                                           /*use_arena=*/arena != nullptr);
  speech_service_impl_->process_async(call);
}

void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
const proto::ImageGenerationRequest* request,
proto::ImageGenerationResponse* response,
Expand Down Expand Up @@ -927,7 +994,7 @@ void APIService::WakeupHttp(::google::protobuf::RpcController* controller,
std::vector<WeightSegment> segments;
segments.reserve(seg_list.segments_size());
for (const auto& proto_seg : seg_list.segments()) {
segments.emplace_back(proto_seg.offset(), proto_seg.size());
segments.push_back({proto_seg.offset(), proto_seg.size()});
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Prefer emplace_back over push_back to construct elements in-place and avoid unnecessary copies or temporary objects.

          segments.emplace_back(proto_seg.offset(), proto_seg.size());
References
  1. Prefer emplace_back over push_back to construct elements in-place and avoid unnecessary copies. (link)

}
wakeup_options.src_weight_segments.push_back(std::move(segments));
}
Expand Down
12 changes: 12 additions & 0 deletions xllm/api_service/api_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ limitations under the License.
#include "rec_completion_service_impl.h"
#include "rerank_service_impl.h"
#include "sample_service_impl.h"
#include "speech_service_impl.h"
#include "xllm_service.pb.h"

namespace xllm {
Expand Down Expand Up @@ -86,6 +87,16 @@ class APIService : public proto::XllmAPIService {
proto::HttpResponse* response,
::google::protobuf::Closure* done) override;

void Speech(::google::protobuf::RpcController* controller,
const proto::SpeechRequest* request,
proto::SpeechResponse* response,
::google::protobuf::Closure* done) override;

void SpeechHttp(::google::protobuf::RpcController* controller,
const proto::HttpRequest* request,
proto::HttpResponse* response,
::google::protobuf::Closure* done) override;

void ImageGeneration(::google::protobuf::RpcController* controller,
const proto::ImageGenerationRequest* request,
proto::ImageGenerationResponse* response,
Expand Down Expand Up @@ -202,6 +213,7 @@ class APIService : public proto::XllmAPIService {
std::unique_ptr<MMChatServiceImpl> mm_chat_service_impl_;
std::unique_ptr<EmbeddingServiceImpl> embedding_service_impl_;
std::unique_ptr<MMEmbeddingServiceImpl> mm_embedding_service_impl_;
std::unique_ptr<SpeechServiceImpl> speech_service_impl_;
std::unique_ptr<ModelsServiceImpl> models_service_impl_;
std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
std::unique_ptr<RerankServiceImpl> rerank_service_impl_;
Expand Down
2 changes: 2 additions & 0 deletions xllm/api_service/service_impl_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ void ServiceImplFactory::create(
std::make_unique<MMChatServiceImpl>(vlm_master, models);
self->mm_embedding_service_impl_ =
std::make_unique<MMEmbeddingServiceImpl>(vlm_master, models);
self->speech_service_impl_ =
create_service_impl<SpeechServiceImpl>(vlm_master, models);
}},
{static_cast<int8_t>(ServingMode::DIT),
[](APIService* self,
Expand Down
151 changes: 151 additions & 0 deletions xllm/api_service/speech_service_impl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/* Copyright 2026 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "speech_service_impl.h"

#include <algorithm>
#include <cctype>
#include <string>
#include <utility>

#include "core/common/global_flags.h"
#include "core/framework/request/mm_data.h"
#include "core/framework/request/request_output.h"
#include "core/framework/request/request_params.h"

namespace xllm {
namespace {

constexpr const char* kRuntimeUnavailableMessage =
"/v1/audio/speech decoder is not integrated in this build";

// Returns a copy of |value| with each character lower-cased via
// std::tolower. The cast to unsigned char avoids undefined behavior for
// characters with negative values.
std::string to_lower_ascii(std::string value) {
  for (char& ch : value) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return value;
}

// True when |value| is empty or consists solely of ASCII whitespace
// (space, tab, carriage return, line feed).
bool is_blank(const std::string& value) {
  for (const char ch : value) {
    if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') {
      return false;
    }
  }
  return true;
}

// Whitelist check for the audio container formats accepted in
// response_format (expects an already lower-cased string).
bool is_supported_response_format(const std::string& format) {
  static const char* const kSupported[] = {
      "wav", "pcm", "flac", "mp3", "aac", "opus"};
  return std::any_of(
      std::begin(kSupported), std::end(kSupported), [&format](const char* f) {
        return format == f;
      });
}

// Validates |request| in place and back-fills defaults for omitted fields.
//
// Returns INVALID_ARGUMENT for a null request, a blank input, or an
// unsupported response_format; UNKNOWN when the requested model is not in
// |models|; OK otherwise. On success the request has a resolved model,
// a normalized (lower-case) response_format, and all defaulted fields set.
Status prepare_speech_request(proto::SpeechRequest* request,
                              const absl::flat_hash_set<std::string>& models,
                              const std::string& default_model) {
  if (request == nullptr) {
    return Status(StatusCode::INVALID_ARGUMENT, "speech request is null");
  }
  if (is_blank(request->input())) {
    return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty");
  }

  // Resolve the model (falling back to the default), then verify it is
  // actually served.
  if (!request->has_model() || request->model().empty()) {
    request->set_model(default_model);
  }
  if (!models.contains(request->model())) {
    return Status(StatusCode::UNKNOWN, "Model not supported");
  }

  // Normalize response_format to lower case; unset or empty means "wav".
  std::string format = request->has_response_format()
                           ? to_lower_ascii(request->response_format())
                           : std::string("wav");
  if (format.empty()) {
    format = "wav";
  }
  if (!is_supported_response_format(format)) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "response_format must be one of wav, pcm, flac, mp3, aac, "
                  "opus");
  }
  request->set_response_format(format);

  // Fill remaining defaults for fields the client omitted.
  if (!request->has_speed()) {
    request->set_speed(1.0);
  }
  if (!request->has_stream_format() || request->stream_format().empty()) {
    request->set_stream_format("audio");
  }
  if (!request->has_stream()) {
    request->set_stream(false);
  }
  if (!request->has_task_type() || request->task_type().empty()) {
    // A reference audio clip or speaker embeddings presumably select the
    // voice-cloning path ("Base") — confirm against the model's task docs.
    const bool has_voice_reference =
        request->has_ref_audio() || request->speaker_embedding_size() > 0;
    request->set_task_type(has_voice_reference ? "Base" : "CustomVoice");
  }
  if (!request->has_language() || request->language().empty()) {
    request->set_language("Auto");
  }
  if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) {
    request->set_max_new_tokens(2048);
  }

  return Status();
}

} // namespace

// Binds the speech endpoint to the VLM master and records the served
// model list. When no models were registered, falls back to
// FLAGS_model_id so that requests omitting the model field can still be
// resolved. The master pointer must be non-null (CHECKed) and must
// outlive this service.
SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master,
                                     const std::vector<std::string>& models)
    : APIServiceImpl(models),
      master_(master),
      default_model_(models.empty() ? FLAGS_model_id : models.front()) {
  CHECK(master_ != nullptr);
}

void SpeechServiceImpl::process_async_impl(std::shared_ptr<SpeechCall> call) {
proto::SpeechRequest request = call->request();
auto status = prepare_speech_request(&request, models_, default_model_);
if (!status.ok()) {
call->finish_with_error(status.code(), status.message());
return;
}

RequestParams request_params(
request, call->get_x_request_id(), call->get_x_request_time());

auto prompt = request.input();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Do not use auto for std::string. Use the explicit type to adhere to the project's coding style.

Suggested change
auto prompt = request.input();
std::string prompt = request.input();
References
  1. Do not use auto for simple/primitive types (int32_t, float, bool, std::string, etc.). (link)

if (request.has_instructions() && !is_blank(request.instructions())) {
prompt = request.instructions() + "\n" + request.input();
}

master_->handle_request(
std::move(prompt),
MMData(),
std::move(request_params),
[call](const RequestOutput& req_output) -> bool {
if (req_output.status.has_value()) {
const auto& status = req_output.status.value();
if (!status.ok()) {
return call->finish_with_error(status.code(), status.message());
}
}

if (req_output.finished || req_output.cancelled) {
return call->finish_with_error(StatusCode::UNAVAILABLE,
kRuntimeUnavailableMessage);
}
Comment on lines +143 to +146
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current implementation does not extract audio data from req_output and always returns an error when the request is finished. This prevents the API from actually delivering speech output. Please ensure the audio data is correctly extracted and populated in the response.

return true;
});
}

} // namespace xllm
43 changes: 43 additions & 0 deletions xllm/api_service/speech_service_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Copyright 2026 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <string>
#include <vector>

#include "api_service/api_service_impl.h"
#include "api_service/non_stream_call.h"
#include "core/distributed_runtime/vlm_master.h"
#include "speech.pb.h"

namespace xllm {

using SpeechCall = NonStreamCall<proto::SpeechRequest, proto::SpeechResponse>;

// Service backing the /v1/audio/speech endpoints.
//
// Validates incoming SpeechRequests, applies defaults, and dispatches
// prompt generation to the shared VLMMaster. Non-copyable; constructed
// per serving mode and owned by APIService.
class SpeechServiceImpl final : public APIServiceImpl<SpeechCall> {
 public:
  // |master| must be non-null and outlive this service. |models| lists
  // the model ids accepted by the endpoint; the first entry (or, when
  // empty, a flag-provided id) becomes the default model.
  SpeechServiceImpl(VLMMaster* master, const std::vector<std::string>& models);

  // Processes one speech call asynchronously; invoked by the base class.
  void process_async_impl(std::shared_ptr<SpeechCall> call) override;

 private:
  DISALLOW_COPY_AND_ASSIGN(SpeechServiceImpl);

  // Non-owning pointer to the VLM runtime that drives generation.
  VLMMaster* master_ = nullptr;
  // Fallback model id used when a request omits the model field.
  std::string default_model_;
};

} // namespace xllm
Loading
Loading