diff --git a/xllm/api_service/CMakeLists.txt b/xllm/api_service/CMakeLists.txt index 722ee4ee6..eb126690d 100644 --- a/xllm/api_service/CMakeLists.txt +++ b/xllm/api_service/CMakeLists.txt @@ -21,6 +21,7 @@ cc_library( non_stream_call.h service_impl_factory.h serving_mode.h + speech_service_impl.h stream_call.h models_service_impl.h stream_output_parser.h @@ -41,6 +42,7 @@ cc_library( image_generation_service_impl.cpp models_service_impl.cpp rerank_service_impl.cpp + speech_service_impl.cpp stream_output_parser.cpp qwen3_rerank_service_impl.cpp embedding_output_builder.cpp @@ -124,3 +126,4 @@ cc_test( ) target_link_libraries(openai_service_test PRIVATE brpc leveldb::leveldb OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) add_dependencies(openai_service_test brpc-static) + diff --git a/xllm/api_service/api_service.cpp b/xllm/api_service/api_service.cpp index bff43b0c2..b6fd76c96 100644 --- a/xllm/api_service/api_service.cpp +++ b/xllm/api_service/api_service.cpp @@ -30,17 +30,14 @@ limitations under the License. 
 #include "common.pb.h"
 #include "completion.pb.h"
 #include "core/common/constants.h"
+#include "core/common/global_flags.h"
 #include "core/common/metrics.h"
 #include "core/common/types.h"
-#include "core/distributed_runtime/dit_master.h"
 #include "core/distributed_runtime/llm_master.h"
-#include "core/distributed_runtime/rec_master.h"
-#include "core/distributed_runtime/vlm_master.h"
 #include "core/util/closure_guard.h"
 #include "embedding.pb.h"
 #include "image_generation.pb.h"
 #include "models.pb.h"
-#include "service_impl_factory.h"
 #include "xllm_metrics.h"
 
 namespace xllm {
@@ -56,7 +53,8 @@ google::protobuf::Arena* GetArenaWithCheck(
 }
 
 const char* kSampleNotSupportedError = "/v1/sample is only supported for LLM";
-
+const char* kSpeechNotSupportedError =
+    "/v1/audio/speech is only supported for VLM";
 }  // namespace
 
 APIService::APIService(Master* master,
@@ -440,6 +438,75 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
   }
 }
 
+void APIService::Speech(::google::protobuf::RpcController* controller,
+                        const proto::SpeechRequest* request,
+                        proto::SpeechResponse* response,
+                        ::google::protobuf::Closure* done) {
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+      std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | response | controller is null.";
+    return;
+  }
+
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
+  if (!speech_service_impl_) {
+    ctrl->SetFailed(kSpeechNotSupportedError);
+    return;
+  }
+
+  auto call =
+      std::make_shared<SpeechCall>(ctrl,
+                                   done_guard.release(),
+                                   const_cast<proto::SpeechRequest*>(request),
+                                   response,
+                                   true);
+  speech_service_impl_->process_async(call);
+}
+
+void APIService::SpeechHttp(::google::protobuf::RpcController* controller,
+                            const proto::HttpRequest* request,
+                            proto::HttpResponse* response,
+                            ::google::protobuf::Closure* done) {
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+
std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | response | controller is null";
+    return;
+  }
+
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
+  if (!speech_service_impl_) {
+    ctrl->SetFailed(kSpeechNotSupportedError);
+    return;
+  }
+
+  auto arena = GetArenaWithCheck(response);
+  auto req_pb =
+      google::protobuf::Arena::CreateMessage<proto::SpeechRequest>(arena);
+  auto resp_pb =
+      google::protobuf::Arena::CreateMessage<proto::SpeechResponse>(arena);
+
+  std::string error;
+  json2pb::Json2PbOptions options;
+  butil::IOBuf& buf = ctrl->request_attachment();
+  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
+  auto st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
+  if (!st) {
+    ctrl->SetFailed(error);
+    LOG(ERROR) << "parse json to proto failed: " << error;
+    return;
+  }
+
+  auto call = std::make_shared<SpeechCall>(
+      ctrl, done_guard.release(), req_pb, resp_pb, arena != nullptr);
+  speech_service_impl_->process_async(call);
+}
+
 void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
                                  const proto::ImageGenerationRequest* request,
                                  proto::ImageGenerationResponse* response,
@@ -927,7 +994,7 @@ void APIService::WakeupHttp(::google::protobuf::RpcController* controller,
     std::vector segments;
     segments.reserve(seg_list.segments_size());
     for (const auto& proto_seg : seg_list.segments()) {
-      segments.emplace_back(proto_seg.offset(), proto_seg.size());
+      segments.push_back({proto_seg.offset(), proto_seg.size()});
     }
     wakeup_options.src_weight_segments.push_back(std::move(segments));
   }
diff --git a/xllm/api_service/api_service.h b/xllm/api_service/api_service.h
index 43bd054f0..9fe5fc10f 100644
--- a/xllm/api_service/api_service.h
+++ b/xllm/api_service/api_service.h
@@ -30,6 +30,7 @@ limitations under the License.
#include "rec_completion_service_impl.h" #include "rerank_service_impl.h" #include "sample_service_impl.h" +#include "speech_service_impl.h" #include "xllm_service.pb.h" namespace xllm { @@ -86,6 +87,16 @@ class APIService : public proto::XllmAPIService { proto::HttpResponse* response, ::google::protobuf::Closure* done) override; + void Speech(::google::protobuf::RpcController* controller, + const proto::SpeechRequest* request, + proto::SpeechResponse* response, + ::google::protobuf::Closure* done) override; + + void SpeechHttp(::google::protobuf::RpcController* controller, + const proto::HttpRequest* request, + proto::HttpResponse* response, + ::google::protobuf::Closure* done) override; + void ImageGeneration(::google::protobuf::RpcController* controller, const proto::ImageGenerationRequest* request, proto::ImageGenerationResponse* response, @@ -202,6 +213,7 @@ class APIService : public proto::XllmAPIService { std::unique_ptr mm_chat_service_impl_; std::unique_ptr embedding_service_impl_; std::unique_ptr mm_embedding_service_impl_; + std::unique_ptr speech_service_impl_; std::unique_ptr models_service_impl_; std::unique_ptr image_generation_service_impl_; std::unique_ptr rerank_service_impl_; diff --git a/xllm/api_service/service_impl_factory.cpp b/xllm/api_service/service_impl_factory.cpp index 092a5aba4..abb070869 100644 --- a/xllm/api_service/service_impl_factory.cpp +++ b/xllm/api_service/service_impl_factory.cpp @@ -82,6 +82,8 @@ void ServiceImplFactory::create( std::make_unique(vlm_master, models); self->mm_embedding_service_impl_ = std::make_unique(vlm_master, models); + self->speech_service_impl_ = + create_service_impl(vlm_master, models); }}, {static_cast(ServingMode::DIT), [](APIService* self, diff --git a/xllm/api_service/speech_service_impl.cpp b/xllm/api_service/speech_service_impl.cpp new file mode 100644 index 000000000..b82b07df3 --- /dev/null +++ b/xllm/api_service/speech_service_impl.cpp @@ -0,0 +1,151 @@ +/* Copyright 2026 The xLLM Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "speech_service_impl.h" + +#include +#include +#include +#include + +#include "core/common/global_flags.h" +#include "core/framework/request/mm_data.h" +#include "core/framework/request/request_output.h" +#include "core/framework/request/request_params.h" + +namespace xllm { +namespace { + +constexpr const char* kRuntimeUnavailableMessage = + "/v1/audio/speech decoder is not integrated in this build"; + +std::string to_lower_ascii(std::string value) { + std::transform( + value.begin(), value.end(), value.begin(), [](unsigned char ch) { + return std::tolower(ch); + }); + return value; +} + +bool is_blank(const std::string& value) { + return value.find_first_not_of(" \t\r\n") == std::string::npos; +} + +bool is_supported_response_format(const std::string& format) { + return format == "wav" || format == "pcm" || format == "flac" || + format == "mp3" || format == "aac" || format == "opus"; +} + +Status prepare_speech_request(proto::SpeechRequest* request, + const absl::flat_hash_set& models, + const std::string& default_model) { + if (request == nullptr) { + return Status(StatusCode::INVALID_ARGUMENT, "speech request is null"); + } + if (is_blank(request->input())) { + return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty"); + } + + if (!request->has_model() || request->model().empty()) { + 
request->set_model(default_model); + } + if (!models.contains(request->model())) { + return Status(StatusCode::UNKNOWN, "Model not supported"); + } + + std::string response_format = request->has_response_format() + ? to_lower_ascii(request->response_format()) + : "wav"; + if (response_format.empty()) { + response_format = "wav"; + } + if (!is_supported_response_format(response_format)) { + return Status(StatusCode::INVALID_ARGUMENT, + "response_format must be one of wav, pcm, flac, mp3, aac, " + "opus"); + } + request->set_response_format(response_format); + + if (!request->has_speed()) { + request->set_speed(1.0); + } + if (!request->has_stream_format() || request->stream_format().empty()) { + request->set_stream_format("audio"); + } + if (!request->has_stream()) { + request->set_stream(false); + } + if (!request->has_task_type() || request->task_type().empty()) { + const bool is_base = + request->has_ref_audio() || request->speaker_embedding_size() > 0; + request->set_task_type(is_base ? "Base" : "CustomVoice"); + } + if (!request->has_language() || request->language().empty()) { + request->set_language("Auto"); + } + if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) { + request->set_max_new_tokens(2048); + } + + return Status(); +} + +} // namespace + +SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master, + const std::vector& models) + : APIServiceImpl(models), + master_(master), + default_model_(models.empty() ? 
FLAGS_model_id : models.front()) { + CHECK(master_ != nullptr); +} + +void SpeechServiceImpl::process_async_impl(std::shared_ptr call) { + proto::SpeechRequest request = call->request(); + auto status = prepare_speech_request(&request, models_, default_model_); + if (!status.ok()) { + call->finish_with_error(status.code(), status.message()); + return; + } + + RequestParams request_params( + request, call->get_x_request_id(), call->get_x_request_time()); + + auto prompt = request.input(); + if (request.has_instructions() && !is_blank(request.instructions())) { + prompt = request.instructions() + "\n" + request.input(); + } + + master_->handle_request( + std::move(prompt), + MMData(), + std::move(request_params), + [call](const RequestOutput& req_output) -> bool { + if (req_output.status.has_value()) { + const auto& status = req_output.status.value(); + if (!status.ok()) { + return call->finish_with_error(status.code(), status.message()); + } + } + + if (req_output.finished || req_output.cancelled) { + return call->finish_with_error(StatusCode::UNAVAILABLE, + kRuntimeUnavailableMessage); + } + return true; + }); +} + +} // namespace xllm diff --git a/xllm/api_service/speech_service_impl.h b/xllm/api_service/speech_service_impl.h new file mode 100644 index 000000000..edff0b87c --- /dev/null +++ b/xllm/api_service/speech_service_impl.h @@ -0,0 +1,43 @@ +/* Copyright 2026 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include +#include + +#include "api_service/api_service_impl.h" +#include "api_service/non_stream_call.h" +#include "core/distributed_runtime/vlm_master.h" +#include "speech.pb.h" + +namespace xllm { + +using SpeechCall = NonStreamCall; + +class SpeechServiceImpl final : public APIServiceImpl { + public: + SpeechServiceImpl(VLMMaster* master, const std::vector& models); + + void process_async_impl(std::shared_ptr call) override; + + private: + DISALLOW_COPY_AND_ASSIGN(SpeechServiceImpl); + + VLMMaster* master_ = nullptr; + std::string default_model_; +}; + +} // namespace xllm diff --git a/xllm/core/framework/request/request_params.cpp b/xllm/core/framework/request/request_params.cpp index 00f3e6be7..b01e525e7 100644 --- a/xllm/core/framework/request/request_params.cpp +++ b/xllm/core/framework/request/request_params.cpp @@ -52,6 +52,11 @@ std::string generate_anthropic_chat_request_id() { short_uuid.random(); } +std::string generate_speech_request_id() { + return "speech-" + InstanceName::name()->get_name_hash() + "-" + + short_uuid.random(); +} + // Handle tool_choice conversion from Anthropic format to internal format std::string handle_tool_choice( const proto::AnthropicMessagesRequest& rpc_request) { @@ -531,6 +536,30 @@ RequestParams::RequestParams(const proto::AnthropicMessagesRequest& request, tools = std::move(handle_tools(request)); } +RequestParams::RequestParams(const proto::SpeechRequest& request, + const std::string& x_rid, + const std::string& x_rtime) { + request_id = generate_speech_request_id(); + x_request_id = x_rid; + x_request_time = x_rtime; + speech_request = request; + + const int32_t kDefaultSpeechMaxNewTokens = 2048; + int32_t max_new_tokens = kDefaultSpeechMaxNewTokens; + if (request.has_max_new_tokens() && request.max_new_tokens() > 0) { + max_new_tokens = request.max_new_tokens(); + } + + max_tokens = static_cast(max_new_tokens); + n = 
1;
+  best_of = 1;
+  temperature = 0.9f;
+  top_k = 50;
+  top_p = 1.0f;
+  repetition_penalty = 1.05f;
+  streaming = request.has_stream() ? request.stream() : false;
+}
+
 bool RequestParams::verify_params(OutputCallback callback) const {
   if (n == 0) {
     CALLBACK_WITH_ERROR(StatusCode::INVALID_ARGUMENT,
diff --git a/xllm/core/framework/request/request_params.h b/xllm/core/framework/request/request_params.h
index 4b249f62d..f0fb6324c 100644
--- a/xllm/core/framework/request/request_params.h
+++ b/xllm/core/framework/request/request_params.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "request_output.h"
 #include "rerank.pb.h"
 #include "sample_slot.h"
+#include "speech.pb.h"
 
 namespace xllm {
 
@@ -61,6 +62,9 @@
   RequestParams(const proto::AnthropicMessagesRequest& request,
                 const std::string& x_rid,
                 const std::string& x_rtime);
+  RequestParams(const proto::SpeechRequest& request,
+                const std::string& x_rid,
+                const std::string& x_rtime);
 
   bool verify_params(OutputCallback callback) const;
 
@@ -166,6 +170,8 @@ struct RequestParams {
   nlohmann::json chat_template_kwargs = nlohmann::json::object();
 
+  std::optional<proto::SpeechRequest> speech_request;
+
   bool is_sample_request = false;
 
   std::vector sample_slots;
diff --git a/xllm/proto/CMakeLists.txt b/xllm/proto/CMakeLists.txt
index 7937705c0..f0011cfa2 100644
--- a/xllm/proto/CMakeLists.txt
+++ b/xllm/proto/CMakeLists.txt
@@ -12,6 +12,7 @@ proto_library(
   chat.proto
   multimodal.proto
   embedding.proto
+  speech.proto
   rerank.proto
   models.proto
   worker.proto
@@ -24,3 +25,4 @@ proto_library(
   anthropic.proto
   xtensor_dist.proto
 )
+
diff --git a/xllm/proto/speech.proto b/xllm/proto/speech.proto
new file mode 100644
index 000000000..a4a5b41a9
--- /dev/null
+++ b/xllm/proto/speech.proto
@@ -0,0 +1,29 @@
+syntax = "proto3";
+
+option go_package = "jd.com/jd-infer/xllm;xllm";
+package xllm.proto;
+
+message SpeechRequest {
+  optional string model = 1;
+  string input = 2;
+  optional string voice = 3;
+  optional string instructions = 4;
+ optional string response_format = 5; + optional double speed = 6; + optional string stream_format = 7; + optional bool stream = 8; + optional string task_type = 9; + optional string language = 10; + optional string ref_audio = 11; + optional string ref_text = 12; + optional bool x_vector_only_mode = 13; + repeated float speaker_embedding = 14; + optional int32 max_new_tokens = 15; + optional int32 initial_codec_chunk_frames = 16; +} + +message SpeechResponse { + bytes audio = 1; + string media_type = 2; +} + diff --git a/xllm/proto/xllm_service.proto b/xllm/proto/xllm_service.proto index 9c5ee4d1c..7d685a710 100644 --- a/xllm/proto/xllm_service.proto +++ b/xllm/proto/xllm_service.proto @@ -9,6 +9,7 @@ import "completion.proto"; import "sample.proto"; import "chat.proto"; import "embedding.proto"; +import "speech.proto"; import "image_generation.proto"; import "rerank.proto"; import "models.proto"; @@ -62,6 +63,9 @@ service XllmAPIService { rpc Embeddings (EmbeddingRequest) returns (EmbeddingResponse); rpc EmbeddingsHttp (HttpRequest) returns (HttpResponse); + rpc Speech (SpeechRequest) returns (SpeechResponse); + rpc SpeechHttp (HttpRequest) returns (HttpResponse); + rpc ImageGeneration(ImageGenerationRequest) returns (ImageGenerationResponse); rpc ImageGenerationHttp(HttpRequest) returns (HttpResponse); @@ -90,3 +94,4 @@ service XllmAPIService { rpc UnlinkD2D (D2DLinkRequest) returns (Status); rpc UnlinkD2DHttp (HttpRequest) returns (HttpResponse); }; + diff --git a/xllm/server/xllm_server.cpp b/xllm/server/xllm_server.cpp index 0934eb608..5f8bc61a6 100644 --- a/xllm/server/xllm_server.cpp +++ b/xllm/server/xllm_server.cpp @@ -37,6 +37,7 @@ constexpr const char* kApiServiceRoutes = "v1/sample => SampleHttp," "v1/chat/completions => ChatCompletionsHttp," "v1/embeddings => EmbeddingsHttp," + "v1/audio/speech => SpeechHttp," "v1/models => ModelsHttp," "v1/image/generation => ImageGenerationHttp," "v1/rerank => RerankHttp,"