Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions xllm/api_service/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ cc_library(
non_stream_call.h
service_impl_factory.h
serving_mode.h
speech_service_impl.h
stream_call.h
models_service_impl.h
stream_output_parser.h
Expand All @@ -41,6 +42,7 @@ cc_library(
image_generation_service_impl.cpp
models_service_impl.cpp
rerank_service_impl.cpp
speech_service_impl.cpp
stream_output_parser.cpp
qwen3_rerank_service_impl.cpp
embedding_output_builder.cpp
Expand Down Expand Up @@ -124,3 +126,4 @@ cc_test(
)
target_link_libraries(openai_service_test PRIVATE brpc leveldb::leveldb OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf)
add_dependencies(openai_service_test brpc-static)

79 changes: 73 additions & 6 deletions xllm/api_service/api_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,14 @@ limitations under the License.
#include "common.pb.h"
#include "completion.pb.h"
#include "core/common/constants.h"
#include "core/common/global_flags.h"
#include "core/common/metrics.h"
#include "core/common/types.h"
#include "core/distributed_runtime/dit_master.h"
#include "core/distributed_runtime/llm_master.h"
#include "core/distributed_runtime/rec_master.h"
#include "core/distributed_runtime/vlm_master.h"
#include "core/util/closure_guard.h"
#include "embedding.pb.h"
#include "image_generation.pb.h"
#include "models.pb.h"
#include "service_impl_factory.h"
#include "xllm_metrics.h"
namespace xllm {

Expand All @@ -56,7 +53,8 @@ google::protobuf::Arena* GetArenaWithCheck(
}

const char* kSampleNotSupportedError = "/v1/sample is only supported for LLM";

const char* kSpeechNotSupportedError =
"/v1/audio/speech is only supported for VLM";
} // namespace

APIService::APIService(Master* master,
Expand Down Expand Up @@ -440,6 +438,75 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
}
}

// Handles the protobuf-native /v1/audio/speech RPC.
//
// Validates the brpc plumbing, then hands the call off to the speech
// service for asynchronous processing. Fails fast with
// kSpeechNotSupportedError when no speech backend is configured (speech
// is only wired up for VLM serving mode).
void APIService::Speech(::google::protobuf::RpcController* controller,
                        const proto::SpeechRequest* request,
                        proto::SpeechResponse* response,
                        ::google::protobuf::Closure* done) {
  // Lambdas instead of std::bind: explicit captures, no placeholder
  // machinery, easier to read.
  xllm::ClosureGuard done_guard(
      done,
      []() { request_in_metric(nullptr); },
      [controller]() { request_out_metric((void*)controller); });
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null.";
    return;
  }

  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(kSpeechNotSupportedError);
    return;
  }

  // NOTE(review): const_cast assumes SpeechCall does not mutate the
  // caller-owned request — confirm against NonStreamCall's contract.
  auto call =
      std::make_shared<SpeechCall>(ctrl,
                                   done_guard.release(),
                                   const_cast<proto::SpeechRequest*>(request),
                                   response,
                                   /*use_arena=*/true);
  speech_service_impl_->process_async(call);
}

// Handles the HTTP/JSON flavor of /v1/audio/speech.
//
// Parses the JSON body from the brpc request attachment into an
// arena-allocated SpeechRequest, then forwards it to the speech service.
// Fails the RPC when JSON parsing fails or when no speech backend is
// configured (speech is only wired up for VLM serving mode).
void APIService::SpeechHttp(::google::protobuf::RpcController* controller,
                            const proto::HttpRequest* request,
                            proto::HttpResponse* response,
                            ::google::protobuf::Closure* done) {
  // Lambdas instead of std::bind, matching Speech() above.
  xllm::ClosureGuard done_guard(
      done,
      []() { request_in_metric(nullptr); },
      [controller]() { request_out_metric((void*)controller); });
  if (!request || !response || !controller) {
    LOG(ERROR) << "brpc request | response | controller is null";
    return;
  }

  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
  if (!speech_service_impl_) {
    ctrl->SetFailed(kSpeechNotSupportedError);
    return;
  }

  // Allocate the proto messages on the response's arena (when available)
  // so their lifetime is tied to the RPC instead of manual deletion.
  auto arena = GetArenaWithCheck<SpeechCall>(response);
  auto req_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechRequest>(arena);
  auto resp_pb =
      google::protobuf::Arena::CreateMessage<proto::SpeechResponse>(arena);

  std::string error;
  json2pb::Json2PbOptions options;
  butil::IOBuf& buf = ctrl->request_attachment();
  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
  // Explicit bool (not auto) for a primitive return type, per style.
  bool st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
  if (!st) {
    ctrl->SetFailed(error);
    LOG(ERROR) << "parse json to proto failed: " << error;
    return;
  }

  auto call = std::make_shared<SpeechCall>(ctrl,
                                           done_guard.release(),
                                           req_pb,
                                           resp_pb,
                                           /*use_arena=*/arena != nullptr);
  speech_service_impl_->process_async(call);
}

void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
const proto::ImageGenerationRequest* request,
proto::ImageGenerationResponse* response,
Expand Down Expand Up @@ -927,7 +994,7 @@ void APIService::WakeupHttp(::google::protobuf::RpcController* controller,
std::vector<WeightSegment> segments;
segments.reserve(seg_list.segments_size());
for (const auto& proto_seg : seg_list.segments()) {
segments.emplace_back(proto_seg.offset(), proto_seg.size());
segments.push_back({proto_seg.offset(), proto_seg.size()});
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Prefer emplace_back over push_back to construct elements in-place and avoid unnecessary copies or temporary objects.

          segments.emplace_back(proto_seg.offset(), proto_seg.size());
References
  1. Prefer emplace_back over push_back to construct elements in-place and avoid unnecessary copies. (link)

}
wakeup_options.src_weight_segments.push_back(std::move(segments));
}
Expand Down
12 changes: 12 additions & 0 deletions xllm/api_service/api_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ limitations under the License.
#include "rec_completion_service_impl.h"
#include "rerank_service_impl.h"
#include "sample_service_impl.h"
#include "speech_service_impl.h"
#include "xllm_service.pb.h"

namespace xllm {
Expand Down Expand Up @@ -86,6 +87,16 @@ class APIService : public proto::XllmAPIService {
proto::HttpResponse* response,
::google::protobuf::Closure* done) override;

void Speech(::google::protobuf::RpcController* controller,
const proto::SpeechRequest* request,
proto::SpeechResponse* response,
::google::protobuf::Closure* done) override;

void SpeechHttp(::google::protobuf::RpcController* controller,
const proto::HttpRequest* request,
proto::HttpResponse* response,
::google::protobuf::Closure* done) override;

void ImageGeneration(::google::protobuf::RpcController* controller,
const proto::ImageGenerationRequest* request,
proto::ImageGenerationResponse* response,
Expand Down Expand Up @@ -202,6 +213,7 @@ class APIService : public proto::XllmAPIService {
std::unique_ptr<MMChatServiceImpl> mm_chat_service_impl_;
std::unique_ptr<EmbeddingServiceImpl> embedding_service_impl_;
std::unique_ptr<MMEmbeddingServiceImpl> mm_embedding_service_impl_;
std::unique_ptr<SpeechServiceImpl> speech_service_impl_;
std::unique_ptr<ModelsServiceImpl> models_service_impl_;
std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
std::unique_ptr<RerankServiceImpl> rerank_service_impl_;
Expand Down
2 changes: 2 additions & 0 deletions xllm/api_service/service_impl_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ void ServiceImplFactory::create(
std::make_unique<MMChatServiceImpl>(vlm_master, models);
self->mm_embedding_service_impl_ =
std::make_unique<MMEmbeddingServiceImpl>(vlm_master, models);
self->speech_service_impl_ =
create_service_impl<SpeechServiceImpl>(vlm_master, models);
}},
{static_cast<int8_t>(ServingMode::DIT),
[](APIService* self,
Expand Down
151 changes: 151 additions & 0 deletions xllm/api_service/speech_service_impl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/* Copyright 2026 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "speech_service_impl.h"

#include <algorithm>
#include <cctype>
#include <string>
#include <utility>

#include "core/common/global_flags.h"
#include "core/framework/request/mm_data.h"
#include "core/framework/request/request_output.h"
#include "core/framework/request/request_params.h"

namespace xllm {
namespace {

constexpr const char* kRuntimeUnavailableMessage =
"/v1/audio/speech decoder is not integrated in this build";

// Returns a copy of |value| with each character lower-cased via
// std::tolower. The cast to unsigned char avoids undefined behavior for
// characters with negative values.
std::string to_lower_ascii(std::string value) {
  for (char& ch : value) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return value;
}

// True when |value| is empty or consists solely of ASCII whitespace
// (space, tab, carriage return, line feed).
bool is_blank(const std::string& value) {
  for (const char ch : value) {
    if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') {
      return false;
    }
  }
  return true;
}

// Whitelist check for the audio container formats accepted in
// response_format (expects an already lower-cased string).
bool is_supported_response_format(const std::string& format) {
  static const char* const kSupported[] = {
      "wav", "pcm", "flac", "mp3", "aac", "opus"};
  return std::any_of(
      std::begin(kSupported), std::end(kSupported), [&format](const char* f) {
        return format == f;
      });
}

// Validates |request| in place and back-fills defaults for omitted fields.
//
// Returns INVALID_ARGUMENT for a null request, a blank input, or an
// unsupported response_format; UNKNOWN when the requested model is not in
// |models|; OK otherwise. On success the request has a resolved model,
// a normalized (lower-case) response_format, and all defaulted fields set.
Status prepare_speech_request(proto::SpeechRequest* request,
                              const absl::flat_hash_set<std::string>& models,
                              const std::string& default_model) {
  if (request == nullptr) {
    return Status(StatusCode::INVALID_ARGUMENT, "speech request is null");
  }
  if (is_blank(request->input())) {
    return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty");
  }

  // Resolve the model (falling back to the default), then verify it is
  // actually served.
  if (!request->has_model() || request->model().empty()) {
    request->set_model(default_model);
  }
  if (!models.contains(request->model())) {
    return Status(StatusCode::UNKNOWN, "Model not supported");
  }

  // Normalize response_format to lower case; unset or empty means "wav".
  std::string format = request->has_response_format()
                           ? to_lower_ascii(request->response_format())
                           : std::string("wav");
  if (format.empty()) {
    format = "wav";
  }
  if (!is_supported_response_format(format)) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "response_format must be one of wav, pcm, flac, mp3, aac, "
                  "opus");
  }
  request->set_response_format(format);

  // Fill remaining defaults for fields the client omitted.
  if (!request->has_speed()) {
    request->set_speed(1.0);
  }
  if (!request->has_stream_format() || request->stream_format().empty()) {
    request->set_stream_format("audio");
  }
  if (!request->has_stream()) {
    request->set_stream(false);
  }
  if (!request->has_task_type() || request->task_type().empty()) {
    // A reference audio clip or speaker embeddings presumably select the
    // voice-cloning path ("Base") — confirm against the model's task docs.
    const bool has_voice_reference =
        request->has_ref_audio() || request->speaker_embedding_size() > 0;
    request->set_task_type(has_voice_reference ? "Base" : "CustomVoice");
  }
  if (!request->has_language() || request->language().empty()) {
    request->set_language("Auto");
  }
  if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) {
    request->set_max_new_tokens(2048);
  }

  return Status();
}

} // namespace

// Binds the speech endpoint to the VLM master and records the served
// model list. When no models were registered, falls back to
// FLAGS_model_id so that requests omitting the model field can still be
// resolved. The master pointer must be non-null (CHECKed) and must
// outlive this service.
SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master,
                                     const std::vector<std::string>& models)
    : APIServiceImpl(models),
      master_(master),
      default_model_(models.empty() ? FLAGS_model_id : models.front()) {
  CHECK(master_ != nullptr);
}

void SpeechServiceImpl::process_async_impl(std::shared_ptr<SpeechCall> call) {
proto::SpeechRequest request = call->request();
auto status = prepare_speech_request(&request, models_, default_model_);
if (!status.ok()) {
call->finish_with_error(status.code(), status.message());
return;
}

RequestParams request_params(
request, call->get_x_request_id(), call->get_x_request_time());

auto prompt = request.input();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Do not use auto for std::string. Use the explicit type to adhere to the project's coding style.

Suggested change
auto prompt = request.input();
std::string prompt = request.input();
References
  1. Do not use auto for simple/primitive types (int32_t, float, bool, std::string, etc.). (link)

if (request.has_instructions() && !is_blank(request.instructions())) {
prompt = request.instructions() + "\n" + request.input();
}

master_->handle_request(
std::move(prompt),
MMData(),
std::move(request_params),
[call](const RequestOutput& req_output) -> bool {
if (req_output.status.has_value()) {
const auto& status = req_output.status.value();
if (!status.ok()) {
return call->finish_with_error(status.code(), status.message());
}
}

if (req_output.finished || req_output.cancelled) {
return call->finish_with_error(StatusCode::UNAVAILABLE,
kRuntimeUnavailableMessage);
}
Comment on lines +143 to +146
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current implementation does not extract audio data from req_output and always returns an error when the request is finished. This prevents the API from actually delivering speech output. Please ensure the audio data is correctly extracted and populated in the response.

return true;
});
}

} // namespace xllm
43 changes: 43 additions & 0 deletions xllm/api_service/speech_service_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Copyright 2026 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <string>
#include <vector>

#include "api_service/api_service_impl.h"
#include "api_service/non_stream_call.h"
#include "core/distributed_runtime/vlm_master.h"
#include "speech.pb.h"

namespace xllm {

using SpeechCall = NonStreamCall<proto::SpeechRequest, proto::SpeechResponse>;

// Service backing the /v1/audio/speech endpoints.
//
// Validates incoming SpeechRequests, applies defaults, and dispatches
// prompt generation to the shared VLMMaster. Non-copyable; constructed
// per serving mode and owned by APIService.
class SpeechServiceImpl final : public APIServiceImpl<SpeechCall> {
 public:
  // |master| must be non-null and outlive this service. |models| lists
  // the model ids accepted by the endpoint; the first entry (or, when
  // empty, a flag-provided id) becomes the default model.
  SpeechServiceImpl(VLMMaster* master, const std::vector<std::string>& models);

  // Processes one speech call asynchronously; invoked by the base class.
  void process_async_impl(std::shared_ptr<SpeechCall> call) override;

 private:
  DISALLOW_COPY_AND_ASSIGN(SpeechServiceImpl);

  // Non-owning pointer to the VLM runtime that drives generation.
  VLMMaster* master_ = nullptr;
  // Fallback model id used when a request omits the model field.
  std::string default_model_;
};

} // namespace xllm
Loading
Loading