diff --git a/xllm/api_service/CMakeLists.txt b/xllm/api_service/CMakeLists.txt index 722ee4ee6..eb126690d 100644 --- a/xllm/api_service/CMakeLists.txt +++ b/xllm/api_service/CMakeLists.txt @@ -21,6 +21,7 @@ cc_library( non_stream_call.h service_impl_factory.h serving_mode.h + speech_service_impl.h stream_call.h models_service_impl.h stream_output_parser.h @@ -41,6 +42,7 @@ cc_library( image_generation_service_impl.cpp models_service_impl.cpp rerank_service_impl.cpp + speech_service_impl.cpp stream_output_parser.cpp qwen3_rerank_service_impl.cpp embedding_output_builder.cpp @@ -124,3 +126,4 @@ cc_test( ) target_link_libraries(openai_service_test PRIVATE brpc leveldb::leveldb OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf) add_dependencies(openai_service_test brpc-static) + diff --git a/xllm/api_service/api_service.cpp b/xllm/api_service/api_service.cpp index bff43b0c2..b6fd76c96 100644 --- a/xllm/api_service/api_service.cpp +++ b/xllm/api_service/api_service.cpp @@ -30,17 +30,14 @@ limitations under the License. 
 #include "common.pb.h"
 #include "completion.pb.h"
 #include "core/common/constants.h"
+#include "core/common/global_flags.h"
 #include "core/common/metrics.h"
 #include "core/common/types.h"
-#include "core/distributed_runtime/dit_master.h"
 #include "core/distributed_runtime/llm_master.h"
-#include "core/distributed_runtime/rec_master.h"
-#include "core/distributed_runtime/vlm_master.h"
 #include "core/util/closure_guard.h"
 #include "embedding.pb.h"
 #include "image_generation.pb.h"
 #include "models.pb.h"
-#include "service_impl_factory.h"
 #include "xllm_metrics.h"
 
 namespace xllm {
@@ -56,7 +53,8 @@ google::protobuf::Arena* GetArenaWithCheck(
 }
 
 const char* kSampleNotSupportedError = "/v1/sample is only supported for LLM";
-
+const char* kSpeechNotSupportedError =
+    "/v1/audio/speech is only supported for VLM";
 }  // namespace
 
 APIService::APIService(Master* master,
@@ -440,6 +438,75 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
   }
 }
 
+void APIService::Speech(::google::protobuf::RpcController* controller,
+                        const proto::SpeechRequest* request,
+                        proto::SpeechResponse* response,
+                        ::google::protobuf::Closure* done) {
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+      std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | response | controller is null.";
+    return;
+  }
+
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
+  if (!speech_service_impl_) {
+    ctrl->SetFailed(kSpeechNotSupportedError);
+    return;
+  }
+
+  auto call =
+      std::make_shared<SpeechCall>(ctrl,
+                                   done_guard.release(),
+                                   const_cast<proto::SpeechRequest*>(request),
+                                   response,
+                                   true);
+  speech_service_impl_->process_async(call);
+}
+
+void APIService::SpeechHttp(::google::protobuf::RpcController* controller,
+                            const proto::HttpRequest* request,
+                            proto::HttpResponse* response,
+                            ::google::protobuf::Closure* done) {
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+
std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | response | controller is null";
+    return;
+  }
+
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
+  if (!speech_service_impl_) {
+    ctrl->SetFailed(kSpeechNotSupportedError);
+    return;
+  }
+
+  auto arena = GetArenaWithCheck(response);
+  auto req_pb =
+      google::protobuf::Arena::CreateMessage<proto::SpeechRequest>(arena);
+  auto resp_pb =
+      google::protobuf::Arena::CreateMessage<proto::SpeechResponse>(arena);
+
+  std::string error;
+  json2pb::Json2PbOptions options;
+  butil::IOBuf& buf = ctrl->request_attachment();
+  butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
+  auto st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
+  if (!st) {
+    ctrl->SetFailed(error);
+    LOG(ERROR) << "parse json to proto failed: " << error;
+    return;
+  }
+
+  auto call = std::make_shared<SpeechCall>(
+      ctrl, done_guard.release(), req_pb, resp_pb, arena != nullptr);
+  speech_service_impl_->process_async(call);
+}
+
 void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
                                  const proto::ImageGenerationRequest* request,
                                  proto::ImageGenerationResponse* response,
@@ -927,7 +994,7 @@ void APIService::WakeupHttp(::google::protobuf::RpcController* controller,
     std::vector segments;
     segments.reserve(seg_list.segments_size());
     for (const auto& proto_seg : seg_list.segments()) {
-      segments.emplace_back(proto_seg.offset(), proto_seg.size());
+      segments.push_back({proto_seg.offset(), proto_seg.size()});
     }
     wakeup_options.src_weight_segments.push_back(std::move(segments));
   }
diff --git a/xllm/api_service/api_service.h b/xllm/api_service/api_service.h
index 43bd054f0..9fe5fc10f 100644
--- a/xllm/api_service/api_service.h
+++ b/xllm/api_service/api_service.h
@@ -30,6 +30,7 @@ limitations under the License.
#include "rec_completion_service_impl.h" #include "rerank_service_impl.h" #include "sample_service_impl.h" +#include "speech_service_impl.h" #include "xllm_service.pb.h" namespace xllm { @@ -86,6 +87,16 @@ class APIService : public proto::XllmAPIService { proto::HttpResponse* response, ::google::protobuf::Closure* done) override; + void Speech(::google::protobuf::RpcController* controller, + const proto::SpeechRequest* request, + proto::SpeechResponse* response, + ::google::protobuf::Closure* done) override; + + void SpeechHttp(::google::protobuf::RpcController* controller, + const proto::HttpRequest* request, + proto::HttpResponse* response, + ::google::protobuf::Closure* done) override; + void ImageGeneration(::google::protobuf::RpcController* controller, const proto::ImageGenerationRequest* request, proto::ImageGenerationResponse* response, @@ -202,6 +213,7 @@ class APIService : public proto::XllmAPIService { std::unique_ptr mm_chat_service_impl_; std::unique_ptr embedding_service_impl_; std::unique_ptr mm_embedding_service_impl_; + std::unique_ptr speech_service_impl_; std::unique_ptr models_service_impl_; std::unique_ptr image_generation_service_impl_; std::unique_ptr rerank_service_impl_; diff --git a/xllm/api_service/service_impl_factory.cpp b/xllm/api_service/service_impl_factory.cpp index 092a5aba4..abb070869 100644 --- a/xllm/api_service/service_impl_factory.cpp +++ b/xllm/api_service/service_impl_factory.cpp @@ -82,6 +82,8 @@ void ServiceImplFactory::create( std::make_unique(vlm_master, models); self->mm_embedding_service_impl_ = std::make_unique(vlm_master, models); + self->speech_service_impl_ = + create_service_impl(vlm_master, models); }}, {static_cast(ServingMode::DIT), [](APIService* self, diff --git a/xllm/api_service/speech_service_impl.cpp b/xllm/api_service/speech_service_impl.cpp new file mode 100644 index 000000000..b82b07df3 --- /dev/null +++ b/xllm/api_service/speech_service_impl.cpp @@ -0,0 +1,151 @@ +/* Copyright 2026 The xLLM Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "speech_service_impl.h" + +#include +#include +#include +#include + +#include "core/common/global_flags.h" +#include "core/framework/request/mm_data.h" +#include "core/framework/request/request_output.h" +#include "core/framework/request/request_params.h" + +namespace xllm { +namespace { + +constexpr const char* kRuntimeUnavailableMessage = + "/v1/audio/speech decoder is not integrated in this build"; + +std::string to_lower_ascii(std::string value) { + std::transform( + value.begin(), value.end(), value.begin(), [](unsigned char ch) { + return std::tolower(ch); + }); + return value; +} + +bool is_blank(const std::string& value) { + return value.find_first_not_of(" \t\r\n") == std::string::npos; +} + +bool is_supported_response_format(const std::string& format) { + return format == "wav" || format == "pcm" || format == "flac" || + format == "mp3" || format == "aac" || format == "opus"; +} + +Status prepare_speech_request(proto::SpeechRequest* request, + const absl::flat_hash_set& models, + const std::string& default_model) { + if (request == nullptr) { + return Status(StatusCode::INVALID_ARGUMENT, "speech request is null"); + } + if (is_blank(request->input())) { + return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty"); + } + + if (!request->has_model() || request->model().empty()) { + 
request->set_model(default_model); + } + if (!models.contains(request->model())) { + return Status(StatusCode::UNKNOWN, "Model not supported"); + } + + std::string response_format = request->has_response_format() + ? to_lower_ascii(request->response_format()) + : "wav"; + if (response_format.empty()) { + response_format = "wav"; + } + if (!is_supported_response_format(response_format)) { + return Status(StatusCode::INVALID_ARGUMENT, + "response_format must be one of wav, pcm, flac, mp3, aac, " + "opus"); + } + request->set_response_format(response_format); + + if (!request->has_speed()) { + request->set_speed(1.0); + } + if (!request->has_stream_format() || request->stream_format().empty()) { + request->set_stream_format("audio"); + } + if (!request->has_stream()) { + request->set_stream(false); + } + if (!request->has_task_type() || request->task_type().empty()) { + const bool is_base = + request->has_ref_audio() || request->speaker_embedding_size() > 0; + request->set_task_type(is_base ? "Base" : "CustomVoice"); + } + if (!request->has_language() || request->language().empty()) { + request->set_language("Auto"); + } + if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) { + request->set_max_new_tokens(2048); + } + + return Status(); +} + +} // namespace + +SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master, + const std::vector& models) + : APIServiceImpl(models), + master_(master), + default_model_(models.empty() ? 
FLAGS_model_id : models.front()) { + CHECK(master_ != nullptr); +} + +void SpeechServiceImpl::process_async_impl(std::shared_ptr call) { + proto::SpeechRequest request = call->request(); + auto status = prepare_speech_request(&request, models_, default_model_); + if (!status.ok()) { + call->finish_with_error(status.code(), status.message()); + return; + } + + RequestParams request_params( + request, call->get_x_request_id(), call->get_x_request_time()); + + auto prompt = request.input(); + if (request.has_instructions() && !is_blank(request.instructions())) { + prompt = request.instructions() + "\n" + request.input(); + } + + master_->handle_request( + std::move(prompt), + MMData(), + std::move(request_params), + [call](const RequestOutput& req_output) -> bool { + if (req_output.status.has_value()) { + const auto& status = req_output.status.value(); + if (!status.ok()) { + return call->finish_with_error(status.code(), status.message()); + } + } + + if (req_output.finished || req_output.cancelled) { + return call->finish_with_error(StatusCode::UNAVAILABLE, + kRuntimeUnavailableMessage); + } + return true; + }); +} + +} // namespace xllm diff --git a/xllm/api_service/speech_service_impl.h b/xllm/api_service/speech_service_impl.h new file mode 100644 index 000000000..edff0b87c --- /dev/null +++ b/xllm/api_service/speech_service_impl.h @@ -0,0 +1,43 @@ +/* Copyright 2026 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#pragma once + +#include +#include + +#include "api_service/api_service_impl.h" +#include "api_service/non_stream_call.h" +#include "core/distributed_runtime/vlm_master.h" +#include "speech.pb.h" + +namespace xllm { + +using SpeechCall = NonStreamCall; + +class SpeechServiceImpl final : public APIServiceImpl { + public: + SpeechServiceImpl(VLMMaster* master, const std::vector& models); + + void process_async_impl(std::shared_ptr call) override; + + private: + DISALLOW_COPY_AND_ASSIGN(SpeechServiceImpl); + + VLMMaster* master_ = nullptr; + std::string default_model_; +}; + +} // namespace xllm diff --git a/xllm/core/framework/request/request_params.cpp b/xllm/core/framework/request/request_params.cpp index 00f3e6be7..b01e525e7 100644 --- a/xllm/core/framework/request/request_params.cpp +++ b/xllm/core/framework/request/request_params.cpp @@ -52,6 +52,11 @@ std::string generate_anthropic_chat_request_id() { short_uuid.random(); } +std::string generate_speech_request_id() { + return "speech-" + InstanceName::name()->get_name_hash() + "-" + + short_uuid.random(); +} + // Handle tool_choice conversion from Anthropic format to internal format std::string handle_tool_choice( const proto::AnthropicMessagesRequest& rpc_request) { @@ -531,6 +536,30 @@ RequestParams::RequestParams(const proto::AnthropicMessagesRequest& request, tools = std::move(handle_tools(request)); } +RequestParams::RequestParams(const proto::SpeechRequest& request, + const std::string& x_rid, + const std::string& x_rtime) { + request_id = generate_speech_request_id(); + x_request_id = x_rid; + x_request_time = x_rtime; + speech_request = request; + + const int32_t kDefaultSpeechMaxNewTokens = 2048; + int32_t max_new_tokens = kDefaultSpeechMaxNewTokens; + if (request.has_max_new_tokens() && request.max_new_tokens() > 0) { + max_new_tokens = request.max_new_tokens(); + } + + max_tokens = static_cast(max_new_tokens); + n = 
1;
+  best_of = 1;
+  temperature = 0.9f;
+  top_k = 50;
+  top_p = 1.0f;
+  repetition_penalty = 1.05f;
+  streaming = request.has_stream() ? request.stream() : false;
+}
+
 bool RequestParams::verify_params(OutputCallback callback) const {
   if (n == 0) {
     CALLBACK_WITH_ERROR(StatusCode::INVALID_ARGUMENT,
diff --git a/xllm/core/framework/request/request_params.h b/xllm/core/framework/request/request_params.h
index 4b249f62d..f0fb6324c 100644
--- a/xllm/core/framework/request/request_params.h
+++ b/xllm/core/framework/request/request_params.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "request_output.h"
 #include "rerank.pb.h"
 #include "sample_slot.h"
+#include "speech.pb.h"
 
 namespace xllm {
 
@@ -61,6 +62,9 @@
   RequestParams(const proto::AnthropicMessagesRequest& request,
                 const std::string& x_rid,
                 const std::string& x_rtime);
+  RequestParams(const proto::SpeechRequest& request,
+                const std::string& x_rid,
+                const std::string& x_rtime);
 
   bool verify_params(OutputCallback callback) const;
 
@@ -166,6 +170,8 @@ struct RequestParams {
   nlohmann::json chat_template_kwargs = nlohmann::json::object();
 
+  std::optional<proto::SpeechRequest> speech_request;
+
   bool is_sample_request = false;
 
   std::vector sample_slots;
diff --git a/xllm/proto/CMakeLists.txt b/xllm/proto/CMakeLists.txt
index 7937705c0..f0011cfa2 100644
--- a/xllm/proto/CMakeLists.txt
+++ b/xllm/proto/CMakeLists.txt
@@ -12,6 +12,7 @@ proto_library(
   chat.proto
   multimodal.proto
   embedding.proto
+  speech.proto
   rerank.proto
   models.proto
   worker.proto
@@ -24,3 +25,4 @@ proto_library(
   anthropic.proto
   xtensor_dist.proto
 )
+
diff --git a/xllm/proto/speech.proto b/xllm/proto/speech.proto
new file mode 100644
index 000000000..a4a5b41a9
--- /dev/null
+++ b/xllm/proto/speech.proto
@@ -0,0 +1,29 @@
+syntax = "proto3";
+
+option go_package = "jd.com/jd-infer/xllm;xllm";
+package xllm.proto;
+
+message SpeechRequest {
+  optional string model = 1;
+  string input = 2;
+  optional string voice = 3;
+  optional string instructions = 4;
+ optional string response_format = 5; + optional double speed = 6; + optional string stream_format = 7; + optional bool stream = 8; + optional string task_type = 9; + optional string language = 10; + optional string ref_audio = 11; + optional string ref_text = 12; + optional bool x_vector_only_mode = 13; + repeated float speaker_embedding = 14; + optional int32 max_new_tokens = 15; + optional int32 initial_codec_chunk_frames = 16; +} + +message SpeechResponse { + bytes audio = 1; + string media_type = 2; +} + diff --git a/xllm/proto/xllm_service.proto b/xllm/proto/xllm_service.proto index 9c5ee4d1c..7d685a710 100644 --- a/xllm/proto/xllm_service.proto +++ b/xllm/proto/xllm_service.proto @@ -9,6 +9,7 @@ import "completion.proto"; import "sample.proto"; import "chat.proto"; import "embedding.proto"; +import "speech.proto"; import "image_generation.proto"; import "rerank.proto"; import "models.proto"; @@ -62,6 +63,9 @@ service XllmAPIService { rpc Embeddings (EmbeddingRequest) returns (EmbeddingResponse); rpc EmbeddingsHttp (HttpRequest) returns (HttpResponse); + rpc Speech (SpeechRequest) returns (SpeechResponse); + rpc SpeechHttp (HttpRequest) returns (HttpResponse); + rpc ImageGeneration(ImageGenerationRequest) returns (ImageGenerationResponse); rpc ImageGenerationHttp(HttpRequest) returns (HttpResponse); @@ -90,3 +94,4 @@ service XllmAPIService { rpc UnlinkD2D (D2DLinkRequest) returns (Status); rpc UnlinkD2DHttp (HttpRequest) returns (HttpResponse); }; + diff --git a/xllm/server/xllm_server.cpp b/xllm/server/xllm_server.cpp index 0934eb608..5f8bc61a6 100644 --- a/xllm/server/xllm_server.cpp +++ b/xllm/server/xllm_server.cpp @@ -37,6 +37,7 @@ constexpr const char* kApiServiceRoutes = "v1/sample => SampleHttp," "v1/chat/completions => ChatCompletionsHttp," "v1/embeddings => EmbeddingsHttp," + "v1/audio/speech => SpeechHttp," "v1/models => ModelsHttp," "v1/image/generation => ImageGenerationHttp," "v1/rerank => RerankHttp,"