/* Copyright 2026 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "speech_service_impl.h"

#include <algorithm>
#include <cctype>
#include <string>
#include <utility>

#include "core/common/global_flags.h"
#include "core/framework/request/mm_data.h"
#include "core/framework/request/request_output.h"
#include "core/framework/request/request_params.h"
namespace xllm {
namespace {

// Error message returned to clients whenever generation finishes: this
// build has no audio decoder wired in, so no speech bytes can be produced
// (see the terminal branch of process_async_impl's output callback).
constexpr const char* kRuntimeUnavailableMessage =
    "/v1/audio/speech decoder is not integrated in this build";
| 33 | + |
// Returns |value| with every character lower-cased via std::tolower.
// The parameter is taken by value so the transform can run in place on
// the copy. The lambda's cast through unsigned char avoids undefined
// behavior for negative char values, and the explicit static_cast back
// to char removes the implicit int -> char narrowing from std::tolower's
// int return type.
std::string to_lower_ascii(std::string value) {
  std::transform(
      value.begin(), value.end(), value.begin(), [](unsigned char ch) {
        return static_cast<char>(std::tolower(ch));
      });
  return value;
}
| 41 | + |
// Returns true when |value| is empty or consists only of spaces, tabs,
// carriage returns, or newlines.
bool is_blank(const std::string& value) {
  for (const char ch : value) {
    if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') {
      return false;
    }
  }
  return true;
}
| 45 | + |
// Returns true when |format| names one of the audio containers/codecs
// the speech endpoint accepts. Comparison is exact (callers lower-case
// the value first).
bool is_supported_response_format(const std::string& format) {
  static constexpr const char* kSupportedFormats[] = {
      "wav", "pcm", "flac", "mp3", "aac", "opus"};
  for (const char* candidate : kSupportedFormats) {
    if (format == candidate) {
      return true;
    }
  }
  return false;
}
| 50 | + |
// Validates an incoming /v1/audio/speech request and normalizes it in
// place so downstream code can rely on every optional field being set.
//
// Validation (first failure wins):
//   - null request or blank input text -> INVALID_ARGUMENT
//   - model (after defaulting) not in |models| -> UNKNOWN
//   - response_format outside the supported set -> INVALID_ARGUMENT
// Normalization:
//   - missing/empty model defaults to |default_model|
//   - response_format is lower-cased, defaulting to "wav"
//   - speed, stream, stream_format, task_type, language and
//     max_new_tokens receive server-side defaults when unset
// Returns an OK Status on success.
Status prepare_speech_request(proto::SpeechRequest* request,
                              const absl::flat_hash_set<std::string>& models,
                              const std::string& default_model) {
  if (request == nullptr) {
    return Status(StatusCode::INVALID_ARGUMENT, "speech request is null");
  }
  // Whitespace-only input would produce an empty synthesis prompt.
  if (is_blank(request->input())) {
    return Status(StatusCode::INVALID_ARGUMENT, "input cannot be empty");
  }

  if (!request->has_model() || request->model().empty()) {
    request->set_model(default_model);
  }
  if (!models.contains(request->model())) {
    return Status(StatusCode::UNKNOWN, "Model not supported");
  }

  // Accept any casing from the client; store the canonical lower-case form.
  std::string response_format = request->has_response_format()
                                    ? to_lower_ascii(request->response_format())
                                    : "wav";
  if (response_format.empty()) {
    response_format = "wav";
  }
  if (!is_supported_response_format(response_format)) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "response_format must be one of wav, pcm, flac, mp3, aac, "
                  "opus");
  }
  request->set_response_format(response_format);

  if (!request->has_speed()) {
    request->set_speed(1.0);
  }
  if (!request->has_stream_format() || request->stream_format().empty()) {
    request->set_stream_format("audio");
  }
  if (!request->has_stream()) {
    request->set_stream(false);
  }
  if (!request->has_task_type() || request->task_type().empty()) {
    // Reference audio or speaker embeddings imply the "Base" cloning task;
    // otherwise fall back to the built-in "CustomVoice" task.
    const bool is_base =
        request->has_ref_audio() || request->speaker_embedding_size() > 0;
    request->set_task_type(is_base ? "Base" : "CustomVoice");
  }
  if (!request->has_language() || request->language().empty()) {
    request->set_language("Auto");
  }
  if (!request->has_max_new_tokens() || request->max_new_tokens() <= 0) {
    request->set_max_new_tokens(2048);
  }

  return Status();
}

}  // namespace

// Constructs the speech service. |master| must be non-null (CHECK-enforced)
// and outlive this service; |models| lists the model ids this endpoint
// accepts. Requests that omit "model" fall back to the first entry of
// |models|, or to the --model_id flag when |models| is empty.
SpeechServiceImpl::SpeechServiceImpl(VLMMaster* master,
                                     const std::vector<std::string>& models)
    : APIServiceImpl(models),
      master_(master),
      default_model_(models.empty() ? FLAGS_model_id : models.front()) {
  CHECK(master_ != nullptr);
}
| 114 | + |
// Handles one /v1/audio/speech call: validates and normalizes the request,
// builds a text prompt, and dispatches generation to the master. The output
// callback keeps the shared |call| alive until the request terminates.
// NOTE(review): the terminal branch always fails with UNAVAILABLE because
// no audio decoder is integrated in this build (see
// kRuntimeUnavailableMessage); intermediate outputs are accepted but
// produce no audio.
void SpeechServiceImpl::process_async_impl(std::shared_ptr<SpeechCall> call) {
  // Work on a mutable copy so defaults can be filled in by validation.
  proto::SpeechRequest request = call->request();
  auto status = prepare_speech_request(&request, models_, default_model_);
  if (!status.ok()) {
    call->finish_with_error(status.code(), status.message());
    return;
  }

  RequestParams request_params(
      request, call->get_x_request_id(), call->get_x_request_time());

  // Optional instructions are prepended to the input, separated by a newline.
  auto prompt = request.input();
  if (request.has_instructions() && !is_blank(request.instructions())) {
    prompt = request.instructions() + "\n" + request.input();
  }

  master_->handle_request(
      std::move(prompt),
      MMData(),
      std::move(request_params),
      // Returning false from this callback signals the master to stop
      // producing output for this request.
      [call](const RequestOutput& req_output) -> bool {
        if (req_output.status.has_value()) {
          const auto& status = req_output.status.value();
          if (!status.ok()) {
            return call->finish_with_error(status.code(), status.message());
          }
        }

        // Generation ended (normally or via cancellation): no decoder is
        // available to turn tokens into audio, so report UNAVAILABLE.
        if (req_output.finished || req_output.cancelled) {
          return call->finish_with_error(StatusCode::UNAVAILABLE,
                                         kRuntimeUnavailableMessage);
        }
        return true;
      });
}

}  // namespace xllm