diff --git a/react/src/components/Chat/ChatCard.tsx b/react/src/components/Chat/ChatCard.tsx index 35fc1bd9ea..8c6d522c0d 100644 --- a/react/src/components/Chat/ChatCard.tsx +++ b/react/src/components/Chat/ChatCard.tsx @@ -238,6 +238,7 @@ const PureChatCard: React.FC = ({ const dropContainerRef = useRef(null); const [fetchKey, updateFetchKey] = useUpdatableState('first'); const [startTime, setStartTime] = useState(null); + const [endTime, setEndTime] = useState(null); const { agents } = useAIAgent(); const agent = agents.find((a) => a.id === chat.provider.agentId); @@ -261,9 +262,6 @@ const PureChatCard: React.FC = ({ const { error, messages, stop, status, sendMessage, setMessages } = useChat({ experimental_throttle: 100, messages: chat.messages, - onFinish: () => { - setStartTime(null); - }, // Because there is an issue(https://github.com/vercel/ai/issues/8956) with useChat that does not run a new transport without an id change, // we have to change the id and use fetch by utilizing useEventNotStable. id: `chat-${baseURL}-${modelId}-${effectiveApiKey}`, @@ -329,9 +327,28 @@ const PureChatCard: React.FC = ({ const isStreaming = status === 'streaming' || status === 'submitted'; + // TPS measurement window follows the standard LLM inference convention: + // start when the first output token arrives (status transitions to + // 'streaming') and stop when streaming ends (success, abort, or error). + // This excludes file upload, network RTT, and prefill (TTFT), so the + // displayed TPS reflects pure decode rate — the same definition used by + // vLLM, Ollama, NVIDIA GenAI-Perf, etc. + useEffect(() => { + if (status === 'streaming' && startTime === null) { + setStartTime(Date.now()); + } + }, [status, startTime]); + + useEffect(() => { + if (!isStreaming && startTime !== null && endTime === null) { + setEndTime(Date.now()); + } + }, [isStreaming, startTime, endTime]); + // Helper function to handle message sending with files const handleSendMessage = async (textContent: string, files?: File[]) => { - setStartTime(Date.now()); + setStartTime(null); + setEndTime(null); const parts: Array< | { type: 'text'; text: string } @@ -540,6 +557,7 @@ const PureChatCard: React.FC = ({ input={input} isStreaming={isStreaming} startTime={startTime} + endTime={endTime} /> = ({ @@ -25,6 +26,7 @@ const ChatMessages: React.FC = ({ input, isStreaming, startTime, + endTime, }) => { const { token } = theme.useToken(); return ( @@ -44,6 +46,7 @@ const ChatMessages: React.FC = ({ messages={messages} input={input} startTime={startTime} + endTime={endTime} /> diff --git a/react/src/components/Chat/ChatTokenCounter.tsx b/react/src/components/Chat/ChatTokenCounter.tsx index a3c6655e69..ca73c29858 100644 --- a/react/src/components/Chat/ChatTokenCounter.tsx +++ b/react/src/components/Chat/ChatTokenCounter.tsx @@ -9,12 +9,13 @@ import { Typography, Tag, Divider } from 'antd'; import { BAIFlex } from 'backend.ai-ui'; import { t } from 'i18next'; import { map, last } from 'lodash-es'; -import React, { useMemo } from 'react'; +import React from 'react'; interface ChatTokenCounterProps { input: string; messages: UIMessage[]; startTime: number | null; + endTime: number | null; style?: React.CSSProperties; } @@ -22,39 +23,35 @@ const ChatTokenCounter: React.FC = ({ input, messages, startTime, + endTime, }) => { + 'use memo'; + const inputTokenCount = useTokenCount(input); - const allChatMessageString = useMemo(() => { - return map(messages, (message) => - message?.parts - ?.filter((part) => part.type === 'text') - .map((part) => part.text) - .join(''), - ).join(''); - }, [messages]); + const allChatMessageString = map(messages, (message) => + message?.parts + ?.filter((part) => part.type === 'text') + .map((part) => part.text) + .join(''), + ).join(''); const chatsTokenCount = useTokenCount(allChatMessageString); const totalTokenCount = inputTokenCount + chatsTokenCount; - const lastAssistantMessageString = useMemo(() => { - const lastAssistantMessage = last(messages); - if (lastAssistantMessage?.role === 'assistant') { - return ( - lastAssistantMessage?.parts + const lastAssistantMessage = last(messages); + const lastAssistantMessageString = + lastAssistantMessage?.role === 'assistant' + ? lastAssistantMessage?.parts ?.filter((part) => part.type === 'text') .map((part) => part.text) .join('') || '' - ); - } else { - return ''; - } - }, [messages]); + : ''; const lastAssistantTokenCount = useTokenCount(lastAssistantMessageString); - const tokenPerSecond = useMemo(() => { - return lastAssistantTokenCount > 0 && startTime - ? // eslint-disable-next-line react-hooks/purity - lastAssistantTokenCount / ((Date.now() - startTime) / 1000) - : 0; - }, [lastAssistantTokenCount, startTime]); + let tokenPerSecond = 0; + if (lastAssistantTokenCount > 0 && startTime) { + // eslint-disable-next-line react-hooks/purity + const elapsedSec = ((endTime ?? Date.now()) - startTime) / 1000; + tokenPerSecond = elapsedSec > 0 ? lastAssistantTokenCount / elapsedSec : 0; + } return (