From 799c941aae0f94970322c91e87b0a0435510368c Mon Sep 17 00:00:00 2001
From: B-Step62
Date: Sat, 28 Feb 2026 23:56:06 +0900
Subject: [PATCH] Add MLflow tracing observability documentation

Co-Authored-By: Claude
Signed-off-by: B-Step62
---
 docs/source/index.rst               |   7 +
 docs/source/observability/mlflow.md | 194 ++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 docs/source/observability/mlflow.md

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8a88d8f4..62fa79c9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -129,3 +129,10 @@ Join our community by joining our `Discord `__ an
    framework/function_call
    framework/LlamaIndex
    framework/Langchain
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Observability
+   :hidden:
+
+   observability/mlflow
diff --git a/docs/source/observability/mlflow.md b/docs/source/observability/mlflow.md
new file mode 100644
index 00000000..de78818f
--- /dev/null
+++ b/docs/source/observability/mlflow.md
@@ -0,0 +1,194 @@
# MLflow

This guide shows you how to trace your Qwen model calls with [MLflow](https://mlflow.org/), an [open-source](https://github.com/mlflow/mlflow) agent engineering platform.
By enabling MLflow tracing, you can inspect every API call to Qwen, including input prompts, responses, token usage, latencies, and tool invocations, all in a visual trace UI.

Qwen models can be served locally with [vLLM](../deployment/vllm) or [SGLang](../deployment/sglang), both of which expose an OpenAI-compatible API.
MLflow's OpenAI autologging automatically captures traces for all calls made through the OpenAI SDK, so it works seamlessly with locally served Qwen models.

## Setup

### Serve Qwen Locally

First, start a local Qwen model with vLLM or SGLang. Here we use Qwen3-8B as an example:

::::{tab-set}

:::{tab-item} vLLM

```bash
vllm serve Qwen/Qwen3-8B --port 8000 --enable-reasoning --reasoning-parser qwen3
```

:::

:::{tab-item} SGLang

```bash
python -m sglang.launch_server --model-path Qwen/Qwen3-8B --port 8000 --reasoning-parser qwen3
```

:::

::::

For more details on deployment options and model sizes, see [vLLM](../deployment/vllm) and [SGLang](../deployment/sglang).

:::{tip}
You can also use the [DashScope API](https://www.alibabacloud.com/help/en/model-studio/getting-started/) instead of a local server.
In that case, set `base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1"` and `api_key` to your DashScope API key in the examples below.
:::

### Start MLflow

The quickest way to start the MLflow tracking server is with `uvx`:

```bash
uvx "mlflow[genai]" server --port 5000
```

The MLflow UI will be available at `http://localhost:5000`.

:::{note}
There are other ways to set up MLflow, including Docker Compose, pip install, and managed services such as [Databricks](https://www.databricks.com/product/managed-mlflow) and [AWS SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/mlflow.html).
See the [MLflow setup guide](https://mlflow.org/docs/latest/genai/getting-started/connect-environment/) for details.
:::

### Install Dependencies

```bash
pip install "mlflow>=2.21.0" "openai>=1.0.0"
```
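
With the dependencies installed, you can quickly confirm that the Qwen server from the first step is reachable before enabling tracing. A minimal check, assuming the default port used above:

```python
from openai import OpenAI

# List the models served by the local OpenAI-compatible endpoint
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
print([model.id for model in client.models.list().data])
```

This should print the served model name, e.g. `Qwen/Qwen3-8B`.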

## Basic Usage

The following example traces chat completion calls to a locally served Qwen model.
`mlflow.openai.autolog()` enables automatic tracing for all OpenAI SDK calls.

```python
import mlflow
from openai import OpenAI

# Enable automatic tracing
mlflow.openai.autolog()

# Point MLflow to the tracking server
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("qwen-tracing")

# Connect to the local Qwen server
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
)

response = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."},
    ],
    max_tokens=32768,
    temperature=0.6,
    top_p=0.95,
    extra_body={"top_k": 20},
)
print(response.choices[0].message.content)
```

Open `http://localhost:5000` in your browser and navigate to the **qwen-tracing** experiment.
Click on the **Traces** tab to see the recorded trace, which includes input/output messages, token counts, latency, and model parameters.

## Function Calling

MLflow also traces Qwen's function calling flow.
Each step (the initial request, tool calls, and the follow-up response) appears as a separate span in the trace.

```python
import json

import mlflow
from mlflow.entities import SpanType
from openai import OpenAI

mlflow.openai.autolog()
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("qwen-function-calling")

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name, e.g. 'Beijing'",
                    }
                },
                "required": ["location"],
            },
        },
    }
]


@mlflow.trace(span_type=SpanType.TOOL)
def get_weather(location: str) -> dict:
    # In a real application, this would call a weather API
    return {"temperature": "22°C", "condition": "Sunny"}


@mlflow.trace(span_type=SpanType.AGENT)
def run_agent(user_query: str) -> str:
    messages = [{"role": "user", "content": user_query}]

    # First call: the model decides whether to use tools
    response = client.chat.completions.create(
        model="Qwen/Qwen3-8B",
        messages=messages,
        tools=tools,
    )

    if not response.choices[0].message.tool_calls:
        return response.choices[0].message.content

    # Execute tool calls
    messages.append(response.choices[0].message)
    for tool_call in response.choices[0].message.tool_calls:
        result = get_weather(**json.loads(tool_call.function.arguments))
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result),
            }
        )

    # Second call: the model incorporates the tool results
    final_response = client.chat.completions.create(
        model="Qwen/Qwen3-8B",
        messages=messages,
        tools=tools,
    )
    return final_response.choices[0].message.content


print(run_agent("What's the weather like in Hangzhou?"))
```

In the MLflow UI, you can inspect the full function calling flow: the parent `run_agent` span contains the first LLM call, the `get_weather` tool execution, and the second LLM call with the tool result.

## Next Steps

Now you can trace and inspect your Qwen model calls.
For more advanced usage such as streaming, async calls, and custom spans, refer to the following resources:

- [MLflow Qwen Integration Guide](https://mlflow.org/docs/latest/genai/tracing/integrations/listing/qwen/)
- [MLflow Tracing Documentation](https://mlflow.org/docs/latest/genai/tracing/)
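
Traces recorded by the examples above can also be queried programmatically rather than through the UI. A minimal sketch, assuming the same tracking server and the `qwen-tracing` experiment from the Basic Usage example; `mlflow.search_traces` returns the matching traces as a pandas DataFrame:

```python
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("qwen-tracing")

# Fetch the most recent traces from the current experiment
traces = mlflow.search_traces(max_results=5)
print(traces.head())
```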