Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- `opentelemetry-sdk`: Add `service` resource detector support to declarative file configuration via `detection_development.detectors[].service`
([#5003](https://github.com/open-telemetry/opentelemetry-python/pull/5003))
- `opentelemetry-exporter-otlp-proto-http`: Add experimental opt-in support for handling HTTP 413 (Payload Too Large) responses in trace and log exporters by splitting the batch in half and retrying each half recursively. Enable via `OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413=true`.
([#5032](https://github.com/open-telemetry/opentelemetry-python/pull/5032))

## Version 1.41.0/0.62b0 (2026-04-09)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def _is_retryable(resp: requests.Response) -> bool:
return False


def _is_payload_too_large(resp: requests.Response) -> bool:
    """Tell whether the response carries HTTP status 413 (Payload Too Large).

    Args:
        resp: The HTTP response returned for an export request.

    Returns:
        ``True`` only when ``resp.status_code`` equals 413, ``False`` for
        every other status code.
    """
    status = resp.status_code
    return status == 413


def _load_session_from_envvar(
cred_envvar: Literal[
"OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
Compression,
)
from opentelemetry.exporter.otlp.proto.http._common import (
_is_payload_too_large,
_is_retryable,
_load_session_from_envvar,
)
Expand All @@ -41,6 +42,7 @@
)
from opentelemetry.sdk._shared_internal import DuplicateFilter
from opentelemetry.sdk.environment_variables import (
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_LOGS_CREDENTIAL_PROVIDER,
OTEL_EXPORTER_OTLP_CERTIFICATE,
OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
Expand Down Expand Up @@ -69,6 +71,7 @@
DEFAULT_LOGS_EXPORT_PATH = "v1/logs"
DEFAULT_TIMEOUT = 10 # in seconds
_MAX_RETRYS = 6
_MAX_BISECTS = 5


class OTLPLogExporter(LogRecordExporter):
Expand Down Expand Up @@ -183,8 +186,17 @@ def export(
_logger.warning("Exporter already shutdown, ignoring batch")
return LogRecordExportResult.FAILURE

serialized_data = encode_logs(batch).SerializeToString()
deadline_sec = time() + self._timeout
return self._export_batch(batch, deadline_sec, _MAX_BISECTS)

def _export_batch(
self,
batch: Sequence[ReadableLogRecord],
deadline_sec: float,
remaining_bisects: int,
) -> LogRecordExportResult:
serialized_data = encode_logs(batch).SerializeToString()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Under what scenarios do you need this feature? If you already know the payload limits of your backend, we could bisect based on a configurable max_payload_size, and eliminate the 413 errors entirely.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reactive approach covers cases where the backend limit isn't known upfront or changes without SDK reconfiguration (e.g., proxies). I can tackle the proactive splitting via max_payload_size in a follow-up.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Krishnachaitanyakc Do you have a custom collector implementation that returns 413 error codes? From my understanding the vanilla OpenTelemetry collector does not return 413 errors.

Copy link
Copy Markdown
Contributor Author

@Krishnachaitanyakc Krishnachaitanyakc Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Krishnachaitanyakc
Right, I'm more asking about the following:

Typically the flow for telemetry is:

OTel SDK (Python) -> OTel Collector -> Backend (New Relic, Datadog, etc.)

The only way you'd get a 413 error code is if the collector returned it to the SDK, which from my understanding is not something the collector currently does.

Copy link
Copy Markdown
Contributor Author

@Krishnachaitanyakc Krishnachaitanyakc Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that flow, yes, the vanilla Collector doesn't return 413 to the SDK.

But direct-to-backend export (no Collector) is also a supported deployment pattern, and it appears to be what the original reporter is using in #4533.


for retry_num in range(_MAX_RETRYS):
# multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
Expand All @@ -196,12 +208,24 @@ def export(
reason = error
retryable = isinstance(error, ConnectionError)
status_code = None
bisectable = False
else:
reason = resp.reason
retryable = _is_retryable(resp)
status_code = resp.status_code
bisectable = (
_is_payload_too_large(resp)
and len(batch) > 1
and remaining_bisects > 0
and environ.get(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should ideally be bound to an instance variable in the constructor (e.g. self._retry_entity_too_large)

_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, ""
)
.strip()
.lower()
== "true"
)

if not retryable:
if not retryable and not bisectable:
_logger.error(
"Failed to export logs batch code: %s, reason: %s",
status_code,
Expand All @@ -219,6 +243,34 @@ def export(
"max retries or shutdown."
)
return LogRecordExportResult.FAILURE

if bisectable:
if time() >= deadline_sec or self._shutdown:
_logger.error(
"Payload too large but %s, dropping %d log records",
"shutdown in progress"
if self._shutdown
else "deadline expired",
len(batch),
)
return LogRecordExportResult.FAILURE
Comment on lines +248 to +256
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't be necessary anymore because of lines 212-215.

mid = len(batch) // 2
_logger.warning(
"Payload too large (%d log records), splitting into two batches",
len(batch),
)
first = self._export_batch(
list(batch[:mid]),
deadline_sec,
remaining_bisects - 1,
)
if first != LogRecordExportResult.SUCCESS:
return LogRecordExportResult.FAILURE
return self._export_batch(
list(batch[mid:]),
deadline_sec,
remaining_bisects - 1,
)
_logger.warning(
"Transient error %s encountered while exporting logs batch, retrying in %.2fs.",
reason,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@
Compression,
)
from opentelemetry.exporter.otlp.proto.http._common import (
_is_payload_too_large,
_is_retryable,
_load_session_from_envvar,
)
from opentelemetry.sdk.environment_variables import (
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413,
_OTEL_PYTHON_EXPORTER_OTLP_HTTP_TRACES_CREDENTIAL_PROVIDER,
OTEL_EXPORTER_OTLP_CERTIFICATE,
OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE,
Expand Down Expand Up @@ -65,6 +67,7 @@
DEFAULT_TRACES_EXPORT_PATH = "v1/traces"
DEFAULT_TIMEOUT = 10 # in seconds
_MAX_RETRYS = 6
_MAX_BISECTS = 5


class OTLPSpanExporter(SpanExporter):
Expand Down Expand Up @@ -176,8 +179,17 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
_logger.warning("Exporter already shutdown, ignoring batch")
return SpanExportResult.FAILURE

serialized_data = encode_spans(spans).SerializePartialToString()
deadline_sec = time() + self._timeout
return self._export_batch(spans, deadline_sec, _MAX_BISECTS)

def _export_batch(
self,
spans: Sequence[ReadableSpan],
deadline_sec: float,
remaining_bisects: int,
) -> SpanExportResult:
serialized_data = encode_spans(spans).SerializePartialToString()

for retry_num in range(_MAX_RETRYS):
# multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2)
Expand All @@ -189,12 +201,24 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
reason = error
retryable = isinstance(error, ConnectionError)
status_code = None
bisectable = False
else:
reason = resp.reason
retryable = _is_retryable(resp)
status_code = resp.status_code
bisectable = (
_is_payload_too_large(resp)
and len(spans) > 1
and remaining_bisects > 0
and environ.get(
_OTEL_PYTHON_EXPERIMENTAL_OTLP_RETRY_ON_413, ""
)
.strip()
.lower()
== "true"
)

if not retryable:
if not retryable and not bisectable:
_logger.error(
"Failed to export span batch code: %s, reason: %s",
status_code,
Expand All @@ -212,6 +236,34 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
"max retries or shutdown."
)
return SpanExportResult.FAILURE

if bisectable:
if time() >= deadline_sec or self._shutdown:
_logger.error(
"Payload too large but %s, dropping %d spans",
"shutdown in progress"
if self._shutdown
else "deadline expired",
len(spans),
)
return SpanExportResult.FAILURE
mid = len(spans) // 2
_logger.warning(
"Payload too large (%d spans), splitting into two batches",
len(spans),
)
first = self._export_batch(
list(spans[:mid]),
deadline_sec,
remaining_bisects - 1,
)
if first != SpanExportResult.SUCCESS:
return SpanExportResult.FAILURE
return self._export_batch(
list(spans[mid:]),
deadline_sec,
remaining_bisects - 1,
)
_logger.warning(
"Transient error %s encountered while exporting span batch, retrying in %.2fs.",
reason,
Expand Down
Loading
Loading