Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ jobs:
- ChunkCodecCore/**
- ChunkCodecTests/**
- LibLz4/**
LibLzma:
- ChunkCodecCore/**
- ChunkCodecTests/**
- LibLzma/**
LibSnappy:
- ChunkCodecCore/**
- ChunkCodecTests/**
Expand Down
11 changes: 11 additions & 0 deletions LibLzma/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Release Notes

All notable changes to this package will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## Unreleased

### Added

- Initial release
21 changes: 21 additions & 0 deletions LibLzma/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Nathan Zimmerberg

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
16 changes: 16 additions & 0 deletions LibLzma/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name = "ChunkCodecLibLzma"
uuid = "e95d29e5-19c5-4afd-ae0f-beb790efacdf"
version = "0.1.0"
authors = ["nhz2 <nhz2@cornell.edu>"]

[workspace]
projects = ["test"]

[deps]
ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1"
XZ_jll = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"

[compat]
ChunkCodecCore = "1"
XZ_jll = "5"
julia = "1.6"
24 changes: 24 additions & 0 deletions LibLzma/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# ChunkCodecLibLzma

This package implements the ChunkCodec interface for the following encoders and decoders
using the liblzma C library <https://tukaani.org/xz/>:

1. `XZCodec`, `XZEncodeOptions`, `XZDecodeOptions`

## Example

```julia-repl
julia> using ChunkCodecLibLzma

julia> data = [0x00, 0x01, 0x02, 0x03];

julia> compressed_data = encode(XZEncodeOptions(;preset=UInt32(6), check=ChunkCodecLibLzma.LZMA_CHECK_CRC64), data);
Comment thread
nhz2 marked this conversation as resolved.
Outdated

julia> decompressed_data = decode(XZCodec(), compressed_data; max_size=length(data), size_hint=length(data));

julia> data == decompressed_data
true
```

The low level interface is defined in the `ChunkCodecCore` package.

66 changes: 66 additions & 0 deletions LibLzma/src/ChunkCodecLibLzma.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Package entry point: exposes xz compression/decompression from the liblzma
# C library through the ChunkCodecCore codec interface.
module ChunkCodecLibLzma

# Handle for the liblzma shared library; it is the target of the `@ccall`s in decode.jl.
using XZ_jll: liblzma

# Types and helpers from ChunkCodecCore that are used here but not extended.
using ChunkCodecCore:
Codec,
EncodeOptions,
DecodeOptions,
check_in_range,
check_contiguous,
grow_dst!,
DecodingError,
MaybeSize,
NOT_SIZE
# Interface functions are brought in with `import` (rather than `using`) so that
# methods for the xz option types can be added to them without qualifying the names.
import ChunkCodecCore:
decode_options,
can_concatenate,
try_decode!,
try_resize_decode!,
try_encode!,
encode_bound,
try_find_decoded_size,
decoded_size_range

export XZCodec,
XZEncodeOptions,
XZDecodeOptions,
LZMADecodingError

# The `public` statement is built from a string and only evaluated when VERSION is
# new enough, so Julia versions below the threshold never have to parse it.
# The listed constants become part of the public API without being exported.
if VERSION >= v"1.11.0-DEV.469"
eval(Meta.parse("""
public
LZMA_PRESET_LEVEL_MASK,
LZMA_PRESET_EXTREME,
LZMA_CHECK_NONE,
LZMA_CHECK_CRC32,
LZMA_CHECK_CRC64,
LZMA_CHECK_SHA256
"""))
end



# reexport ChunkCodecCore
# so that `encode`/`decode` are available after `using ChunkCodecLibLzma` alone.
using ChunkCodecCore: ChunkCodecCore, encode, decode
export ChunkCodecCore, encode, decode


# NOTE(review): liblzma.jl is not visible in this view; presumably it defines the
# LZMA_* constants and the lzma_stream / lzma_allocator wrappers that decode.jl
# relies on, which is why it is included first — confirm.
include("liblzma.jl")

"""
struct XZCodec <: Codec
XZCodec()

xz compression using the liblzma C library <https://tukaani.org/xz/>

See also [`XZEncodeOptions`](@ref) and [`XZDecodeOptions`](@ref)
"""
struct XZCodec <: Codec
end
# XZDecodeOptions is defined in decode.jl (included below); the name is only
# resolved when this method is called, so the forward reference is fine.
decode_options(::XZCodec) = XZDecodeOptions()

include("encode.jl")
include("decode.jl")

end # module ChunkCodecLibLzma
145 changes: 145 additions & 0 deletions LibLzma/src/decode.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""
LZMADecodingError(code)

Error for data that cannot be decoded.
"""
struct LZMADecodingError <: DecodingError
# Raw liblzma return code. `try_resize_decode!` stores LZMA_DATA_ERROR,
# LZMA_FORMAT_ERROR, LZMA_OPTIONS_ERROR or LZMA_BUF_ERROR here;
# `Base.showerror` turns the code into a readable message.
code::Cint
end

# Print a human readable description of an `LZMADecodingError` to `io`.
#
# The output always starts with "LZMADecodingError: ", followed by a message
# chosen from the stored liblzma return code. Codes without a dedicated
# message are reported numerically. Returns `nothing`.
function Base.showerror(io::IO, err::LZMADecodingError)
    code = err.code
    # Select the description first, then emit everything with a single `print`.
    detail = if code == LZMA_DATA_ERROR
        "LZMA_DATA_ERROR: data is corrupt"
    elseif code == LZMA_FORMAT_ERROR
        "LZMA_FORMAT_ERROR: file format not recognized"
    elseif code == LZMA_OPTIONS_ERROR
        "LZMA_OPTIONS_ERROR: reserved bits set in headers. Data corrupt, or upgrading liblzma may help"
    elseif code == LZMA_BUF_ERROR
        "LZMA_BUF_ERROR: the compressed stream may be truncated or corrupt"
    else
        string("unknown lzma error code: ", code)
    end
    print(io, "LZMADecodingError: ", detail)
    return nothing
end

"""
struct XZDecodeOptions <: DecodeOptions
XZDecodeOptions(; kwargs...)

xz decompression using the liblzma C library <https://tukaani.org/xz/>

Like the command line tool `xz`, decoding accepts concatenated and padded compressed data and returns the decompressed data concatenated.

# Keyword Arguments

- `codec::XZCodec=XZCodec()`
"""
struct XZDecodeOptions <: DecodeOptions
    # The codec these options belong to; `XZCodec` has no fields of its own.
    codec::XZCodec
end

# Keyword constructor. Only `codec` is used; any other keyword arguments are
# accepted and ignored.
XZDecodeOptions(; codec::XZCodec=XZCodec(), kwargs...) = XZDecodeOptions(codec)

# Declares that these decode options accept concatenated compressed data.
can_concatenate(::XZDecodeOptions) = true

# The decoded size is never reported for xz data: this method always returns `nothing`.
function try_find_decoded_size(::XZDecodeOptions, src::AbstractVector{UInt8})::Nothing
    # In principle the size could be recovered by walking the stream index,
    # but padding and concatenated streams make that complicated, so it is
    # not attempted here.
    return nothing
end

# Decode `src` into `dst` by delegating to `try_resize_decode!`.
#
# The current length of `dst` is passed as `max_size`, so the resizing path is
# given no room beyond the existing buffer (assuming `grow_dst!` honors
# `max_size` — its definition lives in ChunkCodecCore). The result of
# `try_resize_decode!` is returned unchanged.
function try_decode!(d::XZDecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; kwargs...)::MaybeSize
    size_cap = Int64(length(dst))
    return try_resize_decode!(d, dst, src, size_cap)
end

# Decode the xz data in `src` into `dst`, growing `dst` through `grow_dst!`
# (bounded by `max_size`) whenever the output buffer fills up.
#
# Returns the number of decoded bytes written to the start of `dst`, or
# `NOT_SIZE` when `dst` is full and `grow_dst!` returns `nothing`.
#
# Throws `LZMADecodingError` when `src` is empty or when liblzma reports
# LZMA_DATA_ERROR, LZMA_FORMAT_ERROR, LZMA_OPTIONS_ERROR or LZMA_BUF_ERROR;
# throws `OutOfMemoryError` on LZMA_MEM_ERROR; any other return code raises a
# generic error. Extra keyword arguments are accepted and ignored.
function try_resize_decode!(d::XZDecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}, max_size::Int64; kwargs...)::MaybeSize
# `*_size` is the total buffer length and `*_left` the part not yet
# consumed/produced, so the current offset is always `size - left`.
dst_size::Int64 = length(dst)
src_size::Int64 = length(src)
src_left::Int64 = src_size
dst_left::Int64 = dst_size
check_contiguous(dst)
check_contiguous(src)
# Empty input is rejected up front, before any liblzma state is created,
# and reported with the same code used for truncated streams.
if isempty(src)
throw(LZMADecodingError(LZMA_BUF_ERROR))
end
# `src` is never resized in this function, so it is converted only once.
cconv_src = Base.cconvert(Ptr{UInt8}, src)
# We start by allocating our allocator
# NOTE(review): `default_allocator` and `lzma_allocator` are defined outside this
# view (presumably liblzma.jl) — confirm what the allocator does.
cconv_allocator = Base.cconvert(Ref{lzma_allocator}, default_allocator())
# The stream stores a pointer to the allocator, so the allocator must stay
# rooted for as long as the stream is in use.
GC.@preserve cconv_allocator begin
allocator_p = Base.unsafe_convert(Ref{lzma_allocator}, cconv_allocator)
stream = lzma_stream()
stream.allocator = allocator_p
# Per the liblzma API the second argument is the memory usage limit; passing
# typemax(UInt64) asks for the largest possible limit. LZMA_CONCATENATED is the
# only flag passed, matching the documented support for concatenated streams.
ret = @ccall liblzma.lzma_stream_decoder(
stream::Ref{lzma_stream},
typemax(UInt64)::UInt64,
LZMA_CONCATENATED::UInt32,
)::Cint
# These throws happen before the `try` below, so `lzma_end` is not called
# when decoder initialization fails.
if ret == LZMA_MEM_ERROR
throw(OutOfMemoryError())
elseif ret != LZMA_OK
error("Unknown lzma error code: $(ret)")
end
# From here on every exit path (return or throw) goes through the `finally`
# block, which calls `lzma_end` on the stream.
try
while true # Loop for resizing dst
# dst may get resized, so cconvert needs to be redone on each iteration.
cconv_dst = Base.cconvert(Ptr{UInt8}, dst)
GC.@preserve cconv_src cconv_dst begin
src_p = Base.unsafe_convert(Ptr{UInt8}, cconv_src)
dst_p = Base.unsafe_convert(Ptr{UInt8}, cconv_dst)
# Point the stream at the unread tail of src and the unwritten tail of dst.
stream.avail_in = src_left
stream.avail_out = dst_left
stream.next_in = src_p + (src_size - src_left)
stream.next_out = dst_p + (dst_size - dst_left)
# LZMA_FINISH: all of the input is already available in `src`.
ret = @ccall liblzma.lzma_code(
stream::Ref{lzma_stream},
LZMA_FINISH::Cint,
)::Cint
# Only on the two non-error codes are the counters copied back from the stream;
# the asserts check that liblzma never reports more remaining than it was given.
if ret == LZMA_OK || ret == LZMA_STREAM_END
@assert stream.avail_in ≤ src_left
@assert stream.avail_out ≤ dst_left
src_left = stream.avail_in
dst_left = stream.avail_out
@assert src_left ∈ 0:src_size
@assert dst_left ∈ 0:dst_size
end
if ret == LZMA_OK
# Likely not enough output space
# but also potentially the input is truncated
# Unlike zlib, we can keep trying until we get LZMA_BUF_ERROR
# If output space remains, nothing is changed here and the loop simply
# calls lzma_code again.
if iszero(dst_left)
# Give more space and try again
# This might result in returning a NOT_SIZE
# when instead the actual issue is that the input is truncated.
local next_size = grow_dst!(dst, max_size)
if isnothing(next_size)
return NOT_SIZE
end
# Account for the newly added tail of dst.
dst_left += next_size - dst_size
dst_size = next_size
@assert dst_left > 0
end
elseif ret == LZMA_STREAM_END
@assert iszero(src_left)
# yay done return decompressed size
real_dst_size = dst_size - dst_left
@assert real_dst_size ∈ 0:length(dst)
return real_dst_size
elseif ret == LZMA_DATA_ERROR || ret == LZMA_FORMAT_ERROR || ret == LZMA_OPTIONS_ERROR || ret == LZMA_BUF_ERROR
# Codes that indicate bad or truncated input are surfaced as decoding errors.
throw(LZMADecodingError(ret))
elseif ret == LZMA_MEM_ERROR
throw(OutOfMemoryError())
else
error("Unknown lzma error code: $(ret)")
end
end
end
finally
# Release the decoder state held by liblzma for this stream.
@ccall liblzma.lzma_end(stream::Ref{lzma_stream})::Cvoid
end
end
end
Loading
Loading