Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,21 @@

> ⚠️ SmarterJSON **always returns an `Array`** of documents.
>
> `SmarterJSON.process` / `SmarterJSON.process_file` return:
>
> `SmarterJSON.process` / `SmarterJSON.process_file`
> both return:
> - `[]` for no doc
> - `[doc]` for one doc
> - `[d1, d2, …]` for several docs (NDJSON / JSONL / concatenated docs)

> ⚠️ We discourage the use of `process(input).first` / `process(input)[0]` because it silently drops potential additional documents.
> Please use `process_one` if you are expecting only one JSON doc, e.g. in API payloads.

## 1.1.1 (2026-06-11)

RSpec tests: 1,070 → 1,097

- The C extension now emits the same `on_warning` warnings as the pure-Ruby parser. `empty_value` and `duplicate_key` warnings name the offending key (and `duplicate_key` names the resolution strategy), and the warning text, line, and column are now identical whether or not the C extension is loaded.

## 1.1.0 (2026-06-09)

RSpec tests: 1,038 → 1,070
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ A lenient, fast JSON processor for Ruby. It extracts strict JSON, NDJSON, JSONL,

## Why SmarterJSON?

> 📖 **The thinking behind it:** [*Strict by Accident: Your JSON parser isn't broken, it's answering the wrong question*](https://dev.to/tilo_sloboda/strict-by-accident-your-json-parser-isnt-broken-its-answering-the-wrong-question-54f0) — why a data pipeline wants a lenient, recovery-first parser rather than a spec-policing one.

**Are you tired of seeing errors like these?**

```
Expand Down
91 changes: 69 additions & 22 deletions ext/smarter_json/smarter_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ typedef struct {
* an error) by scanning from the start of the buffer. CR, LF, and CRLF each
* count as one newline; col is bytes since the last line start (1-based).
* Keeping this off the hot path is the point — fj_advance never touches it. */
static void fj_line_col(fj_state *st, long *line, long *col) {
static void fj_line_col_to(fj_state *st, long stop, long *line, long *col) {
long l = 1, c = 1, i;
long limit = (st->pos < st->len) ? st->pos : st->len;
long limit = (stop < st->len) ? stop : st->len;
for (i = 0; i < limit; i++) {
unsigned char b = (unsigned char)st->buf[i];
if (b == 0x0A) { l++; c = 1; }
Expand All @@ -93,6 +93,9 @@ static void fj_line_col(fj_state *st, long *line, long *col) {
*line = l;
*col = c;
}
/* Line/column of the parser cursor: resolves st->pos through fj_line_col_to and
 * stores the result in *line / *col. */
static void fj_line_col(fj_state *st, long *line, long *col) {
  const long cursor = st->pos;
  fj_line_col_to(st, cursor, line, col);
}

/* Report a non-fatal lenient fix to the on_warning callable — a no-op (and builds no
* Warning) when no handler was given. The internal Qnil guard is the safety net; the
Expand All @@ -106,6 +109,25 @@ static void fj_warn(fj_state *st, VALUE type_sym, const char *msg) {
rb_utf8_str_new_cstr(msg), LONG2NUM(line), LONG2NUM(col)));
}

/* Variant of fj_warn whose message is an already-built Ruby String (e.g. from
 * rb_enc_sprintf, when the text interpolates the offending key). A no-op when no
 * on_warning handler is set; line/col are those of the cursor (st->pos). */
static void fj_warn_str(fj_state *st, VALUE type_sym, VALUE msg) {
  long line, col;
  VALUE warning;

  if (st->on_warning == Qnil) return; /* no handler: skip the line/col scan entirely */

  fj_line_col(st, &line, &col);
  warning = rb_funcall(cWarning, fj_new_id, 4, type_sym, msg, LONG2NUM(line), LONG2NUM(col));
  rb_funcall(st->on_warning, fj_call_id, 1, warning);
}

/* Variant of fj_warn_str that reports a caller-supplied (line, col) instead of the
 * cursor position -- for a warning detected away from the cursor. Duplicate keys are
 * only detected while the closed object is being built, yet they are reported at the
 * duplicate member's value position so line/col match the pure-Ruby parser.
 * A no-op when no on_warning handler is set. */
static void fj_warn_str_at(fj_state *st, VALUE type_sym, VALUE msg, long line, long col) {
  VALUE warning;

  if (st->on_warning == Qnil) return;

  warning = rb_funcall(cWarning, fj_new_id, 4, type_sym, msg, LONG2NUM(line), LONG2NUM(col));
  rb_funcall(st->on_warning, fj_call_id, 1, warning);
}

/* 1-based column of the current byte position (bytes since the last line start).
* Used for triple-quoted indentation stripping (smarter_json.md §2.3). */
static long fj_column(fj_state *st) {
Expand Down Expand Up @@ -1290,7 +1312,7 @@ void rb_hash_bulk_insert(long, const VALUE *, VALUE);
/* Build a Hash from `count` interleaved key,value slots. Fast path (String keys,
* default :last_wins): pre-size + bulk insert. symbolize_keys / :first_wins use a
* per-member loop into the same pre-sized hash. */
static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
static VALUE fj_build_object(fj_state *st, const VALUE *pairs, const long *positions, long count) {
long entries = count / 2, i;
VALUE hash = rb_hash_new_capa(entries);

Expand All @@ -1305,7 +1327,16 @@ static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
VALUE k = st->symbolize_keys ? rb_funcall(pairs[i], fj_to_sym_id, 0) : pairs[i];
if (st->dup_first_wins || st->on_warning != Qnil) {
if (RTEST(rb_funcall(hash, fj_key_p_id, 1, k))) {
fj_warn(st, fj_sym_duplicate_key, "duplicate key");
if (st->on_warning != Qnil) {
long wl, wc;
fj_line_col_to(st, positions[i + 1], &wl, &wc); /* the duplicate member's value position — matches the Ruby parser */
fj_warn_str_at(st, fj_sym_duplicate_key,
rb_enc_sprintf(rb_utf8_encoding(),
"duplicate key %"PRIsVALUE" \xe2\x80\x94 %s",
rb_inspect(k),
st->dup_first_wins ? "first_wins" : "last_wins"),
wl, wc);
}
if (st->dup_first_wins) continue;
}
}
Expand All @@ -1323,6 +1354,7 @@ typedef struct { long mark; int is_obj; } fj_frame;

/* Parser stack: the pending values plus the frames of the containers still open. */
typedef struct {
  VALUE *vptr;    /* pending values (GC-marked) */
  long vhead;     /* next free slot in vptr */
  long vcapa;     /* allocated slots in vptr (and in pptr, when present) */
  long *pptr;     /* byte offset just past each pushed value (mirrors vptr/vcapa);
                   * used only to attribute a duplicate-key warning to the duplicate
                   * member's position */
  fj_frame *fptr; /* open-container frames (no VALUEs) */
  long fhead;     /* number of frames currently in use */
  long fcapa;     /* allocated slots in fptr */
} fj_pstack;

Expand All @@ -1334,21 +1366,31 @@ static void fj_pstack_mark(void *p) {
static void fj_pstack_free(void *p) {
fj_pstack *ps = (fj_pstack *)p;
if (ps->vptr != NULL) xfree(ps->vptr);
if (ps->pptr != NULL) xfree(ps->pptr);
if (ps->fptr != NULL) xfree(ps->fptr);
xfree(ps);
}
/* Memory-size callback for the parse stack: the struct itself plus its value,
 * position and frame buffers. pptr is optional, so it is counted only when it was
 * allocated; it shares its capacity (vcapa) with vptr.
 * (A stale pre-pptr return statement used to precede this one and made the
 * pptr-aware computation unreachable.) */
static size_t fj_pstack_memsize(const void *p) {
  const fj_pstack *ps = (const fj_pstack *)p;
  return sizeof(fj_pstack) + (size_t)ps->vcapa * sizeof(VALUE)
       + (ps->pptr ? (size_t)ps->vcapa * sizeof(long) : 0)
       + (size_t)ps->fcapa * sizeof(fj_frame);
}
static const rb_data_type_t fj_pstack_type = {
"smarter_json/pstack",
{ fj_pstack_mark, fj_pstack_free, fj_pstack_memsize, },
0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
};

static inline void fj_vpush(fj_pstack *ps, VALUE v) {
if (ps->vhead >= ps->vcapa) { ps->vcapa *= 2; REALLOC_N(ps->vptr, VALUE, ps->vcapa); }
static inline void fj_vpush(fj_state *st, fj_pstack *ps, VALUE v) {
if (ps->vhead >= ps->vcapa) {
ps->vcapa *= 2;
REALLOC_N(ps->vptr, VALUE, ps->vcapa);
if (ps->pptr) REALLOC_N(ps->pptr, long, ps->vcapa);
}
/* pptr is allocated only when on_warning is set; the fast path (no handler) does no
* extra store. The offset is just past this value — the duplicate-key warning position. */
if (ps->pptr) ps->pptr[ps->vhead] = st->pos;
ps->vptr[ps->vhead++] = v;
}
static inline void fj_fpush(fj_pstack *ps, long mark, int is_obj) {
Expand All @@ -1368,6 +1410,7 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
int vss = 0; /* warnings: has a value landed in the current container since the last separator? */

ps->vptr = ALLOC_N(VALUE, 64); ps->vhead = 0; ps->vcapa = 64;
ps->pptr = (st->on_warning != Qnil) ? ALLOC_N(long, 64) : NULL; /* only the warning path needs per-value positions */
ps->fptr = ALLOC_N(fj_frame, 16); ps->fhead = 0; ps->fcapa = 16;

if (implicit_root) fj_fpush(ps, 0, 1);
Expand Down Expand Up @@ -1398,25 +1441,25 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
b = fj_byte(st);
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
if (b == ',') { /* collapsing separator: skip empty member */
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma \xe2\x80\x94 collapsed an empty slot");
vss = 0;
fj_advance(st, 1);
continue;
}
if (b == '}') {
VALUE hash;
fj_advance(st, 1);
hash = fj_build_object(st, &ps->vptr[mark], ps->vhead - mark);
hash = fj_build_object(st, &ps->vptr[mark], ps->pptr ? &ps->pptr[mark] : NULL, ps->vhead - mark);
ps->vhead = mark;
ps->fhead--;
if (ps->fhead == 0) { result = hash; break; }
fj_vpush(ps, hash);
fj_vpush(st, ps, hash);
vss = 1;
continue;
}
if (b == -1) {
if (implicit_root && ps->fhead == 1) {
result = fj_build_object(st, &ps->vptr[mark], ps->vhead - mark);
result = fj_build_object(st, &ps->vptr[mark], ps->pptr ? &ps->pptr[mark] : NULL, ps->vhead - mark);
break;
}
fj_error(st, "unterminated object");
Expand All @@ -1430,28 +1473,32 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
b = fj_byte(st);
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
if (b == '{' || b == '[') {
fj_vpush(ps, key);
fj_vpush(st, ps, key);
fj_advance(st, 1);
fj_fpush(ps, ps->vhead, (b == '{'));
vss = 0;
continue;
}
if (b == '}' || b == ',') { /* key with a colon but no value -> null */
fj_vpush(ps, key);
fj_vpush(ps, Qnil);
fj_warn(st, fj_sym_empty_value, "empty value, used null");
fj_vpush(st, ps, key);
fj_vpush(st, ps, Qnil);
if (st->on_warning != Qnil)
fj_warn_str(st, fj_sym_empty_value,
rb_enc_sprintf(rb_utf8_encoding(),
"key %"PRIsVALUE" had no value \xe2\x80\x94 used null",
rb_inspect(key)));
vss = 1;
continue;
}
if (b == -1) fj_error(st, "unexpected end of input");
fj_vpush(ps, key);
fj_vpush(ps, fj_parse_member_value(st));
fj_vpush(st, ps, key);
fj_vpush(st, ps, fj_parse_member_value(st));
vss = 1;
} else { /* array */
b = fj_byte(st);
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
if (b == ',') { /* collapsing separator: skip empty slot */
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma \xe2\x80\x94 collapsed an empty slot");
vss = 0;
fj_advance(st, 1);
continue;
Expand All @@ -1463,7 +1510,7 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
ps->vhead = mark;
ps->fhead--;
if (ps->fhead == 0) { result = ary; break; }
fj_vpush(ps, ary);
fj_vpush(st, ps, ary);
vss = 1;
continue;
}
Expand All @@ -1481,10 +1528,10 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
smart-quote, literals) falls through to the full dispatch below. */
if (b == '-' || b == '+' || b == '.' || (b >= '0' && b <= '9')) {
VALUE num;
if (fj_try_member_number(st, &num)) { fj_vpush(ps, num); vss = 1; continue; }
if (fj_try_member_number(st, &num)) { fj_vpush(st, ps, num); vss = 1; continue; }
}
if (b == '"') { fj_vpush(ps, fj_parse_string(st, '"')); vss = 1; continue; }
fj_vpush(ps, fj_parse_member_value(st));
if (b == '"') { fj_vpush(st, ps, fj_parse_string(st, '"')); vss = 1; continue; }
fj_vpush(st, ps, fj_parse_member_value(st));
vss = 1;
}
}
Expand Down
2 changes: 1 addition & 1 deletion lib/smarter_json/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module SmarterJSON
  # Gem version. Assigned exactly once: a second assignment to the same constant
  # makes Ruby emit an "already initialized constant" warning.
  VERSION = "1.1.1"
end
98 changes: 98 additions & 0 deletions spec/warning_parity_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# frozen_string_literal: true

require "smarter_json"

# Every lenient fix must report the SAME SmarterJSON::Warning (type, message, line,
# col) on the C extension and on the pure-Ruby parser. Most warnings already do,
# because they are emitted from shared Ruby code (the Recovery layer and the number
# path); empty_slot / empty_value / duplicate_key have SEPARATE emission sites in the
# C extension and in the Ruby parser, which is where message/column drift crept in.
#
# The pure-Ruby parser is the contract: it names the offending key, names the
# duplicate-key resolution strategy, and reports the line/col of the duplicate
# member's value. The C extension is brought up to match. (\u2014 is an em dash; the
# save hook keeps the spec ASCII, so the expected strings use the escape, which equals
# the parser's real em dash at run time.)
RSpec.describe SmarterJSON, "warning parity (C and Ruby emit identical warnings)" do
  # Runs `input` through SmarterJSON.process and returns the first warning handed to
  # on_warning, or nil when no warning was reported.
  def first_warning(input, **opts)
    seen = nil
    SmarterJSON.process(input, on_warning: ->(w) { seen ||= w }, **opts)
    seen
  end

  [true, false].each do |acceleration|
    context "acceleration: #{acceleration}" do
      it "empty_slot \u2014 names nothing, em dash, col at the extra comma" do
        w = first_warning("[1,,2]", acceleration: acceleration)
        expect([w.type, w.message, w.line, w.col])
          .to eq([:empty_slot, "extra comma \u2014 collapsed an empty slot", 1, 4])
      end

      it "empty_value \u2014 names the key" do
        w = first_warning('{"x":}', acceleration: acceleration)
        expect([w.type, w.message, w.line, w.col])
          .to eq([:empty_value, "key \"x\" had no value \u2014 used null", 1, 6])
      end

      it "duplicate_key \u2014 names the key and strategy (last_wins), col at the duplicate value" do
        w = first_warning('{"a":1,"a":2}', acceleration: acceleration)
        expect([w.type, w.message, w.line, w.col])
          .to eq([:duplicate_key, "duplicate key \"a\" \u2014 last_wins", 1, 13])
      end

      it "duplicate_key \u2014 names the first_wins strategy" do
        w = first_warning('{"a":1,"a":2}', duplicate_key: :first_wins, acceleration: acceleration)
        expect(w.message).to eq("duplicate key \"a\" \u2014 first_wins")
      end

      it "duplicate_key \u2014 inspects a symbolized key" do
        w = first_warning('{"a":1,"a":2}', symbolize_keys: true, acceleration: acceleration)
        expect(w.message).to eq("duplicate key :a \u2014 last_wins")
      end

      # NOTE: in the three multi-line inputs below the whitespace inside the literal is
      # load-bearing -- the expected columns count the leading indentation (and, in
      # the last example, the trailing spaces) byte for byte.
      it "duplicate_key \u2014 multiline: line/col track the duplicate member's value, not the brace" do
        # line 3 is two spaces + `"a": 2` = 8 bytes, so just past the value is col 9
        w = first_warning(%({\n  "a": 1,\n  "a": 2\n}), acceleration: acceleration)
        expect([w.type, w.message, w.line, w.col])
          .to eq([:duplicate_key, "duplicate key \"a\" \u2014 last_wins", 3, 9])
      end

      it "duplicate_key \u2014 nested object, col tracks the inner duplicate" do
        # line 4 is four spaces + `"k": 2` = 10 bytes, so just past the value is col 11
        w = first_warning(%({\n  "outer": {\n    "k": 1,\n    "k": 2\n  }\n}), acceleration: acceleration)
        expect([w.line, w.col]).to eq([4, 11])
      end

      it "duplicate_key \u2014 trailing whitespace after the value is included, matching Ruby" do
        # line 3 is two spaces + `"a": 2` + two trailing spaces = 10 bytes -> col 11
        w = first_warning(%({\n  "a": 1,\n  "a": 2  \n}), acceleration: acceleration)
        expect([w.line, w.col]).to eq([3, 11])
      end
    end
  end

  # Belt-and-suspenders: for every warning type, the C path and the Ruby path must
  # produce an identical (type, message, line, col) tuple. This is the coverage that
  # was missing: it would have caught the message/column drift immediately, and it
  # locks all paths going forward.
  describe "every warning type is identical on the C and Ruby paths" do
    {
      "empty_slot (array)"        => [:process, "[1,,2]"],
      "empty_slot (object)"       => [:process, '{"a":1,,"b":2}'],
      "empty_value"               => [:process, '{"x":}'],
      "duplicate_key"             => [:process, '{"a":1,"a":2}'],
      "duplicate_key (multiline)" => [:process, "{\n  \"a\": 1,\n  \"a\": 2\n}"],
      "number_overflow"           => [:process, "1e400"],
      "code_fence_stripped"       => [:process, "```json\n{\"a\":1}\n```"],
      "prefix_text_ignored"       => [:process, 'Here is the json: {"a":1}'],
      "suffix_text_ignored"       => [:process, '{"a":1} thanks!'],
      "wrapper_tag_stripped"      => [:process, "<json>{\"a\":1}</json>"],
      "extra_documents"           => [:process_one, "{\"a\":1}\n{\"b\":2}"],
    }.each do |label, (meth, input)|
      it "#{label}: same (type, message, line, col) with and without the C extension" do
        # Captures the first warning only, with acceleration switched on or off.
        tuple = lambda do |accel|
          w = nil
          SmarterJSON.public_send(meth, input, acceleration: accel, on_warning: ->(x) { w ||= x })
          [w.type, w.message, w.line, w.col]
        end
        expect(tuple.call(true)).to eq(tuple.call(false))
      end
    end
  end
end