diff --git a/.gitattributes b/.gitattributes index 35216a569d..4e97c7aa01 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,6 +7,6 @@ vendor/** linguist-vendored src/lexer.[ch] linguist-generated=true src/parser.[ch] linguist-generated=true tests/man.test linguist-generated=true -tests/manonig.test linguist-generated=true +tests/manpcre2.test linguist-generated=true jq.1.prebuilt linguist-generated=true docs/Pipfile.lock linguist-generated=true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e5600b326..46bac7ad0f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,18 +77,18 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | sudo apt-get update sudo apt-get install -y automake autoconf libtool crossbuild-essential-${{ matrix.arch }} - name: Build run: | - autoreconf -i + autoreconf --verbose -i ./configure \ --host=${{ matrix.CC }} \ --disable-docs \ - --with-oniguruma=builtin \ + --with-pcre2=builtin \ --enable-static \ --enable-all-static \ CFLAGS="-O2 -pthread -fstack-protector-all" @@ -138,7 +138,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | # brew update sometimes fails with "Fetching /usr/local/Homebrew/Library/Taps/homebrew/homebrew-cask failed!" 
@@ -149,11 +149,11 @@ jobs: echo "CC=clang -target ${{ matrix.target }}$(uname -r)" >> "$GITHUB_ENV" - name: Build run: | - autoreconf -i + autoreconf --no-recursive -i ./configure \ --host="${{ matrix.target }}$(uname -r)" \ --disable-docs \ - --with-oniguruma=builtin \ + --with-pcre2=builtin \ --disable-shared \ --enable-static \ --enable-all-static \ @@ -215,7 +215,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - uses: msys2/setup-msys2@v2 with: update: true @@ -230,10 +230,10 @@ jobs: toolchain:p - name: Build run: | - autoreconf -i + autoreconf --no-recursive -i ./configure \ --disable-docs \ - --with-oniguruma=builtin \ + --with-pcre2=builtin \ --disable-shared \ --enable-static \ --enable-all-static \ @@ -243,7 +243,9 @@ jobs: cp ./jq.exe jq-${{ env.SUFFIX }}.exe - name: Test run: | - make check VERBOSE=yes + # TODO: pcre2 tests do not work in our msys2 env + # so use check-am that does not do recursive check + make check-am VERBOSE=yes git diff --exit-code - name: Upload Test Logs if: ${{ failure() }} @@ -268,7 +270,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | sudo apt-get update -qq @@ -278,7 +280,7 @@ jobs: autoreconf -i ./configure \ --disable-docs \ - --with-oniguruma=builtin + --with-pcre2=builtin make distcheck make dist dist-zip git diff --exit-code diff --git a/.github/workflows/decnum.yml b/.github/workflows/decnum.yml index 81c7d91e70..b307891e26 100644 --- a/.github/workflows/decnum.yml +++ b/.github/workflows/decnum.yml @@ -12,7 +12,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | sudo apt-get update diff
--git a/.github/workflows/manpage.yml b/.github/workflows/manpage.yml index 9c77b7ec1f..88f7a8babd 100644 --- a/.github/workflows/manpage.yml +++ b/.github/workflows/manpage.yml @@ -1,18 +1,18 @@ -name: Building man page, man.test, manonig.test +name: Building man page, man.test, manpcre2.test on: push: paths: - '.github/workflows/manpage.yml' - 'docs/**' - 'tests/man.test' - - 'tests/manonig.test' + - 'tests/manpcre2.test' - 'jq.1.prebuilt' pull_request: paths: - '.github/workflows/manpage.yml' - 'docs/**' - 'tests/man.test' - - 'tests/manonig.test' + - 'tests/manpcre2.test' - 'jq.1.prebuilt' jobs: @@ -38,14 +38,14 @@ jobs: run: | autoreconf -i ./configure --enable-docs - - name: Build man page, man.test, manonig.test + - name: Build man page, man.test, manpcre2.test run: | mv jq.1.prebuilt jq.1.old - rm -f tests/man.test manonig.test - make jq.1.prebuilt tests/man.test tests/manonig.test - - name: Make sure that jq.1.prebuilt, man.test, manonig.test are up to date + rm -f tests/man.test tests/manpcre2.test + make jq.1.prebuilt tests/man.test tests/manpcre2.test + - name: Make sure that jq.1.prebuilt, man.test, manpcre2.test are up to date run: | - git diff --exit-code tests/man.test tests/manonig.test + git diff --exit-code tests/man.test tests/manpcre2.test # skip build date in jq.1.prebuilt test -s jq.1.prebuilt diff -- <(tail -n +3 jq.1.old) <(tail -n +3 jq.1.prebuilt) diff --git a/.github/workflows/oniguruma.yml b/.github/workflows/pcre2.yml similarity index 84% rename from .github/workflows/oniguruma.yml rename to .github/workflows/pcre2.yml index ea36ddf27e..aa0a33383e 100644 --- a/.github/workflows/oniguruma.yml +++ b/.github/workflows/pcre2.yml @@ -1,12 +1,12 @@ -name: oniguruma +name: pcre2 on: push: branches: - master pull_request: -# Since builtin oniguruma is tested in the CI workflow, -# we test other options for --with-oniguruma here. +# Since builtin pcre2 is tested in the CI workflow, +# we test other options for --with-pcre2 here.
jobs: installed: runs-on: ubuntu-24.04 @@ -16,14 +16,14 @@ jobs: - name: Install packages run: | sudo apt-get update - sudo apt-get install -y automake autoconf libtool valgrind libonig-dev + sudo apt-get install -y automake autoconf libtool valgrind libpcre2-dev libpcre2-8-0 - name: Build run: | autoreconf -i ./configure \ --disable-docs \ --enable-valgrind \ - --with-oniguruma=yes + --with-pcre2=yes make -j"$(nproc)" file ./jq - name: Test @@ -35,7 +35,7 @@ jobs: if: ${{ failure() }} uses: actions/upload-artifact@v7 with: - name: test-logs-oniguruma-installed + name: test-logs-pcre-installed retention-days: 7 path: | test-suite.log @@ -56,7 +56,7 @@ jobs: ./configure \ --disable-docs \ --enable-valgrind \ - --with-oniguruma=no + --with-pcre2=no make -j"$(nproc)" file ./jq - name: Test @@ -68,7 +68,7 @@ jobs: if: ${{ failure() }} uses: actions/upload-artifact@v7 with: - name: test-logs-oniguruma-disabled + name: test-logs-pcre-disabled retention-days: 7 path: | test-suite.log diff --git a/.github/workflows/scanbuild.yml b/.github/workflows/scanbuild.yml index 21800a1500..a01493cd76 100644 --- a/.github/workflows/scanbuild.yml +++ b/.github/workflows/scanbuild.yml @@ -12,7 +12,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | sudo apt-get update -qq diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index c698a8e59f..8dab2a58a0 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -12,7 +12,7 @@ jobs: - name: Clone repository uses: actions/checkout@v6 with: - submodules: true + submodules: recursive - name: Install packages run: | sudo apt-get update @@ -23,7 +23,7 @@ jobs: ./configure \ --disable-docs \ --enable-valgrind \ - --with-oniguruma=builtin + --with-pcre2=builtin make -j"$(nproc)" file ./jq - name: Test diff --git a/.gitmodules b/.gitmodules index 193b803046..1eaf1a0d20 100644 --- a/.gitmodules +++ 
b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "vendor/oniguruma"] - path = vendor/oniguruma - url = https://github.com/kkos/oniguruma.git +[submodule "vendor/pcre2"] + path = vendor/pcre2 + url = https://github.com/PCRE2Project/pcre2.git diff --git a/Dockerfile b/Dockerfile index 17950b40e6..12fcf1373d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,7 @@ COPY . /app RUN autoreconf -i \ && ./configure \ --disable-docs \ - --with-oniguruma=builtin \ + --with-pcre2=builtin \ --enable-static \ --enable-all-static \ --prefix=/usr/local \ diff --git a/Makefile.am b/Makefile.am index 96d603817a..a70d396ae8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -56,7 +56,7 @@ AM_YFLAGS = --warnings=all -Wno-yacc -d lib_LTLIBRARIES = libjq.la libjq_la_SOURCES = ${LIBJQ_SRC} libjq_la_LIBADD = -lm -libjq_la_LDFLAGS = $(onig_LDFLAGS) -export-symbols-regex '^j[qv]_' -version-info 1:4:0 +libjq_la_LDFLAGS = $(pcre2_LDFLAGS) -export-symbols-regex '^j[qv]_' -version-info 1:4:0 if WIN32 libjq_la_LIBADD += -lshlwapi @@ -158,7 +158,7 @@ check_DATA = tests/man.test # Making changes to the manpage without having the python deps means your # tests won't run. If you aren't making changes to the examples, you probably # don't care. But if you are, then you need to run the tests anyway.
-tests/man.test tests/manonig.test: $(srcdir)/docs/content/manual/dev/manual.yml +tests/man.test tests/manpcre2.test: $(srcdir)/docs/content/manual/dev/manual.yml if ENABLE_DOCS $(AM_V_GEN) ( cd ${abs_srcdir}/docs && \ $(PIPENV) run python validate_manual_schema.py content/manual/dev/manual.yml && \ @@ -189,18 +189,18 @@ jq.1: jq.1.prebuilt CLEANFILES += jq.1 -### Build oniguruma +### Build PCRE2 -if BUILD_ONIGURUMA -libjq_la_LIBADD += vendor/oniguruma/src/.libs/libonig.la -jq_LDADD += vendor/oniguruma/src/.libs/libonig.la -SUBDIRS = vendor/oniguruma +if BUILD_PCRE2 +libjq_la_LIBADD += vendor/pcre2/.libs/libpcre2-8.la +jq_LDADD += vendor/pcre2/.libs/libpcre2-8.la +SUBDIRS = vendor/pcre2 endif -AM_CFLAGS += $(onig_CFLAGS) +AM_CFLAGS += $(pcre2_CFLAGS) -if WITH_ONIGURUMA -TESTS += tests/onigtest tests/manonigtest +if WITH_PCRE2 +TESTS += tests/pcre2test tests/manpcre2test endif ### Packaging @@ -227,14 +227,14 @@ EXTRA_DIST = $(DOC_FILES) $(man_MANS) $(TESTS) $(TEST_LOG_COMPILER) \ tests/modules/test_bind_order0.jq \ tests/modules/test_bind_order1.jq \ tests/modules/test_bind_order2.jq \ - tests/onig.supp tests/local.supp \ + tests/pcre2.supp tests/local.supp \ tests/setup tests/torture/input0.json \ - tests/optional.test tests/man.test tests/manonig.test \ - tests/jq.test tests/onig.test tests/base64.test tests/uri.test \ + tests/optional.test tests/man.test tests/manpcre2.test \ + tests/jq.test tests/pcre2.test tests/base64.test tests/uri.test \ tests/jq-f-test.sh \ tests/no-main-program.jq tests/yes-main-program.jq -AM_DISTCHECK_CONFIGURE_FLAGS=--with-oniguruma=builtin +AM_DISTCHECK_CONFIGURE_FLAGS=--with-pcre2=builtin # README.md is expected in GitHub projects, good stuff in it, so we'll # distribute it and install it with the package in the doc directory.
diff --git a/README.md b/README.md index 9ef09cc4f2..e6064f62af 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,10 @@ docker run --rm -i -v "$PWD:$PWD" -w "$PWD" ghcr.io/jqlang/jq:latest '.version' #### Instructions ```console -git submodule update --init # if building from git to get oniguruma -autoreconf -i # if building from git -./configure --with-oniguruma=builtin -make clean # if upgrading from a version previously built from source +git submodule update --init --recursive # if building from git to get pcre2 +autoreconf -i # if building from git +./configure --with-pcre2=builtin # build with builtin pcre2 +make clean # if upgrading from a version previously built from source make -j8 make check sudo make install diff --git a/configure.ac b/configure.ac index bc1d27317a..7eb14e7dcb 100644 --- a/configure.ac +++ b/configure.ac @@ -236,54 +236,62 @@ AC_C_BIGENDIAN( AC_MSG_ERROR(universal endianness not supported) ) -dnl Oniguruma -AC_ARG_WITH([oniguruma], - [AS_HELP_STRING([--with-oniguruma=prefix], - [try this for a non-standard install prefix of the oniguruma library])], , - [with_oniguruma=yes]) - -onig_CFLAGS= -onig_LDFLAGS= -build_oniguruma=no -AS_IF([test "x$with_oniguruma" != xno], [ +dnl PCRE2 +AC_ARG_WITH([pcre2], + [AS_HELP_STRING([--with-pcre2=prefix], + [try this for a non-standard install prefix of the pcre2 library])], , + [with_pcre2=yes]) + +pcre2_CFLAGS= +pcre2_LDFLAGS= +build_pcre2=no +AS_IF([test "x$with_pcre2" != xno], [ save_CFLAGS="$CFLAGS" save_LDFLAGS="$LDFLAGS" - AS_IF([test "x$with_oniguruma" != xyes], [ - AS_IF([test "x$with_oniguruma" = xbuiltin], [ - build_oniguruma=yes + AS_IF([test "x$with_pcre2" != xyes], [ + AS_IF([test "x$with_pcre2" = xbuiltin], [ + build_pcre2=yes ], [ - onig_CFLAGS="-I${with_oniguruma}/include" - onig_LDFLAGS="-L${with_oniguruma}/lib" + pcre2_CFLAGS="-I${with_pcre2}/include" + pcre2_LDFLAGS="-L${with_pcre2}/lib" ]) + ], [ + AC_CHECK_PROGS(PCRE2_CONFIG, pcre2-config) + AS_IF([test 
"x$ac_cv_prog_PCRE2_CONFIG" = x], [ + AC_MSG_ERROR([could not find pcre2-config]) + ]) + # header file require DPCRE2_CODE_UNIT_WIDTH to be set + pcre2_CFLAGS="`pcre2-config --cflags` -DPCRE2_CODE_UNIT_WIDTH=8" + pcre2_LDFLAGS="`pcre2-config --libs8`" ]) - AS_IF([test "x$build_oniguruma" = xno], [ - # check for ONIGURUMA library, either in /usr or where requested - CFLAGS="$CFLAGS $onig_CFLAGS" - LDFLAGS="$LDFLAGS $onig_LDFLAGS" - AC_CHECK_HEADER("oniguruma.h", - AC_CHECK_LIB([onig],[onig_version])) + AS_IF([test "x$build_pcre2" = xno], [ + # check for PCRE2 library, either in /usr or where requested + CFLAGS="$CFLAGS $pcre2_CFLAGS" + LDFLAGS="$LDFLAGS $pcre2_LDFLAGS" + AC_CHECK_HEADER("pcre2.h", + # symbols in libs8 has _8 suffix + AC_CHECK_LIB([pcre2-8],[pcre2_compile_8])) # handle check results - AS_IF([test "x$ac_cv_lib_onig_onig_version" != "xyes"], [ - build_oniguruma=yes - AC_MSG_NOTICE([Oniguruma was not found. Will use the packaged oniguruma.]) + AS_IF([test "x$ac_cv_lib_pcre2_8_pcre2_compile_8" != "xyes"], [ + build_pcre2=yes + AC_MSG_NOTICE([PCRE2 was not found. 
Will use the packaged pcre2.]) ]) ]) - AS_IF([test "x$build_oniguruma" = xyes && test -f "${srcdir}/vendor/oniguruma/configure.ac" ], [ - onig_CFLAGS="-I${srcdir}/vendor/oniguruma/src" - onig_LDFLAGS="-L${srcdir}/vendor/oniguruma/src -Wl,-rpath,${libdir}" - AC_CONFIG_SUBDIRS([vendor/oniguruma]) - AC_DEFINE([HAVE_LIBONIG],1,[Define to 1 if the system includes libonig]) + AS_IF([test "x$build_pcre2" = xyes && test -f "${srcdir}/vendor/pcre2/configure.ac" ], [ + pcre2_CFLAGS="-I${srcdir}/vendor/pcre2/src -DPCRE2_CODE_UNIT_WIDTH=8" + pcre2_LDFLAGS="-L${srcdir}/vendor/pcre2/src -Wl,-rpath,${libdir}" + AC_CONFIG_SUBDIRS([vendor/pcre2]) + AC_DEFINE([HAVE_LIBPCRE2_8],1,[Define to 1 if the system includes libpcre2]) ]) CFLAGS="$save_CFLAGS" LDFLAGS="$save_LDFLAGS" ]) -AC_SUBST(onig_CFLAGS) -AC_SUBST(onig_LDFLAGS) +AC_SUBST(pcre2_CFLAGS) +AC_SUBST(pcre2_LDFLAGS) -AM_CONDITIONAL([BUILD_ONIGURUMA], [test "x$build_oniguruma" = xyes]) -AM_CONDITIONAL([WITH_ONIGURUMA], [test "x$with_oniguruma" != xno]) +AM_CONDITIONAL([BUILD_PCRE2], [test "x$build_pcre2" = xyes]) +AM_CONDITIONAL([WITH_PCRE2], [test "x$with_pcre2" != xno]) AC_CONFIG_MACRO_DIRS([config/m4 m4]) AC_CONFIG_FILES([Makefile libjq.pc]) AC_OUTPUT - diff --git a/docs/build_mantests.py b/docs/build_mantests.py index bbc7e325bb..d8639533e5 100755 --- a/docs/build_mantests.py +++ b/docs/build_mantests.py @@ -7,13 +7,13 @@ with open('content/manual/dev/manual.yml') as source, \ open('../tests/man.test', 'w') as man, \ - open('../tests/manonig.test', 'w') as manonig: + open('../tests/manpcre2.test', 'w') as manpcre2: manual = yaml.safe_load(source) for section in manual.get('sections', []): for entry in section.get('entries', []): for example in entry.get('examples', []): program = example.get('program', '').replace('\n', ' ') - out = manonig if regex_program_pattern.search(program) else man + out = manpcre2 if regex_program_pattern.search(program) else man print(program, file=out) print(example.get('input', ''), file=out) for 
s in example.get('output', []): diff --git a/docs/content/manual/dev/manual.yml b/docs/content/manual/dev/manual.yml index fcdbcfa3cf..1beb91d0a5 100644 --- a/docs/content/manual/dev/manual.yml +++ b/docs/content/manual/dev/manual.yml @@ -2598,13 +2598,10 @@ sections: body: | jq uses the - [Oniguruma regular expression library](https://github.com/kkos/oniguruma/blob/master/doc/RE), - as do PHP, TextMate, Sublime Text, etc, so the + [PCRE2 library](https://pcre2project.github.io/pcre2/) (Perl-Compatible Regular Expressions), + as do lots of other open source projects, so the description here will focus on jq specifics. - Oniguruma supports several flavors of regular expression, so it is important to know - that jq uses the ["Perl NG" (Perl with named groups)](https://github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md) flavor. - The jq regex filters are defined so that they can be used using one of these patterns: @@ -2631,7 +2628,6 @@ sections: * `n` - Ignore empty matches * `p` - Both s and m modes are enabled * `s` - Single line mode (`^` -> `\A`, `$` -> `\Z`) - * `l` - Find longest possible matches * `x` - Extended regex format (ignore whitespace and comments) To match a whitespace with the `x` flag, use `\s`, e.g. diff --git a/jq.1.prebuilt b/jq.1.prebuilt index 0b15447ec6..cb295ada15 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -1,5 +1,5 @@ . -.TH "JQ" "1" "May 2025" "" "" +.TH "JQ" "1" "January 2026" "" "" . .SH "NAME" \fBjq\fR \- Command\-line JSON processor @@ -2818,11 +2818,7 @@ jq \'[\.[] | tonumber?]\' .IP "" 0 . .SH "REGULAR EXPRESSIONS" -jq uses the Oniguruma regular expression library, as do PHP, TextMate, Sublime Text, etc, so the description here will focus on jq specifics\. - -. -.P -Oniguruma supports several flavors of regular expression, so it is important to know that jq uses the "Perl NG" (Perl with named groups) flavor\.
+jq uses the PCRE2 library (Perl\-Compatible Regular Expressions), as do lots of other open source projects, so the description here will focus on jq specifics\. . .P @@ -2880,9 +2876,6 @@ FLAGS is a string consisting of one of more of the supported flags: \fBs\fR \- Single line mode (\fB^\fR \-> \fB\eA\fR, \fB$\fR \-> \fB\eZ\fR) . .IP "\(bu" 4 -\fBl\fR \- Find longest possible matches -. -.IP "\(bu" 4 \fBx\fR \- Extended regex format (ignore whitespace and comments) . .IP "" 0 diff --git a/src/builtin.c b/src/builtin.c index a6a1d33302..868705aebb 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -18,8 +18,12 @@ #include #include #include -#ifdef HAVE_LIBONIG -#include +#ifdef HAVE_LIBPCRE2_8 +#ifdef WIN32 +// TODO: move to configure.ac +#define PCRE2_STATIC 1 +#endif +#include #endif #include #include @@ -910,233 +914,280 @@ static jv f_unique_by_impl(jq_state *jq, jv input, jv keys) { } } -#ifdef HAVE_LIBONIG -static int f_match_name_iter(const UChar* name, const UChar *name_end, int ngroups, - int *groups, regex_t *reg, void *arg) { - jv captures = *(jv*)arg; - for (int i = 0; i < ngroups; ++i) { - jv cap = jv_array_get(jv_copy(captures),groups[i]-1); - if (jv_get_kind(cap) == JV_KIND_OBJECT) { - cap = jv_object_set(cap, jv_string("name"), jv_string_sized((const char*)name, name_end-name)); - captures = jv_array_set(captures,groups[i]-1,cap); - } else { - jv_free(cap); - } +#if defined(HAVE_LIBPCRE2_8) +static int utf8_cp_count(PCRE2_SPTR s, PCRE2_SPTR end) { + int c = 0; + for (; s < end; c++) { + s += jvp_utf8_decode_length(*s); } - *(jv *)arg = captures; - return 0; + return c; } static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) { - int test = jv_equal(testmode, jv_true()); - jv result; - int onigret; - int global = 0; - regex_t *reg; - OnigErrorInfo einfo; - OnigRegion* region; + int is_testmode = jv_equal(testmode, jv_true()); if (jv_get_kind(input) != JV_KIND_STRING) { jv_free(regex); jv_free(modifiers); - return
type_error(input, "cannot be matched, as it is not a string"); + return type_error(input, "input cannot be matched, as it is not a string"); } if (jv_get_kind(regex) != JV_KIND_STRING) { jv_free(input); jv_free(modifiers); - return type_error(regex, "is not a string"); + return type_error(regex, "regex is not a string"); } - OnigOptionType options = ONIG_OPTION_CAPTURE_GROUP; - + int global = 0; + uint32_t compile_options = + (PCRE2_UTF | // pattern and subject uses utf-8 (PCRE2_CODE_UNIT_WIDTH 8) + PCRE2_UCP | // include unicode in some character classes, ex \b + PCRE2_NO_UTF_CHECK | // pattern is valid utf-8 + PCRE2_DUPNAMES // allow dup group names (same as jq with onig) + ); + uint32_t match_options = PCRE2_NO_UTF_CHECK; if (jv_get_kind(modifiers) == JV_KIND_STRING) { jv modarray = jv_string_explode(jv_copy(modifiers)); jv_array_foreach(modarray, i, mod) { switch ((int)jv_number_value(mod)) { - case 'g': - global = 1; - break; - case 'i': - options |= ONIG_OPTION_IGNORECASE; - break; - case 'x': - options |= ONIG_OPTION_EXTEND; - break; - case 'm': - options |= ONIG_OPTION_MULTILINE; - break; - case 's': - options |= ONIG_OPTION_SINGLELINE; - break; - case 'p': - options |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; - break; - case 'l': - options |= ONIG_OPTION_FIND_LONGEST; - break; - case 'n': - options |= ONIG_OPTION_FIND_NOT_EMPTY; - break; - default: - jv_free(input); - jv_free(regex); - jv_free(modarray); - return jv_invalid_with_msg(jv_string_concat(modifiers, - jv_string(" is not a valid modifier string"))); + case 'g': + global = 1; + break; + case 'i': + compile_options |= PCRE2_CASELESS; + break; + case 'x': + // TODO: also PCRE2_EXTENDED_MORE? + compile_options |= PCRE2_EXTENDED; + break; + case 'm': + compile_options |= PCRE2_MULTILINE; + break; + case 's': + compile_options |= PCRE2_FIRSTLINE; + break; + case 'p': + compile_options |= PCRE2_MULTILINE | PCRE2_FIRSTLINE; + break; + // case 'l': + // TODO: possible with PCRE2? 
throw unsupported error? + // (first) longest match: + // "a aaa bbb" | match("[ab]*";"l") -> "aaa" + // with "lg" it includes all matches after the first longest match. + // if not supported by pcre2 it can probably be reimplement somehow + // with some added complexity. + break; + case 'n': + match_options |= PCRE2_NOTEMPTY; + break; + default: + jv_free(input); + jv_free(regex); + jv_free(modarray); + return jv_invalid_with_msg(jv_string_concat( + modifiers, jv_string(" is not a valid modifier string"))); } } jv_free(modarray); } else if (jv_get_kind(modifiers) != JV_KIND_NULL) { - // If it isn't a string or null, then it is the wrong type... + // if it isn't a string or null, then it is the wrong type jv_free(input); jv_free(regex); - return type_error(modifiers, "is not a string"); + return type_error(modifiers, "regex modifiers is not a string"); } - jv_free(modifiers); - onigret = onig_new(&reg, (const UChar*)jv_string_value(regex), - (const UChar*)(jv_string_value(regex) + jv_string_length_bytes(jv_copy(regex))), - options, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PERL_NG, &einfo); - if (onigret != ONIG_NORMAL) { - UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(ebuf, onigret, &einfo); + pcre2_code *re = NULL; + PCRE2_SIZE erroffset = 0; + int errorcode = 0; + re = pcre2_compile((PCRE2_SPTR)jv_string_value(regex), jv_string_length_bytes(jv_copy(regex)), + compile_options, &errorcode, &erroffset, NULL); + if (re == NULL) { + PCRE2_UCHAR buffer[256] = {0}; + pcre2_get_error_message(errorcode, buffer, sizeof(buffer)); jv_free(input); jv_free(regex); - return jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "), - jv_string((char*)ebuf))); - } - result = test ?
jv_false() : jv_array(); - const char *input_string = jv_string_value(input); - const UChar* start = (const UChar*)jv_string_value(input); - const unsigned long length = jv_string_length_bytes(jv_copy(input)); - const UChar* end = start + length; - region = onig_region_new(); - do { - onigret = onig_search(reg, - (const UChar*)jv_string_value(input), end, /* string boundaries */ - start, end, /* search boundaries */ - region, ONIG_OPTION_NONE); - if (onigret >= 0) { - if (test) { + return jv_invalid_with_msg(jv_string_concat(jv_string("regex failure: "), + jv_string((char *)buffer))); + } + + uint32_t capture_count = 0; + PCRE2_SPTR *capture_names = NULL; + + if (!is_testmode) { + pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count); + if (capture_count > 0) { + // use calloc so that unnamed capture groups has a null pointer + capture_names = jv_mem_calloc(capture_count, sizeof(capture_names[0])); + + uint32_t name_count = 0; + pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &name_count); + if (name_count > 0) { + PCRE2_SPTR name_table; + uint32_t name_entry_size; + pcre2_pattern_info(re, PCRE2_INFO_NAMETABLE, &name_table); + pcre2_pattern_info(re, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size); + + // each entry looks like this: + // struct { + // n uint16_t // big endian + // char name[] // null terminated name + // char padding[] // padding to be PCRE2_INFO_NAMEENTRYSIZE in size + // } + PCRE2_SPTR entry = name_table; + for (uint32_t i = 0; i < name_count; i++) { + // -1 as first pair in ovector is the whole match + int n = ((entry[0] << 8) | entry[1]) - 1; + capture_names[n] = entry + 2; + entry += name_entry_size; + } + } + } + } + + pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL); + PCRE2_SPTR subject = (PCRE2_SPTR)jv_string_value(input); + PCRE2_SIZE subject_length = jv_string_length_bytes(jv_copy(input)); + + PCRE2_SIZE start_byte_offset = 0; + // these are used keep track of codepoint offset for current match + // so that 
we don't have to count from the subject start for each match + // TODO: better tracking for substrings offset also? + int match_offset = 0; + PCRE2_SPTR match_end_prev = subject; + jv result = is_testmode ? jv_false() : jv_array(); + uint32_t retry_empty_options = 0; + for (;;) { + int rc = pcre2_match(re, subject, subject_length, start_byte_offset, + match_options | retry_empty_options, match_data, NULL); + if (rc > 0) { + if (is_testmode) { result = jv_true(); break; } - - // Zero-width match - if (region->end[0] == region->beg[0]) { - unsigned long idx; - const char *fr = (const char*)input_string; - for (idx = 0; fr < input_string+region->beg[0]; idx++) { - fr += jvp_utf8_decode_length(*fr); - } - jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx)); - match = jv_object_set(match, jv_string("length"), jv_number(0)); - match = jv_object_set(match, jv_string("string"), jv_string("")); - jv captures = jv_array(); - for (int i = 1; i < region->num_regs; ++i) { - jv cap = jv_object(); - if (region->beg[i] == -1) { - cap = jv_object_set(cap, jv_string("offset"), jv_number(-1)); - cap = jv_object_set(cap, jv_string("string"), jv_null()); - } else { - cap = jv_object_set(cap, jv_string("offset"), jv_number(idx)); - cap = jv_object_set(cap, jv_string("string"), jv_string("")); - } - cap = jv_object_set(cap, jv_string("length"), jv_number(0)); - cap = jv_object_set(cap, jv_string("name"), jv_null()); - captures = jv_array_append(captures, cap); - } - onig_foreach_name(reg, f_match_name_iter, &captures); - match = jv_object_set(match, jv_string("captures"), captures); - result = jv_array_append(result, match); - // ensure '"qux" | match("(?=u)"; "g")' matches just once - start = (const UChar*)(input_string+region->end[0]+1); - continue; + } else if (rc == PCRE2_ERROR_NOMATCH) { + // TODO: this is based on pcre2demo.c before it used pcre2_next_match + // TODO: maybe can port pcre2_next_match? 
+ if (retry_empty_options == 0) { + // retry did not help + break; } + start_byte_offset += jvp_utf8_decode_length(subject[start_byte_offset]); + retry_empty_options = 0; + // retry match with no extra options + continue; + } else { + jv_free(result); + // TODO: reason string? + result = + jv_invalid_with_msg(jv_string_fmt("regexp match error (%d)", rc)); + break; + } - unsigned long idx; - unsigned long len; - const char *fr = (const char*)input_string; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data); + PCRE2_SPTR match_start = subject + ovector[0]; + PCRE2_SPTR match_end = subject + ovector[1]; + PCRE2_SIZE match_length_bytes = match_end - match_start; + match_offset += utf8_cp_count(match_end_prev, match_start); + int match_length = utf8_cp_count(match_start, match_end); + jv match = jv_object(); + match = jv_object_set(match, jv_string("offset"), jv_number(match_offset)); + match = jv_object_set(match, jv_string("length"), jv_number(match_length)); + match = jv_object_set( + match, jv_string("string"), + jv_string_sized((const char *)match_start, match_length_bytes)); + + jv captures = jv_array(); + for (int i = 1; i < rc; i++) { + int substring_matched = ovector[2 * i] != PCRE2_UNSET; + int cap_offset = 0; + int cap_length = 0; + jv cap_string = jv_null(); + + if (substring_matched) { + PCRE2_SPTR substring_start = subject + ovector[2 * i]; + PCRE2_SPTR substring_end = subject + ovector[2 * i + 1]; + PCRE2_SIZE substring_length = substring_end - substring_start; + cap_offset = match_offset + utf8_cp_count(match_start, substring_start); + cap_length = utf8_cp_count(substring_start, substring_end); + cap_string = + jv_string_sized((const char *)substring_start, substring_length); + } else { + cap_offset = -1; + } - for (idx = len = 0; fr < input_string+region->end[0]; len++) { - if (fr == input_string+region->beg[0]) idx = len, len=0; - fr += jvp_utf8_decode_length(*fr); + jv cap = jv_object(); + cap = jv_object_set(cap, jv_string("offset"), 
jv_number(cap_offset)); + cap = jv_object_set(cap, jv_string("length"), jv_number(cap_length)); + cap = jv_object_set(cap, jv_string("string"), cap_string); + PCRE2_SPTR name = capture_names != NULL ? capture_names[i - 1] : NULL; + if (name != NULL) { + cap = jv_object_set(cap, jv_string("name"), + jv_string((const char *)name)); + } else { + cap = jv_object_set(cap, jv_string("name"), jv_null()); } - jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx)); - - unsigned long blen = region->end[0]-region->beg[0]; - match = jv_object_set(match, jv_string("length"), jv_number(len)); - match = jv_object_set(match, jv_string("string"), jv_string_sized(input_string+region->beg[0],blen)); - jv captures = jv_array(); - for (int i = 1; i < region->num_regs; ++i) { - // Empty capture. - if (region->beg[i] == region->end[i]) { - // Didn't match. - jv cap; - if (region->beg[i] == -1) { - cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(-1)); - cap = jv_object_set(cap, jv_string("string"), jv_null()); - } else { - fr = input_string; - for (idx = 0; fr < input_string+region->beg[i]; idx++) { - fr += jvp_utf8_decode_length(*fr); - } - cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx)); - cap = jv_object_set(cap, jv_string("string"), jv_string("")); - } - cap = jv_object_set(cap, jv_string("length"), jv_number(0)); - cap = jv_object_set(cap, jv_string("name"), jv_null()); - captures = jv_array_append(captures, cap); - continue; - } - fr = input_string; - for (idx = len = 0; fr < input_string+region->end[i]; len++) { - if (fr == input_string+region->beg[i]) idx = len, len=0; - fr += jvp_utf8_decode_length(*fr); - } + captures = jv_array_append(captures, cap); + } - blen = region->end[i]-region->beg[i]; - jv cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx)); - cap = jv_object_set(cap, jv_string("length"), jv_number(len)); - cap = jv_object_set(cap, jv_string("string"), 
jv_string_sized(input_string+region->beg[i],blen)); + // this imitates onig by filling out non-matched capture groups + for (uint32_t i = rc - 1; i < capture_count; i++) { + jv cap = jv_object(); + cap = jv_object_set(cap, jv_string("offset"), jv_number(-1)); + cap = jv_object_set(cap, jv_string("length"), jv_number(0)); + cap = jv_object_set(cap, jv_string("string"), jv_null()); + PCRE2_SPTR name = capture_names != NULL ? capture_names[i] : NULL; + if (name != NULL) { + cap = jv_object_set(cap, jv_string("name"), + jv_string((const char *)name)); + } else { cap = jv_object_set(cap, jv_string("name"), jv_null()); - captures = jv_array_append(captures,cap); } - onig_foreach_name(reg,f_match_name_iter,&captures); - match = jv_object_set(match, jv_string("captures"), captures); - result = jv_array_append(result, match); - start = (const UChar*)(input_string+region->end[0]); - onig_region_free(region,0); - } else if (onigret == ONIG_MISMATCH) { - break; - } else { /* Error */ - UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(ebuf, onigret, &einfo); - jv_free(result); - result = jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "), - jv_string((char*)ebuf))); + captures = jv_array_append(captures, cap); + } + + match = jv_object_set(match, jv_string("captures"), captures); + result = jv_array_append(result, match); + + if (!global) { break; } - } while (global && start <= end); - onig_region_free(region,1); - region = NULL; - onig_free(reg); + + start_byte_offset = ovector[1]; + match_end_prev = match_end; + match_offset += match_length; + + // code below is partially based pcre2_next_match from new version of pcre2 + // maybe use it in the future when more available? 
+ if (ovector[0] == ovector[1]) { + if (ovector[0] == subject_length) { + break; + } + retry_empty_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } + } + + if (capture_names != NULL) { + jv_mem_free(capture_names); + } + pcre2_match_data_free(match_data); + pcre2_code_free(re); jv_free(input); jv_free(regex); + return result; } -#else /* !HAVE_LIBONIG */ + +#else static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) { jv_free(input); jv_free(regex); jv_free(modifiers); jv_free(testmode); - return jv_invalid_with_msg(jv_string("jq was compiled without ONIGURUMA regex library. match/test/sub and related functions are not available.")); + return jv_invalid_with_msg(jv_string("jq was compiled without a regex library. match/test/sub and related functions are not available.")); } -#endif /* HAVE_LIBONIG */ +#endif /* HAVE_LIBPCRE2 */ static jv minmax_by(jv values, jv keys, int is_min) { if (jv_get_kind(values) != JV_KIND_ARRAY) diff --git a/src/main.c b/src/main.c index ce362607e2..d6f47111fb 100644 --- a/src/main.c +++ b/src/main.c @@ -21,10 +21,6 @@ extern void jv_tsd_dtoa_ctx_init(); #endif -#ifdef HAVE_LIBONIG -#include <oniguruma.h> -#endif - #if !defined(HAVE_ISATTY) && defined(HAVE__ISATTY) #undef isatty #define isatty _isatty @@ -302,13 +298,6 @@ int main(int argc, char* argv[]) { (void) setlocale(LC_ALL, ""); #endif -#ifdef HAVE_LIBONIG - // use a lower regex parse depth limit than the default (4096) to protect - // from stack-overflows - // https://github.com/jqlang/jq/security/advisories/GHSA-f946-j5j2-4w5m - onig_set_parse_depth_limit(1024); -#endif - #ifdef __OpenBSD__ if (pledge("stdio rpath", NULL) == -1) { perror("pledge"); diff --git a/tests/jq_fuzz_execute.cpp b/tests/jq_fuzz_execute.cpp index 7d12af9a66..fafebdb969 100644 --- a/tests/jq_fuzz_execute.cpp +++ b/tests/jq_fuzz_execute.cpp @@ -3,12 +3,6 @@ #include "jq.h" #include "jv.h" -#include "oniguruma.h" - -extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { -
onig_set_parse_depth_limit(1024); - return 0; -} // Fuzzer inspired by /src/jq_test.c // The goal is to have the fuzzer execute the functions: diff --git a/tests/manonigtest b/tests/manonigtest deleted file mode 100755 index cc06f42e7b..0000000000 --- a/tests/manonigtest +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -. "${0%/*}/setup" "$@" - -$VALGRIND $Q $JQ -L "$mods" --run-tests $JQBASEDIR/tests/manonig.test diff --git a/tests/manonig.test b/tests/manpcre2.test similarity index 100% rename from tests/manonig.test rename to tests/manpcre2.test diff --git a/tests/manpcre2test b/tests/manpcre2test new file mode 100755 index 0000000000..a067460a24 --- /dev/null +++ b/tests/manpcre2test @@ -0,0 +1,5 @@ +#!/bin/sh + +. "${0%/*}/setup" "$@" + +$VALGRIND $Q $JQ -L "$mods" --run-tests $JQBASEDIR/tests/manpcre2.test diff --git a/tests/onig.supp b/tests/onig.supp deleted file mode 100644 index 37c847eebf..0000000000 --- a/tests/onig.supp +++ /dev/null @@ -1,21 +0,0 @@ -{ - onig node recycling - Memcheck:Leak - ... - fun:onig_parse_make_tree - ... -} -{ - onig unicode case insensitivity 1 - Memcheck:Leak - ... - fun:setup_tree - ... -} -{ - onig unicode case insensitivity 2 - Memcheck:Leak - ... - fun:onig*unicode* - ... -} diff --git a/tests/onigtest b/tests/onigtest deleted file mode 100755 index f452193805..0000000000 --- a/tests/onigtest +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -. 
"${0%/*}/setup" "$@" - -$VALGRIND $Q $JQ -L "$mods" --run-tests $JQTESTDIR/onig.test diff --git a/tests/pcre2.supp b/tests/pcre2.supp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/onig.test b/tests/pcre2.test similarity index 86% rename from tests/onig.test rename to tests/pcre2.test index 3b189d4037..7efe917d96 100644 --- a/tests/onig.test +++ b/tests/pcre2.test @@ -29,9 +29,10 @@ "a bār" [{"offset": 2, "length": 4, "string": "bār", "captures":[]}] -[match(".+?\\b")] -"ā two-codepoint grapheme" -[{"offset": 0, "length": 2, "string": "ā", "captures":[]}] +# TODO: temp disabled as pcre2 < 10.43 captures length 1 +#[match(".+?\\b")] +#"ā two-codepoint grapheme" +#[{"offset": 0, "length": 2, "string": "ā", "captures":[]}] [match(["foo (?bar)? foo", "ig"])] "foo bar foo foo foo" @@ -209,3 +210,34 @@ sub("(?.)"; "\(.x)!") "abAABBabA" ["a","AA","a","A"] +# subject type error test +try match("") catch . +null +"null (null) input cannot be matched, as it is not a string" + +# pattern error test +try match("*") catch . +"" +"regex failure: quantifier does not follow a repeatable item" + +# modifiers type error test +try match(""; 123) catch . +"" +"number (123) regex modifiers is not a string" + +# null modifiers is ok with onig impl +test(""; null) +"" +true + +# multi-byte codepoint split +# different from onig (["","å","","ä","","ö","",""]) +[splits("")] +"åäö" +["","å","ä","ö",""] + +# dup group names (tested to be same as oni) +capture("(?.)(?.)").n +"ab" +"b" + diff --git a/tests/pcre2test b/tests/pcre2test new file mode 100755 index 0000000000..68555238f1 --- /dev/null +++ b/tests/pcre2test @@ -0,0 +1,5 @@ +#!/bin/sh + +. 
"${0%/*}/setup" "$@" + +$VALGRIND $Q $JQ -L "$mods" --run-tests $JQTESTDIR/pcre2.test diff --git a/tests/setup b/tests/setup index 7e6dfe6934..22944d1af0 100755 --- a/tests/setup +++ b/tests/setup @@ -18,7 +18,7 @@ export LC_ALL if [ -n "${ENABLE_VALGRIND-}" ] && which valgrind > /dev/null; then VALGRIND="valgrind --error-exitcode=1 --leak-check=full \ - --suppressions=$JQTESTDIR/onig.supp \ + --suppressions=$JQTESTDIR/pcre2.supp \ --suppressions=$JQTESTDIR/local.supp" VG_EXIT0=--error-exitcode=0 Q=-q diff --git a/vendor/oniguruma b/vendor/oniguruma deleted file mode 160000 index 4ef89209a2..0000000000 --- a/vendor/oniguruma +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4ef89209a239c1aea328cf13c05a2807e5c146d1 diff --git a/vendor/pcre2 b/vendor/pcre2 new file mode 160000 index 0000000000..f454e231fe --- /dev/null +++ b/vendor/pcre2 @@ -0,0 +1 @@ +Subproject commit f454e231fe5006dd7ff8f4693fd2b8eb94333429