diff --git a/CMakeLists.txt b/CMakeLists.txt index 23ba6c5293..1a29381b74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -536,6 +536,7 @@ endif() if(ENABLE_PDFIUM) find_package(PDFIUM REQUIRED MODULE) + set(HAVE_PDFIUM 1) endif() if(ENABLE_JSON_SHARED) diff --git a/Cargo.lock b/Cargo.lock index 37226169b8..27e3acbe69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,7 +345,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "wasm-bindgen", "windows-link", ] @@ -400,6 +402,7 @@ dependencies = [ "num-traits", "onenote_parser", "openssl", + "pdfium-render", "rustdct", "sha1", "sha2", @@ -473,6 +476,26 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "console_log" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" +dependencies = [ + "log", + "web-sys", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -1286,6 +1309,12 @@ dependencies = [ "imgref", ] +[[package]] +name = "maybe-owned" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -1496,6 +1525,32 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" 
+[[package]] +name = "pdfium-render" +version = "0.8.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6553f6604a52b3203db7b4e9d51eb4dd193cf455af9e56d40cab6575b547b679" +dependencies = [ + "bitflags 2.9.4", + "bytemuck", + "bytes", + "chrono", + "console_error_panic_hook", + "console_log", + "image", + "itertools 0.14.0", + "js-sys", + "libloading", + "log", + "maybe-owned", + "once_cell", + "utf16string", + "vecmath", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1508,6 +1563,12 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "piston-float" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" + [[package]] name = "pkg-config" version = "0.3.32" @@ -2223,6 +2284,15 @@ dependencies = [ "serde", ] +[[package]] +name = "utf16string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" +dependencies = [ + "byteorder", +] + [[package]] name = "utf8_iter" version = "1.0.4" @@ -2262,6 +2332,15 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vecmath" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" +dependencies = [ + "piston-float", +] + [[package]] name = "version-compare" version = "0.2.0" @@ -2325,6 +2404,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.54" +source = 
/**
 * @brief Parse one strictly-decimal, non-zero canvas dimension.
 *
 * Consumes a run of ASCII digits starting at *cursor. Unlike the "%u" scanf
 * conversion, no leading whitespace and no '+'/'-' sign is accepted, and a
 * value that does not fit in uint32_t is rejected instead of wrapping.
 *
 * @param[in,out] cursor     Position to parse from; advanced past the digits on success only.
 * @param[out]    dimension  Receives the parsed value on success only.
 * @return 1 on success, 0 if there are no digits, the value is 0, or it exceeds UINT32_MAX.
 */
static int parse_pdf_render_dimension(const char **cursor, uint32_t *dimension)
{
    const char *pos = *cursor;
    uint64_t accum  = 0;

    if ((*pos < '0') || (*pos > '9')) {
        return 0;
    }

    while ((*pos >= '0') && (*pos <= '9')) {
        /* accum <= UINT32_MAX here, so accum * 10 + 9 cannot overflow 64 bits. */
        accum = (accum * 10) + (uint64_t)(*pos - '0');
        if (accum > UINT32_MAX) {
            return 0;
        }
        pos++;
    }

    if (0 == accum) {
        return 0;
    }

    *dimension = (uint32_t)accum;
    *cursor    = pos;

    return 1;
}

/**
 * @brief Parse a PDFRenderCanvas value of the exact form WIDTHxHEIGHT.
 *
 * Both dimensions must be plain decimal numbers in the range 1..UINT32_MAX,
 * separated by a single lowercase 'x', with nothing before or after.
 * Inputs such as "-1x5", "+10x10", " 10x10", "10x 10", "10x10 " and
 * "4294967296x1" are all rejected.
 *
 * @param value        The option string, may be NULL.
 * @param[out] width   Receives the width; untouched on failure.
 * @param[out] height  Receives the height; untouched on failure.
 * @return 1 if the value was parsed, 0 otherwise.
 */
static int parse_pdf_render_canvas(const char *value, uint32_t *width, uint32_t *height)
{
    const char *cursor     = value;
    uint32_t parsed_width  = 0;
    uint32_t parsed_height = 0;

    if ((NULL == value) || (NULL == width) || (NULL == height)) {
        return 0;
    }

    if (!parse_pdf_render_dimension(&cursor, &parsed_width)) {
        return 0;
    }

    if ('x' != *cursor) {
        return 0;
    }
    cursor++;

    if (!parse_pdf_render_dimension(&cursor, &parsed_height)) {
        return 0;
    }

    /* Anything after the height (including whitespace) makes the value invalid. */
    if ('\0' != *cursor) {
        return 0;
    }

    *width  = parsed_width;
    *height = parsed_height;

    return 1;
}

/**
 * @brief Translate a PDFRenderFormat option string into its numeric engine value.
 *
 * The comparison is case-sensitive. "png" maps to 1; "jpeg" and its alias
 * "jpg" map to 2. These are the values handed to
 * cl_engine_set_num(CL_ENGINE_PDF_RENDER_FORMAT).
 *
 * @param value        The option string, may be NULL.
 * @param[out] format  Receives 1 (png) or 2 (jpeg); untouched on failure.
 * @return 1 if the value was recognized, 0 otherwise.
 */
static int parse_pdf_render_format(const char *value, uint32_t *format)
{
    if ((NULL == value) || (NULL == format)) {
        return 0;
    }

    if (0 == strcmp(value, "png")) {
        *format = 1; /* png */
        return 1;
    }

    if ((0 == strcmp(value, "jpeg")) || (0 == strcmp(value, "jpg"))) {
        *format = 2; /* jpeg */
        return 1;
    }

    return 0;
}
cl_engine_get_num(engine, CL_ENGINE_PCRE_MAX_FILESIZE, NULL); logg(LOGG_INFO, "Limits: PCREMaxFileSize limit set to %llu.\n", val); + if (optget(opts, "PDFRenderDPI")->active && optget(opts, "PDFRenderCanvas")->active) { +#ifndef HAVE_PDFIUM + logg(LOGG_ERROR, "PDFRenderDPI and PDFRenderCanvas require ClamAV to be built with PDFium support.\n"); +#else + logg(LOGG_ERROR, "Cannot set both PDFRenderDPI and PDFRenderCanvas.\n"); +#endif + cl_engine_free(engine); + return 1; + } + + if ((opt = optget(opts, "PDFRenderDPI"))->active) { +#ifndef HAVE_PDFIUM + logg(LOGG_ERROR, "PDFRenderDPI requires ClamAV to be built with PDFium support.\n"); + cl_engine_free(engine); + return 1; +#else + if (opt->numarg <= 0) { + logg(LOGG_ERROR, "PDFRenderDPI must be greater than 0.\n"); + cl_engine_free(engine); + return 1; + } + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_DPI, opt->numarg))) { + logg(LOGG_ERROR, "cli_engine_set_num(PDFRenderDPI) failed: %s\n", cl_strerror(ret)); + cl_engine_free(engine); + return 1; + } +#endif + } +#ifdef HAVE_PDFIUM + val = cl_engine_get_num(engine, CL_ENGINE_PDF_RENDER_DPI, NULL); + if (val > 0) { + logg(LOGG_INFO, "PDF rendering: DPI set to %llu.\n", val); + } +#else + logg(LOGG_INFO, "PDF rendering unavailable: built without PDFium support.\n"); +#endif + + if ((opt = optget(opts, "PDFRenderCanvas"))->active) { +#ifndef HAVE_PDFIUM + logg(LOGG_ERROR, "PDFRenderCanvas requires ClamAV to be built with PDFium support.\n"); + cl_engine_free(engine); + return 1; +#else + uint32_t canvas_width = 0; + uint32_t canvas_height = 0; + + if (!parse_pdf_render_canvas(opt->strarg, &canvas_width, &canvas_height)) { + logg(LOGG_ERROR, "PDFRenderCanvas must be in WIDTHxHEIGHT format, for example 1920x1080.\n"); + cl_engine_free(engine); + return 1; + } + + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_WIDTH, canvas_width))) { + logg(LOGG_ERROR, "cli_engine_set_num(PDFRenderCanvasWidth) failed: %s\n", cl_strerror(ret)); + 
cl_engine_free(engine); + return 1; + } + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT, canvas_height))) { + logg(LOGG_ERROR, "cli_engine_set_num(PDFRenderCanvasHeight) failed: %s\n", cl_strerror(ret)); + cl_engine_free(engine); + return 1; + } +#endif + } +#ifdef HAVE_PDFIUM + val = cl_engine_get_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_WIDTH, NULL); + logg(LOGG_INFO, "PDF rendering: canvas width set to %llu.\n", val); + val = cl_engine_get_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT, NULL); + logg(LOGG_INFO, "PDF rendering: canvas height set to %llu.\n", val); +#endif + + if ((opt = optget(opts, "PDFRenderFormat"))->active) { +#ifndef HAVE_PDFIUM + logg(LOGG_ERROR, "PDFRenderFormat requires ClamAV to be built with PDFium support.\n"); + cl_engine_free(engine); + return 1; +#else + uint32_t render_format = 0; + + if (!parse_pdf_render_format(opt->strarg, &render_format)) { + logg(LOGG_ERROR, "PDFRenderFormat must be either png or jpeg.\n"); + cl_engine_free(engine); + return 1; + } + + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_FORMAT, render_format))) { + logg(LOGG_ERROR, "cli_engine_set_num(PDFRenderFormat) failed: %s\n", cl_strerror(ret)); + cl_engine_free(engine); + return 1; + } +#endif + } +#ifdef HAVE_PDFIUM + val = cl_engine_get_num(engine, CL_ENGINE_PDF_RENDER_FORMAT, NULL); + logg(LOGG_INFO, "PDF rendering: format set to %s.\n", (2 == val) ? 
"jpeg" : "png"); +#endif + if (optget(opts, "ScanArchive")->enabled) { logg(LOGG_INFO, "Archive support enabled.\n"); options.parse |= CL_SCAN_PARSE_ARCHIVE; @@ -1143,6 +1286,26 @@ int recvloop(int *socketds, unsigned nsockets, struct cl_engine *engine, unsigne logg(LOGG_INFO, "Detection using image fuzzy hash disabled.\n"); } +#ifndef HAVE_PDFIUM + if (optget(opts, "ScanPDFImageFuzzyHash")->active && + optget(opts, "ScanPDFImageFuzzyHash")->enabled) { + logg(LOGG_ERROR, "ScanPDFImageFuzzyHash requires ClamAV to be built with PDFium support.\n"); + cl_engine_free(engine); + return 1; + } +#endif + + if (optget(opts, "ScanPDFImageFuzzyHash")->enabled) { +#ifdef HAVE_PDFIUM + logg(LOGG_INFO, "Detection using PDF render image fuzzy hash enabled.\n"); + options.parse |= CL_SCAN_PARSE_PDF_IMAGE_FUZZY_HASH; +#else + logg(LOGG_INFO, "Detection using PDF render image fuzzy hash unavailable: built without PDFium support.\n"); +#endif + } else { + logg(LOGG_INFO, "Detection using PDF render image fuzzy hash disabled.\n"); + } + /* TODO: Remove deprecated option in a future feature release. 
/**
 * @brief Parse one strictly-decimal, non-zero canvas dimension.
 *
 * Reads consecutive ASCII digits beginning at *cursor. In contrast to the
 * scanf "%u" conversion this accepts neither leading whitespace nor a sign,
 * and a value larger than UINT32_MAX is an error rather than a wrap-around.
 *
 * @param[in,out] cursor     Parse position; moved past the digits only on success.
 * @param[out]    dimension  Parsed value; written only on success.
 * @return 1 on success, 0 when no digit is present, the value is 0, or it overflows uint32_t.
 */
static int parse_pdf_render_dimension(const char **cursor, uint32_t *dimension)
{
    const char *pos = *cursor;
    uint64_t accum  = 0;

    if ((*pos < '0') || (*pos > '9')) {
        return 0;
    }

    while ((*pos >= '0') && (*pos <= '9')) {
        /* accum <= UINT32_MAX on entry, so this cannot overflow the 64-bit accumulator. */
        accum = (accum * 10) + (uint64_t)(*pos - '0');
        if (accum > UINT32_MAX) {
            return 0;
        }
        pos++;
    }

    if (0 == accum) {
        return 0;
    }

    *dimension = (uint32_t)accum;
    *cursor    = pos;

    return 1;
}

/**
 * @brief Parse a --pdf-render-canvas value of the exact form WIDTHxHEIGHT.
 *
 * Each dimension must be a plain decimal number in 1..UINT32_MAX. The two
 * are separated by one lowercase 'x' and nothing may precede or follow them,
 * so "-1x5", "+10x10", " 10x10", "10x 10", "10x10 " and "4294967296x1" fail.
 *
 * @param value        The option string, may be NULL.
 * @param[out] width   Receives the width; untouched on failure.
 * @param[out] height  Receives the height; untouched on failure.
 * @return 1 if the value was parsed, 0 otherwise.
 */
static int parse_pdf_render_canvas(const char *value, uint32_t *width, uint32_t *height)
{
    const char *cursor     = value;
    uint32_t parsed_width  = 0;
    uint32_t parsed_height = 0;

    if ((NULL == value) || (NULL == width) || (NULL == height)) {
        return 0;
    }

    if (!parse_pdf_render_dimension(&cursor, &parsed_width)) {
        return 0;
    }

    if ('x' != *cursor) {
        return 0;
    }
    cursor++;

    if (!parse_pdf_render_dimension(&cursor, &parsed_height)) {
        return 0;
    }

    /* Trailing characters of any kind, whitespace included, are not allowed. */
    if ('\0' != *cursor) {
        return 0;
    }

    *width  = parsed_width;
    *height = parsed_height;

    return 1;
}

/**
 * @brief Translate a --pdf-render-format string into its numeric engine value.
 *
 * Matching is case-sensitive: "png" yields 1, "jpeg" or "jpg" yields 2.
 * The result is what gets passed to
 * cl_engine_set_num(CL_ENGINE_PDF_RENDER_FORMAT).
 *
 * @param value        The option string, may be NULL.
 * @param[out] format  Receives 1 (png) or 2 (jpeg); untouched on failure.
 * @return 1 if the value was recognized, 0 otherwise.
 */
static int parse_pdf_render_format(const char *value, uint32_t *format)
{
    if ((NULL == value) || (NULL == format)) {
        return 0;
    }

    if (0 == strcmp(value, "png")) {
        *format = 1; /* png */
        return 1;
    }

    if ((0 == strcmp(value, "jpeg")) || (0 == strcmp(value, "jpg"))) {
        *format = 2; /* jpeg */
        return 1;
    }

    return 0;
}
(!parse_pdf_render_canvas(opt->strarg, &canvas_width, &canvas_height)) { + logg(LOGG_ERROR, "--pdf-render-canvas must be in WIDTHxHEIGHT format, for example 1920x1080.\n"); + ret = 2; + goto done; + } + + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_WIDTH, canvas_width))) { + logg(LOGG_ERROR, "cli_engine_set_num(CL_ENGINE_PDF_RENDER_CANVAS_WIDTH) failed: %s\n", cl_strerror(ret)); + ret = 2; + goto done; + } + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT, canvas_height))) { + logg(LOGG_ERROR, "cli_engine_set_num(CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT) failed: %s\n", cl_strerror(ret)); + ret = 2; + goto done; + } +#endif + } + + if ((opt = optget(opts, "pdf-render-format"))->active) { +#ifndef HAVE_PDFIUM + logg(LOGG_ERROR, "--pdf-render-format requires ClamAV to be built with PDFium support.\n"); + ret = 2; + goto done; +#else + uint32_t render_format = 0; + + if (!parse_pdf_render_format(opt->strarg, &render_format)) { + logg(LOGG_ERROR, "--pdf-render-format must be either png or jpeg.\n"); + ret = 2; + goto done; + } + + if ((ret = cl_engine_set_num(engine, CL_ENGINE_PDF_RENDER_FORMAT, render_format))) { + logg(LOGG_ERROR, "cli_engine_set_num(CL_ENGINE_PDF_RENDER_FORMAT) failed: %s\n", cl_strerror(ret)); + ret = 2; + goto done; + } +#endif + } + /* set scan options */ if (optget(opts, "allmatch")->enabled) { options.general |= CL_SCAN_GENERAL_ALLMATCHES; @@ -1772,6 +1899,20 @@ int scanmanager(const struct optstruct *opts) if (optget(opts, "scan-image-fuzzy-hash")->enabled) options.parse |= CL_SCAN_PARSE_IMAGE_FUZZY_HASH; +#ifndef HAVE_PDFIUM + if (optget(opts, "scan-pdf-image-fuzzy-hash")->active && + optget(opts, "scan-pdf-image-fuzzy-hash")->enabled) { + logg(LOGG_ERROR, "--scan-pdf-image-fuzzy-hash requires ClamAV to be built with PDFium support.\n"); + ret = 2; + goto done; + } +#endif + +#ifdef HAVE_PDFIUM + if (optget(opts, "scan-pdf-image-fuzzy-hash")->enabled) + options.parse |= CL_SCAN_PARSE_PDF_IMAGE_FUZZY_HASH; 
+#endif + /* TODO: Remove deprecated option in a future feature release */ if ((optget(opts, "algorithmic-detection")->enabled) && /* && used due to default-yes for both options */ (optget(opts, "heuristic-alerts")->enabled)) { diff --git a/cmake/FindPDFIUM.cmake b/cmake/FindPDFIUM.cmake index d5d55d4a61..ad750ff66a 100644 --- a/cmake/FindPDFIUM.cmake +++ b/cmake/FindPDFIUM.cmake @@ -43,6 +43,20 @@ if(PDFIUM_FOUND AND NOT TARGET PDFIUM::pdfium) add_library(PDFIUM::pdfium UNKNOWN IMPORTED) set_target_properties(PDFIUM::pdfium PROPERTIES IMPORTED_LOCATION "${PDFIUM_LIBRARY}") + + if(APPLE AND PDFIUM_LIBRARY MATCHES "\\.a$") + get_filename_component(_PDFIUM_LIBDIR "${PDFIUM_LIBRARY}" DIRECTORY) + set(_PDFIUM_INTERFACE_LIBRARIES + "c++" + "-framework CoreGraphics") + if(PDFIUM_EXTRA_LIBRARIES) + list(APPEND _PDFIUM_INTERFACE_LIBRARIES ${PDFIUM_EXTRA_LIBRARIES}) + endif() + + set_target_properties(PDFIUM::pdfium PROPERTIES + INTERFACE_LINK_LIBRARIES "${_PDFIUM_INTERFACE_LIBRARIES}") + endif() + if(PDFIUM_INCLUDE_DIR) set_target_properties(PDFIUM::pdfium PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${PDFIUM_INCLUDE_DIR}") diff --git a/common/optparser.c b/common/optparser.c index 46f6fd1919..71a65ac932 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -482,6 +482,14 @@ const struct clam_option __clam_options[] = { {"ScanImageFuzzyHash", "scan-image-fuzzy-hash", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "This option enables detection by calculating a fuzzy hash of image (graphics)\nfiles\nSignatures using image fuzzy hashes typically match files and documents by\nidentifying images embedded or attached to those files.\nIf you turn off this option, then some files may no longer be detected.", "yes"}, + {"ScanPDFImageFuzzyHash", "scan-pdf-image-fuzzy-hash", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "This option enables PDF rendering plus image fuzzy hash detection for PDF files.\nIf you turn off this option, 
PDFs will still be scanned, but the PDFium-based\nrender and fuzzy hash step will be skipped.", "yes"}, + + {"PDFRenderDPI", "pdf-render-dpi", 0, CLOPT_TYPE_NUMBER, MATCH_NUMBER, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Render PDF pages for fuzzy hash calculation at the specified DPI.\nThis option is mutually exclusive with PDFRenderCanvas / --pdf-render-canvas.\nThe value must be greater than 0.", "0"}, + + {"PDFRenderCanvas", "pdf-render-canvas", 0, CLOPT_TYPE_STRING, NULL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Render PDF pages for fuzzy hash calculation to fit within a canvas specified\nas WIDTHxHEIGHT pixels.\nThis option is mutually exclusive with PDFRenderDPI / --pdf-render-dpi.\nExample: 1920x1080.", "2000x2000"}, + + {"PDFRenderFormat", "pdf-render-format", 0, CLOPT_TYPE_STRING, NULL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Render PDF pages for fuzzy hash calculation as either PNG or JPEG.\nSupported values are 'png' and 'jpeg'.", "png"}, + {"ForceToDisk", "force-to-disk", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "This option causes memory or nested map scans to dump the content to disk.\nIf you turn on this option, more data is written to disk and is available\nwhen the leave-temps option is enabled at the cost of more disk writes.", "no"}, {"MaxScanTime", "max-scantime", 0, CLOPT_TYPE_NUMBER, MATCH_NUMBER, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "This option sets the maximum amount of time a scan may take to complete.\nThe value of 0 disables the limit.\nWARNING: disabling this limit or setting it too high may result allow scanning\nof certain files to lock up the scanning process/threads resulting in a Denial of Service.\nThe value is in milliseconds.", "120000"}, diff --git a/etc/clamd.conf.sample b/etc/clamd.conf.sample index 116822c4d8..d4093e19eb 100644 --- a/etc/clamd.conf.sample +++ b/etc/clamd.conf.sample @@ -515,6 +515,31 @@ Example # Default: yes #ScanImageFuzzyHash no +# This option enables PDF rendering plus image 
fuzzy hash detection for PDF +# files. +# If you turn off this option, PDFs will still be scanned, but the PDFium-based +# render and fuzzy hash step will be skipped. +# Default: yes +#ScanPDFImageFuzzyHash no + +# Render PDF pages for fuzzy hash calculation at the specified DPI. +# This option is mutually exclusive with PDFRenderCanvas. +# The value must be greater than 0. +# Default: disabled +#PDFRenderDPI 144 + +# Render PDF pages for fuzzy hash calculation to fit within a canvas specified +# as WIDTHxHEIGHT pixels. +# This option is mutually exclusive with PDFRenderDPI. +# Example: 1920x1080 +# Default: 2000x2000 +#PDFRenderCanvas 1920x1080 + +# Render PDF pages for fuzzy hash calculation as either PNG or JPEG. +# Supported values are 'png' and 'jpeg'. +# Default: png +#PDFRenderFormat png + ## ## Mail files diff --git a/libclamav/clamav.h b/libclamav/clamav.h index b41d39e39b..adc22aad6d 100644 --- a/libclamav/clamav.h +++ b/libclamav/clamav.h @@ -205,6 +205,7 @@ struct cl_scan_options { #define CL_SCAN_PARSE_ONENOTE 0x400 #define CL_SCAN_PARSE_IMAGE 0x800 /** option to enable/disable parsing images (graphics) */ #define CL_SCAN_PARSE_IMAGE_FUZZY_HASH 0x1000 /** option to enable/disable image fuzzy hash calculation. */ +#define CL_SCAN_PARSE_PDF_IMAGE_FUZZY_HASH 0x2000 /** option to enable/disable PDF rendering and image fuzzy hash calculation for PDFs. 
*/ /* heuristic alerting options */ #define CL_SCAN_HEURISTIC_BROKEN 0x2 /** alert on broken PE and broken ELF files */ @@ -345,6 +346,10 @@ enum cl_engine_field { CL_ENGINE_CVDCERTSDIR, /** (char *) */ CL_ENGINE_TMPDIR_RECURSION, /** uint32_t */ CL_ENGINE_FIPS_LIMITS, /** uint32_t */ + CL_ENGINE_PDF_RENDER_DPI, /** uint32_t */ + CL_ENGINE_PDF_RENDER_CANVAS_WIDTH, /** uint32_t */ + CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT, /** uint32_t */ + CL_ENGINE_PDF_RENDER_FORMAT, /** uint32_t */ }; enum bytecode_security { diff --git a/libclamav/matcher.c b/libclamav/matcher.c index 0367e33580..c8ad849f32 100644 --- a/libclamav/matcher.c +++ b/libclamav/matcher.c @@ -186,6 +186,7 @@ static inline cl_error_t matcher_run(const struct cli_matcher *root, } switch (ftype) { + case CL_TYPE_PDF: case CL_TYPE_GIF: case CL_TYPE_TIFF: case CL_TYPE_JPEG: diff --git a/libclamav/others.c b/libclamav/others.c index d14a6aecb6..643bf54f18 100644 --- a/libclamav/others.c +++ b/libclamav/others.c @@ -578,6 +578,9 @@ struct cl_engine *cl_engine_new(void) /* Engine max settings */ new->maxiconspe = CLI_DEFAULT_MAXICONSPE; new->maxrechwp3 = CLI_DEFAULT_MAXRECHWP3; + new->pdf_render_canvas_width = 2000; + new->pdf_render_canvas_height = 2000; + new->pdf_render_format = 1; /* PCRE matching limitations */ new->pcre_match_limit = CLI_DEFAULT_PCRE_MATCH_LIMIT; @@ -867,6 +870,18 @@ cl_error_t cl_engine_set_num(struct cl_engine *engine, enum cl_engine_field fiel engine->engine_options &= ~(ENGINE_OPTIONS_FIPS_LIMITS); } break; + case CL_ENGINE_PDF_RENDER_DPI: + engine->pdf_render_dpi = (uint32_t)num; + break; + case CL_ENGINE_PDF_RENDER_CANVAS_WIDTH: + engine->pdf_render_canvas_width = (uint32_t)num; + break; + case CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT: + engine->pdf_render_canvas_height = (uint32_t)num; + break; + case CL_ENGINE_PDF_RENDER_FORMAT: + engine->pdf_render_format = (uint32_t)num; + break; default: cli_errmsg("cl_engine_set_num: Incorrect field number\n"); return CL_EARG; @@ -954,6 +969,14 @@ long 
long cl_engine_get_num(const struct cl_engine *engine, enum cl_engine_field return engine->pcre_recmatch_limit; case CL_ENGINE_PCRE_MAX_FILESIZE: return engine->pcre_max_filesize; + case CL_ENGINE_PDF_RENDER_DPI: + return engine->pdf_render_dpi; + case CL_ENGINE_PDF_RENDER_CANVAS_WIDTH: + return engine->pdf_render_canvas_width; + case CL_ENGINE_PDF_RENDER_CANVAS_HEIGHT: + return engine->pdf_render_canvas_height; + case CL_ENGINE_PDF_RENDER_FORMAT: + return engine->pdf_render_format; default: cli_errmsg("cl_engine_get: Incorrect field number\n"); if (err) @@ -1098,6 +1121,10 @@ struct cl_settings *cl_engine_settings_copy(const struct cl_engine *engine) settings->pcre_match_limit = engine->pcre_match_limit; settings->pcre_recmatch_limit = engine->pcre_recmatch_limit; settings->pcre_max_filesize = engine->pcre_max_filesize; + settings->pdf_render_dpi = engine->pdf_render_dpi; + settings->pdf_render_canvas_width = engine->pdf_render_canvas_width; + settings->pdf_render_canvas_height = engine->pdf_render_canvas_height; + settings->pdf_render_format = engine->pdf_render_format; return settings; } @@ -1179,6 +1206,10 @@ cl_error_t cl_engine_settings_apply(struct cl_engine *engine, const struct cl_se engine->pcre_match_limit = settings->pcre_match_limit; engine->pcre_recmatch_limit = settings->pcre_recmatch_limit; engine->pcre_max_filesize = settings->pcre_max_filesize; + engine->pdf_render_dpi = settings->pdf_render_dpi; + engine->pdf_render_canvas_width = settings->pdf_render_canvas_width; + engine->pdf_render_canvas_height = settings->pdf_render_canvas_height; + engine->pdf_render_format = settings->pdf_render_format; return CL_SUCCESS; } diff --git a/libclamav/others.h b/libclamav/others.h index a58b8c6605..c10132024c 100644 --- a/libclamav/others.h +++ b/libclamav/others.h @@ -445,6 +445,10 @@ struct cl_engine { uint64_t pcre_match_limit; uint64_t pcre_recmatch_limit; uint64_t pcre_max_filesize; + uint32_t pdf_render_dpi; + uint32_t pdf_render_canvas_width; + uint32_t 
pdf_render_canvas_height; + uint32_t pdf_render_format; #ifdef HAVE_YARA /* YARA */ @@ -523,6 +527,10 @@ struct cl_settings { uint64_t pcre_match_limit; uint64_t pcre_recmatch_limit; uint64_t pcre_max_filesize; + uint32_t pdf_render_dpi; + uint32_t pdf_render_canvas_width; + uint32_t pdf_render_canvas_height; + uint32_t pdf_render_format; }; extern cl_unrar_error_t (*cli_unrar_open)(const char *filename, void **hArchive, char **comment, uint32_t *comment_size, uint8_t debug_flag); diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 24e7ee4065..2efdf4cd8d 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -289,7 +289,7 @@ static cl_error_t cli_scanrar_file(const char *filepath, int desc, cli_ctx *ctx) } /* Scan the comment */ - status = cli_magic_scan_buff(comment, comment_size, ctx, NULL, LAYER_ATTRIBUTES_NONE); + status = cli_magic_scan_buff(comment, comment_size, ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); if (status != CL_SUCCESS) { goto done; } @@ -703,7 +703,7 @@ static cl_error_t cli_scanegg(cli_ctx *ctx) /* * Scan the comment. */ - status = cli_magic_scan_buff(comments[i], strlen(comments[i]), ctx, NULL, LAYER_ATTRIBUTES_NONE); + status = cli_magic_scan_buff(comments[i], strlen(comments[i]), ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); if (status != CL_SUCCESS) { goto done; } @@ -863,7 +863,7 @@ static cl_error_t cli_scanegg(cli_ctx *ctx) * Scan the extracted file... */ cli_dbgmsg("EGG: Extraction complete. 
Scanning now...\n"); - status = cli_magic_scan_buff(extract_buffer, extract_buffer_len, ctx, filename_base, LAYER_ATTRIBUTES_NONE); + status = cli_magic_scan_buff(extract_buffer, extract_buffer_len, ctx, CL_TYPE_ANY, filename_base, LAYER_ATTRIBUTES_NONE); if (status != CL_SUCCESS) { goto done; } @@ -4349,13 +4349,28 @@ static cl_error_t dispatch_prescan_callback(clcb_pre_scan cb, cli_ctx *ctx, cons return status; } +static bool should_calculate_image_fuzzy_hash(cli_ctx *ctx) +{ + if (SCAN_PARSE_IMAGE_FUZZY_HASH && (DCONF_OTHER & OTHER_CONF_IMAGE_FUZZY_HASH)) { + return true; + } + + if ((ctx->options->parse & CL_SCAN_PARSE_PDF_IMAGE_FUZZY_HASH) && + (ctx->recursion_level > 0) && + (ctx->recursion_stack[ctx->recursion_level].attributes & LAYER_ATTRIBUTES_NORMALIZED) && + (ctx->recursion_stack[ctx->recursion_level - 1].type == CL_TYPE_PDF)) { + return true; + } + + return false; +} + static cl_error_t calculate_fuzzy_image_hash(cli_ctx *ctx, cli_file_t type) { cl_error_t status = CL_EPARSE; const uint8_t *offset = NULL; image_fuzzy_hash_t hash = {0}; json_object *header = NULL; - FFIError *fuzzy_hash_calc_error = NULL; offset = fmap_need_off(ctx->fmap, 0, ctx->fmap->real_len); @@ -4400,6 +4415,104 @@ static cl_error_t calculate_fuzzy_image_hash(cli_ctx *ctx, cli_file_t type) return status; } +#ifdef HAVE_PDFIUM +static cl_error_t scan_rendered_pdf_image(cli_ctx *ctx) +{ + cl_error_t status = CL_EPARSE; + const uint8_t *offset = NULL; + FFIError *pdf_render_error = NULL; + RenderedPdfImage rendered_image = {0}; + PdfImageFuzzyHashConfig config = {0}; + char *source_basename = NULL; + char *rendered_image_path = NULL; + int rendered_image_fd = -1; + char *rendered_image_name = NULL; + + offset = fmap_need_off(ctx->fmap, 0, ctx->fmap->real_len); + if (NULL == offset) { + return CL_EMAP; + } + + if (ctx->engine->pdf_render_dpi > 0) { + config.render_mode = PDF_IMAGE_FUZZY_HASH_RENDER_MODE_DPI; + config.dpi = ctx->engine->pdf_render_dpi; + } else { + config.render_mode = 
PDF_IMAGE_FUZZY_HASH_RENDER_MODE_CANVAS; + config.canvas_width = ctx->engine->pdf_render_canvas_width; + config.canvas_height = ctx->engine->pdf_render_canvas_height; + } + config.image_format = ctx->engine->pdf_render_format; + + if (!pdf_render_to_image(offset, ctx->fmap->real_len, &config, &rendered_image, &pdf_render_error)) { + cli_dbgmsg("Failed to render PDF for normalized image scan: %s\n", ffierror_fmt(pdf_render_error)); + goto done; + } + + if ((NULL != ctx->fmap->name) && + (CL_SUCCESS == cli_basename(ctx->fmap->name, strlen(ctx->fmap->name), &source_basename, + true /* posix_support_backslash_pathsep */))) { + const char *image_ext = (rendered_image.image_type == CL_TYPE_JPEG) ? "jpeg" : "png"; + size_t rendered_name_len = strlen("pdf-render-") + strlen(source_basename) + strlen(".") + strlen(image_ext) + 1; + + rendered_image_name = cli_max_malloc(rendered_name_len); + if (NULL == rendered_image_name) { + status = CL_EMEM; + goto done; + } + + snprintf(rendered_image_name, rendered_name_len, "pdf-render-%s.%s", source_basename, image_ext); + } else { + rendered_image_name = cli_safer_strdup((rendered_image.image_type == CL_TYPE_JPEG) ? 
"pdf-render.jpeg" : "pdf-render.png"); + if (NULL == rendered_image_name) { + status = CL_EMEM; + goto done; + } + } + + if (ctx->engine->keeptmp) { + status = cli_gentempfd_with_prefix(ctx->this_layer_tmpdir, rendered_image_name, &rendered_image_path, &rendered_image_fd); + if (status != CL_SUCCESS) { + goto done; + } + + if (cli_writen(rendered_image_fd, rendered_image.image_data, rendered_image.image_len) == (size_t)-1) { + cli_errmsg("Failed writing rendered PDF image tempfile\n"); + status = CL_EWRITE; + goto done; + } + + cli_dbgmsg("Rendered PDF image written to %s\n", rendered_image_path); + + status = cli_magic_scan_desc_type(rendered_image_fd, rendered_image_path, ctx, rendered_image.image_type, + rendered_image_name, LAYER_ATTRIBUTES_NORMALIZED); + } else { + status = cli_magic_scan_buff(rendered_image.image_data, rendered_image.image_len, ctx, + rendered_image.image_type, rendered_image_name, LAYER_ATTRIBUTES_NORMALIZED); + } + +done: + if (rendered_image_fd >= 0) { + close(rendered_image_fd); + } + if (NULL != rendered_image_path) { + free(rendered_image_path); + } + if (NULL != rendered_image_name) { + free(rendered_image_name); + } + if (NULL != source_basename) { + free(source_basename); + } + if (NULL != rendered_image.image_data) { + pdf_rendered_image_free(rendered_image.image_data, rendered_image.image_len); + } + if (NULL != pdf_render_error) { + ffierror_free(pdf_render_error); + } + return status; +} +#endif + /** * @brief A unified list of reasons why a scan result inside the magic_scan function * should goto done instead of continuing to parse/scan this layer. @@ -5003,7 +5116,7 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) * JPEG 2000 is not handled by cli_parsejpeg. */ - if (SCAN_PARSE_IMAGE_FUZZY_HASH && (DCONF_OTHER & OTHER_CONF_IMAGE_FUZZY_HASH)) { + if (should_calculate_image_fuzzy_hash(ctx)) { // It's okay if it fails to calculate the fuzzy hash. 
(void)calculate_fuzzy_image_hash(ctx, type); } @@ -5024,7 +5137,7 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) } } - if (SCAN_PARSE_IMAGE_FUZZY_HASH && (DCONF_OTHER & OTHER_CONF_IMAGE_FUZZY_HASH)) { + if (should_calculate_image_fuzzy_hash(ctx)) { // It's okay if it fails to calculate the fuzzy hash. (void)calculate_fuzzy_image_hash(ctx, type); } @@ -5045,7 +5158,7 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) } } - if (SCAN_PARSE_IMAGE_FUZZY_HASH && (DCONF_OTHER & OTHER_CONF_IMAGE_FUZZY_HASH)) { + if (should_calculate_image_fuzzy_hash(ctx)) { // It's okay if it fails to calculate the fuzzy hash. (void)calculate_fuzzy_image_hash(ctx, type); } @@ -5069,7 +5182,7 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) } } - if (SCAN_PARSE_IMAGE_FUZZY_HASH && (DCONF_OTHER & OTHER_CONF_IMAGE_FUZZY_HASH)) { + if (should_calculate_image_fuzzy_hash(ctx)) { // It's okay if it fails to calculate the fuzzy hash. (void)calculate_fuzzy_image_hash(ctx, type); } @@ -5252,6 +5365,11 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) case CL_TYPE_PDF: /* FIXMELIMITS: pdf should be an archive! 
*/ if (SCAN_PARSE_PDF && (DCONF_DOC & DOC_CONF_PDF)) { ret = cli_scanpdf(ctx, 0); +#ifdef HAVE_PDFIUM + if ((CL_SUCCESS == ret) && (ctx->options->parse & CL_SCAN_PARSE_PDF_IMAGE_FUZZY_HASH)) { + ret = scan_rendered_pdf_image(ctx); + } +#endif } break; @@ -5585,7 +5703,7 @@ cl_error_t cli_magic_scan_nested_fmap_type(cl_fmap_t *map, size_t offset, size_t return ret; } -cl_error_t cli_magic_scan_buff(const void *buffer, size_t length, cli_ctx *ctx, const char *name, uint32_t attributes) +cl_error_t cli_magic_scan_buff(const void *buffer, size_t length, cli_ctx *ctx, cli_file_t type, const char *name, uint32_t attributes) { cl_error_t ret; fmap_t *map = NULL; @@ -5595,7 +5713,7 @@ cl_error_t cli_magic_scan_buff(const void *buffer, size_t length, cli_ctx *ctx, return CL_EMAP; } - ret = cli_magic_scan_nested_fmap_type(map, 0, length, ctx, CL_TYPE_ANY, name, attributes); + ret = cli_magic_scan_nested_fmap_type(map, 0, length, ctx, type, name, attributes); fmap_free(map); @@ -5961,7 +6079,7 @@ static cl_error_t scan_common( if (status != CL_VIRUS && (iroot->ac_lsigs || iroot->ac_patterns || iroot->pcre_metas)) { cli_dbgmsg("scan_common: running deprecated preclass bytecodes for target type 13\n"); ctx.options->general &= ~CL_SCAN_GENERAL_COLLECT_METADATA; - status = cli_magic_scan_buff(jstring, strlen(jstring), &ctx, NULL, LAYER_ATTRIBUTES_NONE); + status = cli_magic_scan_buff(jstring, strlen(jstring), &ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); } } diff --git a/libclamav/scanners.h b/libclamav/scanners.h index 5a823159fe..74b4ef0f72 100644 --- a/libclamav/scanners.h +++ b/libclamav/scanners.h @@ -90,17 +90,18 @@ cl_error_t cli_magic_scan_nested_fmap_type(cl_fmap_t *map, size_t offset, size_t /** * @brief Convenience wrapper for cli_magic_scan_nested_fmap_type(). * - * Creates an fmap and calls cli_magic_scan_nested_fmap_type() for you, with type CL_TYPE_ANY. + * Creates an fmap and calls cli_magic_scan_nested_fmap_type() for you. 
* * @param buffer Pointer to the buffer to be scanned. * @param length Size in bytes of the buffer being scanned. * @param ctx Scanning context structure. + * @param type CL_TYPE of data to be scanned. * @param name (optional) Original name of the file (to set fmap name metadata) * @param attributes Layer attributes of the file being scanned (is it normalized, decrypted, etc) * @return int CL_SUCCESS, or an error code. */ cl_error_t cli_magic_scan_buff(const void *buffer, size_t length, cli_ctx *ctx, - const char *name, uint32_t attributes); + cli_file_t type, const char *name, uint32_t attributes); /** * @brief Internal-use version of cl_scanfile. diff --git a/libclamav/xar.c b/libclamav/xar.c index a74be0607b..798cc17828 100644 --- a/libclamav/xar.c +++ b/libclamav/xar.c @@ -307,7 +307,7 @@ static int xar_scan_subdocuments(xmlTextReaderPtr reader, cli_ctx *ctx) } subdoc_len = xmlStrlen(subdoc); cli_dbgmsg("cli_scanxar: in-memory scan of xml subdocument, len %i.\n", subdoc_len); - rc = cli_magic_scan_buff(subdoc, subdoc_len, ctx, NULL, LAYER_ATTRIBUTES_NONE); + rc = cli_magic_scan_buff(subdoc, subdoc_len, ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); /* make a file to leave if --leave-temps in effect */ if (ctx->engine->keeptmp) { @@ -512,7 +512,7 @@ int cli_scanxar(cli_ctx *ctx) /* scan the xml */ cli_dbgmsg("cli_scanxar: scanning xar TOC xml in memory.\n"); - rc = cli_magic_scan_buff(toc, hdr.toc_length_decompressed, ctx, NULL, LAYER_ATTRIBUTES_NONE); + rc = cli_magic_scan_buff(toc, hdr.toc_length_decompressed, ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); if (rc != CL_SUCCESS) { goto exit_toc; } diff --git a/libclamav/xdp.c b/libclamav/xdp.c index 8b02d963f7..18fd91bfc0 100644 --- a/libclamav/xdp.c +++ b/libclamav/xdp.c @@ -158,7 +158,7 @@ cl_error_t cli_scanxdp(cli_ctx *ctx) break; } - rc = cli_magic_scan_buff(decoded, decodedlen, ctx, NULL, LAYER_ATTRIBUTES_NONE); + rc = cli_magic_scan_buff(decoded, decodedlen, ctx, CL_TYPE_ANY, NULL, 
LAYER_ATTRIBUTES_NONE); free(decoded); if (rc != CL_SUCCESS) { xmlFree((void *)value); diff --git a/libclamav/xlm_extract.c b/libclamav/xlm_extract.c index daaa26af0d..3c9c915a4c 100644 --- a/libclamav/xlm_extract.c +++ b/libclamav/xlm_extract.c @@ -4353,7 +4353,7 @@ cl_error_t process_blip_record(struct OfficeArtRecordHeader_Unpacked *rh, const NULL, LAYER_ATTRIBUTES_NONE); } else { /* Scan the buffer */ - ret = cli_magic_scan_buff(start_of_image, size_of_image, ctx, NULL, LAYER_ATTRIBUTES_NONE); + ret = cli_magic_scan_buff(start_of_image, size_of_image, ctx, CL_TYPE_ANY, NULL, LAYER_ATTRIBUTES_NONE); } if (CL_SUCCESS != ret) { status = ret; diff --git a/libclamav_rust/Cargo.toml b/libclamav_rust/Cargo.toml index cdb1ba5f21..4589685ac6 100644 --- a/libclamav_rust/Cargo.toml +++ b/libclamav_rust/Cargo.toml @@ -5,6 +5,7 @@ name = "clamav_rust" version = "0.0.1" [dependencies] +pdfium-render = { version = "0.8.37", features = ["static"] } flate2 = "1" hex = "0.4" libc = "0.2" @@ -41,5 +42,5 @@ crate-type = ["staticlib"] name = "clamav_rust" [build-dependencies] -cbindgen = { version ="0.25", default-features = false } +cbindgen = { version = "0.25", default-features = false } bindgen = "0.65" diff --git a/libclamav_rust/build.rs b/libclamav_rust/build.rs index 010703dc68..400383e355 100644 --- a/libclamav_rust/build.rs +++ b/libclamav_rust/build.rs @@ -95,6 +95,11 @@ const C_HEADER_OUTPUT: &str = "clamav_rust.h"; const ENV_PATTERNS: &[&str] = &["CARGO_", "RUST", "LIB"]; fn main() -> Result<(), &'static str> { + println!("cargo:rerun-if-changed=build.rs"); + println!("cargo:rerun-if-changed=cbindgen.toml"); + println!("cargo:rerun-if-changed=src/fuzzy_hash.rs"); + println!("cargo:rerun-if-changed=src/pdf_render.rs"); + // Dump the command line and interesting environment variables for diagnostic // purposes. 
These will end up in a 'stderr' file under the target directory, // in a ".../clamav_rust-" subdirectory diff --git a/libclamav_rust/cbindgen.toml b/libclamav_rust/cbindgen.toml index d404dc84a3..03aefdbca4 100644 --- a/libclamav_rust/cbindgen.toml +++ b/libclamav_rust/cbindgen.toml @@ -40,6 +40,10 @@ include = [ "fuzzy_hash::fuzzy_hash_calculate_image", "fuzzy_hash::fuzzy_hash_load_subsignature", "fuzzy_hash::fuzzy_hash_check", + "pdf_render::PdfImageFuzzyHashConfig", + "pdf_render::RenderedPdfImage", + "pdf_render::pdf_render_to_image", + "pdf_render::pdf_rendered_image_free", "ffi_util::FFIError", "ffi_util::ffierror_fmt", "ffi_util::ffierror_free", diff --git a/libclamav_rust/src/fuzzy_hash.rs b/libclamav_rust/src/fuzzy_hash.rs index dff79c8da3..1b1b465222 100644 --- a/libclamav_rust/src/fuzzy_hash.rs +++ b/libclamav_rust/src/fuzzy_hash.rs @@ -422,6 +422,10 @@ pub fn fuzzy_hash_calculate_image(buffer: &[u8]) -> Result, Error> { Err(_) => return Err(Error::ImageLoadPanic()), }; + fuzzy_hash_calculate_dynamic_image(og_image) +} + +pub fn fuzzy_hash_calculate_dynamic_image(og_image: DynamicImage) -> Result, Error> { // Drop the alpha channel (if exists). let buff_rgb8 = og_image.to_rgb8(); diff --git a/libclamav_rust/src/lib.rs b/libclamav_rust/src/lib.rs index 46bf401927..2be8c0bbc1 100644 --- a/libclamav_rust/src/lib.rs +++ b/libclamav_rust/src/lib.rs @@ -35,5 +35,6 @@ pub mod fmap; pub mod fuzzy_hash; pub mod logging; pub mod onenote; +pub mod pdf_render; pub mod scanners; pub mod util; diff --git a/libclamav_rust/src/pdf_render.rs b/libclamav_rust/src/pdf_render.rs new file mode 100644 index 0000000000..64335418a5 --- /dev/null +++ b/libclamav_rust/src/pdf_render.rs @@ -0,0 +1,189 @@ +/* + * Render the first page of a PDF document to a PNG or JPEG image held in a byte buffer. + * + * Copyright (C) 2023 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+ * + * Authors: Micah Snyder + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +use image::{DynamicImage, ImageFormat}; +use pdfium_render::prelude::*; +use std::io::Cursor; + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct PdfImageFuzzyHashConfig { + pub render_mode: u32, + pub dpi: u32, + pub canvas_width: u32, + pub canvas_height: u32, + pub image_format: u32, +} + +#[repr(C)] +#[derive(Debug)] +pub struct RenderedPdfImage { + pub image_data: *mut u8, + pub image_len: usize, + pub image_type: crate::sys::cli_file_t, +} + +pub const PDF_IMAGE_FUZZY_HASH_RENDER_MODE_DEFAULT: u32 = 0; +pub const PDF_IMAGE_FUZZY_HASH_RENDER_MODE_DPI: u32 = 1; +pub const PDF_IMAGE_FUZZY_HASH_RENDER_MODE_CANVAS: u32 = 2; +pub const PDF_IMAGE_FUZZY_HASH_IMAGE_FORMAT_PNG: u32 = 1; +pub const PDF_IMAGE_FUZZY_HASH_IMAGE_FORMAT_JPEG: u32 = 2; + +#[derive(thiserror::Error, Debug)] +pub enum PdfRenderError { + #[error("PDF Rendering error : {0}")] + PDFRenderError(#[from] PdfiumError), + + #[error("PDF Rendering error : {0}")] + ImageEncodeError(#[from] image::ImageError), + + #[error("PDF Rendering error : empty document")] + PDFRenderEmpty, +} + +impl Default for PdfImageFuzzyHashConfig { + fn default() -> Self { + Self { + render_mode: PDF_IMAGE_FUZZY_HASH_RENDER_MODE_CANVAS, + dpi: 0, + canvas_width: 2000, + canvas_height: 2000, + image_format: 
PDF_IMAGE_FUZZY_HASH_IMAGE_FORMAT_PNG, + } + } +} + +pub fn render( + data: &[u8], + config: Option<&PdfImageFuzzyHashConfig>, +) -> Result { + //let pdfium = Pdfium::new(Pdfium::bind_to_system_library()?); + let pdfium = Pdfium::new(Pdfium::bind_to_statically_linked_library()?); + let document = pdfium.load_pdf_from_byte_slice(data, None)?; + + if document.pages().is_empty() { + return Err(PdfRenderError::PDFRenderEmpty); + } + + let config = config.copied().unwrap_or_default(); + let render_config = match config.render_mode { + PDF_IMAGE_FUZZY_HASH_RENDER_MODE_DPI if config.dpi > 0 => { + PdfRenderConfig::new().scale_page_by_factor(config.dpi as f32 / 72.0) + } + PDF_IMAGE_FUZZY_HASH_RENDER_MODE_CANVAS | PDF_IMAGE_FUZZY_HASH_RENDER_MODE_DEFAULT => { + let canvas_width = if config.canvas_width > 0 { + config.canvas_width + } else { + 2000 + } as i32; + let canvas_height = if config.canvas_height > 0 { + config.canvas_height + } else { + 2000 + } as i32; + PdfRenderConfig::new().scale_page_to_display_size(canvas_width, canvas_height) + } + _ => PdfRenderConfig::new().scale_page_to_display_size(2000, 2000), + }; + + let image = document + .pages() + .first()? + .render_with_config(&render_config)? 
+ .as_image(); + + Ok(image) +} + +pub fn render_to_image( + data: &[u8], + config: Option<&PdfImageFuzzyHashConfig>, +) -> Result<(Vec, crate::sys::cli_file_t), PdfRenderError> { + let image = render(data, config)?; + let config = config.copied().unwrap_or_default(); + let mut cursor = Cursor::new(Vec::new()); + let image_type = match config.image_format { + PDF_IMAGE_FUZZY_HASH_IMAGE_FORMAT_JPEG => { + image.write_to(&mut cursor, ImageFormat::Jpeg)?; + crate::sys::cli_file_CL_TYPE_JPEG + } + _ => { + image.write_to(&mut cursor, ImageFormat::Png)?; + crate::sys::cli_file_CL_TYPE_PNG + } + }; + Ok((cursor.into_inner(), image_type)) +} + +#[export_name = "pdf_render_to_image"] +pub unsafe extern "C" fn _pdf_render_to_image( + file_bytes: *const u8, + file_size: usize, + config: *const PdfImageFuzzyHashConfig, + rendered_image_out: *mut RenderedPdfImage, + err: *mut *mut crate::ffi_util::FFIError, +) -> bool { + if rendered_image_out.is_null() { + return crate::ffi_error!( + err = err, + crate::ffi_util::Error::NullParameter("rendered_image_out".to_string()) + ); + } + if file_bytes.is_null() { + return crate::ffi_error!( + err = err, + crate::ffi_util::Error::NullParameter("file_bytes".to_string()) + ); + } + + let buffer = std::slice::from_raw_parts(file_bytes, file_size); + let render_result = render_to_image(buffer, config.as_ref()); + + match render_result { + Ok((image_data, image_type)) => { + let len = image_data.len(); + let boxed = image_data.into_boxed_slice(); + let ptr = Box::into_raw(boxed) as *mut u8; + + *rendered_image_out = RenderedPdfImage { + image_data: ptr, + image_len: len, + image_type, + }; + true + } + Err(e) => { + *err = Box::into_raw(Box::new(e.into())); + false + } + } +} + +#[export_name = "pdf_rendered_image_free"] +pub unsafe extern "C" fn _pdf_rendered_image_free(image_data: *mut u8, image_len: usize) { + if image_data.is_null() { + return; + } + + let slice_ptr = std::ptr::slice_from_raw_parts_mut(image_data, image_len); + 
drop(Box::from_raw(slice_ptr)); +} diff --git a/libclamav_rust/src/scanners.rs b/libclamav_rust/src/scanners.rs index dc3fbffd73..41b6d966c6 100644 --- a/libclamav_rust/src/scanners.rs +++ b/libclamav_rust/src/scanners.rs @@ -38,7 +38,7 @@ use crate::{ onenote::OneNote, sys::{ cl_error_t, cl_error_t_CL_EFORMAT, cl_error_t_CL_ERROR, cl_error_t_CL_SUCCESS, cli_ctx, - cli_magic_scan_buff, + cli_file_CL_TYPE_ANY, cli_magic_scan_buff, }, util::{check_scan_limits, scan_archive_metadata}, }; @@ -74,7 +74,7 @@ pub unsafe fn magic_scan(ctx: *mut cli_ctx, buf: &[u8], name: Option) -> Err(_) => null_mut(), }; - let ret = unsafe { cli_magic_scan_buff(ptr as *const c_void, len, ctx, name_ptr, 0) }; + let ret = unsafe { cli_magic_scan_buff(ptr as *const c_void, len, ctx, cli_file_CL_TYPE_ANY, name_ptr, 0) }; if ret != cl_error_t_CL_SUCCESS { debug!("cli_magic_scan_buff returned error: {}", ret); } diff --git a/libclamav_rust/src/sys.rs b/libclamav_rust/src/sys.rs index 085dcf914c..ec3cb036c7 100644 --- a/libclamav_rust/src/sys.rs +++ b/libclamav_rust/src/sys.rs @@ -1265,11 +1265,12 @@ extern "C" { ) -> *mut ::std::os::raw::c_char; } extern "C" { - #[doc = " @brief Convenience wrapper for cli_magic_scan_nested_fmap_type().\n\n Creates an fmap and calls cli_magic_scan_nested_fmap_type() for you, with type CL_TYPE_ANY.\n\n @param buffer Pointer to the buffer to be scanned.\n @param length Size in bytes of the buffer being scanned.\n @param ctx Scanning context structure.\n @param name (optional) Original name of the file (to set fmap name metadata)\n @param attributes Layer attributes of the file being scanned (is it normalized, decrypted, etc)\n @return int CL_SUCCESS, or an error code."] + #[doc = " @brief Convenience wrapper for cli_magic_scan_nested_fmap_type().\n\n Creates an fmap and calls cli_magic_scan_nested_fmap_type() for you.\n\n @param buffer Pointer to the buffer to be scanned.\n @param length Size in bytes of the buffer being scanned.\n @param ctx Scanning context 
structure.\n @param type CL_TYPE of data to be scanned.\n @param name (optional) Original name of the file (to set fmap name metadata)\n @param attributes Layer attributes of the file being scanned (is it normalized, decrypted, etc)\n @return int CL_SUCCESS, or an error code."] pub fn cli_magic_scan_buff( buffer: *const ::std::os::raw::c_void, length: usize, ctx: *mut cli_ctx, + type_: cli_file, name: *const ::std::os::raw::c_char, attributes: u32, ) -> cl_error_t; diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index ed7437546f..0785a74d97 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -270,9 +270,16 @@ else() endif() endif() +if(HAVE_PDFIUM) + set(HAVE_PDFIUM_ENV 1) +else() + set(HAVE_PDFIUM_ENV 0) +endif() + set(ENVIRONMENT PYTHONTRACEMALLOC=1 VERSION=${PROJECT_VERSION}${VERSION_SUFFIX} SOURCE=${SOURCE} BUILD=${BUILD} TMP=${TMP} + HAVE_PDFIUM=${HAVE_PDFIUM_ENV} CK_FORK=no CK_DEFAULT_TIMEOUT=300 LD_LIBRARY_PATH=${LD_LIBRARY_PATH} diff --git a/unit_tests/clamscan/pdf_render_canvas_test.py b/unit_tests/clamscan/pdf_render_canvas_test.py new file mode 100644 index 0000000000..7ce4216d36 --- /dev/null +++ b/unit_tests/clamscan/pdf_render_canvas_test.py @@ -0,0 +1,143 @@ +# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + +""" +Run clamscan tests. 
+""" + +import sys +import os +import shutil +import unittest + +sys.path.append('../unit_tests') +import testcase + + +@unittest.skipUnless(os.getenv("HAVE_PDFIUM") == "1", "requires PDFium support") +class TC(testcase.TestCase): + @classmethod + def setUpClass(cls): + super(TC, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(TC, cls).tearDownClass() + + def setUp(self): + super(TC, self).setUp() + + def tearDown(self): + super(TC, self).tearDown() + + if (self.path_tmp / "TD").exists(): + shutil.rmtree(self.path_tmp / "TD") + + self.verify_valgrind_log() + + def test_pdf_render_canvas_valid(self): + self.step_name('Test valid PDF render canvas option') + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --scan-pdf-image-fuzzy-hash=no --pdf-render-canvas=1920x1080'.format( + valgrind=TC.valgrind, + valgrind_args=TC.valgrind_args, + clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + def test_pdf_render_canvas_invalid(self): + self.step_name('Test invalid PDF render canvas option') + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + + invalid_values = [ + '1920', + '1920x0', + 'x1080', + ] + + for invalid_value in invalid_values: + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --pdf-render-canvas={invalid_value}'.format( + valgrind=TC.valgrind, + valgrind_args=TC.valgrind_args, + clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + testfile=testfile, + invalid_value=invalid_value, + ) + output = self.execute_command(command) + + assert output.ec == 2 # error + self.verify_output( + output.err, 
+ expected=['--pdf-render-canvas must be in WIDTHxHEIGHT format, for example 1920x1080.'], + ) + + def test_pdf_render_format_valid(self): + self.step_name('Test valid PDF render format option') + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + + for image_format in ['png', 'jpeg']: + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --scan-pdf-image-fuzzy-hash=no --pdf-render-format={image_format}'.format( + valgrind=TC.valgrind, + valgrind_args=TC.valgrind_args, + clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + testfile=testfile, + image_format=image_format, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + def test_pdf_render_format_invalid(self): + self.step_name('Test invalid PDF render format option') + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfile} --pdf-render-format=gif'.format( + valgrind=TC.valgrind, + valgrind_args=TC.valgrind_args, + clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 2 # error + self.verify_output( + output.err, + expected=['--pdf-render-format must be either png or jpeg.'], + ) + + def test_pdf_render_jpeg_honors_pdf_fuzzy_hash_option(self): + self.step_name('Test JPEG PDF render honors PDF fuzzy hash option') + + tempdir = self.path_tmp / "TD" + if not os.path.isdir(tempdir): + os.makedirs(tempdir) + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile} --scan-pdf-image-fuzzy-hash=yes 
--scan-image-fuzzy-hash=no --pdf-render-format=jpeg'.format( + valgrind=TC.valgrind, + valgrind_args=TC.valgrind_args, + clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + tempdir=tempdir, + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + expected_strings = [ + '"FileName":"pdf-render-pdf-stats-test.pdf.jpeg"', + '"Normalized":true', + '"FileType":"CL_TYPE_JPEG"', + '"ImageFuzzyHash":{', + ] + self.verify_metadata_json(tempdir, expected_strings) diff --git a/win32/conf_examples/clamd.conf.sample b/win32/conf_examples/clamd.conf.sample index 7973921c36..d625ba6b50 100644 --- a/win32/conf_examples/clamd.conf.sample +++ b/win32/conf_examples/clamd.conf.sample @@ -487,6 +487,31 @@ TCPAddr localhost # Default: yes #ScanImageFuzzyHash no +# This option enables PDF rendering plus image fuzzy hash detection for PDF +# files. +# If you turn off this option, PDFs will still be scanned, but the PDFium-based +# render and fuzzy hash step will be skipped. +# Default: yes +#ScanPDFImageFuzzyHash no + +# Render the first PDF page for fuzzy hash calculation at the specified DPI. +# This option is mutually exclusive with PDFRenderCanvas. +# The value must be greater than 0. +# Default: disabled +#PDFRenderDPI 144 + +# Render the first PDF page for fuzzy hash calculation to fit within a canvas +# specified as WIDTHxHEIGHT pixels. +# This option is mutually exclusive with PDFRenderDPI. +# Example: 1920x1080 +# Default: 2000x2000 +#PDFRenderCanvas 1920x1080 + +# Render the first PDF page for fuzzy hash calculation as either PNG or JPEG. +# Supported values are 'png' and 'jpeg'. +# Default: png +#PDFRenderFormat png + ## ## Mail files