From c043858e7850384c18c70c78681e18f242010a9f Mon Sep 17 00:00:00 2001 From: Janpot <2109932+Janpot@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:34:30 +0100 Subject: [PATCH 01/16] [code-infra] Add optional HTML validation to broken links checker Add an `htmlValidate` option to the crawl config that validates HTML content of crawled pages using the html-validate library. The option accepts `true` (uses recommended rules), or a config object supporting `extends: ['mui:recommended']` for the default preset. Config is always static (never loaded from disk). Reports are printed per page. Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/code-infra/package.json | 3 +- .../__fixtures__/static-site/index.html | 1 + .../static-site/invalid-html.html | 15 ++++ .../src/brokenLinksChecker/index.mjs | 81 ++++++++++++++++++- .../src/brokenLinksChecker/index.test.ts | 30 ++++++- pnpm-lock.yaml | 61 ++++++++++++++ 6 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html diff --git a/packages/code-infra/package.json b/packages/code-infra/package.json index a3abb61b8..4ede00ac2 100644 --- a/packages/code-infra/package.json +++ b/packages/code-infra/package.json @@ -93,6 +93,7 @@ "clipboardy": "^5.3.1", "content-type": "^1.0.5", "env-ci": "^11.2.0", + "es-toolkit": "^1.45.1", "eslint-config-prettier": "^10.1.8", "eslint-import-resolver-typescript": "^4.4.4", "eslint-module-utils": "^2.12.1", @@ -104,11 +105,11 @@ "eslint-plugin-react-compiler": "^19.1.0-rc.2", "eslint-plugin-react-hooks": "^7.0.1", "eslint-plugin-testing-library": "^7.16.0", - "es-toolkit": "^1.45.1", "execa": "^9.6.1", "git-url-parse": "^16.1.0", "globals": "^17.4.0", "globby": "^16.1.1", + "html-validate": "^10.11.2", "minimatch": "^10.2.4", "node-html-parser": "^7.1.0", "open": "^11.0.0", diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html index c9ba541e5..c79573ba0 100644 --- a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html @@ -22,6 +22,7 @@

Test Site Home

  • Page with API Links
  • Example Markdown
  • Page with Unclosed Tags
  • +
  • Invalid HTML Page
  • diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html new file mode 100644 index 000000000..fd4f01f73 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html @@ -0,0 +1,15 @@ + + + + + Invalid HTML Page + + +

    Invalid HTML

    + +
    First
    +
    Second
    + +

    Tom & Jerry

    + + diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs index 64718cec2..c389f11a4 100644 --- a/packages/code-infra/src/brokenLinksChecker/index.mjs +++ b/packages/code-infra/src/brokenLinksChecker/index.mjs @@ -7,6 +7,7 @@ import * as path from 'node:path'; import chalk from 'chalk'; import { Transform } from 'node:stream'; import contentType from 'content-type'; +import { HtmlValidate, StaticConfigLoader, formatterFactory } from 'html-validate'; import { unified } from 'unified'; import remarkParse from 'remark-parse'; import remarkGfm from 'remark-gfm'; @@ -16,6 +17,36 @@ import rehypeStringify from 'rehype-stringify'; const DEFAULT_CONCURRENCY = 4; +/** @type {import('html-validate').ConfigData} */ +const MUI_RECOMMENDED_HTML_VALIDATE_CONFIG = { + extends: ['html-validate:recommended'], +}; + +/** + * Resolves the htmlValidate option into an html-validate config object or null. + * Supports `true` (use defaults), an object (use as config with `mui:recommended` preset support), or falsy (disabled). + * @param {boolean | import('html-validate').ConfigData | undefined} option + * @returns {import('html-validate').ConfigData | null} + */ +function resolveHtmlValidateConfig(option) { + if (!option) { + return null; + } + if (option === true) { + return MUI_RECOMMENDED_HTML_VALIDATE_CONFIG; + } + // Resolve mui:recommended in extends + if (Array.isArray(option.extends)) { + return { + ...option, + extends: option.extends.flatMap((ext) => + ext === 'mui:recommended' ? (MUI_RECOMMENDED_HTML_VALIDATE_CONFIG.extends ?? []) : [ext], + ), + }; + } + return option; +} + /** * Creates a Transform stream that prefixes each line with a given string. * Useful for distinguishing server logs from other output. @@ -402,6 +433,7 @@ function shouldIgnoreLink(link, ignores) { * @property {number} [concurrency] - Number of concurrent page fetches (defaults to 4) * @property {string[]} [seedUrls] - Starting URLs for the crawl (defaults to ['/']) * @property {IgnoreRule[]} [ignores] - Rules to ignore broken links. Each rule can have path, href, contentType, and/or has properties. All specified properties must match (AND logic). Within a property, multiple values use OR logic. + * @property {boolean | import('html-validate').ConfigData} [htmlValidate] - Enable HTML validation on crawled pages. `false` (default): disabled. `true`: validate with recommended rules. Object: use as html-validate config (supports `extends: ['mui:recommended']` to reference the default config). */ /** @@ -447,6 +479,7 @@ function resolveOptions(rawOptions) { concurrency: rawOptions.concurrency ?? DEFAULT_CONCURRENCY, seedUrls: rawOptions.seedUrls ?? ['/'], ignores: normalizedIgnores, + htmlValidate: rawOptions.htmlValidate ?? false, }; } @@ -518,6 +551,7 @@ async function resolveKnownTargets(options) { * @property {Set} links - All links discovered during the crawl * @property {Map} pages - All pages crawled, keyed by normalized URL * @property {Issue[]} issues - All broken links and broken targets found + * @property {Map} htmlValidateResults - HTML validation results per page URL (empty map if validation disabled) */ /** @@ -553,6 +587,25 @@ function reportIssues(issuesList) { } } +/** + * Reports HTML validation issues to stderr, grouped by page URL. + * @param {Map} validationResults - Validation results per page + */ +function reportHtmlValidation(validationResults) { + if (validationResults.size === 0) { + return; + } + + const formatResults = formatterFactory('stylish'); + + console.error('\nHTML validation issues:\n'); + + for (const [pageUrl, results] of validationResults.entries()) { + console.error(`Page ${chalk.cyan(pageUrl)}:`); + console.error(formatResults(results)); + } +} + /** * Crawls a website starting from seed URLs, discovering all internal links and checking for broken links/targets. * @param {CrawlOptions} rawOptions - Configuration options for the crawl @@ -562,6 +615,14 @@ export async function crawl(rawOptions) { const options = resolveOptions(rawOptions); const startTime = Date.now(); + const htmlValidateConfig = resolveHtmlValidateConfig(rawOptions.htmlValidate); + /** @type {HtmlValidate | null} */ + const htmlValidator = htmlValidateConfig + ? new HtmlValidate(new StaticConfigLoader(htmlValidateConfig)) + : null; + /** @type {Map} */ + const htmlValidateResults = new Map(); + /** @type {AbortController | null} */ let controller = null; if (options.startCommand) { @@ -653,6 +714,14 @@ export async function crawl(rawOptions) { } const rawContent = await res.text(); + + if (htmlValidator && type === 'text/html') { + const report = await htmlValidator.validateString(rawContent, pageUrl); + if (!report.valid) { + htmlValidateResults.set(pageUrl, report.results); + } + } + const content = type === 'text/markdown' ? await markdownToHtml(rawContent) : rawContent; const dom = parse(content, { parseNoneClosedTags: true }); @@ -772,6 +841,7 @@ export async function crawl(rawOptions) { } reportIssues(issues); + reportHtmlValidation(htmlValidateResults); // Derive counts from issues const brokenLinks = issues.filter((issue) => issue.type === 'broken-link').length; @@ -789,10 +859,19 @@ export async function crawl(rawOptions) { console.log(` Total broken links: ${chalk.cyan(brokenLinks)}`); console.log(` Total broken link targets: ${chalk.cyan(brokenLinkTargets)}`); console.log(` Total ignored: ${chalk.cyan(ignoredCount)}`); + if (htmlValidator) { + const totalHtmlIssues = [...htmlValidateResults.values()].reduce( + (sum, pageResults) => sum + pageResults.reduce((s, r) => s + r.messages.length, 0), + 0, + ); + console.log( + ` HTML validation issues: ${chalk.cyan(totalHtmlIssues)} across ${chalk.cyan(htmlValidateResults.size)} page(s)`, + ); + } if (options.outPath) { console.log(chalk.blue(`Output written to: ${options.outPath}`)); } - return { links: crawledLinks, pages: results, issues }; + return { links: crawledLinks, pages: results, issues, htmlValidateResults }; } diff --git a/packages/code-infra/src/brokenLinksChecker/index.test.ts b/packages/code-infra/src/brokenLinksChecker/index.test.ts index 025d30730..ef1049d11 100644 --- a/packages/code-infra/src/brokenLinksChecker/index.test.ts +++ b/packages/code-infra/src/brokenLinksChecker/index.test.ts @@ -56,9 +56,15 @@ describe('Broken Links Checker', () => { // Test href-only rule (matches from any page) - note: matches the actual href value { href: 'broken-relative.html' }, ], + htmlValidate: { + extends: ['mui:recommended'], + rules: { + 'no-raw-characters': 'off', + }, + }, }); - expect(result.links).toHaveLength(66); + expect(result.links).toHaveLength(67); // Issue count: original 11, minus ignored ones (broken-from-markdown via contentType, // broken-relative via href-only rule) expect(result.issues).toHaveLength(9); @@ -257,5 +263,27 @@ describe('Broken Links Checker', () => { // Test contentType is stored on pageData expect(result.pages.get('/example.md')?.contentType).toBe('text/markdown'); expect(result.pages.get('/')?.contentType).toBe('text/html'); + + // Test htmlValidate: invalid-html.html has duplicate IDs which should be reported + expect(result.htmlValidateResults.has('/invalid-html.html')).toBe(true); + const invalidHtmlMessages = result.htmlValidateResults + .get('/invalid-html.html') + ?.flatMap((r) => r.messages); + expect(invalidHtmlMessages).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + ruleId: 'no-dup-id', + }), + ]), + ); + + // Test htmlValidate override: no-raw-characters is off, so raw & should NOT be reported + expect(invalidHtmlMessages).not.toEqual( + expect.arrayContaining([ + expect.objectContaining({ + ruleId: 'no-raw-characters', + }), + ]), + ); }, 30000); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index efa7fcab9..9abd03914 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -680,6 +680,9 @@ importers: globby: specifier: ^16.1.1 version: 16.1.1 + html-validate: + specifier: ^10.11.2 + version: 10.11.2(jest-diff@30.2.0)(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jsdom@28.1.0)(vite@8.0.0(@types/node@22.19.0)(esbuild@0.27.1)(jiti@2.6.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.1))) minimatch: specifier: ^10.2.4 version: 10.2.4 @@ -2667,6 +2670,10 @@ packages: resolution: {integrity: sha512-bV8v7R/c0gNve8i7yPmZbcCTJUqRbCnMSvcegcMaz+ly+FoZf9i4+3MTjKsX+OZn9w0w1I6VJYQBcdM+yMWPQQ==} engines: {node: '>=0.10.0'} + '@html-validate/stylish@5.0.0': + resolution: {integrity: sha512-xjhRV9k1mWfgsOcpYlwsjUOFy3w3EnCDrqUrEw+DWdvOStMK59ts2H7GLKWZtmLI5m6Npp3qMSNReafQy9K2sA==} + engines: {node: ^20.11 || >= 22.16} + '@humanfs/core@0.19.1': resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==} engines: {node: '>=18.18.0'} @@ -5538,6 +5545,12 @@ packages: '@sec-ant/readable-stream@0.4.1': resolution: {integrity: sha512-831qok9r2t8AlxLko40y2ebgSDhenenCatLVeW/uBtnHPyhHOvG0C7TvfgecV+wHzIm5KUICgzmVpWS+IMEAeg==} + '@sidvind/better-ajv-errors@4.0.1': + resolution: {integrity: sha512-6arF1ssKxItxgitPYXafUoLmsVBA6K7m9+ZGj6hLDoBl7nWpJ33EInwQUdHTle2METeWGxgQiqSex20KZRykew==} + engines: {node: '>= 18'} + peerDependencies: + ajv: ^7.0.0 || ^8.0.0 + '@sigstore/bundle@4.0.0': resolution: {integrity: sha512-NwCl5Y0V6Di0NexvkTqdoVfmjTaQwoLM236r89KEojGmq/jMls8S+zb7yOwAPdXvbwfKDlP+lmXgAL4vKSQT+A==} engines: {node: ^20.17.0 || >=22.9.0} @@ -8862,6 +8875,25 @@ packages: resolution: {integrity: sha512-QY6S+hZ0f5m1WT8WffYN+Hg+xm/w5I8XeUcAq/ZYP5wVC8xbKi4Whhru3FtrAebD5EhBW8rmFzkDI6eCAuFe2w==} hasBin: true + html-validate@10.11.2: + resolution: {integrity: sha512-ZT4812sBvF77WrfTNWcaaml7xSOMDsGvw3plP4zrgcPw5RWESsRgprsVVYnaQD2lSNfD0WObZ0Ur22+xXj0TXA==} + engines: {node: ^20.19.0 || >= 22.16.0} + hasBin: true + peerDependencies: + jest: ^28.1.3 || ^29.0.3 || ^30.0.0 + jest-diff: ^28.1.3 || ^29.0.3 || ^30.0.0 + jest-snapshot: ^28.1.3 || ^29.0.3 || ^30.0.0 + vitest: ^1.0.0 || ^2.0.0 || ^3.0.0 || ^4.0.1 + peerDependenciesMeta: + jest: + optional: true + jest-diff: + optional: true + jest-snapshot: + optional: true + vitest: + optional: true + html-void-elements@3.0.0: resolution: {integrity: sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==} @@ -9473,6 +9505,10 @@ packages: resolution: {integrity: sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==} engines: {node: '>=6'} + kleur@4.1.5: + resolution: {integrity: sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==} + engines: {node: '>=6'} + ky@1.14.0: resolution: {integrity: sha512-Rczb6FMM6JT0lvrOlP5WUOCB7s9XKxzwgErzhKlKde1bEV90FXplV1o87fpt4PU/asJFiqjYJxAJyzJhcrxOsQ==} engines: {node: '>=18'} @@ -14883,6 +14919,10 @@ snapshots: dependencies: ip-address: 5.9.4 + '@html-validate/stylish@5.0.0': + dependencies: + kleur: 4.1.5 + '@humanfs/core@0.19.1': {} '@humanfs/node@0.16.7': @@ -17934,6 +17974,11 @@ snapshots: '@sec-ant/readable-stream@0.4.1': {} + '@sidvind/better-ajv-errors@4.0.1(ajv@8.18.0)': + dependencies: + ajv: 8.18.0 + kleur: 4.1.5 + '@sigstore/bundle@4.0.0': dependencies: '@sigstore/protobuf-specs': 0.5.0 @@ -22207,6 +22252,20 @@ snapshots: readable-stream: 1.0.34 through2: 0.4.2 + html-validate@10.11.2(jest-diff@30.2.0)(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jsdom@28.1.0)(vite@8.0.0(@types/node@22.19.0)(esbuild@0.27.1)(jiti@2.6.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.1))): + dependencies: + '@html-validate/stylish': 5.0.0 + '@sidvind/better-ajv-errors': 4.0.1(ajv@8.18.0) + ajv: 8.18.0 + glob: 13.0.6 + kleur: 4.1.5 + minimist: 1.2.8 + prompts: 2.4.2 + semver: 7.7.4 + optionalDependencies: + jest-diff: 30.2.0 + vitest: 4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jsdom@28.1.0)(vite@8.0.0(@types/node@22.19.0)(esbuild@0.27.1)(jiti@2.6.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.1)) + html-void-elements@3.0.0: {} http-cache-semantics@4.2.0: {} @@ -22797,6 +22856,8 @@ snapshots: kleur@3.0.3: {} + kleur@4.1.5: {} + ky@1.14.0: {} language-subtag-registry@0.3.23: {} From b28952f9bea388acae0d8595473409ea9b2a8846 Mon Sep 17 00:00:00 2001 From: Janpot <2109932+Janpot@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:56:00 +0100 Subject: [PATCH 02/16] [code-infra] Use staticResolver for mui:recommended html-validate preset Replace manual string replacement of mui:recommended in extends with html-validate's staticResolver API, which properly registers the preset so html-validate's own config resolution handles it. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/brokenLinksChecker/index.mjs | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs index c389f11a4..d0efac0f6 100644 --- a/packages/code-infra/src/brokenLinksChecker/index.mjs +++ b/packages/code-infra/src/brokenLinksChecker/index.mjs @@ -7,7 +7,7 @@ import * as path from 'node:path'; import chalk from 'chalk'; import { Transform } from 'node:stream'; import contentType from 'content-type'; -import { HtmlValidate, StaticConfigLoader, formatterFactory } from 'html-validate'; +import { HtmlValidate, StaticConfigLoader, staticResolver, formatterFactory } from 'html-validate'; import { unified } from 'unified'; import remarkParse from 'remark-parse'; import remarkGfm from 'remark-gfm'; @@ -17,14 +17,16 @@ import rehypeStringify from 'rehype-stringify'; const DEFAULT_CONCURRENCY = 4; -/** @type {import('html-validate').ConfigData} */ -const MUI_RECOMMENDED_HTML_VALIDATE_CONFIG = { - extends: ['html-validate:recommended'], -}; +const muiHtmlValidateResolver = staticResolver({ + configs: { + 'mui:recommended': { + extends: ['html-validate:recommended'], + }, + }, +}); /** * Resolves the htmlValidate option into an html-validate config object or null. - * Supports `true` (use defaults), an object (use as config with `mui:recommended` preset support), or falsy (disabled). * @param {boolean | import('html-validate').ConfigData | undefined} option * @returns {import('html-validate').ConfigData | null} */ @@ -33,16 +35,7 @@ function resolveHtmlValidateConfig(option) { return null; } if (option === true) { - return MUI_RECOMMENDED_HTML_VALIDATE_CONFIG; - } - // Resolve mui:recommended in extends - if (Array.isArray(option.extends)) { - return { - ...option, - extends: option.extends.flatMap((ext) => - ext === 'mui:recommended' ? (MUI_RECOMMENDED_HTML_VALIDATE_CONFIG.extends ?? []) : [ext], - ), - }; + return { extends: ['mui:recommended'] }; } return option; } @@ -618,7 +611,7 @@ export async function crawl(rawOptions) { const htmlValidateConfig = resolveHtmlValidateConfig(rawOptions.htmlValidate); /** @type {HtmlValidate | null} */ const htmlValidator = htmlValidateConfig - ? new HtmlValidate(new StaticConfigLoader(htmlValidateConfig)) + ? new HtmlValidate(new StaticConfigLoader([muiHtmlValidateResolver], htmlValidateConfig)) : null; /** @type {Map} */ const htmlValidateResults = new Map(); From 7dbe11ddd254491b83cae89c8e95c0775674ea33 Mon Sep 17 00:00:00 2001 From: Janpot <2109932+Janpot@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:26:41 +0100 Subject: [PATCH 03/16] try worker --- .../brokenLinksChecker/htmlValidateWorker.mjs | 23 ++++++++ .../src/brokenLinksChecker/index.mjs | 54 ++++++++++++------- 2 files changed, 57 insertions(+), 20 deletions(-) create mode 100644 packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs diff --git a/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs b/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs new file mode 100644 index 000000000..aa9df748b --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs @@ -0,0 +1,23 @@ +import { workerData, parentPort } from 'node:worker_threads'; +import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate'; + +const { htmlValidateConfig, rawContent, pageUrl } = workerData; + +const muiHtmlValidateResolver = staticResolver({ + configs: { + 'mui:recommended': { + extends: ['html-validate:recommended'], + }, + }, +}); + +const htmlValidator = new HtmlValidate( + new StaticConfigLoader([muiHtmlValidateResolver], htmlValidateConfig), +); + +const report = await htmlValidator.validateString(rawContent, pageUrl); + +/** @type {import('node:worker_threads').MessagePort} */ (parentPort).postMessage({ + pageUrl, + results: report.valid ? null : report.results, +}); diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs index d0efac0f6..9041d98ad 100644 --- a/packages/code-infra/src/brokenLinksChecker/index.mjs +++ b/packages/code-infra/src/brokenLinksChecker/index.mjs @@ -7,7 +7,8 @@ import * as path from 'node:path'; import chalk from 'chalk'; import { Transform } from 'node:stream'; import contentType from 'content-type'; -import { HtmlValidate, StaticConfigLoader, staticResolver, formatterFactory } from 'html-validate'; +import { Worker } from 'node:worker_threads'; +import { formatterFactory } from 'html-validate'; import { unified } from 'unified'; import remarkParse from 'remark-parse'; import remarkGfm from 'remark-gfm'; @@ -17,13 +18,24 @@ import rehypeStringify from 'rehype-stringify'; const DEFAULT_CONCURRENCY = 4; -const muiHtmlValidateResolver = staticResolver({ - configs: { - 'mui:recommended': { - extends: ['html-validate:recommended'], - }, - }, -}); +const htmlValidateWorkerUrl = new URL('./htmlValidateWorker.mjs', import.meta.url); + +/** + * Validates HTML content in a worker thread. + * @param {import('html-validate').ConfigData} htmlValidateConfig - html-validate config + * @param {string} rawContent - Raw HTML content to validate + * @param {string} pageUrl - URL of the page being validated + * @returns {Promise<{ pageUrl: string, results: import('html-validate').Result[] | null }>} + */ +function validateHtmlInWorker(htmlValidateConfig, rawContent, pageUrl) { + return new Promise((resolve, reject) => { + const worker = new Worker(htmlValidateWorkerUrl, { + workerData: { htmlValidateConfig, rawContent, pageUrl }, + }); + worker.on('message', (msg) => resolve(msg)); + worker.on('error', (err) => reject(err)); + }); +} /** * Resolves the htmlValidate option into an html-validate config object or null. @@ -609,12 +621,8 @@ export async function crawl(rawOptions) { const startTime = Date.now(); const htmlValidateConfig = resolveHtmlValidateConfig(rawOptions.htmlValidate); - /** @type {HtmlValidate | null} */ - const htmlValidator = htmlValidateConfig - ? new HtmlValidate(new StaticConfigLoader([muiHtmlValidateResolver], htmlValidateConfig)) - : null; - /** @type {Map} */ - const htmlValidateResults = new Map(); + /** @type {Promise<{ pageUrl: string, results: import('html-validate').Result[] | null }>[]} */ + const htmlValidatePromises = []; /** @type {AbortController | null} */ let controller = null; @@ -708,11 +716,8 @@ export async function crawl(rawOptions) { const rawContent = await res.text(); - if (htmlValidator && type === 'text/html') { - const report = await htmlValidator.validateString(rawContent, pageUrl); - if (!report.valid) { - htmlValidateResults.set(pageUrl, report.results); - } + if (htmlValidateConfig && type === 'text/html') { + htmlValidatePromises.push(validateHtmlInWorker(htmlValidateConfig, rawContent, pageUrl)); } const content = type === 'text/markdown' ? await markdownToHtml(rawContent) : rawContent; @@ -758,6 +763,15 @@ export async function crawl(rawOptions) { await queue.waitAll(); + /** @type {Map} */ + const htmlValidateResults = new Map(); + const resolvedValidations = await Promise.all(htmlValidatePromises); + for (const validation of resolvedValidations) { + if (validation.results) { + htmlValidateResults.set(validation.pageUrl, validation.results); + } + } + if (controller) { console.log(chalk.blue('Stopping server...')); controller.abort(); @@ -852,7 +866,7 @@ export async function crawl(rawOptions) { console.log(` Total broken links: ${chalk.cyan(brokenLinks)}`); console.log(` Total broken link targets: ${chalk.cyan(brokenLinkTargets)}`); console.log(` Total ignored: ${chalk.cyan(ignoredCount)}`); - if (htmlValidator) { + if (htmlValidateConfig) { const totalHtmlIssues = [...htmlValidateResults.values()].reduce( (sum, pageResults) => sum + pageResults.reduce((s, r) => s + r.messages.length, 0), 0, From b7e9cc6bafac69dab14e2117781b20fd8d546a56 Mon Sep 17 00:00:00 2001 From: Janpot <2109932+Janpot@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:29:45 +0100 Subject: [PATCH 04/16] [code-infra] Move per-URL crawl work into worker threads Move fetch, parse, link/target extraction, and HTML validation into a single crawlWorker per URL. The main thread now only handles queue management, deduplication, and post-crawl analysis. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/brokenLinksChecker/crawlWorker.mjs | 168 +++++++++++ .../brokenLinksChecker/htmlValidateWorker.mjs | 23 -- .../src/brokenLinksChecker/index.mjs | 285 ++++++------------ 3 files changed, 252 insertions(+), 224 deletions(-) create mode 100644 packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs delete mode 100644 packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs diff --git a/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs b/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs new file mode 100644 index 000000000..6bc991e8e --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs @@ -0,0 +1,168 @@ +import { workerData, parentPort } from 'node:worker_threads'; +import { parse } from 'node-html-parser'; +import contentType from 'content-type'; +import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate'; +import { unified } from 'unified'; +import remarkParse from 'remark-parse'; +import remarkGfm from 'remark-gfm'; +import remarkRehype from 'remark-rehype'; +import rehypeSlug from 'rehype-slug'; +import rehypeStringify from 'rehype-stringify'; + +/** @type {import('./index.mjs').CrawlWorkerInput} */ +const { pageUrl, options } = workerData; + +/** + * Posts the crawl result back to the parent thread. + * @param {import('./index.mjs').CrawlWorkerOutput} output + */ +function postResult(output) { + if (!parentPort) { + throw new Error('crawlWorker must be run as a worker thread'); + } + parentPort.postMessage(output); +} + +/** + * Computes the accessible name of an element according to ARIA rules. + * @param {import('node-html-parser').HTMLElement | null} elm + * @param {import('node-html-parser').HTMLElement} ownerDocument + * @returns {string} + */ +function getAccessibleName(elm, ownerDocument) { + if (!elm) { + return ''; + } + + const ariaLabel = elm.getAttribute('aria-label')?.trim(); + if (ariaLabel) { + return ariaLabel; + } + + const labelledby = elm.getAttribute('aria-labelledby'); + if (labelledby) { + const labels = []; + for (const id of labelledby.split(/\s+/)) { + const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument); + if (label) { + labels.push(label); + } + } + const label = labels.join(' ').trim(); + if (label) { + return label; + } + } + + if (elm.id) { + const label = ownerDocument.querySelector(`label[for="${elm.id}"]`); + if (label) { + return getAccessibleName(label, ownerDocument); + } + } + + if (elm.tagName === 'IMG') { + const alt = elm.getAttribute('alt')?.trim(); + if (alt) { + return alt; + } + } + + return elm.innerText.trim(); +} + +/** + * Converts markdown content to HTML using unified pipeline. + * @param {string} markdown + * @returns {Promise} + */ +async function markdownToHtml(markdown) { + const result = await unified() + .use(remarkParse) + .use(remarkGfm) + .use(remarkRehype) + .use(rehypeSlug) + .use(rehypeStringify) + .process(markdown); + return String(result); +} + +const res = await fetch(new URL(pageUrl, options.host)); + +const contentTypeHeader = res.headers.get('content-type'); +let type = 'text/html'; + +if (contentTypeHeader) { + try { + const parsed = contentType.parse(contentTypeHeader); + type = parsed.type; + } catch { + // invalid content-type, default to text/html + } +} + +/** @type {import('./index.mjs').CrawlWorkerPageData} */ +const pageData = { + url: pageUrl, + status: res.status, + targets: [], + contentType: type, +}; + +if (pageData.status < 200 || pageData.status >= 400) { + postResult({ pageData, links: [], htmlValidateResults: null }); +} else if (type.startsWith('image/') || (type !== 'text/html' && type !== 'text/markdown')) { + postResult({ pageData, links: [], htmlValidateResults: null }); +} else { + const rawContent = await res.text(); + + const content = type === 'text/markdown' ? await markdownToHtml(rawContent) : rawContent; + + const dom = parse(content, { parseNoneClosedTags: true }); + + // Extract targets + for (const target of dom.querySelectorAll('*[id]')) { + if (!options.ignoredTargets.has(target.id)) { + pageData.targets.push(`#${target.id}`); + } + } + + // Extract links + let ignoredSelector = ':not(*)'; + if (options.ignoredContent.length > 0) { + ignoredSelector = Array.from(options.ignoredContent) + .flatMap((selector) => [selector, `${selector} *`]) + .join(','); + } + const linksSelector = `a[href]:not(${ignoredSelector})`; + + const links = dom.querySelectorAll(linksSelector).map((a) => ({ + src: pageUrl, + text: getAccessibleName(a, dom), + href: a.getAttribute('href') ?? '', + contentType: type, + })); + + // HTML validation + let htmlValidateResults = null; + if (options.htmlValidate && type === 'text/html') { + const muiHtmlValidateResolver = staticResolver({ + configs: { + 'mui:recommended': { + extends: ['html-validate:standard', 'html-validate:document', 'html-validate:browser'], + }, + }, + }); + + const htmlValidator = new HtmlValidate( + new StaticConfigLoader([muiHtmlValidateResolver], options.htmlValidate), + ); + + const report = await htmlValidator.validateString(rawContent, pageUrl); + if (!report.valid) { + htmlValidateResults = { pageUrl, results: report.results }; + } + } + + postResult({ pageData, links, htmlValidateResults }); +} diff --git a/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs b/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs deleted file mode 100644 index aa9df748b..000000000 --- a/packages/code-infra/src/brokenLinksChecker/htmlValidateWorker.mjs +++ /dev/null @@ -1,23 +0,0 @@ -import { workerData, parentPort } from 'node:worker_threads'; -import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate'; - -const { htmlValidateConfig, rawContent, pageUrl } = workerData; - -const muiHtmlValidateResolver = staticResolver({ - configs: { - 'mui:recommended': { - extends: ['html-validate:recommended'], - }, - }, -}); - -const htmlValidator = new HtmlValidate( - new StaticConfigLoader([muiHtmlValidateResolver], htmlValidateConfig), -); - -const report = await htmlValidator.validateString(rawContent, pageUrl); - -/** @type {import('node:worker_threads').MessagePort} */ (parentPort).postMessage({ - pageUrl, - results: report.valid ? null : report.results, -}); diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs index 9041d98ad..983e40feb 100644 --- a/packages/code-infra/src/brokenLinksChecker/index.mjs +++ b/packages/code-infra/src/brokenLinksChecker/index.mjs @@ -1,56 +1,16 @@ /* eslint-disable no-console */ import { execaCommand } from 'execa'; import timers from 'node:timers/promises'; -import { parse } from 'node-html-parser'; import * as fs from 'node:fs/promises'; import * as path from 'node:path'; import chalk from 'chalk'; import { Transform } from 'node:stream'; -import contentType from 'content-type'; import { Worker } from 'node:worker_threads'; import { formatterFactory } from 'html-validate'; -import { unified } from 'unified'; -import remarkParse from 'remark-parse'; -import remarkGfm from 'remark-gfm'; -import remarkRehype from 'remark-rehype'; -import rehypeSlug from 'rehype-slug'; -import rehypeStringify from 'rehype-stringify'; const DEFAULT_CONCURRENCY = 4; -const htmlValidateWorkerUrl = new URL('./htmlValidateWorker.mjs', import.meta.url); - -/** - * Validates HTML content in a worker thread. - * @param {import('html-validate').ConfigData} htmlValidateConfig - html-validate config - * @param {string} rawContent - Raw HTML content to validate - * @param {string} pageUrl - URL of the page being validated - * @returns {Promise<{ pageUrl: string, results: import('html-validate').Result[] | null }>} - */ -function validateHtmlInWorker(htmlValidateConfig, rawContent, pageUrl) { - return new Promise((resolve, reject) => { - const worker = new Worker(htmlValidateWorkerUrl, { - workerData: { htmlValidateConfig, rawContent, pageUrl }, - }); - worker.on('message', (msg) => resolve(msg)); - worker.on('error', (err) => reject(err)); - }); -} - -/** - * Resolves the htmlValidate option into an html-validate config object or null. - * @param {boolean | import('html-validate').ConfigData | undefined} option - * @returns {import('html-validate').ConfigData | null} - */ -function resolveHtmlValidateConfig(option) { - if (!option) { - return null; - } - if (option === true) { - return { extends: ['mui:recommended'] }; - } - return option; -} +const crawlWorkerUrl = new URL('./crawlWorker.mjs', import.meta.url); /** * Creates a Transform stream that prefixes each line with a given string. @@ -141,6 +101,30 @@ function deserializeLinkStructure(data) { return linkStructure; } +/** + * Input data passed to the crawl worker via workerData. + * @typedef {Object} CrawlWorkerInput + * @property {string} pageUrl - The page URL to crawl + * @property {ResolvedCrawlOptions} options - Fully resolved crawl options + */ + +/** + * Serialized page data returned by the crawl worker (uses arrays instead of Sets for structured clone). + * @typedef {Object} CrawlWorkerPageData + * @property {string} url - The normalized page URL + * @property {number} status - HTTP status code + * @property {string[]} targets - Array of anchor targets (e.g., '#intro') + * @property {string} contentType - Content-type of the page + */ + +/** + * Output message posted by the crawl worker. + * @typedef {Object} CrawlWorkerOutput + * @property {CrawlWorkerPageData} pageData - Serialized page data + * @property {Link[]} links - Links discovered on the page + * @property {{ pageUrl: string, results: import('html-validate').Result[] } | null} htmlValidateResults - HTML validation results, or null if validation was skipped/passed + */ + /** * Data about a crawled page including its URL, HTTP status, and available link targets. * @typedef {Object} PageData @@ -167,77 +151,6 @@ async function writePagesToFile(pages, outPath) { await fs.writeFile(outPath, JSON.stringify(fileContent, null, 2), 'utf-8'); } -/** - * Computes the accessible name of an element according to ARIA rules. - * Polyfill for `node.computedName` available only in Chrome v112+. - * Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText. - * @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for - * @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element - * @returns {string} The computed accessible name, or empty string if none found - */ -function getAccessibleName(elm, ownerDocument) { - if (!elm) { - return ''; - } - - // 1. aria-label - const ariaLabel = elm.getAttribute('aria-label')?.trim(); - if (ariaLabel) { - return ariaLabel; - } - - // 2. aria-labelledby - const labelledby = elm.getAttribute('aria-labelledby'); - if (labelledby) { - const labels = []; - for (const id of labelledby.split(/\s+/)) { - const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument); - if (label) { - labels.push(label); - } - } - const label = labels.join(' ').trim(); - if (label) { - return label; - } - } - - // 3.