From ef16b66f9d81b38c4aef1ed28eccb5f1d4292e75 Mon Sep 17 00:00:00 2001 From: Pablo Date: Wed, 3 Jun 2026 09:39:02 -0300 Subject: [PATCH] feat: add opt-in field filtering to scrape_batch + search_engine_batch Optional `fields` param returns only the requested top-level fields, to save tokens in agent pipelines. Backward compatible: when omitted, output is unchanged. scrape_batch also isolates per-URL errors (a failed URL resolves to {url, error} instead of a rejected settlement). --- field_filter.js | 23 +++++++ server.js | 72 ++++++++++++++------- test/field-filter.test.js | 133 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 206 insertions(+), 22 deletions(-) create mode 100644 field_filter.js create mode 100644 test/field-filter.test.js diff --git a/field_filter.js b/field_filter.js new file mode 100644 index 0000000..7fb9951 --- /dev/null +++ b/field_filter.js @@ -0,0 +1,23 @@ +'use strict'; /*jslint node:true es9:true*/ + +// Opt-in response shaping for the batch tools. Given a list of result objects +// and a list of field names, return only those top-level fields from each item +// so agent pipelines don't pay tokens for data they didn't ask for. Keys that +// would pollute the prototype are never copied, and non-object items collapse +// to {} so the output array stays uniform. +const PROTECTED_PROPS = new Set(['__proto__', 'constructor', 'prototype']); + +export function filter_fields(results, fields){ + if (!fields || fields.length===0) + return results; + if (!Array.isArray(results)) + return results; + const safe_fields = fields.filter(f=>!PROTECTED_PROPS.has(f)); + return results.map(item=>{ + if (item===null || typeof item!=='object') + return {}; + return Object.fromEntries(safe_fields + .filter(f=>Object.prototype.hasOwnProperty.call(item, f)) + .map(f=>[f, item[f]])); + }); +} diff --git a/server.js b/server.js index ff9e086..22f1a69 100644 --- a/server.js +++ b/server.js @@ -10,6 +10,7 @@ import {parse_google_search_response} from './search_utils.js'; import {createRequire} from 'node:module'; import {remark} from 'remark'; import strip from 'strip-markdown'; +import {filter_fields} from './field_filter.js'; const require = createRequire(import.meta.url); const package_json = require('./package.json'); const api_token = process.env.API_TOKEN; @@ -300,8 +301,13 @@ addTool({ .describe('2-letter country code for geo-targeted results ' +'(e.g., "us", "uk")'), })).min(1).max(5), + fields: z.array(z.enum(['link', 'title', 'description', + 'relevance_score', 'cursor'])) + .optional() + .describe('Filter response to only these fields. ' + +'Saves tokens in agent pipelines.'), }), - execute: tool_fn('search_engine_batch', async({queries}, ctx)=>{ + execute: tool_fn('search_engine_batch', async({queries, fields}, ctx)=>{ const search_promises = queries.map(({query, engine, cursor, geo_location})=>{ const normalized_engine = engine || 'google'; @@ -349,7 +355,20 @@ addTool({ }); const results = await Promise.all(search_promises); - return JSON.stringify(results, null, 2); + if (!fields) + return JSON.stringify(results, null, 2); + const filtered = results.map(item=>{ + if (item && item.result && Array.isArray(item.result.organic)) + return { + ...item, + result: { + ...item.result, + organic: filter_fields(item.result.organic, fields), + }, + }; + return item; + }); + return JSON.stringify(filtered, null, 2); }), }); @@ -365,29 +384,38 @@ addTool({ openWorldHint: true, }, parameters: z.object({ - urls: z.array(z.string().url()).min(1).max(5).describe('Array of URLs to scrape (max 5)') + urls: z.array(z.string().url()).min(1).max(5) + .describe('Array of URLs to scrape (max 5)'), + fields: z.array(z.string()) + .optional() + .describe('Optional: return only these fields from each result ' + +'(e.g. ["content"]).'), }), - execute: tool_fn('scrape_batch', async ({urls}, ctx)=>{ - const scrapePromises = urls.map(url => - base_request({ - url: 'https://api.brightdata.com/request', - method: 'POST', - data: { - url, - zone: unlocker_zone, - format: 'raw', - data_format: 'markdown', - }, - headers: api_headers(ctx.clientName, 'scrape_batch'), - responseType: 'text', - }).then(async response=>({ - url, - content: (await remark() + execute: tool_fn('scrape_batch', async ({urls, fields}, ctx)=>{ + const scrapePromises = urls.map(async url=>{ + try { + const response = await base_request({ + url: 'https://api.brightdata.com/request', + method: 'POST', + data: { + url, + zone: unlocker_zone, + format: 'raw', + data_format: 'markdown', + }, + headers: api_headers(ctx.clientName, 'scrape_batch'), + responseType: 'text', + }); + const content = (await remark() .use(strip, {keep: ['link', 'linkReference', 'code', 'inlineCode']}) - .process(response.data)).value, - })) - ); + .process(response.data)).value; + const result = {url, content}; + return fields ? filter_fields([result], fields)[0] : result; + } catch(e){ + return {url, error: e instanceof Error ? e.message : String(e)}; + } + }); const results = await Promise.allSettled(scrapePromises); return JSON.stringify(results, null, 2); diff --git a/test/field-filter.test.js b/test/field-filter.test.js new file mode 100644 index 0000000..a62a5ab --- /dev/null +++ b/test/field-filter.test.js @@ -0,0 +1,133 @@ +'use strict'; /*jslint node:true es9:true*/ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import {filter_fields} from '../field_filter.js'; + +// Empty and null inputs +test('empty array returns empty array', ()=>{ + assert.deepEqual(filter_fields([], ['title', 'url']), []); +}); + +test('null results returns null', ()=>{ + assert.equal(filter_fields(null, ['title']), null); +}); + +test('undefined results returns undefined', ()=>{ + assert.equal(filter_fields(undefined, ['title']), undefined); +}); + +// Empty field list returns the items untouched +test('empty fields array returns original items', ()=>{ + const items = [{a: 1, b: 2}, {c: 3}]; + assert.deepEqual(filter_fields(items, []), items); +}); + +test('null fields array returns original items', ()=>{ + const items = [{a: 1}]; + assert.deepEqual(filter_fields(items, null), items); +}); + +// Null/undefined items collapse to {} +test('null item in array returns empty object', ()=>{ + assert.deepEqual( + filter_fields([{title: 'a'}, null, {title: 'b'}], ['title']), + [{title: 'a'}, {}, {title: 'b'}]); +}); + +test('undefined item in array returns empty object', ()=>{ + assert.deepEqual( + filter_fields([{title: 'a'}, undefined, {title: 'b'}], ['title']), + [{title: 'a'}, {}, {title: 'b'}]); +}); + +test('null item with non-empty fields returns empty object', ()=>{ + assert.deepEqual(filter_fields([null], ['title']), [{}]); +}); + +// Field selection +test('select single field', ()=>{ + assert.deepEqual( + filter_fields([{title: 'Hello', url: 'http://x.com', desc: 'Desc'}], + ['title']), + [{title: 'Hello'}]); +}); + +test('select multiple fields', ()=>{ + assert.deepEqual( + filter_fields([{title: 'Hello', url: 'http://x.com', desc: 'Desc'}], + ['title', 'url']), + [{title: 'Hello', url: 'http://x.com'}]); +}); + +test('select fields that do not exist returns empty object', ()=>{ + assert.deepEqual(filter_fields([{title: 'Hello'}], ['url', 'desc']), + [{}]); +}); + +test('select fields from multiple items', ()=>{ + const items = [ + {title: 'A', url: 'http://a.com'}, + {title: 'B', url: 'http://b.com'}, + ]; + assert.deepEqual(filter_fields(items, ['title']), + [{title: 'A'}, {title: 'B'}]); +}); + +// Field ordering follows the requested order +test('fields are returned in specified order', ()=>{ + assert.deepEqual(filter_fields([{z: 1, a: 2, m: 3}], ['a', 'm', 'z']), + [{a: 2, m: 3, z: 1}]); +}); + +// Duplicate field names are deduplicated by the output object +test('duplicate fields in list are deduplicated', ()=>{ + assert.deepEqual(filter_fields([{title: 'Hello'}], ['title', 'title']), + [{title: 'Hello'}]); +}); + +// Non-object items +test('non-object item in array returns empty object', ()=>{ + assert.deepEqual(filter_fields([42, 'string', true], ['a']), + [{}, {}, {}]); +}); + +test('mixed object and non-object items', ()=>{ + assert.deepEqual( + filter_fields([{title: 'A'}, 42, {title: 'B'}], ['title']), + [{title: 'A'}, {}, {title: 'B'}]); +}); + +// Large field list +test('large field list is handled', ()=>{ + const fields = Array.from({length: 1000}, (_, i)=>`field${i}`); + const r = filter_fields([{field0: 0, field500: 500, field999: 999}], + fields); + assert.deepEqual(r, [{field0: 0, field500: 500, field999: 999}]); +}); + +// Special characters and numeric-looking field names +test('fields with special chars', ()=>{ + assert.deepEqual( + filter_fields([{'field-name': 1, 'field_name': 2, 'field.name': 3}], + ['field-name', 'field_name']), + [{'field-name': 1, 'field_name': 2}]); +}); + +test('numeric-looking field names', ()=>{ + assert.deepEqual(filter_fields([{'123': 'num', '0': 'zero'}], ['123', '0']), + [{'123': 'num', '0': 'zero'}]); +}); + +// Nested objects are kept as values (only top-level keys are selected) +test('nested objects are preserved as values', ()=>{ + assert.deepEqual( + filter_fields([{title: 'A', meta: {k: 'v'}}], ['title', 'meta']), + [{title: 'A', meta: {k: 'v'}}]); +}); + +// Prototype-pollution guard: protected keys are never copied even if requested +test('protected prototype keys are never copied', ()=>{ + assert.deepEqual( + filter_fields([{a: 1}], ['__proto__', 'constructor', 'prototype', 'a']), + [{a: 1}]); +});