// context_cache.js
// Deduplication layer for batch scraping
'use strict';

import crypto from 'node:crypto';

/**
 * Exact-match content cache used to drop duplicate pages within a batch.
 *
 * Each payload is fingerprinted with SHA-256 over the WHOLE content. Hashing
 * only sampled windows (prefix / middle / suffix) reports two different pages
 * as duplicates whenever they happen to agree inside those windows, and
 * callers throw away the content of anything reported as a duplicate.
 */
export class ContextCache {
    /**
     * @param {object} [options]
     * @param {number} [options.prefix_len=2048] Accepted and stored for
     *   backward compatibility only; it does not influence the fingerprint.
     */
    constructor(options = {}) {
        this._seen = new Map();
        this._prefix_len = options.prefix_len ?? 2048;
        this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
    }

    /**
     * Check whether `content` was already seen, registering it if not.
     * @param {string|Buffer|ArrayBufferView} content payload to fingerprint
     * @param {string} url source of the payload; reported back as
     *   `duplicateOf` when a later payload matches this one
     * @returns {{ isDuplicate: boolean, contentHash: string, duplicateOf?: string }}
     * @throws {TypeError} when `content` is neither a string nor a binary view
     */
    check(content, url) {
        if (typeof content !== 'string' && !ArrayBuffer.isView(content)) {
            throw new TypeError(
                'ContextCache.check: content must be a string or a Buffer');
        }
        const hash = crypto.createHash('sha256').update(content).digest('hex');

        if (this._seen.has(hash)) {
            this._stats.hits++;
            // Real byte size: `length` would count UTF-16 code units.
            this._stats.bytes_saved += Buffer.byteLength(content, 'utf8');
            return {
                isDuplicate: true,
                contentHash: hash,
                duplicateOf: this._seen.get(hash),
            };
        }

        this._seen.set(hash, url);
        this._stats.misses++;
        return { isDuplicate: false, contentHash: hash };
    }

    /**
     * Return deduplication stats.
     * @returns {{ unique_blocks: number, duplicate_blocks: number,
     *   bytes_saved: number, dedup_ratio: string }} `dedup_ratio` is the share
     *   of checked payloads that were duplicates, formatted with 3 decimals.
     */
    stats() {
        const { hits, misses, bytes_saved } = this._stats;
        const total = hits + misses;
        return {
            unique_blocks: misses,
            duplicate_blocks: hits,
            bytes_saved,
            dedup_ratio: total > 0 ? (hits / total).toFixed(3) : '0.000',
        };
    }

    /**
     * Clear the cache and reset the counters. Useful for long-running
     * processes.
     */
    clear() {
        this._seen.clear();
        this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
    }
}

// Names that must never be copied from caller-supplied field lists.
const PROTECTED_PROPS = new Set(['__proto__', 'constructor', 'prototype']);

/**
 * Project every item of `results` onto the requested top-level fields.
 *
 * Only OWN properties are copied, so a field such as `toString` cannot pull
 * inherited members into the output. Items that are not objects (including
 * null / undefined) become `{}`.
 *
 * @param {Array} results items to filter; returned unchanged if not an array
 * @param {string[]} fields allow-list of keys; a missing, empty or non-array
 *   list returns `results` unchanged
 * @returns {Array}
 */
export function filterFields(results, fields) {
    if (!Array.isArray(fields) || fields.length === 0) return results;
    if (!Array.isArray(results)) return results;

    const safe_fields = [...new Set(fields)]
        .filter(f => !PROTECTED_PROPS.has(f));
    const has_own = Object.prototype.hasOwnProperty;

    return results.map(item => {
        if (item === null || typeof item !== 'object') return {};
        return Object.fromEntries(
            safe_fields
                .filter(f => has_own.call(item, f))
                .map(f => [f, item[f]])
        );
    });
}

/**
 * Build metrics summary for batch responses.
 * @param {ContextCache} cache cache whose `stats()` are reported
 * @param {object} [timings] caller-supplied timing values, passed through
 * @returns {{ version: string, dedup: object, timings: object, timestamp_utc: string }}
 */
export function buildBatchMetrics(cache, timings = {}) {
    return {
        version: '1.0.0',
        dedup: cache.stats(),
        timings,
        timestamp_utc: new Date().toISOString(),
    };
}
// scrape_batch: fetch up to 5 URLs concurrently through the unlocker zone,
// optionally strip the markdown, drop cross-URL duplicate content, and return
// the per-URL results as a JSON string.
addTool({
    name: 'scrape_batch',
    description: 'Scrape multiple webpages URLs with advanced options for '
        +'content extraction and get back the results in MarkDown language. '
        +'This tool can unlock any webpage even if it uses bot detection or '
        +'CAPTCHA.',
    annotations: {
        title: 'Batch Scrape',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({
        urls: z.array(z.string().url()).min(1).max(5)
            .describe('List of URLs to scrape (max 5)'),
        deduplicate: z.boolean().optional().default(true)
            .describe('Remove duplicate content blocks across URLs. '
                +'Deduplication: removes duplicate content blocks across URLs. Default: true.'),
        fields: z.array(z.string()).optional()
            .describe('Optional: return only these top-level fields from each result'),
        format: z.enum(['markdown', 'raw']).optional().default('markdown')
            .describe('Output format'),
        include_metrics: z.boolean().optional().default(false)
            .describe('Include deduplication metrics in response. Default: false (returns flat array).'),
    }),
    execute: tool_fn('scrape_batch', async (data, ctx) => {
        // NOTE(review): confirm tool_fn does not already enforce the rate
        // limit, otherwise every call is counted twice.
        check_rate_limit();
        const t0 = Date.now();
        const cache = data.deduplicate ? new ContextCache() : null;

        // Phase 1: fetch everything concurrently. Each task catches its own
        // failure and resolves to an error entry, so Promise.all never
        // rejects and one bad URL cannot fail the whole batch.
        const fetched = await Promise.all(data.urls.map(async url => {
            const t_url = Date.now();
            try {
                const response = await base_request({
                    url: 'https://api.brightdata.com/request',
                    method: 'POST',
                    headers: api_headers(ctx?.clientName, 'scrape_batch'),
                    data: {
                        zone: unlocker_zone,
                        url,
                        format: 'raw',
                        data_format: 'markdown',
                    },
                    // Keep the body as a string; hashing and remark both
                    // need text.
                    responseType: 'text',
                });

                // 'raw' returns the markdown as received; 'markdown' strips
                // formatting but keeps links and code.
                let content = response.data;
                if (data.format === 'markdown') {
                    content = (await remark().use(strip, {
                        keep: ['link', 'linkReference', 'code', 'inlineCode'],
                    }).process(content)).value;
                }
                return {
                    url,
                    status: 'success',
                    latency_ms: Date.now() - t_url,
                    content,
                };
            } catch (e) {
                return {
                    url,
                    status: 'error',
                    latency_ms: Date.now() - t_url,
                    error: `Scrape failed: ${e.response?.status ?? e.message}`,
                };
            }
        }));

        // Phase 2: deduplicate in INPUT order, after all fetches finished,
        // so the first URL listed always keeps its content regardless of
        // which request happened to return first.
        const output = fetched.map(entry => {
            if (entry.status !== 'success') return entry;

            const dedup = cache && typeof entry.content === 'string'
                ? cache.check(entry.content, entry.url)
                : undefined;
            const result = dedup?.isDuplicate
                ? {
                    ...entry,
                    content: null,
                    skipped: true,
                    duplicate_of: dedup.duplicateOf,
                    content_hash: dedup.contentHash,
                }
                : {...entry, content_hash: dedup?.contentHash ?? null};

            // `fields` selects top-level keys of the result itself. Error
            // entries are returned untouched above so failures stay visible.
            return data.fields
                ? filterFields([result], data.fields)[0]
                : result;
        });

        if (data.include_metrics) {
            return JSON.stringify({
                results: output,
                metrics: cache
                    ? buildBatchMetrics(cache, {total_ms: Date.now() - t0})
                    : null,
            }, null, 2);
        }
        return JSON.stringify(output, null, 2);
    }),
});
// test_context_cache.js — Tests for ContextCache dedup layer
// Run: node test/test_context_cache.js
import { ContextCache, filterFields, buildBatchMetrics } from '../context_cache.js';
import assert from 'node:assert/strict';

let passed = 0;
let failed = 0;

/**
 * Run one synchronous test case, print its outcome and update the counters.
 * @param {string} name label printed next to the result
 * @param {() => void} fn test body; a thrown error marks the case as failed
 */
function test(name, fn) {
    try {
        fn();
        console.log(`  ✅ ${name}`);
        passed++;
    } catch (e) {
        console.error(`  ❌ ${name}: ${e.message}`);
        failed++;
    }
}

console.log('\n🧪 ContextCache — deduplication tests\n');

test('New content is not duplicate', () => {
    const cache = new ContextCache();
    const result = cache.check('Hello world unique content', 'https://a.com');
    assert.equal(result.isDuplicate, false);
    assert.ok(result.contentHash);
});

test('Same content from different URL is flagged as duplicate', () => {
    const cache = new ContextCache();
    const content = 'Identical navigation header repeated across pages';
    cache.check(content, 'https://a.com');
    const r2 = cache.check(content, 'https://b.com');
    assert.equal(r2.isDuplicate, true);
    assert.equal(r2.duplicateOf, 'https://a.com');
});

test('Different content is NOT flagged as duplicate', () => {
    const cache = new ContextCache();
    cache.check('Content A about machine learning', 'https://a.com');
    const r2 = cache.check('Content B about web scraping', 'https://b.com');
    assert.equal(r2.isDuplicate, false);
});

test('Stats track hits and misses correctly', () => {
    const cache = new ContextCache();
    cache.check('Content A', 'https://a.com');
    cache.check('Content A', 'https://b.com'); // duplicate
    cache.check('Content B', 'https://c.com');
    const stats = cache.stats();
    assert.equal(stats.duplicate_blocks, 1);
    assert.equal(stats.unique_blocks, 2);
    assert.ok(stats.bytes_saved > 0);
});

test('filterFields returns only requested fields', () => {
    const results = [
        { link: 'https://a.com', title: 'A', description: 'Desc A', metadata: 'extra' },
        { link: 'https://b.com', title: 'B', description: 'Desc B', metadata: 'extra' },
    ];
    const filtered = filterFields(results, ['link', 'title']);
    assert.deepEqual(Object.keys(filtered[0]), ['link', 'title']);
    assert.equal('metadata' in filtered[0], false);
});

test('buildBatchMetrics returns valid structure', () => {
    const cache = new ContextCache();
    cache.check('test', 'https://a.com');
    const metrics = buildBatchMetrics(cache, { total_ms: 123 });
    assert.equal(metrics.version, '1.0.0');
    assert.ok(metrics.timestamp_utc);
    assert.ok(metrics.dedup);
});

test('Content differing after 2048 chars with same length is NOT flagged duplicate', () => {
    const cache = new ContextCache();
    const prefix = 'A'.repeat(2048);
    const content1 = prefix + 'UNIQUE_BODY_CONTENT_AAAA';
    const content2 = prefix + 'UNIQUE_BODY_CONTENT_BBBB';

    const r1 = cache.check(content1, 'https://a.com');
    const r2 = cache.check(content2, 'https://b.com');

    // Same length and same first 2048 chars; only the tail differs.
    assert.equal(r1.isDuplicate, false, 'First content should be UNIQUE');
    assert.equal(r2.isDuplicate, false, 'Second content should be UNIQUE (different body)');
    assert.notEqual(r1.contentHash, r2.contentHash, 'Hashes should differ');
});

test('Content with same prefix but different length is NOT flagged duplicate', () => {
    const cache = new ContextCache();
    const prefix = 'X'.repeat(2048);
    const content1 = prefix + 'A'; // 2049 chars total
    const content2 = prefix + 'AB'; // 2050 chars total

    const r1 = cache.check(content1, 'https://a.com');
    const r2 = cache.check(content2, 'https://b.com');

    assert.equal(r1.isDuplicate, false, 'First should be UNIQUE');
    assert.equal(r2.isDuplicate, false, 'Second should be UNIQUE (different length)');
});

test('Stats with complex dedup scenario', () => {
    const cache = new ContextCache();
    // 3 URLs, 2 with same content (duplicate), 1 unique
    const same = 'Shared content header and footer';
    cache.check(same, 'https://a.com');
    cache.check(same, 'https://b.com'); // duplicate
    cache.check('Different unique content', 'https://c.com');

    const stats = cache.stats();
    assert.equal(stats.unique_blocks, 2, 'Should have 2 unique blocks');
    assert.equal(stats.duplicate_blocks, 1, 'Should have 1 duplicate');
    assert.ok(stats.bytes_saved > 0, 'Should have saved bytes');
});

console.log(`\n📊 Results: ${passed} passed, ${failed} failed\n`);
if (failed > 0) process.exit(1);
// test_dedup_edge_cases.js
// Edge-case tests for the ContextCache deduplication used by scrape_batch
import { ContextCache, filterFields } from '../context_cache.js';
import assert from 'node:assert/strict';

let passed = 0;
let failed = 0;

/**
 * Run one synchronous test case, print its outcome and update the counters.
 * @param {string} name label printed next to the result
 * @param {() => void} fn test body; a thrown error marks the case as failed
 */
function test(name, fn) {
    try {
        fn();
        console.log(`  ✅ ${name}`);
        passed++;
    } catch (e) {
        console.error(`  ❌ ${name}: ${e.message}`);
        failed++;
    }
}

console.log('\n🧪 Dedup Edge Case Tests\n');

test('Empty string content', () => {
    const cache = new ContextCache();
    const r = cache.check('', 'https://a.com');
    // Empty string has length 0, should still get a hash
    assert.ok(r.contentHash, 'Should have a hash even for empty content');
    assert.equal(r.isDuplicate, false, 'First empty content is unique');

    // Second empty from different URL should be duplicate
    const r2 = cache.check('', 'https://b.com');
    assert.equal(r2.isDuplicate, true, 'Second empty content is duplicate');
});

test('Single character content', () => {
    const cache = new ContextCache();
    const r = cache.check('X', 'https://a.com');
    assert.equal(r.isDuplicate, false, 'Single char should be unique');
    assert.ok(r.contentHash);
});

test('Content exactly at prefix boundary (2048 chars)', () => {
    const cache = new ContextCache();
    const exactly2048 = 'B'.repeat(2048);
    const r = cache.check(exactly2048, 'https://a.com');
    assert.equal(r.isDuplicate, false, 'Exactly 2048 chars should be unique');
    assert.ok(r.contentHash);
});

test('Content at prefix+1 boundary', () => {
    const cache = new ContextCache();
    const content1 = 'C'.repeat(2049);
    const content2 = 'C'.repeat(2049) + 'X';

    const r1 = cache.check(content1, 'https://a.com');
    const r2 = cache.check(content2, 'https://b.com');

    // Same first 2048 chars; content2 ends with an extra 'X', so the two
    // payloads are different and must hash differently.
    assert.equal(r1.isDuplicate, false);
    assert.equal(r2.isDuplicate, false);
    assert.notEqual(r1.contentHash, r2.contentHash);
});

test('Very long content (50000 chars)', () => {
    const cache = new ContextCache();
    const long = 'D'.repeat(50000);
    const r = cache.check(long, 'https://a.com');
    assert.equal(r.isDuplicate, false);
    assert.ok(r.contentHash);
});

test('filterFields with non-existent field returns object without it', () => {
    const results = [{ link: 'https://a.com', title: 'A' }];
    const filtered = filterFields(results, ['link', 'nonexistent']);
    assert.deepEqual(filtered[0], { link: 'https://a.com' });
});

test('filterFields with empty results array', () => {
    const filtered = filterFields([], ['link']);
    assert.deepEqual(filtered, []);
});

test('filterFields with null item', () => {
    const results = [{ link: 'https://a.com' }, null, { link: 'https://b.com' }];
    const filtered = filterFields(results, ['link']);
    assert.deepEqual(filtered[1], {}); // null item returns empty object
});

console.log(`\n📊 Edge Case Results: ${passed} passed, ${failed} failed\n`);
if (failed > 0) process.exit(1);
// test_filter_fields.js
// Comprehensive filterFields edge case tests
import { filterFields } from '../context_cache.js';
import assert from 'node:assert/strict';

let passed = 0;
let failed = 0;

/**
 * Run one synchronous test case, print its outcome and update the counters.
 * @param {string} name label printed next to the result
 * @param {() => void} fn test body; a thrown error marks the case as failed
 */
function test(name, fn) {
    try {
        fn();
        console.log(`  ✅ ${name}`);
        passed++;
    } catch (e) {
        console.error(`  ❌ ${name}: ${e.message}`);
        failed++;
    }
}

console.log('\n🧪 filterFields Edge Case Tests\n');

// 1. Empty and null inputs
test('empty array returns empty array', () => {
    const r = filterFields([], ['title', 'url']);
    assert.deepEqual(r, []);
});

test('null results returns null', () => {
    const r = filterFields(null, ['title']);
    assert.equal(r, null);
});

test('undefined results returns undefined', () => {
    const r = filterFields(undefined, ['title']);
    assert.equal(r, undefined);
});

// 2. Empty field list
test('empty fields array returns original items', () => {
    const items = [{ a: 1, b: 2 }, { c: 3 }];
    const r = filterFields(items, []);
    assert.deepEqual(r, items);
});

test('null fields array returns original items', () => {
    const items = [{ a: 1 }];
    const r = filterFields(items, null);
    assert.deepEqual(r, items);
});

// 3. Null/undefined items in array
test('null item in array returns empty object', () => {
    const r = filterFields([{ title: 'a' }, null, { title: 'b' }], ['title']);
    assert.deepEqual(r, [{ title: 'a' }, {}, { title: 'b' }]);
});

test('undefined item in array returns empty object', () => {
    const r = filterFields([{ title: 'a' }, undefined, { title: 'b' }], ['title']);
    assert.deepEqual(r, [{ title: 'a' }, {}, { title: 'b' }]);
});

test('null item with non-empty fields returns empty object', () => {
    const r = filterFields([null], ['title']);
    assert.deepEqual(r, [{}]);
});

// 4. Field selection
test('select single field', () => {
    const r = filterFields([{ title: 'Hello', url: 'http://x.com', desc: 'Desc' }], ['title']);
    assert.deepEqual(r, [{ title: 'Hello' }]);
});

test('select multiple fields', () => {
    const r = filterFields([{ title: 'Hello', url: 'http://x.com', desc: 'Desc' }], ['title', 'url']);
    assert.deepEqual(r, [{ title: 'Hello', url: 'http://x.com' }]);
});

test('select fields that dont exist returns empty object for that item', () => {
    const r = filterFields([{ title: 'Hello' }], ['url', 'desc']);
    assert.deepEqual(r, [{}]);
});

test('select fields from multiple items', () => {
    const items = [
        { title: 'A', url: 'http://a.com' },
        { title: 'B', url: 'http://b.com' },
    ];
    const r = filterFields(items, ['title']);
    assert.deepEqual(r, [{ title: 'A' }, { title: 'B' }]);
});

// 5. Field ordering preserved
test('fields are returned in specified order', () => {
    const r = filterFields([{ z: 1, a: 2, m: 3 }], ['a', 'm', 'z']);
    assert.deepEqual(r, [{ a: 2, m: 3, z: 1 }]);
    // deepEqual ignores key order, so check the order explicitly.
    assert.deepEqual(Object.keys(r[0]), ['a', 'm', 'z']);
});

// 6. Duplicate fields in list (should dedupe)
test('duplicate fields in list are deduplicated', () => {
    const r = filterFields([{ title: 'Hello' }], ['title', 'title']);
    assert.deepEqual(r, [{ title: 'Hello' }]);
});

// 7. Non-object items
test('non-object item in array returns empty object', () => {
    const r = filterFields([42, 'string', true], ['a']);
    assert.deepEqual(r, [{}, {}, {}]);
});

test('mixed object and non-object items', () => {
    const r = filterFields([{ title: 'A' }, 42, { title: 'B' }], ['title']);
    assert.deepEqual(r, [{ title: 'A' }, {}, { title: 'B' }]);
});

// 8. Very large field list
test('large field list is handled', () => {
    const fields = Array.from({ length: 1000 }, (_, i) => `field${i}`);
    const item = { field0: 0, field500: 500, field999: 999 };
    const r = filterFields([item], fields);
    assert.equal(r.length, 1);
    assert.ok(r[0].field0 === 0);
    assert.ok(r[0].field500 === 500);
    assert.ok(r[0].field999 === 999);
});

// 9. Special characters in field names
test('fields with special chars', () => {
    const r = filterFields([{ 'field-name': 1, 'field_name': 2, 'field.name': 3 }], ['field-name', 'field_name']);
    assert.deepEqual(r, [{ 'field-name': 1, 'field_name': 2 }]);
});

test('numeric-looking field names', () => {
    const r = filterFields([{ '123': 'num', '0': 'zero' }], ['123', '0']);
    assert.deepEqual(r, [{ '123': 'num', '0': 'zero' }]);
});

// 10. Nested objects (should only get top-level)
test('nested objects are preserved as values', () => {
    const item = { title: 'A', meta: { k: 'v' } };
    const r = filterFields([item], ['title', 'meta']);
    assert.deepEqual(r, [{ title: 'A', meta: { k: 'v' } }]);
});

console.log(`\n📊 filterFields Results: ${passed} passed, ${failed} failed\n`);
if (failed > 0) process.exit(1);