Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions context_cache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// context_cache.js
// Deduplication layer for batch scraping
'use strict';

import crypto from 'node:crypto';

/**
 * Content-fingerprint cache used to detect duplicate documents in a batch.
 *
 * Content of at most `prefix_len` characters is fingerprinted with a SHA-256
 * of the whole string. Longer content is fingerprinted from three samples:
 * the first `prefix_len` characters, 256 characters starting at the midpoint,
 * and the last 256 characters. Two long documents that differ only outside
 * those sampled windows are therefore reported as duplicates.
 */
export class ContextCache {
  /**
   * @param {{ prefix_len?: number }} [options]
   *   `prefix_len` is the number of leading characters sampled, and also the
   *   length up to which content is hashed in full. Values that are not a
   *   non-negative integer fall back to the default of 2048.
   */
  constructor(options = {}) {
    const requested = options.prefix_len;
    this._seen = new Map();
    this._prefix_len =
      Number.isInteger(requested) && requested >= 0 ? requested : 2048;
    this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
  }

  /**
   * Check whether content with the same fingerprint has been seen before.
   * The first URL seen for a fingerprint is remembered and reported as
   * `duplicateOf` for later matches.
   * @param {string} content
   * @param {string} url
   * @returns {{ isDuplicate: boolean, contentHash: string, duplicateOf?: string }}
   */
  check(content, url) {
    const prefixLen = this._prefix_len;
    const hasher = crypto.createHash('sha256');

    if (content.length <= prefixLen) {
      // Short content: hash the full string.
      hasher.update(content);
    } else {
      // Long content: sample from start, middle, and end.
      const midIdx = Math.floor(content.length / 2);
      hasher.update(
        content.slice(0, prefixLen) +
          content.slice(midIdx, midIdx + 256) +
          content.slice(-256)
      );
    }
    const hash = hasher.digest('hex');

    if (this._seen.has(hash)) {
      this._stats.hits++;
      // Counted in string length (UTF-16 code units), not encoded bytes.
      this._stats.bytes_saved += content.length;
      return {
        isDuplicate: true,
        contentHash: hash,
        duplicateOf: this._seen.get(hash),
      };
    }

    this._seen.set(hash, url);
    this._stats.misses++;
    return { isDuplicate: false, contentHash: hash };
  }

  /**
   * Return deduplication stats.
   * @returns {{ unique_blocks: number, duplicate_blocks: number,
   *   bytes_saved: number, dedup_ratio: string }}
   *   `dedup_ratio` is hits / (hits + misses) formatted to three decimals.
   */
  stats() {
    const { hits, misses, bytes_saved } = this._stats;
    return {
      unique_blocks: misses,
      duplicate_blocks: hits,
      bytes_saved,
      // Guard on hits so an empty cache reports '0.000' instead of 'NaN'.
      dedup_ratio: hits > 0 ? (hits / (hits + misses)).toFixed(3) : '0.000',
    };
  }

  /**
   * Clear the cache. Useful for long-running processes.
   * Resets both the seen fingerprints and the counters.
   */
  clear() {
    this._seen.clear();
    this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
  }
}

// Property names that are never copied, even when explicitly requested.
const PROTECTED_PROPS = new Set(['__proto__', 'constructor', 'prototype']);

/**
 * Project each result object onto a list of allowed fields.
 *
 * Only the item's own properties are copied. Inherited members such as
 * `toString` are never picked up, even if their name is requested.
 *
 * @param {Array} results - Items to filter. A non-array value is returned
 *   unchanged.
 * @param {string[]} fields - Property names to keep. When missing or empty,
 *   `results` is returned unchanged (same reference).
 * @returns {Array} A new array with one plain object per input item.
 *   `null`, `undefined` and non-object items become `{}`.
 */
export function filterFields(results, fields) {
  if (!fields || fields.length === 0) return results;
  if (!Array.isArray(results)) return results;

  // Filter out dangerous properties
  const safeFields = fields.filter((f) => !PROTECTED_PROPS.has(f));
  // Borrowed from Object.prototype so it also works on null-prototype items
  // and on items that shadow `hasOwnProperty`.
  const hasOwn = Object.prototype.hasOwnProperty;

  return results.map((item) => {
    if (item == null || typeof item !== 'object') return {};
    return Object.fromEntries(
      safeFields
        .filter((f) => hasOwn.call(item, f))
        .map((f) => [f, item[f]])
    );
  });
}

/**
 * Assemble the metrics envelope attached to batch responses.
 *
 * @param {{ stats: Function }} cache - Object whose `stats()` result is
 *   embedded under `dedup`.
 * @param {object} [timings={}] - Caller-supplied timing values, embedded
 *   as-is.
 * @returns {{ version: string, dedup: *, timings: object, timestamp_utc: string }}
 *   `timestamp_utc` is the current time as an ISO-8601 string.
 */
export function buildBatchMetrics(cache, timings = {}) {
  const dedup = cache.stats();
  const timestamp_utc = new Date().toISOString();
  return { version: '1.0.0', dedup, timings, timestamp_utc };
}
153 changes: 117 additions & 36 deletions server.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {parse_google_search_response} from './search_utils.js';
import {createRequire} from 'node:module';
import {remark} from 'remark';
import strip from 'strip-markdown';
import { ContextCache, filterFields, buildBatchMetrics } from './context_cache.js';
const require = createRequire(import.meta.url);
const package_json = require('./package.json');
const api_token = process.env.API_TOKEN;
Expand Down Expand Up @@ -299,9 +300,12 @@ addTool({
.optional()
.describe('2-letter country code for geo-targeted results '
+'(e.g., "us", "uk")'),
fields: z.array(z.enum(['link', 'title', 'description', 'relevance_score', 'cursor']))
.optional()
.describe('Filter response to only these fields. Saves tokens in agent pipelines.'),
})).min(1).max(5),
}),
execute: tool_fn('search_engine_batch', async({queries}, ctx)=>{
execute: tool_fn('search_engine_batch', async({queries, fields}, ctx)=>{
const search_promises = queries.map(({query, engine, cursor,
geo_location})=>{
const normalized_engine = engine || 'google';
Expand Down Expand Up @@ -349,49 +353,126 @@ addTool({
});

const results = await Promise.all(search_promises);
return JSON.stringify(results, null, 2);

// Apply field filtering if requested
// For Google: filter within result.organic array
// For Bing/Yandex: result is just text, no fields to filter
let all_results = results;
if (fields && Array.isArray(all_results)) {
all_results = all_results.map(page_result => {
if (page_result.result && typeof page_result.result === 'object' && Array.isArray(page_result.result.organic)) {
return {
...page_result,
result: {
...page_result.result,
organic: filterFields(page_result.result.organic, fields),
},
};
}
return page_result;
});
}

return JSON.stringify(all_results, null, 2);
}),
});

addTool({
name: 'scrape_batch',
description: 'Scrape multiple webpages URLs with advanced options for '
name: 'scrape_batch',
description: 'Scrape multiple webpages URLs with advanced options for '
+'content extraction and get back the results in MarkDown language. '
+'This tool can unlock any webpage even if it uses bot detection or '
+'CAPTCHA.',
annotations: {
title: 'Scrape Batch',
readOnlyHint: true,
openWorldHint: true,
},
parameters: z.object({
urls: z.array(z.string().url()).min(1).max(5).describe('Array of URLs to scrape (max 5)')
}),
execute: tool_fn('scrape_batch', async ({urls}, ctx)=>{
const scrapePromises = urls.map(url =>
base_request({
url: 'https://api.brightdata.com/request',
method: 'POST',
data: {
url,
zone: unlocker_zone,
format: 'raw',
data_format: 'markdown',
},
headers: api_headers(ctx.clientName, 'scrape_batch'),
responseType: 'text',
}).then(async response=>({
url,
content: (await remark()
.use(strip, {keep: ['link', 'linkReference', 'code',
'inlineCode']})
.process(response.data)).value,
}))
);
annotations: {
title: 'Batch Scrape',
readOnlyHint: true,
openWorldHint: true,
},
parameters: z.object({
urls: z.array(z.string().url()).min(1).max(5)
.describe('List of URLs to scrape (max 5)'),
deduplicate: z.boolean().optional().default(true)
.describe('Remove duplicate content blocks across URLs. '
+'Deduplication: removes duplicate content blocks across URLs. Default: true.'),
fields: z.array(z.string()).optional()
.describe('Optional: return only these top-level fields from each result'),
format: z.enum(['markdown', 'raw']).optional().default('markdown')
.describe('Output format'),
include_metrics: z.boolean().optional().default(false)
.describe('Include deduplication metrics in response. Default: false (returns flat array).'),
}),
execute: tool_fn('scrape_batch', async (data, ctx) => {
check_rate_limit();
const cache = data.deduplicate ? new ContextCache() : null;
const t0 = Date.now();

const scrape_promises = data.urls.map(async (url) => {
const t_url = Date.now();
try {
const response = await base_request({
url: `https://api.brightdata.com/request`,
method: 'POST',
headers: api_headers(ctx?.clientName, 'scrape_batch'),
data: {
zone: unlocker_zone,
url,
format: 'raw',
data_format: 'markdown',
},
});

let content = response.data;
if (data.format === 'markdown') {
content = (await remark().use(strip, {
keep: ['link', 'linkReference', 'code', 'inlineCode'],
}).process(content)).value;
}

const dedup = cache?.check(content, url);
const result = {
url,
status: 'success',
latency_ms: Date.now() - t_url,
...(dedup?.isDuplicate
? {
content: null,
skipped: true,
duplicate_of: dedup.duplicateOf,
content_hash: dedup.contentHash,
}
: {
content: data.fields
? filterFields([{ content }], data.fields)[0]
: content,
content_hash: dedup?.contentHash ?? null,
}),
};
return result;
} catch (e) {
return {
url,
status: 'error',
latency_ms: Date.now() - t_url,
error: 'Scrape failed: ' + (e.response?.status ?? e.message),
};
}
});

const results = await Promise.allSettled(scrapePromises);
return JSON.stringify(results, null, 2);
}),
const results = await Promise.allSettled(scrape_promises);
const output = results.map(r =>
r.status === 'fulfilled' ? r.value : { status: 'error', error: r.status === 'rejected' ? 'Request failed: ' + String(r.reason?.message ?? r.reason ?? 'Unknown error') : r.value }
);

if (data.include_metrics) {
return JSON.stringify({
results: output,
metrics: cache
? buildBatchMetrics(cache, { total_ms: Date.now() - t0 })
: null,
}, null, 2);
}
return JSON.stringify(output, null, 2);
}),
});

addTool({
Expand Down
Loading
Loading