Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/tools/search/highlights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,8 @@ export function expandHighlights(
!result.highlights ||
result.highlights.length === 0
) {
return result; // No modification needed
const { content: _content, ...resultWithoutContent } = result;
return resultWithoutContent as typeof result;
}

// Create a shallow copy with expanded highlights
Expand Down
23 changes: 21 additions & 2 deletions src/tools/search/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ const chunker = {
},
};

/** Fallback cap (in characters) used when SEARCH_MAX_CONTENT_LENGTH is unset or invalid. */
const DEFAULT_MAX_CONTENT_LENGTH = 50000;

/**
 * Turns a raw SEARCH_MAX_CONTENT_LENGTH value into a usable character cap.
 *
 * The cap is later used as the end index of `String.prototype.slice(0, cap)`,
 * so it must be >= 1: a negative value would trim characters from the END of
 * the content instead of capping it, and a fraction below 1 would truncate
 * the content to an empty string.
 *
 * @param raw - The raw environment value (may be undefined or non-numeric).
 * @param fallback - Value returned when `raw` is missing or not a number >= 1.
 * @returns The cap floored to a whole number, or `fallback`. `Infinity` is
 *   accepted and effectively disables the cap.
 */
function parseMaxContentLength(
  raw: string | undefined,
  fallback: number
): number {
  // Number(undefined) and Number('abc') are NaN; Number('') and Number('  ') are 0.
  const parsed = Number(raw);
  if (Number.isNaN(parsed) || parsed < 1) {
    return fallback;
  }
  // Math.floor(Infinity) stays Infinity, preserving the "no cap" setting.
  return Math.floor(parsed);
}

/** Maximum chars of scraped content passed to the chunker/reranker per source.
 * Overridable via SEARCH_MAX_CONTENT_LENGTH env var; values that are not a
 * number >= 1 fall back to DEFAULT_MAX_CONTENT_LENGTH. */
const MAX_CONTENT_LENGTH = parseMaxContentLength(
  process.env.SEARCH_MAX_CONTENT_LENGTH,
  DEFAULT_MAX_CONTENT_LENGTH
);

function createSourceUpdateCallback(sourceMap: Map<string, t.ValidSource>) {
return (link: string, update?: Partial<t.ValidSource>): void => {
const source = sourceMap.get(link);
Expand All @@ -82,12 +88,14 @@ const getHighlights = async ({
content,
reranker,
topResults = 5,
maxContentLength = MAX_CONTENT_LENGTH,
logger,
}: {
content: string;
query: string;
reranker?: BaseReranker;
topResults?: number;
maxContentLength?: number;
logger?: t.Logger;
}): Promise<t.Highlight[] | undefined> => {
const logger_ = logger || createDefaultLogger();
Expand All @@ -102,7 +110,11 @@ const getHighlights = async ({
}

try {
const documents = await chunker.splitText(content);
const cappedContent =
content.length > maxContentLength
? content.slice(0, maxContentLength)
: content;
const documents = await chunker.splitText(cappedContent);
if (Array.isArray(documents)) {
return await reranker.rerank(query, documents, topResults);
} else {
Expand Down Expand Up @@ -445,6 +457,7 @@ export const createSourceProcessor = (
}
const {
topResults = 5,
maxContentLength = MAX_CONTENT_LENGTH,
// strategies = ['no_extraction'],
// filterContent = true,
reranker,
Expand Down Expand Up @@ -479,11 +492,15 @@ export const createSourceProcessor = (
);
if (response.success && response.data) {
const [content, references] = scraper.extractContent(response);
const cleanedContent = chunker.cleanText(content);
return {
url,
references,
attribution,
content: chunker.cleanText(content),
content:
cleanedContent.length > maxContentLength
? cleanedContent.slice(0, maxContentLength)
: cleanedContent,
} as t.ScrapeResult;
} else {
logger_.error(
Expand Down Expand Up @@ -512,6 +529,7 @@ export const createSourceProcessor = (
query,
reranker,
content: result.content,
maxContentLength,
logger: logger_,
});
if (onGetHighlights) {
Expand Down Expand Up @@ -693,6 +711,7 @@ export const createSourceProcessor = (

if (news && topStories.length > 0) {
updateSourcesWithContent(topStories, sourceMap);
result.data.topStories = topStories.slice(0, numElements);
}

return result.data;
Expand Down
2 changes: 2 additions & 0 deletions src/tools/search/tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ export const createSearchTool = (
searxngApiKey,
rerankerType = 'cohere',
topResults = 5,
maxContentLength,
strategies = ['no_extraction'],
filterContent = true,
safeSearch = 1,
Expand Down Expand Up @@ -435,6 +436,7 @@ export const createSearchTool = (
{
reranker: selectedReranker,
topResults,
maxContentLength,
strategies,
filterContent,
logger,
Expand Down
2 changes: 2 additions & 0 deletions src/tools/search/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ export interface ScrapeResult {

export interface ProcessSourcesConfig {
topResults?: number;
/** Max chars of scraped content per source before chunking. Overridable via SEARCH_MAX_CONTENT_LENGTH. */
maxContentLength?: number;
strategies?: string[];
filterContent?: boolean;
reranker?: BaseReranker;
Expand Down