feat(server): add doc keyword search tool (#12837)

close AI-185

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Introduced a keyword-based document search tool, allowing users to
    search for relevant documents within their workspace using keywords.
  - Search results include document titles, summaries, and direct links,
    enhancing document discovery and navigation.
- **Bug Fixes**
  - None.
- **Tests**
  - Added new tests to verify document search by IDs and by keywords,
    ensuring accurate and reliable search functionality.
- **Documentation**
  - None.
- **Chores**
  - Updated configuration file organization for improved clarity; no
    changes to functionality.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->








#### PR Dependency Tree


* **PR #12867**
  * **PR #12863**
    * **PR #12837** 👈

This tree was auto-generated by
[Charcoal](https://github.com/danerwilliams/charcoal)
This commit is contained in:
fengmk2 2025-06-20 18:50:34 +08:00 committed by GitHub
parent 3a124b67bd
commit e978147a16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 148 additions and 26 deletions

View File

@ -864,22 +864,6 @@
}
}
},
"customerIo": {
"type": "object",
"description": "Configuration for customerIo module",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable customer.io integration\n@default false",
"default": false
},
"token": {
"type": "string",
"description": "Customer.io token\n@default \"\"",
"default": ""
}
}
},
"indexer": {
"type": "object",
"description": "Configuration for indexer module",
@ -921,6 +905,22 @@
}
}
},
"customerIo": {
"type": "object",
"description": "Configuration for customerIo module",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable customer.io integration\n@default false",
"default": false
},
"token": {
"type": "string",
"description": "Customer.io token\n@default \"\"",
"default": ""
}
}
},
"oauth": {
"type": "object",
"description": "Configuration for oauth module",

View File

@ -8,6 +8,7 @@ import { FeatureModule } from '../../core/features';
import { PermissionModule } from '../../core/permission';
import { QuotaModule } from '../../core/quota';
import { WorkspaceModule } from '../../core/workspaces';
import { IndexerModule } from '../indexer';
import {
CopilotContextResolver,
CopilotContextRootResolver,
@ -44,6 +45,7 @@ import {
PermissionModule,
ServerConfigModule,
WorkspaceModule,
IndexerModule,
],
providers: [
// providers

View File

@ -10,9 +10,12 @@ import {
OnEvent,
} from '../../../base';
import { AccessController } from '../../../core/permission';
import { IndexerService } from '../../indexer';
import { CopilotContextService } from '../context';
import {
buildDocKeywordSearchGetter,
buildDocSearchGetter,
createDocKeywordSearchTool,
createDocSemanticSearchTool,
createExaCrawlTool,
createExaSearchTool,
@ -125,6 +128,7 @@ export abstract class CopilotProvider<C = any> {
): Promise<ToolSet> {
const tools: ToolSet = {};
if (options?.tools?.length) {
this.logger.debug(`getTools: ${JSON.stringify(options.tools)}`);
for (const tool of options.tools) {
const toolDef = this.getProviderSpecificTools(tool, model);
if (toolDef) {
@ -143,6 +147,24 @@ export abstract class CopilotProvider<C = any> {
);
break;
}
case 'docKeywordSearch': {
if (this.AFFiNEConfig.indexer.enabled) {
const ac = this.moduleRef.get(AccessController, {
strict: false,
});
const indexerService = this.moduleRef.get(IndexerService, {
strict: false,
});
const searchDocs = buildDocKeywordSearchGetter(
ac,
indexerService
);
tools.doc_keyword_search = createDocKeywordSearchTool(
searchDocs.bind(null, options)
);
}
break;
}
case 'webSearch': {
tools.web_search_exa = createExaSearchTool(this.AFFiNEConfig);
tools.web_crawl_exa = createExaCrawlTool(this.AFFiNEConfig);

View File

@ -1,3 +1,4 @@
import { Logger } from '@nestjs/common';
import {
CoreAssistantMessage,
CoreUserMessage,
@ -10,6 +11,7 @@ import {
import { ZodType } from 'zod';
import {
createDocKeywordSearchTool,
createDocSemanticSearchTool,
createExaCrawlTool,
createExaSearchTool,
@ -381,6 +383,7 @@ export class CitationParser {
/**
 * Typed map of the custom Copilot tools, keyed by tool name.
 *
 * Each member's type is derived from the return type of the factory that
 * creates the tool, so the entry stays in sync with that factory's
 * parameter/result shape.
 */
export interface CustomAITools extends ToolSet {
// Created by `createDocSemanticSearchTool`.
doc_semantic_search: ReturnType<typeof createDocSemanticSearchTool>;
// Created by `createDocKeywordSearchTool`.
doc_keyword_search: ReturnType<typeof createDocKeywordSearchTool>;
// Created by `createExaSearchTool`.
web_search_exa: ReturnType<typeof createExaSearchTool>;
// Created by `createExaCrawlTool`.
web_crawl_exa: ReturnType<typeof createExaCrawlTool>;
}
@ -404,6 +407,7 @@ export function parseUnknownError(error: unknown) {
}
export class TextStreamParser {
private readonly logger = new Logger(TextStreamParser.name);
private readonly CALLOUT_PREFIX = '\n[!]\n';
private lastType: ChunkType | undefined;
@ -428,6 +432,9 @@ export class TextStreamParser {
break;
}
case 'tool-call': {
this.logger.debug(
`[tool-call] toolName: ${chunk.toolName}, toolCallId: ${chunk.toolCallId}`
);
result = this.addPrefix(result);
switch (chunk.toolName) {
case 'web_search_exa': {
@ -438,11 +445,18 @@ export class TextStreamParser {
result += `\nCrawling the web "${chunk.args.url}"\n`;
break;
}
case 'doc_keyword_search': {
result += `\nSearching the keyword "${chunk.args.query}"\n`;
break;
}
}
result = this.markAsCallout(result);
break;
}
case 'tool-result': {
this.logger.debug(
`[tool-result] toolName: ${chunk.toolName}, toolCallId: ${chunk.toolCallId}`
);
result = this.addPrefix(result);
switch (chunk.toolName) {
case 'doc_semantic_search': {
@ -451,6 +465,13 @@ export class TextStreamParser {
}
break;
}
case 'doc_keyword_search': {
if (Array.isArray(chunk.result)) {
result += `\nFound ${chunk.result.length} document${chunk.result.length !== 1 ? 's' : ''} related to “${chunk.args.query}”.\n`;
result += `\n${this.getKeywordSearchLinks(chunk.result)}\n`;
}
break;
}
case 'web_search_exa': {
if (Array.isArray(chunk.result)) {
result += `\n${this.getWebSearchLinks(chunk.result)}\n`;
@ -505,6 +526,18 @@ export class TextStreamParser {
}, '');
return links;
}
/**
 * Renders keyword-search hits as a run of markdown links.
 *
 * Every hit becomes `[title](docId)` wrapped in two newlines on each side;
 * the pieces are concatenated in list order. An empty list yields `''`.
 *
 * @param list - Search hits carrying the document id and its title.
 * @returns The concatenated markdown link block.
 */
private getKeywordSearchLinks(
  list: {
    docId: string;
    title: string;
  }[]
): string {
  return list
    .map(({ title, docId }) => `\n\n[${title}](${docId})\n\n`)
    .join('');
}
}
export class StreamObjectParser {

View File

@ -0,0 +1,64 @@
import { tool } from 'ai';
import { z } from 'zod';
import type { AccessController } from '../../../core/permission';
import type { IndexerService, SearchDoc } from '../../indexer';
import type { CopilotChatOptions } from '../providers';
/**
 * Builds the permission-aware keyword search function used by the
 * `doc_keyword_search` tool.
 *
 * The returned function resolves to `undefined` when the chat options, user,
 * workspace or a non-blank query is missing, or when the user lacks
 * `Workspace.Read` on the workspace. Otherwise it runs a keyword search via
 * the indexer and returns only the hits that pass the `Doc.Read` filter.
 *
 * @param ac - Access controller used for the workspace and per-doc checks.
 * @param indexerService - Indexer that performs the keyword search.
 * @returns An async `(options, query?)` search function.
 */
export const buildDocKeywordSearchGetter = (
  ac: AccessController,
  indexerService: IndexerService
) => {
  return async (options: CopilotChatOptions, query?: string) => {
    if (!options) {
      return undefined;
    }

    const { user, workspace } = options;
    if (!query?.trim() || !user || !workspace) {
      return undefined;
    }

    // A fresh permission chain is built for every check, as the checks are
    // independent of one another.
    const scoped = () => ac.user(user).workspace(workspace);

    const workspaceReadable = await scoped().can('Workspace.Read');
    if (!workspaceReadable) {
      return undefined;
    }

    // The query is forwarded as received; trimming is only used for the
    // blank check above.
    const hits = await indexerService.searchDocsByKeyword(workspace, query);

    // Keep only the docs the current user is allowed to read.
    const readableHits = await scoped().docs(hits, 'Doc.Read');
    return readableHits;
  };
};
/**
 * Wraps a keyword search function as an AI tool definition.
 *
 * The tool accepts a single `query` string. On execution it returns:
 * - an array of `{ docId, title, createdAt, updatedAt, createdByUser,
 *   updatedByUser }` objects when the search yields a result list,
 * - `undefined` when the search function resolves to a falsy value,
 * - the string `'Failed to search documents.'` when the search or the
 *   mapping throws.
 *
 * @param searchDocs - Search function already bound to the chat options.
 * @returns The tool definition produced by `tool(...)`.
 */
export const createDocKeywordSearchTool = (
  searchDocs: (query: string) => Promise<SearchDoc[] | undefined>
) => {
  // Expose only the fields the model needs, in a stable key order.
  const toToolResult = ({
    docId,
    title,
    createdAt,
    updatedAt,
    createdByUser,
    updatedByUser,
  }: SearchDoc) => ({
    docId,
    title,
    createdAt,
    updatedAt,
    createdByUser,
    updatedByUser,
  });

  return tool({
    description:
      'Full-text search for relevant documents in the current workspace',
    parameters: z.object({
      query: z.string().describe('The query to search for'),
    }),
    execute: async ({ query }) => {
      try {
        const found = await searchDocs(query);
        return found ? found.map(doc => toToolResult(doc)) : undefined;
      } catch {
        // Any failure is reported to the model as a plain message.
        return 'Failed to search documents.';
      }
    },
  });
};

View File

@ -1,2 +1,3 @@
export * from './doc-keyword-search';
export * from './doc-semantic-search';
export * from './web-search';

View File

@ -262,16 +262,6 @@
"desc": "The config for the storage provider."
}
},
"customerIo": {
"enabled": {
"type": "Boolean",
"desc": "Enable customer.io integration"
},
"token": {
"type": "String",
"desc": "Customer.io token"
}
},
"indexer": {
"enabled": {
"type": "Boolean",
@ -310,6 +300,16 @@
"desc": "Number of workspaces automatically indexed per batch"
}
},
"customerIo": {
"enabled": {
"type": "Boolean",
"desc": "Enable customer.io integration"
},
"token": {
"type": "String",
"desc": "Customer.io token"
}
},
"oauth": {
"providers.google": {
"type": "Object",