feat(server): workspace embedding improve (#12022)
fix AI-10 fix AI-109 fix PD-2484 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Added a method to check if a document requires embedding, improving embedding efficiency. - Enhanced document embeddings with enriched metadata, including title, summary, creation/update dates, and author information. - Introduced a new type for document fragments with extended metadata fields. - **Improvements** - Embedding logic now conditionally processes only documents needing updates. - Embedding content now includes document metadata for more informative context. - Expanded and improved test coverage for embedding scenarios and workspace behaviors. - Event emission added for workspace embedding updates on client version mismatch. - Job queueing enhanced with prioritization and explicit job IDs for better management. - Job queue calls updated to include priority and context identifiers in a structured format. - **Bug Fixes** - Improved handling of ignored documents in embedding matches. - Fixed incorrect document ID assignment in embedding job queueing. - **Tests** - Added and updated snapshot and behavioral tests for embedding and workspace document handling. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
262f1a47a4
commit
2a80fbb993
@ -4,19 +4,52 @@ The actual snapshot is saved in `copilot-context.spec.ts.snap`.
|
||||
|
||||
Generated by [AVA](https://avajs.dev).
|
||||
|
||||
## should get null for non-exist job
|
||||
|
||||
> should return null for non-exist job
|
||||
|
||||
null
|
||||
|
||||
## should insert embedding by doc id
|
||||
|
||||
> should match file embedding
|
||||
|
||||
[
|
||||
{
|
||||
chunk: 0,
|
||||
content: 'content',
|
||||
distance: 0,
|
||||
fileId: 'file-id',
|
||||
},
|
||||
]
|
||||
|
||||
> should return empty array when embedding is deleted
|
||||
|
||||
[]
|
||||
|
||||
> should match workspace embedding
|
||||
|
||||
[
|
||||
{
|
||||
docId: 'doc1',
|
||||
},
|
||||
]
|
||||
|
||||
> should return empty array when doc is ignored
|
||||
|
||||
[]
|
||||
|
||||
> should return workspace embedding
|
||||
|
||||
[
|
||||
{
|
||||
docId: 'doc1',
|
||||
},
|
||||
]
|
||||
|
||||
> should return empty array when embedding deleted
|
||||
|
||||
[]
|
||||
|
||||
## should check embedding table
|
||||
|
||||
> should return true when embedding table is available
|
||||
|
||||
true
|
||||
|
Binary file not shown.
@ -6,9 +6,11 @@ import ava, { TestFn } from 'ava';
|
||||
import { Config } from '../../base';
|
||||
import { CopilotContextModel } from '../../models/copilot-context';
|
||||
import { CopilotSessionModel } from '../../models/copilot-session';
|
||||
import { CopilotWorkspaceConfigModel } from '../../models/copilot-workspace';
|
||||
import { UserModel } from '../../models/user';
|
||||
import { WorkspaceModel } from '../../models/workspace';
|
||||
import { createTestingModule, type TestingModule } from '../utils';
|
||||
import { cleanObject } from '../utils/copilot';
|
||||
|
||||
interface Context {
|
||||
config: Config;
|
||||
@ -18,6 +20,7 @@ interface Context {
|
||||
workspace: WorkspaceModel;
|
||||
copilotSession: CopilotSessionModel;
|
||||
copilotContext: CopilotContextModel;
|
||||
copilotWorkspace: CopilotWorkspaceConfigModel;
|
||||
}
|
||||
|
||||
const test = ava as TestFn<Context>;
|
||||
@ -28,6 +31,7 @@ test.before(async t => {
|
||||
t.context.workspace = module.get(WorkspaceModel);
|
||||
t.context.copilotSession = module.get(CopilotSessionModel);
|
||||
t.context.copilotContext = module.get(CopilotContextModel);
|
||||
t.context.copilotWorkspace = module.get(CopilotWorkspaceConfigModel);
|
||||
t.context.db = module.get(PrismaClient);
|
||||
t.context.config = module.get(Config);
|
||||
t.context.module = module;
|
||||
@ -74,7 +78,7 @@ test('should create a copilot context', async t => {
|
||||
|
||||
test('should get null for non-exist job', async t => {
|
||||
const job = await t.context.copilotContext.get('non-exist');
|
||||
t.is(job, null);
|
||||
t.snapshot(job, 'should return null for non-exist job');
|
||||
});
|
||||
|
||||
test('should update context', async t => {
|
||||
@ -111,7 +115,10 @@ test('should insert embedding by doc id', async t => {
|
||||
1,
|
||||
1
|
||||
);
|
||||
t.snapshot(ret, 'should match file embedding');
|
||||
t.snapshot(
|
||||
cleanObject(ret, ['chunk', 'content', 'distance']),
|
||||
'should match file embedding'
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
@ -122,7 +129,7 @@ test('should insert embedding by doc id', async t => {
|
||||
1,
|
||||
1
|
||||
);
|
||||
t.is(ret.length, 0);
|
||||
t.snapshot(ret, 'should return empty array when embedding is deleted');
|
||||
}
|
||||
}
|
||||
|
||||
@ -155,7 +162,7 @@ test('should insert embedding by doc id', async t => {
|
||||
workspace.id,
|
||||
[docId]
|
||||
);
|
||||
t.true(ret.has(docId), 'should return true when embedding exists');
|
||||
t.true(ret.has(docId), 'should return doc id when embedding is inserted');
|
||||
}
|
||||
|
||||
{
|
||||
@ -165,8 +172,39 @@ test('should insert embedding by doc id', async t => {
|
||||
1,
|
||||
1
|
||||
);
|
||||
t.is(ret.length, 1);
|
||||
t.is(ret[0].content, 'content');
|
||||
t.snapshot(
|
||||
cleanObject(ret, ['chunk', 'content', 'distance']),
|
||||
'should match workspace embedding'
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
await t.context.copilotWorkspace.updateIgnoredDocs(workspace.id, [docId]);
|
||||
const ret = await t.context.copilotContext.matchWorkspaceEmbedding(
|
||||
Array.from({ length: 1024 }, () => 0.9),
|
||||
workspace.id,
|
||||
1,
|
||||
1
|
||||
);
|
||||
t.snapshot(ret, 'should return empty array when doc is ignored');
|
||||
}
|
||||
|
||||
{
|
||||
await t.context.copilotWorkspace.updateIgnoredDocs(
|
||||
workspace.id,
|
||||
undefined,
|
||||
[docId]
|
||||
);
|
||||
const ret = await t.context.copilotContext.matchWorkspaceEmbedding(
|
||||
Array.from({ length: 1024 }, () => 0.9),
|
||||
workspace.id,
|
||||
1,
|
||||
1
|
||||
);
|
||||
t.snapshot(
|
||||
cleanObject(ret, ['chunk', 'content', 'distance']),
|
||||
'should return workspace embedding'
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
@ -188,7 +226,7 @@ test('should insert embedding by doc id', async t => {
|
||||
test('should check embedding table', async t => {
|
||||
{
|
||||
const ret = await t.context.copilotContext.checkEmbeddingAvailable();
|
||||
t.true(ret, 'should return true when embedding table is available');
|
||||
t.snapshot(ret, 'should return true when embedding table is available');
|
||||
}
|
||||
|
||||
// {
|
||||
|
@ -201,6 +201,68 @@ test('should insert and search embedding', async t => {
|
||||
}
|
||||
});
|
||||
|
||||
test('should check need to be embedded', async t => {
|
||||
const docId = randomUUID();
|
||||
|
||||
await t.context.doc.upsert({
|
||||
spaceId: workspace.id,
|
||||
docId,
|
||||
blob: Uint8Array.from([1, 2, 3]),
|
||||
timestamp: Date.now(),
|
||||
editorId: user.id,
|
||||
});
|
||||
|
||||
{
|
||||
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
|
||||
workspace.id,
|
||||
docId
|
||||
);
|
||||
t.true(needsEmbedding, 'document with no embedding should need embedding');
|
||||
}
|
||||
|
||||
{
|
||||
await t.context.copilotContext.insertWorkspaceEmbedding(
|
||||
workspace.id,
|
||||
docId,
|
||||
[
|
||||
{
|
||||
index: 0,
|
||||
content: 'content',
|
||||
embedding: Array.from({ length: 1024 }, () => 1),
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
|
||||
workspace.id,
|
||||
docId
|
||||
);
|
||||
t.false(
|
||||
needsEmbedding,
|
||||
'document with recent embedding should not need embedding'
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
await t.context.doc.upsert({
|
||||
spaceId: workspace.id,
|
||||
docId,
|
||||
blob: Uint8Array.from([4, 5, 6]),
|
||||
timestamp: Date.now() + 1000, // Ensure timestamp is later
|
||||
editorId: user.id,
|
||||
});
|
||||
|
||||
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
|
||||
workspace.id,
|
||||
docId
|
||||
);
|
||||
t.true(
|
||||
needsEmbedding,
|
||||
'document updated after embedding should need embedding'
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test('should check embedding table', async t => {
|
||||
{
|
||||
const ret = await t.context.copilotWorkspace.checkEmbeddingAvailable();
|
||||
|
@ -14,6 +14,7 @@ import {
|
||||
CallMetric,
|
||||
DocNotFound,
|
||||
DocUpdateBlocked,
|
||||
EventBus,
|
||||
GatewayErrorWrapper,
|
||||
metrics,
|
||||
NotInSpace,
|
||||
@ -144,6 +145,7 @@ export class SpaceSyncGateway
|
||||
|
||||
constructor(
|
||||
private readonly ac: AccessController,
|
||||
private readonly event: EventBus,
|
||||
private readonly workspace: PgWorkspaceDocStorageAdapter,
|
||||
private readonly userspace: PgUserspaceDocStorageAdapter,
|
||||
private readonly docReader: DocReader,
|
||||
@ -201,6 +203,7 @@ export class SpaceSyncGateway
|
||||
await client.join(room);
|
||||
}
|
||||
} else {
|
||||
this.event.emit('workspace.embedding', { workspaceId: spaceId });
|
||||
await this.selectAdapter(client, spaceType).join(user.id, spaceId);
|
||||
}
|
||||
|
||||
|
@ -175,6 +175,55 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
|
||||
};
|
||||
}
|
||||
|
||||
@Transactional()
|
||||
async checkDocNeedEmbedded(workspaceId: string, docId: string) {
|
||||
// NOTE: check if the document needs re-embedding.
|
||||
// 1. check if there have been any recent updates to the document snapshot and update
|
||||
// 2. check if the embedding is older than the snapshot and update
|
||||
// 3. check if the embedding is older than 10 minutes (avoid frequent updates)
|
||||
// if all conditions are met, re-embedding is required.
|
||||
const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>`
|
||||
SELECT
|
||||
EXISTS (
|
||||
WITH docs AS (
|
||||
SELECT
|
||||
s.workspace_id,
|
||||
s.guid AS doc_id,
|
||||
s.updated_at
|
||||
FROM
|
||||
snapshots s
|
||||
WHERE
|
||||
s.workspace_id = ${workspaceId}
|
||||
AND s.guid = ${docId}
|
||||
UNION
|
||||
ALL
|
||||
SELECT
|
||||
u.workspace_id,
|
||||
u.guid AS doc_id,
|
||||
u.created_at AS updated_at
|
||||
FROM
|
||||
"updates" u
|
||||
WHERE
|
||||
u.workspace_id = ${workspaceId}
|
||||
AND u.guid = ${docId}
|
||||
)
|
||||
SELECT
|
||||
1
|
||||
FROM
|
||||
docs
|
||||
LEFT JOIN ai_workspace_embeddings e
|
||||
ON e.workspace_id = docs.workspace_id
|
||||
AND e.doc_id = docs.doc_id
|
||||
WHERE
|
||||
e.updated_at IS NULL
|
||||
OR docs.updated_at > e.updated_at
|
||||
OR e.updated_at < NOW() - INTERVAL '10 minutes'
|
||||
) AS needs_embedding;
|
||||
`;
|
||||
|
||||
return result[0]?.needs_embedding ?? false;
|
||||
}
|
||||
|
||||
// ================ embeddings ================
|
||||
|
||||
async checkEmbeddingAvailable(): Promise<boolean> {
|
||||
|
@ -16,6 +16,7 @@ import { Models } from '../../../models';
|
||||
import { CopilotStorage } from '../storage';
|
||||
import { readStream } from '../utils';
|
||||
import { OpenAIEmbeddingClient } from './embedding';
|
||||
import type { Chunk, DocFragment } from './types';
|
||||
import { EMBEDDING_DIMENSIONS, EmbeddingClient } from './types';
|
||||
|
||||
@Injectable()
|
||||
@ -78,16 +79,23 @@ export class CopilotContextDocJob {
|
||||
@OnEvent('workspace.doc.embedding')
|
||||
async addDocEmbeddingQueue(
|
||||
docs: Events['workspace.doc.embedding'],
|
||||
contextId?: string
|
||||
options?: { contextId: string; priority: number }
|
||||
) {
|
||||
if (!this.supportEmbedding) return;
|
||||
|
||||
for (const { workspaceId, docId } of docs) {
|
||||
await this.queue.add('copilot.embedding.docs', {
|
||||
contextId,
|
||||
await this.queue.add(
|
||||
'copilot.embedding.docs',
|
||||
{
|
||||
contextId: options?.contextId,
|
||||
workspaceId,
|
||||
docId,
|
||||
});
|
||||
},
|
||||
{
|
||||
jobId: `workspace:embedding:${workspaceId}:${docId}`,
|
||||
priority: options?.priority ?? 1,
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@ -110,14 +118,26 @@ export class CopilotContextDocJob {
|
||||
}: Events['workspace.embedding']) {
|
||||
if (!this.supportEmbedding || !this.embeddingClient) return;
|
||||
|
||||
if (enableDocEmbedding === undefined) {
|
||||
enableDocEmbedding =
|
||||
await this.models.workspace.allowEmbedding(workspaceId);
|
||||
}
|
||||
|
||||
if (enableDocEmbedding) {
|
||||
const toBeEmbedDocIds =
|
||||
await this.models.copilotWorkspace.findDocsToEmbed(workspaceId);
|
||||
for (const docId of toBeEmbedDocIds) {
|
||||
await this.queue.add('copilot.embedding.docs', {
|
||||
await this.queue.add(
|
||||
'copilot.embedding.docs',
|
||||
{
|
||||
workspaceId,
|
||||
docId,
|
||||
});
|
||||
},
|
||||
{
|
||||
jobId: `workspace:embedding:${workspaceId}:${docId}`,
|
||||
priority: 1,
|
||||
}
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const controller = this.workspaceJobAbortController.get(workspaceId);
|
||||
@ -132,14 +152,25 @@ export class CopilotContextDocJob {
|
||||
async addDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.updated']) {
|
||||
if (!this.supportEmbedding || !this.embeddingClient) return;
|
||||
|
||||
await this.queue.add('copilot.embedding.docs', {
|
||||
await this.queue.add(
|
||||
'copilot.embedding.docs',
|
||||
{
|
||||
workspaceId: doc.workspaceId,
|
||||
docId: doc.workspaceId,
|
||||
});
|
||||
docId: doc.docId,
|
||||
},
|
||||
{
|
||||
jobId: `workspace:embedding:${doc.workspaceId}:${doc.docId}`,
|
||||
priority: 2,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@OnEvent('doc.indexer.deleted')
|
||||
async deleteDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.deleted']) {
|
||||
await this.queue.remove(
|
||||
`workspace:embedding:${doc.workspaceId}:${doc.docId}`,
|
||||
'copilot.embedding.docs'
|
||||
);
|
||||
await this.models.copilotContext.deleteWorkspaceEmbedding(
|
||||
doc.workspaceId,
|
||||
doc.docId
|
||||
@ -221,6 +252,43 @@ export class CopilotContextDocJob {
|
||||
}
|
||||
}
|
||||
|
||||
private async getDocFragment(
|
||||
workspaceId: string,
|
||||
docId: string
|
||||
): Promise<DocFragment | null> {
|
||||
const docContent = await this.doc.getFullDocContent(workspaceId, docId);
|
||||
const authors = await this.models.doc.getAuthors(workspaceId, docId);
|
||||
if (docContent?.summary && authors) {
|
||||
const { title = 'Untitled', summary } = docContent;
|
||||
const { createdAt, updatedAt, createdByUser, updatedByUser } = authors;
|
||||
return {
|
||||
title,
|
||||
summary,
|
||||
createdAt: createdAt.toDateString(),
|
||||
updatedAt: updatedAt.toDateString(),
|
||||
createdBy: createdByUser?.name,
|
||||
updatedBy: updatedByUser?.name,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private formatDocChunks(chunks: Chunk[], fragment: DocFragment): Chunk[] {
|
||||
return chunks.map(chunk => ({
|
||||
index: chunk.index,
|
||||
content: [
|
||||
`Title: ${fragment.title}`,
|
||||
`Created at: ${fragment.createdAt}`,
|
||||
`Updated at: ${fragment.updatedAt}`,
|
||||
fragment.createdBy ? `Created by: ${fragment.createdBy}` : undefined,
|
||||
fragment.updatedBy ? `Updated by: ${fragment.updatedBy}` : undefined,
|
||||
chunk.content,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n'),
|
||||
}));
|
||||
}
|
||||
|
||||
private getWorkspaceSignal(workspaceId: string) {
|
||||
let controller = this.workspaceJobAbortController.get(workspaceId);
|
||||
if (!controller) {
|
||||
@ -241,14 +309,23 @@ export class CopilotContextDocJob {
|
||||
const signal = this.getWorkspaceSignal(workspaceId);
|
||||
|
||||
try {
|
||||
const content = await this.doc.getFullDocContent(workspaceId, docId);
|
||||
if (signal.aborted) {
|
||||
return;
|
||||
} else if (content) {
|
||||
const needEmbedding =
|
||||
await this.models.copilotWorkspace.checkDocNeedEmbedded(
|
||||
workspaceId,
|
||||
docId
|
||||
);
|
||||
if (needEmbedding) {
|
||||
if (signal.aborted) return;
|
||||
const fragment = await this.getDocFragment(workspaceId, docId);
|
||||
if (fragment) {
|
||||
// fast fall for empty doc, journal is easily to create a empty doc
|
||||
if (content.summary) {
|
||||
if (fragment.summary) {
|
||||
const embeddings = await this.embeddingClient.getFileEmbeddings(
|
||||
new File([content.summary], `${content.title || 'Untitled'}.md`),
|
||||
new File(
|
||||
[fragment.summary],
|
||||
`${fragment.title || 'Untitled'}.md`
|
||||
),
|
||||
chunks => this.formatDocChunks(chunks, fragment),
|
||||
signal
|
||||
);
|
||||
|
||||
@ -275,6 +352,7 @@ export class CopilotContextDocJob {
|
||||
} else if (contextId) {
|
||||
throw new DocNotFound({ spaceId: workspaceId, docId });
|
||||
}
|
||||
}
|
||||
} catch (error: any) {
|
||||
if (contextId) {
|
||||
this.event.emit('workspace.doc.embed.failed', {
|
||||
|
@ -498,7 +498,7 @@ export class CopilotContextResolver {
|
||||
workspaceId: session.workspaceId,
|
||||
docId,
|
||||
})),
|
||||
session.id
|
||||
{ contextId: session.id, priority: 0 }
|
||||
);
|
||||
}
|
||||
|
||||
@ -559,7 +559,7 @@ export class CopilotContextResolver {
|
||||
|
||||
await this.jobs.addDocEmbeddingQueue(
|
||||
[{ workspaceId: session.workspaceId, docId: options.docId }],
|
||||
session.id
|
||||
{ contextId: session.id, priority: 0 }
|
||||
);
|
||||
|
||||
return { ...record, status: record.status || null };
|
||||
|
@ -3,6 +3,7 @@ import { File } from 'node:buffer';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { CopilotContextFileNotSupported } from '../../../base';
|
||||
import type { PageDocContent } from '../../../core/utils/blocksuite';
|
||||
import { ChunkSimilarity, Embedding } from '../../../models';
|
||||
import { parseDoc } from '../../../native';
|
||||
|
||||
@ -10,7 +11,7 @@ declare global {
|
||||
interface Events {
|
||||
'workspace.embedding': {
|
||||
workspaceId: string;
|
||||
enableDocEmbedding: boolean;
|
||||
enableDocEmbedding?: boolean;
|
||||
};
|
||||
|
||||
'workspace.doc.embedding': Array<{
|
||||
@ -53,6 +54,13 @@ declare global {
|
||||
}
|
||||
}
|
||||
|
||||
export type DocFragment = PageDocContent & {
|
||||
createdAt: string;
|
||||
createdBy?: string;
|
||||
updatedAt: string;
|
||||
updatedBy?: string;
|
||||
};
|
||||
|
||||
export type Chunk = {
|
||||
index: number;
|
||||
content: string;
|
||||
@ -63,11 +71,12 @@ export const EMBEDDING_DIMENSIONS = 1024;
|
||||
export abstract class EmbeddingClient {
|
||||
async getFileEmbeddings(
|
||||
file: File,
|
||||
chunkMapper: (chunk: Chunk[]) => Chunk[],
|
||||
signal?: AbortSignal
|
||||
): Promise<Embedding[][]> {
|
||||
const chunks = await this.getFileChunks(file, signal);
|
||||
const chunkedEmbeddings = await Promise.all(
|
||||
chunks.map(chunk => this.generateEmbeddings(chunk))
|
||||
chunks.map(chunk => this.generateEmbeddings(chunkMapper(chunk)))
|
||||
);
|
||||
return chunkedEmbeddings;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user