1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
|
- import { handleConversationRelevance } from '../../src/external/assertions/deepeval';
- import { matchesConversationRelevance } from '../../src/external/matchers/deepeval';
- import { ConversationRelevancyTemplate } from '../../src/external/matchers/conversationRelevancyTemplate';
- import { getDefaultProviders } from '../../src/providers/defaults';
- import type { AssertionParams, AtomicTestCase } from '../../src/types';
// Replace the default-provider lookup with a bare mock so each test can decide
// which providers it resolves to.
jest.mock('../../src/providers/defaults', () => {
  return { getDefaultProviders: jest.fn() };
});

// Keep the real matchers module, but make provider resolution a plain fallback:
// an explicitly supplied provider wins, otherwise the default one is returned.
jest.mock('../../src/matchers', () => {
  const actualMatchers = jest.requireActual('../../src/matchers');
  return {
    ...actualMatchers,
    getAndCheckProvider: jest
      .fn()
      .mockImplementation(
        async (_type, provider, defaultProvider) => provider || defaultProvider,
      ),
  };
});
/**
 * Prompt-construction checks for ConversationRelevancyTemplate.
 *
 * Each case only inspects the generated prompt text: it must carry the fixed
 * instruction/label wording and the pretty-printed (2-space) JSON of the input.
 */
describe('ConversationRelevancyTemplate', () => {
  describe('generateVerdicts', () => {
    it('should generate proper verdict prompt', () => {
      const conversation = [
        { role: 'user' as const, content: 'What is the weather?' },
        { role: 'assistant' as const, content: 'It is sunny today.' },
      ];

      const verdictPrompt = ConversationRelevancyTemplate.generateVerdicts(conversation);

      // The instruction sentence and the serialized conversation must both be embedded.
      const expectedInstruction =
        'generate a JSON object to indicate whether the LAST `assistant` message is relevant';
      expect(verdictPrompt).toContain(expectedInstruction);
      expect(verdictPrompt).toContain(JSON.stringify(conversation, null, 2));
    });
  });

  describe('generateReason', () => {
    it('should generate proper reason prompt', () => {
      const relevancyScore = 0.6;
      const reportedIrrelevancies = [
        'Response about weather was not related to math question',
        'Assistant talked about food when asked about programming',
      ];

      const reasonPrompt = ConversationRelevancyTemplate.generateReason(
        relevancyScore,
        reportedIrrelevancies,
      );

      // The score appears on the line after its label; the irrelevancies as indented JSON.
      expect(reasonPrompt).toContain(`Relevancy Score:\n${relevancyScore}`);
      expect(reasonPrompt).toContain(JSON.stringify(reportedIrrelevancies, null, 2));
    });
  });
});
/**
 * Verifies that matchesConversationRelevance sends the provider a prompt built
 * from ConversationRelevancyTemplate, with the input/output pairs rendered as
 * user/assistant role messages.
 */
describe('matchesConversationRelevance with template', () => {
  // Drop call history from earlier tests so `mock.calls[0]` refers to this test's call.
  beforeEach(() => {
    jest.clearAllMocks();
  });

  it('should use ConversationRelevancyTemplate for prompts', async () => {
    // Provider stub that always answers with a "relevant" verdict.
    const mockProvider = {
      id: () => 'mock-provider',
      callApi: jest.fn().mockResolvedValue({
        output: JSON.stringify({ verdict: 'yes' }),
        tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
      }),
    };

    // Every default provider role resolves to the same stub.
    jest.mocked(getDefaultProviders).mockResolvedValue({
      embeddingProvider: mockProvider,
      gradingJsonProvider: mockProvider,
      gradingProvider: mockProvider,
      llmRubricProvider: mockProvider,
      moderationProvider: mockProvider,
      suggestionsProvider: mockProvider,
      synthesizeProvider: mockProvider,
    });

    const messages = [
      { input: 'What is 2+2?', output: '4' },
      { input: 'What is the capital of France?', output: 'Paris' },
    ];

    await matchesConversationRelevance(messages, 0.5);

    // Fail with a readable message (rather than a TypeError on `calls[0][0]`)
    // if the provider was never invoked.
    expect(mockProvider.callApi).toHaveBeenCalled();
    const callArg = mockProvider.callApi.mock.calls[0][0];

    // Should contain the template structure
    expect(callArg).toContain(
      'generate a JSON object to indicate whether the LAST `assistant` message is relevant',
    );
    expect(callArg).toContain('"role": "user"');
    expect(callArg).toContain('"role": "assistant"');
  });
});
/**
 * Exercises handleConversationRelevance against a stubbed provider:
 *  - a multi-turn conversation with some "no" verdicts must surface the
 *    provider-generated reason and report token usage;
 *  - a test case without a `_conversation` var must still pass with score 1.
 */
describe('handleConversationRelevance with reason generation', () => {
  /** Builds a default-provider map in which every role resolves to `provider`. */
  const defaultsBackedBy = <T>(provider: T) => ({
    embeddingProvider: provider,
    gradingJsonProvider: provider,
    gradingProvider: provider,
    llmRubricProvider: provider,
    moderationProvider: provider,
    suggestionsProvider: provider,
    synthesizeProvider: provider,
  });

  // The mocked getDefaultProviders is module-level and shared across tests;
  // clear recorded calls so each test starts from a clean history.
  beforeEach(() => {
    jest.clearAllMocks();
  });

  it('should generate comprehensive reason when there are irrelevancies', async () => {
    // Mock provider for verdict generation
    let callCount = 0;
    const mockProvider = {
      id: () => 'mock-provider',
      callApi: jest.fn().mockImplementation(async (prompt: string) => {
        callCount++;
        // First few calls are verdicts, last one is reason generation
        if (prompt.includes('irrelevancies')) {
          // This is the reason generation call
          return {
            output: JSON.stringify({
              reason:
                'The score is 0.6 because 2 out of 5 responses were irrelevant to the conversation context.',
            }),
            tokenUsage: { total: 20, prompt: 10, completion: 10, cached: 0 },
          };
        }

        // Verdict calls - with 5 windows and windowSize 3, we evaluate positions 1-5
        // Make positions 3 and 5 irrelevant (2 out of 5)
        const isIrrelevant = callCount === 3 || callCount === 5;
        return {
          output: JSON.stringify({
            verdict: isIrrelevant ? 'no' : 'yes',
            ...(isIrrelevant && {
              reason: `Response ${callCount} was irrelevant`,
            }),
          }),
          tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
        };
      }),
    };

    jest.mocked(getDefaultProviders).mockResolvedValue(defaultsBackedBy(mockProvider));

    const conversation = [
      { input: 'Question 1', output: 'Answer 1' },
      { input: 'Question 2', output: 'Answer 2' },
      { input: 'Question 3', output: 'Answer 3' },
      { input: 'Question 4', output: 'Answer 4' },
      { input: 'Question 5', output: 'Answer 5' },
    ];

    const params: AssertionParams = {
      baseType: 'conversation-relevance' as const,
      assertion: {
        type: 'conversation-relevance',
        threshold: 0.8,
        config: { windowSize: 3 },
      },
      context: {
        vars: {},
        test: {} as AtomicTestCase,
        prompt: 'test',
        logProbs: undefined,
        provider: mockProvider,
        providerResponse: { output: 'test' },
      },
      output: 'test',
      outputString: 'test',
      providerResponse: { output: 'test' },
      test: {
        // The multi-turn history is handed to the assertion through the `_conversation` var.
        vars: { _conversation: conversation },
        options: { provider: mockProvider },
      } as AtomicTestCase,
      inverse: false,
    };

    const result = await handleConversationRelevance(params);

    // Should have generated a comprehensive reason
    expect(result.reason).toContain(
      'The score is 0.6 because 2 out of 5 responses were irrelevant',
    );
    expect(result.tokensUsed).toBeDefined();
    // Optional chaining instead of `!`: an undefined value still fails the matcher.
    expect(result.tokensUsed?.total).toBeGreaterThan(0);
  });

  it('should handle empty conversations gracefully', async () => {
    // Provider stub that always answers with a "relevant" verdict.
    const mockProvider = {
      id: () => 'mock-provider',
      callApi: jest.fn().mockResolvedValue({
        output: JSON.stringify({ verdict: 'yes' }),
        tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
      }),
    };

    jest.mocked(getDefaultProviders).mockResolvedValue(defaultsBackedBy(mockProvider));

    // NOTE(review): no `_conversation` var is supplied here, only a prompt/output
    // pair; presumably the handler evaluates that single exchange - confirm
    // against handleConversationRelevance.
    const params: AssertionParams = {
      baseType: 'conversation-relevance' as const,
      assertion: {
        type: 'conversation-relevance',
        threshold: 0.8,
      },
      context: {
        vars: {},
        test: {} as AtomicTestCase,
        prompt: 'Hello',
        logProbs: undefined,
        provider: mockProvider,
        providerResponse: { output: 'Hi there!' },
      },
      output: 'Hi there!',
      outputString: 'Hi there!',
      providerResponse: { output: 'Hi there!' },
      prompt: 'Hello',
      test: {
        vars: {},
        options: { provider: mockProvider },
      } as AtomicTestCase,
      inverse: false,
    };

    const result = await handleConversationRelevance(params);

    expect(result.pass).toBe(true);
    expect(result.score).toBe(1);
  });
});
|