# Spicepod manifest for the "demo" app.
# Declares one dataset, three SQL views, one eval and two models.
version: v1beta1
kind: Spicepod
name: demo

# Other Spicepods this one depends on.
# NOTE(review): presumably supplies eval-related components; what it
# actually provides is not visible from this file - confirm.
dependencies:
  - Jeadie/evals

runtime:
  task_history:
    # The user_queries view below reads captured_output from
    # runtime.task_history, so this setting affects what that view sees.
    # NOTE(review): presumably "truncated" stores a shortened copy of each
    # task's output - confirm against the runtime documentation.
    captured_output: truncated

datasets:
  # Parquet files under the given S3 prefix, exposed as taxi_trips.
  - from: s3://spiceai-demo-datasets/taxi_trips/2024/
    name: taxi_trips
    params:
      file_format: parquet

views:
  # One row per 'ai_completion' task in runtime.task_history:
  #   input - the 'messages' JSON taken from the task input
  #   ideal - the 'content' string of the first captured_output element
  # This view is the dataset used by the mimic-user-queries eval below.
  - name: user_queries
    sql: |
      SELECT
        json_get_json(input, 'messages') AS input,
        json_get_str((captured_output -> 0), 'content') as ideal
      FROM runtime.task_history
      WHERE task='ai_completion'

  # Most recent created_at per model in eval.runs.
  - name: latest_eval_runs
    sql: |
      SELECT model, MAX(created_at) as latest_run
      FROM eval.runs
      GROUP BY model

  # Per-model summary restricted to each model's latest run (joined via
  # latest_eval_runs): number of result rows, number of rows whose value
  # is exactly 1.0, and the mean value.
  - name: model_stats
    sql: |
      SELECT
        r.model,
        COUNT(*) as total_queries,
        SUM(CASE WHEN res.value = 1.0 THEN 1 ELSE 0 END) as correct_answers,
        AVG(res.value) as accuracy
      FROM eval.runs r
      JOIN latest_eval_runs lr ON r.model = lr.model AND r.created_at = lr.latest_run
      JOIN eval.results res ON res.run_id = r.id
      GROUP BY r.model

evals:
  # Runs against the user_queries view defined above, using the "match"
  # scorer.
  - name: mimic-user-queries
    description: |
      Evaluates how well a model can copy the exact answers already returned to a user. Useful for testing if a smaller/cheaper model is sufficient.
    dataset: user_queries
    scorers:
      - match

models:
  # Credentials are secret references, never literal values.
  - name: gpt-4o
    from: openai:gpt-4o
    params:
      openai_api_key: ${ secrets:SPICE_OPENAI_API_KEY }

  - name: llama3
    from: huggingface:huggingface.co/meta-llama/Llama-3.2-3B-Instruct
    params:
      hf_token: ${ secrets:SPICE_HUGGINGFACE_API_KEY }