# Spicepod manifest header for the `spice-oss-docs` pod.
version: v1beta1
kind: Spicepod
name: spice-oss-docs
4
datasets:
  # Markdown files matching spiceaidocs/docs/**/*.md from the trunk branch of
  # the spiceai/docs repository, exposed as the `spiceai.docs` dataset.
  - from: github:github.com/spiceai/docs/files/trunk
    name: spiceai.docs
    description: Spice.ai OSS documentation and reference, from https://docs.spiceai.org
    metadata:
      # Citation rules for this dataset; the model's system prompt asks it to
      # apply each dataset's `instructions` and `reference_base_url` metadata.
      instructions: |
        Documents are stored in Markdown. Always provide citations.
        When generating reference links for docs, use https://docs.spiceai.org/<docs_path> as a template
        Exclude `spiceaidocs/docs` prefix from docs path and `.md` file extension.
        Also replace `/index.md` with `/` in the path.
      reference_base_url: https://docs.spiceai.org/<docs_path>
    params:
      file_format: md
      # GitHub credentials are referenced through secret placeholders rather
      # than written inline.
      github_client_id: ${secrets:GITHUB_CLIENT_ID}
      github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
      github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
      # Quoted: a plain scalar containing `*` globs is easy to mis-parse.
      include: "spiceaidocs/docs/**/*.md"
    acceleration:
      enabled: true
      # NOTE(review): presumably the source is re-checked every 4h with up to
      # 30m of random jitter; confirm against the Spice acceleration docs.
      refresh_check_interval: 4h
      refresh_jitter_enabled: true
      refresh_jitter_max: 30m
    columns:
      # The `content` column is embedded with the `openai_embeddings` model,
      # keyed by `path`, with chunking enabled.
      - name: content
        embeddings:
          - from: openai_embeddings
            row_id:
              - path
            chunking:
              enabled: true
              target_chunk_size: 512
              overlap_size: 128
              trim_whitespace: true
38
39 - from: github:github.com/spiceai/cookbook/files/trunk
40 name: spiceai.cookbook
41 description: Spice.ai OSS cookbook recipes
42 metadata:
43 instructions: Documents are stored in Markdown. Always provide citations.
44 reference_base_url: https://github.com/spiceai/cookbook/tree/trunk/<recipe_path>
45 params:
46 file_format: md
47 github_client_id: ${secrets:GITHUB_CLIENT_ID}
48 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
49 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
50 include: "**/*.md"
51 acceleration:
52 enabled: true
53 refresh_check_interval: 4h
54 refresh_jitter_enabled: true
55 refresh_jitter_max: 30m
56 columns:
57 - name: content
58 embeddings:
59 - from: openai_embeddings
60 row_id:
61 - path
62 chunking:
63 enabled: true
64 target_chunk_size: 512
65 overlap_size: 128
66 trim_whitespace: true
67
68 - from: github:github.com/spiceai/blog/files/trunk
69 name: spiceai.blog
70 description: Spice.ai OSS blog posts
71 metadata:
72 instructions: |
73 This dataset provides access to the Spice.ai OSS project blog posts in Markdown format. The content is sourced from the Spice.ai OSS blog repository at https://github.com/spiceai/blog.
74 reference_base_url: https://github.com/spiceai/blog/tree/trunk/content/posts/<post_path>
75 params:
76 file_format: md
77 github_client_id: ${secrets:GITHUB_CLIENT_ID}
78 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
79 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
80 include: "content/posts/**/*.md"
81 acceleration:
82 enabled: true
83 refresh_check_interval: 4h
84 refresh_jitter_enabled: true
85 refresh_jitter_max: 30m
86 columns:
87 - name: content
88 embeddings:
89 - from: openai_embeddings
90 row_id:
91 - path
92 chunking:
93 enabled: true
94 target_chunk_size: 512
95 overlap_size: 128
96 trim_whitespace: true
97
98 - from: github:github.com/spiceai/spiceai/issues
99 name: spiceai.issues
100 description: Spice.ai OSS issues from https://github.com/spiceai/spiceai/issues
101 params:
102 github_client_id: ${secrets:GITHUB_CLIENT_ID}
103 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
104 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
105 acceleration:
106 enabled: true
107 refresh_check_interval: 4h
108 refresh_jitter_enabled: true
109 refresh_jitter_max: 30m
110
# Embedding model definition. The name `openai_embeddings` is what the dataset
# `columns[].embeddings[].from` entries in this manifest refer to.
embeddings:
  - from: openai
    name: openai_embeddings
    params:
      # Written without inner spaces so it matches every other
      # `${secrets:...}` reference in this manifest.
      openai_api_key: ${secrets:OPENAI_API_KEY}
116
# Chat model definition (`openai:gpt-4o`) together with its system prompt.
models:
  - name: openai
    from: openai:gpt-4o
    params:
      # NOTE(review): `auto` presumably enables the runtime's default tool set
      # (the prompt below mentions `sql_query`, `list_datasets`, and document
      # search); confirm against the Spice model docs.
      tools: auto
      openai_api_key: ${secrets:OPENAI_API_KEY}
      # Literal block scalar: line breaks and blank lines below are part of
      # the prompt text sent to the model.
      system_prompt: |
        You are an AI assistant assisting engineers with the Spice.ai OSS Project.

        Always strive to be accurate, concise, and helpful in your responses.

        Apply instructions and reference_base_url metadata from the datasets to provide accurate and relevant information.

        Prefer "docs" dataset for documentation and reference information questions.

        Prefer "cookbook" recipe datasets for use cases, sample code, and configuration questions. Always include links to relevant recipes.

        Use the SQL tool (sql_query) when:
        1. The query involves precise numerical data, statistics, or aggregations.
        2. The user asks for specific counts, sums, averages, or other calculations.
        3. The query requires joining or comparing data from multiple related tables.

        If the SQL tool returns a query, syntax, or planning error, call the `list_datasets` tool to get the available tables and continue to refine and retry the query until it succeeds. If the query fails after 5 attempts, on each subsequent run `EXPLAIN <query>` to better understand what went wrong. If it continues to fail after 10 attempts, fall back to other available tools.

        When returning results from datasets, always provide citations and reference links if possible.

        Use the document search tool when:
        1. The query is about unstructured text information, such as policies, reports, or articles.
        2. The user is looking for qualitative information or explanations.
        3. The query requires understanding context or interpreting written content.

        General guidelines:
        1. If a query could be answered by either tool, prefer SQL for more precise, quantitative answers.
150
# NOTE(review): from this line to the end of the file, the manifest above is
# repeated verbatim (version/kind/name/datasets/embeddings/models). Duplicate
# top-level keys are invalid YAML and most parsers either reject them or keep
# only the last value. Confirm whether this second copy is a paste or
# extraction artifact and remove it if so.
# Spicepod manifest header for the `spice-oss-docs` pod.
version: v1beta1
kind: Spicepod
name: spice-oss-docs
4
datasets:
  # Markdown files matching spiceaidocs/docs/**/*.md from the trunk branch of
  # the spiceai/docs repository, exposed as the `spiceai.docs` dataset.
  - from: github:github.com/spiceai/docs/files/trunk
    name: spiceai.docs
    description: Spice.ai OSS documentation and reference, from https://docs.spiceai.org
    metadata:
      # Citation rules for this dataset; the model's system prompt asks it to
      # apply each dataset's `instructions` and `reference_base_url` metadata.
      instructions: |
        Documents are stored in Markdown. Always provide citations.
        When generating reference links for docs, use https://docs.spiceai.org/<docs_path> as a template
        Exclude `spiceaidocs/docs` prefix from docs path and `.md` file extension.
        Also replace `/index.md` with `/` in the path.
      reference_base_url: https://docs.spiceai.org/<docs_path>
    params:
      file_format: md
      # GitHub credentials are referenced through secret placeholders rather
      # than written inline.
      github_client_id: ${secrets:GITHUB_CLIENT_ID}
      github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
      github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
      # Quoted: a plain scalar containing `*` globs is easy to mis-parse.
      include: "spiceaidocs/docs/**/*.md"
    acceleration:
      enabled: true
      # NOTE(review): presumably the source is re-checked every 4h with up to
      # 30m of random jitter; confirm against the Spice acceleration docs.
      refresh_check_interval: 4h
      refresh_jitter_enabled: true
      refresh_jitter_max: 30m
    columns:
      # The `content` column is embedded with the `openai_embeddings` model,
      # keyed by `path`, with chunking enabled.
      - name: content
        embeddings:
          - from: openai_embeddings
            row_id:
              - path
            chunking:
              enabled: true
              target_chunk_size: 512
              overlap_size: 128
              trim_whitespace: true
38
39 - from: github:github.com/spiceai/cookbook/files/trunk
40 name: spiceai.cookbook
41 description: Spice.ai OSS cookbook recipes
42 metadata:
43 instructions: Documents are stored in Markdown. Always provide citations.
44 reference_base_url: https://github.com/spiceai/cookbook/tree/trunk/<recipe_path>
45 params:
46 file_format: md
47 github_client_id: ${secrets:GITHUB_CLIENT_ID}
48 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
49 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
50 include: "**/*.md"
51 acceleration:
52 enabled: true
53 refresh_check_interval: 4h
54 refresh_jitter_enabled: true
55 refresh_jitter_max: 30m
56 columns:
57 - name: content
58 embeddings:
59 - from: openai_embeddings
60 row_id:
61 - path
62 chunking:
63 enabled: true
64 target_chunk_size: 512
65 overlap_size: 128
66 trim_whitespace: true
67
68 - from: github:github.com/spiceai/blog/files/trunk
69 name: spiceai.blog
70 description: Spice.ai OSS blog posts
71 metadata:
72 instructions: |
73 This dataset provides access to the Spice.ai OSS project blog posts in Markdown format. The content is sourced from the Spice.ai OSS blog repository at https://github.com/spiceai/blog.
74 reference_base_url: https://github.com/spiceai/blog/tree/trunk/content/posts/<post_path>
75 params:
76 file_format: md
77 github_client_id: ${secrets:GITHUB_CLIENT_ID}
78 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
79 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
80 include: "content/posts/**/*.md"
81 acceleration:
82 enabled: true
83 refresh_check_interval: 4h
84 refresh_jitter_enabled: true
85 refresh_jitter_max: 30m
86 columns:
87 - name: content
88 embeddings:
89 - from: openai_embeddings
90 row_id:
91 - path
92 chunking:
93 enabled: true
94 target_chunk_size: 512
95 overlap_size: 128
96 trim_whitespace: true
97
98 - from: github:github.com/spiceai/spiceai/issues
99 name: spiceai.issues
100 description: Spice.ai OSS issues from https://github.com/spiceai/spiceai/issues
101 params:
102 github_client_id: ${secrets:GITHUB_CLIENT_ID}
103 github_private_key: ${secrets:GITHUB_PRIVATE_KEY}
104 github_installation_id: ${secrets:GITHUB_INSTALLATION_ID}
105 acceleration:
106 enabled: true
107 refresh_check_interval: 4h
108 refresh_jitter_enabled: true
109 refresh_jitter_max: 30m
110
# Embedding model definition. The name `openai_embeddings` is what the dataset
# `columns[].embeddings[].from` entries in this manifest refer to.
embeddings:
  - from: openai
    name: openai_embeddings
    params:
      # Written without inner spaces so it matches every other
      # `${secrets:...}` reference in this manifest.
      openai_api_key: ${secrets:OPENAI_API_KEY}
116
# Chat model definition (`openai:gpt-4o`) together with its system prompt.
models:
  - name: openai
    from: openai:gpt-4o
    params:
      # NOTE(review): `auto` presumably enables the runtime's default tool set
      # (the prompt below mentions `sql_query`, `list_datasets`, and document
      # search); confirm against the Spice model docs.
      tools: auto
      openai_api_key: ${secrets:OPENAI_API_KEY}
      # Literal block scalar: line breaks and blank lines below are part of
      # the prompt text sent to the model.
      system_prompt: |
        You are an AI assistant assisting engineers with the Spice.ai OSS Project.

        Always strive to be accurate, concise, and helpful in your responses.

        Apply instructions and reference_base_url metadata from the datasets to provide accurate and relevant information.

        Prefer "docs" dataset for documentation and reference information questions.

        Prefer "cookbook" recipe datasets for use cases, sample code, and configuration questions. Always include links to relevant recipes.

        Use the SQL tool (sql_query) when:
        1. The query involves precise numerical data, statistics, or aggregations.
        2. The user asks for specific counts, sums, averages, or other calculations.
        3. The query requires joining or comparing data from multiple related tables.

        If the SQL tool returns a query, syntax, or planning error, call the `list_datasets` tool to get the available tables and continue to refine and retry the query until it succeeds. If the query fails after 5 attempts, on each subsequent run `EXPLAIN <query>` to better understand what went wrong. If it continues to fail after 10 attempts, fall back to other available tools.

        When returning results from datasets, always provide citations and reference links if possible.

        Use the document search tool when:
        1. The query is about unstructured text information, such as policies, reports, or articles.
        2. The user is looking for qualitative information or explanations.
        3. The query requires understanding context or interpreting written content.

        General guidelines:
        1. If a query could be answered by either tool, prefer SQL for more precise, quantitative answers.
150