library(pacman)
::p_load(tidyverse, httr, jsonlite, glue, dotenv, cli)
pacman
::load_dot_env()
dotenv
<- Sys.getenv("SECRET_NOTION")
NOTION_TOKEN <- "aa290b0231ed45ca92439e6cb05ba3e4"
DATABASE_PARENT_ID <- "DBT Docs - Proof of Concept (Database)"
DATABASE_NAME <- '31bb9b7a9ab246f5b7f47227bb5fec89'
DATABASE_ID <- -1 NUMERIC_ZERO_VALUE
DBT Docs to Notion Integration - Proof of Concept
1 Introduction
This guide outlines the process of integrating DBT (Data Build Tool) documentation with Notion to create a lightweight, user-friendly data catalog. By leveraging the strengths of both tools, we can increase the visibility of our data warehouse with minimal infrastructure overhead.
1.1 Prerequisites
- Basic understanding of DBT
- DBT project initialized with existing SQL or Python models and documentation
- R programming environment
- Notion account with admin privileges
2 Set up Notion Integration
- Navigate to https://www.notion.so/profile/integrations
- Click ‘New Integration’
- Enter information:
- Basic information:
- Integration name (e.g., “dbt-docs-v1”)
- Select type as internal
- Select workspace (e.g., UHC Climate)
- Configurations:
- Check all content and comment capacity
- User capabilities: check “No user information”
- Basic information:
3 Create a Notion page for dbt-docs
- Navigate to Notion’s home
- Create a new page (e.g., “Datawarehouse”)
- Add a connection to the integration:
- Click page settings (… button in the top right)
- Add a connection to the integration created in step 1
- Search for the integration
- Select and confirm the connection
4 Implement the ETL Pipeline
4.1 Setup
Load required libraries and environment variables:
4.2 Helper Functions
Implement helper functions for API requests and data parsing:
<- function(endpoint,
make_request querystring = "",
method = "GET",
body = NULL,
body_format_json = F) {
Sys.sleep(0.34) # notion api limit is 3 requests per second
<- glue("https://api.notion.com/v1/{endpoint}{querystring}")
url
<- c(
headers "Authorization" = NOTION_TOKEN,
"Content-Type" = "application/json",
"Notion-Version" = "2022-02-22"
)
= body
body_json if (!body_format_json) body_json = toJSON(body, auto_unbox = TRUE)
<- VERB(
response verb = method,
url = url,
body = if (!is.null(body)) body_json,
add_headers(headers)
)
if (http_error(response)) {
stop(glue("Request returned status code {status_code(response)}
Response text: {content(response, 'text')}"))
}
content(response, "parsed")
}
<- function(parent_object, paths_array, zero_value = "") {
get_paths_or_empty %>%
paths_array detect(~ !is.null(purrr::pluck(parent_object, !!!.x)), .default = zero_value) %>%
if (. == zero_value) zero_value else purrr::pluck(parent_object, !!!.)}
{
}
<- function(data, catalog_nodes, model_name) {
get_owner <- get_paths_or_empty(data, list(c("config", "meta", "owner")), NULL)
owner if (!is.null(owner)) return(owner)
get_paths_or_empty(catalog_nodes, list(c(model_name, "metadata", "owner")), "")
}
4.3 Parse DBT Docs
Parse manifest.json and catalog.json:
<- jsonlite::read_json("../target/manifest.json")
manifest <- jsonlite::read_json("../target/catalog.json")
catalog
<- manifest$nodes %>% keep(~.x$description != '')
manifest_nodes <- catalog$nodes %>% keep(~.x$unique_id %in% names(manifest_nodes))
catalog_nodes
<- manifest_nodes %>%
models keep(~ .$resource_type == "model") %>%
enframe(name = "node_name", value = "data")
4.4 Initialize Notion Database
Check if the database exists and create it if not:
<- make_request(
children_query_resp endpoint = "blocks/",
querystring = glue("{DATABASE_PARENT_ID}/children"),
method = "GET"
)
<- children_query_resp$results %>%
database_id keep(~ !is.null(.$child_database) && identical(.$child_database$title, DATABASE_NAME)) %>%
map_chr("id") %>%
first()
= !is.na(database_id)
db_exists
if (db_exists) {
cat(glue("database {database_id} already exists, proceeding to update records!\n"))
else {
} cat("creating database\n")
<- make_request(
database_creation_resp endpoint = "databases/",
method = "POST",
body = fromJSON(database_json)
)<- database_creation_resp$id
database_id cat(glue("\ncreated database {database_id}, proceeding to create records!\n"))
}
database 31bb9b7a-9ab2-46f5-b7f4-7227bb5fec89 already exists, proceeding to update records!
4.5 Create/Update Records
Implement functions to create JSON structures for Notion records:
<- function(database_id) {
make_parent_json glue('
"parent": {{
"database_id": "{database_id}"
}}
')
}
<- function(data) {
make_properties_json glue('
"properties": {{
"Name": {{
"title": [
{{
"text": {{
"content": "{data$name}"
}}
}}
]
}},
"Description": {{
"rich_text": [
{{
"text": {{
"content": "{str_remove_all(data$description, "\n")}"
}}
}}
]
}}
}}
')
}
<- function(text) {
make_text_block <- gsub("\n", "\\\\n", text)
text_escaped glue('
{{
"object": "block",
"type": "paragraph",
"paragraph": {{
"rich_text": [
{{
"type": "text",
"text": {{
"content": "{text_escaped}"
}}
}}
]
}}
}}
')
}
<- function(title) {
make_h1_block glue('
{{
"object": "block",
"type": "heading_1",
"heading_1": {{
"rich_text": [
{{
"type": "text",
"text": {{
"content": "{title}"
}}
}}
]
}}
}}
')
}
<- function(data) {
make_dbt_code_block = data$raw_code %>%
code_str str_replace_all("'", "\\'") %>%
str_replace_all("\r\n", "\\\\n") %>% # Replace Windows-style line endings
str_replace_all("\n", "\\\\n") %>% # Replace Unix-style line endings
str_replace_all("\r", "\\\\n")
glue('
{{
"object": "block",
"type": "code",
"code": {{
"rich_text": [
{{
"type": "text",
"text": {{
"content": "{code_str}"
}}
}}
],
"language": "sql"
}}
}}
')
}
4.5.1 Columns Table
= function(data){
prep_df_columns_from_dbt_docs $columns %>%
datamap(~{
tibble(
name = .x$name,
data_type = .x$data_type,
description = .x$description
)%>%
}) bind_rows()
}
= function(string){
make_table_row_cell glue('
[
{{
"type": "text",
"text": {{
"content": "{string}"
}},
"plain_text": "{string}"
}}
]
')
}
= function(cell1, cell2, cell3){
make_3_cell_row_notion_api glue('
{{
"type": "table_row",
"table_row": {{
"cells": [
{make_table_row_cell(cell1)},
{make_table_row_cell(cell2)},
{make_table_row_cell(cell3)}
]
}}
}}
')
}
= function(data){
make_table_children_dbt_notion_api
= prep_df_columns_from_dbt_docs(data)
df_columns
= df_columns %>%
children_json group_by(row_number()) %>%
group_map(~{
make_3_cell_row_notion_api(.x$name, .x$data_type, .x$description)
%>%
}) paste(collapse = ",")
return(
glue('
[
{make_3_cell_row_notion_api("Name", "Data Type", "Description")},
{children_json}
]
')
)
}
= function(data){
make_dbt_table_block glue('{{
"object": "block",
"type": "table",
"table": {{
"table_width": 3,
"has_column_header": true,
"has_row_header": false,
"children": {make_table_children_dbt_notion_api(data)}
}}
}}')
}
Create a function to check if a record exists:
<- function(database_id, data) {
check_record_exists <- glue('
query_json {{
"filter": {{
"property": "Name",
"title": {{
"equals": "{data$name}"
}}
}}
}}
')
<- make_request(
record_query_resp endpoint = "databases/",
querystring = paste0(database_id, "/query"),
method = "POST",
body = query_json,
body_format_json = T
)= length(record_query_resp$results) > 0
record_exists return(record_exists)
}
Create a function to create or update a DBT record:
<- function(DATABASE_ID, data) {
create_dbt_record <- glue('
body_json {{
{make_parent_json(DATABASE_ID)},
{make_properties_json(data)},
"children": [
{make_h1_block("Description")},
{make_text_block(data$description)},
{make_h1_block("Columns")},
{make_dbt_table_block(data)},
{make_h1_block("Code")},
{make_dbt_code_block(data)}
]
}}
')
= check_record_exists(DATABASE_ID, data)
record_exists
if (record_exists) {
cli_alert("Record exists, do nothing")
else {
} cli_alert("Creating record")
make_request(
endpoint = "pages/",
querystring = "",
method = "POST",
body = body_json,
body_format_json = T
)
} }
4.6 Update All Records
Iterate through all models and create/update records:
for (i in 1:nrow(models)) {
<- models$node_name[i]
model_name <- models$data[[i]]
data cli_alert_info("Processing model: {data$name}")
create_dbt_record(DATABASE_ID, data)
}
Now we have a minimal lightweight catalog that builds off our work with DBT
5 Conclusion
This proof of concept demonstrates a successful integration of DBT documentation with Notion, creating a lightweight, accessible data catalog. Key achievements include:
- Leveraging rich DBT metadata
- Bridging technical data management and user-friendly documentation
- Enhancing data warehouse visibility across the organization
5.1 Next Steps
Future enhancements will focus on:
- Utilizing DBT tags for selective model inclusion
- Leveraging DBT meta fields for custom metadata (e.g., owners)
- Operationalizing columns/variables/measures as separate databases
- This will improve reporting capabilities at the measure level
- organize the R function logic into a package like system or consider into NotionR package
As we refine this integration, we aim to create a more dynamic, customizable data catalog aligned with organizational needs. Regular updates will ensure the catalog remains a valuable resource for all stakeholders.
We encourage implementation, expansion, and community sharing of experiences to further innovate in making data more accessible and understandable across organizations.