bookworm-smart-assistant/skills/backend-builder/references/database_setup.md

11 KiB

Database Setup and Configuration

Connection Strategies

PostgreSQL Connection

Node.js with pg:

const { Pool } = require('pg');

const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
  max: 20,
  idleTimeoutMillis: 30000,
  connectionTimeoutMillis: 2000,
});

module.exports = pool;

Python with psycopg2:

import psycopg2
from psycopg2 import pool

connection_pool = psycopg2.pool.SimpleConnectionPool(
    1, 20,
    host=os.getenv('DB_HOST'),
    database=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD')
)

Connection Pooling Best Practices

  1. Pool size - Set based on concurrent requests (typically 10-20)
  2. Idle timeout - Release idle connections (30s recommended)
  3. Connection timeout - Fail fast on connection issues (2s)
  4. Health checks - Test connections before use

Migration Strategies

Node.js Migration Tools

Using node-pg-migrate:

// migrations/1234567890_create_users.js
exports.up = pgm => {
  pgm.createTable('users', {
    id: 'uuid',
    email: { type: 'varchar(255)', unique: true, notNull: true },
    password_hash: { type: 'varchar(255)', notNull: true },
    created_at: {
      type: 'timestamp',
      notNull: true,
      default: pgm.func('current_timestamp'),
    },
  });
  
  pgm.createIndex('users', 'email');
};

exports.down = pgm => {
  pgm.dropTable('users');
};

Using Knex.js:

exports.up = function(knex) {
  return knex.schema.createTable('users', table => {
    table.uuid('id').primary().defaultTo(knex.raw('gen_random_uuid()'));
    table.string('email').unique().notNull();
    table.string('password_hash').notNull();
    table.timestamps(true, true);
  });
};

exports.down = function(knex) {
  return knex.schema.dropTable('users');
};

Python Migration Tools

Using Alembic (with SQLAlchemy):

# alembic/versions/001_create_users.py
from alembic import op
import sqlalchemy as sa

def upgrade():
    op.create_table(
        'users',
        sa.Column('id', sa.UUID(), primary_key=True),
        sa.Column('email', sa.String(255), unique=True, nullable=False),
        sa.Column('password_hash', sa.String(255), nullable=False),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('idx_users_email', 'users', ['email'])

def downgrade():
    op.drop_table('users')

Schema Design Patterns

Timestamps Pattern

Always include standard timestamp fields:

created_at TIMESTAMP DEFAULT NOW() NOT NULL,
updated_at TIMESTAMP DEFAULT NOW() NOT NULL

Trigger for auto-updating updated_at:

CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ language 'plpgsql';

CREATE TRIGGER update_users_updated_at 
    BEFORE UPDATE ON users 
    FOR EACH ROW 
    EXECUTE FUNCTION update_updated_at_column();

Soft Delete Pattern

ALTER TABLE users ADD COLUMN deleted_at TIMESTAMP NULL;
CREATE INDEX idx_users_deleted_at ON users(deleted_at);

-- Query only active records
SELECT * FROM users WHERE deleted_at IS NULL;

-- Soft delete
UPDATE users SET deleted_at = NOW() WHERE id = ?;

UUID vs Auto-increment IDs

UUID advantages:

  • Globally unique
  • Secure (non-sequential)
  • Distributed system friendly

Auto-increment advantages:

  • Smaller storage
  • Better index performance
  • Easier debugging

PostgreSQL UUID setup:

CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE users (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4()
);

Enum Types

PostgreSQL enums:

CREATE TYPE user_role AS ENUM ('admin', 'user', 'guest');

CREATE TABLE users (
    id UUID PRIMARY KEY,
    role user_role NOT NULL DEFAULT 'user'
);

Alternative: Check constraints:

CREATE TABLE users (
    id UUID PRIMARY KEY,
    role VARCHAR(20) NOT NULL DEFAULT 'user',
    CONSTRAINT valid_role CHECK (role IN ('admin', 'user', 'guest'))
);

Indexing Strategies

Common Index Types

B-tree (default):

CREATE INDEX idx_users_email ON users(email);

Unique index:

CREATE UNIQUE INDEX idx_users_email ON users(email);

Composite index:

CREATE INDEX idx_posts_author_date ON posts(author_id, created_at DESC);

Partial index:

CREATE INDEX idx_active_users ON users(email) WHERE deleted_at IS NULL;

Full-text search:

CREATE INDEX idx_posts_search ON posts USING GIN(to_tsvector('english', title || ' ' || content));

When to Add Indexes

Add indexes for:

  • Foreign keys
  • Columns in WHERE clauses
  • Columns in ORDER BY
  • Columns in JOIN conditions
  • Unique constraints

Avoid indexes on:

  • Small tables (<1000 rows)
  • Columns with low cardinality
  • Columns that are frequently updated

Relationship Patterns

One-to-Many

CREATE TABLE authors (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL
);

CREATE TABLE books (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    title VARCHAR(255) NOT NULL,
    author_id UUID REFERENCES authors(id) ON DELETE CASCADE,
    FOREIGN KEY (author_id) REFERENCES authors(id)
);

CREATE INDEX idx_books_author ON books(author_id);

Many-to-Many

CREATE TABLE students (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL
);

CREATE TABLE courses (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL
);

CREATE TABLE enrollments (
    student_id UUID REFERENCES students(id) ON DELETE CASCADE,
    course_id UUID REFERENCES courses(id) ON DELETE CASCADE,
    enrolled_at TIMESTAMP DEFAULT NOW(),
    PRIMARY KEY (student_id, course_id)
);

CREATE INDEX idx_enrollments_student ON enrollments(student_id);
CREATE INDEX idx_enrollments_course ON enrollments(course_id);

Self-Referencing (Tree Structure)

CREATE TABLE categories (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    parent_id UUID REFERENCES categories(id) ON DELETE SET NULL
);

CREATE INDEX idx_categories_parent ON categories(parent_id);

Database Functions and Procedures

Stored Procedure Example

CREATE OR REPLACE FUNCTION create_user_with_profile(
    p_email VARCHAR,
    p_password VARCHAR,
    p_name VARCHAR
) RETURNS UUID AS $$
DECLARE
    v_user_id UUID;
BEGIN
    INSERT INTO users (email, password_hash)
    VALUES (p_email, crypt(p_password, gen_salt('bf')))
    RETURNING id INTO v_user_id;
    
    INSERT INTO profiles (user_id, name)
    VALUES (v_user_id, p_name);
    
    RETURN v_user_id;
END;
$$ LANGUAGE plpgsql;

Trigger Example

CREATE OR REPLACE FUNCTION log_user_changes()
RETURNS TRIGGER AS $$
BEGIN
    INSERT INTO audit_log (table_name, record_id, action, changed_at)
    VALUES ('users', NEW.id, TG_OP, NOW());
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER user_audit_trigger
AFTER INSERT OR UPDATE OR DELETE ON users
FOR EACH ROW EXECUTE FUNCTION log_user_changes();

Performance Optimization

Query Optimization

Use EXPLAIN ANALYZE:

EXPLAIN ANALYZE
SELECT * FROM users WHERE email = 'user@example.com';

Avoid N+1 queries:

// Bad
const users = await User.findAll();
for (const user of users) {
    const posts = await Post.findAll({ where: { userId: user.id } });
}

// Good
const users = await User.findAll({
    include: [{ model: Post }]
});

Connection Pool Tuning

const pool = new Pool({
    max: 20,                    // Maximum connections
    min: 5,                     // Minimum connections
    idleTimeoutMillis: 30000,   // Close idle after 30s
    connectionTimeoutMillis: 2000,
    statement_timeout: 10000,   // Query timeout 10s
});

Caching Strategy

Application-level cache (Redis):

const cacheKey = `user:${userId}`;
let user = await redis.get(cacheKey);

if (!user) {
    user = await db.query('SELECT * FROM users WHERE id = $1', [userId]);
    await redis.setex(cacheKey, 3600, JSON.stringify(user));
}

Database query cache:

-- PostgreSQL: Use materialized views for expensive queries
CREATE MATERIALIZED VIEW user_stats AS
SELECT 
    user_id,
    COUNT(*) as post_count,
    MAX(created_at) as last_post_at
FROM posts
GROUP BY user_id;

-- Refresh periodically
REFRESH MATERIALIZED VIEW user_stats;

Backup and Recovery

Backup Script (PostgreSQL)

#!/bin/bash
# backup_db.sh

BACKUP_DIR="/backups"
DB_NAME="myapp_db"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.sql.gz"

# Create backup
pg_dump -U postgres -d $DB_NAME | gzip > $BACKUP_FILE

# Keep only last 7 days
find $BACKUP_DIR -name "*.sql.gz" -mtime +7 -delete

echo "Backup completed: $BACKUP_FILE"

Point-in-Time Recovery

# Enable WAL archiving in postgresql.conf
wal_level = replica
archive_mode = on
archive_command = 'cp %p /archive/%f'

# Restore to specific point
pg_restore -d myapp_db -t "2024-12-09 10:30:00" backup.sql

Multi-Database Setup

Read Replicas

const masterPool = new Pool({
    connectionString: process.env.MASTER_DB_URL,
    max: 20
});

const replicaPool = new Pool({
    connectionString: process.env.REPLICA_DB_URL,
    max: 50  // More connections for read-heavy load
});

// Write operations
async function createUser(data) {
    return masterPool.query('INSERT INTO users...', [data]);
}

// Read operations
async function getUsers() {
    return replicaPool.query('SELECT * FROM users...');
}

Sharding Strategy

function getShardId(userId) {
    // Hash-based sharding
    return parseInt(userId, 16) % NUM_SHARDS;
}

function getPool(shardId) {
    return pools[shardId];
}

// Usage
const shardId = getShardId(userId);
const pool = getPool(shardId);
await pool.query('SELECT...', [userId]);

Security Considerations

SQL Injection Prevention

// ❌ NEVER DO THIS
const query = `SELECT * FROM users WHERE email = '${email}'`;

// ✅ ALWAYS USE PARAMETERIZED QUERIES
const query = 'SELECT * FROM users WHERE email = $1';
await pool.query(query, [email]);

Encryption at Rest

-- Enable pgcrypto extension
CREATE EXTENSION IF NOT EXISTS pgcrypto;

-- Encrypt sensitive data
INSERT INTO users (email, ssn_encrypted)
VALUES ('user@example.com', pgp_sym_encrypt('123-45-6789', 'encryption_key'));

-- Decrypt when needed
SELECT email, pgp_sym_decrypt(ssn_encrypted, 'encryption_key') 
FROM users;

Row-Level Security (PostgreSQL)

-- Enable RLS
ALTER TABLE posts ENABLE ROW LEVEL SECURITY;

-- Create policy
CREATE POLICY user_posts_policy ON posts
    FOR ALL
    USING (author_id = current_setting('app.current_user_id')::uuid);

-- Set user context in application
await pool.query("SET app.current_user_id = $1", [userId]);