- export_with_trees.sh: Bash wrapper for Outline export
- outline_export_fixed.py: Python export implementation
- IMPORT_SCRIPT.MD: PRD for import script (to be built)
- RALPH_PROMPT.md: Ralph Loop prompt for building import script
- CLAUDE.md: Project documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
530 lines
18 KiB
Bash
Executable File
530 lines
18 KiB
Bash
Executable File
#!/bin/bash
#
# Outline Export Script with Tree Visualization
# Exports all Outline documents with full hierarchy and shows side-by-side tree comparison
#
# Usage: ./export_with_trees.sh [OPTIONS]
# Options are passed through to the Python script (--dry-run, -v, etc.)
#

# Exit on error, error on unset variables, and fail a pipeline if any stage fails
set -euo pipefail

# Capture CLI arguments to pass to Python.
# NOTE(review): collapsing "$@" into one string loses the word boundaries of
# quoted arguments; downstream consumers re-split on whitespace.
CLI_ARGS="$@"

# ANSI color codes as literal escape text (interpreted later by `echo -e`)
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Configuration: all paths are relative to the invocation directory
WORK_DIR="$(pwd)"
SETTINGS_FILE="$WORK_DIR/settings.json"
EXPORT_DIR="$WORK_DIR/outline_export"
echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo -e "${BLUE} OUTLINE EXPORT${NC}"
echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo ""

# jq is required to read settings.json — fail with a clear message, not
# a "command not found" mid-script
if ! command -v jq >/dev/null 2>&1; then
    echo -e "${RED}Error: jq is required but not installed${NC}"
    exit 1
fi

# Check if settings.json exists
if [ ! -f "$SETTINGS_FILE" ]; then
    echo -e "${RED}Error: settings.json not found${NC}"
    exit 1
fi

# Extract API details from settings.json
# (jq -r prints the literal string "null" for missing keys, so check for it)
API_URL=$(jq -r '.source.url' "$SETTINGS_FILE")
API_TOKEN=$(jq -r '.source.token' "$SETTINGS_FILE")

if [ -z "$API_URL" ] || [ "$API_URL" = "null" ] || [ -z "$API_TOKEN" ] || [ "$API_TOKEN" = "null" ]; then
    echo -e "${RED}Error: .source.url or .source.token missing in settings.json${NC}"
    exit 1
fi

# Backup old export if it exists (timestamped tarball next to the export dir)
if [ -d "$EXPORT_DIR" ]; then
    TIMESTAMP=$(date +%Y%m%d_%H%M%S)
    BACKUP_FILE="$WORK_DIR/outline_backup_${TIMESTAMP}.tar.gz"
    echo -e "${YELLOW}Backing up previous export...${NC}"
    tar -czf "$BACKUP_FILE" -C "$WORK_DIR" "outline_export" 2>/dev/null
    echo -e "${GREEN}✓ Backup: $BACKUP_FILE ($(du -sh "$BACKUP_FILE" | cut -f1))${NC}"
    rm -rf "$EXPORT_DIR"
fi
echo -e "${GREEN}Exporting documents...${NC}"
echo ""

# Run the export with CLI arguments (as current user to avoid root-owned files).
# The script's arguments are forwarded as positional parameters ("$@") instead
# of being spliced unquoted into the command string, so quoting and spacing
# survive intact and cannot be re-interpreted by the container shell.
docker run --rm --network domnet \
    --user "$(id -u):$(id -g)" \
    -e HOME=/tmp \
    -v "$WORK_DIR:/work" \
    -w /work \
    python:3.11-slim \
    bash -c 'pip install -qqq requests tqdm 2>/dev/null && python3 outline_export_fixed.py "$@"' -- "$@"

echo ""

# Create Python script for side-by-side tree comparison
# (quoted delimiter: no expansion happens inside the heredoc)
cat > "$WORK_DIR/.tree_compare.py" << 'PYTHON_SCRIPT'
|
|
#!/usr/bin/env python3
|
|
"""
|
|
Side-by-side comparison of Outline online vs exported files.
|
|
Matches documents row by row and highlights differences.
|
|
"""
|
|
import sys
|
|
import re
|
|
import shutil
|
|
import requests
|
|
from pathlib import Path
|
|
|
|
# Colors
|
|
GREEN = '\033[0;32m'
|
|
RED = '\033[0;31m'
|
|
YELLOW = '\033[1;33m'
|
|
BLUE = '\033[0;34m'
|
|
CYAN = '\033[0;36m'
|
|
BOLD = '\033[1m'
|
|
DIM = '\033[2m'
|
|
RESET = '\033[0m'
|
|
|
|
def get_terminal_width():
    """Return the terminal width in columns, falling back to 120 on failure."""
    try:
        return shutil.get_terminal_size().columns
    except (ValueError, OSError):
        # e.g. a malformed COLUMNS environment variable; a bare `except:`
        # here would also swallow KeyboardInterrupt/SystemExit
        return 120
|
def normalize_filename(name):
    """Normalize a name for comparison (handles / -> _ conversion etc)."""
    # Map every filesystem-hostile character to '_' in a single pass,
    # then trim surrounding whitespace.
    table = str.maketrans({ch: '_' for ch in '/\\:*?"<>|'})
    return name.translate(table).strip()
|
def get_online_docs(api_url, api_token):
    """Fetch all documents from the Outline API, organized by collection.

    Returns a dict mapping collection name -> list of doc dicts with
    'title', 'normalized', 'has_children' and 'updatedAt' keys, in the
    collection's navigation-tree (pre-order) order.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }

    response = requests.post(f"{api_url}/api/collections.list", headers=headers, json={})
    response.raise_for_status()  # fail loudly on auth/network errors
    collections = response.json().get("data", [])
    collections = sorted(collections, key=lambda c: c.get('name', ''))

    # Build collection ID to name mapping
    coll_id_to_name = {c['id']: c['name'] for c in collections}

    # Fetch all documents with timestamps using documents.list.
    # Paginate with an increasing offset until a short page comes back,
    # instead of hoping everything fits into one capped request.
    all_docs = []
    offset = 0
    page_size = 100
    while True:
        page_response = requests.post(
            f"{api_url}/api/documents.list",
            headers=headers,
            json={"limit": page_size, "offset": offset}
        )
        page_response.raise_for_status()
        page = page_response.json().get("data", [])
        all_docs.extend(page)
        if len(page) < page_size:
            break
        offset += page_size

    # Create timestamp lookup by (collection_name, normalized_title)
    timestamp_lookup = {}
    for doc in all_docs:
        coll_id = doc.get("collectionId")
        coll_name = coll_id_to_name.get(coll_id, "Unknown")
        title = doc.get("title", "Untitled")
        norm_title = normalize_filename(title)
        timestamp_lookup[(coll_name, norm_title)] = doc.get("updatedAt")

    result = {}

    for coll in collections:
        coll_name = coll['name']

        # Get this collection's navigation tree
        nav_response = requests.post(
            f"{api_url}/api/collections.documents",
            headers=headers,
            json={"id": coll["id"]}
        )
        nav_response.raise_for_status()
        nav_tree = nav_response.json().get("data", [])

        def collect_docs(nodes):
            # Pre-order walk of the navigation tree, flattening children.
            docs = []
            for node in nodes:
                title = node.get("title", "Untitled")
                norm_title = normalize_filename(title)
                children = node.get("children", [])
                docs.append({
                    'title': title,
                    'normalized': norm_title,
                    'has_children': bool(children),
                    'updatedAt': timestamp_lookup.get((coll_name, norm_title))
                })
                docs.extend(collect_docs(children))
            return docs

        result[coll_name] = collect_docs(nav_tree)

    return result
|
def get_export_docs(export_dir):
    """Get all exported documents from disk, organized by collection.

    Returns a dict mapping collection directory name -> list of doc dicts
    with 'title', 'normalized', 'path' and 'mtime' keys.
    """
    export_path = Path(export_dir)
    result = {}

    if not export_path.exists():
        return result

    for coll_dir in sorted(export_path.iterdir()):
        if not coll_dir.is_dir():
            continue
        docs = []
        for md_file in sorted(coll_dir.glob("*.md")):
            title = md_file.stem
            if not title:  # skip empty filenames
                continue
            docs.append({
                'title': title,
                'normalized': normalize_filename(title),
                'path': md_file,
                # pathlib stat() replaces the former mid-function
                # `import os` + os.path.getmtime()
                'mtime': md_file.stat().st_mtime
            })
        result[coll_dir.name] = docs

    return result
|
def match_and_compare(online_docs, export_docs):
    """Match online and export docs, return comparison data per collection.

    Returns a list of dicts (one per collection) with keys 'collection',
    'rows', 'online_count' and 'export_count'.  Each row pairs an online
    document with its exported counterpart and carries a status:
      'match'   - present on both sides (plus a 'freshness' flag)
      'missing' - online but not exported
      'extra'   - exported but not online
    """
    from datetime import datetime

    all_collections = sorted(set(online_docs.keys()) | set(export_docs.keys()))
    comparison = []
    # Presentation order of statuses within a collection.
    status_rank = {'match': 0, 'missing': 1, 'extra': 2}

    for coll_name in all_collections:
        online_list = online_docs.get(coll_name, [])
        export_list = export_docs.get(coll_name, [])

        # Lookup of exported docs by normalized name (the join key).
        # (The former mirror lookup of online docs was never read.)
        export_lookup = {d['normalized']: d for d in export_list}

        rows = []
        matched_export = set()

        # First pass: match online docs to export
        for doc in sorted(online_list, key=lambda d: d['title'].lower()):
            norm = doc['normalized']
            if norm in export_lookup:
                export_doc = export_lookup[norm]
                # Freshness: the exported copy counts as stale when its mtime
                # is more than 60s older than the online updatedAt.
                freshness = 'current'
                if doc.get('updatedAt') and export_doc.get('mtime'):
                    online_dt = datetime.fromisoformat(doc['updatedAt'].replace('Z', '+00:00'))
                    if export_doc['mtime'] < online_dt.timestamp() - 60:
                        freshness = 'stale'
                rows.append({
                    'online': doc['title'],
                    'export': export_doc['title'],
                    'status': 'match',
                    'is_folder': doc['has_children'],
                    'freshness': freshness
                })
                matched_export.add(norm)
            else:
                rows.append({
                    'online': doc['title'],
                    'export': None,
                    'status': 'missing',
                    'is_folder': doc['has_children'],
                    'freshness': None
                })

        # Second pass: exported docs with no online counterpart
        for doc in sorted(export_list, key=lambda d: d['title'].lower()):
            if doc['normalized'] not in matched_export:
                rows.append({
                    'online': None,
                    'export': doc['title'],
                    'status': 'extra',
                    'is_folder': False,
                    'freshness': None
                })

        # Sort rows: matched first, then missing, then extra;
        # alphabetical within each group
        rows.sort(key=lambda r: (
            status_rank[r['status']],
            (r['online'] or r['export'] or '').lower()
        ))

        comparison.append({
            'collection': coll_name,
            'rows': rows,
            'online_count': len(online_list),
            'export_count': len(export_list)
        })

    return comparison
|
def print_comparison(comparison):
    """Print the side-by-side comparison table with status indicators.

    `comparison` is the structure produced by match_and_compare(); output
    goes to stdout using the module-level ANSI color constants.
    """
    term_width = get_terminal_width()
    col_width = (term_width - 10) // 2  # -10 for separators and status icons

    def visible_len(s):
        # On-screen width of the string (ANSI escape sequences stripped);
        # hoisted out of the row loop so it is defined once, not per row.
        return len(re.sub(r'\033\[[0-9;]*m', '', s))

    total_online = 0
    total_export = 0
    total_matched = 0
    total_missing = 0
    total_extra = 0
    total_stale = 0

    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}{CYAN}{'ONLINE':<{col_width}} {'':5} {'EXPORTED':<{col_width}}{RESET}")
    print(f"{BLUE}{'═' * term_width}{RESET}")

    for coll in comparison:
        total_online += coll['online_count']
        total_export += coll['export_count']

        # Per-collection tallies drive the header's pass/fail mark
        coll_matched = sum(1 for r in coll['rows'] if r['status'] == 'match')
        coll_missing = sum(1 for r in coll['rows'] if r['status'] == 'missing')
        coll_extra = sum(1 for r in coll['rows'] if r['status'] == 'extra')
        coll_stale = sum(1 for r in coll['rows'] if r.get('freshness') == 'stale')

        total_matched += coll_matched
        total_missing += coll_missing
        total_extra += coll_extra
        total_stale += coll_stale

        if coll_missing == 0 and coll_extra == 0:
            coll_status = f"{GREEN}✓{RESET}"
        else:
            coll_status = f"{RED}✗{RESET}"

        header = f"{coll['collection']}/ ({coll['online_count']} → {coll['export_count']})"
        print(f"\n{BOLD}{YELLOW}{header}{RESET} {coll_status}")
        print(f"{BLUE}{'─' * term_width}{RESET}")

        for row in coll['rows']:
            online_name = row['online'] or ''
            export_name = row['export'] or ''

            # Add folder indicator
            if row['is_folder'] and online_name:
                online_name = f"📁 {online_name}"

            # Truncate if needed
            if len(online_name) > col_width - 1:
                online_name = online_name[:col_width-4] + '...'
            if len(export_name) > col_width - 1:
                export_name = export_name[:col_width-4] + '...'

            # Status and colors
            if row['status'] == 'match':
                # Freshness indicator
                if row.get('freshness') == 'stale':
                    freshness = f"{YELLOW}●{RESET}"
                else:
                    freshness = f"{GREEN}●{RESET}"
                status = f"{GREEN}✓{RESET}{freshness}"
                left = f"{online_name}"
                right = f"{export_name}"
            elif row['status'] == 'missing':
                status = f"{RED}✗{RESET} "
                left = f"{RED}{online_name}{RESET}"
                right = f"{DIM}---{RESET}"
            else:  # extra
                status = f"{YELLOW}+{RESET} "
                left = f"{DIM}---{RESET}"
                right = f"{YELLOW}{export_name}{RESET}"

            # Only the left column needs padding; the right column is ragged
            # (the former right_pad was computed but never used).
            left_pad = col_width - visible_len(left)
            print(f" {left}{' ' * max(0, left_pad)} {status} {right}")

    # Summary
    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}SUMMARY:{RESET}")
    print(f" Online: {total_online} documents")
    print(f" Exported: {total_export} documents")
    print(f" {GREEN}✓● Matched & current: {total_matched - total_stale}{RESET}")

    if total_stale > 0:
        print(f" {YELLOW}✓● Matched but stale: {total_stale} (export older than online){RESET}")
    if total_missing > 0:
        print(f" {RED}✗ Missing: {total_missing} (online but not exported){RESET}")
    if total_extra > 0:
        print(f" {YELLOW}+ Extra: {total_extra} (exported but not online){RESET}")

    if total_missing == 0 and total_extra == 0 and total_stale == 0:
        print(f"\n{GREEN}✓ All documents exported and current!{RESET}")
    elif total_missing == 0 and total_extra == 0:
        print(f"\n{YELLOW}⚠ All documents exported but {total_stale} are stale{RESET}")
    print()
|
def get_latest_changes(api_url, api_token, limit=3):
    """Fetch the most recently updated documents, newest first.

    Returns a list of dicts with 'title', 'collection', 'updatedAt' and
    'normalized' keys.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }

    response = requests.post(
        f"{api_url}/api/documents.list",
        headers=headers,
        json={
            "sort": "updatedAt",
            "direction": "DESC",
            "limit": limit
        }
    )
    response.raise_for_status()  # surface auth/network failures immediately

    docs = response.json().get("data", [])
    result = []
    # collectionId -> name cache: avoids one collections.info round-trip
    # per document when several docs share a collection
    coll_name_cache = {}

    for doc in docs:
        coll_id = doc.get("collectionId")
        coll_name = "Unknown"
        if coll_id:
            if coll_id not in coll_name_cache:
                coll_response = requests.post(
                    f"{api_url}/api/collections.info",
                    headers=headers,
                    json={"id": coll_id}
                )
                coll_data = coll_response.json().get("data", {})
                coll_name_cache[coll_id] = coll_data.get("name", "Unknown")
            coll_name = coll_name_cache[coll_id]

        result.append({
            'title': doc.get("title", "Untitled"),
            'collection': coll_name,
            'updatedAt': doc.get("updatedAt"),
            'normalized': normalize_filename(doc.get("title", "Untitled"))
        })

    return result
|
def find_export_file(export_dir, collection, normalized_title):
    """Find the exported .md file matching the document.

    Looks in the collection's own directory first, then falls back to every
    other collection directory (in case of a name mismatch).  Returns the
    Path, or None if nothing matches.
    """
    export_path = Path(export_dir)
    if not export_path.exists():
        # Guard: iterdir() on a missing directory raises FileNotFoundError
        return None

    preferred = export_path / collection

    def scan(directory):
        # First .md file whose normalized stem matches, else None.
        for md_file in directory.glob("*.md"):
            if normalize_filename(md_file.stem) == normalized_title:
                return md_file
        return None

    # Try exact collection match first
    if preferred.exists():
        found = scan(preferred)
        if found:
            return found

    # Fall back to all other collections (skip the dir already scanned)
    for coll_dir in export_path.iterdir():
        if coll_dir.is_dir() and coll_dir != preferred:
            found = scan(coll_dir)
            if found:
                return found

    return None
|
def print_latest_changes(latest_docs, export_dir):
    """Print the latest changes section.

    For each recently updated document, prints the online updatedAt next to
    the exported file's mtime so a human can spot-check that the export is
    current.

    Args:
        latest_docs: list of dicts from get_latest_changes()
                     ('title', 'collection', 'updatedAt', 'normalized').
        export_dir: root directory of the on-disk export.
    """
    term_width = get_terminal_width()
    from datetime import datetime
    import os

    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}{CYAN}LATEST CHANGES (verify actuality){RESET}")
    print(f"{BLUE}{'─' * term_width}{RESET}")

    for i, doc in enumerate(latest_docs, 1):
        title = doc['title']
        collection = doc['collection']
        updated_at = doc['updatedAt']

        # Parse online timestamp
        if updated_at:
            # Handle ISO format with timezone ('Z' suffix is not accepted by
            # fromisoformat on 3.10 and earlier, so rewrite it as +00:00)
            online_dt = datetime.fromisoformat(updated_at.replace('Z', '+00:00'))
            online_str = online_dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            online_str = "Unknown"

        # Find export file
        export_file = find_export_file(export_dir, collection, doc['normalized'])

        if export_file and export_file.exists():
            export_mtime = os.path.getmtime(export_file)
            export_dt = datetime.fromtimestamp(export_mtime)
            export_str = export_dt.strftime("%Y-%m-%d %H:%M:%S")

            # Compare (export should be same time or newer)
            if updated_at:
                # Convert online to local timestamp for comparison
                # (epoch seconds are timezone-independent, so this is safe)
                online_ts = online_dt.timestamp()
                if export_mtime >= online_ts - 60:  # Allow 60s tolerance
                    status = f"{GREEN}✓{RESET}"
                else:
                    status = f"{YELLOW}⚠ older{RESET}"
            else:
                # No online timestamp to compare against; assume current
                status = f"{GREEN}✓{RESET}"
        else:
            export_str = "NOT FOUND"
            status = f"{RED}✗{RESET}"

        # Print entry
        print(f"\n {BOLD}{i}. {title}{RESET}")
        print(f" {DIM}Collection:{RESET} {collection}")
        print(f" {DIM}Online:{RESET} {online_str}")
        print(f" {DIM}Exported:{RESET} {export_str} {status}")

    print(f"\n{BLUE}{'═' * term_width}{RESET}")
|
def main():
    """CLI entry point: compare online Outline docs against a local export."""
    if len(sys.argv) != 4:
        print("Usage: script.py <API_URL> <API_TOKEN> <EXPORT_DIR>")
        sys.exit(1)

    # Positional arguments (script name excluded)
    api_url, api_token, export_dir = sys.argv[1:4]

    # Gather both views of the document set
    online_docs = get_online_docs(api_url, api_token)
    export_docs = get_export_docs(export_dir)

    # Pair them up and render the side-by-side table
    print_comparison(match_and_compare(online_docs, export_docs))

    # Finish with a freshness spot-check of the newest documents
    print_latest_changes(get_latest_changes(api_url, api_token, limit=3), export_dir)


if __name__ == "__main__":
    main()
|
|
PYTHON_SCRIPT

# Run the side-by-side tree comparison (use /work/outline_export as container path).
# API credentials are passed as positional parameters rather than interpolated
# into the command string, so a token containing quotes or spaces cannot break
# (or inject into) the container's shell command.
docker run --rm --network domnet \
    --user "$(id -u):$(id -g)" \
    -e HOME=/tmp \
    -v "$WORK_DIR:/work" \
    -w /work \
    python:3.11-slim \
    bash -c 'pip install -qqq requests 2>/dev/null && python3 /work/.tree_compare.py "$@"' -- "$API_URL" "$API_TOKEN" '/work/outline_export'

# Cleanup
rm -f "$WORK_DIR/.tree_compare.py"

echo ""