Initial commit: Export tools and import script requirements

- export_with_trees.sh: Bash wrapper for Outline export
- outline_export_fixed.py: Python export implementation
- IMPORT_SCRIPT.MD: PRD for import script (to be built)
- RALPH_PROMPT.md: Ralph Loop prompt for building import script
- CLAUDE.md: Project documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Claude
2026-01-19 22:33:55 +01:00
commit d9161f64f5
7 changed files with 2608 additions and 0 deletions

529
export_with_trees.sh Executable file
View File

@@ -0,0 +1,529 @@
#!/bin/bash
#
# Outline Export Script with Tree Visualization
# Exports all Outline documents with full hierarchy and shows side-by-side tree comparison
#
# Usage: ./export_with_trees.sh [OPTIONS]
# Options are passed through to the Python script (--dry-run, -v, etc.)
#
set -euo pipefail  # exit on error and unset vars; fail a pipeline if any stage fails

# Colors for output
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Configuration
WORK_DIR="$(pwd)"
SETTINGS_FILE="$WORK_DIR/settings.json"
EXPORT_DIR="$WORK_DIR/outline_export"

echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo -e "${BLUE} OUTLINE EXPORT${NC}"
echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}"
echo ""

# jq is required to read settings.json — fail fast with a clear message
command -v jq >/dev/null 2>&1 || { echo -e "${RED}Error: jq is required but not found in PATH${NC}"; exit 1; }

# Check if settings.json exists
if [ ! -f "$SETTINGS_FILE" ]; then
  echo -e "${RED}Error: settings.json not found${NC}"
  exit 1
fi

# Extract API details from settings.json.
# '// empty' maps missing/null keys to an empty string instead of the
# literal text "null", so the emptiness check below actually works.
API_URL=$(jq -r '.source.url // empty' "$SETTINGS_FILE")
API_TOKEN=$(jq -r '.source.token // empty' "$SETTINGS_FILE")
if [ -z "$API_URL" ] || [ -z "$API_TOKEN" ]; then
  echo -e "${RED}Error: .source.url and .source.token must be set in settings.json${NC}"
  exit 1
fi

# Backup old export if it exists
if [ -d "$EXPORT_DIR" ]; then
  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
  BACKUP_FILE="$WORK_DIR/outline_backup_${TIMESTAMP}.tar.gz"
  echo -e "${YELLOW}Backing up previous export...${NC}"
  tar -czf "$BACKUP_FILE" -C "$WORK_DIR" "outline_export" 2>/dev/null
  echo -e "${GREEN}✓ Backup: $BACKUP_FILE ($(du -sh "$BACKUP_FILE" | cut -f1))${NC}"
  rm -rf "$EXPORT_DIR"
fi

echo -e "${GREEN}Exporting documents...${NC}"
echo ""

# Run the export (as current user to avoid root-owned files).
# CLI arguments are forwarded to the inner bash as positional parameters
# ("$@") rather than interpolated into the -c string, so arguments that
# contain spaces or shell metacharacters survive intact.
docker run --rm --network domnet \
  --user "$(id -u):$(id -g)" \
  -e HOME=/tmp \
  -v "$WORK_DIR:/work" \
  -w /work \
  python:3.11-slim \
  bash -c 'pip install -qqq requests tqdm 2>/dev/null && python3 outline_export_fixed.py "$@"' _ "$@"
echo ""
# Create Python script for side-by-side tree comparison
cat > "$WORK_DIR/.tree_compare.py" << 'PYTHON_SCRIPT'
#!/usr/bin/env python3
"""
Side-by-side comparison of Outline online vs exported files.
Matches documents row by row and highlights differences.
"""
import sys
import re
import shutil
import requests
from pathlib import Path
# Colors
GREEN = '\033[0;32m'
RED = '\033[0;31m'
YELLOW = '\033[1;33m'
BLUE = '\033[0;34m'
CYAN = '\033[0;36m'
BOLD = '\033[1m'
DIM = '\033[2m'
RESET = '\033[0m'
def get_terminal_width():
    """Return the terminal width in columns, falling back to 120.

    shutil.get_terminal_size() already has its own (80, 24) fallback; the
    try/except only guards against unexpected environment errors.
    """
    try:
        return shutil.get_terminal_size().columns
    except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt/SystemExit
        return 120
def normalize_filename(name):
    """Normalize a document title for comparison.

    Replaces the characters that filesystems disallow (/ \\ : * ? " < > |)
    with underscores, then strips surrounding whitespace — mirroring how
    the exporter sanitizes titles into filenames.
    """
    unsafe = '/\\:*?"<>|'
    table = str.maketrans({ch: '_' for ch in unsafe})
    return name.translate(table).strip()
def get_online_docs(api_url, api_token):
    """Fetch all documents from the Outline API, organized by collection.

    Args:
        api_url: base URL of the Outline instance (no trailing slash).
        api_token: API token sent as a Bearer credential.

    Returns:
        dict mapping collection name -> list of dicts with keys
        'title', 'normalized', 'has_children', 'updatedAt'.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }
    response = requests.post(f"{api_url}/api/collections.list", headers=headers, json={})
    collections = response.json().get("data", [])
    collections = sorted(collections, key=lambda c: c.get('name', ''))
    # Build collection ID to name mapping
    coll_id_to_name = {c['id']: c['name'] for c in collections}

    # Fetch ALL documents with timestamps, paginating documents.list —
    # a single request with a large limit silently truncates at the
    # server's page cap, dropping timestamps for the remaining docs.
    all_docs = []
    offset = 0
    page_size = 100
    while True:
        resp = requests.post(
            f"{api_url}/api/documents.list",
            headers=headers,
            json={"limit": page_size, "offset": offset}
        )
        page = resp.json().get("data", [])
        all_docs.extend(page)
        if len(page) < page_size:  # short page => last page
            break
        offset += page_size

    # Timestamp lookup keyed by (collection_name, normalized_title)
    timestamp_lookup = {}
    for doc in all_docs:
        coll_name = coll_id_to_name.get(doc.get("collectionId"), "Unknown")
        norm_title = normalize_filename(doc.get("title", "Untitled"))
        timestamp_lookup[(coll_name, norm_title)] = doc.get("updatedAt")

    result = {}
    for coll in collections:
        coll_name = coll['name']
        # Get the navigation tree (document hierarchy) for this collection
        nav_response = requests.post(
            f"{api_url}/api/collections.documents",
            headers=headers,
            json={"id": coll["id"]}
        )
        nav_tree = nav_response.json().get("data", [])

        def collect_docs(nodes):
            # Depth-first flatten of the navigation tree.
            docs = []
            for node in nodes:
                title = node.get("title", "Untitled")
                norm_title = normalize_filename(title)
                has_children = len(node.get("children", [])) > 0
                docs.append({
                    'title': title,
                    'normalized': norm_title,
                    'has_children': has_children,
                    'updatedAt': timestamp_lookup.get((coll_name, norm_title))
                })
                if has_children:
                    docs.extend(collect_docs(node.get("children", [])))
            return docs

        result[coll_name] = collect_docs(nav_tree)
    return result
def get_export_docs(export_dir):
    """Scan the export directory and return exported docs per collection.

    Returns:
        dict mapping collection directory name -> list of dicts with keys
        'title', 'normalized', 'path', 'mtime'. Empty dict if the export
        directory does not exist.
    """
    import os
    root = Path(export_dir)
    if not root.exists():
        return {}
    collections = {}
    for coll_dir in sorted(root.iterdir()):
        if not coll_dir.is_dir():
            continue
        entries = []
        for md_file in sorted(coll_dir.glob("*.md")):
            title = md_file.stem
            if not title:  # skip empty filenames
                continue
            entries.append({
                'title': title,
                'normalized': normalize_filename(title),
                'path': md_file,
                'mtime': os.path.getmtime(md_file),
            })
        collections[coll_dir.name] = entries
    return collections
def match_and_compare(online_docs, export_docs):
    """Pair online and exported documents per collection.

    Args:
        online_docs: {collection: [doc dicts with 'title'/'normalized'/
            'has_children'/'updatedAt']} from the API.
        export_docs: {collection: [doc dicts with 'title'/'normalized'/
            'path'/'mtime']} from disk.

    Returns:
        list of {'collection', 'rows', 'online_count', 'export_count'},
        each row tagged 'match'/'missing'/'extra' plus a freshness flag
        for matches ('stale' when the file is older than the online doc).
    """
    from datetime import datetime

    STALE_TOLERANCE = 60  # seconds of clock skew tolerated before 'stale'
    STATUS_ORDER = {'match': 0, 'missing': 1, 'extra': 2}

    def freshness_of(online_doc, export_doc):
        # Export file must be no older than the online update (minus tolerance).
        updated = online_doc.get('updatedAt')
        mtime = export_doc.get('mtime')
        if updated and mtime:
            online_ts = datetime.fromisoformat(updated.replace('Z', '+00:00')).timestamp()
            if mtime < online_ts - STALE_TOLERANCE:
                return 'stale'
        return 'current'

    comparison = []
    for coll_name in sorted(set(online_docs) | set(export_docs)):
        online_list = online_docs.get(coll_name, [])
        export_list = export_docs.get(coll_name, [])
        by_norm = {d['normalized']: d for d in export_list}

        rows = []
        seen = set()
        # First pass: walk online docs and look for their exported file.
        for doc in sorted(online_list, key=lambda d: d['title'].lower()):
            partner = by_norm.get(doc['normalized'])
            if partner is not None:
                seen.add(doc['normalized'])
                rows.append({
                    'online': doc['title'],
                    'export': partner['title'],
                    'status': 'match',
                    'is_folder': doc['has_children'],
                    'freshness': freshness_of(doc, partner),
                })
            else:
                rows.append({
                    'online': doc['title'],
                    'export': None,
                    'status': 'missing',
                    'is_folder': doc['has_children'],
                    'freshness': None,
                })
        # Second pass: exported files with no online counterpart.
        for doc in sorted(export_list, key=lambda d: d['title'].lower()):
            if doc['normalized'] not in seen:
                rows.append({
                    'online': None,
                    'export': doc['title'],
                    'status': 'extra',
                    'is_folder': False,
                    'freshness': None,
                })

        # Matched rows first, then missing, then extra; alphabetical within.
        rows.sort(key=lambda r: (
            STATUS_ORDER[r['status']],
            (r['online'] or r['export'] or '').lower(),
        ))
        comparison.append({
            'collection': coll_name,
            'rows': rows,
            'online_count': len(online_list),
            'export_count': len(export_list),
        })
    return comparison
def print_comparison(comparison):
    """Print the side-by-side comparison with status indicators.

    Args:
        comparison: output of match_and_compare() — a list of per-collection
            dicts with 'collection', 'rows', 'online_count', 'export_count'.

    Outputs: a two-column table (ONLINE | EXPORTED) to stdout with per-row
    match/missing/extra markers, plus an overall summary.
    """
    term_width = get_terminal_width()
    col_width = (term_width - 10) // 2  # -10 for separators and status icons
    total_online = 0
    total_export = 0
    total_matched = 0
    total_missing = 0
    total_extra = 0
    total_stale = 0
    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}{CYAN}{'ONLINE':<{col_width}} {'':5} {'EXPORTED':<{col_width}}{RESET}")
    print(f"{BLUE}{'═' * term_width}{RESET}")
    for coll in comparison:
        total_online += coll['online_count']
        total_export += coll['export_count']
        # Per-collection tallies feeding both the header mark and the summary
        coll_matched = sum(1 for r in coll['rows'] if r['status'] == 'match')
        coll_missing = sum(1 for r in coll['rows'] if r['status'] == 'missing')
        coll_extra = sum(1 for r in coll['rows'] if r['status'] == 'extra')
        coll_stale = sum(1 for r in coll['rows'] if r.get('freshness') == 'stale')
        total_matched += coll_matched
        total_missing += coll_missing
        total_extra += coll_extra
        total_stale += coll_stale
        # A collection is "clean" when nothing is missing or extra
        # (stale counts only as a warning, not a failure)
        if coll_missing == 0 and coll_extra == 0:
            coll_status = f"{GREEN}✓{RESET}"
        else:
            coll_status = f"{RED}✗{RESET}"
        header = f"{coll['collection']}/ ({coll['online_count']} → {coll['export_count']})"
        print(f"\n{BOLD}{YELLOW}{header}{RESET} {coll_status}")
        print(f"{BLUE}{'─' * term_width}{RESET}")
        for row in coll['rows']:
            online_name = row['online'] or ''
            export_name = row['export'] or ''
            # Add folder indicator
            if row['is_folder'] and online_name:
                online_name = f"📁 {online_name}"
            # Truncate if needed
            if len(online_name) > col_width - 1:
                online_name = online_name[:col_width-4] + '...'
            if len(export_name) > col_width - 1:
                export_name = export_name[:col_width-4] + '...'
            # Status and colors
            if row['status'] == 'match':
                # Freshness indicator
                if row.get('freshness') == 'stale':
                    freshness = f"{YELLOW}●{RESET}"
                else:
                    freshness = f"{GREEN}●{RESET}"
                status = f"{GREEN}✓{RESET}{freshness}"
                left = f"{online_name}"
                right = f"{export_name}"
            elif row['status'] == 'missing':
                status = f"{RED}✗{RESET} "
                left = f"{RED}{online_name}{RESET}"
                right = f"{DIM}---{RESET}"
            else:  # extra
                status = f"{YELLOW}+{RESET} "
                left = f"{DIM}---{RESET}"
                right = f"{YELLOW}{export_name}{RESET}"
            # Calculate visible width (without ANSI codes) so padding lines up
            # even though the strings carry invisible escape sequences.
            def visible_len(s):
                return len(re.sub(r'\033\[[0-9;]*m', '', s))
            left_pad = col_width - visible_len(left)
            # right_pad is computed but unused — right column is left-ragged
            right_pad = col_width - visible_len(right)
            print(f" {left}{' ' * max(0, left_pad)} {status} {right}")
    # Summary
    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}SUMMARY:{RESET}")
    print(f" Online: {total_online} documents")
    print(f" Exported: {total_export} documents")
    print(f" {GREEN}✓● Matched & current: {total_matched - total_stale}{RESET}")
    if total_stale > 0:
        print(f" {YELLOW}✓● Matched but stale: {total_stale} (export older than online){RESET}")
    if total_missing > 0:
        print(f" {RED}✗ Missing: {total_missing} (online but not exported){RESET}")
    if total_extra > 0:
        print(f" {YELLOW}+ Extra: {total_extra} (exported but not online){RESET}")
    if total_missing == 0 and total_extra == 0 and total_stale == 0:
        print(f"\n{GREEN}✓ All documents exported and current!{RESET}")
    elif total_missing == 0 and total_extra == 0:
        print(f"\n{YELLOW}⚠ All documents exported but {total_stale} are stale{RESET}")
    print()
def get_latest_changes(api_url, api_token, limit=3):
    """Fetch the most recently updated documents.

    Args:
        api_url: base URL of the Outline instance.
        api_token: API token sent as a Bearer credential.
        limit: number of recent documents to return (default 3).

    Returns:
        list of dicts with 'title', 'collection', 'updatedAt', 'normalized'.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }
    response = requests.post(
        f"{api_url}/api/documents.list",
        headers=headers,
        json={
            "sort": "updatedAt",
            "direction": "DESC",
            "limit": limit
        }
    )
    docs = response.json().get("data", [])

    # Cache collectionId -> name so repeated docs from the same collection
    # don't each trigger a separate collections.info round-trip.
    coll_name_cache = {}

    def collection_name(coll_id):
        if not coll_id:
            return "Unknown"
        if coll_id not in coll_name_cache:
            coll_response = requests.post(
                f"{api_url}/api/collections.info",
                headers=headers,
                json={"id": coll_id}
            )
            coll_data = coll_response.json().get("data", {})
            coll_name_cache[coll_id] = coll_data.get("name", "Unknown")
        return coll_name_cache[coll_id]

    result = []
    for doc in docs:
        title = doc.get("title", "Untitled")
        result.append({
            'title': title,
            'collection': collection_name(doc.get("collectionId")),
            'updatedAt': doc.get("updatedAt"),
            'normalized': normalize_filename(title)
        })
    return result
def find_export_file(export_dir, collection, normalized_title):
    """Find the exported .md file matching a document, or None.

    Tries the document's own collection directory first, then falls back to
    scanning every collection directory (covers collection-name mismatches).

    Args:
        export_dir: root of the export tree.
        collection: collection name (expected subdirectory name).
        normalized_title: title already passed through normalize_filename().
    """
    export_path = Path(export_dir)
    if not export_path.exists():
        # Guard: iterdir() on a missing directory raises FileNotFoundError
        return None
    preferred = export_path / collection

    def scan(directory):
        for md_file in directory.glob("*.md"):
            if normalize_filename(md_file.stem) == normalized_title:
                return md_file
        return None

    # Try exact collection match first
    if preferred.exists():
        hit = scan(preferred)
        if hit:
            return hit
    # Fall back to all collections, skipping the one already scanned
    for coll_dir in export_path.iterdir():
        if coll_dir.is_dir() and coll_dir != preferred:
            hit = scan(coll_dir)
            if hit:
                return hit
    return None
def print_latest_changes(latest_docs, export_dir):
    """Print the latest changes section.

    For each recently updated online document, show its online timestamp
    next to the exported file's mtime so the user can spot-check that the
    export is current.

    Args:
        latest_docs: output of get_latest_changes().
        export_dir: root of the export tree searched by find_export_file().
    """
    term_width = get_terminal_width()
    from datetime import datetime
    import os
    print(f"\n{BLUE}{'═' * term_width}{RESET}")
    print(f"{BOLD}{CYAN}LATEST CHANGES (verify actuality){RESET}")
    print(f"{BLUE}{'─' * term_width}{RESET}")
    for i, doc in enumerate(latest_docs, 1):
        title = doc['title']
        collection = doc['collection']
        updated_at = doc['updatedAt']
        # Parse online timestamp
        if updated_at:
            # Handle ISO format with timezone ('Z' suffix -> +00:00)
            online_dt = datetime.fromisoformat(updated_at.replace('Z', '+00:00'))
            online_str = online_dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            online_str = "Unknown"
        # Find export file
        export_file = find_export_file(export_dir, collection, doc['normalized'])
        if export_file and export_file.exists():
            export_mtime = os.path.getmtime(export_file)
            # NOTE(review): mtime rendered in local time while the online
            # timestamp is UTC — displayed strings may differ by the UTC
            # offset even when the comparison below says "current".
            export_dt = datetime.fromtimestamp(export_mtime)
            export_str = export_dt.strftime("%Y-%m-%d %H:%M:%S")
            # Compare (export should be same time or newer)
            if updated_at:
                # Convert online to local timestamp for comparison
                online_ts = online_dt.timestamp()
                if export_mtime >= online_ts - 60:  # Allow 60s tolerance
                    status = f"{GREEN}✓{RESET}"
                else:
                    status = f"{YELLOW}⚠ older{RESET}"
            else:
                status = f"{GREEN}✓{RESET}"
        else:
            export_str = "NOT FOUND"
            status = f"{RED}✗{RESET}"
        # Print entry
        print(f"\n {BOLD}{i}. {title}{RESET}")
        print(f" {DIM}Collection:{RESET} {collection}")
        print(f" {DIM}Online:{RESET} {online_str}")
        print(f" {DIM}Exported:{RESET} {export_str} {status}")
    print(f"\n{BLUE}{'═' * term_width}{RESET}")
def main():
    """CLI entry point: compare online vs exported docs and report freshness."""
    if len(sys.argv) != 4:
        print("Usage: script.py <API_URL> <API_TOKEN> <EXPORT_DIR>")
        sys.exit(1)
    api_url, api_token, export_dir = sys.argv[1:4]

    # Build the side-by-side comparison from both sources and print it.
    comparison = match_and_compare(
        get_online_docs(api_url, api_token),
        get_export_docs(export_dir),
    )
    print_comparison(comparison)

    # Spot-check the three most recently edited documents for freshness.
    print_latest_changes(get_latest_changes(api_url, api_token, limit=3), export_dir)


if __name__ == "__main__":
    main()
PYTHON_SCRIPT
# Run the side-by-side tree comparison (use /work/outline_export as container path).
# URL and token are handed to the inner bash as positional parameters instead of
# being interpolated into the -c string, so a value containing quotes or spaces
# cannot break (or inject into) the command line.
rv=0
docker run --rm --network domnet \
  --user "$(id -u):$(id -g)" \
  -e HOME=/tmp \
  -v "$WORK_DIR:/work" \
  -w /work \
  python:3.11-slim \
  bash -c 'pip install -qqq requests 2>/dev/null && python3 /work/.tree_compare.py "$@"' \
  _ "$API_URL" "$API_TOKEN" /work/outline_export || rv=$?

# Cleanup runs even when the comparison failed (set -e would otherwise skip it);
# the failure status is propagated afterwards.
rm -f "$WORK_DIR/.tree_compare.py"
if [ "$rv" -ne 0 ]; then
  exit "$rv"
fi
echo ""