commit d9161f64f54d6430ca3763e3ef379333f8793a0c Author: Claude Date: Mon Jan 19 22:33:55 2026 +0100 Initial commit: Export tools and import script requirements - export_with_trees.sh: Bash wrapper for Outline export - outline_export_fixed.py: Python export implementation - IMPORT_SCRIPT.MD: PRD for import script (to be built) - RALPH_PROMPT.md: Ralph Loop prompt for building import script - CLAUDE.md: Project documentation Co-Authored-By: Claude Opus 4.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a77928f --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# Secrets +settings.json + +# Export data (may contain sensitive content) +outline_export/ + +# Backups +outline_backup_*.tar.gz + +# Python +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ + +# Ralph Loop state +.claude/ + +# IDE +.vscode/ +.idea/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..7385a7c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,94 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a tool for exporting Outline wiki data via API. The script runs inside a Docker container on the `domnet` network to bypass Authentik SSO authentication and access the internal Outline API directly (`http://outline:3000`). 
+ +## Usage + +```bash +# Run the export with tree visualization +./export_with_trees.sh + +# Preview without exporting (dry run) +./export_with_trees.sh --dry-run + +# Run with verbose output +./export_with_trees.sh -v +``` + +### CLI Options +``` +--dry-run, -n Preview what would be exported without writing files +--output, -o DIR Output directory (overrides settings.json) +--verbose, -v Increase verbosity (-vv for debug) +--skip-verify Skip post-export verification +--skip-health-check Skip pre-export health check +--settings FILE Path to settings file (default: settings.json) +``` + +### Running the Python Export Directly +```bash +docker run --rm --network domnet \ + -v "$(pwd):/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -q requests tqdm && python3 outline_export_fixed.py" + +# With options +docker run --rm --network domnet \ + -v "$(pwd):/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -q requests tqdm && python3 outline_export_fixed.py --dry-run" +``` + +## Architecture + +### Docker Network Integration +- Script runs in Docker container attached to `domnet` bridge network +- Direct API access to `http://outline:3000` (internal) bypasses SSO +- Uses `python:3.11-slim` image with `requests` and `tqdm` dependencies + +### Export Flow +1. Fetch collections via `/api/collections.list` +2. Get navigation tree via `/api/collections.documents` (source of truth for hierarchy) +3. Fetch full document content via `/api/documents.info` (with caching) +4. Export recursively maintaining parent-child structure +5. Save metadata (`_collection_metadata.json`) per collection +6. 
Generate manifest with checksums for verification + +### Key Files +- `export_with_trees.sh` - Main export script with tree visualization +- `outline_export_fixed.py` - Core export logic with `OutlineExporter` class +- `settings.json` - API URL and token configuration (contains secrets) +- `outline_export/` - Output directory with markdown files and metadata +- `outline_backup_*.tar.gz` - Timestamped compressed backups + +### Configuration +Settings are in `settings.json`: +- `source.url` - Internal Docker URL (`http://outline:3000`) +- `source.token` - Outline API token +- `export.output_directory` - Output path (default: `outline_export`) +- `advanced.max_hierarchy_depth` - Prevent infinite recursion (default: 100) + +## Important Notes + +### Security +- `settings.json` contains API token - never commit to git +- Backup files may contain sensitive wiki content + +### Backup System +- Each export automatically backs up previous exports to `outline_backup_YYYYMMDD_HHMMSS.tar.gz` +- Old uncompressed export directory is deleted after backup +- Backups achieve 90%+ compression on markdown content + +### Reliability Features +- **Health check**: Verifies API connectivity before export +- **Retry logic**: Failed API requests retry up to 3 times with exponential backoff +- **Logging**: Structured logging with configurable verbosity levels + +### Document Counting +The navigation tree (`/api/collections.documents`) is the source of truth for document hierarchy. Document counting is recursive to include all nested children. diff --git a/IMPORT_SCRIPT.MD b/IMPORT_SCRIPT.MD new file mode 100644 index 0000000..fbde812 --- /dev/null +++ b/IMPORT_SCRIPT.MD @@ -0,0 +1,514 @@ +# Outline Import Script - Product Requirements Document + +**Document Version:** 1.0 +**Created:** 2026-01-17 +**Last Updated:** 2026-01-19 +**Status:** Draft + +--- + +## 1. 
Executive Summary + +Create `import_to_outline.sh` - a companion script to the existing export tool that imports markdown files back into Outline. The script restores documents with their full hierarchy using metadata preserved during export, enabling disaster recovery, migration between Outline instances, and content restoration workflows. + +--- + +## 2. Problem Statement + +### Current State +- Export functionality exists via `export_with_trees.sh` and `outline_export_fixed.py` +- Exports include markdown content and `_collection_metadata.json` with full hierarchy +- No automated way to restore or migrate exported content back into Outline + +### Pain Points +1. **Disaster Recovery**: Manual recreation of collections and documents after data loss +2. **Migration**: No tooling to move content between Outline instances +3. **Restore Workflow**: Cannot selectively restore deleted documents or collections +4. **Testing**: No way to verify export integrity via round-trip import + +### Business Impact +- Hours of manual work to rebuild wiki structure after incidents +- Risk of hierarchy/relationship loss during manual restoration +- No confidence in backup validity without restore testing + +--- + +## 3. Goals & Success Criteria + +### Primary Goals +1. Restore exported markdown files to Outline with preserved hierarchy +2. Support both full restore and selective import workflows +3. Provide clear feedback on import progress and results + +### Success Criteria +| Metric | Target | +|--------|--------| +| Document import success rate | ≥99% | +| Hierarchy accuracy | 100% parent-child relationships preserved | +| Performance | ≥10 documents/second | +| Dry-run accuracy | 100% match between preview and actual import | + +### Non-Goals +- Image/attachment import (future enhancement) +- Conflict resolution with existing content (skip or fail) +- Real-time sync between instances +- User/permission migration + +--- + +## 4. 
User Stories + +### US-1: Disaster Recovery +> As an **administrator**, I want to **restore all collections from a backup** so that **I can recover from data loss**. + +**Acceptance Criteria:** +- Import all collections from `outline_export/` directory +- Recreate exact hierarchy as shown in metadata +- Report success/failure summary + +### US-2: Selective Restoration +> As a **user**, I want to **import a single collection** so that **I can restore specific content without affecting other data**. + +**Acceptance Criteria:** +- Specify source directory containing single collection +- Create collection if it doesn't exist +- Skip import if collection already exists (configurable) + +### US-3: Migration to New Instance +> As an **administrator**, I want to **import all content into a fresh Outline instance** so that **I can migrate to new infrastructure**. + +**Acceptance Criteria:** +- Works against empty Outline instance +- Creates all collections and documents +- Preserves document nesting structure + +### US-4: Safe Preview +> As a **user**, I want to **preview what will be imported** so that **I can verify before making changes**. + +**Acceptance Criteria:** +- `--dry-run` flag shows all planned operations +- No API calls that modify data during dry run +- Output matches actual import behavior + +### US-5: Consolidated Import +> As a **user**, I want to **import multiple collections into a single new collection** so that **I can reorganize content during import**. + +**Acceptance Criteria:** +- `--single` mode creates timestamped collection +- Original collection names become top-level documents +- All nested hierarchy preserved under these parents + +--- + +## 5. 
Functional Requirements + +### 5.1 Import Modes + +#### Mode 1: Collection-per-Folder (Default) +``` +outline_export/ +├── Bewerbungen/ → Creates "Bewerbungen" collection +├── Projekte/ → Creates "Projekte" collection +└── Privat/ → Creates "Privat" collection +``` + +**Behavior:** +- Each subdirectory in source becomes a separate collection +- Collection names match folder names exactly +- If collection exists: skip entire collection (default) or error + +#### Mode 2: Single Collection (`--single`) +``` +outline_export/ +├── Bewerbungen/ → Becomes parent doc "Bewerbungen" +├── Projekte/ → Becomes parent doc "Projekte" +└── Privat/ → Becomes parent doc "Privat" + +All imported into: "import_20260119_143052" collection +``` + +**Behavior:** +- Creates one collection named `import_YYYYMMDD_HHMMSS` +- Each original collection folder becomes a top-level parent document +- Original document hierarchy nested under these parents + +### 5.2 Command-Line Interface + +```bash +./import_to_outline.sh [OPTIONS] + +Options: + -s, --single Import all into single timestamped collection + -n, --dry-run Preview operations without making changes + -d, --source DIR Source directory (default: outline_export) + -v, --verbose Increase output verbosity (-vv for debug) + -f, --force Overwrite existing collections (instead of skip) + --settings FILE Path to settings file (default: settings.json) + -h, --help Show help message +``` + +### 5.3 Document Creation Logic + +#### Hierarchy Reconstruction Algorithm +``` +1. Load _collection_metadata.json +2. Build document tree from `documents` array (using parent_id) +3. Topological sort: ensure parents created before children +4. For each document in sorted order: + a. Read markdown content from file + b. Map old parent_id → new parent_id (from creation responses) + c. Create document via API with parentDocumentId + d. Store id mapping: old_id → new_id +5. 
Verify: created count matches expected count +``` + +#### ID Mapping Example +``` +Export metadata: + doc_A (id: abc-123, parent_id: null) + doc_B (id: def-456, parent_id: abc-123) + +After creating doc_A: + id_map = { "abc-123": "new-789" } + +Creating doc_B: + parent_id = id_map["abc-123"] = "new-789" + API call: create doc_B with parentDocumentId: "new-789" +``` + +### 5.4 Duplicate Handling + +| Scenario | Default Behavior | With `--force` | +|----------|------------------|----------------| +| Collection exists | Skip entire collection | Delete and recreate | +| Document title exists in collection | Skip document | Update document | + +### 5.5 Error Handling + +| Error Type | Behavior | +|------------|----------| +| API connection failure | Abort with error message | +| Collection creation fails | Abort that collection, continue others | +| Document creation fails | Log error, skip that document's children (see Section 7.3), continue with siblings | +| Missing markdown file | Log warning, skip document | +| Invalid metadata JSON | Abort that collection | +| Parent document not found | Create as root-level document | + +--- + +## 6. 
Technical Design + +### 6.1 Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ import_to_outline.sh │ +│ (Bash wrapper - Docker execution, backup, UI) │ +└─────────────────────────┬───────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ outline_import.py │ +│ (Python core - API calls, hierarchy logic) │ +└─────────────────────────┬───────────────────────────────────┘ + │ + ┌─────────────┼─────────────┐ + ▼ ▼ ▼ + ┌──────────────┐ ┌──────────┐ ┌──────────────┐ + │ settings.json│ │ metadata │ │ Outline API │ + │ (API config) │ │ .json │ │ (HTTP POST) │ + └──────────────┘ └──────────┘ └──────────────┘ +``` + +### 6.2 API Endpoints + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/api/collections.list` | POST | Check existing collections | +| `/api/collections.create` | POST | Create new collection | +| `/api/collections.delete` | POST | Delete collection (--force mode) | +| `/api/documents.create` | POST | Create document with content | +| `/api/documents.list` | POST | Check existing documents | +| `/api/documents.update` | POST | Update document (--force mode) | + +### 6.3 API Request Examples + +#### Create Collection +```json +POST /api/collections.create +{ + "name": "Bewerbungen", + "permission": "read_write" +} +``` + +#### Create Document +```json +POST /api/documents.create +{ + "collectionId": "col-uuid-here", + "title": "DORA Metrics (Top 4)", + "text": "# DORA Metrics\n\nContent here...", + "parentDocumentId": "parent-uuid-or-null", + "publish": true +} +``` + +### 6.4 Data Structures + +#### Input: `_collection_metadata.json` +```json +{ + "id": "original-collection-uuid", + "name": "Bewerbungen", + "directory": "Bewerbungen", + "expected_count": 11, + "documents": [ + { + "id": "doc-uuid", + "title": "Document Title", + "filename": "Document Title.md", + "parent_id": "parent-uuid-or-null", + "checksum": "sha256-hash", + "children": [...] 
+ } + ] +} +``` + +#### Runtime: ID Mapping +```python +id_map: Dict[str, str] = { + "old-uuid-1": "new-uuid-1", + "old-uuid-2": "new-uuid-2" +} +``` + +### 6.5 Docker Execution + +```bash +docker run --rm --network domnet \ + --user "$(id -u):$(id -g)" \ + -e HOME=/tmp \ + -v "$WORK_DIR:/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -qqq requests 2>/dev/null && \ + python3 outline_import.py $CLI_ARGS" +``` + +--- + +## 7. User Interface + +### 7.1 Progress Output + +``` +════════════════════════════════════════════════════════════ + OUTLINE IMPORT +════════════════════════════════════════════════════════════ + +Source: outline_export/ +Target: http://outline:3000 +Mode: Collection per folder + +Checking API connectivity... ✓ + +Bewerbungen/ (11 documents) + Creating collection... ✓ (id: 7f3a...) + ├── CV.md ✓ created + ├── DORA Metrics (Top 4).md ✓ created + ├── Tipico.md ✓ created + │ ├── Pitch Tipico.md ✓ created + │ ├── Fragen 3. Runde.md ✓ created + │ ├── Tipico 3rd Party.md ✓ created + │ └── Tipico Top 10 Functions.md ✓ created + └── Ihre PVS.md ✓ created + ├── Mobilepass.md ✓ created + ├── PVS erster Call.md ✓ created + └── Fragen Dirk.md ✓ created + +Projekte/ (8 documents) + Collection exists, skipping... 
+ +════════════════════════════════════════════════════════════ +SUMMARY +════════════════════════════════════════════════════════════ + Collections: 1 created, 1 skipped, 0 errors + Documents: 11 created, 0 skipped, 0 errors + Duration: 2.3 seconds +════════════════════════════════════════════════════════════ +``` + +### 7.2 Dry-Run Output + +``` +════════════════════════════════════════════════════════════ + OUTLINE IMPORT (DRY RUN) +════════════════════════════════════════════════════════════ + +Source: outline_export/ +Target: http://outline:3000 +Mode: Collection per folder + +[DRY RUN] No changes will be made + +Bewerbungen/ (11 documents) + [DRY RUN] Would create collection "Bewerbungen" + [DRY RUN] Would create 11 documents: + ├── CV.md + ├── DORA Metrics (Top 4).md + ├── Tipico.md + │ ├── Pitch Tipico.md + │ └── ... + └── ... + +Projekte/ (8 documents) + [DRY RUN] Collection exists - would skip + +════════════════════════════════════════════════════════════ +DRY RUN SUMMARY +════════════════════════════════════════════════════════════ + Would create: 1 collection, 11 documents + Would skip: 1 collection (exists) +════════════════════════════════════════════════════════════ +``` + +### 7.3 Error Output + +``` +Bewerbungen/ (11 documents) + Creating collection... ✓ + ├── CV.md ✓ created + ├── Missing Doc.md ✗ file not found + └── Tipico.md ✗ API error: 500 + └── (children skipped due to parent failure) +``` + +--- + +## 8. 
Configuration + +### 8.1 settings.json (shared with export) + +```json +{ + "source": { + "url": "http://outline:3000", + "token": "ol_api_xxxxxxxxxxxx" + }, + "import": { + "source_directory": "outline_export", + "on_collection_exists": "skip", + "on_document_exists": "skip", + "default_permission": "read_write" + }, + "advanced": { + "request_timeout": 30, + "retry_attempts": 3, + "retry_delay": 1.0, + "rate_limit_delay": 0.1 + } +} +``` + +### 8.2 Configuration Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `import.source_directory` | string | `outline_export` | Default source path | +| `import.on_collection_exists` | enum | `skip` | `skip`, `error`, `merge` | +| `import.on_document_exists` | enum | `skip` | `skip`, `error`, `update` | +| `import.default_permission` | enum | `read_write` | `read`, `read_write` | +| `advanced.request_timeout` | int | 30 | API timeout in seconds | +| `advanced.retry_attempts` | int | 3 | Retries on failure | +| `advanced.retry_delay` | float | 1.0 | Delay between retry attempts | +| `advanced.rate_limit_delay` | float | 0.1 | Delay between API calls | + +--- + +## 9. 
Testing Strategy + +### 9.1 Test Cases + +| ID | Category | Test Case | Expected Result | +|----|----------|-----------|-----------------| +| T1 | Happy Path | Import single collection | Collection + docs created | +| T2 | Happy Path | Import multiple collections | All collections created | +| T3 | Happy Path | Import nested hierarchy (3+ levels) | All parent-child relationships preserved | +| T4 | Duplicate | Collection already exists | Skip collection | +| T5 | Duplicate | Document title exists | Skip document | +| T6 | Error | Missing markdown file | Log warning, continue | +| T7 | Error | Invalid metadata JSON | Abort collection | +| T8 | Error | API unreachable | Abort with clear error | +| T9 | Mode | --single flag | Single timestamped collection | +| T10 | Mode | --dry-run flag | No API mutations | +| T11 | Mode | --force flag | Overwrites existing | +| T12 | Edge | Empty collection | Create empty collection | +| T13 | Edge | Special chars in title | Handled correctly | +| T14 | Edge | Very large document | Imported successfully | + +### 9.2 Verification Methods + +1. **Round-trip Test**: Export → Import to test instance → Export again → Compare checksums +2. **API Verification**: Query created documents and verify parent relationships +3. **Manual Inspection**: Spot-check imported content in Outline UI + +--- + +## 10. Rollback & Recovery + +### Pre-Import Safety +- No automatic backup (user should have export as backup) +- `--dry-run` always available to preview + +### Rollback Procedure +If import fails partway through: +1. Note which collections were created (from import log) +2. Manually delete partial collections via Outline UI or API +3. Fix issue and re-run import + +### Future Enhancement +- `--backup` flag to export existing content before import +- Transaction-like behavior: delete partially-imported collection on failure + +--- + +## 11. 
Future Enhancements + +| Priority | Enhancement | Description | +|----------|-------------|-------------| +| P1 | Attachment support | Import images and file attachments | +| P1 | Merge mode | Add documents to existing collections | +| P2 | Selective import | Import specific documents by path/pattern | +| P2 | Update mode | Update existing documents with new content | +| P3 | User mapping | Preserve authorship via user email mapping | +| P3 | Permission sync | Restore document-level permissions | +| P3 | Incremental import | Only import new/changed documents | + +--- + +## 12. Implementation Checklist + +- [ ] Create `outline_import.py` with core import logic +- [ ] Create `import_to_outline.sh` bash wrapper +- [ ] Implement collection creation +- [ ] Implement document creation with hierarchy +- [ ] Implement ID mapping for parent references +- [ ] Add `--dry-run` mode +- [ ] Add `--single` mode +- [ ] Add `--force` mode +- [ ] Add progress visualization +- [ ] Add error handling and reporting +- [ ] Add retry logic for API failures +- [ ] Update settings.json schema +- [ ] Write tests +- [ ] Update CLAUDE.md documentation + +--- + +## 13. References + +- **Export Script**: `export_with_trees.sh`, `outline_export_fixed.py` +- **Outline API Docs**: https://www.getoutline.com/developers +- **Metadata Format**: See `outline_export/*/_collection_metadata.json` +- **Settings Format**: See `settings.json` diff --git a/RALPH_PROMPT.md b/RALPH_PROMPT.md new file mode 100644 index 0000000..763a66e --- /dev/null +++ b/RALPH_PROMPT.md @@ -0,0 +1,202 @@ +# Outline Import Script - Ralph Loop Prompt + +## Your Mission + +Build `import_to_outline.sh` and `outline_import.py` - companion scripts to the existing export tools that import markdown files back into Outline wiki. + +**Requirements Document:** Read `IMPORT_SCRIPT.MD` for full specifications. + +**Reference Implementation:** Study `export_with_trees.sh` and `outline_export_fixed.py` for patterns. 
+ +--- + +## Iteration Protocol + +Each iteration, follow this cycle: + +### 1. Assess Current State +```bash +# Check what exists +ls -la *.sh *.py 2>/dev/null +git status +git log --oneline -5 2>/dev/null || echo "No git history" +``` + +### 2. Read Requirements (if needed) +- Review `IMPORT_SCRIPT.MD` for specifications +- Review `outline_export_fixed.py` for API patterns and settings.json structure +- Review `settings.json` for configuration format + +### 3. Implement Next Phase +Work on the current phase until complete, then move to the next. + +### 4. Test Your Work +- Run syntax checks: `python3 -m py_compile outline_import.py` +- Run bash checks: `bash -n import_to_outline.sh` +- Test `--help` output +- Test `--dry-run` mode against `outline_export/` directory + +### 5. Commit Progress +```bash +git add -A && git commit -m "Phase X: description" +``` + +--- + +## Implementation Phases + +### Phase 1: Core Python Structure +Create `outline_import.py` with: +- [ ] `OutlineImporter` class with settings loading (copy pattern from `outline_export_fixed.py`) +- [ ] API helper methods: `_api_request()`, `_get_collections()`, `_create_collection()` +- [ ] Argument parsing with all CLI options from spec +- [ ] Basic logging setup + +**Verification:** `python3 -m py_compile outline_import.py` passes + +### Phase 2: Metadata Loading +- [ ] Load `_collection_metadata.json` from each collection directory +- [ ] Build document tree from `documents` array +- [ ] Implement topological sort for parent-before-child ordering +- [ ] Handle missing/invalid metadata gracefully + +**Verification:** Can parse metadata from `outline_export/*/` + +### Phase 3: Collection Import Logic +- [ ] Check if collection exists via `/api/collections.list` +- [ ] Create collection via `/api/collections.create` +- [ ] Handle `--force` mode (delete and recreate) +- [ ] Skip existing collections by default + +**Verification:** `--dry-run` shows correct collection operations + +### Phase 4: 
Document Import with Hierarchy +- [ ] Read markdown content from files +- [ ] Create documents via `/api/documents.create` +- [ ] Maintain ID mapping: `old_id -> new_id` +- [ ] Set `parentDocumentId` using mapped IDs +- [ ] Handle missing parent (create as root-level) + +**Verification:** `--dry-run` shows correct document hierarchy + +### Phase 5: Single Collection Mode +- [ ] Implement `--single` flag +- [ ] Create timestamped collection name `import_YYYYMMDD_HHMMSS` +- [ ] Convert original collection folders to parent documents +- [ ] Preserve nested hierarchy under these parents + +**Verification:** `--dry-run --single` shows consolidated structure + +### Phase 6: Progress Visualization +- [ ] Tree-style output matching spec (├──, └──, │) +- [ ] Status indicators (✓ created, ✗ error, ○ skipped) +- [ ] Summary statistics (collections/documents created/skipped/errors) +- [ ] Duration tracking + +**Verification:** Output matches examples in IMPORT_SCRIPT.MD Section 7 + +### Phase 7: Bash Wrapper Script +Create `import_to_outline.sh` with: +- [ ] Docker execution (matching `export_with_trees.sh` pattern) +- [ ] CLI argument passthrough +- [ ] Help text +- [ ] Pre-flight checks (settings.json exists, source directory exists) + +**Verification:** `./import_to_outline.sh --help` works + +### Phase 8: Error Handling & Polish +- [ ] Retry logic for API failures (3 attempts, exponential backoff) +- [ ] Proper error messages for all failure modes +- [ ] Rate limiting delay between API calls +- [ ] Verbose/debug output levels + +**Verification:** All error scenarios from spec handled + +--- + +## Success Criteria + +All of the following must be true: + +1. **Files exist:** `import_to_outline.sh` and `outline_import.py` +2. **Syntax valid:** Both pass syntax checks without errors +3. **Help works:** `./import_to_outline.sh --help` shows usage +4. **Dry-run works:** `./import_to_outline.sh --dry-run` parses `outline_export/` and shows planned operations +5. 
**Single mode:** `./import_to_outline.sh --dry-run --single` shows consolidated import plan +6. **Matches spec:** Output format matches IMPORT_SCRIPT.MD Section 7 examples + +--- + +## Completion Signal + +When ALL success criteria are met, output: + +``` +IMPORT SCRIPT COMPLETE +``` + +**Do not output this promise until:** +- Both files exist and pass syntax checks +- `--help` displays properly +- `--dry-run` successfully parses metadata and shows planned operations +- Output format matches the specification + +--- + +## Anti-Patterns to Avoid + +1. **Don't skip phases** - Complete each phase before moving on +2. **Don't forget commits** - Commit after each successful phase +3. **Don't ignore errors** - Fix syntax/import errors before proceeding +4. **Don't deviate from spec** - Follow IMPORT_SCRIPT.MD precisely +5. **Don't over-engineer** - Implement exactly what's specified, no more + +--- + +## Helpful Context + +### API Endpoint Examples (from spec) +```python +# Create collection +POST /api/collections.create +{"name": "Bewerbungen", "permission": "read_write"} + +# Create document +POST /api/documents.create +{ + "collectionId": "col-uuid", + "title": "Document Title", + "text": "# Content\n\nMarkdown here...", + "parentDocumentId": "parent-uuid-or-null", + "publish": true +} +``` + +### Docker Execution Pattern +```bash +docker run --rm --network domnet \ + --user "$(id -u):$(id -g)" \ + -e HOME=/tmp \ + -v "$WORK_DIR:/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -qqq requests 2>/dev/null && python3 outline_import.py $CLI_ARGS" +``` + +### Settings Structure (existing in settings.json) +```json +{ + "source": { + "url": "http://outline:3000", + "token": "ol_api_xxx" + } +} +``` + +--- + +## Current Iteration + +Read the files, check git history, determine which phase you're on, and continue from there. + +If starting fresh: Begin with Phase 1. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..5d011a0 --- /dev/null +++ b/README.md @@ -0,0 +1,217 @@ +# Outline Export Tool + +Export Outline wiki data with full hierarchy and tree visualization. + +## Quick Start + +### 1. Configure Settings + +Ensure `settings.json` contains your Outline API token: +```bash +cat settings.json +``` + +### 2. Run Export + +```bash +./export_with_trees.sh +``` + +## Command Line Options + +```bash +# Standard export with tree visualization +./export_with_trees.sh + +# Preview without exporting (dry run) +./export_with_trees.sh --dry-run + +# Verbose output +./export_with_trees.sh -v + +# Debug output +./export_with_trees.sh -vv + +# Skip verification step +./export_with_trees.sh --skip-verify + +# Custom output directory +./export_with_trees.sh -o /path/to/output +``` + +### All CLI Options + +| Option | Short | Description | +|--------|-------|-------------| +| `--dry-run` | `-n` | Preview without writing files | +| `--output` | `-o` | Output directory (overrides settings) | +| `--verbose` | `-v` | Increase verbosity (-vv for debug) | +| `--skip-verify` | | Skip post-export verification | +| `--skip-health-check` | | Skip pre-export health check | +| `--settings` | | Path to settings file | + +## What It Does + +1. **Health check** - Verifies API connectivity and authentication +2. **Shows current structure** - Tree view from Outline API +3. **Backs up previous exports** - Timestamped `.tar.gz` archives +4. **Exports all documents** - With full hierarchy preserved +5. **Shows exported structure** - Tree view of files +6. 
**Verifies counts** - Compares API vs exported documents + +## Features + +- **Retry logic**: Failed API requests retry up to 3 times with exponential backoff +- **Health check**: Verifies API before starting export +- **Dry-run mode**: Preview what would be exported +- **Structured logging**: Configurable verbosity levels +- **Document caching**: Prevents duplicate API fetches +- **Checksum verification**: Ensures export integrity + +## File Structure + +``` +outline-tools/ +├── export_with_trees.sh # Main export script +├── outline_export_fixed.py # Python export logic +├── settings.json # API configuration +├── CLAUDE.md # AI assistant docs +├── README.md # This file +│ +└── Output (created after export): + ├── exports/ # Exported documents + └── outline_backup_*.tar.gz # Previous backups +``` + +## Configuration + +### settings.json + +```json +{ + "source": { + "url": "http://outline:3000", + "token": "ol_api_..." + }, + "export": { + "output_directory": "exports", + "verify_after_export": true + }, + "advanced": { + "max_hierarchy_depth": 100, + "api_timeout_seconds": 30, + "progress_bar": true, + "max_retries": 3, + "retry_backoff": 1.0 + } +} +``` + +### Configuration Options + +| Section | Key | Default | Description | +|---------|-----|---------|-------------| +| source | url | - | Outline API URL | +| source | token | - | API authentication token | +| export | output_directory | exports | Where to save files | +| export | verify_after_export | true | Run verification after export | +| advanced | max_hierarchy_depth | 100 | Prevent infinite recursion | +| advanced | api_timeout_seconds | 30 | API request timeout in seconds | +| advanced | progress_bar | true | Show progress bars | +| advanced | max_retries | 3 | API retry attempts | +| advanced | retry_backoff | 1.0 | Retry backoff multiplier | + +## How It Works + +### Docker Network Access + +The script runs in a Docker container on `domnet` to access Outline internally: + +``` +export_with_trees.sh → Docker Container (domnet) → outline:3000 +``` + +This bypasses Authentik 
SSO that would block external HTTPS requests. + +### Export Process + +1. **Health check** - Verify API connectivity +2. **Fetch collections** from Outline API +3. **Build hierarchy** from navigation tree (source of truth) +4. **Export recursively** maintaining parent-child structure +5. **Save metadata** per collection +6. **Verify** document counts and checksums + +## Output Format + +### Collection Structure +``` +exports/ +├── Collection_Name/ +│ ├── _collection_metadata.json +│ ├── Document.md +│ └── Child_Document.md +├── export_metadata.json +└── manifest.json +``` + +### Document Format +```markdown +# Document Title + + + + + + +--- + +Document content here... +``` + +## Troubleshooting + +### Health Check Fails +```bash +# Check if Outline is accessible +docker exec -it outline curl -s http://localhost:3000/api/auth.info + +# Verify API token +docker run --rm --network domnet python:3.11-slim \ + python3 -c "import requests; r=requests.post('http://outline:3000/api/auth.info', headers={'Authorization': 'Bearer YOUR_TOKEN'}); print(r.status_code)" +``` + +### Docker Permission Denied +```bash +sudo usermod -aG docker $USER +newgrp docker +``` + +### Container Not Found +```bash +# Verify Outline is running +docker ps | grep outline +``` + +### Verification Fails +```bash +# Clean start +rm -rf exports/ +./export_with_trees.sh +``` + +### API Errors +Check `exports/export_errors.json` for details on failed documents. 
+ +## Security + +- `settings.json` contains API token - never commit to git +- Backup files may contain sensitive wiki content +- Consider restricting file permissions: + ```bash + chmod 600 settings.json + chmod 700 exports/ + ``` + +--- + +**Last Updated:** 2026-01-14 diff --git a/export_with_trees.sh b/export_with_trees.sh new file mode 100755 index 0000000..1c9926d --- /dev/null +++ b/export_with_trees.sh @@ -0,0 +1,529 @@ +#!/bin/bash +# +# Outline Export Script with Tree Visualization +# Exports all Outline documents with full hierarchy and shows side-by-side tree comparison +# +# Usage: ./export_with_trees.sh [OPTIONS] +# Options are passed through to the Python script (--dry-run, -v, etc.) +# + +set -e # Exit on error + +# Capture CLI arguments to pass to Python +CLI_ARGS="$@" + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# Configuration +WORK_DIR="$(pwd)" +SETTINGS_FILE="$WORK_DIR/settings.json" +EXPORT_DIR="$WORK_DIR/outline_export" + +echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" +echo -e "${BLUE} OUTLINE EXPORT${NC}" +echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" +echo "" + +# Check if settings.json exists +if [ ! 
-f "$SETTINGS_FILE" ]; then + echo -e "${RED}Error: settings.json not found${NC}" + exit 1 +fi + +# Extract API details from settings.json +API_URL=$(jq -r '.source.url' "$SETTINGS_FILE") +API_TOKEN=$(jq -r '.source.token' "$SETTINGS_FILE") + +# Backup old export if it exists +if [ -d "$EXPORT_DIR" ]; then + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + BACKUP_FILE="$WORK_DIR/outline_backup_${TIMESTAMP}.tar.gz" + echo -e "${YELLOW}Backing up previous export...${NC}" + tar -czf "$BACKUP_FILE" -C "$WORK_DIR" "outline_export" 2>/dev/null + echo -e "${GREEN}✓ Backup: $BACKUP_FILE ($(du -sh "$BACKUP_FILE" | cut -f1))${NC}" + rm -rf "$EXPORT_DIR" +fi + +echo -e "${GREEN}Exporting documents...${NC}" +echo "" + +# Run the export with CLI arguments (as current user to avoid root-owned files) +docker run --rm --network domnet \ + --user "$(id -u):$(id -g)" \ + -e HOME=/tmp \ + -v "$WORK_DIR:/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -qqq requests tqdm 2>/dev/null && python3 outline_export_fixed.py $CLI_ARGS" + +echo "" + +# Create Python script for side-by-side tree comparison +cat > "$WORK_DIR/.tree_compare.py" << 'PYTHON_SCRIPT' +#!/usr/bin/env python3 +""" +Side-by-side comparison of Outline online vs exported files. +Matches documents row by row and highlights differences. 
# Characters the exporter replaces with "_" when it turns a title into a
# filename; translating them in one pass keeps both sides in agreement.
_INVALID_FILENAME_CHARS = str.maketrans({char: '_' for char in '<>:"/\\|?*'})

# The exporter truncates sanitized titles to this many characters before
# stripping whitespace, so the comparison must apply the same cap.
_MAX_FILENAME_LENGTH = 200


def normalize_filename(name):
    """Normalize a title or file stem so online and exported names compare equal.

    Mirrors the exporter's filename sanitization: every character in
    ``<>:"/\\|?*`` becomes ``_``, the result is cut to 200 characters, and
    surrounding whitespace is stripped. Without the length cap, a title
    longer than 200 characters would never match its (truncated) export file
    and would be reported as both "missing" and "extra".

    The function is idempotent, so it can be applied to raw online titles and
    to already-sanitized file stems alike.

    Args:
        name: Document title or exported file stem.

    Returns:
        The normalized name used as the comparison key.
    """
    normalized = name.translate(_INVALID_FILENAME_CHARS)
    return normalized[:_MAX_FILENAME_LENGTH].strip()
def match_and_compare(online_docs, export_docs):
    """Match online and exported documents, one comparison entry per collection.

    Args:
        online_docs: Mapping of collection name to a list of dicts with the
            keys 'title', 'normalized', 'has_children' and 'updatedAt'.
        export_docs: Mapping of collection name to a list of dicts with the
            keys 'title', 'normalized' and 'mtime'.

    Returns:
        A list of dicts (sorted by collection name) with the keys
        'collection', 'rows', 'online_count' and 'export_count'. Each row has
        'online', 'export', 'status' ('match', 'missing' or 'extra'),
        'is_folder' and 'freshness' ('current', 'stale' or None). Rows are
        ordered matched first, then missing, then extra, alphabetically
        (case-insensitive) within each group.
    """
    from datetime import datetime

    # An export file may be this many seconds older than the online revision
    # before it is flagged as stale.
    stale_tolerance_seconds = 60
    status_rank = {'match': 0, 'missing': 1, 'extra': 2}

    def freshness_of(online_doc, export_doc):
        """Return 'stale' when the export file predates the online revision."""
        updated_at = online_doc.get('updatedAt')
        mtime = export_doc.get('mtime')
        if not (updated_at and mtime):
            return 'current'
        try:
            online_dt = datetime.fromisoformat(updated_at.replace('Z', '+00:00'))
        except (AttributeError, ValueError):
            # One unparseable timestamp must not abort the whole report.
            return 'current'
        if mtime < online_dt.timestamp() - stale_tolerance_seconds:
            return 'stale'
        return 'current'

    comparison = []
    for coll_name in sorted(set(online_docs) | set(export_docs)):
        online_list = online_docs.get(coll_name, [])
        export_list = export_docs.get(coll_name, [])
        export_lookup = {doc['normalized']: doc for doc in export_list}

        rows = []
        matched_export = set()

        def claim_export_key(norm):
            """Return the first unclaimed export key for *norm*, or None.

            The exporter writes same-titled documents as ``Title.md``,
            ``Title_1.md``, ``Title_2.md``, ...; walking that sequence lets
            every duplicate claim its own file instead of all of them
            matching the first one.
            """
            candidate = norm
            suffix = 0
            while candidate in export_lookup:
                if candidate not in matched_export:
                    return candidate
                suffix += 1
                candidate = f"{norm}_{suffix}"
            return None

        # First pass: pair every online document with an export file.
        for doc in sorted(online_list, key=lambda d: d['title'].lower()):
            export_key = claim_export_key(doc['normalized'])
            if export_key is None:
                rows.append({
                    'online': doc['title'],
                    'export': None,
                    'status': 'missing',
                    'is_folder': doc['has_children'],
                    'freshness': None
                })
                continue
            export_doc = export_lookup[export_key]
            matched_export.add(export_key)
            rows.append({
                'online': doc['title'],
                'export': export_doc['title'],
                'status': 'match',
                'is_folder': doc['has_children'],
                'freshness': freshness_of(doc, export_doc)
            })

        # Second pass: export files that no online document claimed.
        for doc in sorted(export_list, key=lambda d: d['title'].lower()):
            if doc['normalized'] not in matched_export:
                rows.append({
                    'online': None,
                    'export': doc['title'],
                    'status': 'extra',
                    'is_folder': False,
                    'freshness': None
                })

        # Matched first, then missing, then extra; alphabetical inside a group.
        rows.sort(key=lambda r: (
            status_rank[r['status']],
            (r['online'] or r['export'] or '').lower()
        ))

        comparison.append({
            'collection': coll_name,
            'rows': rows,
            'online_count': len(online_list),
            'export_count': len(export_list)
        })

    return comparison
{'EXPORTED':<{col_width}}{RESET}") + print(f"{BLUE}{'═' * term_width}{RESET}") + + for coll in comparison: + total_online += coll['online_count'] + total_export += coll['export_count'] + + # Collection header + coll_matched = sum(1 for r in coll['rows'] if r['status'] == 'match') + coll_missing = sum(1 for r in coll['rows'] if r['status'] == 'missing') + coll_extra = sum(1 for r in coll['rows'] if r['status'] == 'extra') + coll_stale = sum(1 for r in coll['rows'] if r.get('freshness') == 'stale') + + total_matched += coll_matched + total_missing += coll_missing + total_extra += coll_extra + total_stale += coll_stale + + if coll_missing == 0 and coll_extra == 0: + coll_status = f"{GREEN}✓{RESET}" + else: + coll_status = f"{RED}✗{RESET}" + + header = f"{coll['collection']}/ ({coll['online_count']} → {coll['export_count']})" + print(f"\n{BOLD}{YELLOW}{header}{RESET} {coll_status}") + print(f"{BLUE}{'─' * term_width}{RESET}") + + for row in coll['rows']: + online_name = row['online'] or '' + export_name = row['export'] or '' + + # Add folder indicator + if row['is_folder'] and online_name: + online_name = f"📁 {online_name}" + + # Truncate if needed + if len(online_name) > col_width - 1: + online_name = online_name[:col_width-4] + '...' + if len(export_name) > col_width - 1: + export_name = export_name[:col_width-4] + '...' 
+ + # Status and colors + if row['status'] == 'match': + # Freshness indicator + if row.get('freshness') == 'stale': + freshness = f"{YELLOW}●{RESET}" + else: + freshness = f"{GREEN}●{RESET}" + status = f"{GREEN}✓{RESET}{freshness}" + left = f"{online_name}" + right = f"{export_name}" + elif row['status'] == 'missing': + status = f"{RED}✗{RESET} " + left = f"{RED}{online_name}{RESET}" + right = f"{DIM}---{RESET}" + else: # extra + status = f"{YELLOW}+{RESET} " + left = f"{DIM}---{RESET}" + right = f"{YELLOW}{export_name}{RESET}" + + # Calculate visible width (without ANSI codes) + def visible_len(s): + return len(re.sub(r'\033\[[0-9;]*m', '', s)) + + left_pad = col_width - visible_len(left) + right_pad = col_width - visible_len(right) + + print(f" {left}{' ' * max(0, left_pad)} {status} {right}") + + # Summary + print(f"\n{BLUE}{'═' * term_width}{RESET}") + print(f"{BOLD}SUMMARY:{RESET}") + print(f" Online: {total_online} documents") + print(f" Exported: {total_export} documents") + print(f" {GREEN}✓● Matched & current: {total_matched - total_stale}{RESET}") + + if total_stale > 0: + print(f" {YELLOW}✓● Matched but stale: {total_stale} (export older than online){RESET}") + if total_missing > 0: + print(f" {RED}✗ Missing: {total_missing} (online but not exported){RESET}") + if total_extra > 0: + print(f" {YELLOW}+ Extra: {total_extra} (exported but not online){RESET}") + + if total_missing == 0 and total_extra == 0 and total_stale == 0: + print(f"\n{GREEN}✓ All documents exported and current!{RESET}") + elif total_missing == 0 and total_extra == 0: + print(f"\n{YELLOW}⚠ All documents exported but {total_stale} are stale{RESET}") + print() + +def get_latest_changes(api_url, api_token, limit=3): + """Fetch the most recently updated documents.""" + headers = { + "Authorization": f"Bearer {api_token}", + "Content-Type": "application/json" + } + + response = requests.post( + f"{api_url}/api/documents.list", + headers=headers, + json={ + "sort": "updatedAt", + 
def find_export_file(export_dir, collection, normalized_title):
    """Find the exported markdown file that corresponds to an online document.

    The collection's own directory is searched first; if nothing matches
    there, every other collection directory is searched as a fallback (the
    directory name can differ from the online collection name).

    Args:
        export_dir: Root directory of the export.
        collection: Online collection name of the document.
        normalized_title: Document title as returned by normalize_filename().

    Returns:
        Path of the first matching ``*.md`` file (directories and files are
        visited in sorted order so the result is deterministic), or None when
        no file matches or the export directory does not exist.
    """
    export_path = Path(export_dir)

    # A dry run leaves no export directory behind; iterating a missing
    # directory would raise FileNotFoundError and kill the whole report.
    if not export_path.is_dir():
        return None

    def first_match(directory):
        """Return the first *.md file in *directory* whose stem matches."""
        for md_file in sorted(directory.glob("*.md")):
            if normalize_filename(md_file.stem) == normalized_title:
                return md_file
        return None

    # Export directories are named after the sanitized collection name.
    # Normalizing here also stops a name such as "/etc" from replacing the
    # export root when joined onto the path.
    preferred_dir = export_path / normalize_filename(collection)
    if preferred_dir.is_dir():
        found = first_match(preferred_dir)
        if found is not None:
            return found

    # Fallback: look through the remaining collections.
    for coll_dir in sorted(export_path.iterdir()):
        if coll_dir.is_dir() and coll_dir != preferred_dir:
            found = first_match(coll_dir)
            if found is not None:
                return found

    return None
file + export_file = find_export_file(export_dir, collection, doc['normalized']) + + if export_file and export_file.exists(): + export_mtime = os.path.getmtime(export_file) + export_dt = datetime.fromtimestamp(export_mtime) + export_str = export_dt.strftime("%Y-%m-%d %H:%M:%S") + + # Compare (export should be same time or newer) + if updated_at: + # Convert online to local timestamp for comparison + online_ts = online_dt.timestamp() + if export_mtime >= online_ts - 60: # Allow 60s tolerance + status = f"{GREEN}✓{RESET}" + else: + status = f"{YELLOW}⚠ older{RESET}" + else: + status = f"{GREEN}✓{RESET}" + else: + export_str = "NOT FOUND" + status = f"{RED}✗{RESET}" + + # Print entry + print(f"\n {BOLD}{i}. {title}{RESET}") + print(f" {DIM}Collection:{RESET} {collection}") + print(f" {DIM}Online:{RESET} {online_str}") + print(f" {DIM}Exported:{RESET} {export_str} {status}") + + print(f"\n{BLUE}{'═' * term_width}{RESET}") + +def main(): + if len(sys.argv) != 4: + print("Usage: script.py ") + sys.exit(1) + + api_url = sys.argv[1] + api_token = sys.argv[2] + export_dir = sys.argv[3] + + # Get documents from both sources + online_docs = get_online_docs(api_url, api_token) + export_docs = get_export_docs(export_dir) + + # Match and compare + comparison = match_and_compare(online_docs, export_docs) + + # Print results + print_comparison(comparison) + + # Get and print latest changes + latest_docs = get_latest_changes(api_url, api_token, limit=3) + print_latest_changes(latest_docs, export_dir) + +if __name__ == "__main__": + main() +PYTHON_SCRIPT + +# Run the side-by-side tree comparison (use /work/outline_export as container path) +docker run --rm --network domnet \ + --user "$(id -u):$(id -g)" \ + -e HOME=/tmp \ + -v "$WORK_DIR:/work" \ + -w /work \ + python:3.11-slim \ + bash -c "pip install -qqq requests 2>/dev/null && python3 /work/.tree_compare.py '$API_URL' '$API_TOKEN' '/work/outline_export'" + +# Cleanup +rm -f "$WORK_DIR/.tree_compare.py" + +echo "" diff --git 
a/outline_export_fixed.py b/outline_export_fixed.py new file mode 100755 index 0000000..b995997 --- /dev/null +++ b/outline_export_fixed.py @@ -0,0 +1,1031 @@ +#!/usr/bin/env python3 +""" +Outline API Export Script - Enhanced Version +Exports all collections, documents, and their hierarchy from Outline wiki. +Reads configuration from settings.json in the current directory. + +Improvements: +- Failed document tracking with detailed error reports +- Document caching to eliminate double API fetching +- Proper timeout configuration +- Depth limit protection for deep hierarchies +- Enhanced verification comparing with API counts +- Tree view visualization (before and after export) +- Recursive document counting for accurate verification +- Proper logging system with configurable levels +""" + +import os +import sys +import json +import hashlib +import logging +import time +from datetime import datetime +from functools import wraps +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple, Callable, TypeVar +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +T = TypeVar('T') + + +def retry_on_failure(max_attempts: int = 3, backoff_factor: float = 1.0, + exceptions: tuple = (requests.RequestException,)) -> Callable: + """ + Decorator for retrying failed operations with exponential backoff. + + Args: + max_attempts: Maximum number of retry attempts + backoff_factor: Multiplier for exponential backoff (wait = backoff_factor * 2^attempt) + exceptions: Tuple of exception types to catch and retry + """ + def decorator(func: Callable[..., T]) -> Callable[..., T]: + @wraps(func) + def wrapper(*args, **kwargs) -> T: + last_exception = None + for attempt in range(max_attempts): + try: + return func(*args, **kwargs) + except exceptions as e: + last_exception = e + if attempt < max_attempts - 1: + wait_time = backoff_factor * (2 ** attempt) + logger.warning(f"Attempt {attempt + 1}/{max_attempts} failed: {e}. 
class TreeViewGenerator:
    """Generate ASCII tree views of document hierarchies"""

    @staticmethod
    def _render(nodes: List[Dict], label_of: Callable[[Dict], str],
                prefix: str, nested: bool) -> List[str]:
        """Render *nodes* and their descendants as tree lines.

        Args:
            nodes: Sibling nodes; each may carry a "children" list.
            label_of: Returns the text to display for a node.
            prefix: Indentation inherited from the ancestors.
            nested: False for top-level nodes (printed without a connector),
                True for every level below.

        Returns:
            One line per node, parents before their children.
        """
        lines = []
        last_index = len(nodes) - 1
        for index, node in enumerate(nodes):
            is_last_node = index == last_index
            if nested:
                connector = "└── " if is_last_node else "├── "
                child_prefix = prefix + ("    " if is_last_node else "│   ")
            else:
                # Top-level entries carry no connector. Their children are
                # still rendered as nested so the hierarchy stays visible
                # (passing the empty prefix down unchanged used to flatten
                # the entire tree).
                connector = ""
                child_prefix = prefix
            lines.append(f"{prefix}{connector}{label_of(node)}")

            children = node.get("children") or []
            if children:
                lines.extend(
                    TreeViewGenerator._render(children, label_of, child_prefix, True)
                )
        return lines

    @staticmethod
    def generate_from_api(nav_nodes: List[Dict], prefix: str = "", is_last: bool = True) -> List[str]:
        """Generate tree view from API navigation structure.

        Args:
            nav_nodes: Navigation nodes with "title", "id" and optional
                "children" keys.
            prefix: Indentation to start from; an empty prefix marks the
                top level, whose nodes are printed without connectors.
            is_last: Unused; kept so existing callers keep working.

        Returns:
            One line per document: the title followed by the first eight
            characters of its ID.
        """
        def label(node: Dict) -> str:
            title = node.get("title", "Untitled")
            doc_id = node.get("id", "")[:8]  # Short ID for display
            return f"{title} ({doc_id}...)"

        return TreeViewGenerator._render(nav_nodes, label, prefix, prefix != "")

    @staticmethod
    def generate_from_files(collection_path: Path, metadata: Dict) -> List[str]:
        """Generate tree view from exported files.

        Args:
            collection_path: Unused; kept so existing callers keep working.
            metadata: Collection metadata whose "documents" list holds dicts
                with "filename" and optional "children" keys.

        Returns:
            One line per exported file, children nested under their parent.
        """
        def label(doc: Dict) -> str:
            return doc.get("filename", "Unknown")

        documents = metadata.get("documents", [])
        return TreeViewGenerator._render(documents, label, "", False)

    @staticmethod
    def print_comparison(online_tree: List[str], exported_tree: List[str], collection_name: str):
        """Log the line counts of the online and exported trees and their difference."""
        logger.info(f"--- Comparison for '{collection_name}' ---")
        logger.info(f"Online documents: {len(online_tree)}")
        logger.info(f"Exported files: {len(exported_tree)}")
        if len(online_tree) == len(exported_tree):
            logger.info("Counts match!")
        else:
            diff = abs(len(online_tree) - len(exported_tree))
            logger.warning(f"Difference: {diff}")
HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + self.headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json" + } + + # NEW: Document cache to avoid double fetching + self.document_cache: Dict[str, Dict] = {} + + # NEW: Track failed documents with detailed info + self.failed_documents: List[Dict] = [] + + # NEW: Track API errors + self.api_errors: List[Dict] = [] + + # NEW: Track expected vs actual counts per collection + self.collection_stats: Dict[str, Dict] = {} + + # Manifest data + self.manifest = { + "export_date": datetime.now().isoformat(), + "source_url": self.base_url, + "collections": [], + "documents": [], + "failed_documents": [], + "statistics": {} + } + + # Statistics + self.stats = { + "collections": 0, + "documents": 0, + "bytes_written": 0, + "failed": 0, + "api_errors": 0 + } + + def make_request(self, endpoint: str, data: Dict = None, method: str = "POST", + retry: bool = True) -> Optional[Dict]: + """Make API request with error handling and optional retry. 
+ + Args: + endpoint: API endpoint path + data: Request body data + method: HTTP method (POST or GET) + retry: Whether to retry on failure (default True) + """ + url = f"{self.base_url}{endpoint}" + last_error = None + + attempts = self.max_retries if retry else 1 + for attempt in range(attempts): + try: + if method == "POST": + response = self.session.post(url, headers=self.headers, json=data or {}, timeout=30) + else: + response = self.session.get(url, headers=self.headers, timeout=30) + + if response.status_code == 200: + return response.json() + elif response.status_code in [429, 500, 502, 503, 504] and attempt < attempts - 1: + # Retryable error + wait_time = self.retry_backoff * (2 ** attempt) + logger.warning(f"API error {response.status_code} on {endpoint}, " + f"retrying in {wait_time:.1f}s (attempt {attempt + 1}/{attempts})") + time.sleep(wait_time) + continue + else: + # Non-retryable error or final attempt + error_info = { + "endpoint": endpoint, + "status_code": response.status_code, + "error": response.text[:200], + "timestamp": datetime.now().isoformat() + } + self.api_errors.append(error_info) + self.stats["api_errors"] += 1 + logger.error(f"API error on {endpoint}: HTTP {response.status_code}") + return None + except requests.RequestException as e: + last_error = e + if attempt < attempts - 1: + wait_time = self.retry_backoff * (2 ** attempt) + logger.warning(f"Request failed on {endpoint}: {e}, " + f"retrying in {wait_time:.1f}s (attempt {attempt + 1}/{attempts})") + time.sleep(wait_time) + else: + error_info = { + "endpoint": endpoint, + "error": str(e), + "timestamp": datetime.now().isoformat() + } + self.api_errors.append(error_info) + self.stats["api_errors"] += 1 + logger.error(f"All {attempts} attempts failed on {endpoint}: {e}") + return None + except Exception as e: + error_info = { + "endpoint": endpoint, + "error": str(e), + "timestamp": datetime.now().isoformat() + } + self.api_errors.append(error_info) + self.stats["api_errors"] += 1 + 
logger.exception(f"Unexpected exception on {endpoint}: {e}") + return None + + return None + + def health_check(self) -> bool: + """ + Verify API connectivity and authentication before export. + + Returns: + True if API is accessible and authenticated, False otherwise + """ + logger.info("Performing health check...") + + # Test API connectivity with auth.info endpoint + try: + result = self.make_request("/api/auth.info", retry=False) + if result and "data" in result: + user = result["data"].get("user", {}) + team = result["data"].get("team", {}) + logger.info(f"Authenticated as: {user.get('name', 'Unknown')} ({user.get('email', 'N/A')})") + logger.info(f"Team: {team.get('name', 'Unknown')}") + logger.info("Health check passed") + return True + else: + logger.error("Health check failed: Unable to verify authentication") + return False + except Exception as e: + logger.error(f"Health check failed: {e}") + return False + + def get_collections(self) -> List[Dict]: + """Fetch all collections""" + logger.info("Fetching collections...") + result = self.make_request("/api/collections.list") + if result and "data" in result: + collections = result["data"] + logger.info(f"Found {len(collections)} collections") + return collections + return [] + + def get_documents_in_collection(self, collection_id: str) -> Tuple[List[Dict], List[Dict]]: + """ + Fetch all documents in a collection + Returns: (list of documents, navigation tree) + """ + result = self.make_request("/api/documents.list", {"collectionId": collection_id}) + documents = [] + if result and "data" in result: + documents = result["data"] + + # Also get navigation tree for hierarchy + nav_result = self.make_request("/api/collections.documents", {"id": collection_id}) + nav_tree = [] + if nav_result and "data" in nav_result: + nav_tree = nav_result["data"] + + return documents, nav_tree + + def get_document_info(self, doc_id: str) -> Optional[Dict]: + """ + Fetch full document content + FIXED: Uses cache to avoid double 
def sanitize_filename(self, name: str) -> str:
    """Convert a document or collection title to a safe filename stem.

    Every character in ``<>:"/\\|?*`` is replaced by ``_``, the result is
    limited to 200 characters, and surrounding whitespace is stripped.

    Args:
        name: Title to sanitize. ``None`` is treated like an empty title.

    Returns:
        The sanitized name, or "Untitled" when nothing is left. Without that
        fallback an empty title would become the hidden file ``.md`` and an
        empty collection name would resolve to the output directory itself.
    """
    invalid_chars = '<>:"/\\|?*'
    replacements = str.maketrans(invalid_chars, '_' * len(invalid_chars))
    # Truncate before stripping so a cut that lands on whitespace is cleaned up.
    cleaned = (name or "").translate(replacements)[:200].strip()
    return cleaned or "Untitled"
collection_path: Path, + hierarchy: Dict, level: int = 0) -> Optional[Dict]: + """ + Export a single document and its children recursively + FIXED: Enhanced error tracking and failed children tracking + """ + doc_id = document["id"] + doc_title = document.get("title", "Untitled") + + if level == 0: + logger.debug(f"Exporting: {doc_title}") + + # Fetch full document content (uses cache, so no double fetching) + full_doc = self.get_document_info(doc_id) + if not full_doc: + # FIXED: Track failed documents with details + self.failed_documents.append({ + "id": doc_id, + "title": doc_title, + "collection": collection_name, + "reason": "Failed to fetch document info from API", + "level": level + }) + logger.warning(f"Failed to fetch document: {doc_title} (ID: {doc_id})") + return None + + # Generate filename + safe_title = self.sanitize_filename(doc_title) + filename = f"{safe_title}.md" + filepath = collection_path / filename + + # Handle duplicates + counter = 1 + while filepath.exists(): + filename = f"{safe_title}_{counter}.md" + filepath = collection_path / filename + counter += 1 + + # Build markdown content + content = f"# {doc_title}\n\n" + content += f"\n" + content += f"\n" + content += f"\n" + content += f"\n\n" + content += "---\n\n" + content += full_doc.get("text", "") + + # Write file + try: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + except Exception as e: + # Track file write failures + self.failed_documents.append({ + "id": doc_id, + "title": doc_title, + "collection": collection_name, + "reason": f"Failed to write file: {e}", + "level": level + }) + logger.error(f"Failed to write file for: {doc_title}") + return None + + file_size = filepath.stat().st_size + self.stats["bytes_written"] += file_size + self.stats["documents"] += 1 + + # Calculate checksum + checksum = self.calculate_checksum(content) + + # Build metadata + doc_metadata = { + "id": doc_id, + "title": doc_title, + "filename": filename, + "collection_name": 
def export_collection(self, collection: Dict) -> None:
    """Export a single collection with all of its documents.

    The navigation tree returned by ``get_documents_in_collection`` is the
    source of truth for how many documents are expected. Every root document
    is handed to ``export_document`` (which recurses into its children), then
    ``_collection_metadata.json`` is written into the collection directory
    and the online/exported tree views are logged and compared.

    Args:
        collection: Collection record; must contain ``"id"`` and ``"name"``.

    Side effects:
        Creates the collection directory below ``self.output_dir``, writes
        files into it and updates ``self.collection_stats``, ``self.stats``
        and (when manifests are enabled) ``self.manifest["collections"]``.
    """
    collection_id = collection["id"]
    collection_name = collection["name"]

    logger.info("=" * 60)
    logger.info(f"Exporting collection: {collection_name}")
    logger.info("=" * 60)

    # Fetch the flat document list and the navigation tree, then derive the
    # parent/child hierarchy from the tree.
    documents, nav_tree = self.get_documents_in_collection(collection_id)
    hierarchy = self.build_hierarchy(documents, nav_tree)

    # The navigation tree includes nested documents, so it (not the flat
    # list) defines the expected count.
    expected_count = len(hierarchy["all_ids"])
    logger.info(f"Documents in navigation tree: {expected_count}")

    if expected_count == 0:
        logger.info("No documents to export")
        # Still record the collection so statistics/verification see it.
        # The record carries the same "documents_list_count" key as the
        # populated path below; "fetched" is kept for older readers.
        self.collection_stats[collection_id] = {
            "name": collection_name,
            "expected": expected_count,
            "documents_list_count": len(documents or []),
            "fetched": 0,
            "exported": 0
        }
        return

    # Create collection directory
    safe_name = self.sanitize_filename(collection_name)
    collection_path = self.output_dir / safe_name
    collection_path.mkdir(parents=True, exist_ok=True)

    def log_preview(tree_lines, limit=20):
        # Log at most `limit` lines of a tree view plus a remainder note.
        for line in tree_lines[:limit]:
            logger.info(line)
        if len(tree_lines) > limit:
            logger.info(f"... and {len(tree_lines) - limit} more lines")

    # Tree view of the ONLINE structure
    logger.info("--- Online Structure (from Outline API) ---")
    online_tree = TreeViewGenerator.generate_from_api(nav_tree)
    log_preview(online_tree)

    # Prepare collection metadata
    collection_metadata = {
        "id": collection_id,
        "name": collection_name,
        "directory": safe_name,
        "expected_count": expected_count,  # From navigation tree (all nested docs)
        "documents_list_count": len(documents),  # From documents.list API
        "document_count": 0,  # Updated after export
        "navigation_tree": nav_tree,  # Preserve original navigation structure
        "documents": []
    }

    # Export root documents (children are handled by export_document),
    # optionally behind a progress bar.
    root_docs = hierarchy["root"]
    if self.show_progress:
        iterator = tqdm(root_docs, desc=f" Exporting {collection_name}", leave=False)
    else:
        iterator = root_docs

    for doc in iterator:
        doc_metadata = self.export_document(
            doc, collection_name, collection_path, hierarchy
        )
        if doc_metadata:
            collection_metadata["documents"].append(doc_metadata)

    # Count every exported document, including nested children, by walking
    # the metadata tree with an explicit stack.
    actual_exported_count = 0
    pending = list(collection_metadata["documents"])
    while pending:
        entry = pending.pop()
        actual_exported_count += 1
        pending.extend(entry.get("children", []))

    collection_metadata["document_count"] = actual_exported_count

    # Save collection metadata
    metadata_path = collection_path / "_collection_metadata.json"
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(collection_metadata, f, indent=2, ensure_ascii=False)

    # Tree view of the EXPORTED files
    logger.info("--- Exported Files (on disk) ---")
    exported_tree = TreeViewGenerator.generate_from_files(collection_path, collection_metadata)
    log_preview(exported_tree)

    TreeViewGenerator.print_comparison(online_tree, exported_tree, collection_name)

    # Add to manifest
    if self.generate_manifests:
        self.manifest["collections"].append({
            "id": collection_id,
            "name": collection_name,
            "directory": safe_name,
            "expected_count": expected_count,  # From nav_tree
            "documents_list_count": len(documents),  # From API documents.list
            "exported_count": actual_exported_count  # Recursive count
        })

    # Per-collection statistics read later by verify_export
    self.collection_stats[collection_id] = {
        "name": collection_name,
        "expected": expected_count,  # From nav_tree (source of truth)
        "documents_list_count": len(documents),  # From API
        "exported": actual_exported_count  # Recursive count
    }

    self.stats["collections"] += 1

    # Summary
    if actual_exported_count == expected_count:
        logger.info(f"Exported {actual_exported_count}/{expected_count} documents from '{collection_name}' - COMPLETE")
    else:
        missing = expected_count - actual_exported_count
        logger.warning(f"Exported {actual_exported_count}/{expected_count} documents from '{collection_name}' - {missing} MISSING")


def save_manifest(self) -> None:
    """Write ``self.manifest`` to ``manifest.json`` in the output directory.

    Does nothing when manifest generation is disabled.
    """
    if not self.generate_manifests:
        return

    manifest_path = self.output_dir / "manifest.json"
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(self.manifest, f, indent=2, ensure_ascii=False)
def save_error_report(self) -> None:
    """Write ``export_errors.json`` describing failed documents and API errors.

    Nothing is written when both ``self.failed_documents`` and
    ``self.api_errors`` are empty.
    """
    if not self.failed_documents and not self.api_errors:
        return

    error_report = {
        "export_date": datetime.now().isoformat(),
        "failed_documents": self.failed_documents,
        "api_errors": self.api_errors,
        "statistics": {
            "total_failed_documents": len(self.failed_documents),
            "total_api_errors": len(self.api_errors)
        }
    }

    error_path = self.output_dir / "export_errors.json"
    with open(error_path, 'w', encoding='utf-8') as f:
        json.dump(error_report, f, indent=2, ensure_ascii=False)

    logger.warning(f"Error report saved to: {error_path}")
    logger.warning(f" - {len(self.failed_documents)} failed documents")
    logger.warning(f" - {len(self.api_errors)} API errors")


def verify_export(self) -> bool:
    """Verify export completeness and on-disk integrity.

    Step 1 compares, per collection, the expected and exported counts stored
    in ``self.collection_stats``. A per-collection shortfall is a warning; an
    overall shortfall is a critical error.

    Step 2 checks that every document listed in ``self.manifest["documents"]``
    exists on disk and that its recomputed checksum matches the recorded one.
    The collection directory is taken from the manifest's ``"directory"``
    entry (falling back to ``self.sanitize_filename``), because collections
    are written under their sanitized name, not their raw name.

    Returns:
        ``False`` when at least one critical error was found, ``True``
        otherwise (including when manifests are disabled and verification is
        skipped).
    """
    logger.info("=" * 60)
    logger.info("Verifying Export Integrity and Completeness")
    logger.info("=" * 60)

    if not self.generate_manifests:
        logger.warning("Skipping verification (manifests disabled)")
        return True

    errors = []
    warnings = []

    # Step 1 - document count completeness
    logger.info("Step 1: Verifying document count completeness...")
    total_expected = 0
    total_exported = 0

    for stats in self.collection_stats.values():
        expected = stats["expected"]
        exported = stats["exported"]
        total_expected += expected
        total_exported += exported

        if exported < expected:
            missing = expected - exported
            warnings.append(f"Collection '{stats['name']}': {missing} documents missing")

    if total_exported < total_expected:
        errors.append(
            f"Document count mismatch: Expected {total_expected} from API, "
            f"exported {total_exported} (missing {total_expected - total_exported})"
        )
    else:
        logger.info(f"All {total_expected} documents accounted for")

    # Step 2 - files exist and checksums match
    logger.info("Step 2: Verifying file integrity...")
    file_errors = 0
    checksum_errors = 0
    read_errors = 0

    # Map each collection name to the directory it was exported into.
    directories = {
        entry["name"]: entry["directory"]
        for entry in self.manifest["collections"]
        if "name" in entry and "directory" in entry
    }

    if self.show_progress:
        iterator = tqdm(self.manifest["documents"], desc=" Verifying", leave=False)
    else:
        iterator = self.manifest["documents"]

    for doc in iterator:
        collection_name = doc["collection_name"]
        directory = directories.get(collection_name)
        if directory is None:
            # The collection never reached the manifest (e.g. its export
            # aborted part-way); derive the directory the same way the
            # export does.
            directory = self.sanitize_filename(collection_name)
        filepath = self.output_dir / directory / doc["filename"]

        # Check file exists
        if not filepath.exists():
            file_errors += 1
            errors.append(f"Missing file: {doc['filename']}")
            continue

        # Verify checksum
        # NOTE(review): text mode applies universal-newline translation on
        # read; if exported text can contain '\r', the recomputed checksum
        # may differ from the one taken before writing - confirm against
        # calculate_checksum and the writer.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            actual_checksum = self.calculate_checksum(content)
            if actual_checksum != doc["checksum"]:
                checksum_errors += 1
                errors.append(f"Checksum mismatch: {doc['filename']}")
        except Exception as e:
            read_errors += 1
            errors.append(f"Error reading file {doc['filename']}: {e}")

    if file_errors == 0 and checksum_errors == 0 and read_errors == 0:
        logger.info(f"All {len(self.manifest['documents'])} files exist and checksums match")

    # Summary
    logger.info("=" * 60)
    if errors:
        logger.error(f"Verification FAILED: {len(errors)} critical errors")
        for err in errors[:10]:
            logger.error(f" - {err}")
        if len(errors) > 10:
            logger.error(f" ... and {len(errors) - 10} more errors")
        return False
    elif warnings:
        logger.warning(f"Verification PASSED with warnings: {len(warnings)} issues")
        for warn in warnings:
            logger.warning(f" - {warn}")
        return True
    else:
        logger.info("Verification PASSED - Export is complete and verified")
        logger.info(f" - All {total_expected} documents from API accounted for")
        logger.info(f" - All {len(self.manifest['documents'])} files exist")
        logger.info(" - All checksums match")
        return True


def dry_run(self) -> Dict:
    """Preview what would be exported without writing files.

    Returns:
        ``{}`` when the health check fails. Otherwise a dictionary with
        ``"collections"`` (list of ``{"name", "documents"}`` entries),
        ``"total_collections"`` and ``"total_documents"``. When no
        collections are found the same keys are returned with empty/zero
        values, plus the legacy ``"documents": 0`` key.
    """
    logger.info("=" * 60)
    logger.info("DRY RUN - Preview Export")
    logger.info("=" * 60)

    if not self.health_check():
        logger.error("Dry run aborted due to failed health check")
        return {}

    collections = self.get_collections()
    if not collections:
        logger.warning("No collections found")
        # Same shape as the populated result so callers can read the
        # totals unconditionally; "documents" is kept for older callers.
        return {
            "collections": [],
            "total_collections": 0,
            "total_documents": 0,
            "documents": 0
        }

    total_docs = 0
    results = {
        "collections": [],
        "total_collections": len(collections),
        "total_documents": 0
    }

    for collection in collections:
        # Only the navigation tree is needed to count documents.
        _, nav_tree = self.get_documents_in_collection(collection["id"])
        hierarchy = self.build_hierarchy([], nav_tree)
        doc_count = len(hierarchy["all_ids"])
        total_docs += doc_count

        results["collections"].append({
            "name": collection["name"],
            "documents": doc_count
        })
        logger.info(f" {collection['name']}: {doc_count} documents")

    results["total_documents"] = total_docs

    logger.info("=" * 60)
    logger.info(f"Total: {len(collections)} collections, {total_docs} documents")
    logger.info("=" * 60)
    logger.info("Dry run complete - no files written")

    return results
+ + Args: + skip_health_check: Skip the pre-export health check (default False) + """ + logger.info("=" * 60) + logger.info("OUTLINE EXPORT - ENHANCED VERSION") + logger.info("=" * 60) + logger.info(f"Source: {self.base_url}") + logger.info(f"Output: {self.output_dir}") + logger.info(f"Max depth: {self.max_hierarchy_depth}") + logger.info(f"Max retries: {self.max_retries}") + logger.info(f"Progress bars: {'Enabled' if self.show_progress else 'Disabled'}") + logger.info(f"Verification: {'Enabled' if self.verify_after_export else 'Disabled'}") + logger.info("=" * 60) + + # Health check + if not skip_health_check: + if not self.health_check(): + logger.error("Export aborted due to failed health check") + return + + # Create output directory + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Fetch collections + collections = self.get_collections() + if not collections: + logger.error("No collections found or API error") + self.save_error_report() + return + + # Export each collection + for collection in collections: + try: + self.export_collection(collection) + except Exception as e: + logger.error(f"Failed to export collection {collection['name']}: {e}") + self.api_errors.append({ + "collection": collection['name'], + "error": str(e), + "timestamp": datetime.now().isoformat() + }) + + # Save manifests and metadata + self.save_manifest() + self.save_export_metadata() + if self.failed_documents or self.api_errors: + self.save_error_report() + + # Print summary + logger.info("=" * 60) + logger.info("EXPORT SUMMARY") + logger.info("=" * 60) + logger.info(f"Collections exported: {self.stats['collections']}") + logger.info(f"Documents exported: {self.stats['documents']}") + logger.info(f"Total size: {self.stats['bytes_written'] / (1024*1024):.2f} MB") + logger.info(f"Failed documents: {len(self.failed_documents)}") + logger.info(f"API errors: {len(self.api_errors)}") + logger.info("=" * 60) + + # Verify export + if self.verify_after_export: + verification_passed = 
def load_settings(settings_file: str = "settings.json") -> Dict:
    """Load the settings dictionary from a JSON file.

    The file is read as UTF-8, matching how the rest of this module reads
    and writes files. The process exits with status 1 (after logging the
    reason) when the file is missing, is not valid UTF-8/JSON, or does not
    contain a JSON object at the top level.

    Args:
        settings_file: Path to the settings file.

    Returns:
        The parsed settings.
    """
    try:
        with open(settings_file, 'r', encoding='utf-8') as f:
            settings = json.load(f)
    except FileNotFoundError:
        logger.error(f"Settings file not found: {settings_file}")
        logger.error("Create a settings.json file with your configuration")
        sys.exit(1)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error(f"Invalid JSON in settings file: {e}")
        sys.exit(1)

    # main() calls .get() on the result, so anything but an object is unusable.
    if not isinstance(settings, dict):
        logger.error("Invalid JSON in settings file: top-level value must be an object")
        sys.exit(1)
    return settings


def parse_args() -> 'argparse.Namespace':
    """Parse command line arguments.

    Returns:
        Namespace with ``dry_run``, ``output``, ``verbose`` (count),
        ``skip_verify``, ``skip_health_check`` and ``settings``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="Export Outline wiki documents",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='Preview what would be exported without writing files'
    )
    parser.add_argument(
        '--output', '-o',
        help='Output directory (overrides settings.json)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='count',
        default=0,
        help='Increase verbosity (use -vv for debug)'
    )
    parser.add_argument(
        '--skip-verify',
        action='store_true',
        help='Skip post-export verification'
    )
    parser.add_argument(
        '--skip-health-check',
        action='store_true',
        help='Skip pre-export health check'
    )
    parser.add_argument(
        '--settings',
        default='settings.json',
        help='Path to settings file (default: settings.json)'
    )
    return parser.parse_args()


def main() -> None:
    """Main entry point: parse arguments, build the exporter and run it.

    Exits with status 1 when required settings are missing, when the run is
    interrupted from the keyboard, or when the export raises.
    """
    args = parse_args()

    # Set log level based on verbosity (-v = INFO, -vv or more = DEBUG);
    # without -v the logger's existing level is left untouched.
    if args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif args.verbose == 1:
        logger.setLevel(logging.INFO)

    # Load settings
    settings = load_settings(args.settings)

    source = settings.get("source", {})
    export_config = settings.get("export", {})
    advanced = settings.get("advanced", {})

    # Validate required settings
    if not source.get("url") or not source.get("token"):
        logger.error("Missing required settings: source.url and source.token")
        sys.exit(1)

    # CLI overrides for settings: --output wins over the settings file, and
    # --skip-verify disables verification regardless of the settings file.
    output_dir = args.output or export_config.get("output_directory", "exports")
    verify_after = not args.skip_verify and export_config.get("verify_after_export", True)

    # Create exporter
    exporter = OutlineExporter(
        base_url=source["url"],
        api_token=source["token"],
        output_dir=output_dir,
        verify_after_export=verify_after,
        max_hierarchy_depth=advanced.get("max_hierarchy_depth", 100),
        show_progress=advanced.get("progress_bar", True),
        generate_manifests=advanced.get("generate_manifests", True),
        max_retries=advanced.get("max_retries", 3),
        retry_backoff=advanced.get("retry_backoff", 1.0)
    )

    # Run export or dry run
    try:
        if args.dry_run:
            exporter.dry_run()
        else:
            exporter.export_all(skip_health_check=args.skip_health_check)
    except KeyboardInterrupt:
        logger.warning("Export cancelled by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: log the traceback and report failure.
        logger.exception(f"Export failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()