#!/usr/bin/env python3
"""
Outline API Import Script

Imports markdown files back into Outline wiki with hierarchy preservation.
Companion script to outline_export_fixed.py.

Usage:
    python3 outline_import.py [OPTIONS]

Options:
    -s, --single       Import all into single timestamped collection
    -n, --dry-run      Preview operations without making changes
    -d, --source DIR   Source directory (default: outline_export)
    -v, --verbose      Increase verbosity (-vv for debug)
    -f, --force        Overwrite existing collections
    --settings FILE    Path to settings file (default: settings.json)
    -h, --help         Show help message
"""

import os
import sys
import json
import logging
import time
import argparse
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger('outline_import')


class OutlineImporter:
    """Import documents into Outline with hierarchy preservation."""

    def __init__(
        self,
        base_url: str,
        api_token: str,
        source_dir: str = "outline_export",
        dry_run: bool = False,
        single_mode: bool = False,
        force: bool = False,
        on_collection_exists: str = "skip",
        on_document_exists: str = "skip",
        default_permission: str = "read_write",
        request_timeout: int = 30,
        retry_attempts: int = 3,
        retry_delay: float = 1.0,
        rate_limit_delay: float = 0.1
    ):
        self.base_url = base_url.rstrip('/')
        self.api_token = api_token
        self.source_dir = Path(source_dir)
        self.dry_run = dry_run
        self.single_mode = single_mode
        self.force = force
        self.on_collection_exists = on_collection_exists
        self.on_document_exists = on_document_exists
        self.default_permission = default_permission
        self.request_timeout = request_timeout
        self.retry_attempts = retry_attempts
        self.retry_delay = retry_delay
        self.rate_limit_delay = rate_limit_delay

        # Setup session with retry logic.
        # NOTE(review): this transport-level Retry also retries 429/5xx, on top of
        # the manual retry loop in _api_request, so worst-case attempts multiply.
        # Kept as-is to preserve existing behavior; consider dropping
        # status_forcelist here if retry storms become a problem.
        self.session = requests.Session()
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        self.headers = {
            "Authorization": f"Bearer {self.api_token}",
            "Content-Type": "application/json"
        }

        # ID mapping: old_id -> new_id (used to re-link parent documents)
        self.id_map: Dict[str, str] = {}

        # Track existing collections: name -> id
        self.existing_collections: Dict[str, str] = {}

        # Statistics
        self.stats = {
            "collections_created": 0,
            "collections_skipped": 0,
            "collections_errors": 0,
            "documents_created": 0,
            "documents_skipped": 0,
            "documents_errors": 0,
        }

        # Error tracking (surfaced in the final summary)
        self.errors: List[Dict] = []

    def _api_request(
        self,
        endpoint: str,
        data: Optional[Dict] = None,
        method: str = "POST"
    ) -> Optional[Dict]:
        """
        Make API request with error handling and retry logic.

        Args:
            endpoint: API endpoint path (e.g., '/api/collections.list')
            data: Request body data
            method: HTTP method (POST or GET)

        Returns:
            Response data dict or None on failure
        """
        url = f"{self.base_url}{endpoint}"

        for attempt in range(self.retry_attempts):
            try:
                if method == "POST":
                    response = self.session.post(
                        url,
                        headers=self.headers,
                        json=data or {},
                        timeout=self.request_timeout
                    )
                else:
                    response = self.session.get(
                        url,
                        headers=self.headers,
                        timeout=self.request_timeout
                    )

                if response.status_code == 200:
                    return response.json()
                elif response.status_code in [429, 500, 502, 503, 504]:
                    # Retryable server-side / rate-limit errors: exponential backoff.
                    if attempt < self.retry_attempts - 1:
                        wait_time = self.retry_delay * (2 ** attempt)
                        logger.warning(
                            f"API error {response.status_code} on {endpoint}, "
                            f"retrying in {wait_time:.1f}s (attempt {attempt + 1}/{self.retry_attempts})"
                        )
                        time.sleep(wait_time)
                        continue

                # Non-retryable error or final attempt
                logger.error(f"API error on {endpoint}: HTTP {response.status_code}")
                logger.debug(f"Response: {response.text[:200]}")
                return None

            except requests.RequestException as e:
                if attempt < self.retry_attempts - 1:
                    wait_time = self.retry_delay * (2 ** attempt)
                    logger.warning(
                        f"Request failed on {endpoint}: {e}, "
                        f"retrying in {wait_time:.1f}s"
                    )
                    time.sleep(wait_time)
                else:
                    logger.error(f"All {self.retry_attempts} attempts failed on {endpoint}: {e}")
                    return None

        return None

    def health_check(self) -> bool:
        """
        Verify API connectivity and authentication.

        Returns:
            True if API is accessible and authenticated
        """
        logger.info("Checking API connectivity...")
        result = self._api_request("/api/auth.info")
        if result and "data" in result:
            user = result["data"].get("user", {})
            team = result["data"].get("team", {})
            logger.info(f"Authenticated as: {user.get('name', 'Unknown')} ({user.get('email', 'N/A')})")
            logger.info(f"Team: {team.get('name', 'Unknown')}")
            return True
        logger.error("Health check failed: Unable to verify authentication")
        return False

    def _get_collections(self) -> List[Dict]:
        """Fetch all existing collections from Outline and cache name -> id."""
        result = self._api_request("/api/collections.list")
        if result and "data" in result:
            collections = result["data"]
            # Cache name -> id mapping
            self.existing_collections = {c["name"]: c["id"] for c in collections}
            return collections
        return []

    def _create_collection(self, name: str, permission: str = None) -> Optional[str]:
        """
        Create a new collection.

        Args:
            name: Collection name
            permission: Permission level ('read' or 'read_write')

        Returns:
            Collection ID if created, None on failure
        """
        if permission is None:
            permission = self.default_permission

        if self.dry_run:
            logger.info(f" [DRY RUN] Would create collection \"{name}\"")
            return "dry-run-collection-id"

        result = self._api_request("/api/collections.create", {
            "name": name,
            "permission": permission
        })
        if result and "data" in result:
            collection_id = result["data"]["id"]
            logger.debug(f"Created collection: {name} (id: {collection_id})")
            self.existing_collections[name] = collection_id
            return collection_id

        logger.error(f"Failed to create collection: {name}")
        return None

    def _delete_collection(self, collection_id: str) -> bool:
        """
        Delete a collection.

        Args:
            collection_id: Collection ID to delete

        Returns:
            True if deleted successfully
        """
        if self.dry_run:
            logger.info(f" [DRY RUN] Would delete collection {collection_id}")
            return True

        result = self._api_request("/api/collections.delete", {"id": collection_id})
        return result is not None

    def _create_document(
        self,
        collection_id: str,
        title: str,
        text: str,
        parent_document_id: Optional[str] = None,
        publish: bool = True
    ) -> Optional[str]:
        """
        Create a new document in a collection.

        Args:
            collection_id: Parent collection ID
            title: Document title
            text: Markdown content
            parent_document_id: Optional parent document ID for nesting
            publish: Whether to publish immediately

        Returns:
            Document ID if created, None on failure
        """
        if self.dry_run:
            return "dry-run-document-id"

        data = {
            "collectionId": collection_id,
            "title": title,
            "text": text,
            "publish": publish
        }
        if parent_document_id:
            data["parentDocumentId"] = parent_document_id

        # Rate limiting: small pause before each create to stay under API limits
        if self.rate_limit_delay > 0:
            time.sleep(self.rate_limit_delay)

        result = self._api_request("/api/documents.create", data)
        if result and "data" in result:
            return result["data"]["id"]

        logger.error(f"Failed to create document: {title}")
        return None

    def _get_documents_in_collection(self, collection_id: str) -> List[Dict]:
        """Fetch all documents in a collection."""
        result = self._api_request("/api/documents.list", {"collectionId": collection_id})
        if result and "data" in result:
            return result["data"]
        return []

    def load_collection_metadata(self, collection_dir: Path) -> Optional[Dict]:
        """
        Load _collection_metadata.json from a collection directory.

        Args:
            collection_dir: Path to collection directory

        Returns:
            Metadata dict or None if not found/invalid
        """
        metadata_path = collection_dir / "_collection_metadata.json"
        if not metadata_path.exists():
            logger.warning(f"No metadata file found in {collection_dir}")
            return None

        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {metadata_path}: {e}")
            return None
        except Exception as e:
            logger.error(f"Error reading {metadata_path}: {e}")
            return None

    def get_source_collections(self) -> List[Path]:
        """
        Get list of collection directories from source.

        Returns:
            List of collection directory paths
        """
        if not self.source_dir.exists():
            logger.error(f"Source directory not found: {self.source_dir}")
            return []

        collections = []
        for item in sorted(self.source_dir.iterdir()):
            if item.is_dir() and not item.name.startswith('.'):
                # Only directories carrying a metadata file are importable
                if (item / "_collection_metadata.json").exists():
                    collections.append(item)
                else:
                    logger.warning(f"Skipping {item.name}: no metadata file")
        return collections

    def build_document_tree(self, documents: List[Dict]) -> List[Dict]:
        """
        Build ordered document tree from flat metadata list.

        Uses topological sort to ensure parents are created before children.

        Args:
            documents: List of document metadata dicts from _collection_metadata.json

        Returns:
            List of root documents with nested children
        """
        # Build lookup by ID
        doc_by_id: Dict[str, Dict] = {}
        for doc in documents:
            doc_by_id[doc["id"]] = doc.copy()
            doc_by_id[doc["id"]]["_children"] = []

        # Build parent-child relationships; documents whose parent is missing
        # from the metadata become roots.
        roots = []
        for doc in documents:
            parent_id = doc.get("parent_id")
            if parent_id and parent_id in doc_by_id:
                doc_by_id[parent_id]["_children"].append(doc_by_id[doc["id"]])
            else:
                roots.append(doc_by_id[doc["id"]])

        return roots

    def flatten_for_import(self, doc_tree: List[Dict], result: List[Dict] = None) -> List[Dict]:
        """
        Flatten document tree in topological order (parents before children).

        Args:
            doc_tree: Nested document tree
            result: Accumulator list (used internally)

        Returns:
            Flat list of documents in import order
        """
        if result is None:
            result = []

        for doc in doc_tree:
            # Add this document
            result.append({
                "id": doc["id"],
                "title": doc["title"],
                "filename": doc["filename"],
                "parent_id": doc.get("parent_id"),
            })
            # Then add children recursively ("_children" from build_document_tree,
            # "children" if the metadata was already nested)
            children = doc.get("_children", []) or doc.get("children", [])
            if children:
                self.flatten_for_import(children, result)

        return result

    def read_document_content(self, collection_dir: Path, filename: str) -> Optional[str]:
        """
        Read markdown content from file.

        Args:
            collection_dir: Path to collection directory
            filename: Document filename

        Returns:
            Markdown content or None if not found
        """
        filepath = collection_dir / filename
        if not filepath.exists():
            logger.warning(f"File not found: {filepath}")
            return None

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Strip the header metadata added by export.
            # Format: # Title\n\n\n\n---\n\nActual content
            # NOTE(review): this splits on the FIRST '---' line; a document whose
            # own content starts with a horizontal rule would be truncated.
            lines = content.split('\n')
            content_start = 0
            for i, line in enumerate(lines):
                if line.strip() == '---':
                    content_start = i + 1
                    break

            if content_start > 0 and content_start < len(lines):
                return '\n'.join(lines[content_start:]).strip()
            return content
        except Exception as e:
            logger.error(f"Error reading {filepath}: {e}")
            return None

    def import_collection(
        self,
        collection_dir: Path,
        target_collection_id: Optional[str] = None,
        parent_document_id: Optional[str] = None
    ) -> Tuple[int, int, int]:
        """
        Import a single collection.

        Args:
            collection_dir: Path to collection directory
            target_collection_id: Override target collection (for single mode)
            parent_document_id: Parent document ID (for single mode)

        Returns:
            Tuple of (created, skipped, errors)
        """
        metadata = self.load_collection_metadata(collection_dir)
        if not metadata:
            self.stats["collections_errors"] += 1
            self.errors.append({
                "type": "collection",
                "name": collection_dir.name,
                "error": "Invalid or missing metadata"
            })
            return (0, 0, 1)

        collection_name = metadata.get("name", collection_dir.name)
        documents = metadata.get("documents", [])

        # Count documents recursively (metadata may nest via "children")
        def count_docs(docs):
            count = 0
            for doc in docs:
                count += 1
                count += count_docs(doc.get("children", []))
            return count

        doc_count = count_docs(documents)

        # Determine collection ID
        collection_id = target_collection_id
        if not collection_id:
            # Check if collection exists
            if collection_name in self.existing_collections:
                if self.force:
                    logger.info(f" Deleting existing collection \"{collection_name}\"...")
                    if not self.dry_run:
                        self._delete_collection(self.existing_collections[collection_name])
                        del self.existing_collections[collection_name]
                else:
                    logger.info(f" Collection exists, skipping...")
                    self.stats["collections_skipped"] += 1
                    return (0, doc_count, 0)

            # Create collection
            logger.info(f" Creating collection...")
            collection_id = self._create_collection(collection_name)
            if not collection_id:
                self.stats["collections_errors"] += 1
                self.errors.append({
                    "type": "collection",
                    "name": collection_name,
                    "error": "Failed to create collection"
                })
                return (0, 0, 1)

            if not self.dry_run:
                logger.info(f" ✓ (id: {collection_id[:8]}...)")
            # Count only collections created HERE; in single mode the shared
            # target collection is counted once by import_all, not per folder.
            self.stats["collections_created"] += 1

        # Build document tree and flatten for import
        doc_tree = self.build_document_tree(documents)
        import_order = self.flatten_for_import(doc_tree)

        # Import documents
        created = 0
        skipped = 0
        errors = 0

        for doc_meta in import_order:
            old_id = doc_meta["id"]
            title = doc_meta["title"]
            filename = doc_meta["filename"]
            old_parent_id = doc_meta.get("parent_id")

            # Resolve parent ID
            new_parent_id = parent_document_id  # Default for single mode
            if old_parent_id:
                new_parent_id = self.id_map.get(old_parent_id)
                if not new_parent_id and not self.dry_run:
                    logger.warning(f"Parent not found for {title}, creating as root-level")

            # Read content
            content = self.read_document_content(collection_dir, filename)
            if content is None:
                self._print_doc_status(title, "error", "file not found")
                errors += 1
                self.stats["documents_errors"] += 1
                self.errors.append({
                    "type": "document",
                    "title": title,
                    "collection": collection_name,
                    "error": "File not found"
                })
                continue

            # Create document
            new_id = self._create_document(
                collection_id,
                title,
                content,
                parent_document_id=new_parent_id
            )

            if new_id:
                self.id_map[old_id] = new_id
                self._print_doc_status(title, "created")
                created += 1
                self.stats["documents_created"] += 1
            else:
                self._print_doc_status(title, "error", "API error")
                errors += 1
                self.stats["documents_errors"] += 1
                self.errors.append({
                    "type": "document",
                    "title": title,
                    "collection": collection_name,
                    "error": "API error during creation"
                })

        return (created, skipped, errors)

    def _print_doc_status(self, title: str, status: str, message: str = None):
        """Print document import status."""
        if status == "created":
            symbol = "✓"
            label = "created"
        elif status == "skipped":
            symbol = "○"
            label = "skipped"
        else:
            symbol = "✗"
            label = message or "error"

        # This will be enhanced in Phase 6 with tree formatting
        logger.info(f" {symbol} {title[:50]:<50} {label}")

    def import_all(self) -> None:
        """Import all collections from source directory."""
        start_time = time.time()

        # Print header
        mode_str = "Single collection" if self.single_mode else "Collection per folder"
        dry_run_str = " (DRY RUN)" if self.dry_run else ""

        print("=" * 60)
        print(f" OUTLINE IMPORT{dry_run_str}")
        print("=" * 60)
        print()
        print(f"Source: {self.source_dir}/")
        print(f"Target: {self.base_url}")
        print(f"Mode: {mode_str}")
        print()

        if self.dry_run:
            print("[DRY RUN] No changes will be made")
            print()

        # Health check
        if not self.health_check():
            logger.error("Import aborted due to failed health check")
            return

        print()

        # Get existing collections
        self._get_collections()

        # Get source collections
        source_collections = self.get_source_collections()
        if not source_collections:
            logger.error("No collections found in source directory")
            return

        if self.single_mode:
            # Single collection mode: everything goes under one timestamped
            # collection, with one parent document per source folder.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            single_collection_name = f"import_{timestamp}"
            logger.info(f"Creating single collection: {single_collection_name}")

            collection_id = self._create_collection(single_collection_name)
            if not collection_id and not self.dry_run:
                logger.error("Failed to create import collection")
                return
            self.stats["collections_created"] += 1

            for collection_dir in source_collections:
                metadata = self.load_collection_metadata(collection_dir)
                if not metadata:
                    continue

                collection_name = metadata.get("name", collection_dir.name)
                doc_count = metadata.get("expected_count", 0)
                print(f"\n{collection_name}/ ({doc_count} documents)")

                # Create parent document for this "collection"
                parent_doc_id = self._create_document(
                    collection_id,
                    collection_name,
                    f"# {collection_name}\n\nImported collection.",
                    parent_document_id=None
                )
                if parent_doc_id:
                    self.stats["documents_created"] += 1

                    # Import documents under this parent
                    self.import_collection(
                        collection_dir,
                        target_collection_id=collection_id,
                        parent_document_id=parent_doc_id
                    )
        else:
            # Standard mode: one collection per folder
            for collection_dir in source_collections:
                metadata = self.load_collection_metadata(collection_dir)
                if not metadata:
                    continue

                collection_name = metadata.get("name", collection_dir.name)
                doc_count = metadata.get("expected_count", 0)
                print(f"\n{collection_name}/ ({doc_count} documents)")

                self.import_collection(collection_dir)

        # Print summary
        duration = time.time() - start_time
        print()
        print("=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f" Collections: {self.stats['collections_created']} created, "
              f"{self.stats['collections_skipped']} skipped, "
              f"{self.stats['collections_errors']} errors")
        print(f" Documents: {self.stats['documents_created']} created, "
              f"{self.stats['documents_skipped']} skipped, "
              f"{self.stats['documents_errors']} errors")
        print(f" Duration: {duration:.1f} seconds")
        print("=" * 60)

        if self.errors:
            print()
            logger.warning(f"Encountered {len(self.errors)} errors during import")


def load_settings(settings_file: str = "settings.json") -> Dict:
    """Load settings from JSON file; exits the process on missing/invalid file."""
    try:
        with open(settings_file, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        logger.error(f"Settings file not found: {settings_file}")
        logger.error("Create a settings.json file with your configuration")
        sys.exit(1)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in settings file: {e}")
        sys.exit(1)


def parse_args() -> argparse.Namespace:
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Import markdown files into Outline wiki",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s              Import all collections from outline_export/
  %(prog)s --dry-run    Preview what would be imported
  %(prog)s --single     Import all into a single timestamped collection
  %(prog)s -d backup/   Import from custom directory
  %(prog)s --force      Overwrite existing collections
"""
    )
    parser.add_argument(
        '-s', '--single',
        action='store_true',
        help='Import all into single timestamped collection'
    )
    parser.add_argument(
        '-n', '--dry-run',
        action='store_true',
        help='Preview operations without making changes'
    )
    parser.add_argument(
        '-d', '--source',
        default=None,
        help='Source directory (default: outline_export)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='Increase verbosity (use -vv for debug)'
    )
    parser.add_argument(
        '-f', '--force',
        action='store_true',
        help='Overwrite existing collections (instead of skip)'
    )
    parser.add_argument(
        '--settings',
        default='settings.json',
        help='Path to settings file (default: settings.json)'
    )
    return parser.parse_args()


def main() -> None:
    """Main entry point."""
    args = parse_args()

    # Set log level based on verbosity
    if args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif args.verbose == 1:
        logger.setLevel(logging.INFO)

    # Load settings
    settings = load_settings(args.settings)
    source = settings.get("source", {})
    import_config = settings.get("import", {})
    advanced = settings.get("advanced", {})

    # Validate required settings
    if not source.get("url") or not source.get("token"):
        logger.error("Missing required settings: source.url and source.token")
        sys.exit(1)

    # Determine source directory (CLI flag overrides settings file)
    source_dir = args.source or import_config.get("source_directory", "outline_export")

    # Create importer
    importer = OutlineImporter(
        base_url=source["url"],
        api_token=source["token"],
        source_dir=source_dir,
        dry_run=args.dry_run,
        single_mode=args.single,
        force=args.force,
        on_collection_exists=import_config.get("on_collection_exists", "skip"),
        on_document_exists=import_config.get("on_document_exists", "skip"),
        default_permission=import_config.get("default_permission", "read_write"),
        request_timeout=advanced.get("request_timeout", 30),
        retry_attempts=advanced.get("retry_attempts", 3),
        retry_delay=advanced.get("retry_delay", 1.0),
        rate_limit_delay=advanced.get("rate_limit_delay", 0.1)
    )

    # Run import
    try:
        importer.import_all()
    except KeyboardInterrupt:
        logger.warning("Import cancelled by user")
        sys.exit(1)
    except Exception as e:
        logger.exception(f"Import failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()