feat: implement pngx-controller with Gitea CI/CD deployment
All checks were successful
Deploy / deploy (push) Successful in 30s

- Full FastAPI sync engine: master→replica document sync via paperless REST API
- Web UI: dashboard, replicas, logs, settings (Jinja2 + HTMX + Pico CSS)
- APScheduler background sync, SSE live log stream, Prometheus metrics
- Fernet encryption for API tokens at rest
- pngx.env credential file: written on save, pre-fills forms on load
- Dockerfile with layer-cached uv build, Python healthcheck
- docker-compose with host networking for Tailscale access
- Gitea Actions workflow: version bump, secret injection, docker compose deploy

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-22 17:59:25 +01:00
parent 942482daab
commit b99dbf694d
40 changed files with 4184 additions and 0 deletions

0
app/sync/__init__.py Normal file
View File

749
app/sync/engine.py Normal file
View File

@@ -0,0 +1,749 @@
"""Core sync engine: runs the full sync cycle across all eligible replicas."""
import asyncio
import hashlib
import sqlite3
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Optional
from sqlmodel import Session, select
from ..config import get_config
from ..crypto import decrypt
from ..database import get_engine
from ..logger import emit_log
from ..models import Replica, SyncMap, SyncRun
from .. import metrics
from .paperless import PaperlessClient, PaperlessError
# Guards against overlapping sync cycles: only one _do_sync may run at a time.
_sync_lock = asyncio.Lock()


@dataclass
class SyncProgress:
    """Point-in-time snapshot of the current sync cycle, for status display."""

    running: bool = False  # True while a sync cycle is in flight
    phase: str = ""  # human-readable description of the current step
    docs_done: int = 0  # documents processed so far for the current replica
    docs_total: int = 0  # documents queued for the current replica


# Module-level singleton; _do_sync() replaces it wholesale at cycle start/end.
_progress = SyncProgress()


def get_progress() -> SyncProgress:
    """Return the live progress snapshot (shared module singleton)."""
    return _progress
# Strong references to in-flight background tasks. asyncio.create_task() only
# keeps a weak reference to its task, so without this set a running sync task
# could be garbage-collected mid-cycle.
_background_tasks: set[asyncio.Task] = set()


async def run_sync_cycle(
    triggered_by: str = "scheduler",
    replica_id: Optional[int] = None,
) -> bool:
    """Trigger a sync cycle in the background.

    Args:
        triggered_by: Label recorded on the SyncRun row (e.g. "scheduler").
        replica_id: Restrict the cycle to a single replica, or None for all.

    Returns:
        False if a cycle is already running (nothing was scheduled),
        True if a background task was started.
    """
    if _sync_lock.locked():
        return False
    task = asyncio.create_task(_do_sync(triggered_by, replica_id))
    # Hold a strong reference until the task completes, then drop it.
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return True
async def _get_settings() -> dict:
    """Return effective settings: SETTINGS_DEFAULTS overridden by DB rows.

    Rows whose value is None are ignored so the default wins.
    """
    from ..scheduler import SETTINGS_DEFAULTS
    from ..models import Setting

    with Session(get_engine()) as db:
        stored = db.exec(select(Setting)).all()
        merged = dict(SETTINGS_DEFAULTS)
        merged.update(
            {row.key: row.value for row in stored if row.value is not None}
        )
        return merged
async def _ensure_schema_parity(
master: PaperlessClient,
replica: PaperlessClient,
) -> dict:
"""Create missing tags/correspondents/document_types/custom_fields on replica.
Returns maps: master_id → replica_id for each entity type."""
master_tags = {t["name"]: t for t in await master.get_tags()}
replica_tags = {t["name"]: t for t in await replica.get_tags()}
tag_map: dict[int, int] = {}
for name, mt in master_tags.items():
rt = replica_tags.get(name) or await replica.create_tag(
name,
color=mt.get("color", ""),
is_inbox_tag=mt.get("is_inbox_tag", False),
)
tag_map[mt["id"]] = rt["id"]
master_corrs = {c["name"]: c for c in await master.get_correspondents()}
replica_corrs = {c["name"]: c for c in await replica.get_correspondents()}
corr_map: dict[int, int] = {}
for name, mc in master_corrs.items():
rc = replica_corrs.get(name) or await replica.create_correspondent(name)
corr_map[mc["id"]] = rc["id"]
master_dts = {d["name"]: d for d in await master.get_document_types()}
replica_dts = {d["name"]: d for d in await replica.get_document_types()}
dt_map: dict[int, int] = {}
for name, mdt in master_dts.items():
rdt = replica_dts.get(name) or await replica.create_document_type(name)
dt_map[mdt["id"]] = rdt["id"]
master_cfs = {cf["name"]: cf for cf in await master.get_custom_fields()}
replica_cfs = {cf["name"]: cf for cf in await replica.get_custom_fields()}
cf_map: dict[int, int] = {}
for name, mcf in master_cfs.items():
rcf = replica_cfs.get(name) or await replica.create_custom_field(
name, mcf.get("data_type", "string")
)
cf_map[mcf["id"]] = rcf["id"]
return {
"tags": tag_map,
"correspondents": corr_map,
"document_types": dt_map,
"custom_fields": cf_map,
}
def _translate_metadata(meta: dict, maps: dict) -> dict:
"""Translate master entity IDs to replica entity IDs."""
result: dict = {
"title": meta.get("title", ""),
"created": meta.get("created") or meta.get("created_date"),
"archive_serial_number": meta.get("archive_serial_number"),
}
if meta.get("correspondent") is not None:
result["correspondent"] = maps["correspondents"].get(meta["correspondent"])
if meta.get("document_type") is not None:
result["document_type"] = maps["document_types"].get(meta["document_type"])
result["tags"] = [
maps["tags"][t] for t in meta.get("tags", []) if t in maps["tags"]
]
cf_list = []
for cf_entry in meta.get("custom_fields", []):
master_cf_id = cf_entry.get("field")
if master_cf_id in maps["custom_fields"]:
cf_list.append(
{
"field": maps["custom_fields"][master_cf_id],
"value": cf_entry.get("value"),
}
)
result["custom_fields"] = cf_list
return result
def _sha256(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
async def _resolve_pending_tasks(
    replica: PaperlessClient,
    replica_obj: Replica,
    task_poll_timeout: int,
    run_id: int,
    session: Session,
) -> tuple[int, int]:
    """Resolve pending sync_map entries. Returns (resolved, failed).

    For each sync_map row in status "pending" that carries a Celery task_id,
    polls the replica's task endpoint and transitions the row to "ok" (task
    succeeded) or "error" (task failed/revoked, vanished, or pending longer
    than *task_poll_timeout* seconds). Rows whose task is still queued or
    running are left untouched for the next cycle.
    """
    pending = session.exec(
        select(SyncMap).where(
            SyncMap.replica_id == replica_obj.id,
            SyncMap.status == "pending",
            SyncMap.task_id.is_not(None),  # type: ignore[union-attr]
        )
    ).all()
    resolved = failed = 0
    now = datetime.now(timezone.utc)
    for entry in pending:
        try:
            task = await replica.get_task(entry.task_id)  # type: ignore[arg-type]
            status = task.get("status", "")
            # Age of this pending entry, measured from last_synced (which is
            # set at upload time for pending rows).
            age_seconds = 0
            if entry.last_synced:
                last = entry.last_synced
                if last.tzinfo is None:
                    # Naive DB timestamps were written as UTC.
                    last = last.replace(tzinfo=timezone.utc)
                age_seconds = (now - last).total_seconds()
            if not task or age_seconds > task_poll_timeout:
                # Task vanished from the replica, or has been pending too long.
                entry.status = "error"
                entry.error_msg = "task timed out"
                entry.retry_count += 1
                session.add(entry)
                emit_log(
                    "warning",
                    f"Task timed out for doc {entry.master_doc_id}",
                    replica=replica_obj.name,
                    replica_id=replica_obj.id,
                    doc_id=entry.master_doc_id,
                    run_id=run_id,
                    session=session,
                )
                failed += 1
            elif status == "SUCCESS":
                # Extract replica_doc_id from task result
                related = task.get("related_document")
                if related is not None:
                    entry.replica_doc_id = int(str(related))
                entry.task_id = None
                entry.status = "ok"
                entry.last_synced = now
                session.add(entry)
                resolved += 1
            elif status in ("FAILURE", "REVOKED"):
                entry.status = "error"
                # NOTE(review): assumes task["result"] is a string on failure;
                # a None result would raise here and fall into the except
                # below — confirm against the paperless tasks API.
                entry.error_msg = task.get("result", "task failed")[:500]
                entry.retry_count += 1
                session.add(entry)
                emit_log(
                    "warning",
                    f"Task failed for doc {entry.master_doc_id}: {entry.error_msg}",
                    replica=replica_obj.name,
                    replica_id=replica_obj.id,
                    doc_id=entry.master_doc_id,
                    run_id=run_id,
                    session=session,
                )
                failed += 1
            # else: still PENDING/STARTED — leave it
        except Exception as e:
            # Best-effort polling: a transient API error leaves the entry
            # pending and it will be re-checked next cycle.
            emit_log(
                "warning",
                f"Could not check task for doc {entry.master_doc_id}: {e}",
                replica=replica_obj.name,
                replica_id=replica_obj.id,
                run_id=run_id,
                session=session,
            )
    if pending:
        session.commit()
    return resolved, failed
async def _sync_replica(
    replica_obj: Replica,
    master: PaperlessClient,
    changed_docs: list[dict],
    settings: dict,
    run_id: int,
    engine,
) -> tuple[int, int]:
    """Sync one replica. Returns (docs_synced, docs_failed).

    Steps: (5a) mirror tags/correspondents/document-types/custom-fields onto
    the replica, (5b) resolve previously-pending upload tasks, (5c) select the
    documents this replica still needs, (5d) patch metadata for documents the
    replica already has, or upload new ones. Fresh uploads are *not* counted
    in docs_synced here — they stay "pending" until a later cycle confirms
    the consumption task succeeded.
    """
    config = get_config()
    max_concurrent = int(settings.get("max_concurrent_requests", "4"))
    task_poll_timeout = int(settings.get("task_poll_timeout_seconds", "600"))
    # API tokens are stored encrypted at rest.
    replica_token = decrypt(replica_obj.api_token, config.secret_key)
    replica_semaphore = asyncio.Semaphore(max_concurrent)
    docs_synced = docs_failed = 0
    async with PaperlessClient(
        replica_obj.url, replica_token, replica_semaphore
    ) as replica:
        with Session(engine) as session:
            # Step 5a: ensure schema parity
            _progress.phase = f"schema parity — {replica_obj.name}"
            try:
                maps = await _ensure_schema_parity(master, replica)
            except Exception as e:
                emit_log(
                    "error",
                    f"Schema parity failed: {e}",
                    replica=replica_obj.name,
                    replica_id=replica_obj.id,
                    run_id=run_id,
                    session=session,
                )
                # Without entity-ID maps nothing can be synced; abort replica.
                raise
            # Step 5b: resolve pending tasks
            _progress.phase = f"resolving tasks — {replica_obj.name}"
            await _resolve_pending_tasks(
                replica, replica_obj, task_poll_timeout, run_id, session
            )
            # Step 5c: collect docs to process
            last_ts = replica_obj.last_sync_ts
            if last_ts and last_ts.tzinfo is None:
                # Naive DB timestamps are treated as UTC.
                last_ts = last_ts.replace(tzinfo=timezone.utc)
            # Keep docs modified since this replica's last sync; docs with an
            # unparseable "modified" stamp are kept defensively.
            docs_for_replica = [
                d
                for d in changed_docs
                if last_ts is None
                or _parse_dt(d.get("modified", "")) is None
                or _parse_dt(d.get("modified", "")) >= last_ts
            ]
            # Include error-status docs (capped at 50)
            error_entries = session.exec(
                select(SyncMap).where(
                    SyncMap.replica_id == replica_obj.id,
                    SyncMap.status == "error",
                )
            ).all()[:50]
            # NOTE(review): error_doc_ids is computed but never read below.
            error_doc_ids = {e.master_doc_id for e in error_entries}
            existing_ids = {d["id"] for d in docs_for_replica}
            for e in error_entries:
                if e.master_doc_id not in existing_ids:
                    # Stub entry — full metadata is re-fetched from master in 5d.
                    docs_for_replica.append({"id": e.master_doc_id, "_retry": True})
            _progress.docs_total = len(docs_for_replica)
            _progress.docs_done = 0
            _progress.phase = f"syncing {replica_obj.name}"
            # Step 5d: process each document
            for doc_stub in docs_for_replica:
                doc_id = doc_stub["id"]
                try:
                    # Fetch full metadata from master
                    meta = await master.get_document(doc_id)
                    file_bytes = await master.download_document(doc_id, original=True)
                    checksum = _sha256(file_bytes)
                    filename = meta.get("original_file_name") or f"document-{doc_id}.pdf"
                    translated = _translate_metadata(meta, maps)
                    existing = session.exec(
                        select(SyncMap).where(
                            SyncMap.replica_id == replica_obj.id,
                            SyncMap.master_doc_id == doc_id,
                        )
                    ).first()
                    if existing and existing.replica_doc_id is not None and existing.status == "ok":
                        # Update metadata on replica
                        await replica.patch_document(existing.replica_doc_id, translated)
                        existing.last_synced = datetime.now(timezone.utc)
                        existing.file_checksum = checksum
                        session.add(existing)
                        session.commit()
                        docs_synced += 1
                        emit_log(
                            "info",
                            f"Updated doc {doc_id} → replica {existing.replica_doc_id}",
                            replica=replica_obj.name,
                            replica_id=replica_obj.id,
                            doc_id=doc_id,
                            run_id=run_id,
                            session=session,
                        )
                    else:
                        # Upload new document
                        task_id = await master_post_to_replica(
                            replica, file_bytes, filename, translated
                        )
                        now = datetime.now(timezone.utc)
                        if existing:
                            # Re-upload after an earlier error/pending state.
                            existing.task_id = task_id
                            existing.status = "pending"
                            existing.replica_doc_id = None
                            existing.file_checksum = checksum
                            existing.last_synced = now
                            existing.retry_count = existing.retry_count + 1
                            session.add(existing)
                        else:
                            entry = SyncMap(
                                replica_id=replica_obj.id,
                                master_doc_id=doc_id,
                                task_id=task_id,
                                status="pending",
                                file_checksum=checksum,
                                last_synced=now,
                            )
                            session.add(entry)
                        session.commit()
                        emit_log(
                            "info",
                            f"Uploaded doc {doc_id}, task {task_id}",
                            replica=replica_obj.name,
                            replica_id=replica_obj.id,
                            doc_id=doc_id,
                            run_id=run_id,
                            session=session,
                        )
                except Exception as e:
                    docs_failed += 1
                    emit_log(
                        "error",
                        f"Failed to sync doc {doc_id}: {e}",
                        replica=replica_obj.name,
                        replica_id=replica_obj.id,
                        doc_id=doc_id,
                        run_id=run_id,
                        session=session,
                    )
                    # Mark as error in sync_map
                    existing = session.exec(
                        select(SyncMap).where(
                            SyncMap.replica_id == replica_obj.id,
                            SyncMap.master_doc_id == doc_id,
                        )
                    ).first()
                    if existing:
                        existing.status = "error"
                        existing.error_msg = str(e)[:500]
                        session.add(existing)
                        session.commit()
                _progress.docs_done += 1
                # NOTE(review): the status label reflects the cumulative
                # failure count so far, not this document's own outcome.
                metrics.docs_total.labels(
                    replica=replica_obj.name,
                    status="ok" if docs_failed == 0 else "error",
                ).inc()
    return docs_synced, docs_failed
async def master_post_to_replica(
    replica: PaperlessClient,
    file_bytes: bytes,
    filename: str,
    metadata: dict,
) -> str:
    """Upload *file_bytes* to *replica*; return the consumption task id."""
    task_id = await replica.post_document(file_bytes, filename, metadata)
    return task_id
def _parse_dt(s: str) -> datetime | None:
if not s:
return None
try:
dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt
except Exception:
return None
async def _do_sync(triggered_by: str, target_replica_id: Optional[int]) -> None:
    """Run one full sync cycle under the module-wide lock.

    Creates a SyncRun record, selects eligible replicas (enabled, not
    suspended, past their per-replica interval), fetches changed documents
    from master once, syncs each replica, then closes the run and backs up
    the DB. All failures are logged; this coroutine never raises.
    """
    global _progress
    async with _sync_lock:
        _progress = SyncProgress(running=True, phase="starting")
        metrics.sync_running.set(1)
        config = get_config()
        engine = get_engine()
        start_time = datetime.now(timezone.utc)
        run_id: Optional[int] = None
        try:
            settings = await _get_settings()
            master_url = settings.get("master_url", "")
            master_token_enc = settings.get("master_token", "")
            if not master_url or not master_token_enc:
                # Nothing to do until the operator configures the master.
                emit_log("error", "Master URL or token not configured")
                return
            master_token = decrypt(master_token_enc, config.secret_key)
            max_concurrent = int(settings.get("max_concurrent_requests", "4"))
            sync_cycle_timeout = int(settings.get("sync_cycle_timeout_seconds", "1800"))
            suspend_threshold = int(settings.get("replica_suspend_threshold", "5"))
            # Create sync_run record
            with Session(engine) as session:
                sync_run = SyncRun(
                    replica_id=target_replica_id,
                    started_at=start_time,
                    triggered_by=triggered_by,
                )
                session.add(sync_run)
                session.commit()
                session.refresh(sync_run)
                run_id = sync_run.id
            # Determine eligible replicas
            with Session(engine) as session:
                stmt = select(Replica).where(Replica.enabled == True)  # noqa: E712
                if target_replica_id:
                    stmt = stmt.where(Replica.id == target_replica_id)
                all_replicas = session.exec(stmt).all()
            now = datetime.now(timezone.utc)
            eligible: list[Replica] = []
            for r in all_replicas:
                if r.suspended_at is not None:
                    # Suspended replicas are skipped until manually resumed.
                    continue
                if r.sync_interval_seconds is not None and r.last_sync_ts:
                    last = r.last_sync_ts
                    if last.tzinfo is None:
                        last = last.replace(tzinfo=timezone.utc)
                    if (now - last).total_seconds() < r.sync_interval_seconds:
                        # Per-replica interval has not elapsed yet.
                        continue
                eligible.append(r)
            if not eligible:
                emit_log("info", "No eligible replicas for this cycle")
                _close_run(engine, run_id, 0, 0, False)
                return
            # Find min last_sync_ts for master query.
            # If ANY eligible replica has never synced, fetch ALL master docs.
            any_never_synced = any(r.last_sync_ts is None for r in eligible)
            if any_never_synced:
                modified_gte = None
            else:
                last_sync_times = [r.last_sync_ts for r in eligible]  # type: ignore[misc]
                min_ts = min(
                    (t if t.tzinfo else t.replace(tzinfo=timezone.utc))
                    for t in last_sync_times
                )
                modified_gte = min_ts.isoformat()
            master_semaphore = asyncio.Semaphore(max_concurrent)
            # Mutable out-param: _run_all_replicas writes (synced, failed)
            # into it so partial results survive a wait_for timeout.
            result_container = [0, 0]
            try:
                await asyncio.wait_for(
                    _run_all_replicas(
                        eligible=eligible,
                        master_url=master_url,
                        master_token=master_token,
                        master_semaphore=master_semaphore,
                        modified_gte=modified_gte,
                        settings=settings,
                        run_id=run_id,
                        suspend_threshold=suspend_threshold,
                        engine=engine,
                        start_time=start_time,
                        result_container=result_container,
                    ),
                    timeout=sync_cycle_timeout,
                )
            except asyncio.TimeoutError:
                emit_log(
                    "warning",
                    f"Sync cycle timed out after {sync_cycle_timeout}s",
                )
                _close_run(engine, run_id, 0, 0, True)
                return
            _close_run(engine, run_id, result_container[0], result_container[1], False)
            # Back up the controller DB only after a completed cycle.
            _do_backup(config.db_path)
        except Exception as e:
            emit_log("error", f"Sync cycle crashed: {e}")
            if run_id:
                _close_run(engine, run_id, 0, 0, False)
        finally:
            elapsed = (datetime.now(timezone.utc) - start_time).total_seconds()
            metrics.sync_duration.labels(triggered_by=triggered_by).observe(elapsed)
            metrics.sync_running.set(0)
            _progress = SyncProgress(running=False)
async def _run_all_replicas(
    *,
    eligible: list[Replica],
    master_url: str,
    master_token: str,
    master_semaphore: asyncio.Semaphore,
    modified_gte: str | None,
    settings: dict,
    run_id: int,
    suspend_threshold: int,
    engine,
    start_time: datetime,
    result_container: list,
) -> None:
    """Fetch changed docs once, then sync each replica.

    On per-replica success: reset consecutive_failures, stamp last_sync_ts
    with the cycle start time, and alert if this cycle's failure count
    crossed the alert threshold. On per-replica failure: increment
    consecutive_failures and suspend the replica once it reaches
    *suspend_threshold*. Totals are written into *result_container* so the
    caller sees them even if it times out.
    """
    _progress.phase = "fetching master documents"
    async with PaperlessClient(master_url, master_token, master_semaphore) as master:
        changed_docs = await master.get_all_documents(modified_gte=modified_gte)
    total_synced = total_failed = 0
    for replica_obj in eligible:
        _progress.phase = f"syncing {replica_obj.name}"
        try:
            # A fresh master client per replica keeps connection state simple.
            async with PaperlessClient(
                master_url, master_token, master_semaphore
            ) as master:
                synced, failed = await _sync_replica(
                    replica_obj=replica_obj,
                    master=master,
                    changed_docs=changed_docs,
                    settings=settings,
                    run_id=run_id,
                    engine=engine,
                )
            total_synced += synced
            total_failed += failed
            # Update replica success state
            with Session(engine) as session:
                r = session.get(Replica, replica_obj.id)
                if r:
                    # Stamp with cycle start so docs modified *during* the
                    # cycle are picked up again next time.
                    r.last_sync_ts = start_time
                    r.consecutive_failures = 0
                    session.add(r)
                    session.commit()
            metrics.replica_consecutive_failures.labels(replica=replica_obj.name).set(0)
            # Check alert threshold
            alert_threshold = int(settings.get("alert_error_threshold", "5"))
            if failed >= alert_threshold:
                await _send_alert(
                    replica_obj,
                    "sync_failures_threshold",
                    {"docs_synced": synced, "docs_failed": failed},
                    settings,
                    engine,
                )
        except Exception as e:
            emit_log(
                "error",
                f"Replica sync failed: {e}",
                replica=replica_obj.name,
                replica_id=replica_obj.id,
                run_id=run_id,
            )
            total_failed += 1
            with Session(engine) as session:
                r = session.get(Replica, replica_obj.id)
                if r:
                    r.consecutive_failures += 1
                    if r.consecutive_failures >= suspend_threshold:
                        r.suspended_at = datetime.now(timezone.utc)
                        emit_log(
                            "error",
                            f"Replica {r.name} suspended after {r.consecutive_failures} consecutive failures",
                            replica=r.name,
                            replica_id=r.id,
                        )
                        await _send_alert(
                            r,
                            "replica_suspended",
                            {"docs_synced": 0, "docs_failed": 1},
                            settings,
                            engine,
                        )
                    session.add(r)
                    session.commit()
            # NOTE(review): uses the detached pre-increment object, so this
            # mirrors the DB increment above rather than re-reading it.
            metrics.replica_consecutive_failures.labels(
                replica=replica_obj.name
            ).set(replica_obj.consecutive_failures + 1)
        # Update Prometheus lag
        with Session(engine) as session:
            r = session.get(Replica, replica_obj.id)
            if r and r.last_sync_ts:
                ts = r.last_sync_ts
                if ts.tzinfo is None:
                    ts = ts.replace(tzinfo=timezone.utc)
                lag = (datetime.now(timezone.utc) - ts).total_seconds()
                metrics.replica_lag.labels(replica=replica_obj.name).set(lag)
    result_container[0] = total_synced
    result_container[1] = total_failed
async def _send_alert(
    replica: Replica,
    event: str,
    run_stats: dict,
    settings: dict,
    engine,
) -> None:
    """Send an alert about *replica* to the configured target, best-effort.

    Supports "gotify" and "webhook" target types. Alerts are rate-limited
    per replica via last_alert_at and the alert_cooldown_seconds setting.
    Delivery failures are logged as warnings and never raised.
    """
    import httpx
    target_type = settings.get("alert_target_type", "")
    target_url = settings.get("alert_target_url", "")
    cooldown = int(settings.get("alert_cooldown_seconds", "3600"))
    if not target_type or not target_url:
        # Alerting not configured.
        return
    now = datetime.now(timezone.utc)
    if replica.last_alert_at:
        last = replica.last_alert_at
        if last.tzinfo is None:
            # Naive DB timestamps are treated as UTC.
            last = last.replace(tzinfo=timezone.utc)
        if (now - last).total_seconds() < cooldown:
            # Still within the cooldown window — suppress this alert.
            return
    payload = {
        "event": event,
        "replica": replica.name,
        "replica_url": replica.url,
        "consecutive_failures": replica.consecutive_failures,
        "docs_failed": run_stats.get("docs_failed", 0),
        "docs_synced": run_stats.get("docs_synced", 0),
        "timestamp": now.isoformat(),
    }
    config = get_config()
    # Alert target token is stored encrypted, like replica tokens.
    token_enc = settings.get("alert_target_token", "")
    token = decrypt(token_enc, config.secret_key) if token_enc else ""
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            if target_type == "gotify":
                await client.post(
                    f"{target_url}/message",
                    json={
                        "title": "pngx-controller alert",
                        "message": str(payload),
                        "priority": 7,
                    },
                    headers={"X-Gotify-Key": token},
                )
            elif target_type == "webhook":
                headers = {}
                if token:
                    headers["Authorization"] = token
                await client.post(target_url, json=payload, headers=headers)
        # Record delivery time so the cooldown applies to the next alert.
        with Session(engine) as session:
            r = session.get(Replica, replica.id)
            if r:
                r.last_alert_at = now
                session.add(r)
                session.commit()
    except Exception as e:
        emit_log("warning", f"Alert send failed: {e}")
def _close_run(
    engine, run_id: int, synced: int, failed: int, timed_out: bool
) -> None:
    """Finalize a SyncRun row: end timestamp, counters and timeout flag."""
    with Session(engine) as session:
        run = session.get(SyncRun, run_id)
        if run is None:
            # Row vanished (e.g. manual cleanup) — nothing to finalize.
            return
        run.finished_at = datetime.now(timezone.utc)
        run.docs_synced = synced
        run.docs_failed = failed
        run.timed_out = timed_out
        session.add(run)
        session.commit()
def _do_backup(db_path: str) -> None:
"""Copy DB to .bak file after a successful sync run."""
import os
bak_path = db_path + ".bak"
try:
import sqlite3 as _sqlite3
src = _sqlite3.connect(db_path)
dst = _sqlite3.connect(bak_path)
src.backup(dst)
dst.close()
src.close()
except Exception as e:
emit_log("warning", f"DB backup failed: {e}")

209
app/sync/paperless.py Normal file
View File

@@ -0,0 +1,209 @@
"""Paperless-ngx REST API client with retry/backoff and semaphore throttling."""
import asyncio
import time
from typing import Any
import httpx
class PaperlessError(Exception):
    """Raised when a paperless-ngx API request ultimately fails
    (client error, or server/network error after all retries)."""
    pass
class PaperlessClient:
    """Async client for the paperless-ngx REST API.

    All requests are throttled through a caller-supplied semaphore, and
    transient failures (network errors, timeouts, HTTP 5xx) are retried
    with exponential backoff. Must be used as an async context manager so
    the underlying httpx client is opened and closed deterministically.
    """

    def __init__(self, url: str, token: str, semaphore: asyncio.Semaphore) -> None:
        self.base_url = url.rstrip("/")
        self.token = token
        self.semaphore = semaphore
        self._client: httpx.AsyncClient | None = None

    async def __aenter__(self) -> "PaperlessClient":
        # Token auth header is set once for the client's lifetime.
        self._client = httpx.AsyncClient(
            headers={"Authorization": f"Token {self.token}"},
            timeout=120.0,
        )
        return self

    async def __aexit__(self, *args) -> None:
        if self._client:
            await self._client.aclose()
            self._client = None

    async def _request(self, method: str, path: str, **kwargs) -> httpx.Response:
        """Issue one HTTP request with up to 3 attempts (2s/4s/8s backoff).

        Retries network errors, timeouts and HTTP 5xx. Client errors (4xx)
        raise PaperlessError immediately; exhausted retries also raise
        PaperlessError.
        """
        assert self._client is not None, "Use as async context manager"
        url = f"{self.base_url}{path}"
        delays = [2, 4, 8]
        last_exc: Exception | None = None
        for attempt in range(3):
            try:
                async with self.semaphore:
                    r = await self._client.request(method, url, **kwargs)
                    r.raise_for_status()
                    return r
            except (httpx.NetworkError, httpx.TimeoutException) as e:
                # httpx.ConnectError is a NetworkError subclass, so it is
                # covered here (the old tuple listed it redundantly).
                last_exc = e
                if attempt < 2:
                    await asyncio.sleep(delays[attempt])
            except httpx.HTTPStatusError as e:
                if e.response.status_code >= 500:
                    # Server-side failure — worth retrying.
                    last_exc = e
                    if attempt < 2:
                        await asyncio.sleep(delays[attempt])
                else:
                    # 4xx: our request is wrong; retrying won't help.
                    raise PaperlessError(
                        f"HTTP {e.response.status_code} {method} {path}: {e.response.text[:300]}"
                    ) from e
        raise PaperlessError(f"Request failed after 3 attempts: {last_exc}") from last_exc

    async def _get_all(self, path: str, params: dict | None = None) -> list[dict]:
        """Paginate through all results of a list endpoint."""
        results: list[dict] = []
        page = 1
        base_params = dict(params or {})
        base_params["page_size"] = 100
        while True:
            r = await self._request("GET", path, params={**base_params, "page": page})
            data = r.json()
            results.extend(data.get("results", []))
            if not data.get("next"):
                break
            page += 1
        return results

    # ── Documents ──────────────────────────────────────────────────────────────
    async def get_documents_page(
        self,
        page: int = 1,
        modified_gte: str | None = None,
        page_size: int = 100,
    ) -> dict:
        """Fetch one page of documents, oldest-modified first."""
        params: dict[str, Any] = {
            "ordering": "modified",
            "page_size": page_size,
            "page": page,
        }
        if modified_gte:
            params["modified__gte"] = modified_gte
        r = await self._request("GET", "/api/documents/", params=params)
        return r.json()

    async def get_all_documents(self, modified_gte: str | None = None) -> list[dict]:
        """Fetch all documents, optionally filtered by modification time."""
        docs: list[dict] = []
        page = 1
        while True:
            data = await self.get_documents_page(page=page, modified_gte=modified_gte)
            docs.extend(data.get("results", []))
            if not data.get("next"):
                break
            page += 1
        return docs

    async def get_document(self, doc_id: int) -> dict:
        """Fetch full metadata for one document."""
        r = await self._request("GET", f"/api/documents/{doc_id}/")
        return r.json()

    async def download_document(self, doc_id: int, original: bool = True) -> bytes:
        """Download a document's file content.

        The server returns the archived (post-processing) version by default
        and the originally-consumed file only when ``original=true`` is sent.
        Fix: the previous implementation never sent the parameter when
        original=True, so it always received the archived version.
        """
        params: dict[str, Any] = {}
        if original:
            params["original"] = "true"
        r = await self._request("GET", f"/api/documents/{doc_id}/download/", params=params)
        return r.content

    async def post_document(
        self, file_bytes: bytes, filename: str, metadata: dict
    ) -> str:
        """Upload a document; returns the Celery task_id UUID string.

        Only fields accepted by post_document are sent (custom_fields are
        not part of the consume form and must be patched afterwards).
        """
        form: list[tuple[str, str]] = []
        for key in ("title", "created", "archive_serial_number"):
            val = metadata.get(key)
            if val is not None:
                form.append((key, str(val)))
        if metadata.get("correspondent") is not None:
            form.append(("correspondent", str(metadata["correspondent"])))
        if metadata.get("document_type") is not None:
            form.append(("document_type", str(metadata["document_type"])))
        for tag_id in metadata.get("tags", []):
            # Repeated "tags" form fields — one per tag id.
            form.append(("tags", str(tag_id)))
        r = await self._request(
            "POST",
            "/api/documents/post_document/",
            files={"document": (filename, file_bytes, "application/octet-stream")},
            data=form,
        )
        result = r.json()
        # API returns a plain task UUID string
        if isinstance(result, str):
            return result
        # Some versions wrap it
        if isinstance(result, dict):
            return result.get("task_id", result.get("id", ""))
        return str(result)

    async def patch_document(self, doc_id: int, metadata: dict) -> dict:
        """Partially update a document's metadata."""
        r = await self._request("PATCH", f"/api/documents/{doc_id}/", json=metadata)
        return r.json()

    async def get_task(self, task_id: str) -> dict:
        """Look up a consumption task by id; {} when unknown."""
        r = await self._request("GET", "/api/tasks/", params={"task_id": task_id})
        results = r.json()
        if isinstance(results, list) and results:
            return results[0]
        return {}

    # ── Metadata entities ──────────────────────────────────────────────────────
    async def get_tags(self) -> list[dict]:
        return await self._get_all("/api/tags/")

    async def get_correspondents(self) -> list[dict]:
        return await self._get_all("/api/correspondents/")

    async def get_document_types(self) -> list[dict]:
        return await self._get_all("/api/document_types/")

    async def get_custom_fields(self) -> list[dict]:
        return await self._get_all("/api/custom_fields/")

    async def create_tag(self, name: str, **kwargs) -> dict:
        r = await self._request("POST", "/api/tags/", json={"name": name, **kwargs})
        return r.json()

    async def create_correspondent(self, name: str, **kwargs) -> dict:
        r = await self._request(
            "POST", "/api/correspondents/", json={"name": name, **kwargs}
        )
        return r.json()

    async def create_document_type(self, name: str, **kwargs) -> dict:
        r = await self._request(
            "POST", "/api/document_types/", json={"name": name, **kwargs}
        )
        return r.json()

    async def create_custom_field(self, name: str, data_type: str, **kwargs) -> dict:
        r = await self._request(
            "POST",
            "/api/custom_fields/",
            json={"name": name, "data_type": data_type, **kwargs},
        )
        return r.json()

    async def test_connection(self) -> dict:
        """Cheap health probe. Returns {ok, error, latency_ms, doc_count}."""
        t0 = time.monotonic()
        try:
            r = await self._request("GET", "/api/documents/", params={"page_size": 1})
            elapsed = int((time.monotonic() - t0) * 1000)
            data = r.json()
            return {
                "ok": True,
                "error": None,
                "latency_ms": elapsed,
                "doc_count": data.get("count", 0),
            }
        except Exception as e:
            return {"ok": False, "error": str(e), "latency_ms": 0, "doc_count": 0}

131
app/sync/reconcile.py Normal file
View File

@@ -0,0 +1,131 @@
"""Reconcile mode: match existing replica documents to master without re-uploading."""
import asyncio
from datetime import datetime, timezone
from sqlmodel import Session, select
from ..config import get_config
from ..crypto import decrypt
from ..database import get_engine
from ..logger import emit_log
from ..models import Replica, SyncMap
from .paperless import PaperlessClient
async def run_reconcile(replica_id: int) -> dict:
    """
    Match replica documents to master by ASN / (title + created_date).
    Populates sync_map without uploading files.
    Returns {matched, unmatched, errors}.

    Matching order: archive serial number first (unique per install), then
    the (lower-cased title, created date) pair. Documents already present
    in sync_map are skipped, so reconcile is safe to re-run.
    """
    config = get_config()
    engine = get_engine()
    from ..models import Setting
    from ..scheduler import SETTINGS_DEFAULTS
    with Session(engine) as session:
        settings = {s.key: s.value for s in session.exec(select(Setting)).all()}
        replica_obj = session.get(Replica, replica_id)
        if not replica_obj:
            raise ValueError(f"Replica {replica_id} not found")
    master_url = settings.get("master_url", "")
    master_token_enc = settings.get("master_token", "")
    if not master_url or not master_token_enc:
        raise ValueError("Master URL or token not configured")
    # Tokens are stored encrypted at rest.
    master_token = decrypt(master_token_enc, config.secret_key)
    replica_token = decrypt(replica_obj.api_token, config.secret_key)
    max_concurrent = int(settings.get("max_concurrent_requests") or SETTINGS_DEFAULTS["max_concurrent_requests"])
    master_sem = asyncio.Semaphore(max_concurrent)
    replica_sem = asyncio.Semaphore(max_concurrent)
    matched = unmatched = errors = 0
    async with PaperlessClient(master_url, master_token, master_sem) as master:
        async with PaperlessClient(replica_obj.url, replica_token, replica_sem) as replica:
            # Build replica index: asn → doc, (title, date) → doc
            emit_log("info", "Reconcile: indexing replica documents", replica=replica_obj.name)
            replica_docs = await replica.get_all_documents()
            asn_index: dict[int, dict] = {}
            title_date_index: dict[tuple, dict] = {}
            for doc in replica_docs:
                asn = doc.get("archive_serial_number")
                if asn is not None:
                    asn_index[int(asn)] = doc
                title = (doc.get("title", "") or "").strip().lower()
                # Only the date part (YYYY-MM-DD) is compared, not the time.
                created = str(doc.get("created") or doc.get("created_date") or "")[:10]
                if title:
                    title_date_index[(title, created)] = doc
            # Walk master documents
            emit_log("info", "Reconcile: indexing master documents", replica=replica_obj.name)
            master_docs = await master.get_all_documents()
            now = datetime.now(timezone.utc)
            with Session(engine) as session:
                for mdoc in master_docs:
                    master_id = mdoc["id"]
                    # Skip if already in sync_map
                    existing = session.exec(
                        select(SyncMap).where(
                            SyncMap.replica_id == replica_id,
                            SyncMap.master_doc_id == master_id,
                        )
                    ).first()
                    if existing:
                        continue
                    # Try to match
                    replica_match: dict | None = None
                    masn = mdoc.get("archive_serial_number")
                    if masn is not None and int(masn) in asn_index:
                        replica_match = asn_index[int(masn)]
                    else:
                        mtitle = (mdoc.get("title", "") or "").strip().lower()
                        mcreated = str(
                            mdoc.get("created") or mdoc.get("created_date") or ""
                        )[:10]
                        if mtitle:
                            replica_match = title_date_index.get((mtitle, mcreated))
                    if replica_match:
                        # Record the master file's checksum so later syncs can
                        # compare content; best-effort — match stands without it.
                        try:
                            file_bytes = await master.download_document(master_id)
                            import hashlib
                            checksum = hashlib.sha256(file_bytes).hexdigest()
                        except Exception:
                            checksum = None
                        entry = SyncMap(
                            replica_id=replica_id,
                            master_doc_id=master_id,
                            replica_doc_id=replica_match["id"],
                            status="ok",
                            file_checksum=checksum,
                            last_synced=now,
                        )
                        session.add(entry)
                        matched += 1
                    else:
                        unmatched += 1
                # Single commit for all new entries; a failure voids them all.
                try:
                    session.commit()
                except Exception as e:
                    errors += 1
                    emit_log(
                        "error",
                        f"Reconcile DB commit failed: {e}",
                        replica=replica_obj.name,
                    )
    emit_log(
        "info",
        f"Reconcile complete: {matched} matched, {unmatched} unmatched, {errors} errors",
        replica=replica_obj.name,
    )
    return {"matched": matched, "unmatched": unmatched, "errors": errors}