fix: recover stale jobs on worker restart, persist active page on reload

- Worker resets running/cancelling jobs to idle on startup to fix jobs stuck after Docker restart
- Frontend saves current page to localStorage so reload returns to last visited page instead of always dashboard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 14:07:13 +02:00
parent f2d749ba3f
commit f600cd52f3
4 changed files with 721 additions and 205 deletions
@@ -36,7 +36,6 @@ def get_db():


 def parse_imapsync_output(text: str) -> dict:
-    """Extract stats from imapsync stdout/stderr."""
    stats = {"messages_synced": 0, "messages_skipped": 0, "errors": 0}
    m = re.search(r"Messages transferred:\s+(\d+)", text)
    if m:
@@ -44,17 +43,15 @@ def parse_imapsync_output(text: str) -> dict:
    m = re.search(r"Messages skipped:\s+(\d+)", text)
    if m:
        stats["messages_skipped"] = int(m.group(1))
-    # Count error lines
    stats["errors"] = len(re.findall(r"(?i)^\s*(error|err)\b", text, re.MULTILINE))
    return stats


 def check_due_schedules():
-    """Queue jobs whose cron schedule is due (within last POLL_INTERVAL seconds)."""
    try:
        from croniter import croniter
    except ImportError:
-        return  # croniter not installed in this image, skip
+        return

    conn = get_db()
    try:
@@ -70,7 +67,6 @@ def check_due_schedules():
                cron = croniter(row["schedule"])
                last_run = datetime.fromisoformat(row["last_run"]) if row["last_run"] else datetime(2000, 1, 1)
                prev_due = cron.get_prev(datetime)
-                # If last scheduled run is after last actual run, queue it
                if prev_due > last_run:
                    conn.execute(
                        "UPDATE sync_jobs SET status='queued' WHERE id=?",
@@ -103,7 +99,6 @@ def run_job(job: sqlite3.Row):
    conn.commit()
    conn.close()

-    # Build imapsync command
    ssl1 = "--ssl1" if job["src_ssl"] else "--nossl1"
    ssl2 = "--ssl2" if job["dst_ssl"] else "--nossl2"
    cmd = [
@@ -126,27 +121,44 @@ def run_job(job: sqlite3.Row):
    started = time.time()
    exit_code = 0
    output = ""
+    cancelled = False

    try:
        with open(log_path, "w") as lf:
            lf.write(f"# ImapSync Job: {job['name']}\n")
            lf.write(f"# Started: {datetime.utcnow().isoformat()}\n")
            lf.write(f"# Command: {' '.join(cmd[:20])}...\n\n")
-            result = subprocess.run(
+            lf.flush()
+            proc = subprocess.Popen(
                cmd,
                stdout=lf,
                stderr=subprocess.STDOUT,
-                timeout=7200,  # 2h max
            )
-            exit_code = result.returncode
+            while proc.poll() is None:
+                conn2 = get_db()
+                row = conn2.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
+                conn2.close()
+                if row and row["status"] == "cancelling":
+                    log.info(f"Job {job_id} cancel requested, terminating process")
+                    proc.terminate()
+                    try:
+                        proc.wait(timeout=10)
+                    except subprocess.TimeoutExpired:
+                        proc.kill()
+                        proc.wait()
+                    cancelled = True
+                    break
+                time.sleep(3)
+                elapsed = time.time() - started
+                if elapsed > 7200:
+                    log.error(f"Job {job_id} timed out after 2h")
+                    proc.kill()
+                    proc.wait()
+                    break
+            if not cancelled and proc.returncode is not None:
+                exit_code = proc.returncode
        with open(log_path, "r", errors="replace") as lf:
            output = lf.read()
-    except subprocess.TimeoutExpired:
-        log.error(f"Job {job_id} timed out after 2h")
-        exit_code = -1
-        output = "TIMEOUT after 2 hours"
-        with open(log_path, "a") as lf:
-            lf.write("\n\nTIMEOUT: Job exceeded 2 hour limit\n")
    except Exception as e:
        log.error(f"Job {job_id} exception: {e}")
        exit_code = -2
@@ -156,20 +168,36 @@ def run_job(job: sqlite3.Row):

    duration = int(time.time() - started)
    stats = parse_imapsync_output(output)
-    job_status = "done" if exit_code == 0 else "failed"
+
+    conn3 = get_db()
+    current_status = conn3.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
+    conn3.close()
+    if not cancelled and current_status and current_status["status"] == "cancelling":
+        cancelled = True
+
+    if cancelled:
+        job_status = "cancelled"
+        with open(log_path, "a") as lf:
+            lf.write("\n\nCANCELLED: Job was cancelled by user\n")
+    elif exit_code == 0:
+        job_status = "done"
+    else:
+        job_status = "failed"

    conn = get_db()
    conn.execute("""
        UPDATE job_runs SET
            status=?, finished_at=?, messages_synced=?,
            messages_skipped=?, errors=?, duration_sec=?
-        WHERE id=?
+        WHERE id=? AND status != 'cancelled'
    """, (job_status, datetime.utcnow().isoformat(),
          stats["messages_synced"], stats["messages_skipped"],
          stats["errors"], duration, run_id))
+    if job_status == "cancelled":
+        conn.execute("UPDATE job_runs SET duration_sec=? WHERE id=?", (duration, run_id))
    conn.execute(
-        "UPDATE sync_jobs SET status=? WHERE id=?",
-        ("idle", job_id)
+        "UPDATE sync_jobs SET status='idle' WHERE id=?",
+        (job_id,)
    )
    conn.commit()
    conn.close()
@@ -181,24 +209,54 @@ def run_job(job: sqlite3.Row):
    )


+def recover_stale_jobs():  # Reset jobs left in 'running'/'cancelling' back to 'idle'; main() calls this once before its poll loop.
+    conn = get_db()
+    try:
+        stale = conn.execute(  # Collect ids of jobs whose status says a run is (or was) in progress.
+            "SELECT id FROM sync_jobs WHERE status IN ('running', 'cancelling')"
+        ).fetchall()
+        for row in stale:
+            job_id = row["id"]
+            conn.execute(  # Close out this job's run records that are still 'running': mark them 'failed' and stamp finished_at.
+                "UPDATE job_runs SET status='failed', finished_at=? "
+                "WHERE job_id=? AND status='running'",
+                (datetime.utcnow().isoformat(), job_id)  # finished_at is the current UTC time in ISO format.
+            )
+            conn.execute(  # Return the job itself to 'idle'.
+                "UPDATE sync_jobs SET status='idle' WHERE id=?",
+                (job_id,)
+            )
+            log.warning(f"Recovered stale job {job_id} → idle (worker restart)")
+        conn.commit()  # One commit after the loop covers the updates for every stale job.
+    finally:
+        conn.close()  # Close the connection even if a query above raised.
+
+
 def main():
    log.info(f"Worker started. DB={DB_PATH} LOG_DIR={LOG_DIR} POLL={POLL_INTERVAL}s")
-    # Wait for DB to be initialized by the web container
    for i in range(30):
        if os.path.exists(DB_PATH):
            break
        log.info(f"Waiting for DB... ({i+1}/30)")
        time.sleep(2)

+    recover_stale_jobs()
+
    while True:
        try:
            check_due_schedules()

            conn = get_db()
-            job = conn.execute(
-                "SELECT * FROM sync_jobs WHERE status='queued' AND enabled=1 "
-                "ORDER BY created_at ASC LIMIT 1"
-            ).fetchone()
+            job = conn.execute("""
+                SELECT j.*,
+                       s1.host as src_host, s1.port as src_port, s1.ssl as src_ssl,
+                       s2.host as dst_host, s2.port as dst_port, s2.ssl as dst_ssl
+                FROM sync_jobs j
+                JOIN servers s1 ON j.src_server_id = s1.id
+                JOIN servers s2 ON j.dst_server_id = s2.id
+                WHERE j.status='queued' AND j.enabled=1
+                ORDER BY j.created_at ASC LIMIT 1
+            """).fetchone()
            conn.close()

            if job: