fix: recover stale jobs on worker restart, persist active page on reload

- Worker resets running/cancelling jobs to idle on startup to fix jobs stuck after Docker restart
- Frontend saves the current page to localStorage so a reload returns to the last visited page instead of always opening the dashboard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sebastian Serfling
2026-04-22 14:07:13 +02:00
parent f2d749ba3f
commit f600cd52f3
4 changed files with 721 additions and 205 deletions
+82 -24
View File
@@ -36,7 +36,6 @@ def get_db():
def parse_imapsync_output(text: str) -> dict:
"""Extract stats from imapsync stdout/stderr."""
stats = {"messages_synced": 0, "messages_skipped": 0, "errors": 0}
m = re.search(r"Messages transferred:\s+(\d+)", text)
if m:
@@ -44,17 +43,15 @@ def parse_imapsync_output(text: str) -> dict:
m = re.search(r"Messages skipped:\s+(\d+)", text)
if m:
stats["messages_skipped"] = int(m.group(1))
# Count error lines
stats["errors"] = len(re.findall(r"(?i)^\s*(error|err)\b", text, re.MULTILINE))
return stats
def check_due_schedules():
"""Queue jobs whose cron schedule is due (within last POLL_INTERVAL seconds)."""
try:
from croniter import croniter
except ImportError:
return # croniter not installed in this image, skip
return
conn = get_db()
try:
@@ -70,7 +67,6 @@ def check_due_schedules():
cron = croniter(row["schedule"])
last_run = datetime.fromisoformat(row["last_run"]) if row["last_run"] else datetime(2000, 1, 1)
prev_due = cron.get_prev(datetime)
# If last scheduled run is after last actual run, queue it
if prev_due > last_run:
conn.execute(
"UPDATE sync_jobs SET status='queued' WHERE id=?",
@@ -103,7 +99,6 @@ def run_job(job: sqlite3.Row):
conn.commit()
conn.close()
# Build imapsync command
ssl1 = "--ssl1" if job["src_ssl"] else "--nossl1"
ssl2 = "--ssl2" if job["dst_ssl"] else "--nossl2"
cmd = [
@@ -126,27 +121,44 @@ def run_job(job: sqlite3.Row):
started = time.time()
exit_code = 0
output = ""
cancelled = False
try:
with open(log_path, "w") as lf:
lf.write(f"# ImapSync Job: {job['name']}\n")
lf.write(f"# Started: {datetime.utcnow().isoformat()}\n")
lf.write(f"# Command: {' '.join(cmd[:20])}...\n\n")
result = subprocess.run(
lf.flush()
proc = subprocess.Popen(
cmd,
stdout=lf,
stderr=subprocess.STDOUT,
timeout=7200, # 2h max
)
exit_code = result.returncode
while proc.poll() is None:
conn2 = get_db()
row = conn2.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
conn2.close()
if row and row["status"] == "cancelling":
log.info(f"Job {job_id} cancel requested, terminating process")
proc.terminate()
try:
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait()
cancelled = True
break
time.sleep(3)
elapsed = time.time() - started
if elapsed > 7200:
log.error(f"Job {job_id} timed out after 2h")
proc.kill()
proc.wait()
break
if not cancelled and proc.returncode is not None:
exit_code = proc.returncode
with open(log_path, "r", errors="replace") as lf:
output = lf.read()
except subprocess.TimeoutExpired:
log.error(f"Job {job_id} timed out after 2h")
exit_code = -1
output = "TIMEOUT after 2 hours"
with open(log_path, "a") as lf:
lf.write("\n\nTIMEOUT: Job exceeded 2 hour limit\n")
except Exception as e:
log.error(f"Job {job_id} exception: {e}")
exit_code = -2
@@ -156,20 +168,36 @@ def run_job(job: sqlite3.Row):
duration = int(time.time() - started)
stats = parse_imapsync_output(output)
job_status = "done" if exit_code == 0 else "failed"
conn3 = get_db()
current_status = conn3.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
conn3.close()
if not cancelled and current_status and current_status["status"] == "cancelling":
cancelled = True
if cancelled:
job_status = "cancelled"
with open(log_path, "a") as lf:
lf.write("\n\nCANCELLED: Job was cancelled by user\n")
elif exit_code == 0:
job_status = "done"
else:
job_status = "failed"
conn = get_db()
conn.execute("""
UPDATE job_runs SET
status=?, finished_at=?, messages_synced=?,
messages_skipped=?, errors=?, duration_sec=?
WHERE id=?
WHERE id=? AND status != 'cancelled'
""", (job_status, datetime.utcnow().isoformat(),
stats["messages_synced"], stats["messages_skipped"],
stats["errors"], duration, run_id))
if job_status == "cancelled":
conn.execute("UPDATE job_runs SET duration_sec=? WHERE id=?", (duration, run_id))
conn.execute(
"UPDATE sync_jobs SET status=? WHERE id=?",
("idle", job_id)
"UPDATE sync_jobs SET status='idle' WHERE id=?",
(job_id,)
)
conn.commit()
conn.close()
@@ -181,24 +209,54 @@ def run_job(job: sqlite3.Row):
)
def recover_stale_jobs():
    """Return jobs left in 'running' or 'cancelling' state to 'idle'.

    For each such job, its job_runs rows still marked 'running' are set to
    'failed' with the current UTC time as finished_at, the sync_jobs row is
    set to 'idle', and a warning is logged. The database connection is
    closed whether or not an error occurs.
    """
    fail_runs_sql = (
        "UPDATE job_runs SET status='failed', finished_at=? "
        "WHERE job_id=? AND status='running'"
    )
    reset_job_sql = "UPDATE sync_jobs SET status='idle' WHERE id=?"

    db = get_db()
    try:
        stuck_rows = db.execute(
            "SELECT id FROM sync_jobs WHERE status IN ('running', 'cancelling')"
        ).fetchall()
        for record in stuck_rows:
            job_id = record["id"]
            finished_at = datetime.utcnow().isoformat()
            db.execute(fail_runs_sql, (finished_at, job_id))
            db.execute(reset_job_sql, (job_id,))
            log.warning(f"Recovered stale job {job_id} → idle (worker restart)")
        # NOTE(review): a single commit after the loop is assumed; the
        # listing this was taken from had lost its indentation — confirm.
        db.commit()
    finally:
        db.close()
def main():
log.info(f"Worker started. DB={DB_PATH} LOG_DIR={LOG_DIR} POLL={POLL_INTERVAL}s")
# Wait for DB to be initialized by the web container
for i in range(30):
if os.path.exists(DB_PATH):
break
log.info(f"Waiting for DB... ({i+1}/30)")
time.sleep(2)
recover_stale_jobs()
while True:
try:
check_due_schedules()
conn = get_db()
job = conn.execute(
"SELECT * FROM sync_jobs WHERE status='queued' AND enabled=1 "
"ORDER BY created_at ASC LIMIT 1"
).fetchone()
job = conn.execute("""
SELECT j.*,
s1.host as src_host, s1.port as src_port, s1.ssl as src_ssl,
s2.host as dst_host, s2.port as dst_port, s2.ssl as dst_ssl
FROM sync_jobs j
JOIN servers s1 ON j.src_server_id = s1.id
JOIN servers s2 ON j.dst_server_id = s2.id
WHERE j.status='queued' AND j.enabled=1
ORDER BY j.created_at ASC LIMIT 1
""").fetchone()
conn.close()
if job: