fix: recover stale jobs on worker restart, persist active page on reload
- Worker resets running/cancelling jobs to idle on startup, so jobs no longer stay stuck after a Docker restart.
- Frontend saves the current page to localStorage, so a reload returns to the last visited page instead of always showing the dashboard.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+42
-3
@@ -1,12 +1,51 @@
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
# Install imapsync and dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
imapsync \
|
||||
wget \
|
||||
git \
|
||||
lsb-release \
|
||||
libauthen-ntlm-perl \
|
||||
libdist-checkconflicts-perl \
|
||||
libpar-packer-perl \
|
||||
libtest-requires-perl \
|
||||
libtest-fatal-perl \
|
||||
libtest-mock-guard-perl \
|
||||
libcgi-pm-perl \
|
||||
libcrypt-openssl-rsa-perl \
|
||||
libdata-uniqid-perl \
|
||||
libencode-imaputf7-perl \
|
||||
libfile-copy-recursive-perl \
|
||||
libfile-tail-perl \
|
||||
libio-socket-inet6-perl \
|
||||
libio-socket-ssl-perl \
|
||||
libio-tee-perl \
|
||||
libhtml-parser-perl \
|
||||
libjson-webtoken-perl \
|
||||
libmail-imapclient-perl \
|
||||
libparse-recdescent-perl \
|
||||
libmodule-scandeps-perl \
|
||||
libreadonly-perl \
|
||||
libregexp-common-perl \
|
||||
libsys-meminfo-perl \
|
||||
libterm-readkey-perl \
|
||||
libtest-mockobject-perl \
|
||||
libtest-pod-perl \
|
||||
libunicode-string-perl \
|
||||
liburi-perl \
|
||||
libwww-perl \
|
||||
libtest-nowarnings-perl \
|
||||
libtest-deep-perl \
|
||||
libtest-warn-perl \
|
||||
make \
|
||||
cpanminus \
|
||||
gcc \
|
||||
python3 \
|
||||
python3-pip \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN wget -O /usr/local/bin/imapsync https://imapsync.lamiral.info/imapsync && \
|
||||
chmod +x /usr/local/bin/imapsync
|
||||
|
||||
WORKDIR /app
|
||||
COPY worker.py .
|
||||
|
||||
|
||||
+82
-24
@@ -36,7 +36,6 @@ def get_db():
|
||||
|
||||
|
||||
def parse_imapsync_output(text: str) -> dict:
|
||||
"""Extract stats from imapsync stdout/stderr."""
|
||||
stats = {"messages_synced": 0, "messages_skipped": 0, "errors": 0}
|
||||
m = re.search(r"Messages transferred:\s+(\d+)", text)
|
||||
if m:
|
||||
@@ -44,17 +43,15 @@ def parse_imapsync_output(text: str) -> dict:
|
||||
m = re.search(r"Messages skipped:\s+(\d+)", text)
|
||||
if m:
|
||||
stats["messages_skipped"] = int(m.group(1))
|
||||
# Count error lines
|
||||
stats["errors"] = len(re.findall(r"(?i)^\s*(error|err)\b", text, re.MULTILINE))
|
||||
return stats
|
||||
|
||||
|
||||
def check_due_schedules():
|
||||
"""Queue jobs whose cron schedule is due (within last POLL_INTERVAL seconds)."""
|
||||
try:
|
||||
from croniter import croniter
|
||||
except ImportError:
|
||||
return # croniter not installed in this image, skip
|
||||
return
|
||||
|
||||
conn = get_db()
|
||||
try:
|
||||
@@ -70,7 +67,6 @@ def check_due_schedules():
|
||||
cron = croniter(row["schedule"])
|
||||
last_run = datetime.fromisoformat(row["last_run"]) if row["last_run"] else datetime(2000, 1, 1)
|
||||
prev_due = cron.get_prev(datetime)
|
||||
# If last scheduled run is after last actual run, queue it
|
||||
if prev_due > last_run:
|
||||
conn.execute(
|
||||
"UPDATE sync_jobs SET status='queued' WHERE id=?",
|
||||
@@ -103,7 +99,6 @@ def run_job(job: sqlite3.Row):
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
# Build imapsync command
|
||||
ssl1 = "--ssl1" if job["src_ssl"] else "--nossl1"
|
||||
ssl2 = "--ssl2" if job["dst_ssl"] else "--nossl2"
|
||||
cmd = [
|
||||
@@ -126,27 +121,44 @@ def run_job(job: sqlite3.Row):
|
||||
started = time.time()
|
||||
exit_code = 0
|
||||
output = ""
|
||||
cancelled = False
|
||||
|
||||
try:
|
||||
with open(log_path, "w") as lf:
|
||||
lf.write(f"# ImapSync Job: {job['name']}\n")
|
||||
lf.write(f"# Started: {datetime.utcnow().isoformat()}\n")
|
||||
lf.write(f"# Command: {' '.join(cmd[:20])}...\n\n")
|
||||
result = subprocess.run(
|
||||
lf.flush()
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=lf,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=7200, # 2h max
|
||||
)
|
||||
exit_code = result.returncode
|
||||
while proc.poll() is None:
|
||||
conn2 = get_db()
|
||||
row = conn2.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
|
||||
conn2.close()
|
||||
if row and row["status"] == "cancelling":
|
||||
log.info(f"Job {job_id} cancel requested, terminating process")
|
||||
proc.terminate()
|
||||
try:
|
||||
proc.wait(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
cancelled = True
|
||||
break
|
||||
time.sleep(3)
|
||||
elapsed = time.time() - started
|
||||
if elapsed > 7200:
|
||||
log.error(f"Job {job_id} timed out after 2h")
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
break
|
||||
if not cancelled and proc.returncode is not None:
|
||||
exit_code = proc.returncode
|
||||
with open(log_path, "r", errors="replace") as lf:
|
||||
output = lf.read()
|
||||
except subprocess.TimeoutExpired:
|
||||
log.error(f"Job {job_id} timed out after 2h")
|
||||
exit_code = -1
|
||||
output = "TIMEOUT after 2 hours"
|
||||
with open(log_path, "a") as lf:
|
||||
lf.write("\n\nTIMEOUT: Job exceeded 2 hour limit\n")
|
||||
except Exception as e:
|
||||
log.error(f"Job {job_id} exception: {e}")
|
||||
exit_code = -2
|
||||
@@ -156,20 +168,36 @@ def run_job(job: sqlite3.Row):
|
||||
|
||||
duration = int(time.time() - started)
|
||||
stats = parse_imapsync_output(output)
|
||||
job_status = "done" if exit_code == 0 else "failed"
|
||||
|
||||
conn3 = get_db()
|
||||
current_status = conn3.execute("SELECT status FROM sync_jobs WHERE id=?", (job_id,)).fetchone()
|
||||
conn3.close()
|
||||
if not cancelled and current_status and current_status["status"] == "cancelling":
|
||||
cancelled = True
|
||||
|
||||
if cancelled:
|
||||
job_status = "cancelled"
|
||||
with open(log_path, "a") as lf:
|
||||
lf.write("\n\nCANCELLED: Job was cancelled by user\n")
|
||||
elif exit_code == 0:
|
||||
job_status = "done"
|
||||
else:
|
||||
job_status = "failed"
|
||||
|
||||
conn = get_db()
|
||||
conn.execute("""
|
||||
UPDATE job_runs SET
|
||||
status=?, finished_at=?, messages_synced=?,
|
||||
messages_skipped=?, errors=?, duration_sec=?
|
||||
WHERE id=?
|
||||
WHERE id=? AND status != 'cancelled'
|
||||
""", (job_status, datetime.utcnow().isoformat(),
|
||||
stats["messages_synced"], stats["messages_skipped"],
|
||||
stats["errors"], duration, run_id))
|
||||
if job_status == "cancelled":
|
||||
conn.execute("UPDATE job_runs SET duration_sec=? WHERE id=?", (duration, run_id))
|
||||
conn.execute(
|
||||
"UPDATE sync_jobs SET status=? WHERE id=?",
|
||||
("idle", job_id)
|
||||
"UPDATE sync_jobs SET status='idle' WHERE id=?",
|
||||
(job_id,)
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -181,24 +209,54 @@ def run_job(job: sqlite3.Row):
|
||||
)
|
||||
|
||||
|
||||
def recover_stale_jobs():
    """Return jobs stuck in 'running' or 'cancelling' to the 'idle' state.

    For every sync_jobs row whose status is 'running' or 'cancelling':

    * any job_runs row of that job still marked 'running' is set to
      'failed', with finished_at stamped with the current UTC time;
    * the sync_jobs row itself is set back to 'idle';
    * a warning naming the job id is logged.

    All updates are committed together after the loop. The connection is
    closed whether or not an exception is raised.
    """
    db = get_db()
    try:
        stuck_rows = db.execute(
            "SELECT id FROM sync_jobs WHERE status IN ('running', 'cancelling')"
        ).fetchall()

        for stale_id in (entry["id"] for entry in stuck_rows):
            # Close out the run record the interrupted job left behind.
            finished_at = datetime.utcnow().isoformat()
            db.execute(
                "UPDATE job_runs SET status='failed', finished_at=? "
                "WHERE job_id=? AND status='running'",
                (finished_at, stale_id),
            )
            # Make the job itself schedulable again.
            db.execute(
                "UPDATE sync_jobs SET status='idle' WHERE id=?",
                (stale_id,),
            )
            log.warning(f"Recovered stale job {stale_id} → idle (worker restart)")

        db.commit()
    finally:
        db.close()
|
||||
|
||||
|
||||
def main():
|
||||
log.info(f"Worker started. DB={DB_PATH} LOG_DIR={LOG_DIR} POLL={POLL_INTERVAL}s")
|
||||
# Wait for DB to be initialized by the web container
|
||||
for i in range(30):
|
||||
if os.path.exists(DB_PATH):
|
||||
break
|
||||
log.info(f"Waiting for DB... ({i+1}/30)")
|
||||
time.sleep(2)
|
||||
|
||||
recover_stale_jobs()
|
||||
|
||||
while True:
|
||||
try:
|
||||
check_due_schedules()
|
||||
|
||||
conn = get_db()
|
||||
job = conn.execute(
|
||||
"SELECT * FROM sync_jobs WHERE status='queued' AND enabled=1 "
|
||||
"ORDER BY created_at ASC LIMIT 1"
|
||||
).fetchone()
|
||||
job = conn.execute("""
|
||||
SELECT j.*,
|
||||
s1.host as src_host, s1.port as src_port, s1.ssl as src_ssl,
|
||||
s2.host as dst_host, s2.port as dst_port, s2.ssl as dst_ssl
|
||||
FROM sync_jobs j
|
||||
JOIN servers s1 ON j.src_server_id = s1.id
|
||||
JOIN servers s2 ON j.dst_server_id = s2.id
|
||||
WHERE j.status='queued' AND j.enabled=1
|
||||
ORDER BY j.created_at ASC LIMIT 1
|
||||
""").fetchone()
|
||||
conn.close()
|
||||
|
||||
if job:
|
||||
|
||||
Reference in New Issue
Block a user