docs(offload): document persistence and fix status script orchestrator

This commit is contained in:
mkorwel
2026-03-17 13:03:40 -07:00
parent bc9f0fbbd8
commit e937681cd3
2 changed files with 61 additions and 52 deletions

View File

@@ -64,7 +64,21 @@ The offload system uses a **Hybrid VM + Docker** architecture designed for maxim
### Getting Started (Onboarding)
For a complete guide on setting up your remote environment, see the [Maintainer Onboarding Guide](../../../MAINTAINER_ONBOARDING.md).
### Technical details
### Persistence and Job Recovery
The offload system is designed for high reliability and persistence. Jobs use a nested execution model to ensure they continue running even if your local terminal is closed or the connection is lost.
#### How it Works
1. **Host-Level Persistence**: The orchestrator launches each job in a named **`tmux`** session on the remote VM.
2. **Container Isolation**: The actual work is performed inside the persistent `maintainer-worker` Docker container.
#### Re-attaching to a Job
If you lose your connection, you can easily resume your session:
- **Automatic**: Simply run the exact same command you started with (e.g., `npm run offload 123 review`). The system will automatically detect the existing session and re-attach you.
- **Manual**: Run `npm run offload:status` to find the session name, connect to the VM with `ssh gcli-worker`, then run `tmux attach -t <session>` to resume.
## Technical details
This skill uses a **Worker Provider** abstraction (`GceCosProvider`) to manage the remote lifecycle. It uses an isolated Gemini profile on the remote host (`~/.offload/gemini-cli-config`) to ensure that verification tasks do not interfere with your primary configuration.

View File

@@ -1,66 +1,61 @@
/**
* Offload Status Inspector (Remote)
* Offload Status Inspector (Local)
*
* Scans tmux sessions (host) and logs (container) to provide job status.
* Orchestrates remote status retrieval via the WorkerProvider.
*/
import { spawnSync } from 'child_process';
import fs from 'fs';
import path from 'path';
import os from 'os';
import fs from 'fs';
import { fileURLToPath } from 'url';
import { ProviderFactory } from './providers/ProviderFactory.ts';
const WORKTREE_BASE = '/home/node/dev/worktrees';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const REPO_ROOT = path.resolve(__dirname, '../../../..');
function getStatus() {
console.log('\n🛰 Offload Mission Control Status (Container Mode):');
console.log(''.padEnd(100, '-'));
console.log(`${'JOB ID'.padEnd(10)} | ${'ACTION'.padEnd(10)} | ${'STATE'.padEnd(12)} | ${'SESSION'.padEnd(25)}`);
console.log(''.padEnd(100, '-'));
// 1. Get active tmux sessions on the HOST
const tmux = spawnSync('tmux', ['ls', '-F', '#{session_name}']);
const activeSessions = tmux.stdout.toString().split('\n').filter(s => s.startsWith('offload-'));
// 2. Scan worktrees inside the CONTAINER
const findJobs = spawnSync('docker', ['exec', 'maintainer-worker', 'ls', WORKTREE_BASE], { stdio: 'pipe' });
const jobs = findJobs.stdout.toString().split('\n').filter(d => d.startsWith('offload-'));
if (jobs.length === 0 && activeSessions.length === 0) {
console.log(' No jobs found.');
return;
async function runStatus(env: NodeJS.ProcessEnv = process.env) {
const settingsPath = path.join(REPO_ROOT, '.gemini/offload/settings.json');
if (!fs.existsSync(settingsPath)) {
console.error('❌ Settings not found. Run "npm run offload:setup" first.');
return 1;
}
const settings = JSON.parse(fs.readFileSync(settingsPath, 'utf8'));
const config = settings.deepReview;
if (!config) {
console.error('❌ Deep Review configuration not found.');
return 1;
}
const allJobIds = Array.from(new Set([...jobs, ...activeSessions]));
const { projectId, zone } = config;
const targetVM = `gcli-offload-${env.USER || 'mattkorwel'}`;
const provider = ProviderFactory.getProvider({ projectId, zone, instanceName: targetVM });
allJobIds.forEach(id => {
if (!id) return;
const parts = id.split('-'); // offload-123-review
const pr = parts[1] || '???';
const action = parts[2] || '???';
console.log(`\n🛰 Offload Mission Control: ${targetVM}`);
console.log(`--------------------------------------------------------------------------------`);
const status = await provider.getStatus();
console.log(` - VM State: ${status.status}`);
console.log(` - Internal IP: ${status.internalIp || 'N/A'}`);
if (status.status === 'RUNNING') {
console.log(`\n🧵 Active Sessions (tmux):`);
// We fetch the list of sessions from the host
const tmuxRes = await provider.getExecOutput('tmux list-sessions -F "#S" 2>/dev/null');
let state = '💤 IDLE';
if (activeSessions.includes(id)) {
state = '🏃 RUNNING';
if (tmuxRes.status === 0 && tmuxRes.stdout.trim()) {
const sessions = tmuxRes.stdout.trim().split('\n');
sessions.forEach(s => {
if (s.startsWith('offload-')) {
console.log(`${s}`);
} else {
console.log(` 🔹 ${s} (Non-offload)`);
}
});
} else {
// Check logs inside the container
const logCheck = spawnSync('docker', ['exec', 'maintainer-worker', 'sh', '-c', `ls ${WORKTREE_BASE}/${id}/.gemini/logs/*.log 2>/dev/null | tail -n 1`], { stdio: 'pipe' });
const lastLogFile = logCheck.stdout.toString().trim();
if (lastLogFile) {
const logContent = spawnSync('docker', ['exec', 'maintainer-worker', 'cat', lastLogFile], { stdio: 'pipe' }).stdout.toString();
if (logContent.includes('SUCCESS')) state = '✅ SUCCESS';
else if (logContent.includes('FAILED')) state = '❌ FAILED';
else state = '🏁 FINISHED';
}
console.log(' - No active sessions');
}
}
console.log(`${pr.padEnd(10)} | ${action.padEnd(10)} | ${state.padEnd(12)} | ${id.padEnd(25)}`);
if (state === '🏃 RUNNING') {
console.log(` ├─ Attach: npm run offload:attach ${pr} ${action} [--local]`);
console.log(` ├─ Logs: npm run offload:logs ${pr} ${action}`);
}
console.log(` └─ Remove: npm run offload:remove ${pr} ${action}`);
});
console.log(''.padEnd(100, '-'));
console.log(`--------------------------------------------------------------------------------\n`);
return 0;
}
getStatus();
runStatus().catch(console.error);