diff --git a/.gemini/skills/offload/NETWORK_RESEARCH.md b/.gemini/skills/offload/NETWORK_RESEARCH.md new file mode 100644 index 0000000000..e834461c57 --- /dev/null +++ b/.gemini/skills/offload/NETWORK_RESEARCH.md @@ -0,0 +1,49 @@ +# Network Architecture & Troubleshooting Research + +This document captures the empirical research and final configuration settled upon for the Gemini CLI Offload system, specifically addressing the challenges of connecting from corporate environments to private GCP workers. + +## 🔍 The Challenge +The goal was to achieve **Direct internal SSH** access to GCE workers that have **no public IP addresses**, allowing for high-performance file synchronization (`rsync`) and interactive sessions without the overhead of `gcloud` wrappers. + +## 🧪 What Was Tested + +### 1. Standard Internal DNS (`<instance>.<zone>.c.<project>.internal`) +- **Result**: ❌ FAILED +- **Observation**: Standard GCE internal DNS suffixes often fail to resolve or route correctly from local workstations in certain corporate environments, even when VPN/Peering is active. + +### 2. IAP Tunneling (`gcloud compute ssh --tunnel-through-iap`) +- **Result**: ⚠️ INCONSISTENT +- **Observation**: While IAP is the standard fallback for private VMs, it failed with "failed to connect to backend" (4003) when the underlying VPC network lacked proper configuration or when firewall rules were misaligned with the specific network interface. + +### 3. Custom "Auto" Networks +- **Result**: ❌ FAILED +- **Observation**: Creating a fresh VPC with default "auto" settings was insufficient. The "magic" corporate routing paths did not automatically extend to these new, isolated networks. + +## ✅ The Final State (The "Magic" Configuration) + +Through comparison with the working `gemini-cli-team-quota` project and empirical testing in a sandbox, we settled on the following requirements: + +### 1. 
Hostname Construction +The system **MUST** use the following specific hostname pattern for direct internal reachability: +`nic0.<instance>.<zone>.c.<project>.internal.gcpnode.com` + +### 2. VPC Configuration +The VPC (e.g., `iap-vpc`) must be a **Custom Mode** network with the following properties: +- **Private Google Access**: MUST be enabled on the subnetwork. This allows the private VM to communicate with Google services (like Artifact Registry) without a public IP. +- **Firewall Rule**: An ingress rule allowing `tcp:22` from `0.0.0.0/0`. + - *Note*: While `0.0.0.0/0` seems broad, in this context it is typically restricted by the corporate-level gateway/peering that provides the `internal.gcpnode.com` route. + +### 3. Worker Provider Abstraction +To manage this complexity, we implemented a `WorkerProvider` architecture: +- **`BaseProvider`**: Defines a common interface for `exec`, `sync`, and `provision`. +- **`GceCosProvider`**: Encapsulates the GCE-specific "magic" (hostname construction, IAP fallbacks, COS startup scripts). + +## 🛠️ Why This Works +This configuration aligns with the **Google Corporate Direct-Access** pattern. By using the `nic0` prefix and the `.gcpnode.com` suffix, the connection is routed through internal corporate proxies that recognize the authenticated developer identity and permit the direct SSH handshake to the private IP. + +## 📜 Technical Metadata Summary +- **Network**: `iap-vpc` (Custom) +- **Subnet**: `iap-subnet` (Private Google Access: Enabled) +- **Identity**: OS Login (`enable-oslogin=TRUE`) +- **Image**: Container-Optimized OS (COS) +- **Connectivity**: Direct SSH via `nic0` -> Automatic Fallback to IAP. 
diff --git a/.gemini/skills/offload/scripts/providers/GceCosProvider.ts b/.gemini/skills/offload/scripts/providers/GceCosProvider.ts index 49d1187068..b23f918fb1 100644 --- a/.gemini/skills/offload/scripts/providers/GceCosProvider.ts +++ b/.gemini/skills/offload/scripts/providers/GceCosProvider.ts @@ -51,7 +51,7 @@ export class GceCosProvider implements WorkerProvider { '--boot-disk-size', '200GB', '--boot-disk-type', 'pd-balanced', '--metadata', `startup-script=${startupScript},enable-oslogin=TRUE`, - '--network-interface', 'network-tier=PREMIUM,no-address', + '--network-interface', 'network=gcli-network,no-address', '--scopes', 'https://www.googleapis.com/auth/cloud-platform' ], { stdio: 'inherit' }); @@ -73,11 +73,11 @@ export class GceCosProvider implements WorkerProvider { } async setup(options: SetupOptions): Promise { - const dnsSuffix = options.dnsSuffix || '.internal'; + const dnsSuffix = options.dnsSuffix || '.internal.gcpnode.com'; - // Construct hostname. We attempt direct internal first. - // We've removed 'nic0' by default as it was reported as inconsistent. - const internalHostname = `${this.instanceName}.${this.zone}.c.${this.projectId}${dnsSuffix.startsWith('.') ? dnsSuffix : '.' + dnsSuffix}`; + // Construct hostname. Restoring verified corporate path requirements: + // MUST use 'nic0.' prefix and SHOULD default to '.internal.gcpnode.com' + const internalHostname = `nic0.${this.instanceName}.${this.zone}.c.${this.projectId}${dnsSuffix.startsWith('.') ? dnsSuffix : '.' 
+ dnsSuffix}`; const sshEntry = ` Host ${this.sshAlias} diff --git a/.gemini/skills/offload/scripts/providers/ProviderFactory.ts b/.gemini/skills/offload/scripts/providers/ProviderFactory.ts index 1d93cc5120..5707e24c86 100644 --- a/.gemini/skills/offload/scripts/providers/ProviderFactory.ts +++ b/.gemini/skills/offload/scripts/providers/ProviderFactory.ts @@ -10,7 +10,7 @@ import path from 'path'; import { fileURLToPath } from 'url'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); -const REPO_ROOT = path.resolve(__dirname, '../../../..'); +const REPO_ROOT = path.resolve(__dirname, '../../../../..'); export class ProviderFactory { static getProvider(config: { projectId: string; zone: string; instanceName: string }): WorkerProvider { diff --git a/.gemini/skills/offload/scripts/setup.ts b/.gemini/skills/offload/scripts/setup.ts index e1d19cf40b..29b57afc88 100644 --- a/.gemini/skills/offload/scripts/setup.ts +++ b/.gemini/skills/offload/scripts/setup.ts @@ -47,6 +47,13 @@ export async function runSetup(env: NodeJS.ProcessEnv = process.env) { const zone = await prompt('Compute Zone', 'us-west1-a'); const targetVM = `gcli-offload-${env.USER || 'mattkorwel'}`; + // Early save of discovery info so fleet commands can work + const initialSettingsPath = path.join(REPO_ROOT, '.gemini/offload/settings.json'); + if (!fs.existsSync(path.dirname(initialSettingsPath))) fs.mkdirSync(path.dirname(initialSettingsPath), { recursive: true }); + const initialSettings = fs.existsSync(initialSettingsPath) ? 
JSON.parse(fs.readFileSync(initialSettingsPath, 'utf8')) : {}; + initialSettings.deepReview = { ...initialSettings.deepReview, projectId, zone }; + fs.writeFileSync(initialSettingsPath, JSON.stringify(initialSettings, null, 2)); + const provider = ProviderFactory.getProvider({ projectId, zone, instanceName: targetVM }); console.log(`๐Ÿ” Verifying access and finding worker ${targetVM}...`); @@ -63,7 +70,7 @@ export async function runSetup(env: NodeJS.ProcessEnv = process.env) { // 1. Configure Isolated SSH Alias (Direct Internal Path with IAP Fallback) console.log(`\n๐Ÿš€ Configuring Isolated SSH Alias...`); - const dnsSuffix = await prompt('Internal DNS Suffix', '.internal'); + const dnsSuffix = await prompt('Internal DNS Suffix', '.internal.gcpnode.com'); const setupRes = await provider.setup({ projectId, zone, dnsSuffix }); if (setupRes !== 0) return setupRes;