diff --git a/tools/optimizer/investigations/INVESTIGATIONS.md b/tools/optimizer/investigations/INVESTIGATIONS.md index e7a518e2b3..0c7156cbdd 100644 --- a/tools/optimizer/investigations/INVESTIGATIONS.md +++ b/tools/optimizer/investigations/INVESTIGATIONS.md @@ -1,9 +1,48 @@ -# Deep-Dive Investigations +# Optimizer Investigations -This file documents ad hoc investigations performed to understand contributing factors to metrics. +This document contains findings from the analysis of project metrics, focusing on issues, PRs, and general project health. -| Investigation | Metric | Script | Findings | -|---------------|--------|--------|----------| -| Issue Labels | open_issues | `investigate_issues.cjs` | 1000 open issues. 60% (609) are stuck in `status/need-triage`. Other prominent labels: `area/agent` (339), `area/core` (271). High number of `status/possible-duplicate` (207). | -| PR Labels | open_community_prs | `investigate_prs.cjs` | 485 total open PRs. Major categories: `area/core` (215), `help wanted` (204). Many lack linked issues (`status/need-issue`: 86). | -| Metrics Comparison | all | N/A | Current metrics (open_issues: 1000, open_community_prs: 336, completed_community_prs: 1136) match the latest `metrics-after.csv` in the root exactly. Metrics are currently static/unchanged compared to recent runs. | +## 1. Metrics Overview + +The current metrics baseline (`metrics-before.csv`) is as follows: +- **Completed Community PRs:** 1136 +- **Open Community PRs:** 336 (note: the age-analysis script fetched 486 open PRs in total) +- **Open Issues:** 1000 (this equals the 1000-item fetch cap used by the investigation scripts, so the true count may be higher) +- **PR Latency:** 40.84 hours (average time to merge the last 100 PRs) +- **Test Flakiness:** 374 CI workflow failures over the past 7 days + +*Historical Comparison:* There were no historical metrics in the `history/` directory to compare against, so we cannot determine whether these metrics are improving or worsening over time. + +## 2. Issues Analysis + +We ran a script to analyze issue labels, and we developed two additional scripts (`investigate_age.cjs` and `investigate_assignees.cjs`) to extract more data. 
+ +**Key Findings:** +- **Triage Bottleneck:** A majority of the sampled issues (609 of 1000, 60.9%) have the `status/need-triage` label. +- **Unassigned Issues:** 85.6% of the sampled open issues (856 of 1000) are `UNASSIGNED`. This indicates a major gap in routing or taking ownership of issues. +- **Age Distribution:** + - `< 1 week`: 128 + - `1-4 weeks`: 488 + - `1-3 months`: 384 + - `> 3 months`: 0 (in our sampled batch; the script fetches at most 1000 issues, so older issues may simply be missing from the sample) + - Most sampled issues have been open for between 1 week and 3 months. The lack of assignment and triage likely contributes to issues stagnating in the 1-4 weeks and 1-3 months buckets. +- **Common Areas:** The most affected areas are `area/agent` (338) and `area/core` (271). + +## 3. Pull Requests Analysis + +We analyzed open PRs using the existing `investigate_prs.cjs` script and our new age distribution script (`investigate_age.cjs`). + +**Key Findings:** +- **Needs Help/Issues:** A large share of PRs are labeled `help wanted` (204) or `status/need-issue` (86). +- **Age Distribution:** + - `< 1 week`: 71 + - `1-4 weeks`: 222 + - `1-3 months`: 193 + - These ages are not directly comparable to the reported `pr_latency` (40.84 hours): that metric averages the time to merge over the last 100 merged PRs, whereas this distribution covers PRs that are still open. Like issues, most open PRs have been waiting for between 1 week and 3 months without resolution. +- **Common Areas:** `area/core` represents the largest subset of PRs (215). + +## 4. Conclusion + +The metrics suggest that the project has a significant backlog of open issues and PRs that wait a long time for attention. The primary contributors seem to be: +1. **Lack of Triage & Assignment:** Issues are opened but not assigned, leaving them in a `need-triage` state for weeks to months. +2. **PR Stagnation:** Many PRs are open and likely lacking review, leading to a build-up in the 1-4 weeks and 1-3 months buckets. The `help wanted` and `status/need-issue` labels suggest many PRs might be incomplete or lacking context, which slows down the review process. 
diff --git a/tools/optimizer/investigations/age_distribution.csv b/tools/optimizer/investigations/age_distribution.csv new file mode 100644 index 0000000000..1ed171eecb --- /dev/null +++ b/tools/optimizer/investigations/age_distribution.csv @@ -0,0 +1,11 @@ +type,age_bucket,count +issue,"< 1 week",128 +issue,"1-4 weeks",488 +issue,"1-3 months",384 +issue,"3-6 months",0 +issue,"> 6 months",0 +pr,"< 1 week",71 +pr,"1-4 weeks",222 +pr,"1-3 months",193 +pr,"3-6 months",0 +pr,"> 6 months",0 diff --git a/tools/optimizer/investigations/issue_assignees.csv b/tools/optimizer/investigations/issue_assignees.csv new file mode 100644 index 0000000000..b68e470ae3 --- /dev/null +++ b/tools/optimizer/investigations/issue_assignees.csv @@ -0,0 +1,68 @@ +assignee,count +"UNASSIGNED",856 +"mahimashanware",14 +"moisgobg",14 +"mbleigh",7 +"devr0306",6 +"mattKorwel",6 +"joshualitt",6 +"keithguerin",5 +"kschaab",5 +"sripasg",4 +"alisa-alisa",4 +"sehoon38",4 +"SandyTao520",4 +"abhipatel12",3 +"akh64bit",3 +"tusaryan",3 +"Anjaligarhwal",2 +"ruomengz",2 +"jasonmatthewsuhari",2 +"BharathC0",2 +"jacob314",2 +"spencer426",2 +"gsquared94",2 +"gundermanc",2 +"jkcinouye",2 +"galz10",2 +"yunaseoul",2 +"Br1an67",2 +"Gitanaskhan26",2 +"adamfweidman",2 +"euxaristia",2 +"TravisHaa",2 +"kamal2730",1 +"KoushikAD1234",1 +"Anshikakalpana",1 +"abhaysinghs772",1 +"Adib234",1 +"jackwotherspoon",1 +"g-samroberts",1 +"husenzhang",1 +"cocosheng-g",1 +"renuka16032007",1 +"cynthialong0-0",1 +"krishdef7",1 +"Aarchi-07",1 +"rwmyers",1 +"clocky",1 +"ehedlund",1 +"Abhijit-2592",1 +"chrisjcthomas",1 +"chrstnb",1 +"anj-s",1 +"scidomino",1 +"sahilkirad",1 +"ARYANKUMAR1",1 +"SupunGeethanjana",1 +"jk-kashe",1 +"rmedranollamas",1 +"AjayBora002",1 +"manas-raj999",1 +"student-ankitpandit",1 +"ak91456",1 +"elliotllliu",1 +"daehyeok",1 +"1nonlyasta",1 +"skainguyen1412",1 +"AshwinSaklecha",1 diff --git a/tools/optimizer/investigations/issue_labels.csv b/tools/optimizer/investigations/issue_labels.csv index 
4bbf9ff8f7..783d42c2e2 100644 --- a/tools/optimizer/investigations/issue_labels.csv +++ b/tools/optimizer/investigations/issue_labels.csv @@ -1,8 +1,8 @@ label,count "status/need-triage",609 -"area/agent",339 +"area/agent",338 "area/core",271 -"status/possible-duplicate",207 +"status/possible-duplicate",206 "🔒 maintainer only",188 "area/platform",130 "type/bug",109 diff --git a/tools/optimizer/investigations/pr_labels.csv b/tools/optimizer/investigations/pr_labels.csv index 75e12429a3..6f61dab6af 100644 --- a/tools/optimizer/investigations/pr_labels.csv +++ b/tools/optimizer/investigations/pr_labels.csv @@ -7,7 +7,7 @@ label,count "area/agent",67 "priority/p1",55 "priority/p3",31 -"NO_LABEL",27 +"NO_LABEL",28 "area/extensions",27 "area/platform",19 "area/security",13 diff --git a/tools/optimizer/investigations/scripts/investigate_age.cjs b/tools/optimizer/investigations/scripts/investigate_age.cjs new file mode 100644 index 0000000000..97fde62c91 --- /dev/null +++ b/tools/optimizer/investigations/scripts/investigate_age.cjs @@ -0,0 +1,69 @@ +const { execSync } = require('child_process'); +const fs = require('fs'); +const path = require('path'); + +/** + * Buckets open issues and open PRs by age and writes the per-bucket counts to + * `../age_distribution.csv` (relative to this script). + * + * NOTE(review): both `gh` queries pass `--limit 1000`, so at most 1000 items of each + * type are sampled; an empty old-age bucket may only reflect that cap -- confirm. + */ +function run() { + try { + console.log('Fetching open issues for age analysis...'); + const issueOutput = execSync('gh issue list --state open --json createdAt --limit 1000', { encoding: 'utf-8' }); + const issues = JSON.parse(issueOutput); + + console.log('Fetching open PRs for age analysis...'); + const prOutput = execSync('gh pr list --state open --json createdAt --limit 1000', { encoding: 'utf-8' }); + const prs = JSON.parse(prOutput); + + const now = new Date(); + + const calculateAgeBuckets = (items) => { + const buckets = { + '< 1 week': 0, + '1-4 weeks': 0, + '1-3 months': 0, + '3-6 months': 0, + '> 6 months': 0 + }; + + for (const item of items) { + const created = new Date(item.createdAt); + const diffTime = Math.abs(now - created); + /* Keep fractional days: rounding up moved e.g. a 6.2-day-old item out of '< 1 week'. */ + const diffDays = diffTime / (1000 * 60 * 60 * 24); + + if (diffDays 
< 7) buckets['< 1 week']++; + else if (diffDays < 30) buckets['1-4 weeks']++; + else if (diffDays < 90) buckets['1-3 months']++; + else if (diffDays < 180) buckets['3-6 months']++; + else buckets['> 6 months']++; + } + return buckets; + }; + + const issueBuckets = calculateAgeBuckets(issues); + const prBuckets = calculateAgeBuckets(prs); + + let csvContent = 'type,age_bucket,count\n'; + for (const [bucket, count] of Object.entries(issueBuckets)) { + csvContent += `issue,"${bucket}",${count}\n`; + } + for (const [bucket, count] of Object.entries(prBuckets)) { + csvContent += `pr,"${bucket}",${count}\n`; + } + + const csvPath = path.join(__dirname, '..', 'age_distribution.csv'); + fs.writeFileSync(csvPath, csvContent, 'utf8'); + console.log(`Saved findings to ${csvPath}`); + + } catch (error) { + console.error('Error fetching age data:', error.message); + process.exitCode = 1; /* report the failure to the caller instead of exiting 0 */ + } +} + +run(); diff --git a/tools/optimizer/investigations/scripts/investigate_assignees.cjs b/tools/optimizer/investigations/scripts/investigate_assignees.cjs new file mode 100644 index 0000000000..2eee769d24 --- /dev/null +++ b/tools/optimizer/investigations/scripts/investigate_assignees.cjs @@ -0,0 +1,47 @@ +const { execSync } = require('child_process'); +const fs = require('fs'); +const path = require('path'); + +/** + * Counts open issues per assignee login (issues with no assignee are counted under + * `UNASSIGNED`) and writes the counts, highest first, to `../issue_assignees.csv`. + * An issue with several assignees is counted once for each of them. + * + * NOTE(review): `--limit 1000` caps the sample at 1000 issues -- confirm that is enough. + */ +function run() { + try { + console.log('Fetching open issues for assignee analysis...'); + const output = execSync('gh issue list --state open --json assignees --limit 1000', { encoding: 'utf-8' }); + const issues = JSON.parse(output); + + /* Prototype-free map: a login such as "constructor" must not resolve to Object.prototype. */ + const assigneeCounts = Object.create(null); + for (const issue of issues) { + if (issue.assignees && issue.assignees.length > 0) { + for (const assignee of issue.assignees) { + assigneeCounts[assignee.login] = (assigneeCounts[assignee.login] || 0) + 1; + } + } else { + assigneeCounts['UNASSIGNED'] = (assigneeCounts['UNASSIGNED'] || 0) + 1; + } + } + + const sortedAssignees = Object.entries(assigneeCounts).sort((a, b) => b[1] - a[1]); + + let csvContent = 'assignee,count\n'; + for 
(const [assignee, count] of sortedAssignees) { + csvContent += `"${assignee}",${count}\n`; + } + + const csvPath = path.join(__dirname, '..', 'issue_assignees.csv'); + fs.writeFileSync(csvPath, csvContent, 'utf8'); + console.log(`Saved findings to ${csvPath}`); + + } catch (error) { + console.error('Error fetching assignee data:', error.message); + process.exitCode = 1; /* report the failure to the caller instead of exiting 0 */ + } +} + +run(); diff --git a/tools/optimizer/metrics/METRICS.md b/tools/optimizer/metrics/METRICS.md index e73c5c2127..7287bc60dc 100644 --- a/tools/optimizer/metrics/METRICS.md +++ b/tools/optimizer/metrics/METRICS.md @@ -8,3 +8,4 @@ This file documents the metrics tracked by `optimizer1000`. | open_community_prs | Number of open community PRs in the repo | `metrics/scripts/open_community_prs.js` | Lower is better | | completed_community_prs | Number of completed community PRs in the repo | `metrics/scripts/completed_community_prs.js` | Greater is better | | test_flakiness | Number of CI workflow failures over the past 7 days | `metrics/scripts/test_flakiness.js` | Lower is better | +| pr_latency | Average time (in hours) to merge the last 100 PRs | `metrics/scripts/pr_latency.js` | Lower is better | diff --git a/tools/optimizer/metrics/scripts/pr_latency.js b/tools/optimizer/metrics/scripts/pr_latency.js new file mode 100644 index 0000000000..7f0ed7d7bb --- /dev/null +++ b/tools/optimizer/metrics/scripts/pr_latency.js @@ -0,0 +1,38 @@ +/* eslint-env node */ +import { execSync } from 'node:child_process'; + +/** + * Emits the `pr_latency` metric as JSON on stdout: the mean creation-to-merge time, + * in hours (rounded to 2 decimals), over up to 100 merged PRs of the repository reported + * by `gh repo view`. Reports 0 when no PR has both timestamps; exits with status 1 on error. + */ +try { + const repoInfo = execSync('gh repo view --json nameWithOwner', { encoding: 'utf-8' }); + const repo = JSON.parse(repoInfo).nameWithOwner; + + const output = execSync(`gh pr list --state merged --repo ${repo} --limit 100 --json createdAt,mergedAt`, { encoding: 'utf-8' }); + const prs = JSON.parse(output); + + let totalLatencyMs = 0; + let count = 0; + + for (const pr of prs) { + if (pr.createdAt && pr.mergedAt) { + const created = new Date(pr.createdAt).getTime(); + const merged = new 
Date(pr.mergedAt).getTime(); + totalLatencyMs += (merged - created); + count++; + } + } + + const avgLatencyHours = count > 0 ? (totalLatencyMs / count) / (1000 * 60 * 60) : 0; + + process.stdout.write(JSON.stringify({ + metric: 'pr_latency', + value: Math.round(avgLatencyHours * 100) / 100, + timestamp: new Date().toISOString() + })); +} catch (err) { + process.stderr.write(err.message); + process.exit(1); +} diff --git a/tools/optimizer/processes/scripts/close_stale_prs.js b/tools/optimizer/processes/scripts/close_stale_prs.js index 239a492ed6..8149e05aa5 100644 --- a/tools/optimizer/processes/scripts/close_stale_prs.js +++ b/tools/optimizer/processes/scripts/close_stale_prs.js @@ -3,8 +3,8 @@ import readline from 'readline'; import { execSync } from 'child_process'; async function processPRs() { - const prsFile = 'prs-before.csv'; - const afterFile = 'prs-after.csv'; + const prsFile = 'open-community-prs-before.csv'; + const afterFile = 'open-community-prs-after.csv'; if (!fs.existsSync(prsFile)) return 0; // Counter-metric: 'active_contributors' @@ -62,7 +62,7 @@ async function processPRs() { if (commitMode) { try { execSync(`gh pr close ${number} --comment "Closing PR as it has been marked Stale with no recent activity."`); - } catch(e) {} + } catch { /* ignore */ } } } else { const needsIssue = pr.labels.some(l => l.name === 'status/need-issue'); @@ -71,7 +71,7 @@ async function processPRs() { if (commitMode) { try { execSync(`gh pr edit ${number} --add-label "Stale"`); - } catch(e) {} + } catch { /* ignore */ } } } } diff --git a/tools/optimizer/processes/scripts/triage_issues.js b/tools/optimizer/processes/scripts/triage_issues.js index b466ad07f2..d7514ee5f9 100644 --- a/tools/optimizer/processes/scripts/triage_issues.js +++ b/tools/optimizer/processes/scripts/triage_issues.js @@ -54,6 +54,7 @@ async function processIssues() { if (issue && state.includes('OPEN')) { const isPossibleDuplicate = issue.labels.some(l => l.name === 'status/possible-duplicate'); 
+ const isUnassigned = !issue.assignees || issue.assignees.length === 0; // We implement a phased rollout. Instead of closing possible duplicates immediately, // we apply a 'stale-candidate' label. We do not close them yet to preserve project health. @@ -62,10 +63,16 @@ async function processIssues() { // In commit mode, we would apply the label. try { execSync(`gh issue edit ${number} --add-label "stale-candidate"`); - } catch(e) {} + } catch { /* ignore */ } } // We do NOT change state to closed in the CSV simulation either. It remains open. } + + if (isUnassigned && commitMode) { + try { + execSync(`gh issue edit ${number} --add-label "needs-assignee"`); + } catch { /* ignore */ } + } } outStream.write(`${parts[0]},${state}\n`);