From aeb41b42c2b4d66bef17c711fcdcd61874eb6dc6 Mon Sep 17 00:00:00 2001 From: Matthew Jackson Date: Fri, 6 Mar 2026 14:39:18 -0800 Subject: [PATCH] Optimize form_filler: batch DOM reads into single evaluate() call Instead of 60+ sequential CDP round-trips per step (isVisible, getLabel, inputValue, isRequired for each element), snapshot all form state in one evaluate() call, do answer matching locally, then only make CDP calls to fill/click elements that need action. Co-Authored-By: Claude Opus 4.6 --- lib/form_filler.mjs | 649 ++++++++++++++++++++++++++++---------------- 1 file changed, 410 insertions(+), 239 deletions(-) diff --git a/lib/form_filler.mjs b/lib/form_filler.mjs index 9a54191..ded8213 100644 --- a/lib/form_filler.mjs +++ b/lib/form_filler.mjs @@ -2,6 +2,9 @@ * form_filler.mjs — Generic form filling * Config-driven: answers loaded from answers.json * Returns list of unknown required fields + * + * Performance: uses a single evaluate() to snapshot all form state from the DOM, + * does answer matching locally in Node, then only makes CDP calls to fill/click. */ import { writeFileSync, renameSync } from 'fs'; import { @@ -13,8 +16,8 @@ import { /** * Normalize answers from either format: - * Object: { "question": "answer" } → [{ pattern: "question", answer: "answer" }] - * Array: [{ pattern, answer }] → as-is + * Object: { "question": "answer" } -> [{ pattern: "question", answer: "answer" }] + * Array: [{ pattern, answer }] -> as-is */ function normalizeAnswers(answers) { if (!answers) return []; @@ -25,23 +28,94 @@ function normalizeAnswers(answers) { return []; } +/** + * Extract label text from a DOM node. Runs inside evaluate(). + * Checks: label[for], aria-label, aria-labelledby, ancestor label, placeholder, name. + */ +function extractLabel(node) { + const id = node.id; + const forLabel = id ? document.querySelector(`label[for="${id}"]`)?.textContent?.trim() : ''; + const ariaLabel = node.getAttribute('aria-label') || ''; + const ariaLabelledBy = node.getAttribute('aria-labelledby'); + const linked = ariaLabelledBy ? document.getElementById(ariaLabelledBy)?.textContent?.trim() : ''; + + let ancestorLabel = ''; + if (!forLabel && !ariaLabel && !linked) { + let parent = node.parentElement; + for (let i = 0; i < 5 && parent; i++) { + const lbl = parent.querySelector('label'); + if (lbl) { + ancestorLabel = lbl.textContent?.trim() || ''; + break; + } + parent = parent.parentElement; + } + } + + let raw = forLabel || ariaLabel || linked || ancestorLabel || node.placeholder || node.name || ''; + raw = raw.replace(/\s+/g, ' ').replace(/\s*\*\s*$/, '').replace(/\s*Required\s*$/i, '').trim(); + // Deduplicate repeated label text (LinkedIn renders label text twice) + if (raw.length > 8) { + for (let len = Math.ceil(raw.length / 2); len >= 4; len--) { + const candidate = raw.slice(0, len); + if (raw.startsWith(candidate + candidate)) { + raw = candidate.trim(); + break; + } + } + } + return raw; +} + +/** + * Check if a node is required. Runs inside evaluate(). + */ +function checkRequired(node) { + if (node.required || node.getAttribute('required') !== null) return true; + if (node.getAttribute('aria-required') === 'true') return true; + const id = node.id; + if (id) { + const label = document.querySelector(`label[for="${id}"]`); + if (label && label.textContent.includes('*')) return true; + } + let parent = node.parentElement; + for (let i = 0; i < 5 && parent; i++) { + const lbl = parent.querySelector('label'); + if (lbl && lbl.textContent.includes('*')) return true; + const reqSpan = parent.querySelector('[class*="required"], .artdeco-text-input--required'); + if (reqSpan) return true; + parent = parent.parentElement; + } + return false; +} + +/** + * Normalize a fieldset legend, same logic as extractLabel dedup. + */ +function normalizeLegend(el) { + let raw = (el.textContent || '').replace(/\s+/g, ' ').replace(/\s*\*\s*$/, '').replace(/\s*Required\s*$/i, '').trim(); + if (raw.length > 8) { + for (let len = Math.ceil(raw.length / 2); len >= 4; len--) { + const candidate = raw.slice(0, len); + if (raw.startsWith(candidate + candidate)) { raw = candidate.trim(); break; } + } + } + return raw; +} + export class FormFiller { constructor(profile, answers, opts = {}) { this.profile = profile; this.answers = normalizeAnswers(answers); // [{ pattern, answer }] this.apiKey = opts.apiKey || null; - this.answersPath = opts.answersPath || null; // path to answers.json for saving - this.jobContext = opts.jobContext || {}; // { title, company } + this.answersPath = opts.answersPath || null; + this.jobContext = opts.jobContext || {}; } - /** - * Save a new answer to answers.json and in-memory cache. - * Skips if pattern already exists. - */ saveAnswer(pattern, answer) { if (!pattern || !answer) return; const existing = this.answers.findIndex(a => a.pattern === pattern); - if (existing >= 0) return; // already saved + if (existing >= 0) return; this.answers.push({ pattern, answer }); if (this.answersPath) { try { @@ -52,12 +126,11 @@ export class FormFiller { } } - // Find answer for a label — checks custom answers first, then built-ins answerFor(label) { if (!label) return null; const l = label.toLowerCase(); - // Check custom answers first (user-defined, pattern is substring or regex) + // Check custom answers first for (const entry of this.answers) { try { if (entry.pattern.length > FORM_PATTERN_MAX_LENGTH) throw new Error('pattern too long'); @@ -68,7 +141,6 @@ export class FormFiller { } } - // Built-in answers const p = this.profile; // Contact @@ -148,85 +220,19 @@ export class FormFiller { l.includes('best way to apply') || l.includes('hidden code') || l.includes('passcode'); } + // Keep these for external callers (test scripts etc) async getLabel(el) { - return await el.evaluate(node => { - const id = node.id; - const forLabel = id ? document.querySelector(`label[for="${id}"]`)?.textContent?.trim() : ''; - const ariaLabel = node.getAttribute('aria-label') || ''; - const ariaLabelledBy = node.getAttribute('aria-labelledby'); - const linked = ariaLabelledBy ? document.getElementById(ariaLabelledBy)?.textContent?.trim() : ''; - - // LinkedIn doesn't use label[for] — labels are ancestor elements. - // Walk up the DOM to find the nearest label in a parent container. - let ancestorLabel = ''; - if (!forLabel && !ariaLabel && !linked) { - let parent = node.parentElement; - for (let i = 0; i < 5 && parent; i++) { - const lbl = parent.querySelector('label'); - if (lbl) { - ancestorLabel = lbl.textContent?.trim() || ''; - break; - } - parent = parent.parentElement; - } - } - - // Clean up label text - let raw = forLabel || ariaLabel || linked || ancestorLabel || node.placeholder || node.name || ''; - // Normalize whitespace, strip trailing *, strip "Required" suffix - raw = raw.replace(/\s+/g, ' ').replace(/\s*\*\s*$/, '').replace(/\s*Required\s*$/i, '').trim(); - // Deduplicate repeated label text (LinkedIn renders label text twice) - // e.g. "First sales hire?First sales hire?" → "First sales hire?" - if (raw.length > 8) { - for (let len = Math.ceil(raw.length / 2); len >= 4; len--) { - const candidate = raw.slice(0, len); - if (raw.startsWith(candidate + candidate)) { - raw = candidate.trim(); - break; - } - } - } - return raw; - }).catch(() => ''); + return await el.evaluate(extractLabel).catch(() => ''); } - /** - * Check if a form element is required. - * LinkedIn uses multiple patterns: required attribute, aria-required, or * in label. - */ async isRequired(el) { - return await el.evaluate(node => { - if (node.required || node.getAttribute('required') !== null) return true; - if (node.getAttribute('aria-required') === 'true') return true; - // Check if any associated label contains * — try label[for], then ancestor labels - const id = node.id; - if (id) { - const label = document.querySelector(`label[for="${id}"]`); - if (label && label.textContent.includes('*')) return true; - } - // Walk up ancestors to find a label with * - let parent = node.parentElement; - for (let i = 0; i < 5 && parent; i++) { - const lbl = parent.querySelector('label'); - if (lbl && lbl.textContent.includes('*')) return true; - // Also check for "Required" text in parent - const reqSpan = parent.querySelector('[class*="required"], .artdeco-text-input--required'); - if (reqSpan) return true; - parent = parent.parentElement; - } - return false; - }).catch(() => false); + return await el.evaluate(checkRequired).catch(() => false); } - /** - * Ask AI to answer an unknown question. Passes all saved answers so AI can - * recognize variations of previously answered questions. - * Returns the answer string, or null if AI can't help. - */ async aiAnswerFor(label, opts = {}) { if (!this.apiKey) return null; - const savedAnswers = this.answers.map(a => `Q: "${a.pattern}" → A: "${a.answer}"`).join('\n'); + const savedAnswers = this.answers.map(a => `Q: "${a.pattern}" -> A: "${a.answer}"`).join('\n'); const optionsHint = opts.options?.length ? `\nAvailable options: ${opts.options.join(', ')}` : ''; const systemPrompt = `You are helping a job candidate fill out application forms. You have access to their profile and previously answered questions. @@ -269,23 +275,17 @@ Answer:`; if (!res.ok) return null; const data = await res.json(); const answer = data.content?.[0]?.text?.trim() || null; - if (answer) console.log(` [AI] "${label}" → "${answer}"`); + if (answer) console.log(` [AI] "${label}" -> "${answer}"`); return answer; } catch { return null; } } - /** - * Select an option from a within the fieldset + // Last resort: select within fieldset if (!clicked || !(await fs.$('input:checked'))) { - const sel = await fs.$('select'); - if (sel) await this.selectOptionFuzzy(sel, answer); + if (field.hasSelect) { + const sel = await fs.$('select'); + if (sel) await this.selectOptionFuzzy(sel, answer); + } } } } else { - unknown.push(leg); + unknown.push(field.legend); } } - // Selects - for (const sel of await container.$$('select')) { - if (!await sel.isVisible().catch(() => false)) continue; - const lbl = await this.getLabel(sel); - const existing = await sel.inputValue().catch(() => ''); - // "Select an option" is LinkedIn's placeholder — treat as unfilled + // --- Selects (standalone) --- + const selEls = snap.selects.length > 0 ? await container.$$('select') : []; + for (const field of snap.selects) { + if (field.inFieldset) continue; // handled above + const sel = selEls[field.index]; + if (!sel) continue; + + const existing = field.selectedText || field.value || ''; if (existing && !/^select an? /i.test(existing)) continue; - // Get available options for validation - const availOpts = await sel.$$eval('option', els => - els.map(el => el.textContent?.trim()).filter(t => t && !/^select/i.test(t)) - ).catch(() => []); - let answer = this.answerFor(lbl); - // If built-in answer doesn't match any option (e.g. got "7" but options are Yes/No), discard it - if (answer && availOpts.length > 0) { + + let answer = this.answerFor(field.label); + // Validate answer against available options + if (answer && field.options.length > 0) { const ansLower = answer.toLowerCase(); - const matches = availOpts.some(o => o.toLowerCase() === ansLower || o.toLowerCase().includes(ansLower) || ansLower.includes(o.toLowerCase())); + const matches = field.options.some(o => + o.toLowerCase() === ansLower || o.toLowerCase().includes(ansLower) || ansLower.includes(o.toLowerCase()) + ); if (!matches) answer = null; } if (!answer) { - // EEO/voluntary fields — default to "Prefer not to disclose" - const ll = lbl.toLowerCase(); + // EEO/voluntary fields + const ll = field.label.toLowerCase(); if (ll.includes('race') || ll.includes('ethnicity') || ll.includes('gender') || ll.includes('veteran') || ll.includes('disability') || ll.includes('identification')) { - const opts = await sel.$$('option'); - for (const opt of opts) { - const text = await opt.textContent().catch(() => ''); - if (/prefer not|decline|do not wish|i don/i.test(text || '')) { - await sel.selectOption({ label: text.trim() }).catch(() => {}); - break; - } + // Find "prefer not to disclose" option from snapshot + const declineOpt = field.options.find(t => /prefer not|decline|do not wish|i don/i.test(t)); + if (declineOpt) { + await sel.selectOption({ label: declineOpt }).catch(() => {}); } continue; } // AI fallback for required selects - if (await this.isRequired(sel)) { - const opts = await sel.$$('option'); - const options = []; - for (const opt of opts) { - const text = (await opt.textContent().catch(() => '') || '').trim(); - if (text && !/^select/i.test(text)) options.push(text); - } - answer = await this.aiAnswerFor(lbl, { options }); + if (field.required) { + answer = await this.aiAnswerFor(field.label, { options: field.options }); if (answer) { - this.saveAnswer(lbl, answer); + this.saveAnswer(field.label, answer); } else { - unknown.push({ label: lbl, type: 'select', options }); + unknown.push({ label: field.label, type: 'select', options: field.options }); continue; } } @@ -550,14 +721,14 @@ Answer:`; } } - // Checkboxes — "mark as top choice" and similar opt-ins - for (const cb of await container.$$('input[type="checkbox"]')) { - if (!await cb.isVisible().catch(() => false)) continue; - if (await cb.isChecked().catch(() => false)) continue; - const lbl = await this.getLabel(cb); - const ll = lbl.toLowerCase(); + // --- Checkboxes (standalone) --- + const cbEls = snap.checkboxes.length > 0 ? await container.$$('input[type="checkbox"]') : []; + for (const field of snap.checkboxes) { + if (field.checked) continue; + const ll = field.label.toLowerCase(); if (ll.includes('top choice') || ll.includes('interested') || ll.includes('confirm') || ll.includes('agree') || ll.includes('consent')) { - await cb.check().catch(() => {}); + const el = cbEls[field.index]; + if (el) await el.check().catch(() => {}); } }