Make S3 the primary storage layer (not backup)

storage.mjs is now a single interface: loadJSON() and saveJSON()
route to either local disk or S3 based on settings.storage.type.
The app never touches disk/S3 directly.

- All queue/log functions are now async (saveQueue, appendLog, etc.)
- All callers updated with await
- Data validation prevents saving corrupt types (strings, nulls)
- S3 versioned bucket preserves every write
- Config: storage.type = "local" (disk) or "s3" (S3 primary)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 22:03:16 -08:00
parent 253d1888e9
commit 534d318953
6 changed files with 178 additions and 271 deletions

View File

@@ -19,7 +19,7 @@ const origStderrWrite = process.stderr.write.bind(process.stderr);
process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); }; process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); };
process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); }; process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); };
import { getJobsByStatus, updateJobStatus, appendLog, loadConfig, isAlreadyApplied, initQueueFromS3 } from './lib/queue.mjs'; import { getJobsByStatus, updateJobStatus, appendLog, loadConfig, isAlreadyApplied, initQueue } from './lib/queue.mjs';
import { acquireLock } from './lib/lock.mjs'; import { acquireLock } from './lib/lock.mjs';
import { createBrowser } from './lib/browser.mjs'; import { createBrowser } from './lib/browser.mjs';
import { ensureAuth } from './lib/session.mjs'; import { ensureAuth } from './lib/session.mjs';
@@ -43,7 +43,7 @@ async function main() {
const lock = acquireLock('applier', resolve(__dir, 'data')); const lock = acquireLock('applier', resolve(__dir, 'data'));
const settings = loadConfig(resolve(__dir, 'config/settings.json')); const settings = loadConfig(resolve(__dir, 'config/settings.json'));
await initQueueFromS3(settings); await initQueue(settings);
const profile = loadConfig(resolve(__dir, 'config/profile.json')); const profile = loadConfig(resolve(__dir, 'config/profile.json'));
const answersPath = resolve(__dir, 'config/answers.json'); const answersPath = resolve(__dir, 'config/answers.json');
const answers = existsSync(answersPath) ? loadConfig(answersPath) : []; const answers = existsSync(answersPath) ? loadConfig(answersPath) : [];
@@ -180,7 +180,7 @@ async function main() {
console.error(`${platform} auth failed: ${authResult.reason}`); console.error(`${platform} auth failed: ${authResult.reason}`);
await sendTelegram(settings, `⚠️ *${platform}* auth failed — ${authResult.reason}`).catch(() => {}); await sendTelegram(settings, `⚠️ *${platform}* auth failed — ${authResult.reason}`).catch(() => {});
for (const job of platformJobs) { for (const job of platformJobs) {
updateJobStatus(job.id, 'new', { retry_reason: 'auth_failed' }); await updateJobStatus(job.id, 'new', { retry_reason: 'auth_failed' });
} }
continue; continue;
} }
@@ -190,9 +190,9 @@ async function main() {
} }
for (const job of platformJobs) { for (const job of platformJobs) {
if (isAlreadyApplied(job.id)) { if (await isAlreadyApplied(job.id)) {
console.log(` ⏭️ Already applied — ${job.title} @ ${job.company || '?'}`); console.log(` ⏭️ Already applied — ${job.title} @ ${job.company || '?'}`);
updateJobStatus(job.id, 'already_applied', {}); await updateJobStatus(job.id, 'already_applied', {});
results.already_applied++; results.already_applied++;
continue; continue;
} }
@@ -261,10 +261,10 @@ async function main() {
const retries = (job.retry_count || 0) + 1; const retries = (job.retry_count || 0) + 1;
if (retries <= maxRetries) { if (retries <= maxRetries) {
updateJobStatus(job.id, 'new', { retry_count: retries }); await updateJobStatus(job.id, 'new', { retry_count: retries });
} else { } else {
updateJobStatus(job.id, 'failed', { error: e.message }); await updateJobStatus(job.id, 'failed', { error: e.message });
appendLog({ ...job, status: 'failed', error: e.message }); await appendLog({ ...job, status: 'failed', error: e.message });
results.failed++; results.failed++;
} }
} }
@@ -311,8 +311,8 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
switch (status) { switch (status) {
case 'submitted': case 'submitted':
console.log(` ✅ Applied!${applyDuration ? ` (${applyDuration}s)` : ''}`); console.log(` ✅ Applied!${applyDuration ? ` (${applyDuration}s)` : ''}`);
updateJobStatus(job.id, 'applied', { title, company, applied_at: Date.now(), apply_started_at: applyStartedAt }); await updateJobStatus(job.id, 'applied', { title, company, applied_at: Date.now(), apply_started_at: applyStartedAt });
appendLog({ ...job, title, company, status: 'applied', applied_at: Date.now(), apply_started_at: applyStartedAt }); await appendLog({ ...job, title, company, status: 'applied', applied_at: Date.now(), apply_started_at: applyStartedAt });
results.submitted++; results.submitted++;
break; break;
@@ -339,12 +339,12 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
const telegramMsgId = await sendTelegram(settings, msg); const telegramMsgId = await sendTelegram(settings, msg);
updateJobStatus(job.id, 'needs_answer', { await updateJobStatus(job.id, 'needs_answer', {
title, company, pending_question, title, company, pending_question,
ai_suggested_answer: aiAnswer || null, ai_suggested_answer: aiAnswer || null,
telegram_message_id: telegramMsgId, telegram_message_id: telegramMsgId,
}); });
appendLog({ ...job, title, company, status: 'needs_answer', pending_question, ai_suggested_answer: aiAnswer }); await appendLog({ ...job, title, company, status: 'needs_answer', pending_question, ai_suggested_answer: aiAnswer });
results.needs_answer++; results.needs_answer++;
console.log(` ⏸️ Question sent to Telegram. Job will retry after you reply.`); console.log(` ⏸️ Question sent to Telegram. Job will retry after you reply.`);
break; break;
@@ -352,16 +352,16 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
case 'skipped_recruiter_only': case 'skipped_recruiter_only':
console.log(` 🚫 Recruiter-only`); console.log(` 🚫 Recruiter-only`);
updateJobStatus(job.id, 'skipped_recruiter_only', { title, company }); await updateJobStatus(job.id, 'skipped_recruiter_only', { title, company });
appendLog({ ...job, title, company, status: 'skipped_recruiter_only' }); await appendLog({ ...job, title, company, status: 'skipped_recruiter_only' });
results.skipped_recruiter++; results.skipped_recruiter++;
break; break;
case 'skipped_external_unsupported': { case 'skipped_external_unsupported': {
const platform = ats_platform || job.apply_type || 'unknown'; const platform = ats_platform || job.apply_type || 'unknown';
console.log(` ⏭️ External ATS: ${platform}`); console.log(` ⏭️ External ATS: ${platform}`);
updateJobStatus(job.id, 'skipped_external_unsupported', { title, company, ats_url: externalUrl, ats_platform: platform }); await updateJobStatus(job.id, 'skipped_external_unsupported', { title, company, ats_url: externalUrl, ats_platform: platform });
appendLog({ ...job, title, company, status: 'skipped_external_unsupported', ats_url: externalUrl, ats_platform: platform }); await appendLog({ ...job, title, company, status: 'skipped_external_unsupported', ats_url: externalUrl, ats_platform: platform });
results.skipped_external++; results.skipped_external++;
results.atsCounts[platform] = (results.atsCounts[platform] || 0) + 1; results.atsCounts[platform] = (results.atsCounts[platform] || 0) + 1;
break; break;
@@ -369,14 +369,14 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
case 'rate_limited': case 'rate_limited':
console.log(` ⚠️ LinkedIn Easy Apply daily limit reached — stopping run`); console.log(` ⚠️ LinkedIn Easy Apply daily limit reached — stopping run`);
updateJobStatus(job.id, 'new', { title, company, retry_reason: 'rate_limited' }); await updateJobStatus(job.id, 'new', { title, company, retry_reason: 'rate_limited' });
results.rate_limited = true; results.rate_limited = true;
break; break;
case 'closed': case 'closed':
console.log(` 🚫 Closed — no longer accepting applications`); console.log(` 🚫 Closed — no longer accepting applications`);
updateJobStatus(job.id, 'closed', { title, company }); await updateJobStatus(job.id, 'closed', { title, company });
appendLog({ ...job, title, company, status: 'closed' }); await appendLog({ ...job, title, company, status: 'closed' });
results.closed = (results.closed || 0) + 1; results.closed = (results.closed || 0) + 1;
break; break;
@@ -384,8 +384,8 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
case 'skipped_no_apply': case 'skipped_no_apply':
case 'skipped_easy_apply_unsupported': case 'skipped_easy_apply_unsupported':
console.log(` ⏭️ Skipped — ${status}`); console.log(` ⏭️ Skipped — ${status}`);
updateJobStatus(job.id, status, { title, company }); await updateJobStatus(job.id, status, { title, company });
appendLog({ ...job, title, company, status }); await appendLog({ ...job, title, company, status });
results.skipped_no_apply++; results.skipped_no_apply++;
break; break;
@@ -393,8 +393,8 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
case 'skipped_login_required': case 'skipped_login_required':
case 'skipped_captcha': case 'skipped_captcha':
console.log(` ⏭️ Skipped — ${status.replace('skipped_', '')}`); console.log(` ⏭️ Skipped — ${status.replace('skipped_', '')}`);
updateJobStatus(job.id, status, { title, company }); await updateJobStatus(job.id, status, { title, company });
appendLog({ ...job, title, company, status }); await appendLog({ ...job, title, company, status });
results.skipped_other++; results.skipped_other++;
break; break;
@@ -407,11 +407,11 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
const maxRetry = settings.max_retries ?? DEFAULT_MAX_RETRIES; const maxRetry = settings.max_retries ?? DEFAULT_MAX_RETRIES;
if (retries <= maxRetry) { if (retries <= maxRetry) {
console.log(` ⏭️ ${status} — will retry (attempt ${retries}/${maxRetry})`); console.log(` ⏭️ ${status} — will retry (attempt ${retries}/${maxRetry})`);
updateJobStatus(job.id, 'new', { title, company, retry_count: retries }); await updateJobStatus(job.id, 'new', { title, company, retry_count: retries });
} else { } else {
console.log(` ⏭️ ${status} — max retries reached`); console.log(` ⏭️ ${status} — max retries reached`);
updateJobStatus(job.id, status, { title, company }); await updateJobStatus(job.id, status, { title, company });
appendLog({ ...job, title, company, status }); await appendLog({ ...job, title, company, status });
} }
results.skipped_other++; results.skipped_other++;
break; break;
@@ -419,8 +419,8 @@ async function handleResult(job, result, results, settings, profile, apiKey) {
default: default:
console.warn(` ⚠️ Unhandled status: ${status}`); console.warn(` ⚠️ Unhandled status: ${status}`);
updateJobStatus(job.id, status, { title, company }); await updateJobStatus(job.id, status, { title, company });
appendLog({ ...job, title, company, status }); await appendLog({ ...job, title, company, status });
results.skipped_other++; results.skipped_other++;
} }
} }

View File

@@ -30,7 +30,7 @@ const origStderrWrite = process.stderr.write.bind(process.stderr);
process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); }; process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); };
process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); }; process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); };
import { getJobsByStatus, updateJobStatus, loadConfig, loadQueue, saveQueue, dedupeAfterFilter, initQueueFromS3 } from './lib/queue.mjs'; import { getJobsByStatus, updateJobStatus, loadConfig, loadQueue, saveQueue, dedupeAfterFilter, initQueue } from './lib/queue.mjs';
import { loadProfile, submitBatches, checkBatch, downloadResults } from './lib/filter.mjs'; import { loadProfile, submitBatches, checkBatch, downloadResults } from './lib/filter.mjs';
import { sendTelegram, formatFilterSummary } from './lib/notify.mjs'; import { sendTelegram, formatFilterSummary } from './lib/notify.mjs';
import { DEFAULT_FILTER_MODEL, DEFAULT_FILTER_MIN_SCORE } from './lib/constants.mjs'; import { DEFAULT_FILTER_MODEL, DEFAULT_FILTER_MIN_SCORE } from './lib/constants.mjs';
@@ -163,10 +163,10 @@ async function collect(state, settings) {
else { filtered++; job.status = 'filtered'; } else { filtered++; job.status = 'filtered'; }
} }
saveQueue(queue); await saveQueue(queue);
// Dedup cross-track copies — keep highest-scoring version of each job // Dedup cross-track copies — keep highest-scoring version of each job
const duped = dedupeAfterFilter(); const duped = await dedupeAfterFilter();
if (duped > 0) console.log(` Deduped ${duped} cross-track copies`); if (duped > 0) console.log(` Deduped ${duped} cross-track copies`);
clearState(); clearState();
@@ -221,7 +221,7 @@ async function submit(settings, searchConfig, candidateProfile) {
} }
} }
if (cleared > 0) { if (cleared > 0) {
saveQueue(queue); await saveQueue(queue);
console.log(`🔄 Cleared ${cleared} stale batch markers (batch completed without scoring)`); console.log(`🔄 Cleared ${cleared} stale batch markers (batch completed without scoring)`);
} }
} }
@@ -279,7 +279,7 @@ async function submit(settings, searchConfig, candidateProfile) {
job.filter_submitted_at = submittedAt; job.filter_submitted_at = submittedAt;
} }
} }
saveQueue(allJobs); await saveQueue(allJobs);
const batchSummary = submitted.map(b => `${b.track}: ${b.jobCount} jobs`).join(', '); const batchSummary = submitted.map(b => `${b.track}: ${b.jobCount} jobs`).join(', ');
console.log(` ${batchSummary}`); console.log(` ${batchSummary}`);
@@ -305,7 +305,7 @@ async function main() {
} }
const settings = loadConfig(resolve(__dir, 'config/settings.json')); const settings = loadConfig(resolve(__dir, 'config/settings.json'));
await initQueueFromS3(settings); await initQueue(settings);
const searchConfig = loadConfig(resolve(__dir, 'config/search_config.json')); const searchConfig = loadConfig(resolve(__dir, 'config/search_config.json'));
const candidateProfile = loadConfig(resolve(__dir, 'config/profile.json')); const candidateProfile = loadConfig(resolve(__dir, 'config/profile.json'));

View File

@@ -20,7 +20,7 @@ const origStderrWrite = process.stderr.write.bind(process.stderr);
process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); }; process.stdout.write = (chunk, ...args) => { logStream.write(chunk); return origStdoutWrite(chunk, ...args); };
process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); }; process.stderr.write = (chunk, ...args) => { logStream.write(chunk); return origStderrWrite(chunk, ...args); };
import { addJobs, loadQueue, loadConfig, getJobsByStatus, updateJobStatus, initQueueFromS3 } from './lib/queue.mjs'; import { addJobs, loadQueue, loadConfig, getJobsByStatus, updateJobStatus, initQueue } from './lib/queue.mjs';
import { writeFileSync, readFileSync, existsSync } from 'fs'; import { writeFileSync, readFileSync, existsSync } from 'fs';
import { acquireLock } from './lib/lock.mjs'; import { acquireLock } from './lib/lock.mjs';
import { createBrowser } from './lib/browser.mjs'; import { createBrowser } from './lib/browser.mjs';
@@ -60,7 +60,7 @@ async function main() {
const startedAt = Date.now(); const startedAt = Date.now();
const settings = loadConfig(resolve(__dir, 'config/settings.json')); const settings = loadConfig(resolve(__dir, 'config/settings.json'));
await initQueueFromS3(settings); await initQueue(settings);
const writeLastRun = (finished = false) => { const writeLastRun = (finished = false) => {
const entry = { const entry = {
@@ -184,8 +184,8 @@ async function main() {
const effectiveSearch = { ...search, keywords: search.keywords.slice(keywordStart), keywordOffset: keywordStart, filters: { ...search.filters, posted_within_days: lookbackDays } }; const effectiveSearch = { ...search, keywords: search.keywords.slice(keywordStart), keywordOffset: keywordStart, filters: { ...search.filters, posted_within_days: lookbackDays } };
let queryFound = 0, queryAdded = 0; let queryFound = 0, queryAdded = 0;
await searchLinkedIn(liBrowser.page, effectiveSearch, { await searchLinkedIn(liBrowser.page, effectiveSearch, {
onPage: (pageJobs) => { onPage: async (pageJobs) => {
const added = addJobs(pageJobs); const added = await addJobs(pageJobs);
totalAdded += added; totalAdded += added;
totalSeen += pageJobs.length; totalSeen += pageJobs.length;
queryFound += pageJobs.length; queryFound += pageJobs.length;
@@ -209,7 +209,7 @@ async function main() {
if (unclassified.length > 0) { if (unclassified.length > 0) {
console.log(`\n🔍 Classifying ${unclassified.length} external jobs...`); console.log(`\n🔍 Classifying ${unclassified.length} external jobs...`);
const { classified, remaining } = await classifyExternalJobs(liBrowser.page, unclassified, (job, applyType, applyUrl) => { const { classified, remaining } = await classifyExternalJobs(liBrowser.page, unclassified, (job, applyType, applyUrl) => {
updateJobStatus(job.id, 'new', { apply_type: applyType, apply_url: applyUrl }); await updateJobStatus(job.id, 'new', { apply_type: applyType, apply_url: applyUrl });
}); });
console.log(` ✅ Classified ${classified}, ${remaining} still unknown`); console.log(` ✅ Classified ${classified}, ${remaining} still unknown`);
} }
@@ -241,8 +241,8 @@ async function main() {
const effectiveSearch = { ...search, filters: { ...search.filters, posted_within_days: lookbackDays } }; const effectiveSearch = { ...search, filters: { ...search.filters, posted_within_days: lookbackDays } };
let queryFound = 0, queryAdded = 0; let queryFound = 0, queryAdded = 0;
await searchWellfound(wfBrowser.page, effectiveSearch, { await searchWellfound(wfBrowser.page, effectiveSearch, {
onPage: (pageJobs) => { onPage: async (pageJobs) => {
const added = addJobs(pageJobs); const added = await addJobs(pageJobs);
totalAdded += added; totalAdded += added;
totalSeen += pageJobs.length; totalSeen += pageJobs.length;
queryFound += pageJobs.length; queryFound += pageJobs.length;

View File

@@ -1,13 +1,18 @@
/** /**
* queue.mjs — Job queue management * queue.mjs — Job queue management
* Handles jobs_queue.json read/write/update * Handles jobs_queue.json read/write/update
* Uses in-memory cache to avoid redundant disk I/O within a run. *
* S3-backed: every write syncs to versioned S3 bucket (never lose data). * Storage is pluggable via settings.storage:
* { type: "local" } — reads/writes to local disk (default)
* { type: "s3", bucket: "...", region: "..." } — S3 is primary store
*
* In-memory cache avoids redundant I/O within a run.
* Call initStorage() once at startup before any queue operations.
*/ */
import { readFileSync, writeFileSync, appendFileSync, renameSync, unlinkSync, existsSync, mkdirSync } from 'fs'; import { readFileSync, writeFileSync, appendFileSync, renameSync, unlinkSync, existsSync, mkdirSync } from 'fs';
import { dirname, resolve } from 'path'; import { dirname, resolve } from 'path';
import { fileURLToPath } from 'url'; import { fileURLToPath } from 'url';
import { initStorage, backupToS3, loadJSONSafe } from './storage.mjs'; import { initStorage as _initStorage, loadJSON, saveJSON, storageType } from './storage.mjs';
const __dir = dirname(fileURLToPath(import.meta.url)); const __dir = dirname(fileURLToPath(import.meta.url));
const QUEUE_PATH = `${__dir}/../data/jobs_queue.json`; const QUEUE_PATH = `${__dir}/../data/jobs_queue.json`;
@@ -40,33 +45,40 @@ function ensureDir(path) {
if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
} }
/**
* Atomic write — writes to a temp file then renames.
* Prevents corruption if two processes write simultaneously or the process
* crashes mid-write. rename() is atomic on POSIX filesystems.
*/
function atomicWriteJSON(filePath, data) {
const tmp = filePath + '.tmp';
writeFileSync(tmp, JSON.stringify(data, null, 2));
renameSync(tmp, filePath);
}
// --- In-memory caches (populated on first read, invalidated on write) --- // --- In-memory caches (populated on first read, invalidated on write) ---
let _queueCache = null; let _queueCache = null;
let _logCache = null; let _logCache = null;
let _initialized = false;
/**
* Initialize storage. Must be called (and awaited) once at startup
* before any queue operations.
*/
export async function initQueue(settings) {
_initStorage(settings);
// Load queue and log from primary storage (S3 or local)
_queueCache = await loadJSON(QUEUE_PATH, []);
_logCache = await loadJSON(LOG_PATH, []);
// Apply any pending sidecar updates (local-only mechanism)
if (applyPendingUpdates(_queueCache)) {
await saveQueue(_queueCache);
}
_initialized = true;
const type = storageType();
console.log(`📦 Storage: ${type}${type === 's3' ? ` (${settings.storage.bucket})` : ''}${_queueCache.length} jobs loaded`);
}
/** /**
* Apply pending updates from the sidecar JSONL file. * Apply pending updates from the sidecar JSONL file.
* Secondary processes (e.g. standalone classifier) write updates here * Secondary processes (e.g. standalone classifier) write updates here
* instead of modifying jobs_queue.json directly, avoiding write conflicts. * instead of modifying jobs_queue.json directly, avoiding write conflicts.
*
* Uses rename-then-read to avoid losing updates appended between read and delete.
* Each line is JSON: { id, ...fields }
*/ */
function applyPendingUpdates(queue) { function applyPendingUpdates(queue) {
if (!existsSync(UPDATES_PATH)) return false; if (!existsSync(UPDATES_PATH)) return false;
// Atomically claim the file by renaming it — new appends go to a fresh file
const claimedPath = UPDATES_PATH + '.applying'; const claimedPath = UPDATES_PATH + '.applying';
try { renameSync(UPDATES_PATH, claimedPath); } catch { return false; } try { renameSync(UPDATES_PATH, claimedPath); } catch { return false; }
@@ -93,118 +105,72 @@ function applyPendingUpdates(queue) {
return applied > 0; return applied > 0;
} }
/**
* Get the in-memory queue. Must call initQueue() first.
*/
export function loadQueue() { export function loadQueue() {
if (_queueCache) return _queueCache; if (!_initialized) {
// Fallback for code that hasn't been updated to call initQueue yet
ensureDir(QUEUE_PATH); ensureDir(QUEUE_PATH);
if (!_queueCache) {
let data = null; _queueCache = existsSync(QUEUE_PATH) ? JSON.parse(readFileSync(QUEUE_PATH, 'utf8')) : [];
if (existsSync(QUEUE_PATH)) { if (!Array.isArray(_queueCache)) _queueCache = [];
try {
const raw = readFileSync(QUEUE_PATH, 'utf8');
const parsed = JSON.parse(raw);
if (!Array.isArray(parsed)) throw new Error(`Expected array, got ${typeof parsed}`);
data = parsed;
} catch (err) {
console.warn(`⚠️ Queue file corrupt: ${err.message} — will attempt S3 restore`);
} }
} }
// S3 restore handled async at startup via initQueueFromS3() — for sync path, use empty array
_queueCache = data || [];
if (applyPendingUpdates(_queueCache)) {
saveQueue(_queueCache);
}
return _queueCache; return _queueCache;
} }
/** /**
* Async queue initialization with S3 fallback. * Force a fresh read from storage + apply pending updates.
* Call once at startup before processing jobs. * Call this between iterations in long-running processes.
*/ */
export async function initQueueFromS3(settings) { export async function reloadQueue() {
initStorage(settings); _queueCache = await loadJSON(QUEUE_PATH, []);
if (applyPendingUpdates(_queueCache)) {
// If local queue is missing or empty, try S3 await saveQueue(_queueCache);
let needsRestore = false;
if (!existsSync(QUEUE_PATH)) {
needsRestore = true;
} else {
try {
const raw = readFileSync(QUEUE_PATH, 'utf8');
const parsed = JSON.parse(raw);
if (!Array.isArray(parsed)) needsRestore = true;
} catch {
needsRestore = true;
}
}
if (needsRestore) {
const restored = await loadJSONSafe(QUEUE_PATH, []);
if (Array.isArray(restored) && restored.length > 0) {
_queueCache = restored;
console.log(`✅ Queue restored from S3: ${restored.length} jobs`);
}
}
// Also ensure applications log is backed up
if (existsSync(LOG_PATH)) {
backupToS3(LOG_PATH);
} }
return _queueCache;
} }
/** /**
* Force a fresh read from disk + apply pending updates. * Save the queue to primary storage.
* Call this between iterations in long-running processes to pick up
* sidecar updates from secondary processes.
*/ */
export function reloadQueue() { export async function saveQueue(jobs) {
_queueCache = null;
return loadQueue();
}
export function saveQueue(jobs) {
if (!Array.isArray(jobs)) { if (!Array.isArray(jobs)) {
throw new Error(`saveQueue: expected array, got ${typeof jobs} — refusing to write corrupt data`); throw new Error(`saveQueue: expected array, got ${typeof jobs} — refusing to write corrupt data`);
} }
ensureDir(QUEUE_PATH); await saveJSON(QUEUE_PATH, jobs);
atomicWriteJSON(QUEUE_PATH, jobs);
_queueCache = jobs; _queueCache = jobs;
backupToS3(QUEUE_PATH);
} }
function loadLog() { async function loadLog() {
if (_logCache) return _logCache; if (_logCache) return _logCache;
ensureDir(LOG_PATH); _logCache = await loadJSON(LOG_PATH, []);
_logCache = existsSync(LOG_PATH) ? JSON.parse(readFileSync(LOG_PATH, 'utf8')) : [];
return _logCache; return _logCache;
} }
function saveLog(log) { async function saveLog(log) {
if (!Array.isArray(log)) { if (!Array.isArray(log)) {
throw new Error(`saveLog: expected array, got ${typeof log} — refusing to write corrupt data`); throw new Error(`saveLog: expected array, got ${typeof log} — refusing to write corrupt data`);
} }
ensureDir(LOG_PATH); await saveJSON(LOG_PATH, log);
atomicWriteJSON(LOG_PATH, log);
_logCache = log; _logCache = log;
backupToS3(LOG_PATH);
} }
export function appendLog(entry) { export async function appendLog(entry) {
const log = loadLog(); const log = await loadLog();
log.push({ ...entry, logged_at: new Date().toISOString() }); log.push({ ...entry, logged_at: new Date().toISOString() });
saveLog(log); await saveLog(log);
} }
/** /**
* After AI filtering, deduplicate jobs that exist on multiple tracks. * After AI filtering, deduplicate jobs that exist on multiple tracks.
* For each group sharing the same original job URL, keep the highest-scoring copy.
* Marks losers as status='duplicate'. Call this after collect completes.
*/ */
export function dedupeAfterFilter() { export async function dedupeAfterFilter() {
_queueCache = null; _queueCache = null;
const queue = loadQueue(); const queue = await loadJSON(QUEUE_PATH, []);
_queueCache = queue;
// Group by URL (canonical dedup key)
const byUrl = {}; const byUrl = {};
for (const job of queue) { for (const job of queue) {
if (!job.url) continue; if (!job.url) continue;
@@ -215,10 +181,8 @@ export function dedupeAfterFilter() {
let deduped = 0; let deduped = 0;
for (const jobs of Object.values(byUrl)) { for (const jobs of Object.values(byUrl)) {
if (jobs.length < 2) continue; if (jobs.length < 2) continue;
// Only dedup if ALL copies have been scored — skip groups with unscored members
if (jobs.some(j => j.filter_score == null && j.status !== 'filtered')) continue; if (jobs.some(j => j.filter_score == null && j.status !== 'filtered')) continue;
// Keep the one with highest filter_score; if tied, prefer 'new' over 'filtered'
jobs.sort((a, b) => { jobs.sort((a, b) => {
const sa = a.filter_score ?? -1; const sa = a.filter_score ?? -1;
const sb = b.filter_score ?? -1; const sb = b.filter_score ?? -1;
@@ -226,7 +190,6 @@ export function dedupeAfterFilter() {
if (a.status === 'new' && b.status !== 'new') return -1; if (a.status === 'new' && b.status !== 'new') return -1;
return 1; return 1;
}); });
// Mark losers as duplicate
for (const loser of jobs.slice(1)) { for (const loser of jobs.slice(1)) {
loser.status = 'duplicate'; loser.status = 'duplicate';
loser.status_updated_at = new Date().toISOString(); loser.status_updated_at = new Date().toISOString();
@@ -234,12 +197,12 @@ export function dedupeAfterFilter() {
} }
} }
if (deduped > 0) saveQueue(queue); if (deduped > 0) await saveQueue(queue);
return deduped; return deduped;
} }
export function isAlreadyApplied(jobId) { export async function isAlreadyApplied(jobId) {
const log = loadLog(); const log = await loadLog();
return log.some(e => e.id === jobId && e.status === 'applied'); return log.some(e => e.id === jobId && e.status === 'applied');
} }
@@ -249,7 +212,7 @@ export function getJobsByStatus(status) {
return queue.filter(j => j.status === status); return queue.filter(j => j.status === status);
} }
export function updateJobStatus(id, status, extra = {}) { export async function updateJobStatus(id, status, extra = {}) {
const queue = loadQueue(); const queue = loadQueue();
const idx = queue.findIndex(j => j.id === id); const idx = queue.findIndex(j => j.id === id);
if (idx === -1) return; if (idx === -1) return;
@@ -259,31 +222,24 @@ export function updateJobStatus(id, status, extra = {}) {
status, status,
status_updated_at: new Date().toISOString(), status_updated_at: new Date().toISOString(),
}; };
saveQueue(queue); await saveQueue(queue);
return queue[idx]; return queue[idx];
} }
/** /**
* Write a pending update to the sidecar JSONL file. * Write a pending update to the sidecar JSONL file.
* Use this from secondary processes (standalone scripts) instead of * Use this from secondary processes instead of calling updateJobStatus directly.
* calling updateJobStatus/saveQueue directly. The primary process
* (searcher/applier) will pick up these updates on next loadQueue().
*
* @param {string} id — job id
* @param {object} fields — fields to merge (e.g. { status: 'new', apply_type: 'lever', apply_url: '...' })
*/ */
export function writePendingUpdate(id, fields) { export function writePendingUpdate(id, fields) {
ensureDir(UPDATES_PATH); ensureDir(UPDATES_PATH);
appendFileSync(UPDATES_PATH, JSON.stringify({ id, ...fields }) + '\n'); appendFileSync(UPDATES_PATH, JSON.stringify({ id, ...fields }) + '\n');
} }
export function addJobs(newJobs) { export async function addJobs(newJobs) {
// Always read fresh from disk to avoid clobbering concurrent writes (e.g. filter scoring) // Reload fresh to avoid clobbering concurrent writes
_queueCache = null; _queueCache = await loadJSON(QUEUE_PATH, []);
const queue = loadQueue(); const queue = _queueCache;
// Dedup key: same job.id + same track = skip (duplicate search hit for same track)
// Same job.id but different track = allow (will be deduped after AI filter, keeping best score)
const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`)); const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`));
let added = 0; let added = 0;
@@ -293,8 +249,6 @@ export function addJobs(newJobs) {
if (existingKeys.has(key)) continue; if (existingKeys.has(key)) continue;
existingKeys.add(key); existingKeys.add(key);
// If this job.id already exists on a different track, give this copy a composite id
// so filter batch custom_ids don't collide
const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track); const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track);
const queueId = idConflict ? `${job.id}_${track}` : job.id; const queueId = idConflict ? `${job.id}_${track}` : job.id;
@@ -312,6 +266,6 @@ export function addJobs(newJobs) {
added++; added++;
} }
saveQueue(queue); await saveQueue(queue);
return added; return added;
} }

View File

@@ -1,37 +1,35 @@
/**
 * storage.mjs — Pluggable data storage (local disk or S3)
 *
 * When type is "local": reads/writes go to local disk (default).
 * When type is "s3": S3 is the primary store. No local files for data.
 *   - Versioned bucket means every write is recoverable.
 *   - In-memory cache in queue.mjs handles read performance.
 *
 * Config in settings.json:
 *   storage: { type: "s3", bucket: "claw-apply-data", region: "us-west-2" }
 *   storage: { type: "local" }   (default)
 */
import { readFileSync, writeFileSync, existsSync, renameSync } from 'fs';
import { basename } from 'path';

// Lazily-built singleton: { client, PutObjectCommand, GetObjectCommand }.
let _s3Client = null;
// Active storage config; local disk until initStorage() is called.
let _config = { type: 'local' };

/**
 * Initialize storage with settings. Call once at startup.
 * @param {object} [settings] — parsed settings.json; reads settings.storage
 */
export function initStorage(settings) {
  _config = settings?.storage || { type: 'local' };
}

/** @returns {string} active backend: 's3' or 'local' */
export function storageType() {
  return _config?.type || 'local';
}

/** Map a local file path to its S3 object key (flat layout under data/). */
function getS3Key(filePath) {
  return `data/${basename(filePath)}`;
}

/**
 * Lazily import the AWS SDK and build the singleton client bundle.
 * Dynamic import keeps @aws-sdk/client-s3 optional for local-only installs.
 * @returns {Promise<object>} the { client, PutObjectCommand, GetObjectCommand } bundle
 */
async function getS3Client() {
  if (_s3Client) return _s3Client;
  const { S3Client: Client, PutObjectCommand, GetObjectCommand } = await import('@aws-sdk/client-s3');
  _s3Client = {
    client: new Client({ region: _config.region || 'us-west-2' }),
    // PutObjectCommand is required by saveJSON (s3.PutObjectCommand below).
    PutObjectCommand,
    GetObjectCommand,
  };
  return _s3Client;
}

/**
 * Validate that parsed JSON matches the expected container shape.
 * Queue/log files default to arrays; a non-array there means corruption.
 * @throws {Error} when an array was expected but something else was parsed
 */
function assertExpectedShape(parsed, defaultValue) {
  if (Array.isArray(defaultValue) && !Array.isArray(parsed)) {
    throw new Error(`Expected array, got ${typeof parsed}`);
  }
}

/**
 * Load JSON data. Source depends on storage type.
 * Never throws: missing or unreadable data yields defaultValue (with a
 * warning, except for a plain not-found which is expected on first run).
 * @param {string} filePath — local path; also keys the S3 object via getS3Key()
 * @param {*} [defaultValue=[]] — fallback; an array default enforces array shape
 * @returns {Promise<*>} parsed JSON or defaultValue
 */
export async function loadJSON(filePath, defaultValue = []) {
  if (_config.type === 's3') {
    try {
      const s3 = await getS3Client();
      const response = await s3.client.send(new s3.GetObjectCommand({
        Bucket: _config.bucket,
        Key: getS3Key(filePath),
      }));
      const body = await response.Body.transformToString();
      const parsed = JSON.parse(body);
      assertExpectedShape(parsed, defaultValue);
      return parsed;
    } catch (err) {
      // Object not created yet — expected on first run, no warning needed.
      if (err.name === 'NoSuchKey') return defaultValue;
      console.warn(`⚠️ S3 load failed for ${basename(filePath)}: ${err.message}`);
      return defaultValue;
    }
  }

  // Local storage
  if (!existsSync(filePath)) return defaultValue;
  try {
    const raw = readFileSync(filePath, 'utf8');
    const parsed = JSON.parse(raw);
    assertExpectedShape(parsed, defaultValue);
    return parsed;
  } catch (err) {
    console.warn(`⚠️ Local ${basename(filePath)} is corrupt: ${err.message}`);
    return defaultValue;
  }
}

/**
 * Save JSON data. Destination depends on storage type.
 * Validates data before writing to prevent corruption.
 * @param {string} filePath — local path; also keys the S3 object via getS3Key()
 * @param {object|Array} data — must not be a string, null, or undefined
 * @throws {Error} when data is a type that would corrupt a queue/log file
 */
export async function saveJSON(filePath, data) {
  if (typeof data === 'string' || data === null || data === undefined) {
    // typeof null is 'object', which would make the message misleading.
    const label = data === null ? 'null' : typeof data;
    throw new Error(`Refusing to save ${label} to ${basename(filePath)} — data corruption prevented`);
  }
  const body = JSON.stringify(data, null, 2);

  if (_config.type === 's3') {
    const s3 = await getS3Client();
    await s3.client.send(new s3.PutObjectCommand({
      Bucket: _config.bucket,
      Key: getS3Key(filePath),
      Body: body,
      ContentType: 'application/json',
    }));
    return;
  }

  // Local storage — write to a temp file then rename, so readers never
  // observe a partially written file.
  const tmp = filePath + '.tmp';
  writeFileSync(tmp, body);
  renameSync(tmp, filePath);
}

View File

@@ -163,7 +163,7 @@ export async function processTelegramReplies(settings, answersPath) {
} }
if (answersDirty) saveAnswers(answersPath, answers); if (answersDirty) saveAnswers(answersPath, answers);
if (queueDirty) saveQueue(queue); if (queueDirty) await saveQueue(queue);
saveOffset(maxUpdateId + 1); saveOffset(maxUpdateId + 1);
return processed; return processed;