- New lib/storage.mjs: async S3 backup on every queue/log save - Versioned S3 bucket (claw-apply-data) keeps every revision - Auto-restore from S3 if local file is missing or corrupt - saveQueue/saveLog now validate data type before writing (prevents the exact bug that corrupted the queue) - IAM role attached to EC2 instance for credential-free S3 access - Config: storage.type = "local" (default) or "s3" Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
318 lines
9.5 KiB
JavaScript
318 lines
9.5 KiB
JavaScript
/**
|
|
* queue.mjs — Job queue management
|
|
* Handles jobs_queue.json read/write/update
|
|
* Uses in-memory cache to avoid redundant disk I/O within a run.
|
|
* S3-backed: every write syncs to versioned S3 bucket (never lose data).
|
|
*/
|
|
import { readFileSync, writeFileSync, appendFileSync, renameSync, unlinkSync, existsSync, mkdirSync } from 'fs';
|
|
import { dirname, resolve } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { initStorage, backupToS3, loadJSONSafe } from './storage.mjs';
|
|
|
|
const __dir = dirname(fileURLToPath(import.meta.url));
|
|
const QUEUE_PATH = `${__dir}/../data/jobs_queue.json`;
|
|
const LOG_PATH = `${__dir}/../data/applications_log.json`;
|
|
const UPDATES_PATH = `${__dir}/../data/queue_updates.jsonl`;
|
|
|
|
/**
|
|
* Load and validate a JSON config file. Throws with a clear message on failure.
|
|
*/
|
|
export function loadConfig(filePath) {
|
|
const resolved = resolve(filePath);
|
|
if (!existsSync(resolved)) {
|
|
throw new Error(`Config file not found: ${resolved}\nCopy the matching .example.json and fill in your values.`);
|
|
}
|
|
let raw;
|
|
try {
|
|
raw = readFileSync(resolved, 'utf8');
|
|
} catch (e) {
|
|
throw new Error(`Cannot read config file ${resolved}: ${e.message}`);
|
|
}
|
|
try {
|
|
return JSON.parse(raw);
|
|
} catch (e) {
|
|
throw new Error(`Invalid JSON in ${resolved}: ${e.message}`);
|
|
}
|
|
}
|
|
|
|
function ensureDir(path) {
|
|
const dir = dirname(path);
|
|
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
|
}
|
|
|
|
/**
|
|
* Atomic write — writes to a temp file then renames.
|
|
* Prevents corruption if two processes write simultaneously or the process
|
|
* crashes mid-write. rename() is atomic on POSIX filesystems.
|
|
*/
|
|
function atomicWriteJSON(filePath, data) {
|
|
const tmp = filePath + '.tmp';
|
|
writeFileSync(tmp, JSON.stringify(data, null, 2));
|
|
renameSync(tmp, filePath);
|
|
}
|
|
|
|
// --- In-memory caches (populated on first read, invalidated on write) ---
|
|
let _queueCache = null;
|
|
let _logCache = null;
|
|
|
|
/**
|
|
* Apply pending updates from the sidecar JSONL file.
|
|
* Secondary processes (e.g. standalone classifier) write updates here
|
|
* instead of modifying jobs_queue.json directly, avoiding write conflicts.
|
|
*
|
|
* Uses rename-then-read to avoid losing updates appended between read and delete.
|
|
* Each line is JSON: { id, ...fields }
|
|
*/
|
|
function applyPendingUpdates(queue) {
|
|
if (!existsSync(UPDATES_PATH)) return false;
|
|
|
|
// Atomically claim the file by renaming it — new appends go to a fresh file
|
|
const claimedPath = UPDATES_PATH + '.applying';
|
|
try { renameSync(UPDATES_PATH, claimedPath); } catch { return false; }
|
|
|
|
let lines;
|
|
try {
|
|
lines = readFileSync(claimedPath, 'utf8').trim().split('\n').filter(Boolean);
|
|
} catch { return false; }
|
|
finally { try { unlinkSync(claimedPath); } catch {} }
|
|
|
|
if (lines.length === 0) return false;
|
|
|
|
const byId = new Map(queue.map((j, i) => [j.id, i]));
|
|
let applied = 0;
|
|
for (const line of lines) {
|
|
try {
|
|
const { id, ...fields } = JSON.parse(line);
|
|
const idx = byId.get(id);
|
|
if (idx == null) continue;
|
|
queue[idx] = { ...queue[idx], ...fields, status_updated_at: new Date().toISOString() };
|
|
applied++;
|
|
} catch {}
|
|
}
|
|
|
|
return applied > 0;
|
|
}
|
|
|
|
export function loadQueue() {
|
|
if (_queueCache) return _queueCache;
|
|
ensureDir(QUEUE_PATH);
|
|
|
|
let data = null;
|
|
if (existsSync(QUEUE_PATH)) {
|
|
try {
|
|
const raw = readFileSync(QUEUE_PATH, 'utf8');
|
|
const parsed = JSON.parse(raw);
|
|
if (!Array.isArray(parsed)) throw new Error(`Expected array, got ${typeof parsed}`);
|
|
data = parsed;
|
|
} catch (err) {
|
|
console.warn(`⚠️ Queue file corrupt: ${err.message} — will attempt S3 restore`);
|
|
}
|
|
}
|
|
|
|
// S3 restore handled async at startup via initQueueFromS3() — for sync path, use empty array
|
|
_queueCache = data || [];
|
|
if (applyPendingUpdates(_queueCache)) {
|
|
saveQueue(_queueCache);
|
|
}
|
|
return _queueCache;
|
|
}
|
|
|
|
/**
|
|
* Async queue initialization with S3 fallback.
|
|
* Call once at startup before processing jobs.
|
|
*/
|
|
export async function initQueueFromS3(settings) {
|
|
initStorage(settings);
|
|
|
|
// If local queue is missing or empty, try S3
|
|
let needsRestore = false;
|
|
if (!existsSync(QUEUE_PATH)) {
|
|
needsRestore = true;
|
|
} else {
|
|
try {
|
|
const raw = readFileSync(QUEUE_PATH, 'utf8');
|
|
const parsed = JSON.parse(raw);
|
|
if (!Array.isArray(parsed)) needsRestore = true;
|
|
} catch {
|
|
needsRestore = true;
|
|
}
|
|
}
|
|
|
|
if (needsRestore) {
|
|
const restored = await loadJSONSafe(QUEUE_PATH, []);
|
|
if (Array.isArray(restored) && restored.length > 0) {
|
|
_queueCache = restored;
|
|
console.log(`✅ Queue restored from S3: ${restored.length} jobs`);
|
|
}
|
|
}
|
|
|
|
// Also ensure applications log is backed up
|
|
if (existsSync(LOG_PATH)) {
|
|
backupToS3(LOG_PATH);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Force a fresh read from disk + apply pending updates.
|
|
* Call this between iterations in long-running processes to pick up
|
|
* sidecar updates from secondary processes.
|
|
*/
|
|
export function reloadQueue() {
|
|
_queueCache = null;
|
|
return loadQueue();
|
|
}
|
|
|
|
export function saveQueue(jobs) {
|
|
if (!Array.isArray(jobs)) {
|
|
throw new Error(`saveQueue: expected array, got ${typeof jobs} — refusing to write corrupt data`);
|
|
}
|
|
ensureDir(QUEUE_PATH);
|
|
atomicWriteJSON(QUEUE_PATH, jobs);
|
|
_queueCache = jobs;
|
|
backupToS3(QUEUE_PATH);
|
|
}
|
|
|
|
function loadLog() {
|
|
if (_logCache) return _logCache;
|
|
ensureDir(LOG_PATH);
|
|
_logCache = existsSync(LOG_PATH) ? JSON.parse(readFileSync(LOG_PATH, 'utf8')) : [];
|
|
return _logCache;
|
|
}
|
|
|
|
function saveLog(log) {
|
|
if (!Array.isArray(log)) {
|
|
throw new Error(`saveLog: expected array, got ${typeof log} — refusing to write corrupt data`);
|
|
}
|
|
ensureDir(LOG_PATH);
|
|
atomicWriteJSON(LOG_PATH, log);
|
|
_logCache = log;
|
|
backupToS3(LOG_PATH);
|
|
}
|
|
|
|
export function appendLog(entry) {
|
|
const log = loadLog();
|
|
log.push({ ...entry, logged_at: new Date().toISOString() });
|
|
saveLog(log);
|
|
}
|
|
|
|
/**
|
|
* After AI filtering, deduplicate jobs that exist on multiple tracks.
|
|
* For each group sharing the same original job URL, keep the highest-scoring copy.
|
|
* Marks losers as status='duplicate'. Call this after collect completes.
|
|
*/
|
|
export function dedupeAfterFilter() {
|
|
_queueCache = null;
|
|
const queue = loadQueue();
|
|
|
|
// Group by URL (canonical dedup key)
|
|
const byUrl = {};
|
|
for (const job of queue) {
|
|
if (!job.url) continue;
|
|
if (!byUrl[job.url]) byUrl[job.url] = [];
|
|
byUrl[job.url].push(job);
|
|
}
|
|
|
|
let deduped = 0;
|
|
for (const jobs of Object.values(byUrl)) {
|
|
if (jobs.length < 2) continue;
|
|
// Only dedup if ALL copies have been scored — skip groups with unscored members
|
|
if (jobs.some(j => j.filter_score == null && j.status !== 'filtered')) continue;
|
|
|
|
// Keep the one with highest filter_score; if tied, prefer 'new' over 'filtered'
|
|
jobs.sort((a, b) => {
|
|
const sa = a.filter_score ?? -1;
|
|
const sb = b.filter_score ?? -1;
|
|
if (sb !== sa) return sb - sa;
|
|
if (a.status === 'new' && b.status !== 'new') return -1;
|
|
return 1;
|
|
});
|
|
// Mark losers as duplicate
|
|
for (const loser of jobs.slice(1)) {
|
|
loser.status = 'duplicate';
|
|
loser.status_updated_at = new Date().toISOString();
|
|
deduped++;
|
|
}
|
|
}
|
|
|
|
if (deduped > 0) saveQueue(queue);
|
|
return deduped;
|
|
}
|
|
|
|
export function isAlreadyApplied(jobId) {
|
|
const log = loadLog();
|
|
return log.some(e => e.id === jobId && e.status === 'applied');
|
|
}
|
|
|
|
export function getJobsByStatus(status) {
|
|
const queue = loadQueue();
|
|
if (Array.isArray(status)) return queue.filter(j => status.includes(j.status));
|
|
return queue.filter(j => j.status === status);
|
|
}
|
|
|
|
export function updateJobStatus(id, status, extra = {}) {
|
|
const queue = loadQueue();
|
|
const idx = queue.findIndex(j => j.id === id);
|
|
if (idx === -1) return;
|
|
queue[idx] = {
|
|
...queue[idx],
|
|
...extra,
|
|
status,
|
|
status_updated_at: new Date().toISOString(),
|
|
};
|
|
saveQueue(queue);
|
|
return queue[idx];
|
|
}
|
|
|
|
/**
|
|
* Write a pending update to the sidecar JSONL file.
|
|
* Use this from secondary processes (standalone scripts) instead of
|
|
* calling updateJobStatus/saveQueue directly. The primary process
|
|
* (searcher/applier) will pick up these updates on next loadQueue().
|
|
*
|
|
* @param {string} id — job id
|
|
* @param {object} fields — fields to merge (e.g. { status: 'new', apply_type: 'lever', apply_url: '...' })
|
|
*/
|
|
export function writePendingUpdate(id, fields) {
|
|
ensureDir(UPDATES_PATH);
|
|
appendFileSync(UPDATES_PATH, JSON.stringify({ id, ...fields }) + '\n');
|
|
}
|
|
|
|
export function addJobs(newJobs) {
|
|
// Always read fresh from disk to avoid clobbering concurrent writes (e.g. filter scoring)
|
|
_queueCache = null;
|
|
const queue = loadQueue();
|
|
|
|
// Dedup key: same job.id + same track = skip (duplicate search hit for same track)
|
|
// Same job.id but different track = allow (will be deduped after AI filter, keeping best score)
|
|
const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`));
|
|
let added = 0;
|
|
|
|
for (const job of newJobs) {
|
|
const track = job.track || 'ae';
|
|
const key = `${track}::${job.id}`;
|
|
if (existingKeys.has(key)) continue;
|
|
existingKeys.add(key);
|
|
|
|
// If this job.id already exists on a different track, give this copy a composite id
|
|
// so filter batch custom_ids don't collide
|
|
const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track);
|
|
const queueId = idConflict ? `${job.id}_${track}` : job.id;
|
|
|
|
queue.push({
|
|
...job,
|
|
id: queueId,
|
|
original_id: job.id,
|
|
status: 'new',
|
|
found_at: new Date().toISOString(),
|
|
status_updated_at: new Date().toISOString(),
|
|
pending_question: null,
|
|
applied_at: null,
|
|
notes: null,
|
|
});
|
|
added++;
|
|
}
|
|
|
|
saveQueue(queue);
|
|
return added;
|
|
}
|