diff --git a/job_filter.mjs b/job_filter.mjs
index c6e4a72..89f2722 100644
--- a/job_filter.mjs
+++ b/job_filter.mjs
@@ -23,7 +23,7 @@ import { readFileSync, writeFileSync, existsSync, unlinkSync } from 'fs';
 
 const __dir = dirname(fileURLToPath(import.meta.url));
 
-import { getJobsByStatus, updateJobStatus, loadConfig, loadQueue, saveQueue } from './lib/queue.mjs';
+import { getJobsByStatus, updateJobStatus, loadConfig, loadQueue, saveQueue, dedupeAfterFilter } from './lib/queue.mjs';
 import { loadProfile, submitBatches, checkBatch, downloadResults } from './lib/filter.mjs';
 import { sendTelegram } from './lib/notify.mjs';
 
@@ -152,6 +152,11 @@ async function collect(state, settings) {
   }
 
   saveQueue(queue);
+
+  // Dedup cross-track copies — keep highest-scoring version of each job
+  const duped = dedupeAfterFilter();
+  if (duped > 0) console.log(` Deduped ${duped} cross-track copies`);
+
   clearState();
 
   // Log run
diff --git a/lib/queue.mjs b/lib/queue.mjs
index 381a4fa..91feeac 100644
--- a/lib/queue.mjs
+++ b/lib/queue.mjs
@@ -72,6 +72,46 @@ export function appendLog(entry) {
   saveLog(log);
 }
+/**
+ * After AI filtering, deduplicate jobs that exist on multiple tracks.
+ * For each group sharing the same original job URL, keep the highest-scoring copy.
+ * Marks losers as status='duplicate'. Call this after collect completes.
+ * @returns {number} count of jobs demoted to 'duplicate'
+ */
+export function dedupeAfterFilter() {
+  _queueCache = null;
+  const queue = loadQueue();
+
+  // Group by URL (canonical dedup key)
+  const byUrl = {};
+  for (const job of queue) {
+    if (!job.url) continue;
+    if (!byUrl[job.url]) byUrl[job.url] = [];
+    byUrl[job.url].push(job);
+  }
+
+  let deduped = 0;
+  for (const jobs of Object.values(byUrl)) {
+    if (jobs.length < 2) continue;
+    // Keep the one with highest filter_score; if tied, prefer 'new' over 'filtered'.
+    // Comparator must be consistent (antisymmetric, 0 on ties) per the
+    // Array.prototype.sort contract, otherwise the kept "winner" is arbitrary.
+    jobs.sort((a, b) => {
+      const sa = a.filter_score ?? -1;
+      const sb = b.filter_score ?? -1;
+      if (sb !== sa) return sb - sa;
+      if (a.status === 'new' && b.status !== 'new') return -1;
+      return b.status === 'new' && a.status !== 'new' ? 1 : 0;
+    });
+    // Mark losers as duplicate
+    for (const loser of jobs.slice(1)) {
+      loser.status = 'duplicate';
+      loser.status_updated_at = new Date().toISOString();
+      deduped++;
+    }
+  }
+
+  if (deduped > 0) saveQueue(queue);
+  return deduped;
+}
+
 export function isAlreadyApplied(jobId) {
   const log = loadLog();
   return log.some(e => e.id === jobId && e.status === 'applied');
 }
@@ -101,14 +101,27 @@ export function addJobs(newJobs) {
   // Always read fresh from disk to avoid clobbering concurrent writes (e.g. filter scoring)
   _queueCache = null;
   const queue = loadQueue();
-  const existingIds = new Set(queue.map(j => j.id));
-  const existingUrls = new Set(queue.map(j => j.url));
+
+  // Dedup key: same job.id + same track = skip (duplicate search hit for same track)
+  // Same job.id but different track = allow (will be deduped after AI filter, keeping best score)
+  const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`));
 
   let added = 0;
   for (const job of newJobs) {
-    if (existingIds.has(job.id) || existingUrls.has(job.url)) continue;
+    const track = job.track || 'ae';
+    const key = `${track}::${job.id}`;
+    if (existingKeys.has(key)) continue;
+    existingKeys.add(key);
+
+    // If this job.id already exists on a different track, give this copy a composite id
+    // so filter batch custom_ids don't collide
+    const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track);
+    const queueId = idConflict ? `${job.id}_${track}` : job.id;
+
     queue.push({
       ...job,
+      id: queueId,
+      original_id: job.id,
       status: 'new',
       found_at: new Date().toISOString(),
       status_updated_at: new Date().toISOString(),