feat: find-all → filter → dedup flow

- addJobs: allows the same job on multiple tracks (dedup key = track::id)
- Cross-track copies get a composite id (`${job.id}_${track}`) so filter batch custom_ids don't collide
- dedupeAfterFilter(): after collect, keeps the highest-scored copy per URL and marks the rest as 'duplicate'
- dedupeAfterFilter() is called automatically at the end of the collect phase
2026-03-06 15:55:00 +00:00
parent 2dfadbde99
commit c9b527c83a
2 changed files with 62 additions and 4 deletions
--- a/lib/queue.mjs
+++ b/lib/queue.mjs
@@ -72,6 +72,46 @@ export function appendLog(entry) {
  saveLog(log);
 }

+/**
+ * Collapse copies of one job that were queued on several tracks.
+ * Jobs are grouped by `url`; the highest `filter_score` wins (unscored counts as -1),
+ * ties prefer status 'new', then queue order. Other copies get status 'duplicate'.
+ * Jobs without a url are never touched. Call this after collect completes.
+ * @returns {number} how many jobs were newly marked 'duplicate' (queue is saved if > 0)
+ */
+export function dedupeAfterFilter() {
+  _queueCache = null; // drop the cache so loadQueue() rereads the stored queue
+  const queue = loadQueue();
+
+  // Map, not {}: a url such as 'constructor' would resolve to an inherited member on {}
+  const byUrl = new Map();
+  for (const job of queue) {
+    if (!job.url) continue;
+    if (!byUrl.has(job.url)) byUrl.set(job.url, []);
+    byUrl.get(job.url).push(job);
+  }
+
+  const now = new Date().toISOString();
+  let deduped = 0;
+  for (const jobs of byUrl.values()) {
+    if (jobs.length < 2) continue;
+    // Score descending, then 'new' first; 0 on full ties keeps the comparator consistent
+    jobs.sort((a, b) => {
+      const byScore = (b.filter_score ?? -1) - (a.filter_score ?? -1);
+      return byScore || Number(b.status === 'new') - Number(a.status === 'new');
+    });
+    for (const loser of jobs.slice(1)) {
+      if (loser.status === 'duplicate') continue; // already marked by an earlier run
+      loser.status = 'duplicate';
+      loser.status_updated_at = now;
+      deduped++;
+    }
+  }
+
+  if (deduped > 0) saveQueue(queue);
+  return deduped;
+}
+
 export function isAlreadyApplied(jobId) {
  const log = loadLog();
  return log.some(e => e.id === jobId && e.status === 'applied');
@@ -101,14 +141,27 @@ export function addJobs(newJobs) {
  // Always read fresh from disk to avoid clobbering concurrent writes (e.g. filter scoring)
  _queueCache = null;
  const queue = loadQueue();
-  const existingIds = new Set(queue.map(j => j.id));
-  const existingUrls = new Set(queue.map(j => j.url));
+
+  // Dedup key: same job.id + same track = skip (duplicate search hit for same track)
+  // Same job.id but different track = allow (will be deduped after AI filter, keeping best score)
+  const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`));
  let added = 0;

  for (const job of newJobs) {
-    if (existingIds.has(job.id) || existingUrls.has(job.url)) continue;
+    const track = job.track || 'ae';
+    const key = `${track}::${job.id}`;
+    if (existingKeys.has(key)) continue;
+    existingKeys.add(key);
+
+    // If this job.id already exists on a different track, give this copy a composite id
+    // so filter batch custom_ids don't collide
+    const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track);
+    const queueId = idConflict ? `${job.id}_${track}` : job.id;
+
    queue.push({
      ...job,
+      id: queueId,
+      original_id: job.id,
      status: 'new',
      found_at: new Date().toISOString(),
      status_updated_at: new Date().toISOString(),