feat: find-all → filter → dedup flow
- addJobs: allows the same job on multiple tracks (dedup key = track::id)
- Cross-track copies get a composite id (`<job.id>_<track>`) to avoid batch collisions
- dedupeAfterFilter(): after collect, keeps the highest-scored copy per URL and marks the rest as 'duplicate'
- Called automatically at the end of the collect phase
This commit is contained in:
@@ -72,6 +72,46 @@ export function appendLog(entry) {
|
||||
saveLog(log);
|
||||
}
|
||||
|
||||
/**
 * After AI filtering, deduplicate jobs that exist on multiple tracks.
 *
 * Jobs are grouped by `url`; jobs without a `url` are left untouched. Within
 * each group the copy with the highest `filter_score` is kept (a missing score
 * counts as -1). On a score tie a copy with status 'new' is preferred over any
 * other status; if that still ties, the copy that appears first in the queue
 * wins. Every other copy is marked status='duplicate'.
 *
 * The function is idempotent: copies that are already 'duplicate' are neither
 * re-stamped nor re-counted, and the queue is only saved when something changed.
 * Call this after collect completes.
 *
 * @returns {number} Number of jobs newly marked as 'duplicate' by this call.
 */
export function dedupeAfterFilter() {
  // Reset the cache before loading so the queue is not served from a stale copy.
  _queueCache = null;
  const queue = loadQueue();

  // Group by URL (canonical dedup key). A Map is used instead of a plain object
  // so that URL strings such as "__proto__" cannot collide with Object.prototype.
  const byUrl = new Map();
  for (const job of queue) {
    if (!job.url) continue;
    const group = byUrl.get(job.url);
    if (group) {
      group.push(job);
    } else {
      byUrl.set(job.url, [job]);
    }
  }

  let deduped = 0;
  for (const jobs of byUrl.values()) {
    if (jobs.length < 2) continue;

    // Highest filter_score first; if tied, prefer 'new' over any other status.
    // Returning 0 for a full tie keeps the comparator consistent, so the stable
    // sort leaves equally-ranked copies in queue order.
    jobs.sort((a, b) => {
      const sa = a.filter_score ?? -1;
      const sb = b.filter_score ?? -1;
      if (sb !== sa) return sb - sa;
      const aIsNew = a.status === 'new';
      const bIsNew = b.status === 'new';
      if (aIsNew !== bIsNew) return aIsNew ? -1 : 1;
      return 0;
    });

    // Mark losers as duplicate; skip ones already marked by an earlier run.
    const now = new Date().toISOString();
    for (const loser of jobs.slice(1)) {
      if (loser.status === 'duplicate') continue;
      loser.status = 'duplicate';
      loser.status_updated_at = now;
      deduped++;
    }
  }

  if (deduped > 0) saveQueue(queue);
  return deduped;
}
|
||||
|
||||
export function isAlreadyApplied(jobId) {
|
||||
const log = loadLog();
|
||||
return log.some(e => e.id === jobId && e.status === 'applied');
|
||||
@@ -101,14 +141,27 @@ export function addJobs(newJobs) {
|
||||
// Always read fresh from disk to avoid clobbering concurrent writes (e.g. filter scoring)
|
||||
_queueCache = null;
|
||||
const queue = loadQueue();
|
||||
const existingIds = new Set(queue.map(j => j.id));
|
||||
const existingUrls = new Set(queue.map(j => j.url));
|
||||
|
||||
// Dedup key: same job.id + same track = skip (duplicate search hit for same track)
|
||||
// Same job.id but different track = allow (will be deduped after AI filter, keeping best score)
|
||||
const existingKeys = new Set(queue.map(j => `${j.track || 'ae'}::${j.id}`));
|
||||
let added = 0;
|
||||
|
||||
for (const job of newJobs) {
|
||||
if (existingIds.has(job.id) || existingUrls.has(job.url)) continue;
|
||||
const track = job.track || 'ae';
|
||||
const key = `${track}::${job.id}`;
|
||||
if (existingKeys.has(key)) continue;
|
||||
existingKeys.add(key);
|
||||
|
||||
// If this job.id already exists on a different track, give this copy a composite id
|
||||
// so filter batch custom_ids don't collide
|
||||
const idConflict = queue.some(j => j.id === job.id && (j.track || 'ae') !== track);
|
||||
const queueId = idConflict ? `${job.id}_${track}` : job.id;
|
||||
|
||||
queue.push({
|
||||
...job,
|
||||
id: queueId,
|
||||
original_id: job.id,
|
||||
status: 'new',
|
||||
found_at: new Date().toISOString(),
|
||||
status_updated_at: new Date().toISOString(),
|
||||
|
||||
Reference in New Issue
Block a user