161 lines
5.3 KiB
JavaScript
161 lines
5.3 KiB
JavaScript
/**
|
|
* storage.mjs — Pluggable data storage (local disk or S3)
|
|
*
|
|
* When type is "local": reads/writes go to local disk (default).
|
|
* When type is "s3": S3 is the primary store.
|
|
* - Versioned bucket means every write is recoverable.
|
|
* - In-memory cache in queue.mjs handles read performance.
|
|
*
|
|
* Config in settings.json:
|
|
* storage: { type: "s3", bucket: "claw-apply-data", region: "us-west-2" }
|
|
* storage: { type: "local" } (default)
|
|
*/
|
|
import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from 'fs';
import { tmpdir } from 'os';
import { basename, dirname, join, resolve as resolvePath } from 'path';
import { fileURLToPath } from 'url';
|
|
|
|
let _s3Client = null;
|
|
let _config = { type: 'local' };
|
|
|
|
export function initStorage(settings) {
|
|
_config = settings?.storage || { type: 'local' };
|
|
}
|
|
|
|
export function storageType() {
|
|
return _config?.type || 'local';
|
|
}
|
|
|
|
function getS3Key(filePath) {
|
|
// Normalize and extract relative path from project root (e.g. data/jobs_queue.json)
|
|
const storageUrl = new URL(import.meta.url);
|
|
const projectRoot = dirname(dirname(storageUrl.pathname));
|
|
const abs = resolvePath(filePath.startsWith('/') ? filePath : join(projectRoot, filePath));
|
|
if (abs.startsWith(projectRoot + '/')) {
|
|
return abs.slice(projectRoot.length + 1);
|
|
}
|
|
// Fallback: use last two path segments
|
|
const parts = abs.split('/');
|
|
return parts.slice(-2).join('/');
|
|
}
|
|
|
|
async function getS3Client() {
|
|
if (_s3Client) return _s3Client;
|
|
const { S3Client: Client, PutObjectCommand, GetObjectCommand } = await import('@aws-sdk/client-s3');
|
|
_s3Client = {
|
|
client: new Client({ region: _config.region || 'us-west-2' }),
|
|
PutObjectCommand,
|
|
GetObjectCommand,
|
|
};
|
|
return _s3Client;
|
|
}
|
|
|
|
/**
|
|
* Load JSON data. Source depends on storage type.
|
|
*/
|
|
export async function loadJSON(filePath, defaultValue = []) {
|
|
if (_config.type === 's3') {
|
|
try {
|
|
const s3 = await getS3Client();
|
|
const response = await s3.client.send(new s3.GetObjectCommand({
|
|
Bucket: _config.bucket,
|
|
Key: getS3Key(filePath),
|
|
}));
|
|
const body = await response.Body.transformToString();
|
|
const parsed = JSON.parse(body);
|
|
if (Array.isArray(defaultValue) && !Array.isArray(parsed)) {
|
|
throw new Error(`Expected array, got ${typeof parsed}`);
|
|
}
|
|
return parsed;
|
|
} catch (err) {
|
|
if (err.name === 'NoSuchKey') return defaultValue;
|
|
if (err.$metadata?.httpStatusCode === 404) return defaultValue;
|
|
console.warn(`⚠️ S3 load failed for ${basename(filePath)}: ${err.message}`);
|
|
return defaultValue;
|
|
}
|
|
}
|
|
|
|
// Local storage
|
|
if (!existsSync(filePath)) return defaultValue;
|
|
try {
|
|
const raw = readFileSync(filePath, 'utf8');
|
|
const parsed = JSON.parse(raw);
|
|
if (Array.isArray(defaultValue) && !Array.isArray(parsed)) {
|
|
throw new Error(`Expected array, got ${typeof parsed}`);
|
|
}
|
|
return parsed;
|
|
} catch (err) {
|
|
console.warn(`⚠️ Local ${basename(filePath)} is corrupt: ${err.message}`);
|
|
return defaultValue;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Save JSON data. Destination depends on storage type.
|
|
* Validates data before writing to prevent corruption.
|
|
*/
|
|
export async function saveJSON(filePath, data) {
|
|
if (typeof data === 'string' || data === null || data === undefined) {
|
|
throw new Error(`Refusing to save ${typeof data} to ${basename(filePath)} — data corruption prevented`);
|
|
}
|
|
|
|
const body = JSON.stringify(data, null, 2);
|
|
|
|
if (_config.type === 's3') {
|
|
const s3 = await getS3Client();
|
|
await s3.client.send(new s3.PutObjectCommand({
|
|
Bucket: _config.bucket,
|
|
Key: getS3Key(filePath),
|
|
Body: body,
|
|
ContentType: 'application/json',
|
|
}));
|
|
return;
|
|
}
|
|
|
|
// Local storage — atomic write
|
|
const dir = dirname(filePath);
|
|
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
|
const tmp = filePath + '.tmp';
|
|
writeFileSync(tmp, body);
|
|
const { renameSync } = await import('fs');
|
|
renameSync(tmp, filePath);
|
|
}
|
|
|
|
/**
|
|
* Load a binary file (e.g. resume PDF) from storage.
|
|
* For S3: downloads to a temp file and returns the local path.
|
|
* For local: returns the path as-is (must already exist).
|
|
*
|
|
* @param {string} s3Key — S3 key (e.g. "config/Matthew_Jackson_Resume.pdf")
|
|
* @param {string} localPath — local file path (used as-is for local storage)
|
|
* @returns {string} — local file path (may be temp file for S3)
|
|
*/
|
|
export async function ensureLocalFile(s3Key, localPath) {
|
|
if (_config.type !== 's3') {
|
|
return localPath;
|
|
}
|
|
|
|
// If the file already exists locally (cached from previous download), use it
|
|
const tempPath = join(tmpdir(), basename(s3Key));
|
|
if (existsSync(tempPath)) return tempPath;
|
|
|
|
try {
|
|
const s3 = await getS3Client();
|
|
const response = await s3.client.send(new s3.GetObjectCommand({
|
|
Bucket: _config.bucket,
|
|
Key: s3Key,
|
|
}));
|
|
const chunks = [];
|
|
for await (const chunk of response.Body) {
|
|
chunks.push(chunk);
|
|
}
|
|
writeFileSync(tempPath, Buffer.concat(chunks));
|
|
console.log(`📄 Downloaded ${basename(s3Key)} from S3 (${chunks.reduce((s, c) => s + c.length, 0)} bytes)`);
|
|
return tempPath;
|
|
} catch (err) {
|
|
console.warn(`⚠️ Failed to download ${s3Key} from S3: ${err.message}`);
|
|
// Fall back to local path if it exists
|
|
if (existsSync(localPath)) return localPath;
|
|
throw new Error(`File not found: ${s3Key} (S3) or ${localPath} (local)`);
|
|
}
|
|
}
|