Description
It appears that my custom RequestQueue isn't working as expected. Very few jobs are being processed, even though my RequestQueue list has many more job IDs.
import { RequestQueue } from "crawlee";
// Module-level handle to the deduplication queue. It starts out unassigned and
// is set by initializeJobQueue() the first time that function runs.
let jobQueue: RequestQueue;
/**
 * Opens the request queue named "job-deduplication-queue" and stores it in the
 * module-level `jobQueue` variable. Does nothing when `jobQueue` is already set.
 */
async function initializeJobQueue() {
  // Already opened by an earlier call: keep the existing handle.
  if (jobQueue) return;

  jobQueue = await RequestQueue.open("job-deduplication-queue");
}
/**
 * Selects which of the given job IDs should be processed and logs how many
 * were dropped.
 *
 * When `saveOnlyUniqueItems` is truthy, each job ID is turned into a
 * `${origin}/viewjob?jk=${jobId}` URL and added to `jobQueue`; only IDs whose
 * add operation reports `wasAlreadyPresent` as falsy are kept. Otherwise every
 * ID is kept as-is.
 *
 * NOTE(review): the queue is opened by name in initializeJobQueue(). If that
 * named storage persists between runs, IDs added in earlier runs would also
 * come back as already present, which could explain a low processed count.
 * Confirm whether cross-run deduplication is intended.
 *
 * @param page - Page handle; not used in the portion of the function shown here.
 * @param jobIds - Candidate job IDs.
 * @param origin - URL origin used to build each job's view URL.
 */
async function fetchJobPages(page: Page, jobIds: string[], origin: string) {
  await initializeJobQueue();

  const filteredJobIds: string[] = [];
  if (!saveOnlyUniqueItems) {
    // Deduplication disabled: pass every ID through untouched.
    filteredJobIds.push(...jobIds);
  } else {
    // Requests are added one at a time, in input order.
    for (const jobId of jobIds) {
      const { wasAlreadyPresent } = await jobQueue.addRequest({
        url: `${origin}/viewjob?jk=${jobId}`,
      });
      if (wasAlreadyPresent) continue;
      filteredJobIds.push(jobId);
    }
  }

  const duplicateCount = jobIds.length - filteredJobIds.length;
  myLog(
    `Filtered ${duplicateCount} duplicates, processing ${filteredJobIds.length} unique jobs.`
  );
  // fetchJobWithRetry and batching logic follows...
}
Am I using the RequestQueue correctly? I am not using the crawler's default request queue because my scraping logic does not allow it.