Worker forks but never comes online - cluster

#node.js #child-process #node-cluster

Question:

I have an app that polls sports events. For each event I fork a worker using the cluster module. Many events can start at the same time, which means many workers may be forked at once. Below is my loopEvents function that spawns these new workers (there's an arbitrary delay between each fork).

 async function loopEvents() {
    try {
        const appData = await store.getAppData()
        const eventsToPoll = []
        const newWorkers = {}

        if (Object.keys(appData.clusterMap).length && appData.currentPolledEvents.length) {
            for (const [eventId, workerId] of Object.entries(appData.clusterMap)) {
                const worker = cluster.workers[workerId]

                if (appData.currentPolledEvents.includes(eventId) && (!worker || worker.isDead())) {
                    const newEventWorkerEnvVars = getEventWorkerEnvVars(eventId)

                    eventsToPoll.push(newEventWorkerEnvVars)
                }
            }
        }

        if (eventsToPoll.length) {
            for (const eventVars of eventsToPoll) {
                await delay(500)

                const eventWorker = cluster.fork(eventVars)

                newWorkers[eventWorker.id] = eventVars.MATCHED_EVENT_ID
            }

            logger.info(`[POLLING ${Object.keys(newWorkers).length} EVENT(S)]${JSON.stringify(newWorkers)}`)

            global.TEMP_EVENT_MAP = {
                ...(global.TEMP_EVENT_MAP || {}),
                ...newWorkers
            }
        }

        setTimeout(loopEvents, appData.currentPollIntervalS * 1000)
    } catch(err) {
        utils.logJsonError(__filename, 'loopEvents', err)
    }
}
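
delay and getEventWorkerEnvVars are defined elsewhere in the app; simplified, they amount to roughly the following (MATCHED_EVENT_ID is the only key loopEvents relies on directly):

// Promise-based sleep used to stagger the forks
const delay = ms => new Promise(resolve => setTimeout(resolve, ms))

// Builds the env object a new event worker is forked with
// (simplified - the real helper adds more event-specific settings)
function getEventWorkerEnvVars(eventId) {
    return {
        MATCHED_EVENT_ID: eventId
    }
}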
 

I have a few cluster event handlers for logging and for persisting to the DB:

 cluster.on('fork', worker => {
    // -1 for LOGS_HTTP_API worker
    logger.info(`[WORKER #${worker.id} HAS BEEN FORKED (${Object.keys(cluster.workers).length - 1} workers)]`)
})

cluster.on('online', async worker => {
    const eventId = global.TEMP_EVENT_MAP[worker.id]

    // Persist to the db
    if (eventId) {
        await store.setAppData({
            clusterMap: {
                [eventId]: worker.id
            }
        }, {
            clusterMapAction: 'add'
        })
    }

    delete global.TEMP_EVENT_MAP[worker.id]

    // ...
})

cluster.on('exit', async (worker, code, signal) => {
    const appData = await store.getAppData()
    const workerName = Object.keys(appData.clusterMap).find(eventId => {
        return String(appData.clusterMap[eventId]) === String(worker.id)
    }) || 'LOGS_HTTP_API'
    const newProcessSize = Object.keys(appData.clusterMap).length - 1

    let msg = `[WORKER #${worker.id} (${workerName}) DIED (${signal || code}) - `

    if (worker.exitedAfterDisconnect) {
        // workerFinishedPoll is set elsewhere (not shown in this snippet)
        msg += (workerFinishedPoll) ? 'FINISHED POLL' : 'KILLED MANUALLY OR FROM STATE UPDATE'

        logger.info(`${msg} (${newProcessSize} workers)]${JSON.stringify(appData.clusterMap)}`)
    }

    if (workerName === 'LOGS_HTTP_API') {
        logger.info(`Apparently logging API died; ${JSON.stringify(appData.clusterMap)}`)
    }

    // ...
})
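
For context when reading the logs further down: appData.clusterMap maps event IDs to cluster worker IDs, and events without a live worker hold a "-" placeholder. For example (values taken from the log lines below):

const clusterMapExample = {
    '6113c787efc8cd001d11b95c': '-', // known event, no live worker
    '6113c78aefc8cd001d11b97c': 11,  // currently polled by worker #11
    '6113c789efc8cd001d11b970': 15   // currently polled by worker #15
}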
 

The logs are as follows:

 {"message":"[WORKER #12 HAS BEEN FORKED (3 workers)]","level":"info","timestamp":"2021-08-11 12:50:33"}
{"message":"[STARTED FOR EVENT "6113c78aefc8cd001d11b97c" (Process: 151)]","level":"info","timestamp":"2021-08-11 12:50:33"}
{"message":"[WORKER #13 HAS BEEN FORKED (4 workers)]","level":"info","timestamp":"2021-08-11 12:50:34"}
{"message":"[WORKER #14 HAS BEEN FORKED (5 workers)]","level":"info","timestamp":"2021-08-11 12:50:34"}
{"message":"[FINISHED POLL FOR EVENT 6113c78aefc8cd001d11b979 (Process: 140)]","level":"info","timestamp":"2021-08-11 12:50:34"}
{"message":"[WORKER #10 (6113c78aefc8cd001d11b979) DIED (SIGTERM) - FINISHED POLL (7 workers)]{"6113c787efc8cd001d11b95c":"-","6113c789efc8cd001d11b971":"-","6113c785efc8cd001d11b944":"-","6113c784efc8cd001d11b93b":"-","6113c787efc8cd001d11b95b":"-","6113c788efc8cd001d11b967":"-","6113c78aefc8cd001d11b979":10,"6113c78aefc8cd001d11b97c":11}","level":"info","timestamp":"2021-08-11 12:50:34"}
{"message":"[ADDING WORKER #15 (6113c789efc8cd001d11b970)]","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"[WORKER #15 HAS BEEN FORKED (5 workers)]","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"[WORKER #16 HAS BEEN FORKED (6 workers)]","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"[WORKER #15 (6113c789efc8cd001d11b970) IS NOW ONLINE (9 workers)]{"6113c787efc8cd001d11b95c":"-","6113c789efc8cd001d11b971":"-","6113c785efc8cd001d11b944":"-","6113c784efc8cd001d11b93b":"-","6113c787efc8cd001d11b95b":"-","6113c788efc8cd001d11b967":"-","6113c78aefc8cd001d11b979":"-","6113c78aefc8cd001d11b97c":11,"6113c789efc8cd001d11b970":15}","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"[FINISHED POLL FOR EVENT 6113c787efc8cd001d11b95c (Process: 162)]","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"[WORKER #12 (LOGS_HTTP_API) DIED (SIGTERM) - FINISHED POLL (8 workers)]{"6113c787efc8cd001d11b95c":"-","6113c789efc8cd001d11b971":"-","6113c785efc8cd001d11b944":"-","6113c784efc8cd001d11b93b":"-","6113c787efc8cd001d11b95b":"-","6113c788efc8cd001d11b967":"-","6113c78aefc8cd001d11b979":"-","6113c78aefc8cd001d11b97c":11,"6113c789efc8cd001d11b970":15}","level":"info","timestamp":"2021-08-11 12:50:35"}
{"message":"Apparently logging API died; {"6113c787efc8cd001d11b95c":"-","6113c789efc8cd001d11b971":"-","6113c785efc8cd001d11b944":"-","6113c784efc8cd001d11b93b":"-","6113c787efc8cd001d11b95b":"-","6113c788efc8cd001d11b967":"-","6113c78aefc8cd001d11b979":"-","6113c78aefc8cd001d11b97c":11,"6113c789efc8cd001d11b970":15}","level":"info","timestamp":"2021-08-11 12:50:35"}
 

I can deduce from the logs that Worker #12 was handling event "6113c787efc8cd001d11b95c". I can see the log that the worker was forked @ 12:50:33, but it never logged that it was online. Because of this, it never persisted data to the db where clusterMap is stored (it holds the mappings of worker IDs & event IDs). The exit handler fired @ 12:50:35 for that worker & logged that LOGS_HTTP_API died, because there was no entry in clusterMap matching the worker ID (the same happened with Worker #13 a little while later).
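
As a debugging aid (just a sketch, not something in the code above), a per-fork watchdog would at least surface these cases explicitly instead of having to piece them together from the exit logs:

const ONLINE_TIMEOUT_MS = 10000 // arbitrary; just needs to exceed normal worker startup time

cluster.on('fork', worker => {
    // Flag any worker that forks but never emits 'online'
    const timer = setTimeout(() => {
        logger.info(`[WORKER #${worker.id} STILL NOT ONLINE AFTER ${ONLINE_TIMEOUT_MS}ms (pid: ${worker.process.pid}, dead: ${worker.isDead()})]`)
    }, ONLINE_TIMEOUT_MS)

    worker.once('online', () => clearTimeout(timer))
    worker.once('exit', () => clearTimeout(timer))
})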

Sometimes the workers created after the "faulty" worker report that they're online and work perfectly, but some of them don't. This first happened a couple of days ago, and at that time 14 workers were active rather than 9, so I don't see any limit being hit here.

I'm running this on a small DO box with 1 vCPU. vCPU usage was only around 7% when this happened and memory was at 36%, so as far as I can tell there's no resource constraint.

What could cause a worker to be forked but never come online?