#apache-spark #pyspark #airflow
Question:
I'm working on a Mac with the following docker-compose file, which includes Airflow and a standalone PySpark cluster:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: apache/airflow:|version|
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_GID - Group ID in Airflow containers
# Default: 50000
#
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Default: ''
#
# Feel free to modify this file to suit your needs.
---
version: "3"
x-airflow-common: &airflow-common
  image: ${AIRFLOW_IMAGE_NAME:-jordi-airflow:2.0.0}
  environment: &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ""
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
    AIRFLOW__CORE__LOAD_EXAMPLES: "false"
    AIRFLOW__API__AUTH_BACKEND: "airflow.api.auth.backend.basic_auth"
    #_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-pip install apache-airflow-providers-apache-spark}
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
  depends_on: &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  redis:
    image: redis:latest
    ports:
      - 6379:6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test:
        [
          "CMD-SHELL",
          'airflow jobs check --job-type SchedulerJob --hostname "${HOSTNAME}"',
        ]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@${HOSTNAME}"'
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    command: version
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: "true"
      _AIRFLOW_WWW_USER_CREATE: "true"
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}

  # flower:
  #   <<: *airflow-common
  #   command: celery flower
  #   ports:
  #     - 5555:5555
  #   healthcheck:
  #     test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
  #     interval: 10s
  #     timeout: 10s
  #     retries: 5
  #   restart: always
  #   depends_on:
  #     <<: *airflow-common-depends-on
  #     airflow-init:
  #       condition: service_completed_successfully

  ######################################################
  # SPARK SERVICES
  ######################################################
  jupyterlab:
    image: andreper/jupyterlab:3.0.0-spark-3.0.0
    container_name: jupyterlab
    ports:
      - 8888:8888
      - 4040:4040
    volumes:
      - shared-workspace:/opt/workspace

  spark-master:
    image: andreper/spark-master:3.0.0
    container_name: spark-master
    ports:
      - 8081:8080
      - 7077:7077
    volumes:
      - shared-workspace:/opt/workspace
    # https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
    mem_reservation: 2g

  spark-worker-1:
    image: andreper/spark-worker:3.0.0
    container_name: spark-worker-1
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8082:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
    mem_reservation: 2g

  # spark-worker-2:
  #   image: andreper/spark-worker:3.0.0
  #   container_name: spark-worker-2
  #   environment:
  #     - SPARK_WORKER_CORES=1
  #     - SPARK_WORKER_MEMORY=512m
  #   ports:
  #     - 8083:8081
  #   volumes:
  #     - shared-workspace:/opt/workspace
  #   depends_on:
  #     - spark-master

volumes:
  postgres-db-volume:
  shared-workspace:
    name: "spark-volume"
    driver: local
    driver_opts:
      type: "none"
      o: "bind"
      device: "/Users/jordicrespoguzman/Projects/custom_airflow_spark/dags"
I can run PySpark code in the jupyterlab container without any problems, but when I run the following DAG:
from airflow import DAG
from airflow.providers.http.sensors.http import HttpSensor
from airflow.sensors.filesystem import FileSensor
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from airflow.operators.email import EmailOperator
from datetime import datetime, timedelta
import csv
import requests
import json

from test import execute_spark

default_args = {
    "owner": "airflow",
    "email_on_failure": False,
    "email_on_retry": False,
    "email": "admin@localhost.com",
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}


def printar():
    print("success!")


with DAG(
    "forex_data_pipeline",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False,
) as dag:

    downloading_rates = PythonOperator(task_id="test1", python_callable=printar)

    operator = SparkSubmitOperator(
        task_id="spark_submit_job",
        conn_id="spark_conn",
        application="/opt/airflow/dags/test.py",
        total_executor_cores="1",
        executor_cores="1",
        executor_memory="2g",
        num_executors="1",
        name="airflow-spark1",
        verbose=False,
        driver_memory="2g",
        conf={
            "spark.master": "spark://spark-master:7077",
            "spark.dynamicAllocation.enabled": "false",
            "spark.shuffle.service.enabled": "false",
        },
    )

    downloading_rates >> operator
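The test.py that the operator submits is not included in the question; purely for orientation, here is a minimal sketch consistent with the driver logs further down (the DataFrame contents and structure are guesses, not the asker's actual script):

# Hypothetical sketch of /opt/airflow/dags/test.py -- the real script is not
# shown in the question; this only mirrors what the driver logs suggest
# (a print statement followed by a DataFrame .show()).
from pyspark.sql import SparkSession


def execute_spark():
    # spark.master is also injected by SparkSubmitOperator via --conf
    spark = SparkSession.builder.appName("pyspark-4").getOrCreate()
    print("ESTOY AQUI??")
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    df.show()  # first action; this is where the job waits for resources
    spark.stop()


if __name__ == "__main__":
    execute_spark()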
I obtain this error from the terminal:
Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
and these are the Airflow logs:
*** Reading local file: /opt/airflow/logs/forex_data_pipeline/spark_submit_job/2021-06-05T00:00:00 00:00/16.log
[2021-06-06 18:17:54,956] {taskinstance.py:851} INFO - Dependencies all met for <TaskInstance: forex_data_pipeline.spark_submit_job 2021-06-05T00:00:00 00:00 [queued]>
[2021-06-06 18:17:55,498] {taskinstance.py:851} INFO - Dependencies all met for <TaskInstance: forex_data_pipeline.spark_submit_job 2021-06-05T00:00:00 00:00 [queued]>
[2021-06-06 18:17:55,528] {taskinstance.py:1042} INFO -
--------------------------------------------------------------------------------
[2021-06-06 18:17:55,553] {taskinstance.py:1043} INFO - Starting attempt 16 of 17
[2021-06-06 18:17:55,616] {taskinstance.py:1044} INFO -
--------------------------------------------------------------------------------
[2021-06-06 18:17:56,000] {taskinstance.py:1063} INFO - Executing <Task(SparkSubmitOperator): spark_submit_job> on 2021-06-05T00:00:00 00:00
[2021-06-06 18:17:56,071] {standard_task_runner.py:52} INFO - Started process 334 to run task
[2021-06-06 18:17:56,310] {standard_task_runner.py:76} INFO - Running: ['airflow', 'tasks', 'run', 'forex_data_pipeline', 'spark_submit_job', '2021-06-05T00:00:00 00:00', '--job-id', '57', '--pool', 'default_pool', '--raw', '--subdir', 'DAGS_FOLDER/dag_spark.py', '--cfg-path', '/tmp/tmplw_gag3t', '--error-file', '/tmp/tmpdp5gkyr2']
[2021-06-06 18:17:56,481] {standard_task_runner.py:77} INFO - Job 57: Subtask spark_submit_job
[2021-06-06 18:17:57,001] {logging_mixin.py:104} INFO - Running <TaskInstance: forex_data_pipeline.spark_submit_job 2021-06-05T00:00:00 00:00 [running]> on host 72eec67484fb
[2021-06-06 18:17:57,428] {taskinstance.py:1257} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_EMAIL=admin@localhost.com
AIRFLOW_CTX_DAG_OWNER=airflow
AIRFLOW_CTX_DAG_ID=forex_data_pipeline
AIRFLOW_CTX_TASK_ID=spark_submit_job
AIRFLOW_CTX_EXECUTION_DATE=2021-06-05T00:00:00 00:00
AIRFLOW_CTX_DAG_RUN_ID=scheduled__2021-06-05T00:00:00 00:00
[2021-06-06 18:17:57,642] {base.py:74} INFO - Using connection to: id: spark_conn. Host: spark://spark-master, Port: 7077, Schema: , Login: marcelo, Password: XXXXXXXX, extra: None
[2021-06-06 18:17:57,676] {spark_submit.py:364} INFO - Spark-Submit cmd: spark-submit --master spark://spark-master:7077 --conf spark.master=spark://spark-master:7077 --conf spark.dynamicAllocation.enabled=false --conf spark.shuffle.service.enabled=false --conf spark.driver.port=8083 --conf spark.blockManager.port=51814 --num-executors 1 --total-executor-cores 1 --executor-cores 1 --executor-memory 2g --driver-memory 2g --name airflow-spark1 /opt/airflow/dags/test.py
[2021-06-06 18:18:25,835] {spark_submit.py:526} INFO - WARNING: An illegal reflective access operation has occurred
[2021-06-06 18:18:25,863] {spark_submit.py:526} INFO - WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/airflow/.local/lib/python3.7/site-packages/pyspark/jars/spark-unsafe_2.12-3.1.2.jar) to constructor java.nio.DirectByteBuffer(long,int)
[2021-06-06 18:18:25,873] {spark_submit.py:526} INFO - WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
[2021-06-06 18:18:25,893] {spark_submit.py:526} INFO - WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
[2021-06-06 18:18:25,975] {spark_submit.py:526} INFO - WARNING: All illegal access operations will be denied in a future release
[2021-06-06 18:18:31,946] {spark_submit.py:526} INFO - 21/06/06 18:18:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[2021-06-06 18:18:45,384] {spark_submit.py:526} INFO - Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
[2021-06-06 18:18:45,641] {spark_submit.py:526} INFO - 21/06/06 18:18:45 INFO SparkContext: Running Spark version 3.1.2
[2021-06-06 18:18:47,179] {spark_submit.py:526} INFO - 21/06/06 18:18:47 INFO ResourceUtils: ==============================================================
[2021-06-06 18:18:47,233] {spark_submit.py:526} INFO - 21/06/06 18:18:47 INFO ResourceUtils: No custom resources configured for spark.driver.
[2021-06-06 18:18:47,265] {spark_submit.py:526} INFO - 21/06/06 18:18:47 INFO ResourceUtils: ==============================================================
[2021-06-06 18:18:47,278] {spark_submit.py:526} INFO - 21/06/06 18:18:47 INFO SparkContext: Submitted application: pyspark-4
[2021-06-06 18:18:47,844] {spark_submit.py:526} INFO - 21/06/06 18:18:47 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 2048, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
[2021-06-06 18:18:48,285] {spark_submit.py:526} INFO - 21/06/06 18:18:48 INFO ResourceProfile: Limiting resource is cpus at 1 tasks per executor
[2021-06-06 18:18:48,388] {spark_submit.py:526} INFO - 21/06/06 18:18:48 INFO ResourceProfileManager: Added ResourceProfile id: 0
[2021-06-06 18:18:50,399] {spark_submit.py:526} INFO - 21/06/06 18:18:50 INFO SecurityManager: Changing view acls to: airflow
[2021-06-06 18:18:50,531] {spark_submit.py:526} INFO - 21/06/06 18:18:50 INFO SecurityManager: Changing modify acls to: airflow
[2021-06-06 18:18:50,537] {spark_submit.py:526} INFO - 21/06/06 18:18:50 INFO SecurityManager: Changing view acls groups to:
[2021-06-06 18:18:50,544] {spark_submit.py:526} INFO - 21/06/06 18:18:50 INFO SecurityManager: Changing modify acls groups to:
[2021-06-06 18:18:50,546] {spark_submit.py:526} INFO - 21/06/06 18:18:50 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(airflow); groups with view permissions: Set(); users with modify permissions: Set(airflow); groups with modify permissions: Set()
[2021-06-06 18:18:55,387] {spark_submit.py:526} INFO - 21/06/06 18:18:55 INFO Utils: Successfully started service 'sparkDriver' on port 8083.
[2021-06-06 18:18:55,784] {spark_submit.py:526} INFO - 21/06/06 18:18:55 INFO SparkEnv: Registering MapOutputTracker
[2021-06-06 18:18:56,441] {spark_submit.py:526} INFO - 21/06/06 18:18:56 INFO SparkEnv: Registering BlockManagerMaster
[2021-06-06 18:18:56,812] {spark_submit.py:526} INFO - 21/06/06 18:18:56 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
[2021-06-06 18:18:56,815] {spark_submit.py:526} INFO - 21/06/06 18:18:56 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
[2021-06-06 18:18:56,904] {spark_submit.py:526} INFO - 21/06/06 18:18:56 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
[2021-06-06 18:18:57,050] {spark_submit.py:526} INFO - 21/06/06 18:18:57 INFO DiskBlockManager: Created local directory at /tmp/blockmgr-1c1ee504-d748-4590-a348-772157d408a3
[2021-06-06 18:18:57,200] {spark_submit.py:526} INFO - 21/06/06 18:18:57 INFO MemoryStore: MemoryStore started with capacity 1048.8 MiB
[2021-06-06 18:18:57,378] {spark_submit.py:526} INFO - 21/06/06 18:18:57 INFO SparkEnv: Registering OutputCommitCoordinator
[2021-06-06 18:19:00,432] {spark_submit.py:526} INFO - 21/06/06 18:19:00 INFO Utils: Successfully started service 'SparkUI' on port 4040.
[2021-06-06 18:19:01,266] {spark_submit.py:526} INFO - 21/06/06 18:19:01 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://72eec67484fb:4040
[2021-06-06 18:19:03,846] {spark_submit.py:526} INFO - 21/06/06 18:19:03 INFO StandaloneAppClient$ClientEndpoint: Connecting to master spark://spark-master:7077...
[2021-06-06 18:19:04,809] {spark_submit.py:526} INFO - 21/06/06 18:19:04 INFO TransportClientFactory: Successfully created connection to spark-master/172.21.0.6:7077 after 543 ms (0 ms spent in bootstraps)
[2021-06-06 18:19:05,705] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO StandaloneSchedulerBackend: Connected to Spark cluster with app ID app-20210606181905-0011
[2021-06-06 18:19:05,729] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 51814.
[2021-06-06 18:19:05,731] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO NettyBlockTransferService: Server created on 72eec67484fb:51814
[2021-06-06 18:19:05,735] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
[2021-06-06 18:19:05,796] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 72eec67484fb, 51814, None)
[2021-06-06 18:19:05,912] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO BlockManagerMasterEndpoint: Registering block manager 72eec67484fb:51814 with 1048.8 MiB RAM, BlockManagerId(driver, 72eec67484fb, 51814, None)
[2021-06-06 18:19:05,930] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 72eec67484fb, 51814, None)
[2021-06-06 18:19:05,934] {spark_submit.py:526} INFO - 21/06/06 18:19:05 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 72eec67484fb, 51814, None)
[2021-06-06 18:19:08,745] {spark_submit.py:526} INFO - 21/06/06 18:19:08 INFO StandaloneSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
[2021-06-06 18:19:13,720] {spark_submit.py:526} INFO - 21/06/06 18:19:13 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('/opt/airflow/spark-warehouse').
[2021-06-06 18:19:13,772] {spark_submit.py:526} INFO - 21/06/06 18:19:13 INFO SharedState: Warehouse path is '/opt/airflow/spark-warehouse'.
[2021-06-06 18:19:23,242] {spark_submit.py:526} INFO - ESTOY AQUI¿¿
[2021-06-06 18:20:03,028] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO CodeGenerator: Code generated in 4421.1717 ms
[2021-06-06 18:20:03,414] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
[2021-06-06 18:20:03,767] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
[2021-06-06 18:20:03,787] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
[2021-06-06 18:20:03,819] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO DAGScheduler: Parents of final stage: List()
[2021-06-06 18:20:03,827] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO DAGScheduler: Missing parents: List()
[2021-06-06 18:20:03,951] {spark_submit.py:526} INFO - 21/06/06 18:20:03 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
[2021-06-06 18:20:05,161] {spark_submit.py:526} INFO - 21/06/06 18:20:05 INFO AsyncEventQueue: Process of event SparkListenerJobStart(0,1623003603820,WrappedArray(org.apache.spark.scheduler.StageInfo@f7ba1aa),{spark.executor.memory=2g, spark.driver.port=8083, spark.driver.memory=2g, spark.master=spark://spark-master:7077, spark.submit.pyFiles=, spark.app.startTime=1623003524952, spark.driver.cores=1, spark.rdd.compress=True, spark.executor.id=driver, spark.app.name=pyspark-4, spark.executor.cores=1, spark.submit.deployMode=client, spark.driver.host=72eec67484fb, spark.app.id=app-20210606181905-0011, spark.shuffle.service.enabled=false, spark.sql.execution.id=0, spark.cores.max=1, spark.blockManager.port=51814, spark.dynamicAllocation.enabled=false, spark.sql.warehouse.dir=/opt/airflow/spark-warehouse, spark.serializer.objectStreamReset=100}) by listener AppStatusListener took 1.1323294s.
[2021-06-06 18:20:07,024] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 11.3 KiB, free 1048.8 MiB)
[2021-06-06 18:20:07,415] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 6.0 KiB, free 1048.8 MiB)
[2021-06-06 18:20:07,440] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 72eec67484fb:51814 (size: 6.0 KiB, free: 1048.8 MiB)
[2021-06-06 18:20:07,465] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1388
[2021-06-06 18:20:07,573] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0) (first 15 tasks are for partitions Vector(0))
[2021-06-06 18:20:07,587] {spark_submit.py:526} INFO - 21/06/06 18:20:07 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks resource profile 0
[2021-06-06 18:20:22,808] {spark_submit.py:526} INFO - 21/06/06 18:20:22 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:20:37,721] {spark_submit.py:526} INFO - 21/06/06 18:20:37 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:20:52,722] {spark_submit.py:526} INFO - 21/06/06 18:20:52 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:21:07,981] {spark_submit.py:526} INFO - 21/06/06 18:21:07 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:21:22,799] {spark_submit.py:526} INFO - 21/06/06 18:21:22 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:21:37,728] {spark_submit.py:526} INFO - 21/06/06 18:21:37 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:21:52,721] {spark_submit.py:526} INFO - 21/06/06 18:21:52 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:22:07,735] {spark_submit.py:526} INFO - 21/06/06 18:22:07 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:22:22,722] {spark_submit.py:526} INFO - 21/06/06 18:22:22 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:22:37,721] {spark_submit.py:526} INFO - 21/06/06 18:22:37 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:22:52,721] {spark_submit.py:526} INFO - 21/06/06 18:22:52 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:23:07,754] {spark_submit.py:526} INFO - 21/06/06 18:23:07 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
[2021-06-06 18:23:22,721] {spark_submit.py:526} INFO - 21/06/06 18:23:22 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
If I go to http://localhost:8081/, I can see that the application ID is always in the WAITING state.
I tried running the spark-submit job from inside the jupyter container and got the same error. I searched for anything related and tried many different things, but nothing works. Maybe I have misconfigured something?
Comments:
1. It looks like you have only one worker and you are not setting spark.executor.instances, which defaults to 2. To check whether that is really the problem, you can either 1) set spark.executor.instances to 1, or 2) add another worker. A sketch of how to apply the setting follows below.
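For reference, an untested sketch of how the suggested setting could be passed through the operator's conf dict in the DAG above; everything not shown is assumed unchanged, and whether the setting has any effect against a standalone master is not verified here:

# Untested sketch: apply the comment's suggestion by passing
# spark.executor.instances explicitly through the operator's conf.
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

operator = SparkSubmitOperator(
    task_id="spark_submit_job",
    conn_id="spark_conn",
    application="/opt/airflow/dags/test.py",
    total_executor_cores="1",
    executor_cores="1",
    executor_memory="2g",
    num_executors="1",           # already translates to --num-executors 1
    name="airflow-spark1",
    verbose=False,
    driver_memory="2g",
    conf={
        "spark.master": "spark://spark-master:7077",
        "spark.dynamicAllocation.enabled": "false",
        "spark.shuffle.service.enabled": "false",
        "spark.executor.instances": "1",  # defaults to 2 per the comment above
    },
)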