#apache-spark #pyspark #apache-spark-mllib
Вопрос:
Я следую примеру Spark ML здесь,
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
(1.0, Vectors.dense([0.0, 1.1, 0.1])),
(0.0, Vectors.dense([2.0, 1.0, -1.0])),
(0.0, Vectors.dense([2.0, 1.3, 1.0])),
(1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print "LogisticRegression parameters:n" lr.explainParams() "n"
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
Однако model1 = lr.fit(training)
выдает следующее сообщение об ошибке.
---------------------------------------------------------------------------
IllegalArgumentException Traceback (most recent call last)
<ipython-input-14-3e398ce8c8bd> in <module>
1 # Learn a LogisticRegression model. This uses the parameters stored in lr.
----> 2 model1 = lr.fit(training)
C:sparkspark-3.0.2-bin-hadoop2.7pythonpysparkmlbase.py in fit(self, dataset, params)
127 return self.copy(params)._fit(dataset)
128 else:
--> 129 return self._fit(dataset)
130 else:
131 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:sparkspark-3.0.2-bin-hadoop2.7pythonpysparkmlwrapper.py in _fit(self, dataset)
319
320 def _fit(self, dataset):
--> 321 java_model = self._fit_java(dataset)
322 model = self._create_model(java_model)
323 return self._copyValues(model)
C:sparkspark-3.0.2-bin-hadoop2.7pythonpysparkmlwrapper.py in _fit_java(self, dataset)
316 """
317 self._transfer_params_to_java()
--> 318 return self._java_obj.fit(dataset._jdf)
319
320 def _fit(self, dataset):
C:sparkspark-3.0.2-bin-hadoop2.7pythonlibpy4j-0.10.9-src.zippy4jjava_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
C:sparkspark-3.0.2-bin-hadoop2.7pythonpysparksqlutils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
C:sparkspark-3.0.2-bin-hadoop2.7pythonpysparksqlutils.py in raise_from(e)
IllegalArgumentException: requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.
Комментарии:
1. не смешивайте spark.ml и spark.mllib, который, как известно, вызывает такого рода ошибки.
2. @mck, не могли бы вы подробнее рассказать о своих комментариях, как изменить приведенный выше пример? Спасибо.
3. используйте векторы из ml, а не mllib