I am trying to evaluate my logistic regression model by retrieving the "bestModel" selected by k-fold cross-validation, which is also fully retrained on the whole train_df.
This is my Pipeline:
# This function defines the general pipeline for logistic regression
def lr_pipeline(train, 
                numerical_feature, 
                target_variable, 
                with_std=True,
                with_mean=True,
                k_fold=5):
    from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml import Pipeline
    # Create stages list
    stages = [] 
    # Create the logistic regression estimator
    # (weightCol refers to a pre-computed class-weight column in train_df)
    log_reg = LogisticRegression(featuresCol=numerical_feature, labelCol=target_variable, maxIter=100, weightCol="classWeigth")
    # Add the logistic regression estimator as the last pipeline stage
    stages += [log_reg]
    # Set up the pipeline
    pipeline = Pipeline(stages=stages)
    # A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    # We use a ParamGridBuilder to construct the grid of parameters to search over.
    # With 3 values for log_reg.regParam ($\lambda$) and 3 values for log_reg.elasticNetParam ($\alpha$),
    # the grid has 3 x 3 = 9 parameter settings for CrossValidator to choose from.
    param_grid = ParamGridBuilder() \
        .addGrid(log_reg.regParam, [0.0, 0.05, 0.1]) \
        .addGrid(log_reg.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()
    
    cross_val = CrossValidator(estimator=pipeline, 
                               estimatorParamMaps=param_grid,
                               evaluator=BinaryClassificationEvaluator(metricName="areaUnderROC"), # default = "areaUnderROC", alternatively "areaUnderPR"
                               numFolds=k_fold,
                               collectSubModels=True # this flag allows us to store ALL the models trained during k-fold cross validation
                               )
    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(train)
    return cv_model
from pyspark.ml.tuning import CrossValidatorModel
cv_model = lr_pipeline(train_df, "word2vect", "label")
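For reference, reading the aggregated cross-validation results through the CrossValidatorModel API works as I expect; this is a minimal sketch of how I inspect the mean fold metrics and the winning grid point (best_idx is just a helper name of mine, and I am assuming avgMetrics is ordered like getEstimatorParamMaps()):

import numpy as np

# `avgMetrics` holds one mean areaUnderROC per ParamMap in the grid,
# in the same order as `getEstimatorParamMaps()`.
avg_metrics = cv_model.avgMetrics
param_maps = cv_model.getEstimatorParamMaps()

# BinaryClassificationEvaluator: higher areaUnderROC is better.
best_idx = int(np.argmax(avg_metrics))
print("Best mean ROC AUC across folds: {:.3f}".format(avg_metrics[best_idx]))
for param, value in param_maps[best_idx].items():
    print("{}: {}".format(param.name, value))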
When I then try to access the model summary:
training_result = cv_model.bestModel.stages[-1].summary
to read the training metrics of the best model selected by the k-fold search, I get the following error:
RuntimeError                              Traceback (most recent call last)
<ipython-input-27-923b8532c6cf> in <module>()
      1 # `bestModel` is the best resulting model according to k-fold cross validation, which is also entirely retrained on the whole `train_df`
----> 2 training_result = cv_model.bestModel.stages[-1].summary
      3 print("***** Training Set *****")
      4 print("Area Under ROC Curve (ROC AUC): {:.3f}".format(training_result.areaUnderROC))
      5 print("***** Training Set *****")
/usr/local/lib/python3.7/dist-packages/pyspark/ml/classification.py in summary(self)
   1250         else:
   1251             raise RuntimeError("No training summary available for this %s" %
-> 1252                                self.__class__.__name__)
   1253 
   1254     def evaluate(self, dataset):
RuntimeError: No training summary available for this LogisticRegressionModel
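All I really need is the training-set metric of the refit best model. As far as I can tell from the docs, hasSummary and evaluate() exist on LogisticRegressionModel, so a workaround sketch (untested, and assuming train_df already contains the "word2vect" and "label" columns, as above) might be:

best_lr = cv_model.bestModel.stages[-1]  # the LogisticRegressionModel stage
print("Training summary attached:", best_lr.hasSummary)

# evaluate() recomputes a BinaryLogisticRegressionSummary on any DataFrame,
# so it should not depend on a summary having been cached at fit time.
eval_summary = best_lr.evaluate(train_df)
print("Training Set ROC AUC: {:.3f}".format(eval_summary.areaUnderROC))

But I would still like to understand the error itself.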
Can anyone help me understand why no training summary is available on the best model?