// Ensemble source code: https://github.com/XXXShao/EnsembleModelingInSpark
// It must be packaged as a jar and imported before it can be used.
// Script: fits an ensemble of several LogisticRegression variants (one of them
// wrapped in a normalizing Pipeline) on a libsvm data set and shows both the
// per-component and the combined predictions.
//
// NOTE(review): `spark` is not defined in this file; presumably the script is
// meant to be pasted into / loaded by spark-shell, where `spark` is predefined.
// Confirm before running it any other way.

// NOTE: some names (Vector, Vectors) are imported more than once below; the
// duplicates are kept exactly as they were.
import Ensemble.{Ensembler,EnsembleModel}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.Normalizer

// Optional: reduce Spark log noise.
//import org.apache.log4j.{Level, Logger}
//Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

// Alternative: a tiny in-memory training set instead of the libsvm file.
// val training = spark.createDataFrame(Seq(
//   (1.0, Vectors.dense(0.0, 1.1, 0.1)),
//   (0.0, Vectors.dense(2.0, 1.0, -1.0)),
//   (0.0, Vectors.dense(2.0, 1.3, 1.0)),
//   (1.0, Vectors.dense(0.0, 1.2, -0.5))
// )).toDF("label", "features")

// Training data, read with Spark's "libsvm" data source.
val training = spark.read.format("libsvm").load("/user/spark/H2O/data/sample_libsvm_data.txt")

// Create a LogisticRegression instance. This instance is an Estimator.
// We may set parameters using setter methods (chained here, as for lr2).
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)

// Second component: LogisticRegression with no parameters set explicitly.
val lr1 = new LogisticRegression()

// Third component: same iteration cap as lr, different regParam.
val lr2 = new LogisticRegression().setMaxIter(10).setRegParam(0.05)

// Fourth component is a pipeline: normalize "features" into "normFeatures"
// (p = 1.0), then run a LogisticRegression on the normalized column.
val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)
val lr30 = new LogisticRegression().setFeaturesCol("normFeatures")
val lr3 = new Pipeline().setStages(Array(normalizer, lr30))

// Ensemble the four components and fit them on the training data.
val ensembling = new Ensembler().setComponents(Array(lr, lr1, lr2, lr3))
val model = ensembling.fit(training)

// Show individual predictions: each fitted component transforms the training
// set on its own. `foreach` is used because `show()` is a pure side effect.
val transformers = model.components.map(t => t.transform(training))
transformers.foreach(_.show())

// Combine models: prediction of the ensemble as a whole.
val prediction = model.transform(training)
prediction.show()

/* Model ensembling after adding XGBoost (disabled alternative to the section above).
   NOTE(review): XGBoostEstimator is not imported anywhere in this file; an
   import would have to be added before enabling this - confirm the package.

val xgb = new XGBoostEstimator(Map("num_class" -> 2, "num_rounds" -> 5, "objective" -> "binary:logistic", "booster" -> "gbtree")).setLabelCol("label").setFeaturesCol("features")

// ensemble models
val ensembling = new Ensembler().setComponents(Array(xgb, lr, lr1, lr2, lr3))
val model = ensembling.fit(training)
val transformers = model.components.map(t => t.transform(training))
transformers.foreach(_.show()) // show individual predictions

// combine models
val prediction = model.transform(training)
prediction.show()
*/