package cn.edu360.day8
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
  * Created by zx on 2017/10/16.
  */
object JoinTest {

  /**
   * Demonstrates which join strategy Spark SQL selects when broadcast
   * joins are disabled.
   *
   * Setting "spark.sql.autoBroadcastJoinThreshold" to -1 turns off the
   * broadcast-hash-join optimization (default threshold: 10MB), so the
   * physical plan printed by `explain()` falls back to a shuffle-based
   * join (sort-merge by default in modern Spark).
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CsvDataSource")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // Disable broadcast joins so we can observe the shuffle join in the plan.
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    //spark.conf.set("spark.sql.join.preferSortMergeJoin", true)
    //println(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))

    val df1 = Seq(
      (0, "playing"),
      (1, "with"),
      (2, "join")
    ).toDF("id", "token")

    val df2 = Seq(
      (0, "P"),
      (1, "W"),
      (2, "S")
    ).toDF("aid", "atoken")

    // BUG FIX: the original called `df2.repartition()` and discarded the
    // result. DataFrames are immutable — `repartition` returns a NEW
    // DataFrame — so the call was a silent no-op and has been removed.
    //df1.cache().count()

    val result: DataFrame = df1.join(df2, $"id" === $"aid")

    // Print the physical plan to verify which join strategy was chosen.
    result.explain()
    result.show()

    spark.stop()
  }
}
// NOTE: a duplicate of `spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)`
// was pasted after the closing brace in the original file (with smart quotes);
// it is already applied above, so it is preserved here only as a comment.
// Broadcast join: the size threshold defaults to 10MB; setting it to -1
// disables broadcasting entirely (any other value adjusts the threshold).
// Sort-merge join: sorts both sides on the join key first, then merges —
// this is the default shuffle join strategy in modern Spark.
// Shuffle hash join: hashes the join keys into partitions and joins within
// each partition; used only when sort-merge join is not preferred.










