https://www.bilibili.com/video/BV1Xz4y1m7cv?p=60
需求:使用 SparkSQL 的 SQL 和 DSL 两种方式完成 WordCount
// 代码实现 (implementation):
package sql

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * WordCount demo implemented two ways with Spark SQL:
 * a raw SQL query over a temp view, and the typed/untyped DSL.
 */
object Demo05_WordCount {
  def main(args: Array[String]): Unit = {
    // TODO 0. Environment: local SparkSession, quiet logging.
    val spark = SparkSession.builder().appName("sparksql").master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // TODO 1. Load data.
    // read.text() yields an untyped DataFrame (rows of `value: string`),
    // read.textFile() yields a typed Dataset[String].
    // NOTE: original had `Dataframe` — wrong capitalization, does not compile.
    val df: DataFrame = spark.read.text("data/person.txt")
    val ds: Dataset[String] = spark.read.textFile("data/person.txt")
    df.printSchema()
    df.show()
    ds.printSchema()
    ds.show()

    // TODO 2. Process data.
    // df.flatMap(_.split(" ")) would not work: a DataFrame's rows are untyped Row,
    // so String#split is only available on the Dataset[String].
    val words: Dataset[String] = ds.flatMap(_.split(" "))
    words.printSchema()
    words.show()

    // TODO ===== SQL =====
    // BUG FIX: the original used createOrReplaceGlobalTempView, which registers the
    // view in the `global_temp` database — the query below ("from t_words") would then
    // fail with "table or view not found". A session-scoped temp view matches the query.
    words.createOrReplaceTempView("t_words")
    val sql: String =
      """
        |select value, count(*) as counts
        |from t_words
        |group by value
        |order by counts desc
      """.stripMargin
    spark.sql(sql).show()

    // TODO ===== DSL =====
    words.groupBy('value)
      .count()               // appends a column literally named "count"
      .orderBy('count.desc)
      .show()

    // TODO 3. Output: results are printed via show() above, nothing written to disk.
    // TODO 4. Release resources.
    spark.close()
  }

  // Unused in this demo; kept for parity with sibling demos that parse person.txt into rows.
  case class Person(id: Int, name: String, age: Int)
}
// 演示 (demo)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)