Spark SQL in Practice: A SparkSQL Example

1. Requirement:

Use Spark SQL to read files and query them as data tables.
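
The example assumes four CSV files under /users/macbook/TestInfo/, one per table and without header rows; the columns mirror the case classes defined in the code below:

Student.csv: sno, sname, ssex, sbirthday, sclass
Course.csv:  cno, cname, tno
Score.csv:   sno, cno, degree
Teacher.csv: tno, tname, tsex, tbirthday, tprof, tdepart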

2. Code:

(1)pom.xml
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
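
If the project is built with Maven itself (rather than only run from the IDE), a Scala compiler plugin such as scala-maven-plugin is typically also needed. A minimal sketch; the plugin version shown is an assumption, pick one that matches your Scala 2.11 toolchain:

<build>
    <plugins>
        <!-- Compiles src/main/scala (plugin version is an assumed example) -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>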

(2)SparkSQLExample.scala
package spark.sqlshizhan

import java.text.SimpleDateFormat

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
 * @ClassName SparkSQLExample
 * @MethodDesc: TODO describe what SparkSQLExample does
 * @Author Movle
 * @Date 5/18/20 9:34 PM
 * @Version 1.0
 * @Email movle_xjk@foxmail.com
 **/
case class Student1(sno: String, sname: String, ssex: String, sbirthday: String, sclass: String)

case class Course(cno: String, cname: String, tno: String)

case class Score(sno: String, cno: String, degree: String)

case class Teacher(tno: String, tname: String, tsex: String, tbirthday: String, tprof: String, tdepart: String)

object SparkSQLExample {

  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "/Users/macbook/Documents/hadoop/hadoop-2.8.4")

    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    //Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Format the current time with the given pattern, e.g. getDate("yyyy") returns the current year
    def getDate(time: String) = {
      val now: Long = System.currentTimeMillis()
      val df: SimpleDateFormat = new SimpleDateFormat(time)
      df.format(now)
    }

    val spark = SparkSession.builder().master("local").appName("SparkSQLExample").getOrCreate()

    import spark.implicits._

    // Register each CSV file as a temporary view.
    // Note: the view is registered as "Student" so that it matches the table name used in the queries below.
    spark.sparkContext.textFile("/users/macbook/TestInfo/Student.csv")
      .map(_.split(","))
      .map(x => Student1(x(0), x(1), x(2), x(3), x(4)))
      .toDF
      .createOrReplaceTempView("Student")

    spark.sparkContext.textFile("/users/macbook/TestInfo/Course.csv")
      .map(_.split(","))
      .map(x => Course(x(0), x(1), x(2)))
      .toDF
      .createOrReplaceTempView("Course")

    spark.sparkContext.textFile("/users/macbook/TestInfo/Score.csv")
      .map(_.split(","))
      .map(x => Score(x(0), x(1), x(2)))
      .toDF
      .createOrReplaceTempView("Score")

    spark.sparkContext.textFile("/users/macbook/TestInfo/Teacher.csv")
      .map(_.split(","))
      .map(x => Teacher(x(0), x(1), x(2), x(3), x(4), x(5)))
      .toDF
      .createOrReplaceTempView("Teacher")


    // Query the entire Teacher table
    spark.sql("select * from Teacher").show()

    // Query the sname, ssex and sclass columns of every record in the Student table
    spark.sql("select sname,ssex,sclass from student").show()

    // Query the distinct tdepart values in the Teacher table
    spark.sql("select distinct tdepart from teacher").show(false)
    spark.sql("select tdepart from teacher group by tdepart").show(false)

    // Query all Score records with a degree between 60 and 80
    spark.sql("select * from score where degree >= 60 and degree <= 80").show()
    spark.sql("select * from score where degree between 60 and 80").show()

    // Query Score records whose degree is 85, 86 or 88
    spark.sql("select * from score where degree = '85' or degree='86' OR degree='88'").show()
    spark.sql("select * from score where degree =85 or degree=86 OR degree=88").show()

    // Query students ordered by sclass descending, then ascending
    spark.sql("select * from student order by sclass desc").show()
    spark.sql("select * from student order by sclass").show()

    // Query the Score table ordered by cno ascending and degree descending
    spark.sql("select * from score t order by t.cno asc, t.degree desc").show()

    // Query the student number and course number of the highest score in the Score table
    spark.sql("select * from score order by Int(degree) desc limit 1").show()
    spark.sql("select * from score order by Int(degree) desc").show()

    // Query the average degree of each course
    spark.sql("select cno,avg(degree) from score group by cno").show()

    // Query the average degree of courses that are taken by at least 5 students and whose cno starts with 3
    spark.sql("select cno,avg(degree) from score where cno like '3%' group by cno having count(cno) >= 5").show()

    // Query sname, cname and degree for all students
    spark.sql("select s.sname, t.degree,c.cname from score t " +
      "join student s on t.sno=s.sno " +
      "join course c on c.cno=t.cno").show(false)

    // Among students who take more than one course, query the Score records whose degree is not the highest score
    spark.sql("select * from score where " +
      "sno in (select sno from score t group by t.sno having count(1) > 1) " +
      " and degree != (select max(degree) from score)").show()

    // Query sno, sname and sbirthday of all students born in the same year as student 108
    spark.sql("select sno,sname,sbirthday from student where substring(sbirthday,0,4) = (" +
      " select substring(t.sbirthday,0,4) from student t where sno='108')").show()

    // Query the names of teachers whose course is taken by more than 5 students
    spark.sql("select tname from teacher e " +
      " join course c on e.tno = c.tno " +
      " join (select cno from score group by cno having count(cno) > 5) t on c.cno = t.cno").show()

    // Query the Score records whose degree is below the average degree of the corresponding course
    spark.sql("select s.* from score s where s.degree < (select avg(degree) from score c where s.cno = c.cno)").show()

    // Query tname and tdepart of all teachers who have not taught any course (no scores recorded for their courses)
    spark.sql("select tname , tdepart from teacher t where t.tno not in (select tno from course c where c.cno in (select cno from score))").show(false)

    // Query class numbers that have at least 2 male students
    spark.sql("select sclass from student t where ssex='male' group by sclass having count(ssex) >= 2").show()

    // Query Student records whose name does not start with Wang
    spark.sql("select * from student t where sname not like('Wang%')").show()

    // Query each student's name and age (current year minus birth year)
    spark.sql("select sname, (cast(" + getDate("yyyy") + " as int) - cast(substring(sbirthday,0,4) as int)) as age from student t").show()

    spark.close()
  }

}
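
As a side note, instead of splitting each line by hand, Spark 2.x can also parse the CSV files with its built-in reader. A minimal sketch for the Student table, assuming the file has no header row (column names are assigned explicitly):

// Alternative: let Spark's CSV reader parse the file instead of textFile + split.
val studentDF = spark.read
  .option("header", "false")
  .csv("/users/macbook/TestInfo/Student.csv")
  .toDF("sno", "sname", "ssex", "sbirthday", "sclass")

studentDF.createOrReplaceTempView("Student")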

3. Results:
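
Each spark.sql(...).show() call above prints its result table to the console when the program is run.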
