from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf
from pyspark.sql.types import BooleanType,StringType,FloatType
# Build (or reuse) a local SparkSession and load the CSV into a DataFrame.
# header=True takes the first row as column names; inferSchema=True makes
# Spark scan the file to guess column types (costs an extra pass over the data).
spark = SparkSession.builder.appName('test').getOrCreate()

df = spark.read.csv("911.csv", header=True, inferSchema=True)
# 1. Query with Spark SQL
# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("employee")

# Run a plain SQL query against the view and print every row.
all_rows_sql = "select * from employee"
spark.sql(all_rows_sql).show()
# 2. Passing Python variables into SQL
# Flat amount added on top of 10% of the salary for the annual bonus.
add_count = 300

# Option A: interpolate the Python value with an f-string.
# Fine for a trusted constant like this, but open to SQL injection if the
# value ever comes from user input.
sql = f"select `姓名`,`薪资`,`薪资`*0.1+{add_count} as `年终奖` from employee"
spark.sql(sql).show(3)

# Option B: a parameterized query (PySpark 3.4+). The named marker :add_count
# is bound via `args`, avoiding string interpolation entirely.
# NOTE: the original passed add_count=add_count as a kwarg against an
# un-f-string "{add_count}" placeholder — that relies on the PySpark 3.3+
# string-formatting feature and raises TypeError on older versions.
spark.sql(
    "select `姓名`,`薪资`,`薪资`*0.1+:add_count as `年终奖` from employee",
    args={"add_count": add_count},
).show(3)
# 3. Aggregate statistics
sql = """
dept,
count(*) as number,
max(salary) as max_salary,
min(salary) as min_salary,
round(avg(salary),) as avg_salary
from employee
group by dept
"""
spark.sql(sql).show(5)
# 4. User-defined function (UDF)
def compute_bouns(salary):
    """Return the bonus for *salary*: a flat 50% of the salary.

    The original body was not indented under the ``def`` line, which is a
    SyntaxError; only the indentation is fixed here. The name keeps the
    original "bouns" spelling because the spark.udf.register call below
    references it.
    """
    return salary * 0.5
# Expose the Python function to SQL under the name "bonus", returning FloatType.
spark.udf.register("bonus", compute_bouns, FloatType())

# Call the UDF by its REGISTERED name. The original called
# compute_bouns(...) — the Python name, which Spark SQL does not know —
# and passed the string literal 'salary' instead of the salary column `薪资`,
# so every row would have received the constant string, not a number.
spark.sql("select `姓名`,`薪资`,bonus(`薪资`) as `年终奖` from employee").show(3)