pyspark之填充缺失的时间数据
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Windowspark = SparkSession.builder.appName("test").enableHiveSupport().getOrCreate()df = spark.createDataFrame(
[(1, "a", 10, "2019-09-20", "1"), (2, "a", 20, "2019-09-20", "3"), (3, "a", 5, "2019-09-21", "5"), (4, "b", 8, "2019-09-20", "7"), (5, "b", 9, "2019-09-21", "9"), (6, "b", 16, "2019-09-21", "11")],
["id", "category", "num", "d", "h"])df = df.withColumn("h", df["h"].cast(IntegerType()))
df = df.withColumn("time", F.from_unixtime((F.unix_timestamp(F.col("d"), "yyyy-MM-dd")+F.col("h")*3600)).cast("timestamp"))Last updated