There are several ways to rename the columns of a PySpark DataFrame:
- Option 1. Using selectExpr.
# Build a two-column demo DataFrame, then rename both columns in one
# pass with selectExpr ("old as new" SQL-style projections).
data = sqlContext.createDataFrame(
    [("Pratik", 21), ("Dakota", 20)],
    ["col1", "col2"],
)
data.show()
data.printSchema()
# Output
# +-------+----+
# |   col1|col2|
# +-------+----+
# | Pratik|  21|
# | Dakota|  20|
# +-------+----+
# root
#  |-- col1: string (nullable = true)
#  |-- col2: long (nullable = true)

df = data.selectExpr("col1 as name", "col2 as age")
df.show()
df.printSchema()
# Output
# +-------+---+
# |   name|age|
# +-------+---+
# | Pratik| 21|
# | Dakota| 20|
# +-------+---+
# root
#  |-- name: string (nullable = true)
#  |-- age: long (nullable = true)

# Option 2. Using withColumnRenamed — note that this method also allows
# you to "overwrite" (rename onto) an existing column name.
# Rename every column by folding withColumnRenamed over (old, new) pairs.
# Fixes for Python 3: reduce() is no longer a builtin (it lives in
# functools), and xrange() does not exist — zip() pairs the names directly.
from functools import reduce

oldColumns = data.schema.names
newColumns = ["name", "age"]
df = reduce(
    lambda acc, names: acc.withColumnRenamed(names[0], names[1]),
    zip(oldColumns, newColumns),
    data,  # initial accumulator: the original DataFrame
)
df.printSchema()
df.show()
# Option 3. Rename via select() with col(...).alias(...).
# Import only the name we need: `from pyspark.sql.functions import *`
# pollutes the namespace and shadows builtins such as `sum` and `abs`.
from pyspark.sql.functions import col

data = data.select(col("col1").alias("name"), col("col2").alias("age"))
data.show()
# Output
# +-------+---+
# |   name|age|
# +-------+---+
# | Pratik| 21|
# | Dakota| 20|
# +-------+---+
# Option 4. Register the DataFrame as a temporary table, then rename the
# columns with an ordinary SQL "SELECT old AS new" projection.
sqlContext.registerDataFrameAsTable(data, "myTable")
df2 = sqlContext.sql("SELECT col1 AS name, col2 as age from myTable")
df2.show()
# Output
# +-------+---+
# |   name|age|
# +-------+---+
# | Pratik| 21|
# | Dakota| 20|
# +-------+---+
No comments:
Post a Comment