agg |
agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame
agg(expr: Column, exprs: Column*): DataFrame
agg(exprs: Map[String, String]): DataFrame
An untyped transformation
|
alias |
alias(alias: String): Dataset[T]
alias(alias: Symbol): Dataset[T]
A typed transformation that is a mere synonym of as.
|
apply |
apply(colName: String): Column
An untyped transformation to select a column based on the column name (i.e. maps a Dataset onto a Column)
|
as |
as(alias: String): Dataset[T]
as(alias: Symbol): Dataset[T]
|
as |
as[U : Encoder]: Dataset[U]
A typed transformation to enforce a type, i.e. to mark the records in the Dataset as being of a given data type (data type conversion). Note that as simply changes the view of the data that is passed into typed operations (e.g. map) and does not eagerly project away any columns that are not present in the specified class.
|
cache |
A basic action that is a mere synonym of persist.
|
checkpoint |
checkpoint(): Dataset[T]
checkpoint(eager: Boolean): Dataset[T]
A basic action to checkpoint the Dataset in a reliable way (using a reliable HDFS-compliant file system, e.g. Hadoop HDFS or Amazon S3)
|
coalesce |
coalesce(numPartitions: Int): Dataset[T]
A typed transformation to repartition a Dataset
|
col |
col(colName: String): Column
An untyped transformation to create a column (reference) based on the column name
|
collect |
|
colRegex |
colRegex(colName: String): Column
An untyped transformation to create a column (reference) based on the column name specified as a regex
|
columns |
|
count |
An action to count the number of rows
|
createGlobalTempView |
createGlobalTempView(viewName: String): Unit
|
createOrReplaceGlobalTempView |
createOrReplaceGlobalTempView(viewName: String): Unit
|
createOrReplaceTempView |
createOrReplaceTempView(viewName: String): Unit
|
createTempView |
createTempView(viewName: String): Unit
|
crossJoin |
crossJoin(right: Dataset[_]): DataFrame
An untyped transformation
|
cube |
cube(cols: Column*): RelationalGroupedDataset
cube(col1: String, cols: String*): RelationalGroupedDataset
An untyped transformation
|
describe |
describe(cols: String*): DataFrame
|
distinct |
A typed transformation that is a mere synonym of dropDuplicates (with all the columns of the Dataset)
|
drop |
drop(colName: String): DataFrame
drop(colNames: String*): DataFrame
drop(col: Column): DataFrame
An untyped transformation
|
dropDuplicates |
dropDuplicates(): Dataset[T]
dropDuplicates(colNames: Array[String]): Dataset[T]
dropDuplicates(colNames: Seq[String]): Dataset[T]
dropDuplicates(col1: String, cols: String*): Dataset[T]
|
dtypes |
dtypes: Array[(String, String)]
|
except |
except(
other: Dataset[T]): Dataset[T]
|
exceptAll |
exceptAll(
other: Dataset[T]): Dataset[T]
(New in 2.4.0) A typed transformation
|
explain |
explain(): Unit
explain(extended: Boolean): Unit
A basic action to display the logical and physical plans of the Dataset (with optional cost and codegen summaries) on the standard output
|
filter |
filter(condition: Column): Dataset[T]
filter(conditionExpr: String): Dataset[T]
filter(func: T => Boolean): Dataset[T]
|
first |
An action that is a mere synonym of head
|
flatMap |
flatMap[U : Encoder](func: T => TraversableOnce[U]): Dataset[U]
|
foreach |
foreach(f: T => Unit): Unit
|
foreachPartition |
foreachPartition(f: Iterator[T] => Unit): Unit
|
groupBy |
groupBy(cols: Column*): RelationalGroupedDataset
groupBy(col1: String, cols: String*): RelationalGroupedDataset
An untyped transformation
|
groupByKey |
groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T]
|
head |
head(): T (1)
head(n: Int): Array[T]
|
hint |
hint(name: String, parameters: Any*): Dataset[T]
A basic action to specify a hint (and optional parameters)
|
inputFiles |
|
intersect |
intersect(other: Dataset[T]): Dataset[T]
|
intersectAll |
intersectAll(other: Dataset[T]): Dataset[T]
(New in 2.4.0) A typed transformation
|
isEmpty |
(New in 2.4.0) A basic action
|
isLocal |
|
isStreaming |
|
join |
join(right: Dataset[_]): DataFrame
join(right: Dataset[_], usingColumn: String): DataFrame
join(right: Dataset[_], usingColumns: Seq[String]): DataFrame
join(right: Dataset[_], usingColumns: Seq[String], joinType: String): DataFrame
join(right: Dataset[_], joinExprs: Column): DataFrame
join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame
An untyped transformation
|
joinWith |
joinWith[U](other: Dataset[U], condition: Column): Dataset[(T, U)]
joinWith[U](other: Dataset[U], condition: Column, joinType: String): Dataset[(T, U)]
|
limit |
limit(n: Int): Dataset[T]
|
localCheckpoint |
localCheckpoint(): Dataset[T]
localCheckpoint(eager: Boolean): Dataset[T]
A basic action to checkpoint the Dataset locally on executors (and therefore unreliably)
|
map |
map[U: Encoder](func: T => U): Dataset[U]
|
mapPartitions |
mapPartitions[U : Encoder](func: Iterator[T] => Iterator[U]): Dataset[U]
|
na |
An untyped transformation
|
orderBy |
orderBy(sortExprs: Column*): Dataset[T]
orderBy(sortCol: String, sortCols: String*): Dataset[T]
|
persist |
persist(): this.type
persist(newLevel: StorageLevel): this.type
A basic action to persist the Dataset
Note
|
Despite its category, persist is not an action in the common sense, i.e. it does not execute anything in a Spark cluster (neither on the driver nor on executors). It acts only as a marker to perform Dataset persistence once an action is really executed.
|
|
printSchema |
|
randomSplit |
randomSplit(weights: Array[Double]): Array[Dataset[T]]
randomSplit(weights: Array[Double], seed: Long): Array[Dataset[T]]
A typed transformation to split a Dataset randomly into multiple Datasets according to the given weights (one Dataset per weight)
|
rdd |
|
reduce |
reduce(func: (T, T) => T): T
An action to reduce the records of the Dataset using the specified binary function.
|
repartition |
repartition(partitionExprs: Column*): Dataset[T]
repartition(numPartitions: Int): Dataset[T]
repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T]
A typed transformation to repartition a Dataset
|
repartitionByRange |
repartitionByRange(partitionExprs: Column*): Dataset[T]
repartitionByRange(numPartitions: Int, partitionExprs: Column*): Dataset[T]
|
rollup |
rollup(cols: Column*): RelationalGroupedDataset
rollup(col1: String, cols: String*): RelationalGroupedDataset
An untyped transformation
|
sample |
sample(withReplacement: Boolean, fraction: Double): Dataset[T]
sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T]
sample(fraction: Double): Dataset[T]
sample(fraction: Double, seed: Long): Dataset[T]
|
schema |
|
select |
select(cols: Column*): DataFrame
select(col: String, cols: String*): DataFrame
select[U1](c1: TypedColumn[T, U1]): Dataset[U1]
select[U1, U2](c1: TypedColumn[T, U1], c2: TypedColumn[T, U2]): Dataset[(U1, U2)]
select[U1, U2, U3](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3]): Dataset[(U1, U2, U3)]
select[U1, U2, U3, U4](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3],
c4: TypedColumn[T, U4]): Dataset[(U1, U2, U3, U4)]
select[U1, U2, U3, U4, U5](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3],
c4: TypedColumn[T, U4],
c5: TypedColumn[T, U5]): Dataset[(U1, U2, U3, U4, U5)]
An (untyped and typed) transformation
|
selectExpr |
selectExpr(exprs: String*): DataFrame
An untyped transformation
|
show |
show(): Unit
show(truncate: Boolean): Unit
show(numRows: Int): Unit
show(numRows: Int, truncate: Boolean): Unit
show(numRows: Int, truncate: Int): Unit
show(numRows: Int, truncate: Int, vertical: Boolean): Unit
|
sort |
sort(sortExprs: Column*): Dataset[T]
sort(sortCol: String, sortCols: String*): Dataset[T]
A typed transformation to sort elements globally (across partitions). Use sortWithinPartitions transformation for partition-local sort
|
sortWithinPartitions |
sortWithinPartitions(sortExprs: Column*): Dataset[T]
sortWithinPartitions(sortCol: String, sortCols: String*): Dataset[T]
A typed transformation to sort elements within partitions (aka local sort). Use sort transformation for global sort (across partitions)
|
stat |
stat: DataFrameStatFunctions
An untyped transformation
|
storageLevel |
storageLevel: StorageLevel
|
summary |
summary(statistics: String*): DataFrame
An action to calculate statistics (e.g. count, mean, stddev, min, max and the 25%, 50%, 75% percentiles)
|
take |
An action to take the first records of a Dataset
|
toDF |
toDF(): DataFrame
toDF(colNames: String*): DataFrame
A basic action to convert a Dataset to a DataFrame
|
toJSON |
|
toLocalIterator |
toLocalIterator(): java.util.Iterator[T]
An action that returns an iterator with all rows in the Dataset. The iterator will consume as much memory as the largest partition in the Dataset.
|
transform |
A typed transformation for chaining custom transformations
|
union |
union(other: Dataset[T]): Dataset[T]
|
unionByName |
unionByName(other: Dataset[T]): Dataset[T]
|
unpersist |
unpersist(): this.type (1)
unpersist(blocking: Boolean): this.type
(1) Uses unpersist with blocking disabled (false)
A basic action to unpersist the Dataset
|
where |
where(condition: Column): Dataset[T]
where(conditionExpr: String): Dataset[T]
|
withColumn |
withColumn(colName: String, col: Column): DataFrame
An untyped transformation
|
withColumnRenamed |
withColumnRenamed(existingName: String, newName: String): DataFrame
An untyped transformation
|
write |
write: DataFrameWriter[T]
A basic action that returns a DataFrameWriter for saving the content of the (non-streaming) Dataset out to an external storage
|