Returns an estimated size of this relation in bytes. This information is used by the planner to decide when it is safe to broadcast a relation and can be overridden by sources that know the size ahead of time. By default, the system will assume that tables are too large to broadcast. This method will be called multiple times during query planning and thus should not perform expensive operations for each invocation.
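A minimal sketch of overriding this method in a custom relation, assuming a hypothetical KnownSizeRelation backed by a single local file; the class name, schema, and scan logic are illustrative, not part of the API:

import java.io.File

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{LongType, StructField, StructType}

// Hypothetical relation backed by a single local file whose size is known up front.
class KnownSizeRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(StructField("value", LongType, nullable = false) :: Nil)

  // Report the on-disk size so the planner can decide whether broadcasting is safe.
  // Kept cheap (a single file stat) because planning may call this several times.
  override def sizeInBytes: Long = new File(path).length()

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(Seq(Row(new File(path).length())))
}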
An alternative to ParquetRelation that plugs in using the data sources API. This class is not currently intended as a full replacement for the Parquet support in Spark SQL, though it is likely to eventually subsume the existing physical plan implementation.
Compared with the current implementation, this class has the following notable differences:
Partitioning: Partitions are auto-discovered and must be in the form of key=value/ directories located at path (an example layout is sketched below). Currently only a single partitioning column is supported, and it must be an integer. This class supports both fully self-describing data, which contains the partition key, and data where the partition key is only present in the folder structure; the presence of the partitioning key in the data is auto-detected. The null partition is not yet supported.

Metadata: The metadata is automatically discovered by reading the first Parquet file present. There is currently no support for working with files that have different schemas. Additionally, when Parquet metadata caching is turned on, the FileStatus objects for all data will be cached to improve the speed of interactive querying. When data is added to a table, it must be dropped and recreated to pick up any changes.
Statistics: Statistics for the size of the table are automatically populated during metadata discovery.
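As an illustration of the partition layout described above, a hedged sketch; the paths, table name, and the parquetFile entry point are assumptions, and which entry point actually routes to this class depends on the Spark version and configuration:

import org.apache.spark.sql.SQLContext

// Assumed on-disk layout rooted at /data/events, with a single integer
// partition column "key" encoded in the directory names:
//
//   /data/events/key=1/part-00000.parquet
//   /data/events/key=2/part-00000.parquet
//
// The partition column may appear in the files themselves or only in the
// folder structure; its presence is auto-detected during discovery.
val sqlContext: SQLContext = ???  // obtained from an existing SparkContext

val events = sqlContext.parquetFile("/data/events")  // partitions key=1 and key=2 are discovered
events.registerTempTable("events")
sqlContext.sql("SELECT * FROM events WHERE key = 1").collect()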