## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements. See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the "License"); you may not use this file except in compliance with# the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.#"""A wrapper class for Spark DataFrame to behave like pandas DataFrame."""fromcollectionsimportdefaultdict,namedtuplefromcollections.abcimportMappingimportreimportwarningsimportinspectimportjsonimporttypesfromfunctoolsimportpartial,reduceimportsysfromitertoolsimportzip_longest,chainfromtypesimportTracebackTypefromtypingimport(Any,Callable,Dict,Generic,IO,Iterable,Iterator,List,Optional,Sequence,Tuple,Type,Union,cast,no_type_check,TYPE_CHECKING,)importdatetimeimportnumpyasnpimportpandasaspdfrompandas.api.typesimport(# type: ignore[attr-defined]is_bool_dtype,is_list_like,is_dict_like,is_scalar,)frompandas.tseries.frequenciesimportDateOffset,to_offsetifTYPE_CHECKING:frompandas.io.formats.styleimportStylerfrompandas.core.dtypes.commonimportinfer_dtype_from_objectfrompandas.core.accessorimportCachedAccessorfrompandas.core.dtypes.inferenceimportis_sequencefrompyspark.errorsimportPySparkValueErrorfrompysparkimportStorageLevelfrompyspark.sqlimportColumnasPySparkColumn,DataFrameasPySparkDataFrame,functionsasFfrompyspark.sql.functionsimportpandas_udffrompyspark.sql.typesimport(ArrayType,BooleanType,DataType,DoubleType,NumericType,Row,StringType,StructField,StructType,DecimalType,TimestampType,TimestampNTZType,NullType,)frompyspark.sql.windowimportWindowfrompysparkimportpandasasps# For running doctests and reference resolution in 
PyCharm.frompyspark.pandas._typingimport(Axis,DataFrameOrSeries,Dtype,Label,Name,Scalar,T,)frompyspark.pandas.accessorsimportPandasOnSparkFrameMethodsfrompyspark.pandas.configimportoption_context,get_optionfrompyspark.pandas.correlationimport(compute,CORRELATION_VALUE_1_COLUMN,CORRELATION_VALUE_2_COLUMN,CORRELATION_CORR_OUTPUT_COLUMN,CORRELATION_COUNT_OUTPUT_COLUMN,)frompyspark.pandas.sparkimportfunctionsasSFfrompyspark.pandas.spark.accessorsimportSparkFrameMethods,CachedSparkFrameMethodsfrompyspark.pandas.utilsimport(align_diff_frames,column_labels_level,combine_frames,default_session,is_name_like_tuple,is_name_like_value,is_testing,name_like_string,same_anchor,scol_for,validate_arguments_and_invoke_function,validate_axis,validate_bool_kwarg,validate_how,validate_mode,verify_temp_column_name,log_advice,)frompyspark.pandas.genericimportFramefrompyspark.pandas.internalimport(InternalField,InternalFrame,HIDDEN_COLUMNS,NATURAL_ORDER_COLUMN_NAME,SPARK_INDEX_NAME_FORMAT,SPARK_DEFAULT_INDEX_NAME,SPARK_DEFAULT_SERIES_NAME,SPARK_INDEX_NAME_PATTERN,)frompyspark.pandas.missing.frameimportMissingPandasLikeDataFramefrompyspark.pandas.typedef.typehintsimport(as_spark_type,infer_return_type,pandas_on_spark_type,spark_type_to_pandas_dtype,DataFrameType,SeriesType,ScalarType,create_tuple_for_frame_type,)frompyspark.pandas.plotimportPandasOnSparkPlotAccessorfrompyspark.sql.utilsimportget_column_class,get_dataframe_classifTYPE_CHECKING:frompyspark.sql._typingimportOptionalPrimitiveTypefrompyspark.pandas.groupbyimportDataFrameGroupByfrompyspark.pandas.resampleimportDataFrameResamplerfrompyspark.pandas.indexesimportIndexfrompyspark.pandas.seriesimportSeries# These regular expression patterns are compiled and defined here to avoid compiling the same# pattern every time it is used in _repr_ and _repr_html_ in DataFrame.# Two patterns basically seek the footer string from Pandas'REPR_PATTERN=re.compile(r"\n\n\[(?P<rows>[0-9]+) rows x (?P<columns>[0-9]+) columns\]$")REPR_HTML_PATTERN=re.compile(r"\n\<p\>(?P<rows>[0-9]+) rows × (?P<columns>[0-9]+) columns\<\/p\>\n\<\/div\>$")_flex_doc_FRAME="""Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).Equivalent to ``{equiv}``. With the reverse version, `{reverse}`.Among flexible wrappers (`add`, `sub`, `mul`, `div`) toarithmetic operators: `+`, `-`, `*`, `/`, `//`.Parameters----------other : scalar Any single dataReturns-------DataFrame Result of the arithmetic operation.Examples-------->>> df = ps.DataFrame({{'angles': [0, 3, 4],... 'degrees': [360, 180, 360]}},... index=['circle', 'triangle', 'rectangle'],... columns=['angles', 'degrees'])>>> df angles degreescircle 0 360triangle 3 180rectangle 4 360Add a scalar with operator version which returns the sameresults. 
Also, the reverse version.>>> df + 1 angles degreescircle 1 361triangle 4 181rectangle 5 361>>> df.add(1) angles degreescircle 1 361triangle 4 181rectangle 5 361>>> df.add(df) angles degreescircle 0 720triangle 6 360rectangle 8 720>>> df + df + df angles degreescircle 0 1080triangle 9 540rectangle 12 1080>>> df.radd(1) angles degreescircle 1 361triangle 4 181rectangle 5 361Divide and true divide by constant with reverse version.>>> df / 10 angles degreescircle 0.0 36.0triangle 0.3 18.0rectangle 0.4 36.0>>> df.div(10) angles degreescircle 0.0 36.0triangle 0.3 18.0rectangle 0.4 36.0>>> df.rdiv(10) angles degreescircle inf 0.027778triangle 3.333333 0.055556rectangle 2.500000 0.027778>>> df.truediv(10) angles degreescircle 0.0 36.0triangle 0.3 18.0rectangle 0.4 36.0>>> df.rtruediv(10) angles degreescircle inf 0.027778triangle 3.333333 0.055556rectangle 2.500000 0.027778Subtract by constant with reverse version.>>> df - 1 angles degreescircle -1 359triangle 2 179rectangle 3 359>>> df.sub(1) angles degreescircle -1 359triangle 2 179rectangle 3 359>>> df.rsub(1) angles degreescircle 1 -359triangle -2 -179rectangle -3 -359Multiply by constant with the reverse version.>>> df * 1 angles degreescircle 0 360triangle 3 180rectangle 4 360>>> df.mul(1) angles degreescircle 0 360triangle 3 180rectangle 4 360>>> df.rmul(1) angles degreescircle 0 360triangle 3 180rectangle 4 360Floor Divide by constant with reverse version.>>> df // 10 angles degreescircle 0.0 36.0triangle 0.0 18.0rectangle 0.0 36.0>>> df.floordiv(10) angles degreescircle 0.0 36.0triangle 0.0 18.0rectangle 0.0 36.0>>> df.rfloordiv(10) # doctest: +SKIP angles degreescircle inf 0.0triangle 3.0 0.0rectangle 2.0 0.0Mod by constant with reverse version.>>> df % 2 angles degreescircle 0 0triangle 1 0rectangle 0 0>>> df.mod(2) angles degreescircle 0 0triangle 1 0rectangle 0 0>>> df.rmod(2) angles degreescircle NaN 2triangle 2.0 2rectangle 2.0 2Power by constant with reverse version.>>> df ** 2 angles degreescircle 0.0 129600.0triangle 9.0 32400.0rectangle 16.0 129600.0>>> df.pow(2) angles degreescircle 0.0 129600.0triangle 9.0 32400.0rectangle 16.0 129600.0>>> df.rpow(2) angles degreescircle 1.0 2.348543e+108triangle 8.0 1.532496e+54rectangle 16.0 2.348543e+108"""
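# Illustrative sketch (not part of the original source): this template is applied
# to the flexible arithmetic wrappers defined further down in this module,
# presumably by formatting the placeholders per method, roughly as follows
# (shown here for ``add``; the exact call sites live next to each wrapper):
#
#   DataFrame.add.__doc__ = _flex_doc_FRAME.format(
#       desc="Addition",
#       op_name="add",
#       equiv="dataframe + other",
#       reverse="radd",
#   )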
[docs]classDataFrame(Frame,Generic[T]):""" pandas-on-Spark DataFrame that corresponds to pandas DataFrame logically. This holds Spark DataFrame internally. :ivar _internal: an internal immutable Frame to manage metadata. :type _internal: InternalFrame Parameters ---------- data : numpy ndarray (structured or homogeneous), dict, pandas DataFrame, Spark DataFrame, pandas-on-Spark DataFrame or pandas-on-Spark Series. Dict can contain Series, arrays, constants, or list-like objects index : Index or array-like Index to use for the resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided columns : Index or array-like Column labels to use for the resulting frame. Will default to RangeIndex (0, 1, 2, ..., n) if no column labels are provided dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input .. versionchanged:: 3.4.0 Since 3.4.0, it deals with `data` and `index` in this approach: 1, when `data` is a distributed dataset (Internal DataFrame/Spark DataFrame/ pandas-on-Spark DataFrame/pandas-on-Spark Series), it will first parallelize the `index` if necessary, and then try to combine the `data` and `index`; Note that if `data` and `index` doesn't have the same anchor, then `compute.ops_on_diff_frames` should be turned on; 2, when `data` is a local dataset (Pandas DataFrame/numpy ndarray/list/etc), it will first collect the `index` to driver if necessary, and then apply the `pandas.DataFrame(...)` creation internally; Examples -------- Constructing DataFrame from a dictionary. >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> df = ps.DataFrame(data=d, columns=['col1', 'col2']) >>> df col1 col2 0 1 3 1 2 4 Constructing DataFrame from pandas DataFrame >>> df = ps.DataFrame(pd.DataFrame(data=d, columns=['col1', 'col2'])) >>> df col1 col2 0 1 3 1 2 4 Notice that the inferred dtype is int64. >>> df.dtypes col1 int64 col2 int64 dtype: object To enforce a single dtype: >>> df = ps.DataFrame(data=d, dtype=np.int8) >>> df.dtypes col1 int8 col2 int8 dtype: object Constructing DataFrame from numpy ndarray: >>> import numpy as np >>> ps.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]), ... columns=['a', 'b', 'c', 'd', 'e']) a b c d e 0 1 2 3 4 5 1 6 7 8 9 0 Constructing DataFrame from numpy ndarray with Pandas index: >>> import numpy as np >>> import pandas as pd >>> ps.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]), ... index=pd.Index([1, 4]), columns=['a', 'b', 'c', 'd', 'e']) a b c d e 1 1 2 3 4 5 4 6 7 8 9 0 Constructing DataFrame from numpy ndarray with pandas-on-Spark index: >>> import numpy as np >>> import pandas as pd >>> ps.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]), ... index=ps.Index([1, 4]), columns=['a', 'b', 'c', 'd', 'e']) a b c d e 1 1 2 3 4 5 4 6 7 8 9 0 Constructing DataFrame from Pandas DataFrame with Pandas index: >>> import numpy as np >>> import pandas as pd >>> pdf = pd.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]), ... columns=['a', 'b', 'c', 'd', 'e']) >>> ps.DataFrame(data=pdf, index=pd.Index([1, 4])) a b c d e 1 6.0 7.0 8.0 9.0 0.0 4 NaN NaN NaN NaN NaN Constructing DataFrame from Pandas DataFrame with pandas-on-Spark index: >>> import numpy as np >>> import pandas as pd >>> pdf = pd.DataFrame(data=np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]), ... 
columns=['a', 'b', 'c', 'd', 'e']) >>> ps.DataFrame(data=pdf, index=ps.Index([1, 4])) a b c d e 1 6.0 7.0 8.0 9.0 0.0 4 NaN NaN NaN NaN NaN Constructing DataFrame from Spark DataFrame with Pandas index: >>> import pandas as pd >>> sdf = spark.createDataFrame([("Data", 1), ("Bricks", 2)], ["x", "y"]) >>> with ps.option_context("compute.ops_on_diff_frames", False): ... ps.DataFrame(data=sdf, index=pd.Index([0, 1, 2])) Traceback (most recent call last): ... ValueError: Cannot combine the series or dataframe...'compute.ops_on_diff_frames' option. Enable 'compute.ops_on_diff_frames' to combine SparkDataFrame and Pandas index >>> with ps.option_context("compute.ops_on_diff_frames", True): ... ps.DataFrame(data=sdf, index=pd.Index([0, 1, 2])) x y 0 Data 1.0 1 Bricks 2.0 2 None NaN Constructing DataFrame from Spark DataFrame with pandas-on-Spark index: >>> import pandas as pd >>> sdf = spark.createDataFrame([("Data", 1), ("Bricks", 2)], ["x", "y"]) >>> with ps.option_context("compute.ops_on_diff_frames", False): ... ps.DataFrame(data=sdf, index=ps.Index([0, 1, 2])) Traceback (most recent call last): ... ValueError: Cannot combine the series or dataframe...'compute.ops_on_diff_frames' option. Enable 'compute.ops_on_diff_frames' to combine Spark DataFrame and pandas-on-Spark index >>> with ps.option_context("compute.ops_on_diff_frames", True): ... ps.DataFrame(data=sdf, index=ps.Index([0, 1, 2])) x y 0 Data 1.0 1 Bricks 2.0 2 None NaN """def__init__(# type: ignore[no-untyped-def]self,data=None,index=None,columns=None,dtype=None,copy=False):SparkDataFrame=get_dataframe_class()index_assigned=Falseifisinstance(data,InternalFrame):assertcolumnsisNoneassertdtypeisNoneassertnotcopyifindexisNone:internal=dataelifisinstance(data,SparkDataFrame):assertcolumnsisNoneassertdtypeisNoneassertnotcopyifindexisNone:internal=InternalFrame(spark_frame=data,index_spark_columns=None)elifisinstance(data,ps.DataFrame):assertcolumnsisNoneassertdtypeisNoneassertnotcopyifindexisNone:internal=data._internalelifisinstance(data,ps.Series):assertdtypeisNoneassertnotcopy# For pandas compatibility when `columns` contains only one valid column.ifcolumnsisnotNone:assertisinstance(columns,(dict,list,tuple))assertlen(columns)==1columns=list(columns.keys())ifisinstance(columns,dict)elsecolumnsassertcolumns[0]==data._internal.data_spark_column_names[0]ifindexisNone:internal=data.to_frame()._internalelse:frompyspark.pandas.indexes.baseimportIndexifindexisnotNoneandisinstance(index,Index):# with local data, collect ps.Index to driver# to avoid mismatched results between# ps.DataFrame([1, 2], index=ps.Index([1, 2]))# and# pd.DataFrame([1, 2], index=pd.Index([1, 2]))index=index._to_pandas()pdf=pd.DataFrame(data=data,index=index,columns=columns,dtype=dtype,copy=copy)internal=InternalFrame.from_pandas(pdf)index_assigned=TrueifindexisnotNoneandnotindex_assigned:# TODO(SPARK-40226): Support MultiIndexifisinstance(index,(ps.MultiIndex,pd.MultiIndex)):raiseValueError("Cannot combine a Distributed Dataset with a 
MultiIndex")data_df=ps.DataFrame(data=data,index=None,columns=columns,dtype=dtype,copy=copy)index_ps=ps.Index(index)index_df=index_ps.to_frame()ifsame_anchor(data_df,index_df):data_labels=data_df._internal.column_labelsdata_pssers=[data_df._psser_for(label)forlabelindata_labels]index_labels=index_df._internal.column_labelsindex_pssers=[index_df._psser_for(label)forlabelinindex_labels]internal=data_df._internal.with_new_columns(data_pssers+index_pssers)combined=ps.DataFrame(internal).set_index(index_labels)combined.index.name=index_ps.nameelse:# drop un-matched rows in `data`# note that `combine_frames` cannot work with a MultiIndex for nowcombined=combine_frames(data_df,index_df,how="right")combined_labels=combined._internal.column_labelsindex_labels=[labelforlabelincombined_labelsiflabel[0]=="that"]combined=combined.set_index(index_labels)combined._internal._column_labels=data_df._internal.column_labelscombined._internal._column_label_names=data_df._internal._column_label_namescombined._internal._index_names=index_df._internal.column_labelscombined.index.name=index_ps.nameinternal=combined._internalobject.__setattr__(self,"_internal_frame",internal)@propertydef_pssers(self)->Dict[Label,"Series"]:"""Return a dict of column label -> Series which anchors `self`."""frompyspark.pandas.seriesimportSeriesifnothasattr(self,"_psseries"):object.__setattr__(self,"_psseries",{label:Series(data=self,index=label)forlabelinself._internal.column_labels},)else:psseries=cast(Dict[Label,Series],self._psseries)# type: ignore[has-type]assertlen(self._internal.column_labels)==len(psseries),(len(self._internal.column_labels),len(psseries),)ifany(selfisnotpsser._psdfforpsserinpsseries.values()):# Refresh the dict to contain only Series anchoring `self`.self._psseries={label:(psseries[label]ifselfispsseries[label]._psdfelseSeries(data=self,index=label))forlabelinself._internal.column_labels}returnself._psseries@propertydef_internal(self)->InternalFrame:returncast(InternalFrame,self._internal_frame)# type: ignore[has-type]def_update_internal_frame(self,internal:InternalFrame,check_same_anchor:bool=True,anchor_force_disconnect:bool=False,)->None:""" Update InternalFrame with the given one. If the column_label is changed or the new InternalFrame is not the same `anchor` or the `anchor_force_disconnect` flag is set to True, disconnect the original anchor and create a new one. If `check_same_anchor` is `False`, checking whether the same anchor is ignored and force to update the InternalFrame, e.g., replacing the internal with the resolved_copy, updating the underlying Spark DataFrame which need to combine a different Spark DataFrame. 
Parameters ---------- internal : InternalFrame The new InternalFrame check_same_anchor : bool Whether checking the same anchor anchor_force_disconnect : bool Force to disconnect the original anchor and create a new one """frompyspark.pandas.seriesimportSeriesifhasattr(self,"_psseries"):psseries={}forold_label,new_labelinzip_longest(self._internal.column_labels,internal.column_labels):ifold_labelisnotNone:psser=self._pssers[old_label]renamed=old_label!=new_labelnot_same_anchor=check_same_anchorandnotsame_anchor(internal,psser)ifrenamedornot_same_anchororanchor_force_disconnect:psdf:DataFrame=DataFrame(self._internal.select_column(old_label))psser._update_anchor(psdf)psser=Noneelse:psser=Noneifnew_labelisnotNone:ifpsserisNone:psser=Series(data=self,index=new_label)psseries[new_label]=psserself._psseries=psseriesself._internal_frame=internalifhasattr(self,"_repr_pandas_cache"):delself._repr_pandas_cache@propertydefndim(self)->int:""" Return an int representing the number of array dimensions. return 2 for DataFrame. Examples -------- >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]], ... index=['cobra', 'viper', None], ... columns=['max_speed', 'shield']) >>> df # doctest: +SKIP max_speed shield cobra 1 2 viper 4 5 None 7 8 >>> df.ndim 2 """return2@propertydefaxes(self)->List:""" Return a list representing the axes of the DataFrame. It has the row axis labels and column axis labels as the only members. They are returned in that order. Examples -------- >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) >>> df.axes [Index([0, 1], dtype='int64'), Index(['col1', 'col2'], dtype='object')] """return[self.index,self.columns]def_reduce_for_stat_function(self,sfun:Callable[["Series"],PySparkColumn],name:str,axis:Optional[Axis]=None,numeric_only:bool=False,skipna:bool=True,**kwargs:Any,)->"Series":""" Applies sfun to each column and returns a pd.Series where the number of rows equals the number of columns. Parameters ---------- sfun : either an 1-arg function that takes a Column and returns a Column, or a 2-arg function that takes a Column and its DataType and returns a Column. axis: used only for sanity check because the series only supports index axis. name : original pandas API name. axis : axis to apply. 0 or 1, or 'index' or 'columns. numeric_only : bool, default False Include only float, int, boolean columns. skipna : bool, default True Exclude NA/null values when computing the result. 
"""frompyspark.pandas.seriesimportSeries,first_seriesaxis=validate_axis(axis)ifaxis==0:min_count=kwargs.get("min_count",0)exprs=[F.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)]new_column_labels=[]forlabelinself._internal.column_labels:psser=self._psser_for(label)is_numeric_or_boolean=isinstance(psser.spark.data_type,(NumericType,BooleanType))keep_column=notnumeric_onlyoris_numeric_or_booleanifkeep_column:ifnotskipnaandget_option("compute.eager_check")andpsser.hasnans:scol=F.first(F.lit(np.nan))else:scol=sfun(psser)ifmin_count>0:scol=F.when(Frame._count_expr(psser)>=min_count,scol)exprs.append(scol.alias(name_like_string(label)))new_column_labels.append(label)iflen(exprs)==1:returnSeries([],dtype="float64")sdf=self._internal.spark_frame.select(*exprs)# The data is expected to be small so it's fine to transpose/use the default index.withps.option_context("compute.max_rows",1):internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,SPARK_DEFAULT_INDEX_NAME)],column_labels=new_column_labels,column_label_names=self._internal.column_label_names,)returnfirst_series(DataFrame(internal).transpose())else:# Here we execute with the first 1000 to get the return type.# If the records were less than 1000, it uses pandas API directly for a shortcut.limit=get_option("compute.shortcut_limit")pdf=self.head(limit+1)._to_internal_pandas()pser=getattr(pdf,name)(axis=axis,numeric_only=numeric_only,**kwargs)iflen(pdf)<=limit:returnSeries(pser)@pandas_udf(returnType=as_spark_type(pser.dtype.type))# type: ignore[call-overload]defcalculate_columns_axis(*cols:pd.Series)->pd.Series:returngetattr(pd.concat(cols,axis=1),name)(axis=axis,numeric_only=numeric_only,**kwargs)column_name=verify_temp_column_name(self._internal.spark_frame.select(self._internal.index_spark_columns),"__calculate_columns_axis__",)sdf=self._internal.spark_frame.select(self._internal.index_spark_columns+[calculate_columns_axis(*self._internal.data_spark_columns).alias(column_name)])internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinself._internal.index_spark_column_names],index_names=self._internal.index_names,index_fields=self._internal.index_fields,)returnfirst_series(DataFrame(internal)).rename(pser.name)def_psser_for(self,label:Label)->"Series":""" Create Series with a proper column label. The given label must be verified to exist in `InternalFrame.column_labels`. For example, in some method, self is like: >>> self = ps.range(3) `self._psser_for(label)` can be used with `InternalFrame.column_labels`: >>> self._psser_for(self._internal.column_labels[0]) 0 0 1 1 2 2 Name: id, dtype: int64 `self._psser_for(label)` must not be used directly with user inputs. 
In that case, `self[label]` should be used instead, which checks the label exists or not: >>> self['id'] 0 0 1 1 2 2 Name: id, dtype: int64 """returnself._pssers[label]def_apply_series_op(self,op:Callable[["Series"],Union["Series",PySparkColumn]],should_resolve:bool=False,)->"DataFrame":applied=[]forlabelinself._internal.column_labels:applied.append(op(self._psser_for(label)))internal=self._internal.with_new_columns(applied)ifshould_resolve:internal=internal.resolved_copyreturnDataFrame(internal)# Arithmetic Operatorsdef_map_series_op(self,op:str,other:Any)->"DataFrame":frompyspark.pandas.baseimportIndexOpsMixinifnotisinstance(other,DataFrame)and(isinstance(other,IndexOpsMixin)oris_sequence(other)):raiseTypeError("%s with a sequence is currently not supported; ""however, got %s."%(op,type(other).__name__))ifisinstance(other,DataFrame):ifself._internal.column_labels_level!=other._internal.column_labels_level:raiseValueError("cannot join with no overlapping index names")ifnotsame_anchor(self,other):# Different DataFramesdefapply_op(psdf:DataFrame,this_column_labels:List[Label],that_column_labels:List[Label],)->Iterator[Tuple["Series",Label]]:forthis_label,that_labelinzip(this_column_labels,that_column_labels):yield(getattr(psdf._psser_for(this_label),op)(psdf._psser_for(that_label)).rename(this_label),this_label,)returnalign_diff_frames(apply_op,self,other,fillna=True,how="full")else:applied=[]column_labels=[]forlabelinself._internal.column_labels:iflabelinother._internal.column_labels:applied.append(getattr(self._psser_for(label),op)(other._psser_for(label)))else:applied.append(F.lit(None).cast(self._internal.spark_type_for(label)).alias(name_like_string(label)))column_labels.append(label)forlabelinother._internal.column_labels:iflabelnotincolumn_labels:applied.append(F.lit(None).cast(other._internal.spark_type_for(label)).alias(name_like_string(label)))column_labels.append(label)internal=self._internal.with_new_columns(applied,column_labels=column_labels)returnDataFrame(internal)else:returnself._apply_series_op(lambdapsser:getattr(psser,op)(other))def__add__(self,other:Any)->"DataFrame":returnself._map_series_op("add",other)def__radd__(self,other:Any)->"DataFrame":returnself._map_series_op("radd",other)def__truediv__(self,other:Any)->"DataFrame":returnself._map_series_op("truediv",other)def__rtruediv__(self,other:Any)->"DataFrame":returnself._map_series_op("rtruediv",other)def__mul__(self,other:Any)->"DataFrame":returnself._map_series_op("mul",other)def__rmul__(self,other:Any)->"DataFrame":returnself._map_series_op("rmul",other)def__sub__(self,other:Any)->"DataFrame":returnself._map_series_op("sub",other)def__rsub__(self,other:Any)->"DataFrame":returnself._map_series_op("rsub",other)def__pow__(self,other:Any)->"DataFrame":returnself._map_series_op("pow",other)def__rpow__(self,other:Any)->"DataFrame":returnself._map_series_op("rpow",other)def__mod__(self,other:Any)->"DataFrame":returnself._map_series_op("mod",other)def__rmod__(self,other:Any)->"DataFrame":returnself._map_series_op("rmod",other)def__floordiv__(self,other:Any)->"DataFrame":returnself._map_series_op("floordiv",other)def__rfloordiv__(self,other:Any)->"DataFrame":returnself._map_series_op("rfloordiv",other)def__abs__(self)->"DataFrame":returnself._apply_series_op(lambdapsser:abs(psser))def__neg__(self)->"DataFrame":returnself._apply_series_op(lambdapsser:-psser)
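    # Illustrative sketch (not part of the original source): binary operators on
    # two DataFrames that do not share the same anchor are routed through
    # ``align_diff_frames`` above, which requires the
    # ``compute.ops_on_diff_frames`` option to be enabled:
    #
    #   >>> df1 = ps.DataFrame({"a": [1, 2, 3]})
    #   >>> df2 = ps.DataFrame({"a": [10, 20, 30]})
    #   >>> with ps.option_context("compute.ops_on_diff_frames", True):
    #   ...     (df1 + df2).sort_index()   # elementwise sums: 11, 22, 33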
    # create accessor for plot
    plot = CachedAccessor("plot", PandasOnSparkPlotAccessor)

    # create accessor for Spark related methods.
    spark = CachedAccessor("spark", SparkFrameMethods)

    # create accessor for pandas-on-Spark specific methods.
    pandas_on_spark = CachedAccessor("pandas_on_spark", PandasOnSparkFrameMethods)
    def eq(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is equal to the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.eq(1)
               a      b
        a   True   True
        b  False  False
        c  False   True
        d  False  False
        """
        return self == other
    equals = eq
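    # Note (illustrative, not part of the original source): since ``equals`` is an
    # alias of ``eq`` here, it performs an elementwise comparison and returns a
    # DataFrame of booleans, unlike ``pandas.DataFrame.equals`` which returns a
    # single bool:
    #
    #   >>> df = ps.DataFrame({'a': [1, 2]})
    #   >>> df.equals(df)   # elementwise result, all True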
    def gt(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is greater than the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.gt(2)
               a      b
        a  False  False
        b  False  False
        c   True  False
        d   True  False
        """
        return self > other
    def ge(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is greater than or equal to the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.ge(1)
              a      b
        a  True   True
        b  True  False
        c  True   True
        d  True  False
        """
        return self >= other
    def lt(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is less than the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.lt(1)
               a      b
        a  False  False
        b  False  False
        c  False  False
        d  False  False
        """
        return self < other
    def le(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is less than or equal to the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.le(2)
               a      b
        a   True   True
        b   True  False
        c  False   True
        d  False  False
        """
        return self <= other
    def ne(self, other: Any) -> "DataFrame":
        """
        Compare if the current value is not equal to the other.

        >>> df = ps.DataFrame({'a': [1, 2, 3, 4],
        ...                    'b': [1, np.nan, 1, np.nan]},
        ...                   index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

        >>> df.ne(1)
               a      b
        a  False  False
        b   True   True
        c   True  False
        d   True   True
        """
        return self != other
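    # Illustrative sketch (not part of the original source): each wrapper above is
    # simply the method form of the corresponding comparison operator, so
    # ``df.ge(2)`` and ``df >= 2`` produce the same boolean DataFrame; comparisons
    # against NaN evaluate to False, as the doctests show.
    #
    #   >>> df = ps.DataFrame({'a': [1, 2, 3, 4], 'b': [1, np.nan, 1, np.nan]})
    #   >>> df.ge(2)      # same result as ``df >= 2``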
[docs]defapplymap(self,func:Callable[[Any],Any])->"DataFrame":""" Apply a function to a Dataframe elementwise. This method applies a function that accepts and returns a scalar to every element of a DataFrame. .. deprecated:: 4.0.0 .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def square(x) -> np.int32: ... return x ** 2 pandas-on-Spark uses return type hints and does not try to infer the type. Parameters ---------- func : callable Python function returns a single value from a single value. Returns ------- DataFrame Transformed DataFrame. Examples -------- >>> df = ps.DataFrame([[1, 2.12], [3.356, 4.567]]) >>> df 0 1 0 1.000 2.120 1 3.356 4.567 >>> def str_len(x) -> int: ... return len(str(x)) >>> df.applymap(str_len) 0 1 0 3 4 1 5 5 >>> def power(x) -> float: ... return x ** 2 >>> df.applymap(power) 0 1 0 1.000000 4.494400 1 11.262736 20.857489 You can omit type hints and let pandas-on-Spark infer its type. >>> df.applymap(lambda x: x ** 2) 0 1 0 1.000000 4.494400 1 11.262736 20.857489 """warnings.warn("DataFrame.applymap has been deprecated. Use DataFrame.map instead",FutureWarning)# TODO: We can implement shortcut theoretically since it creates new DataFrame# anyway and we don't have to worry about operations on different DataFrames.returnself.map(func=func)
[docs]defmap(self,func:Callable[[Any],Any])->"DataFrame":""" Apply a function to a Dataframe elementwise. This method applies a function that accepts and returns a scalar to every element of a DataFrame. .. versionadded:: 4.0.0 DataFrame.applymap was deprecated and renamed to DataFrame.map. .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def square(x) -> np.int32: ... return x ** 2 pandas-on-Spark uses return type hints and does not try to infer the type. Parameters ---------- func : callable Python function returns a single value from a single value. Returns ------- DataFrame Transformed DataFrame. Examples -------- >>> df = ps.DataFrame([[1, 2.12], [3.356, 4.567]]) >>> df 0 1 0 1.000 2.120 1 3.356 4.567 >>> def str_len(x) -> int: ... return len(str(x)) >>> df.map(str_len) 0 1 0 3 4 1 5 5 >>> def power(x) -> float: ... return x ** 2 >>> df.map(power) 0 1 0 1.000000 4.494400 1 11.262736 20.857489 You can omit type hints and let pandas-on-Spark infer its type. >>> df.map(lambda x: x ** 2) 0 1 0 1.000000 4.494400 1 11.262736 20.857489 """# TODO: We can implement shortcut theoretically since it creates new DataFrame# anyway and we don't have to worry about operations on different DataFrames.returnself._apply_series_op(lambdapsser:psser.apply(func))
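    # Illustrative sketch (not part of the original source): as the note above
    # explains, annotating the return type of ``func`` lets ``map`` skip the
    # inference step that otherwise executes the function on sampled data.
    # The function name ``double`` below is hypothetical.
    #
    #   >>> def double(x) -> np.float64:
    #   ...     return x * 2.0
    #   >>> ps.DataFrame([[1, 2.12], [3.356, 4.567]]).map(double)   # float64 columns, no inference pass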
# TODO(SPARK-46156): add `axis` parameter.
[docs]defaggregate(self,func:Union[List[str],Dict[Name,List[str]]])->"DataFrame":"""Aggregate using one or more operations over the specified axis. Parameters ---------- func : dict or a list a dict mapping from column name (string) to aggregate functions (list of strings). If a list is given, the aggregation is performed against all columns. Returns ------- DataFrame Notes ----- `agg` is an alias for `aggregate`. Use the alias. See Also -------- DataFrame.apply : Invoke function on DataFrame. DataFrame.transform : Only perform transforming type operations. DataFrame.groupby : Perform operations over groups. Series.aggregate : The equivalent function for Series. Examples -------- >>> df = ps.DataFrame([[1, 2, 3], ... [4, 5, 6], ... [7, 8, 9], ... [np.nan, np.nan, np.nan]], ... columns=['A', 'B', 'C']) >>> df A B C 0 1.0 2.0 3.0 1 4.0 5.0 6.0 2 7.0 8.0 9.0 3 NaN NaN NaN Aggregate these functions over the rows. >>> df.agg(['sum', 'min'])[['A', 'B', 'C']].sort_index() A B C min 1.0 2.0 3.0 sum 12.0 15.0 18.0 Different aggregations per column. >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})[['A', 'B']].sort_index() A B max NaN 8.0 min 1.0 2.0 sum 12.0 NaN For multi-index columns: >>> df.columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) >>> df.agg(['sum', 'min'])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index() X Y A B C min 1.0 2.0 3.0 sum 12.0 15.0 18.0 >>> aggregated = df.agg({("X", "A") : ['sum', 'min'], ("X", "B") : ['min', 'max']}) >>> aggregated[[("X", "A"), ("X", "B")]].sort_index() # doctest: +NORMALIZE_WHITESPACE X A B max NaN 8.0 min 1.0 2.0 sum 12.0 NaN """frompyspark.pandas.groupbyimportGroupByifisinstance(func,list):ifall((isinstance(f,str)forfinfunc)):func=dict([(column,func)forcolumninself.columns])else:raiseValueError("If the given function is a list, it ""should only contains function names as strings.")ifnotisinstance(func,dict)ornotall(is_name_like_value(key)and(isinstance(value,str)or(isinstance(value,list)andall(isinstance(v,str)forvinvalue)))forkey,valueinfunc.items()):raiseValueError("aggs must be a dict mapping from column name to aggregate ""functions (string or list of strings).")withoption_context("compute.default_index_type","distributed"):psdf:DataFrame=DataFrame(GroupBy._spark_groupby(self,func))# The codes below basically convert:## A B# sum min min max# 0 12.0 1.0 2.0 8.0## to:# A B# max NaN 8.0# min 1.0 2.0# sum 12.0 NaN## Aggregated output is usually pretty much small.returnpsdf.stack().droplevel(0)[list(func.keys())]
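    # Illustrative note (not part of the original source): the aggregation above is
    # computed under the ``distributed`` default index type, so the row order of
    # the result is not guaranteed; the doctests call ``sort_index()`` for that
    # reason, and callers generally should as well.
    #
    #   >>> df = ps.DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8]})
    #   >>> df.agg(['sum', 'min']).sort_index()   # rows labeled 'min' and 'sum'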
[docs]defcorrwith(self,other:DataFrameOrSeries,axis:Axis=0,drop:bool=False,method:str="pearson")->"Series":""" Compute pairwise correlation. Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame. DataFrames are first aligned along both axes before computing the correlations. .. versionadded:: 3.4.0 Parameters ---------- other : DataFrame, Series Object with which to compute correlations. axis : int, default 0 or 'index' Can only be set to 0 now. drop : bool, default False Drop missing indices from result. method : {'pearson', 'spearman', 'kendall'} * pearson : standard correlation coefficient * spearman : Spearman rank correlation * kendall : Kendall Tau correlation coefficient Returns ------- Series Pairwise correlations. See Also -------- DataFrame.corr : Compute pairwise correlation of columns. Examples -------- >>> df1 = ps.DataFrame({ ... "A":[1, 5, 7, 8], ... "X":[5, 8, 4, 3], ... "C":[10, 4, 9, 3]}) >>> df1.corrwith(df1[["X", "C"]]).sort_index() A NaN C 1.0 X 1.0 dtype: float64 >>> df2 = ps.DataFrame({ ... "A":[5, 3, 6, 4], ... "B":[11, 2, 4, 3], ... "C":[4, 3, 8, 5]}) >>> with ps.option_context("compute.ops_on_diff_frames", True): ... df1.corrwith(df2).sort_index() A -0.041703 B NaN C 0.395437 X NaN dtype: float64 >>> with ps.option_context("compute.ops_on_diff_frames", True): ... df1.corrwith(df2, method="kendall").sort_index() A 0.0 B NaN C 0.0 X NaN dtype: float64 >>> with ps.option_context("compute.ops_on_diff_frames", True): ... df1.corrwith(df2.B, method="spearman").sort_index() A -0.4 C 0.8 X -0.2 dtype: float64 >>> with ps.option_context("compute.ops_on_diff_frames", True): ... df2.corrwith(df1.X).sort_index() A -0.597614 B -0.151186 C -0.642857 dtype: float64 """frompyspark.pandas.seriesimportSeries,first_seriesaxis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError("corrwith currently only works for axis=0")ifmethodnotin["pearson","spearman","kendall"]:raiseValueError(f"Invalid method {method}")ifnotisinstance(other,(DataFrame,Series)):raiseTypeError("unsupported type: 
{}".format(type(other).__name__))right_is_series=isinstance(other,Series)ifsame_anchor(self,other):combined=selfthis=selfthat=otherelse:combined=combine_frames(self,other,how="inner")this=combined["this"]that=combined["that"]sdf=combined._internal.spark_frameindex_col_name=verify_temp_column_name(sdf,"__corrwith_index_temp_column__")this_numeric_column_labels:List[Label]=[]forcolumn_labelinthis._internal.column_labels:ifisinstance(this._internal.spark_type_for(column_label),(NumericType,BooleanType)):this_numeric_column_labels.append(column_label)that_numeric_column_labels:List[Label]=[]forcolumn_labelinthat._internal.column_labels:ifisinstance(that._internal.spark_type_for(column_label),(NumericType,BooleanType)):that_numeric_column_labels.append(column_label)intersect_numeric_column_labels:List[Label]=[]diff_numeric_column_labels:List[Label]=[]pair_scols=[]ifright_is_series:intersect_numeric_column_labels=this_numeric_column_labelsthat_scol=that._internal.spark_column_for(that_numeric_column_labels[0]).cast("double")fornumeric_column_labelinintersect_numeric_column_labels:this_scol=this._internal.spark_column_for(numeric_column_label).cast("double")pair_scols.append(F.struct(F.lit(name_like_string(numeric_column_label)).alias(index_col_name),this_scol.alias(CORRELATION_VALUE_1_COLUMN),that_scol.alias(CORRELATION_VALUE_2_COLUMN),))else:fornumeric_column_labelinthis_numeric_column_labels:ifnumeric_column_labelinthat_numeric_column_labels:intersect_numeric_column_labels.append(numeric_column_label)else:diff_numeric_column_labels.append(numeric_column_label)fornumeric_column_labelinthat_numeric_column_labels:ifnumeric_column_labelnotinthis_numeric_column_labels:diff_numeric_column_labels.append(numeric_column_label)fornumeric_column_labelinintersect_numeric_column_labels:this_scol=this._internal.spark_column_for(numeric_column_label).cast("double")that_scol=that._internal.spark_column_for(numeric_column_label).cast("double")pair_scols.append(F.struct(F.lit(name_like_string(numeric_column_label)).alias(index_col_name),this_scol.alias(CORRELATION_VALUE_1_COLUMN),that_scol.alias(CORRELATION_VALUE_2_COLUMN),))iflen(pair_scols)>0:sdf=sdf.select(F.inline(F.array(*pair_scols)))sdf=compute(sdf=sdf,groupKeys=[index_col_name],method=method).select(index_col_name,CORRELATION_CORR_OUTPUT_COLUMN)else:sdf=self._internal.spark_frame.select(F.lit(None).cast("string").alias(index_col_name),F.lit(None).cast("double").alias(CORRELATION_CORR_OUTPUT_COLUMN),).limit(0)ifnotdropandlen(diff_numeric_column_labels)>0:sdf2=(self._internal.spark_frame.select(F.lit([name_like_string(label)forlabelindiff_numeric_column_labels]).alias(index_col_name)).limit(1).select(F.explode(index_col_name).alias(index_col_name)))sdf=sdf.unionByName(sdf2,allowMissingColumns=True)sdf=sdf.withColumn(NATURAL_ORDER_COLUMN_NAME,F.monotonically_increasing_id(),)internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,index_col_name)],column_labels=[(CORRELATION_CORR_OUTPUT_COLUMN,)],column_label_names=self._internal.column_label_names,)sser=first_series(DataFrame(internal))sser.name=Nonereturnsser
[docs]defitems(self)->Iterator[Tuple[Name,"Series"]]:""" Iterator over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. Returns ------- label : object The column names for the DataFrame being iterated over. content : Series The column entries belonging to each label, as a Series. Examples -------- >>> df = ps.DataFrame({'species': ['bear', 'bear', 'marsupial'], ... 'population': [1864, 22000, 80000]}, ... index=['panda', 'polar', 'koala'], ... columns=['species', 'population']) >>> df species population panda bear 1864 polar bear 22000 koala marsupial 80000 >>> for label, content in df.items(): ... print('label:', label) ... print('content:', content.to_string()) label: species content: panda bear polar bear koala marsupial label: population content: panda 1864 polar 22000 koala 80000 """return((labeliflen(label)>1elselabel[0],self._psser_for(label))forlabelinself._internal.column_labels)
[docs]defiterrows(self)->Iterator[Tuple[Name,pd.Series]]:""" Iterate over DataFrame rows as (index, Series) pairs. Yields ------ index : label or tuple of label The index of the row. A tuple for a `MultiIndex`. data : pandas.Series The data of the row as a Series. it : generator A generator that iterates over the rows of the frame. Notes ----- 1. Because ``iterrows`` returns a Series for each row, it does **not** preserve dtypes across the rows (dtypes are preserved across columns for DataFrames). For example, >>> df = ps.DataFrame([[1, 1.5]], columns=['int', 'float']) >>> row = next(df.iterrows())[1] >>> row int 1.0 float 1.5 Name: 0, dtype: float64 >>> print(row['int'].dtype) float64 >>> print(df['int'].dtype) int64 To preserve dtypes while iterating over the rows, it is better to use :meth:`itertuples` which returns namedtuples of the values and which is generally faster than ``iterrows``. 2. You should **never modify** something you are iterating over. This is not guaranteed to work in all cases. Depending on the data types, the iterator returns a copy and not a view, and writing to it will have no effect. """columns=self.columnsinternal_index_columns=self._internal.index_spark_column_namesinternal_data_columns=self._internal.data_spark_column_namesdefextract_kv_from_spark_row(row:Row)->Tuple[Name,Any]:k=(row[internal_index_columns[0]]iflen(internal_index_columns)==1elsetuple(row[c]forcininternal_index_columns))v=[row[c]forcininternal_data_columns]returnk,vfork,vinmap(extract_kv_from_spark_row,self._internal.resolved_copy.spark_frame.toLocalIterator()):s=pd.Series(v,index=columns,name=k)yieldk,s
[docs]defitertuples(self,index:bool=True,name:Optional[str]="PandasOnSpark")->Iterator[Tuple]:""" Iterate over DataFrame rows as namedtuples. Parameters ---------- index : bool, default True If True, return the index as the first element of the tuple. name : str or None, default "PandasOnSpark" The name of the returned namedtuples or None to return regular tuples. Returns ------- iterator An object to iterate over namedtuples for each row in the DataFrame with the first field possibly being the index and following fields being the column values. See Also -------- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. Examples -------- >>> df = ps.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, ... index=['dog', 'hawk']) >>> df num_legs num_wings dog 4 0 hawk 2 2 >>> for row in df.itertuples(): ... print(row) ... PandasOnSpark(Index='dog', num_legs=4, num_wings=0) PandasOnSpark(Index='hawk', num_legs=2, num_wings=2) By setting the `index` parameter to False we can remove the index as the first element of the tuple: >>> for row in df.itertuples(index=False): ... print(row) ... PandasOnSpark(num_legs=4, num_wings=0) PandasOnSpark(num_legs=2, num_wings=2) With the `name` parameter set we set a custom name for the yielded namedtuples: >>> for row in df.itertuples(name='Animal'): ... print(row) ... Animal(Index='dog', num_legs=4, num_wings=0) Animal(Index='hawk', num_legs=2, num_wings=2) """fields=list(self.columns)ifindex:fields.insert(0,"Index")index_spark_column_names=self._internal.index_spark_column_namesdata_spark_column_names=self._internal.data_spark_column_namesdefextract_kv_from_spark_row(row:Row)->Tuple[Name,Any]:k=(row[index_spark_column_names[0]]iflen(index_spark_column_names)==1elsetuple(row[c]forcinindex_spark_column_names))v=[row[c]forcindata_spark_column_names]returnk,vifnameisnotNone:itertuple=namedtuple(name,fields,rename=True)# type: ignore[misc]fork,vinmap(extract_kv_from_spark_row,self._internal.resolved_copy.spark_frame.toLocalIterator(),):yielditertuple._make(([k]ifindexelse[])+list(v))else:fork,vinmap(extract_kv_from_spark_row,self._internal.resolved_copy.spark_frame.toLocalIterator(),):yieldtuple(([k]ifindexelse[])+list(v))
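    # Illustrative note (not part of the original source): like ``iterrows``, this
    # method streams rows to the driver through ``toLocalIterator``, so it should
    # only be used when the DataFrame is small enough to iterate on a single machine.
    #
    #   >>> df = ps.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, index=['dog', 'hawk'])
    #   >>> rows = list(df.itertuples(name='Animal'))   # materializes all rows on the driver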
[docs]defto_clipboard(self,excel:bool=True,sep:Optional[str]=None,**kwargs:Any)->None:""" Copy object to the system clipboard. Write a text representation of object to the system clipboard. This can be pasted into Excel, for example. .. note:: This method should only be used if the resulting DataFrame is expected to be small, as all the data is loaded into the driver's memory. Parameters ---------- excel : bool, default True - True, use the provided separator, writing in a csv format for allowing easy pasting into excel. - False, write a string representation of the object to the clipboard. sep : str, default ``'\\t'`` Field delimiter. **kwargs These parameters will be passed to DataFrame.to_csv. Notes ----- Requirements for your platform. - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules) - Windows : none - OS X : none See Also -------- read_clipboard : Read text from clipboard. Examples -------- Copy the contents of a DataFrame to the clipboard. >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) # doctest: +SKIP >>> df.to_clipboard(sep=',') # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # ,A,B,C ... # 0,1,2,3 ... # 1,4,5,6 We can omit the index by passing the keyword `index` and setting it to false. >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # A,B,C ... # 1,2,3 ... # 4,5,6 This function also works for Series: >>> df = ps.Series([1, 2, 3, 4, 5, 6, 7], name='x') # doctest: +SKIP >>> df.to_clipboard(sep=',') # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # 0, 1 ... # 1, 2 ... # 2, 3 ... # 3, 4 ... # 4, 5 ... # 5, 6 ... # 6, 7 """args=locals()psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_clipboard,pd.DataFrame.to_clipboard,args)
[docs]defto_html(self,buf:Optional[IO[str]]=None,columns:Optional[Sequence[Name]]=None,col_space:Optional[Union[str,int,Dict[Name,Union[str,int]]]]=None,header:bool=True,index:bool=True,na_rep:str="NaN",formatters:Optional[Union[List[Callable[[Any],str]],Dict[Name,Callable[[Any],str]]]]=None,float_format:Optional[Callable[[float],str]]=None,sparsify:Optional[bool]=None,index_names:bool=True,justify:Optional[str]=None,max_rows:Optional[int]=None,max_cols:Optional[int]=None,show_dimensions:bool=False,decimal:str=".",bold_rows:bool=True,classes:Optional[Union[str,list,tuple]]=None,escape:bool=True,notebook:bool=False,border:Optional[int]=None,table_id:Optional[str]=None,render_links:bool=False,)->Optional[str]:""" Render a DataFrame as an HTML table. .. note:: This method should only be used if the resulting pandas object is expected to be small, as all the data is loaded into the driver's memory. If the input is large, set max_rows parameter. Parameters ---------- buf : StringIO-like, optional Buffer to write to. columns : sequence, optional, default None The subset of columns to write. Writes all columns by default. col_space : int, optional The minimum width of each column. header : bool, optional Write out the column names. If a list of strings is given, it is assumed to be aliases for the column names index : bool, optional, default True Whether to print index (row) labels. na_rep : str, optional, default 'NaN' String representation of NAN to use. formatters : list or dict of one-param. functions, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a Unicode string. List must be of length equal to the number of columns. float_format : one-parameter function, optional, default None Formatter function to apply to columns' elements if they are floats. The result of this function must be a Unicode string. sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. index_names : bool, optional, default True Prints the names of the indexes. justify : str, default None How to justify the column labels. If None uses the option from the print configuration (controlled by set_option), 'right' out of the box. Valid values are * left * right * center * justify * justify-all * start * end * inherit * match-parent * initial * unset. max_rows : int, optional Maximum number of rows to display in the console. max_cols : int, optional Maximum number of columns to display in the console. show_dimensions : bool, default False Display DataFrame dimensions (number of rows by number of columns). decimal : str, default '.' Character recognized as decimal separator, e.g. ',' in Europe. bold_rows : bool, default True Make the row labels bold in the output. classes : str or list or tuple, default None CSS class(es) to apply to the resulting html table. escape : bool, default True Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. border : int A ``border=border`` attribute is included in the opening `<table>` tag. By default ``pd.options.html.border``. table_id : str, optional A css id is included in the opening `<table>` tag if specified. render_links : bool, default False Convert URLs to HTML links (only works with pandas 0.24+). Returns ------- str (or Unicode, depending on data and options) String representation of the dataframe. 
See Also -------- to_string : Convert DataFrame to a string. """# Make sure locals() call is at the top of the function so we don't capture local variables.args=locals()ifmax_rowsisnotNone:psdf=self.head(max_rows)else:psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_html,pd.DataFrame.to_html,args)
[docs]defto_string(self,buf:Optional[IO[str]]=None,columns:Optional[Sequence[Name]]=None,col_space:Optional[Union[str,int,Dict[Name,Union[str,int]]]]=None,header:bool=True,index:bool=True,na_rep:str="NaN",formatters:Optional[Union[List[Callable[[Any],str]],Dict[Name,Callable[[Any],str]]]]=None,float_format:Optional[Callable[[float],str]]=None,sparsify:Optional[bool]=None,index_names:bool=True,justify:Optional[str]=None,max_rows:Optional[int]=None,max_cols:Optional[int]=None,show_dimensions:bool=False,decimal:str=".",line_width:Optional[int]=None,)->Optional[str]:""" Render a DataFrame to a console-friendly tabular output. .. note:: This method should only be used if the resulting pandas object is expected to be small, as all the data is loaded into the driver's memory. If the input is large, set max_rows parameter. Parameters ---------- buf : StringIO-like, optional Buffer to write to. columns : sequence, optional, default None The subset of columns to write. Writes all columns by default. col_space : int, optional The minimum width of each column. header : bool, optional Write out the column names. If a list of strings is given, it is assumed to be aliases for the column names index : bool, optional, default True Whether to print index (row) labels. na_rep : str, optional, default 'NaN' String representation of NAN to use. formatters : list or dict of one-param. functions, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a Unicode string. List must be of length equal to the number of columns. float_format : one-parameter function, optional, default None Formatter function to apply to columns' elements if they are floats. The result of this function must be a Unicode string. sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. index_names : bool, optional, default True Prints the names of the indexes. justify : str, default None How to justify the column labels. If None uses the option from the print configuration (controlled by set_option), 'right' out of the box. Valid values are * left * right * center * justify * justify-all * start * end * inherit * match-parent * initial * unset. max_rows : int, optional Maximum number of rows to display in the console. max_cols : int, optional Maximum number of columns to display in the console. show_dimensions : bool, default False Display DataFrame dimensions (number of rows by number of columns). decimal : str, default '.' Character recognized as decimal separator, e.g. ',' in Europe. line_width : int, optional Width to wrap a line in characters. Returns ------- str (or Unicode, depending on data and options) String representation of the dataframe. See Also -------- to_html : Convert DataFrame to HTML. Examples -------- >>> df = ps.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]}, columns=['col1', 'col2']) >>> print(df.to_string()) col1 col2 0 1 4 1 2 5 2 3 6 >>> print(df.to_string(max_rows=2)) col1 col2 0 1 4 1 2 5 """# Make sure locals() call is at the top of the function so we don't capture local variables.args=locals()ifmax_rowsisnotNone:psdf=self.head(max_rows)else:psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_string,pd.DataFrame.to_string,args)
[docs]defto_dict(self,orient:str="dict",into:Type=dict)->Union[List,Mapping]:""" Convert the DataFrame to a dictionary. The type of the key-value pairs can be customized with the parameters (see below). .. note:: This method should only be used if the resulting pandas DataFrame is expected to be small, as all the data is loaded into the driver's memory. Parameters ---------- orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} Determines the type of the values of the dictionary. - 'dict' (default) : dict like {column -> {index -> value}} - 'list' : dict like {column -> [values]} - 'series' : dict like {column -> Series(values)} - 'split' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} - 'records' : list like [{column -> value}, ... , {column -> value}] - 'index' : dict like {index -> {column -> value}} Abbreviations are allowed. `s` indicates `series` and `sp` indicates `split`. into : class, default dict The collections.abc.Mapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. Returns ------- dict, list or collections.abc.Mapping Return a collections.abc.Mapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. Examples -------- >>> df = ps.DataFrame({'col1': [1, 2], ... 'col2': [0.5, 0.75]}, ... index=['row1', 'row2'], ... columns=['col1', 'col2']) >>> df col1 col2 row1 1 0.50 row2 2 0.75 >>> df_dict = df.to_dict() >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()]) [('col1', [('row1', 1), ('row2', 2)]), ('col2', [('row1', 0.5), ('row2', 0.75)])] You can specify the return orientation. >>> df_dict = df.to_dict('series') >>> sorted(df_dict.items()) [('col1', row1 1 row2 2 Name: col1, dtype: int64), ('col2', row1 0.50 row2 0.75 Name: col2, dtype: float64)] >>> df_dict = df.to_dict('split') >>> sorted(df_dict.items()) # doctest: +ELLIPSIS [('columns', ['col1', 'col2']), ('data', [[1..., 0.75]]), ('index', ['row1', 'row2'])] >>> df_dict = df.to_dict('records') >>> [sorted(values.items()) for values in df_dict] # doctest: +ELLIPSIS [[('col1', 1...), ('col2', 0.5)], [('col1', 2...), ('col2', 0.75)]] >>> df_dict = df.to_dict('index') >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()]) [('row1', [('col1', 1), ('col2', 0.5)]), ('row2', [('col1', 2), ('col2', 0.75)])] You can also specify the mapping type. >>> from collections import OrderedDict, defaultdict >>> df.to_dict(into=OrderedDict) # doctest: +ELLIPSIS OrderedDict(...) If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) # doctest: +ELLIPSIS [defaultdict(<class 'list'>, {'col..., 'col...}), \defaultdict(<class 'list'>, {'col..., 'col...})] """# Make sure locals() call is at the top of the function so we don't capture local variables.args=locals()psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_dict,pd.DataFrame.to_dict,args)
[docs]defto_latex(self,buf:Optional[IO[str]]=None,columns:Optional[List[Name]]=None,header:bool=True,index:bool=True,na_rep:str="NaN",formatters:Optional[Union[List[Callable[[Any],str]],Dict[Name,Callable[[Any],str]]]]=None,float_format:Optional[Callable[[float],str]]=None,sparsify:Optional[bool]=None,index_names:bool=True,bold_rows:bool=False,column_format:Optional[str]=None,longtable:Optional[bool]=None,escape:Optional[bool]=None,encoding:Optional[str]=None,decimal:str=".",multicolumn:Optional[bool]=None,multicolumn_format:Optional[str]=None,multirow:Optional[bool]=None,)->Optional[str]:r""" Render an object to a LaTeX tabular environment table. Render an object to a tabular environment table. You can splice this into a LaTeX document. Requires usepackage{booktabs}. .. note:: This method should only be used if the resulting pandas object is expected to be small, as all the data is loaded into the driver's memory. If the input is large, consider alternative formats. Parameters ---------- buf : file descriptor or None Buffer to write to. If None, the output is returned as a string. columns : list of label, optional The subset of columns to write. Writes all columns by default. header : bool or list of str, default True Write out the column names. If a list of strings is given, it is assumed to be aliases for the column names. index : bool, default True Write row names (index). na_rep : str, default ‘NaN’ Missing data representation. formatters : list of functions or dict of {str: function}, optional Formatter functions to apply to columns’ elements by position or name. The result of each function must be a Unicode string. List must be of length equal to the number of columns. float_format : str, optional Format string for floating point numbers. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. By default the value will be read from the config module. index_names : bool, default True Prints the names of the indexes. bold_rows : bool, default False Make the row labels bold in the output. column_format : str, optional The columns format as specified in LaTeX table format e.g. ‘rcl’ for 3 columns. By default, ‘l’ will be used for all columns except columns of numbers, which default to ‘r’. longtable : bool, optional By default the value will be read from the pandas config module. Use a longtable environment instead of tabular. Requires adding a usepackage{longtable} to your LaTeX preamble. escape : bool, optional By default the value will be read from the pandas config module. When set to False prevents from escaping latex special characters in column names. encoding : str, optional A string representing the encoding to use in the output file, defaults to ‘ascii’ on Python 2 and ‘utf-8’ on Python 3. decimal : str, default ‘.’ Character recognized as decimal separator, e.g. ‘,’ in Europe. multicolumn : bool, default True Use multicolumn to enhance MultiIndex columns. The default will be read from the config module. multicolumn_format : str, default ‘l’ The alignment for multicolumns, similar to column_format The default will be read from the config module. multirow : bool, default False Use multirow to enhance MultiIndex rows. Requires adding a usepackage{multirow} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. 
Returns ------- str or None If buf is None, returns the resulting LateX format as a string. Otherwise returns None. See Also -------- DataFrame.to_string : Render a DataFrame to a console-friendly tabular output. DataFrame.to_html : Render a DataFrame as an HTML table. Examples -------- >>> df = ps.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}, ... columns=['name', 'mask', 'weapon']) >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE \begin{tabular}{lll} \toprule name & mask & weapon \\ \midrule Raphael & red & sai \\ Donatello & purple & bo staff \\ \bottomrule \end{tabular} """args=locals()psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_latex,pd.DataFrame.to_latex,args)
def to_feather(
    self,
    path: Union[str, IO[str]],
    **kwargs: Any,
) -> None:
    """
    Write a DataFrame to the binary Feather format.

    .. note:: This method should only be used if the resulting DataFrame is expected
              to be small, as all the data is loaded into the driver's memory.

    .. versionadded:: 4.0.0

    Parameters
    ----------
    path : str, path object, file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``write()`` function.
    **kwargs :
        Additional keywords passed to :func:`pyarrow.feather.write_feather`.
        This includes the `compression`, `compression_level`, `chunksize`
        and `version` keywords.

    Examples
    --------
    >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]])
    >>> df.to_feather("file.feather")  # doctest: +SKIP
    """
    # Make sure locals() call is at the top of the function so we don't capture local variables.
    args = locals()
    return validate_arguments_and_invoke_function(
        self._to_internal_pandas(), self.to_feather, pd.DataFrame.to_feather, args
    )
[docs]defto_stata(self,path:Union[str,IO[str]],*,convert_dates:Optional[Dict]=None,write_index:bool=True,byteorder:Optional[str]=None,time_stamp:Optional[datetime.datetime]=None,data_label:Optional[str]=None,variable_labels:Optional[Dict]=None,version:Optional[int]=114,convert_strl:Optional[Sequence[Name]]=None,compression:str="infer",storage_options:Optional[str]=None,value_labels:Optional[Dict]=None,)->None:""" Export DataFrame object to Stata dta format. .. note:: This method should only be used if the resulting DataFrame is expected to be small, as all the data is loaded into the driver's memory. .. versionadded:: 4.0.0 Parameters ---------- path : str, path object, or buffer String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a binary ``write()`` function. convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. Datetime columns that do not have a conversion type specified will be converted to 'tc'. Raises NotImplementedError if a datetime column has timezone information. write_index : bool Write the index to Stata dataset. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder`. time_stamp : datetime A datetime to use as file creation date. Default is the current time. data_label : str, optional A label for the data set. Must be 80 characters or smaller. variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and later. Version 117 can be read by Stata 13 or later. Version 118 is supported in Stata 14 and later. Version 119 is supported in Stata 15 and later. Version 114 limits string variables to 244 characters or fewer while versions 117 and later allow strings with lengths up to 2,000,000 characters. Versions 118 and 119 support Unicode characters, and version 119 supports more than 32,767 variables. convert_strl : list, optional List of column names to convert to string columns to Stata StrL format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. Labels for a single variable must be 32,000 characters or smaller. Examples -------- >>> df = ps.DataFrame({'animal': ['falcon', 'parrot', 'falcon', 'parrot'], ... 'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """# Make sure locals() call is at the top of the function so we don't capture local variables.args=locals()returnvalidate_arguments_and_invoke_function(self._to_internal_pandas(),self.to_stata,pd.DataFrame.to_stata,args)
[docs]deftranspose(self)->"DataFrame":""" Transpose index and columns. Reflect the DataFrame over its main diagonal by writing rows as columns and vice-versa. The property :attr:`.T` is an accessor to the method :meth:`transpose`. .. note:: This method is based on an expensive operation due to the nature of big data. Internally it needs to generate each row for each value, and then group twice - it is a huge operation. To prevent misuse, this method has the 'compute.max_rows' default limit of input length and raises a ValueError. >>> from pyspark.pandas.config import option_context >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE ... ps.DataFrame({'a': range(1001)}).transpose() Traceback (most recent call last): ... ValueError: Current DataFrame's length exceeds the given limit of 1000 rows. Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' to retrieve more than 1000 rows. Note that, before changing the 'compute.max_rows', this operation is considerably expensive. Returns ------- DataFrame The transposed DataFrame. Notes ----- Transposing a DataFrame with mixed dtypes will result in a homogeneous DataFrame with the coerced dtype. For instance, if int and float have to be placed in same column, it becomes float. If type coercion is not possible, it fails. Also, note that the values in index should be unique because they become unique column names. In addition, if Spark 2.3 is used, the types should always be exactly same. Examples -------- **Square DataFrame with homogeneous dtype** >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} >>> df1 = ps.DataFrame(data=d1, columns=['col1', 'col2']) >>> df1 col1 col2 0 1 3 1 2 4 >>> df1_transposed = df1.T.sort_index() >>> df1_transposed 0 1 col1 1 2 col2 3 4 When the dtype is homogeneous in the original DataFrame, we get a transposed DataFrame with the same dtype: >>> df1.dtypes col1 int64 col2 int64 dtype: object >>> df1_transposed.dtypes 0 int64 1 int64 dtype: object **Non-square DataFrame with mixed dtypes** >>> d2 = {'score': [9.5, 8], ... 'kids': [0, 0], ... 'age': [12, 22]} >>> df2 = ps.DataFrame(data=d2, columns=['score', 'kids', 'age']) >>> df2 score kids age 0 9.5 0 12 1 8.0 0 22 >>> df2_transposed = df2.T.sort_index() >>> df2_transposed 0 1 age 12.0 22.0 kids 0.0 0.0 score 9.5 8.0 When the DataFrame has mixed dtypes, we get a transposed DataFrame with the coerced dtype: >>> df2.dtypes score float64 kids int64 age int64 dtype: object >>> df2_transposed.dtypes 0 float64 1 float64 dtype: object """max_compute_count=get_option("compute.max_rows")ifmax_compute_countisnotNone:pdf=self.head(max_compute_count+1)._to_internal_pandas()iflen(pdf)>max_compute_count:raiseValueError("Current DataFrame's length exceeds the given limit of {0} rows. ""Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' ""to retrieve more than {0} rows. 
Note that, before changing the ""'compute.max_rows', this operation is considerably expensive.".format(max_compute_count))returnDataFrame(pdf.transpose())# Explode the data to be pairs.## For instance, if the current input DataFrame is as below:## +------+------+------+------+------+# |index1|index2|(a,x1)|(a,x2)|(b,x3)|# +------+------+------+------+------+# | y1| z1| 1| 0| 0|# | y2| z2| 0| 50| 0|# | y3| z3| 3| 2| 1|# +------+------+------+------+------+## Output of `exploded_df` becomes as below:## +-----------------+-----------------+-----------------+-----+# | index|__index_level_0__|__index_level_1__|value|# +-----------------+-----------------+-----------------+-----+# |{"a":["y1","z1"]}| a| x1| 1|# |{"a":["y1","z1"]}| a| x2| 0|# |{"a":["y1","z1"]}| b| x3| 0|# |{"a":["y2","z2"]}| a| x1| 0|# |{"a":["y2","z2"]}| a| x2| 50|# |{"a":["y2","z2"]}| b| x3| 0|# |{"a":["y3","z3"]}| a| x1| 3|# |{"a":["y3","z3"]}| a| x2| 2|# |{"a":["y3","z3"]}| b| x3| 1|# +-----------------+-----------------+-----------------+-----+pairs=F.explode(F.array(*[F.struct(*[F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i))fori,colinenumerate(label)],*[self._internal.spark_column_for(label).alias("value")],)forlabelinself._internal.column_labels]))exploded_df=self._internal.spark_frame.withColumn("pairs",pairs).select([F.to_json(F.struct(F.array(*[scolforscolinself._internal.index_spark_columns]).alias("a"))).alias("index"),F.col("pairs.*"),])# After that, executes pivot with key and its index column.# Note that index column should contain unique values since column names# should be unique.internal_index_columns=[SPARK_INDEX_NAME_FORMAT(i)foriinrange(self._internal.column_labels_level)]pivoted_df=exploded_df.groupBy(internal_index_columns).pivot("index")transposed_df=pivoted_df.agg(F.first(F.col("value")))new_data_columns=list(filter(lambdax:xnotininternal_index_columns,transposed_df.columns))column_labels=[Noneiflen(label)==1andlabel[0]isNoneelselabelforlabelin(tuple(json.loads(col)["a"])forcolinnew_data_columns)]internal=InternalFrame(spark_frame=transposed_df,index_spark_columns=[scol_for(transposed_df,col)forcolininternal_index_columns],index_names=self._internal.column_label_names,column_labels=column_labels,data_spark_columns=[scol_for(transposed_df,col)forcolinnew_data_columns],column_label_names=self._internal.index_names,)returnDataFrame(internal)
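# If a frame larger than 'compute.max_rows' really needs to be transposed, the limit can
# be raised first, as the error message above suggests. A usage sketch (choose a limit
# that still fits comfortably in driver memory):
#
#     >>> ps.set_option('compute.max_rows', 2000)           # doctest: +SKIP
#     >>> ps.DataFrame({'a': range(1001)}).transpose()      # doctest: +SKIP
#     >>> ps.reset_option('compute.max_rows')               # doctest: +SKIP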
T = property(transpose)
[docs]defapply(self,func:Callable,axis:Axis=0,args:Sequence[Any]=(),**kwds:Any)->Union["Series","DataFrame","Index"]:""" Apply a function along an axis of the DataFrame. Objects passed to the function are Series objects whose index is either the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). See also `Transform and apply a function <https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/transform_apply.html>`_. .. note:: when `axis` is 0 or 'index', the `func` is unable to access to the whole input series. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole series but of the batch internally ... # used. ... def length(s) -> int: ... return len(s) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.apply(length, axis=0) # doctest: +SKIP 0 83 1 83 2 83 ... 10 83 11 83 dtype: int32 .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify the return type as `Series` or scalar value in ``func``, for instance, as below: >>> def square(s) -> ps.Series[np.int32]: ... return s ** 2 pandas-on-Spark uses return type hints and does not try to infer the type. In case when axis is 1, it requires to specify `DataFrame` or scalar value with type hints as below: >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]: ... return x + 1 If the return type is specified as `DataFrame`, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a pandas style as below: >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]: ... return x + 1 Parameters ---------- func : function Function to apply to each column or row. axis : {0 or 'index', 1 or 'columns'}, default 0 Axis along which the function is applied: * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. args : tuple Positional arguments to pass to `func` in addition to the array/series. **kwds Additional keyword arguments to pass as keywords arguments to `func`. Returns ------- Series or DataFrame Result of applying ``func`` along the given axis of the DataFrame. See Also -------- DataFrame.applymap : For elementwise operations. DataFrame.aggregate : Only perform aggregating type operations. DataFrame.transform : Only perform transforming type operations. Series.apply : The equivalent function for Series. Examples -------- >>> df = ps.DataFrame([[4, 9]] * 3, columns=['A', 'B']) >>> df A B 0 4 9 1 4 9 2 4 9 Using a numpy universal function (in this case the same as ``np.sqrt(df)``): >>> def sqrt(x) -> ps.Series[float]: ... return np.sqrt(x) ... >>> df.apply(sqrt, axis=0) A B 0 2.0 3.0 1 2.0 3.0 2 2.0 3.0 You can omit type hints and let pandas-on-Spark infer its type. >>> df.apply(np.sqrt, axis=0) A B 0 2.0 3.0 1 2.0 3.0 2 2.0 3.0 When `axis` is 1 or 'columns', it applies the function for each row. >>> def summation(x) -> np.int64: ... return np.sum(x) ... 
>>> df.apply(summation, axis=1) 0 13 1 13 2 13 dtype: int64 You can omit type hints and let pandas-on-Spark infer its type. >>> df.apply(np.sum, axis=1) 0 13 1 13 2 13 dtype: int64 >>> df.apply(max, axis=1) 0 9 1 9 2 9 dtype: int64 Returning a list-like will result in a Series >>> df.apply(lambda x: [1, 2], axis=1) 0 [1, 2] 1 [1, 2] 2 [1, 2] dtype: object To specify the types when `axis` is '1', it should use DataFrame[...] annotation. In this case, the column names are automatically generated. >>> def identify(x) -> ps.DataFrame[('index', int), [('A', np.int64), ('B', np.int64)]]: ... return x ... >>> df.apply(identify, axis=1) # doctest: +NORMALIZE_WHITESPACE A B index 0 4 9 1 4 9 2 4 9 You can also specify extra arguments. >>> def plus_two(a, b, c) -> ps.DataFrame[np.int64, [np.int64, np.int64]]: ... return a + b + c ... >>> df.apply(plus_two, axis=1, args=(1,), c=3) c0 c1 0 8 13 1 8 13 2 8 13 """frompyspark.pandas.groupbyimportGroupByfrompyspark.pandas.seriesimportfirst_seriesifnotisinstance(func,types.FunctionType):assertcallable(func),"the first argument should be a callable function."f=func# Note that the return type hints specified here affects actual return# type in Spark (e.g., infer_return_type). And MyPy does not allow# redefinition of a function.func=lambda*args,**kwargs:f(*args,**kwargs)# noqa: E731axis=validate_axis(axis)should_return_series=Falsespec=inspect.getfullargspec(func)return_sig=spec.annotations.get("return",None)should_infer_schema=return_sigisNoneshould_retain_index=should_infer_schemadefapply_func(pdf:pd.DataFrame)->pd.DataFrame:pdf_or_pser=pdf.apply(func,axis=axis,args=args,**kwds)# type: ignore[arg-type]ifisinstance(pdf_or_pser,pd.Series):returnpdf_or_pser.to_frame()else:returnpdf_or_pserself_applied:DataFrame=DataFrame(self._internal.resolved_copy)column_labels:Optional[List[Label]]=Noneifshould_infer_schema:# Here we execute with the first 1000 to get the return type.# If the records were less than 1000, it uses pandas API directly for a shortcut.log_advice("If the type hints is not specified for `apply`, ""it is expensive to infer the data type internally.")limit=get_option("compute.shortcut_limit")pdf=self_applied.head(limit+1)._to_internal_pandas()applied=pdf.apply(func,axis=axis,args=args,**kwds)# type: ignore[arg-type]psser_or_psdf=ps.from_pandas(applied)iflen(pdf)<=limit:returnpsser_or_psdfpsdf=psser_or_psdfifisinstance(psser_or_psdf,ps.Series):should_return_series=Truepsdf=psser_or_psdf._psdfindex_fields=[field.normalize_spark_type()forfieldinpsdf._internal.index_fields]data_fields=[field.normalize_spark_type()forfieldinpsdf._internal.data_fields]return_schema=StructType([field.struct_fieldforfieldinindex_fields+data_fields])output_func=GroupBy._make_pandas_df_builder_func(self_applied,apply_func,return_schema,retain_index=should_retain_index)sdf=self_applied._internal.to_internal_spark_frame.mapInPandas(lambdaiterator:map(output_func,iterator),schema=return_schema)# If schema is inferred, we can restore indexes too.internal=psdf._internal.with_new_sdf(spark_frame=sdf,index_fields=index_fields,data_fields=data_fields)else:return_type=infer_return_type(func)require_index_axis=isinstance(return_type,SeriesType)require_column_axis=isinstance(return_type,DataFrameType)index_fields=Noneifrequire_index_axis:ifaxis!=0:raiseTypeError("The given function should specify a scalar or a series as its type ""hints when axis is 0 or 'index'; however, the return type ""was 
%s"%return_sig)dtype=cast(SeriesType,return_type).dtypespark_type=cast(SeriesType,return_type).spark_typedata_fields=[InternalField(dtype=dtype,struct_field=StructField(name=name,dataType=spark_type))fornameinself_applied.columns]return_schema=StructType([field.struct_fieldforfieldindata_fields])elifrequire_column_axis:ifaxis!=1:raiseTypeError("The given function should specify a scalar or a frame as its type ""hints when axis is 1 or 'column'; however, the return type ""was %s"%return_sig)index_fields=cast(DataFrameType,return_type).index_fieldsshould_retain_index=len(index_fields)>0data_fields=cast(DataFrameType,return_type).data_fieldsreturn_schema=cast(DataFrameType,return_type).spark_typeelse:# any axis is fine.should_return_series=Truespark_type=cast(ScalarType,return_type).spark_typedtype=cast(ScalarType,return_type).dtypedata_fields=[InternalField(dtype=dtype,struct_field=StructField(name=SPARK_DEFAULT_SERIES_NAME,dataType=spark_type),)]return_schema=StructType([field.struct_fieldforfieldindata_fields])column_labels=[None]output_func=GroupBy._make_pandas_df_builder_func(self_applied,apply_func,return_schema,retain_index=should_retain_index)sdf=self_applied._internal.to_internal_spark_frame.mapInPandas(lambdaiterator:map(output_func,iterator),schema=return_schema)index_spark_columns=Noneindex_names:Optional[List[Optional[Tuple[Any,...]]]]=Noneifshould_retain_index:index_spark_columns=[scol_for(sdf,index_field.struct_field.name)forindex_fieldinindex_fields]ifnotany([SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)forindex_fieldinindex_fields]):index_names=[(index_field.struct_field.name,)forindex_fieldinindex_fields]internal=InternalFrame(spark_frame=sdf,index_names=index_names,index_spark_columns=index_spark_columns,index_fields=index_fields,data_fields=data_fields,column_labels=column_labels,)result:DataFrame=DataFrame(internal)ifshould_return_series:returnfirst_series(result)else:returnresult
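# A rough summary of the schema-inference shortcut above, for readability (simplified;
# index and field normalization omitted): when ``func`` carries no return-type hint, the
# first ``compute.shortcut_limit`` + 1 rows are applied on the driver with plain pandas,
# and the result is returned directly when the frame fits within the limit; otherwise it
# is only used to infer the return schema:
#
#     limit = get_option("compute.shortcut_limit")
#     sample = self_applied.head(limit + 1)._to_internal_pandas()
#     inferred = ps.from_pandas(sample.apply(func, axis=axis, args=args, **kwds))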
[docs]deftransform(self,func:Callable[...,"Series"],axis:Axis=0,*args:Any,**kwargs:Any)->"DataFrame":""" Call ``func`` on self producing a Series with transformed values and that has the same length as its input. See also `Transform and apply a function <https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/transform_apply.html>`_. .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def square(x) -> ps.Series[np.int32]: ... return x ** 2 pandas-on-Spark uses return type hints and does not try to infer the type. .. note:: the series within ``func`` is actually multiple pandas series as the segments of the whole pandas-on-Spark series; therefore, the length of each series is not guaranteed. As an example, an aggregation against each series does work as a global aggregation but an aggregation of each segment. See below: >>> def func(x) -> ps.Series[np.int32]: ... return x + sum(x) Parameters ---------- func : function Function to use for transforming the data. It must work when pandas Series is passed. axis : int, default 0 or 'index' Can only be set to 0 now. *args Positional arguments to pass to func. **kwargs Keyword arguments to pass to func. Returns ------- DataFrame A DataFrame that must have the same length as self. Raises ------ Exception : If the returned DataFrame has a different length than self. See Also -------- DataFrame.aggregate : Only perform aggregating type operations. DataFrame.apply : Invoke function on DataFrame. Series.transform : The equivalent function for Series. Examples -------- >>> df = ps.DataFrame({'A': range(3), 'B': range(1, 4)}, columns=['A', 'B']) >>> df A B 0 0 1 1 1 2 2 2 3 >>> def square(x) -> ps.Series[np.int32]: ... return x ** 2 >>> df.transform(square) A B 0 0 1 1 1 4 2 4 9 You can omit type hints and let pandas-on-Spark infer its type. >>> df.transform(lambda x: x ** 2) A B 0 0 1 1 1 4 2 4 9 For multi-index columns: >>> df.columns = [('X', 'A'), ('X', 'B')] >>> df.transform(square) # doctest: +NORMALIZE_WHITESPACE X A B 0 0 1 1 1 4 2 4 9 >>> (df * -1).transform(abs) # doctest: +NORMALIZE_WHITESPACE X A B 0 0 1 1 1 2 2 2 3 You can also specify extra arguments. >>> def calculation(x, y, z) -> ps.Series[int]: ... return x ** y + z >>> df.transform(calculation, y=10, z=20) # doctest: +NORMALIZE_WHITESPACE X A B 0 20 21 1 21 1044 2 1044 59069 """ifnotisinstance(func,types.FunctionType):assertcallable(func),"the first argument should be a callable function."f=func# Note that the return type hints specified here affects actual return# type in Spark (e.g., infer_return_type). 
And, MyPy does not allow# redefinition of a function.func=lambda*args,**kwargs:f(*args,**kwargs)# noqa: E731axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')spec=inspect.getfullargspec(func)return_sig=spec.annotations.get("return",None)should_infer_schema=return_sigisNoneifshould_infer_schema:# Here we execute with the first 1000 to get the return type.# If the records were less than 1000, it uses pandas API directly for a shortcut.log_advice("If the type hints is not specified for `transform`, ""it is expensive to infer the data type internally.")limit=get_option("compute.shortcut_limit")pdf=self.head(limit+1)._to_internal_pandas()transformed=pdf.transform(func,axis,*args,**kwargs)# type: ignore[arg-type]psdf:DataFrame=DataFrame(transformed)iflen(pdf)<=limit:returnpsdfapplied=[]data_fields=[]forinput_label,output_labelinzip(self._internal.column_labels,psdf._internal.column_labels):psser=self._psser_for(input_label)field=psdf._internal.field_for(output_label).normalize_spark_type()data_fields.append(field)return_schema=field.spark_typeapplied.append(psser.pandas_on_spark._transform_batch(func=lambdac:func(c,*args,**kwargs),return_type=SeriesType(field.dtype,return_schema),))internal=self._internal.with_new_columns(applied,data_fields=data_fields)returnDataFrame(internal)else:returnself._apply_series_op(lambdapsser:psser.pandas_on_spark.transform_batch(func,*args,**kwargs))
[docs]defpop(self,item:Name)->"DataFrame":""" Return item and drop from frame. Raise KeyError if not found. Parameters ---------- item : str Label of column to be popped. Returns ------- Series Examples -------- >>> df = ps.DataFrame([('falcon', 'bird', 389.0), ... ('parrot', 'bird', 24.0), ... ('lion', 'mammal', 80.5), ... ('monkey','mammal', np.nan)], ... columns=('name', 'class', 'max_speed')) >>> df name class max_speed 0 falcon bird 389.0 1 parrot bird 24.0 2 lion mammal 80.5 3 monkey mammal NaN >>> df.pop('class') 0 bird 1 bird 2 mammal 3 mammal Name: class, dtype: object >>> df name max_speed 0 falcon 389.0 1 parrot 24.0 2 lion 80.5 3 monkey NaN Also support for MultiIndex >>> df = ps.DataFrame([('falcon', 'bird', 389.0), ... ('parrot', 'bird', 24.0), ... ('lion', 'mammal', 80.5), ... ('monkey','mammal', np.nan)], ... columns=('name', 'class', 'max_speed')) >>> columns = [('a', 'name'), ('a', 'class'), ('b', 'max_speed')] >>> df.columns = pd.MultiIndex.from_tuples(columns) >>> df a b name class max_speed 0 falcon bird 389.0 1 parrot bird 24.0 2 lion mammal 80.5 3 monkey mammal NaN >>> df.pop('a') name class 0 falcon bird 1 parrot bird 2 lion mammal 3 monkey mammal >>> df b max_speed 0 389.0 1 24.0 2 80.5 3 NaN """result=self[item]self._update_internal_frame(self.drop(columns=item)._internal)returnresult
# TODO(SPARK-46158): make the axis parameter work when it is 1 or 'columns'
[docs]defxs(self,key:Name,axis:Axis=0,level:Optional[int]=None)->DataFrameOrSeries:""" Return cross-section from the DataFrame. This method takes a `key` argument to select data at a particular level of a MultiIndex. Parameters ---------- key : label or tuple of label Label contained in the index, or partially in a MultiIndex. axis : 0 or 'index', default 0 Axis to retrieve cross-section on. currently only support 0 or 'index' level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. Returns ------- DataFrame or Series Cross-section from the original DataFrame corresponding to the selected index levels. See Also -------- DataFrame.loc : Access a group of rows and columns by label(s) or a boolean array. DataFrame.iloc : Purely integer-location based indexing for selection by position. Examples -------- >>> d = {'num_legs': [4, 4, 2, 2], ... 'num_wings': [0, 0, 2, 2], ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], ... 'animal': ['cat', 'dog', 'bat', 'penguin'], ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} >>> df = ps.DataFrame(data=d) >>> df = df.set_index(['class', 'animal', 'locomotion']) >>> df # doctest: +NORMALIZE_WHITESPACE num_legs num_wings class animal locomotion mammal cat walks 4 0 dog walks 4 0 bat flies 2 2 bird penguin walks 2 2 Get values at specified index >>> df.xs('mammal') # doctest: +NORMALIZE_WHITESPACE num_legs num_wings animal locomotion cat walks 4 0 dog walks 4 0 bat flies 2 2 Get values at several indexes >>> df.xs(('mammal', 'dog')) # doctest: +NORMALIZE_WHITESPACE num_legs num_wings locomotion walks 4 0 >>> df.xs(('mammal', 'dog', 'walks')) # doctest: +NORMALIZE_WHITESPACE num_legs 4 num_wings 0 Name: (mammal, dog, walks), dtype: int64 Get values at specified index and level >>> df.xs('cat', level=1) # doctest: +NORMALIZE_WHITESPACE num_legs num_wings class locomotion mammal walks 4 0 """frompyspark.pandas.seriesimportfirst_seriesifnotis_name_like_value(key):raiseTypeError("'key' should be a scalar value or tuple that contains scalar values")iflevelisnotNoneandis_name_like_tuple(key):raiseKeyError(key)axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')ifnotis_name_like_tuple(key):key=(key,)iflen(key)>self._internal.index_level:raiseKeyError("Key length ({}) exceeds index depth ({})".format(len(key),self._internal.index_level))iflevelisNone:level=0rows=[self._internal.index_spark_columns[lvl]==indexforlvl,indexinenumerate(key,level)]internal=self._internal.with_filter(reduce(lambdax,y:x&y,rows))iflen(key)==self._internal.index_level:psdf:DataFrame=DataFrame(internal)pdf=psdf.head(2)._to_internal_pandas()iflen(pdf)==0:raiseKeyError(key)eliflen(pdf)>1:returnpsdfelse:returnfirst_series(DataFrame(pdf.transpose()))else:index_spark_columns=(internal.index_spark_columns[:level]+internal.index_spark_columns[level+len(key):])index_names=internal.index_names[:level]+internal.index_names[level+len(key):]index_fields=internal.index_fields[:level]+internal.index_fields[level+len(key):]internal=internal.copy(index_spark_columns=index_spark_columns,index_names=index_names,index_fields=index_fields,).resolved_copyreturnDataFrame(internal)
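# The cross-section filter built above is simply a conjunction of per-level equality
# predicates against the internal index columns; for ``df.xs(('mammal', 'dog'))`` it
# behaves roughly like the sketch below (hypothetical internal column names):
#
#     sdf.filter((F.col("__index_level_0__") == "mammal") & (F.col("__index_level_1__") == "dog"))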
[docs]defbetween_time(self,start_time:Union[datetime.time,str],end_time:Union[datetime.time,str],inclusive:str="both",axis:Axis=0,)->"DataFrame":""" Select values between particular times of the day (example: 9:00-9:30 AM). By setting ``start_time`` to be later than ``end_time``, you can get the times that are *not* between the two times. Parameters ---------- start_time : datetime.time or str Initial time as a time filter limit. end_time : datetime.time or str End time as a time filter limit. inclusive : {"both", "neither", "left", "right"}, default "both" Include boundaries; whether to set each bound as closed or open. .. versionadded:: 4.0.0 axis : {0 or 'index', 1 or 'columns'}, default 0 Determine range time on index or columns value. Returns ------- DataFrame Data from the original object filtered to the specified dates range. Raises ------ TypeError If the index is not a :class:`DatetimeIndex` See Also -------- at_time : Select values at a particular time of the day. first : Select initial periods of time series based on a date offset. last : Select final periods of time series based on a date offset. DatetimeIndex.indexer_between_time : Get just the index locations for values between particular times of the day. Examples -------- >>> idx = pd.date_range('2018-04-09', periods=4, freq='1D20min') >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx) >>> psdf A 2018-04-09 00:00:00 1 2018-04-10 00:20:00 2 2018-04-11 00:40:00 3 2018-04-12 01:00:00 4 >>> psdf.between_time('0:15', '0:45') # doctest: +SKIP A 2018-04-10 00:20:00 2 2018-04-11 00:40:00 3 You get the times that are *not* between two times by setting ``start_time`` later than ``end_time``: >>> psdf.between_time('0:45', '0:15') # doctest: +SKIP A 2018-04-09 00:00:00 1 2018-04-12 01:00:00 4 """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError("between_time currently only works for axis=0")ifnotisinstance(self.index,ps.DatetimeIndex):raiseTypeError("Index must be DatetimeIndex")allowed_inclusive_values=["left","right","both","neither"]ifinclusivenotinallowed_inclusive_values:raisePySparkValueError(error_class="VALUE_NOT_ALLOWED",message_parameters={"arg_name":"inclusive","allowed_values":str(allowed_inclusive_values),},)psdf=self.copy()psdf.index.name=verify_temp_column_name(psdf,"__index_name__")return_types=[psdf.index.dtype]+list(psdf.dtypes)defpandas_between_time(# type: ignore[no-untyped-def]pdf,)->ps.DataFrame[return_types]:# type: ignore[valid-type]returnpdf.between_time(start_time,end_time,inclusive).reset_index()# apply_batch will remove the index of the pandas-on-Spark DataFrame and attach a# default index, which will never be used. Use "distributed" index as a dummy to# avoid overhead.withoption_context("compute.default_index_type","distributed"):psdf=psdf.pandas_on_spark.apply_batch(pandas_between_time)returnDataFrame(self._internal.copy(spark_frame=psdf._internal.spark_frame,index_spark_columns=psdf._internal.data_spark_columns[:1],index_fields=psdf._internal.data_fields[:1],data_spark_columns=psdf._internal.data_spark_columns[1:],data_fields=psdf._internal.data_fields[1:],))
# TODO(SPARK-46159): implement axis=1
[docs]defat_time(self,time:Union[datetime.time,str],asof:bool=False,axis:Axis=0)->"DataFrame":""" Select values at particular time of day (example: 9:30AM). Parameters ---------- time : datetime.time or str axis : {0 or 'index', 1 or 'columns'}, default 0 Returns ------- DataFrame Raises ------ TypeError If the index is not a :class:`DatetimeIndex` See Also -------- between_time : Select values between particular times of the day. DatetimeIndex.indexer_at_time : Get just the index locations for values at particular time of the day. Examples -------- >>> idx = pd.date_range('2018-04-09', periods=4, freq='12H') >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx) >>> psdf A 2018-04-09 00:00:00 1 2018-04-09 12:00:00 2 2018-04-10 00:00:00 3 2018-04-10 12:00:00 4 >>> psdf.at_time('12:00') A 2018-04-09 12:00:00 2 2018-04-10 12:00:00 4 """ifasof:raiseNotImplementedError("'asof' argument is not supported")axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError("at_time currently only works for axis=0")ifnotisinstance(self.index,ps.DatetimeIndex):raiseTypeError("Index must be DatetimeIndex")psdf=self.copy()psdf.index.name=verify_temp_column_name(psdf,"__index_name__")return_types=[psdf.index.dtype]+list(psdf.dtypes)defpandas_at_time(# type: ignore[no-untyped-def]pdf,)->ps.DataFrame[return_types]:# type: ignore[valid-type]returnpdf.at_time(time,asof,axis).reset_index()# apply_batch will remove the index of the pandas-on-Spark DataFrame and attach# a default index, which will never be used. Use "distributed" index as a dummy# to avoid overhead.withoption_context("compute.default_index_type","distributed"):psdf=psdf.pandas_on_spark.apply_batch(pandas_at_time)returnDataFrame(self._internal.copy(spark_frame=psdf._internal.spark_frame,index_spark_columns=psdf._internal.data_spark_columns[:1],index_fields=psdf._internal.data_fields[:1],data_spark_columns=psdf._internal.data_spark_columns[1:],data_fields=psdf._internal.data_fields[1:],))
[docs]defwhere(self,cond:DataFrameOrSeries,other:Union[DataFrameOrSeries,Any]=np.nan,axis:Axis=None,)->"DataFrame":""" Replace values where the condition is False. Parameters ---------- cond : boolean DataFrame Where cond is True, keep the original value. Where False, replace with corresponding value from other. other : scalar, DataFrame Entries where cond is False are replaced with corresponding value from other. axis : int, default None Can only be set to 0 now for compatibility with pandas. Returns ------- DataFrame Examples -------- >>> from pyspark.pandas.config import set_option, reset_option >>> set_option("compute.ops_on_diff_frames", True) >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]}) >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]}) >>> df1 A B 0 0 100 1 1 200 2 2 300 3 3 400 4 4 500 >>> df2 A B 0 0 -100 1 -1 -200 2 -2 -300 3 -3 -400 4 -4 -500 >>> df1.where(df1 > 0).sort_index() A B 0 NaN 100.0 1 1.0 200.0 2 2.0 300.0 3 3.0 400.0 4 4.0 500.0 >>> df1.where(df1 > 1, 10).sort_index() A B 0 10 100 1 10 200 2 2 300 3 3 400 4 4 500 >>> df1.where(df1 > 1, df1 + 100).sort_index() A B 0 100 100 1 101 200 2 2 300 3 3 400 4 4 500 >>> df1.where(df1 > 1, df2).sort_index() A B 0 0 100 1 -1 200 2 2 300 3 3 400 4 4 500 When the column name of cond is different from self, it treats all values are False >>> cond = ps.DataFrame({'C': [0, -1, -2, -3, -4], 'D':[4, 3, 2, 1, 0]}) % 3 == 0 >>> cond C D 0 True False 1 False True 2 False False 3 True False 4 False True >>> df1.where(cond).sort_index() A B 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN When the type of cond is Series, it just check boolean regardless of column name >>> cond = ps.Series([1, 2]) > 1 >>> cond 0 False 1 True dtype: bool >>> df1.where(cond).sort_index() A B 0 NaN NaN 1 1.0 200.0 2 NaN NaN 3 NaN NaN 4 NaN NaN >>> reset_option("compute.ops_on_diff_frames") """frompyspark.pandas.seriesimportSeriesaxis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')tmp_cond_col_name="__tmp_cond_col_{}__".formattmp_other_col_name="__tmp_other_col_{}__".formatpsdf=self.copy()tmp_cond_col_names=[tmp_cond_col_name(name_like_string(label))forlabelinself._internal.column_labels]ifisinstance(cond,DataFrame):cond=cond[[(cond._internal.spark_column_for(label)iflabelincond._internal.column_labelselseF.lit(False)).alias(name)forlabel,nameinzip(self._internal.column_labels,tmp_cond_col_names)]]psdf[tmp_cond_col_names]=condelifisinstance(cond,Series):cond=cond.to_frame()cond=cond[[cond._internal.data_spark_columns[0].alias(name)fornameintmp_cond_col_names]]psdf[tmp_cond_col_names]=condelse:raiseTypeError("type of cond must be a DataFrame or Series")tmp_other_col_names=[tmp_other_col_name(name_like_string(label))forlabelinself._internal.column_labels]ifisinstance(other,DataFrame):other=other[[(other._internal.spark_column_for(label)iflabelinother._internal.column_labelselseF.lit(np.nan)).alias(name)forlabel,nameinzip(self._internal.column_labels,tmp_other_col_names)]]psdf[tmp_other_col_names]=otherelifisinstance(other,Series):other=other.to_frame()other=other[[other._internal.data_spark_columns[0].alias(name)fornameintmp_other_col_names]]psdf[tmp_other_col_names]=otherelse:forlabelinself._internal.column_labels:psdf[tmp_other_col_name(name_like_string(label))]=other# above logic make spark dataframe looks like below:# +-----------------+---+---+------------------+-------------------+------------------+--...# |__index_level_0__| A| 
B|__tmp_cond_col_A__|__tmp_other_col_A__|__tmp_cond_col_B__|__...# +-----------------+---+---+------------------+-------------------+------------------+--...# | 0| 0|100| true| 0| false| ...# | 1| 1|200| false| -1| false| ...# | 3| 3|400| true| -3| false| ...# | 2| 2|300| false| -2| true| ...# | 4| 4|500| false| -4| false| ...# +-----------------+---+---+------------------+-------------------+------------------+--...data_spark_columns=[]forlabelinself._internal.column_labels:data_spark_columns.append(F.when(psdf[tmp_cond_col_name(name_like_string(label))].spark.column,psdf._internal.spark_column_for(label),).otherwise(psdf[tmp_other_col_name(name_like_string(label))].spark.column).alias(psdf._internal.spark_column_name_for(label)))returnDataFrame(psdf._internal.with_new_columns(data_spark_columns,column_labels=self._internal.column_labels# TODO: dtypes?))
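# Per column, the final value is a single CASE WHEN over the temporary columns shown in
# the table above; conceptually, for column "A" (names taken from that illustration):
#
#     F.when(F.col("__tmp_cond_col_A__"), F.col("A")).otherwise(F.col("__tmp_other_col_A__")).alias("A")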
[docs]defmask(self,cond:DataFrameOrSeries,other:Union[DataFrameOrSeries,Any]=np.nan)->"DataFrame":""" Replace values where the condition is True. Parameters ---------- cond : boolean DataFrame Where cond is False, keep the original value. Where True, replace with corresponding value from other. other : scalar, DataFrame Entries where cond is True are replaced with corresponding value from other. Returns ------- DataFrame Examples -------- >>> from pyspark.pandas.config import set_option, reset_option >>> set_option("compute.ops_on_diff_frames", True) >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]}) >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]}) >>> df1 A B 0 0 100 1 1 200 2 2 300 3 3 400 4 4 500 >>> df2 A B 0 0 -100 1 -1 -200 2 -2 -300 3 -3 -400 4 -4 -500 >>> df1.mask(df1 > 0).sort_index() A B 0 0.0 NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN >>> df1.mask(df1 > 1, 10).sort_index() A B 0 0 10 1 1 10 2 10 10 3 10 10 4 10 10 >>> df1.mask(df1 > 1, df1 + 100).sort_index() A B 0 0 200 1 1 300 2 102 400 3 103 500 4 104 600 >>> df1.mask(df1 > 1, df2).sort_index() A B 0 0 -100 1 1 -200 2 -2 -300 3 -3 -400 4 -4 -500 >>> reset_option("compute.ops_on_diff_frames") """frompyspark.pandas.seriesimportSeriesifnotisinstance(cond,(DataFrame,Series)):raiseTypeError("type of cond must be a DataFrame or Series")cond_inversed=cond._apply_series_op(lambdapsser:~psser)returnself.where(cond_inversed,other)
@propertydefindex(self)->"Index":"""The index (row labels) Column of the DataFrame. Currently not supported when the DataFrame has no index. See Also -------- Index """frompyspark.pandas.indexes.baseimportIndexreturnIndex._new_instance(self)@propertydefempty(self)->bool:""" Returns true if the current DataFrame is empty. Otherwise, returns false. Examples -------- >>> ps.range(10).empty False >>> ps.range(0).empty True >>> ps.DataFrame({}, index=list('abc')).empty True """return(len(self._internal.column_labels)==0orself._internal.resolved_copy.spark_frame.isEmpty())@propertydefstyle(self)->"Styler":""" Property returning a Styler object containing methods for building a styled HTML representation for the DataFrame. Examples -------- >>> ps.range(1001).style # doctest: +SKIP <pandas.io.formats.style.Styler object at ...> """max_results=get_option("compute.max_rows")ifmax_resultsisnotNone:pdf=self.head(max_results+1)._to_internal_pandas()iflen(pdf)>max_results:warnings.warn("'style' property will only use top %s rows."%max_results,UserWarning)returnpdf.head(max_results).styleelse:returnself._to_internal_pandas().style
[docs]defset_index(self,keys:Union[Name,List[Name]],drop:bool=True,append:bool=False,inplace:bool=False,)->Optional["DataFrame"]:"""Set the DataFrame index (row labels) using one or more existing columns. Set the DataFrame index (row labels) using one or more existing columns or arrays (of the correct length). The index can replace the existing index or expand on it. Parameters ---------- keys : label or array-like or list of labels/arrays This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. drop : bool, default True Delete columns to be used as the new index. append : bool, default False Whether to append columns to existing index. inplace : bool, default False Modify the DataFrame in place (do not create a new object). Returns ------- DataFrame Changed row labels. See Also -------- DataFrame.reset_index : Opposite of set_index. Examples -------- >>> df = ps.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], ... 'sale': [55, 40, 84, 31]}, ... columns=['month', 'year', 'sale']) >>> df month year sale 0 1 2012 55 1 4 2014 40 2 7 2013 84 3 10 2014 31 Set the index to become the 'month' column: >>> df.set_index('month') # doctest: +NORMALIZE_WHITESPACE year sale month 1 2012 55 4 2014 40 7 2013 84 10 2014 31 Create a MultiIndex using columns 'year' and 'month': >>> df.set_index(['year', 'month']) # doctest: +NORMALIZE_WHITESPACE sale year month 2012 1 55 2014 4 40 2013 7 84 2014 10 31 """inplace=validate_bool_kwarg(inplace,"inplace")key_list:List[Label]ifis_name_like_tuple(keys):key_list=[cast(Label,keys)]elifis_name_like_value(keys):key_list=[(keys,)]else:key_list=[keyifis_name_like_tuple(key)else(key,)forkeyinkeys]columns=set(self._internal.column_labels)forkeyinkey_list:ifkeynotincolumns:raiseKeyError(name_like_string(key))ifdrop:column_labels=[labelforlabelinself._internal.column_labelsiflabelnotinkey_list]else:column_labels=self._internal.column_labelsifappend:index_spark_columns=self._internal.index_spark_columns+[self._internal.spark_column_for(label)forlabelinkey_list]index_names=self._internal.index_names+key_listindex_fields=self._internal.index_fields+[self._internal.field_for(label)forlabelinkey_list]else:index_spark_columns=[self._internal.spark_column_for(label)forlabelinkey_list]index_names=key_listindex_fields=[self._internal.field_for(label)forlabelinkey_list]internal=self._internal.copy(index_spark_columns=index_spark_columns,index_names=index_names,index_fields=index_fields,column_labels=column_labels,data_spark_columns=[self._internal.spark_column_for(label)forlabelincolumn_labels],data_fields=[self._internal.field_for(label)forlabelincolumn_labels],)ifinplace:self._update_internal_frame(internal)returnNoneelse:returnDataFrame(internal)
[docs]defreset_index(self,level:Optional[Union[int,Name,Sequence[Union[int,Name]]]]=None,drop:bool=False,inplace:bool=False,col_level:int=0,col_fill:str="",)->Optional["DataFrame"]:"""Reset the index, or a level of it. For DataFrame with multi-level index, return new DataFrame with labeling information in the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None. For a standard index, the index name will be used (if set), otherwise a default 'index' or 'level_0' (if 'index' is already taken) will be used. Parameters ---------- level : int, str, tuple, or list, default None Only remove the given levels from the index. Removes all levels by default. drop : bool, default False Do not try to insert index into dataframe columns. This reset the index to the default integer index. inplace : bool, default False Modify the DataFrame in place (do not create a new object). col_level : int or str, default 0 If the columns have multiple levels, determines which level the labels are inserted into. By default it is inserted into the first level. col_fill : object, default '' If the columns have multiple levels, determines how the other levels are named. If None then the index name is repeated. Returns ------- DataFrame DataFrame with the new index. See Also -------- DataFrame.set_index : Opposite of reset_index. Examples -------- >>> df = ps.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), ... ('mammal', np.nan)], ... index=['falcon', 'parrot', 'lion', 'monkey'], ... columns=('class', 'max_speed')) >>> df class max_speed falcon bird 389.0 parrot bird 24.0 lion mammal 80.5 monkey mammal NaN When we reset the index, the old index is added as a column. Unlike pandas, pandas-on-Spark does not automatically add a sequential index. The following 0, 1, 2, 3 are only there when we display the DataFrame. >>> df.reset_index() index class max_speed 0 falcon bird 389.0 1 parrot bird 24.0 2 lion mammal 80.5 3 monkey mammal NaN We can use the `drop` parameter to avoid the old index being added as a column: >>> df.reset_index(drop=True) class max_speed 0 bird 389.0 1 bird 24.0 2 mammal 80.5 3 mammal NaN You can also use `reset_index` with `MultiIndex`. >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ... ('bird', 'parrot'), ... ('mammal', 'lion'), ... ('mammal', 'monkey')], ... names=['class', 'name']) >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), ... ('species', 'type')]) >>> df = ps.DataFrame([(389.0, 'fly'), ... ( 24.0, 'fly'), ... ( 80.5, 'run'), ... (np.nan, 'jump')], ... index=index, ... columns=columns) >>> df # doctest: +NORMALIZE_WHITESPACE speed species max type class name bird falcon 389.0 fly parrot 24.0 fly mammal lion 80.5 run monkey NaN jump If the index has multiple levels, we can reset a subset of them: >>> df.reset_index(level='class') # doctest: +NORMALIZE_WHITESPACE class speed species max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump If we are not dropping the index, by default, it is placed in the top level. We can place it in another level: >>> df.reset_index(level='class', col_level=1) # doctest: +NORMALIZE_WHITESPACE speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump When the index is inserted under another level, we can specify under which one with the parameter `col_fill`: >>> df.reset_index(level='class', col_level=1, ... 
col_fill='species') # doctest: +NORMALIZE_WHITESPACE species speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump If we specify a nonexistent level for `col_fill`, it is created: >>> df.reset_index(level='class', col_level=1, ... col_fill='genus') # doctest: +NORMALIZE_WHITESPACE genus speed species class max type name falcon bird 389.0 fly parrot bird 24.0 fly lion mammal 80.5 run monkey mammal NaN jump """inplace=validate_bool_kwarg(inplace,"inplace")multi_index=self._internal.index_level>1defrename(index:int)->Label:ifmulti_index:return("level_{}".format(index),)else:if("index",)notinself._internal.column_labels:return("index",)else:return("level_{}".format(index),)iflevelisNone:new_column_labels=[nameifnameisnotNoneelserename(i)fori,nameinenumerate(self._internal.index_names)]new_data_spark_columns=[scol.alias(name_like_string(label))forscol,labelinzip(self._internal.index_spark_columns,new_column_labels)]new_data_fields=self._internal.index_fieldsindex_spark_columns=[]index_names=[]index_fields=[]else:ifis_list_like(level):level=list(cast(Sequence[Union[int,Name]],level))ifisinstance(level,int)oris_name_like_tuple(level):level_list=[cast(Union[int,Label],level)]elifis_name_like_value(level):level_list=[(level,)]else:level_list=[lvlifisinstance(lvl,int)oris_name_like_tuple(lvl)else(lvl,)forlvlinlevel]ifall(isinstance(lvl,int)forlvlinlevel_list):int_level_list=cast(List[int],level_list)forlevinint_level_list:iflev>=self._internal.index_level:raiseIndexError("Too many levels: Index has only {} level, not {}".format(self._internal.index_level,lev+1))idx=int_level_listelifall(is_name_like_tuple(lev)forlevinlevel_list):idx=[]forlabelincast(List[Label],level_list):try:i=self._internal.index_names.index(label)idx.append(i)exceptValueError:ifmulti_index:raiseKeyError("Level unknown not found")else:raiseKeyError("Level unknown must be same as name ({})".format(name_like_string(self._internal.index_names[0])))else:raiseValueError("Level should be all int or all string.")idx.sort()new_column_labels=[]new_data_spark_columns=[]new_data_fields=[]index_spark_columns=self._internal.index_spark_columns.copy()index_names=self._internal.index_names.copy()index_fields=self._internal.index_fields.copy()foriinidx[::-1]:name=index_names.pop(i)new_column_labels.insert(0,nameifnameisnotNoneelserename(i))scol=index_spark_columns.pop(i)new_data_spark_columns.insert(0,scol.alias(name_like_string(name)))new_data_fields.insert(0,index_fields.pop(i).copy(name=name_like_string(name)))ifdrop:new_data_spark_columns=[]new_column_labels=[]new_data_fields=[]forlabelinnew_column_labels:iflabelinself._internal.column_labels:raiseValueError("cannot insert {}, already exists".format(name_like_string(label)))ifself._internal.column_labels_level>1:column_depth=len(self._internal.column_labels[0])ifcol_level>=column_depth:raiseIndexError("Too many levels: Index has only {} levels, not {}".format(column_depth,col_level+1))ifany(col_level+len(label)>column_depthforlabelinnew_column_labels):raiseValueError("Item must have length equal to number of 
levels.")new_column_labels=[tuple(([col_fill]*col_level)+list(label)+([col_fill]*(column_depth-(len(label)+col_level))))forlabelinnew_column_labels]internal=self._internal.copy(index_spark_columns=index_spark_columns,index_names=index_names,index_fields=index_fields,column_labels=new_column_labels+self._internal.column_labels,data_spark_columns=new_data_spark_columns+self._internal.data_spark_columns,data_fields=new_data_fields+self._internal.data_fields,)ifinplace:self._update_internal_frame(internal)returnNoneelse:returnDataFrame(internal)
def isnull(self) -> "DataFrame":
    """
    Detects missing values for items in the current DataFrame.

    Return a boolean same-sized DataFrame indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values.

    See Also
    --------
    DataFrame.notnull

    Examples
    --------
    >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
    >>> df.isnull()
           0      1
    0  False  False
    1  False   True
    2  False   True
    3  False  False

    >>> df = ps.DataFrame([[None, 'bee', None], ['dog', None, 'fly']])
    >>> df.isnull()
           0      1      2
    0   True  False   True
    1  False   True  False
    """
    return self._apply_series_op(lambda psser: psser.isnull())
isna = isnull
def notnull(self) -> "DataFrame":
    """
    Detects non-missing values for items in the current DataFrame.

    This function takes a DataFrame and indicates whether its values are
    valid (not missing, which is ``NaN`` in numeric datatypes, ``None`` or
    ``NaN`` in objects and ``NaT`` in datetimelike).

    See Also
    --------
    DataFrame.isnull

    Examples
    --------
    >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
    >>> df.notnull()
          0      1
    0  True   True
    1  True  False
    2  True  False
    3  True   True

    >>> df = ps.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
    >>> df.notnull()
          0      1     2
    0  True   True  True
    1  True  False  True
    """
    return self._apply_series_op(lambda psser: psser.notnull())
notna = notnull
[docs]definsert(self,loc:int,column:Name,value:Union[Scalar,"Series",Iterable],allow_duplicates:bool=False,)->None:""" Insert column into DataFrame at specified location. Raises a ValueError if `column` is already contained in the DataFrame, unless `allow_duplicates` is set to True. Parameters ---------- loc : int Insertion index. Must verify 0 <= loc <= len(columns). column : str, number, or hashable object Label of the inserted column. value : int, Series, or array-like allow_duplicates : bool, optional Examples -------- >>> psdf = ps.DataFrame([1, 2, 3]) >>> psdf.sort_index() 0 0 1 1 2 2 3 >>> psdf.insert(0, 'x', 4) >>> psdf.sort_index() x 0 0 4 1 1 4 2 2 4 3 >>> from pyspark.pandas.config import set_option, reset_option >>> set_option("compute.ops_on_diff_frames", True) >>> psdf.insert(1, 'y', [5, 6, 7]) >>> psdf.sort_index() x y 0 0 4 5 1 1 4 6 2 2 4 7 3 >>> psdf.insert(2, 'z', ps.Series([8, 9, 10])) >>> psdf.sort_index() x y z 0 0 4 5 8 1 1 4 6 9 2 2 4 7 10 3 >>> reset_option("compute.ops_on_diff_frames") """ifnotisinstance(loc,int):raiseTypeError("loc must be int")assert0<=loc<=len(self.columns)assertallow_duplicatesisFalseifnotis_name_like_value(column):raiseTypeError('"column" should be a scalar value or tuple that contains scalar values')# TODO(SPARK-37723): Support tuple for non-MultiIndex column name.ifis_name_like_tuple(column):ifself._internal.column_labels_level>1:iflen(column)!=len(self.columns.levels):# type: ignore[attr-defined]# To be consistent with pandasraiseValueError('"column" must have length equal to number of column levels.')else:raiseNotImplementedError("Assigning column name as tuple is only supported for MultiIndex columns ""for now.")ifcolumninself.columns:raiseValueError("cannot insert %s, already exists"%str(column))psdf=self.copy()psdf[column]=valuecolumns=psdf.columns[:-1].insert(loc,psdf.columns[-1])psdf=psdf[columns]self._update_internal_frame(psdf._internal)
# TODO(SPARK-46156): add freq and axis parameters
[docs]defshift(self,periods:int=1,fill_value:Optional[Any]=None)->"DataFrame":""" Shift DataFrame by desired number of periods. .. note:: the current implementation of shift uses Spark's Window without specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. Parameters ---------- periods : int Number of periods to shift. Can be positive or negative. fill_value : object, optional The scalar value to use for newly introduced missing values. The default depends on the dtype of self. For numeric data, np.nan is used. Returns ------- Copy of input DataFrame, shifted. Examples -------- >>> df = ps.DataFrame({'Col1': [10, 20, 15, 30, 45], ... 'Col2': [13, 23, 18, 33, 48], ... 'Col3': [17, 27, 22, 37, 52]}, ... columns=['Col1', 'Col2', 'Col3']) >>> df.shift(periods=3) Col1 Col2 Col3 0 NaN NaN NaN 1 NaN NaN NaN 2 NaN NaN NaN 3 10.0 13.0 17.0 4 20.0 23.0 27.0 >>> df.shift(periods=3, fill_value=0) Col1 Col2 Col3 0 0 0 0 1 0 0 0 2 0 0 0 3 10 13 17 4 20 23 27 """returnself._apply_series_op(lambdapsser:psser._shift(periods,fill_value),should_resolve=True)
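# The warning in the note above exists because the shift is computed as a window
# function without a partition specification; conceptually something like the
# following sketch (illustrative only, not the exact internal expression; "Col1" is
# just the example column from the docstring):
#
#     w = Window.orderBy(NATURAL_ORDER_COLUMN_NAME)
#     shifted = F.lag(F.col("Col1"), periods).over(w)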
# TODO(SPARK-46161): axis should also support 1 or 'columns'
[docs]defdiff(self,periods:int=1,axis:Axis=0)->"DataFrame":""" First discrete difference of element. Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is the element in the same column of the previous row). .. note:: the current implementation of diff uses Spark's Window without specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. Parameters ---------- periods : int, default 1 Periods to shift for calculating difference, accepts negative values. axis : int, default 0 or 'index' Can only be set to 0 now. Returns ------- diffed : DataFrame Examples -------- >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6], ... 'b': [1, 1, 2, 3, 5, 8], ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c']) >>> df a b c 0 1 1 1 1 2 1 4 2 3 2 9 3 4 3 16 4 5 5 25 5 6 8 36 >>> df.diff() a b c 0 NaN NaN NaN 1 1.0 0.0 3.0 2 1.0 1.0 5.0 3 1.0 1.0 7.0 4 1.0 2.0 9.0 5 1.0 3.0 11.0 Difference with previous column >>> df.diff(periods=3) a b c 0 NaN NaN NaN 1 NaN NaN NaN 2 NaN NaN NaN 3 3.0 2.0 15.0 4 3.0 4.0 21.0 5 3.0 6.0 27.0 Difference with following row >>> df.diff(periods=-1) a b c 0 -1.0 0.0 -3.0 1 -1.0 -1.0 -5.0 2 -1.0 -1.0 -7.0 3 -1.0 -2.0 -9.0 4 -1.0 -3.0 -11.0 5 NaN NaN NaN """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')returnself._apply_series_op(lambdapsser:psser._diff(periods),should_resolve=True)
# TODO(SPARK-46162): axis should also support 1 or 'columns'
[docs]defnunique(self,axis:Axis=0,dropna:bool=True,approx:bool=False,rsd:float=0.05,)->"Series":""" Return number of unique elements in the object. Excludes NA values by default. Parameters ---------- axis : int, default 0 or 'index' Can only be set to 0 now. dropna : bool, default True Don’t include NaN in the count. approx: bool, default False If False, will use the exact algorithm and return the exact number of unique. If True, it uses the HyperLogLog approximate algorithm, which is significantly faster for large amounts of data. Note: This parameter is specific to pandas-on-Spark and is not found in pandas. rsd: float, default 0.05 Maximum estimation error allowed in the HyperLogLog algorithm. Note: Just like ``approx`` this parameter is specific to pandas-on-Spark. Returns ------- The number of unique values per column as a pandas-on-Spark Series. Examples -------- >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [np.nan, 3, np.nan]}) >>> df.nunique() A 3 B 1 dtype: int64 >>> df.nunique(dropna=False) A 3 B 2 dtype: int64 On big data, we recommend using the approximate algorithm to speed up this function. The result will be very close to the exact unique count. >>> df.nunique(approx=True) A 3 B 1 dtype: int64 """frompyspark.pandas.seriesimportfirst_seriesaxis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')sdf=self._internal.spark_frame.select([F.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)]+[self._psser_for(label)._nunique(dropna,approx,rsd)forlabelinself._internal.column_labels])# The data is expected to be small so it's fine to transpose/use the default index.withps.option_context("compute.max_rows",1):internal=self._internal.copy(spark_frame=sdf,index_spark_columns=[scol_for(sdf,SPARK_DEFAULT_INDEX_NAME)],index_names=[None],index_fields=[None],data_spark_columns=[scol_for(sdf,col)forcolinself._internal.data_spark_column_names],data_fields=None,)returnfirst_series(DataFrame(internal).transpose())
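# The ``approx``/``rsd`` options switch between Spark's exact and HyperLogLog-based
# distinct counts; per column this is conceptually the difference between:
#
#     F.countDistinct(col)                      # approx=False (exact)
#     F.approx_count_distinct(col, rsd=0.05)    # approx=True  (HyperLogLog, rsd-bounded error)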
[docs]defround(self,decimals:Union[int,Dict[Name,int],"Series"]=0)->"DataFrame":""" Round a DataFrame to a variable number of decimal places. Parameters ---------- decimals : int, dict, Series Number of decimal places to round each column to. If an int is given, round each column to the same number of places. Otherwise dict and Series round to variable numbers of places. Column names should be in the keys if `decimals` is a dict-like, or in the index if `decimals` is a Series. Any columns not included in `decimals` will be left as is. Elements of `decimals` which are not columns of the input will be ignored. .. note:: If `decimals` is a Series, it is expected to be small, as all the data is loaded into the driver's memory. Returns ------- DataFrame See Also -------- Series.round Examples -------- >>> df = ps.DataFrame({'A':[0.028208, 0.038683, 0.877076], ... 'B':[0.992815, 0.645646, 0.149370], ... 'C':[0.173891, 0.577595, 0.491027]}, ... columns=['A', 'B', 'C'], ... index=['first', 'second', 'third']) >>> df A B C first 0.028208 0.992815 0.173891 second 0.038683 0.645646 0.577595 third 0.877076 0.149370 0.491027 >>> df.round(2) A B C first 0.03 0.99 0.17 second 0.04 0.65 0.58 third 0.88 0.15 0.49 >>> df.round({'A': 1, 'C': 2}) A B C first 0.0 0.992815 0.17 second 0.0 0.645646 0.58 third 0.9 0.149370 0.49 >>> decimals = ps.Series([1, 0, 2], index=['A', 'B', 'C']) >>> df.round(decimals) A B C first 0.0 1.0 0.17 second 0.0 1.0 0.58 third 0.9 0.0 0.49 """ifisinstance(decimals,ps.Series):decimals_dict={kifisinstance(k,tuple)else(k,):vfork,vindecimals._to_internal_pandas().items()}elifisinstance(decimals,dict):decimals_dict={kifis_name_like_tuple(k)else(k,):vfork,vindecimals.items()}elifisinstance(decimals,int):decimals_dict={k:decimalsforkinself._internal.column_labels}else:raiseTypeError("decimals must be an integer, a dict-like or a Series")defop(psser:ps.Series)->Union[ps.Series,PySparkColumn]:label=psser._column_labeliflabelindecimals_dict:returnF.round(psser.spark.column,decimals_dict[label])else:returnpsserreturnself._apply_series_op(op)
    def _mark_duplicates(
        self,
        subset: Optional[Union[Name, List[Name]]] = None,
        keep: Union[bool, str] = "first",
    ) -> Tuple[PySparkDataFrame, str]:
        if subset is None:
            subset_list = self._internal.column_labels
        else:
            if is_name_like_tuple(subset):
                subset_list = [cast(Label, subset)]
            elif is_name_like_value(subset):
                subset_list = [(subset,)]
            else:
                subset_list = [sub if is_name_like_tuple(sub) else (sub,) for sub in subset]
            diff = set(subset_list).difference(set(self._internal.column_labels))
            if len(diff) > 0:
                raise KeyError(", ".join([name_like_string(d) for d in diff]))
        group_cols = [self._internal.spark_column_name_for(label) for label in subset_list]

        sdf = self._internal.resolved_copy.spark_frame

        column = verify_temp_column_name(sdf, "__duplicated__")

        if keep == "first" or keep == "last":
            if keep == "first":
                ord_func = F.asc
            else:
                ord_func = F.desc
            window = (
                Window.partitionBy(*group_cols)
                .orderBy(ord_func(NATURAL_ORDER_COLUMN_NAME))
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)
            )
            sdf = sdf.withColumn(column, F.row_number().over(window) > 1)
        elif not keep:
            window = Window.partitionBy(*group_cols).rowsBetween(
                Window.unboundedPreceding, Window.unboundedFollowing
            )
            sdf = sdf.withColumn(column, F.count("*").over(window) > 1)
        else:
            raise ValueError("'keep' only supports 'first', 'last' and False")
        return sdf, column
[docs]defduplicated(self,subset:Optional[Union[Name,List[Name]]]=None,keep:Union[bool,str]="first",)->"Series":""" Return boolean Series denoting duplicate rows, optionally only considering certain columns. Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, default use all of the columns keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. Returns ------- duplicated : Series Examples -------- >>> df = ps.DataFrame({'a': [1, 1, 1, 3], 'b': [1, 1, 1, 4], 'c': [1, 1, 1, 5]}, ... columns = ['a', 'b', 'c']) >>> df a b c 0 1 1 1 1 1 1 1 2 1 1 1 3 3 4 5 >>> df.duplicated().sort_index() 0 False 1 True 2 True 3 False dtype: bool Mark duplicates as ``True`` except for the last occurrence. >>> df.duplicated(keep='last').sort_index() 0 True 1 True 2 False 3 False dtype: bool Mark all duplicates as ``True``. >>> df.duplicated(keep=False).sort_index() 0 True 1 True 2 True 3 False dtype: bool """frompyspark.pandas.seriesimportfirst_seriessdf,column=self._mark_duplicates(subset,keep)sdf=sdf.select(self._internal.index_spark_columns+[scol_for(sdf,column).alias(SPARK_DEFAULT_SERIES_NAME)])returnfirst_series(DataFrame(InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinself._internal.index_spark_column_names],index_names=self._internal.index_names,index_fields=self._internal.index_fields,column_labels=[None],data_spark_columns=[scol_for(sdf,SPARK_DEFAULT_SERIES_NAME)],)))
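# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet showing `duplicated` restricted to a subset of columns, which the
# docstring above does not demonstrate.  Assumes pyspark is installed and a
# local Spark session can be created.
import pyspark.pandas as ps

psdf = ps.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
# Rows 0 and 1 share the same value in column 'a', so both are flagged when
# keep=False, even though their values in column 'b' differ.
print(psdf.duplicated(subset="a", keep=False).sort_index())
# ---------------------------------------------------------------------------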
# TODO: support other as DataFrame or array-like
[docs]defdot(self,other:"Series")->"Series":""" Compute the matrix multiplication between the DataFrame and others. This method computes the matrix product between the DataFrame and the values of an other Series It can also be called using ``self @ other`` in Python >= 3.5. .. note:: This method is based on an expensive operation due to the nature of big data. Internally it needs to generate each row for each value, and then group twice - it is a huge operation. To prevent misuse, this method has the 'compute.max_rows' default limit of input length and raises a ValueError. >>> from pyspark.pandas.config import option_context >>> with option_context( ... 'compute.max_rows', 1000, "compute.ops_on_diff_frames", True ... ): # doctest: +NORMALIZE_WHITESPACE ... psdf = ps.DataFrame({'a': range(1001)}) ... psser = ps.Series([2], index=['a']) ... psdf.dot(psser) Traceback (most recent call last): ... ValueError: Current DataFrame's length exceeds the given limit of 1000 rows. Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' to retrieve more than 1000 rows. Note that, before changing the 'compute.max_rows', this operation is considerably expensive. Parameters ---------- other : Series The other object to compute the matrix product with. Returns ------- Series Return the matrix product between self and other as a Series. See Also -------- Series.dot: Similar method for Series. Notes ----- The dimensions of DataFrame and other must be compatible to compute the matrix multiplication. In addition, the column names of DataFrame and the index of other must contain the same values, as they will be aligned prior to the multiplication. The dot method for Series computes the inner product, instead of the matrix product here. Examples -------- >>> from pyspark.pandas.config import set_option, reset_option >>> set_option("compute.ops_on_diff_frames", True) >>> psdf = ps.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) >>> psser = ps.Series([1, 1, 2, 1]) >>> psdf.dot(psser) 0 -4 1 5 dtype: int64 Note how shuffling of the objects does not change the result. >>> psser2 = psser.reindex([1, 0, 2, 3]) >>> psdf.dot(psser2) 0 -4 1 5 dtype: int64 >>> psdf @ psser2 0 -4 1 5 dtype: int64 >>> reset_option("compute.ops_on_diff_frames") """ifnotisinstance(other,ps.Series):raiseTypeError("Unsupported type {}".format(type(other).__name__))else:returncast(ps.Series,other.dot(self.transpose())).rename(None)
    def __matmul__(self, other: "Series") -> "Series":
        """
        Matrix multiplication using binary `@` operator in Python >= 3.5.
        """
        return self.dot(other)
[docs]
    def to_table(
        self,
        name: str,
        format: Optional[str] = None,
        mode: str = "w",
        partition_cols: Optional[Union[str, List[str]]] = None,
        index_col: Optional[Union[str, List[str]]] = None,
        **options: Any,
    ) -> None:
        if index_col is None:
            log_advice(
                "If `index_col` is not specified for `to_table`, "
                "the existing index is lost when converting to table."
            )
        mode = validate_mode(mode)
        return self.spark.to_table(name, format, mode, partition_cols, index_col, **options)
[docs]defto_delta(self,path:str,mode:str="w",partition_cols:Optional[Union[str,List[str]]]=None,index_col:Optional[Union[str,List[str]]]=None,**options:"OptionalPrimitiveType",)->None:""" Write the DataFrame out as a Delta Lake table. Parameters ---------- path : str, required Path to write to. mode : str Python write mode, default 'w'. .. note:: mode can accept the strings for Spark writing mode. Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'. - 'append' (equivalent to 'a'): Append the new data to existing data. - 'overwrite' (equivalent to 'w'): Overwrite existing data. - 'ignore': Silently ignore this operation if data already exists. - 'error' or 'errorifexists': Throw an exception if data already exists. partition_cols : str or list of str, optional, default None Names of partitioning columns index_col: str or list of str, optional, default: None Column names to be used in Spark to represent pandas-on-Spark's index. The index name in pandas-on-Spark is ignored. By default the index is always lost. options : dict All other options passed directly into Delta Lake. See Also -------- read_delta DataFrame.to_parquet DataFrame.to_table Examples -------- >>> df = ps.DataFrame(dict( ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), ... country=['KR', 'US', 'JP'], ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) >>> df date country code 0 2012-01-31 12:00:00 KR 1 1 2012-02-29 12:00:00 US 2 2 2012-03-31 12:00:00 JP 3 Create a new Delta Lake table, partitioned by one column: >>> df.to_delta('%s/to_delta/foo' % path, partition_cols='date') # doctest: +SKIP Partitioned by two columns: >>> df.to_delta('%s/to_delta/bar' % path, ... partition_cols=['date', 'country']) # doctest: +SKIP Overwrite an existing table's partitions, using the 'replaceWhere' capability in Delta: >>> df.to_delta('%s/to_delta/bar' % path, ... mode='overwrite', replaceWhere='date >= "2012-01-01"') # doctest: +SKIP """ifindex_colisNone:log_advice("If `index_col` is not specified for `to_delta`, ""the existing index is lost when converting to Delta.")if"options"inoptionsandisinstance(options.get("options"),dict)andlen(options)==1:options=options.get("options")# type: ignore[assignment]mode=validate_mode(mode)self.spark.to_spark_io(path=path,mode=mode,format="delta",partition_cols=partition_cols,index_col=index_col,**options,)
[docs]defto_parquet(self,path:str,mode:str="w",partition_cols:Optional[Union[str,List[str]]]=None,compression:Optional[str]=None,index_col:Optional[Union[str,List[str]]]=None,**options:Any,)->None:""" Write the DataFrame out as a Parquet file or directory. Parameters ---------- path : str, required Path to write to. mode : str Python write mode, default 'w'. .. note:: mode can accept the strings for Spark writing mode. Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'. - 'append' (equivalent to 'a'): Append the new data to existing data. - 'overwrite' (equivalent to 'w'): Overwrite existing data. - 'ignore': Silently ignore this operation if data already exists. - 'error' or 'errorifexists': Throw an exception if data already exists. partition_cols : str or list of str, optional, default None Names of partitioning columns compression : str {'none', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'} Compression codec to use when saving to file. If None is set, it uses the value specified in `spark.sql.parquet.compression.codec`. index_col: str or list of str, optional, default: None Column names to be used in Spark to represent pandas-on-Spark's index. The index name in pandas-on-Spark is ignored. By default the index is always lost. options : dict All other options passed directly into Spark's data source. See Also -------- read_parquet DataFrame.to_delta DataFrame.to_table Examples -------- >>> df = ps.DataFrame(dict( ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), ... country=['KR', 'US', 'JP'], ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) >>> df date country code 0 2012-01-31 12:00:00 KR 1 1 2012-02-29 12:00:00 US 2 2 2012-03-31 12:00:00 JP 3 >>> df.to_parquet('%s/to_parquet/foo.parquet' % path, partition_cols='date') >>> df.to_parquet( ... '%s/to_parquet/foo.parquet' % path, ... mode = 'overwrite', ... partition_cols=['date', 'country']) Notes ----- pandas API on Spark writes Parquet files into the directory, `path`, and writes multiple part files in the directory unlike pandas. pandas API on Spark respects HDFS's property such as 'fs.default.name'. """ifindex_colisNone:log_advice("If `index_col` is not specified for `to_parquet`, ""the existing index is lost when converting to Parquet.")if"options"inoptionsandisinstance(options.get("options"),dict)andlen(options)==1:options=options.get("options")mode=validate_mode(mode)builder=self.to_spark(index_col=index_col).write.mode(mode)ifpartition_colsisnotNone:builder.partitionBy(partition_cols)ifcompressionisnotNone:builder.option("compression",compression)builder.options(**options).format("parquet").save(path)
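# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet showing how `index_col` preserves the index across a Parquet round
# trip, which is exactly what the log_advice above warns about.  The path
# below is hypothetical; assumes pyspark is installed and the location is
# writable.
import pyspark.pandas as ps

path = "/tmp/ps_to_parquet_roundtrip"  # hypothetical example path
psdf = ps.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
psdf.to_parquet(path, mode="overwrite", index_col="index")
# Reading back with the same index_col restores the index (named "index").
restored = ps.read_parquet(path, index_col="index")
print(restored.sort_index().to_pandas())
# ---------------------------------------------------------------------------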
[docs]defto_orc(self,path:str,mode:str="w",partition_cols:Optional[Union[str,List[str]]]=None,index_col:Optional[Union[str,List[str]]]=None,**options:"OptionalPrimitiveType",)->None:""" Write a DataFrame to the ORC format. Parameters ---------- path : str Path to write to. mode : str Python write mode, default 'w'. .. note:: mode can accept the strings for Spark writing mode. Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'. - 'append' (equivalent to 'a'): Append the new data to existing data. - 'overwrite' (equivalent to 'w'): Overwrite existing data. - 'ignore': Silently ignore this operation if data already exists. - 'error' or 'errorifexists': Throw an exception if data already exists. partition_cols : str or list of str, optional, default None Names of partitioning columns index_col: str or list of str, optional, default: None Column names to be used in Spark to represent pandas-on-Spark's index. The index name in pandas-on-Spark is ignored. By default the index is always lost. options : dict All other options passed directly into Spark's data source. See Also -------- read_orc DataFrame.to_delta DataFrame.to_parquet DataFrame.to_table Examples -------- >>> df = ps.DataFrame(dict( ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), ... country=['KR', 'US', 'JP'], ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) >>> df date country code 0 2012-01-31 12:00:00 KR 1 1 2012-02-29 12:00:00 US 2 2 2012-03-31 12:00:00 JP 3 >>> df.to_orc('%s/to_orc/foo.orc' % path, partition_cols='date') >>> df.to_orc( ... '%s/to_orc/foo.orc' % path, ... mode = 'overwrite', ... partition_cols=['date', 'country']) Notes ----- pandas API on Spark writes ORC files into the directory, `path`, and writes multiple part files in the directory unlike pandas. pandas API on Spark respects HDFS's property such as 'fs.default.name'. """ifindex_colisNone:log_advice("If `index_col` is not specified for `to_orc`, ""the existing index is lost when converting to ORC.")if"options"inoptionsandisinstance(options.get("options"),dict)andlen(options)==1:options=options.get("options")# type: ignore[assignment]mode=validate_mode(mode)self.spark.to_spark_io(path=path,mode=mode,format="orc",partition_cols=partition_cols,index_col=index_col,**options,)
[docs]
    def to_spark(self, index_col: Optional[Union[str, List[str]]] = None) -> PySparkDataFrame:
        if index_col is None:
            log_advice(
                "If `index_col` is not specified for `to_spark`, "
                "the existing index is lost when converting to Spark DataFrame."
            )
        return self._to_spark(index_col)
    to_spark.__doc__ = SparkFrameMethods.__doc__

    def _to_spark(self, index_col: Optional[Union[str, List[str]]] = None) -> PySparkDataFrame:
        """
        Same as `to_spark()`, without issuing the advice log when `index_col` is not specified
        for internal usage.
        """
        return self.spark.frame(index_col)
[docs]defto_pandas(self)->pd.DataFrame:""" Return a pandas DataFrame. .. note:: This method should only be used if the resulting pandas DataFrame is expected to be small, as all the data is loaded into the driver's memory. Examples -------- >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], ... columns=['dogs', 'cats']) >>> df.to_pandas() dogs cats 0 0.2 0.3 1 0.0 0.6 2 0.6 0.0 3 0.2 0.1 """log_advice("`to_pandas` loads all data into the driver's memory. ""It should only be used if the resulting pandas DataFrame is expected to be small.")returnself._to_pandas()
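# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet showing a common way to stay within driver memory when only a
# sample is needed: truncate with head() before calling to_pandas().  Note
# that head() is not deterministic unless "compute.ordered_head" is enabled.
# Assumes pyspark is installed and a local Spark session can be created.
import pyspark.pandas as ps

psdf = ps.DataFrame({"value": range(10_000)})
sample_pdf = psdf.head(100).to_pandas()  # only 100 rows reach the driver
print(len(sample_pdf))
# ---------------------------------------------------------------------------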
    def _to_pandas(self) -> pd.DataFrame:
        """
        Same as `to_pandas()`, without issuing the advice log for internal usage.
        """
        return self._internal.to_pandas_frame.copy()
[docs]defassign(self,**kwargs:Any)->"DataFrame":""" Assign new columns to a DataFrame. Returns a new object with all original columns in addition to new ones. Existing columns that are re-assigned will be overwritten. Parameters ---------- **kwargs : dict of {str: callable, Series or Index} The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not change input DataFrame (though pandas-on-Spark doesn't check it). If the values are not callable, (e.g. a Series or a literal), they are simply assigned. Returns ------- DataFrame A new DataFrame with the new columns in addition to all the existing columns. Examples -------- >>> df = ps.DataFrame({'temp_c': [17.0, 25.0]}, ... index=['Portland', 'Berkeley']) >>> df temp_c Portland 17.0 Berkeley 25.0 Where the value is a callable, evaluated on `df`: >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) temp_c temp_f Portland 17.0 62.6 Berkeley 25.0 77.0 Alternatively, the same behavior can be achieved by directly referencing an existing Series or sequence and you can also create multiple columns within the same assign. >>> assigned = df.assign(temp_f=df['temp_c'] * 9 / 5 + 32, ... temp_k=df['temp_c'] + 273.15, ... temp_idx=df.index) >>> assigned[['temp_c', 'temp_f', 'temp_k', 'temp_idx']] temp_c temp_f temp_k temp_idx Portland 17.0 62.6 290.15 Portland Berkeley 25.0 77.0 298.15 Berkeley Notes ----- Assigning multiple columns within the same ``assign`` is possible but you cannot refer to newly created or modified columns. This feature is supported in pandas for Python 3.6 and later but not in pandas-on-Spark. In pandas-on-Spark, all items are computed first, and then assigned. """returnself._assign(kwargs)
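# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet for the note above that columns created in the same assign() call
# cannot reference each other in pandas-on-Spark; chain two calls instead.
# Assumes pyspark is installed and a local Spark session can be created.
import pyspark.pandas as ps

psdf = ps.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"])
psdf = psdf.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
# temp_f now exists, so a second assign() may build on it.
psdf = psdf.assign(temp_f_rounded=lambda x: x.temp_f.round(1))
print(psdf.sort_index().to_pandas())
# ---------------------------------------------------------------------------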
def_assign(self,kwargs:Any)->"DataFrame":assertisinstance(kwargs,dict)frompyspark.pandas.indexesimportMultiIndexfrompyspark.pandas.seriesimportIndexOpsMixinColumn=get_column_class()fork,vinkwargs.items():is_invalid_assignee=(not(isinstance(v,(IndexOpsMixin,Column))orcallable(v)oris_scalar(v)))orisinstance(v,MultiIndex)ifis_invalid_assignee:raiseTypeError("Column assignment doesn't support type ""{0}".format(type(v).__name__))ifcallable(v):kwargs[k]=v(self)pairs={(kifis_name_like_tuple(k)else(k,)):((v.spark.column,v._internal.data_fields[0])ifisinstance(v,IndexOpsMixin)andnotisinstance(v,MultiIndex)else(v,None)ifisinstance(v,Column)else(F.lit(v),None))fork,vinkwargs.items()}scols=[]data_fields=[]forlabelinself._internal.column_labels:foriinrange(len(label)):iflabel[:len(label)-i]inpairs:scol,field=pairs[label[:len(label)-i]]name=self._internal.spark_column_name_for(label)scol=scol.alias(name)iffieldisnotNone:field=field.copy(name=name)breakelse:scol=self._internal.spark_column_for(label)field=self._internal.field_for(label)scols.append(scol)data_fields.append(field)column_labels=self._internal.column_labels.copy()forlabel,(scol,field)inpairs.items():iflabelnotinset(i[:len(label)]foriinself._internal.column_labels):name=name_like_string(label)scols.append(scol.alias(name))iffieldisnotNone:field=field.copy(name=name)data_fields.append(field)column_labels.append(label)level=self._internal.column_labels_levelcolumn_labels=[tuple(list(label)+([""]*(level-len(label))))forlabelincolumn_labels]internal=self._internal.with_new_columns(cast(Sequence[Union[PySparkColumn,"Series"]],scols),column_labels=column_labels,data_fields=data_fields,)returnDataFrame(internal)
[docs]@staticmethoddeffrom_records(data:Union[np.ndarray,List[tuple],dict,pd.DataFrame],index:Union[str,list,np.ndarray]=None,exclude:list=None,columns:list=None,coerce_float:bool=False,nrows:int=None,)->"DataFrame":""" Convert structured or recorded ndarray to DataFrame. Parameters ---------- data : ndarray (structured dtype), list of tuples, dict, or DataFrame .. deprecated:: 4.0.0 Passing a DataFrame is deprecated. index : string, list of fields, array-like Field of array to use as the index, alternately a specific set of input labels to use exclude : sequence, default None Columns or fields to exclude columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the columns. Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA columns) coerce_float : boolean, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets nrows : int, default None Number of rows to read if data is an iterator Returns ------- df : DataFrame Examples -------- Use dict as input >>> ps.DataFrame.from_records({'A': [1, 2, 3]}) A 0 1 1 2 2 3 Use list of tuples as input >>> ps.DataFrame.from_records([(1, 2), (3, 4)]) 0 1 0 1 2 1 3 4 Use NumPy array as input >>> ps.DataFrame.from_records(np.eye(3)) 0 1 2 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """returnDataFrame(pd.DataFrame.from_records(data,index,exclude,columns,coerce_float,nrows))
[docs]defto_records(self,index:bool=True,column_dtypes:Optional[Union[str,Dtype,Dict[Name,Union[str,Dtype]]]]=None,index_dtypes:Optional[Union[str,Dtype,Dict[Name,Union[str,Dtype]]]]=None,)->np.recarray:""" Convert DataFrame to a NumPy record array. Index will be included as the first field of the record array if requested. .. note:: This method should only be used if the resulting NumPy ndarray is expected to be small, as all the data is loaded into the driver's memory. Parameters ---------- index : bool, default True Include index in resulting record array, stored in 'index' field or using the index label, if set. column_dtypes : str, type, dict, default None If a string or type, the data type to store all columns. If a dictionary, a mapping of column names and indices (zero-indexed) to specific data types. index_dtypes : str, type, dict, default None If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices (zero-indexed) to specific data types. This mapping is applied only if `index=True`. Returns ------- numpy.recarray NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries. See Also -------- DataFrame.from_records: Convert structured or record ndarray to DataFrame. numpy.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. Examples -------- >>> df = ps.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, ... index=['a', 'b']) >>> df A B a 1 0.50 b 2 0.75 >>> df.to_records() # doctest: +SKIP rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) The index can be excluded from the record array: >>> df.to_records(index=False) # doctest: +SKIP rec.array([(1, 0.5 ), (2, 0.75)], dtype=[('A', '<i8'), ('B', '<f8')]) Specification of dtype for columns is new in pandas 0.24.0. Data types can be specified for the columns: >>> df.to_records(column_dtypes={"A": "int32"}) # doctest: +SKIP rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], dtype=[('index', 'O'), ('A', '<i4'), ('B', '<f8')]) Specification of dtype for index is new in pandas 0.24.0. Data types can also be specified for the index: >>> df.to_records(index_dtypes="<S2") # doctest: +SKIP rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('index', 'S2'), ('A', '<i8'), ('B', '<f8')]) """args=locals()psdf=selfreturnvalidate_arguments_and_invoke_function(psdf._to_internal_pandas(),self.to_records,pd.DataFrame.to_records,args)
[docs]defcopy(self,deep:bool=True)->"DataFrame":""" Make a copy of this object's indices and data. Parameters ---------- deep : bool, default True this parameter is not supported but just dummy parameter to match pandas. Returns ------- copy : DataFrame Examples -------- >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]}, ... columns=['x', 'y', 'z', 'w']) >>> df x y z w 0 1 3 5 7 1 2 4 6 8 >>> df_copy = df.copy() >>> df_copy x y z w 0 1 3 5 7 1 2 4 6 8 """returnDataFrame(self._internal)
[docs]defdropna(self,axis:Axis=0,how:str="any",thresh:Optional[int]=None,subset:Optional[Union[Name,List[Name]]]=None,inplace:bool=False,)->Optional["DataFrame"]:""" Remove missing values. Parameters ---------- axis : {0 or 'index'}, default 0 Determine if rows or columns which contain missing values are removed. * 0, or 'index' : Drop rows which contain missing values. how : {'any', 'all'}, default 'any' Determine if row or column is removed from DataFrame, when we have at least one NA or all NA. * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. thresh : int, optional Require that many non-NA values. subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False If True, do operation inplace and return None. Returns ------- DataFrame DataFrame with NA entries dropped from it. See Also -------- DataFrame.drop : Drop specified labels from columns. DataFrame.isnull: Indicate missing values. DataFrame.notnull : Indicate existing (non-missing) values. Examples -------- >>> df = ps.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [None, 'Batmobile', 'Bullwhip'], ... "born": [None, "1940-04-25", None]}, ... columns=['name', 'toy', 'born']) >>> df name toy born 0 Alfred None None 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip None Drop the rows where at least one element is missing. >>> df.dropna() name toy born 1 Batman Batmobile 1940-04-25 Drop the columns where at least one element is missing. >>> df.dropna(axis='columns') name 0 Alfred 1 Batman 2 Catwoman Drop the rows where all elements are missing. >>> df.dropna(how='all') name toy born 0 Alfred None None 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip None Keep only the rows with at least 2 non-NA values. >>> df.dropna(thresh=2) name toy born 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip None Define in which columns to look for missing values. >>> df.dropna(subset=['name', 'born']) name toy born 1 Batman Batmobile 1940-04-25 Keep the DataFrame with valid entries in the same variable. 
>>> df.dropna(inplace=True) >>> df name toy born 1 Batman Batmobile 1940-04-25 """axis=validate_axis(axis)inplace=validate_bool_kwarg(inplace,"inplace")ifthreshisNone:ifhowisNone:raiseTypeError("must specify how or thresh")elifhownotin("any","all"):raiseValueError("invalid how option: {h}".format(h=how))labels:Optional[List[Label]]ifsubsetisnotNone:ifisinstance(subset,str):labels=[(subset,)]elifisinstance(subset,tuple):labels=[subset]else:labels=[subifisinstance(sub,tuple)else(sub,)forsubinsubset]else:labels=Noneifaxis==0:iflabelsisnotNone:invalids=[labelforlabelinlabelsiflabelnotinself._internal.column_labels]iflen(invalids)>0:raiseKeyError(invalids)else:labels=self._internal.column_labelscnt=reduce(lambdax,y:x+y,[F.when(self._psser_for(label).notna().spark.column,1).otherwise(0)forlabelinlabels],F.lit(0),)ifthreshisnotNone:pred=cnt>=F.lit(int(thresh))elifhow=="any":pred=cnt==F.lit(len(labels))elifhow=="all":pred=cnt>F.lit(0)internal=self._internal.with_filter(pred)ifinplace:self._update_internal_frame(internal)returnNoneelse:returnDataFrame(internal)else:assertaxis==1internal=self._internal.resolved_copyiflabelsisnotNone:ifany(len(lbl)!=internal.index_levelforlblinlabels):raiseValueError("The length of each subset must be the same as the index size.")cond=reduce(lambdax,y:x|y,[reduce(lambdax,y:x&y,[scol==F.lit(part)forpart,scolinzip(lbl,internal.index_spark_columns)],)forlblinlabels],)internal=internal.with_filter(cond)psdf:DataFrame=DataFrame(internal)null_counts=[]forlabelininternal.column_labels:psser=psdf._psser_for(label)cond=psser.isnull().spark.columnnull_counts.append(F.sum(F.when(~cond,1).otherwise(0)).alias(name_like_string(label)))counts=internal.spark_frame.select(null_counts+[F.count("*")]).head()ifthreshisnotNone:column_labels=[labelforlabel,cntinzip(internal.column_labels,counts)if(cntor0)>=int(thresh)]elifhow=="any":column_labels=[labelforlabel,cntinzip(internal.column_labels,counts)if(cntor0)==counts[-1]]elifhow=="all":column_labels=[labelforlabel,cntinzip(internal.column_labels,counts)if(cntor0)>0]psdf=self[column_labels]ifinplace:self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf
# TODO: add 'limit' when value parameter exists
[docs]deffillna(self,value:Optional[Union[Any,Dict[Name,Any]]]=None,method:Optional[str]=None,axis:Optional[Axis]=None,inplace:bool=False,limit:Optional[int]=None,)->Optional["DataFrame"]:"""Fill NA/NaN values. .. note:: the current implementation of 'method' parameter in fillna uses Spark's Window without specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. Parameters ---------- value : scalar, dict, Series Value to use to fill holes. alternately a dict/Series of values specifying which value to use for each column. DataFrame is not supported. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use NEXT valid observation to fill gap .. deprecated:: 4.0.0 axis : {0 or `index`} 1 and `columns` are not supported. inplace : boolean, default False Fill in place (do not create a new object) limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is a gap with more than this number of consecutive NaNs, it will only be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None .. deprecated:: 4.0.0 Returns ------- DataFrame DataFrame with NA entries filled. Examples -------- >>> df = ps.DataFrame({ ... 'A': [None, 3, None, None], ... 'B': [2, 4, None, 3], ... 'C': [None, None, None, 1], ... 'D': [0, 1, 5, 4] ... }, ... columns=['A', 'B', 'C', 'D']) >>> df A B C D 0 NaN 2.0 NaN 0 1 3.0 4.0 NaN 1 2 NaN NaN NaN 5 3 NaN 3.0 1.0 4 Replace all NaN elements with 0s. >>> df.fillna(0) A B C D 0 0.0 2.0 0.0 0 1 3.0 4.0 0.0 1 2 0.0 0.0 0.0 5 3 0.0 3.0 1.0 4 We can also propagate non-null values forward or backward. >>> df.fillna(method='ffill') A B C D 0 NaN 2.0 NaN 0 1 3.0 4.0 NaN 1 2 3.0 4.0 NaN 5 3 3.0 3.0 1.0 4 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, 2, and 3 respectively. >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} >>> df.fillna(value=values) A B C D 0 0.0 2.0 2.0 0 1 3.0 4.0 2.0 1 2 0.0 1.0 2.0 5 3 0.0 3.0 1.0 4 """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError("fillna currently only works for axis=0 or axis='index'")ifvalueisnotNone:ifnotisinstance(value,(float,int,str,bool,dict,pd.Series)):raiseTypeError("Unsupported type %s"%type(value).__name__)iflimitisnotNone:raiseValueError("limit parameter for value is not support now")ifisinstance(value,pd.Series):value=value.to_dict()ifisinstance(value,dict):forvinvalue.values():ifnotisinstance(v,(float,int,str,bool)):raiseTypeError("Unsupported type %s"%type(v).__name__)value={kifis_name_like_tuple(k)else(k,):vfork,vinvalue.items()}defop(psser:ps.Series)->ps.Series:label=psser._column_labelfork,vinvalue.items():ifk==label[:len(k)]:returnpsser._fillna(value=value[k],method=method,axis=axis,limit=limit)else:returnpsserelse:defop(psser:ps.Series)->ps.Series:returnpsser._fillna(value=value,method=method,axis=axis,limit=limit)elifmethodisnotNone:warnings.warn("DataFrame.fillna with 'method' is deprecated and will raise in a future version. 
""Use DataFrame.ffill() or DataFrame.bfill() instead.",FutureWarning,)defop(psser:ps.Series)->ps.Series:returnpsser._fillna(value=value,method=method,axis=axis,limit=limit)else:raiseValueError("Must specify a fillna 'value' or 'method' parameter.")psdf=self._apply_series_op(op,should_resolve=(methodisnotNone))inplace=validate_bool_kwarg(inplace,"inplace")ifinplace:self._update_internal_frame(psdf._internal,check_same_anchor=False)returnNoneelse:returnpsdf
[docs]
    def interpolate(
        self,
        method: str = "linear",
        limit: Optional[int] = None,
        limit_direction: Optional[str] = None,
        limit_area: Optional[str] = None,
    ) -> "DataFrame":
        if method not in ["linear"]:
            raise NotImplementedError("interpolate currently works only for method='linear'")
        if (limit is not None) and (not limit > 0):
            raise ValueError("limit must be > 0.")
        if (limit_direction is not None) and (
            limit_direction not in ["forward", "backward", "both"]
        ):
            raise ValueError("invalid limit_direction: '{}'".format(limit_direction))
        if (limit_area is not None) and (limit_area not in ["inside", "outside"]):
            raise ValueError("invalid limit_area: '{}'".format(limit_area))

        for dtype in self.dtypes.values:
            if dtype == "object":
                warnings.warn(
                    "DataFrame.interpolate with object dtype is deprecated and will raise in a "
                    "future version. Convert to a specific numeric type before interpolating.",
                    FutureWarning,
                )

        numeric_col_names = []
        for label in self._internal.column_labels:
            psser = self._psser_for(label)
            if isinstance(psser.spark.data_type, (NumericType, BooleanType)):
                numeric_col_names.append(psser.name)
        if len(numeric_col_names) == 0:
            raise TypeError(
                "Cannot interpolate with all object-dtype columns in the DataFrame. "
                "Try setting at least one column to a numeric dtype."
            )
        psdf = self[numeric_col_names]
        return psdf._apply_series_op(
            lambda psser: psser._interpolate(
                method=method, limit=limit, limit_direction=limit_direction, limit_area=limit_area
            ),
            should_resolve=True,
        )
[docs]defreplace(self,to_replace:Optional[Union[Any,List,Tuple,Dict]]=None,value:Optional[Any]=None,inplace:bool=False,limit:Optional[int]=None,regex:bool=False,method:str="pad",)->Optional["DataFrame"]:""" Returns a new DataFrame replacing a value with another value. Parameters ---------- to_replace : int, float, string, list, tuple or dict Value to be replaced. value : int, float, string, list or tuple Value to use to replace holes. The replacement value must be an int, float, or string. If value is a list or tuple, value should be of the same length with to_replace. inplace : boolean, default False Fill in place (do not create a new object) limit : int, default None Maximum size gap to forward or backward fill. .. deprecated:: 4.0.0 regex : bool or str, default False Whether to interpret to_replace and/or value as regular expressions. If this is True then to_replace must be a string. Alternatively, this could be a regular expression in which case to_replace must be None. method : 'pad', default None The method to use when for replacement, when to_replace is a scalar, list or tuple and value is None. .. deprecated:: 4.0.0 Returns ------- DataFrame Object after replacement. Examples -------- >>> df = ps.DataFrame({"name": ['Ironman', 'Captain America', 'Thor', 'Hulk'], ... "weapon": ['Mark-45', 'Shield', 'Mjolnir', 'Smash']}, ... columns=['name', 'weapon']) >>> df name weapon 0 Ironman Mark-45 1 Captain America Shield 2 Thor Mjolnir 3 Hulk Smash Scalar `to_replace` and `value` >>> df.replace('Ironman', 'War-Machine') name weapon 0 War-Machine Mark-45 1 Captain America Shield 2 Thor Mjolnir 3 Hulk Smash List like `to_replace` and `value` >>> df.replace(['Ironman', 'Captain America'], ['Rescue', 'Hawkeye'], inplace=True) >>> df name weapon 0 Rescue Mark-45 1 Hawkeye Shield 2 Thor Mjolnir 3 Hulk Smash Dicts can be used to specify different replacement values for different existing values To use a dict in this way the value parameter should be None >>> df.replace({'Mjolnir': 'Stormbuster'}) name weapon 0 Rescue Mark-45 1 Hawkeye Shield 2 Thor Stormbuster 3 Hulk Smash Dict can specify that different values should be replaced in different columns The value parameter should not be None in this case >>> df.replace({'weapon': 'Mjolnir'}, 'Stormbuster') name weapon 0 Rescue Mark-45 1 Hawkeye Shield 2 Thor Stormbuster 3 Hulk Smash Nested dictionaries The value parameter should be None to use a nested dict in this way >>> df.replace({'weapon': {'Mjolnir': 'Stormbuster'}}) name weapon 0 Rescue Mark-45 1 Hawkeye Shield 2 Thor Stormbuster 3 Hulk Smash """ifmethod!="pad":warnings.warn("The 'method' keyword in DataFrame.replace is deprecated ""and will be removed in a future version.",FutureWarning,)raiseNotImplementedError("replace currently works only for method='pad")iflimitisnotNone:warnings.warn("The 'limit' keyword in DataFrame.replace is deprecated ""and will be removed in a future version.",FutureWarning,)raiseNotImplementedError("replace currently works only when limit=None")ifregexisnotFalse:raiseNotImplementedError("replace currently doesn't supports regex")inplace=validate_bool_kwarg(inplace,"inplace")ifvalueisnotNoneandnotisinstance(value,(int,float,str,list,tuple,dict)):raiseTypeError("Unsupported type {}".format(type(value).__name__))ifto_replaceisnotNoneandnotisinstance(to_replace,(int,float,str,list,tuple,dict)):raiseTypeError("Unsupported type 
{}".format(type(to_replace).__name__))ifisinstance(value,(list,tuple))andisinstance(to_replace,(list,tuple)):iflen(value)!=len(to_replace):raiseValueError("Length of to_replace and value must be same")ifisinstance(to_replace,dict)and(valueisnotNoneorall(isinstance(i,dict)foriinto_replace.values())):to_replace_dict=to_replacedefop(psser:ps.Series)->ps.Series:ifpsser.nameinto_replace_dict:returnpsser.replace(to_replace=to_replace_dict[psser.name],value=value,regex=regex)else:returnpsserelse:ifvalueisNone:warnings.warn("DataFrame.replace without 'value' and with non-dict-like 'to_replace' ""is deprecated and will raise in a future version. ""Explicitly specify the new values instead.",FutureWarning,)defop(psser:ps.Series)->ps.Series:returnpsser.replace(to_replace=to_replace,value=value,regex=regex# type: ignore[arg-type])psdf=self._apply_series_op(op)ifinplace:self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf
[docs]defclip(self,lower:Union[float,int]=None,upper:Union[float,int]=None)->"DataFrame":""" Trim values at input threshold(s). Assigns values outside boundary-to-boundary values. Parameters ---------- lower : float or int, default None Minimum threshold value. All values below this threshold will be set to it. upper : float or int, default None Maximum threshold value. All values above this threshold will be set to it. Returns ------- DataFrame DataFrame with the values outside the clip boundaries replaced. Examples -------- >>> ps.DataFrame({'A': [0, 2, 4]}).clip(1, 3) A 0 1 1 2 2 3 Notes ----- One difference between this implementation and pandas is that running pd.DataFrame({'A': ['a', 'b']}).clip(0, 1) will crash with "TypeError: '<=' not supported between instances of 'str' and 'int'" while ps.DataFrame({'A': ['a', 'b']}).clip(0, 1) will output the original DataFrame, simply ignoring the incompatible types. """ifis_list_like(lower)oris_list_like(upper):raiseTypeError("List-like value are not supported for 'lower' and 'upper' at the "+"moment")iflowerisNoneandupperisNone:returnselfreturnself._apply_series_op(lambdapsser:psser.clip(lower=lower,upper=upper))
[docs]defhead(self,n:int=5)->"DataFrame":""" Return the first `n` rows. This function returns the first `n` rows for the object based on position. It is useful for quickly testing if your object has the right type of data in it. Parameters ---------- n : int, default 5 Number of rows to select. Returns ------- obj_head : same type as caller The first `n` rows of the caller object. Examples -------- >>> df = ps.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df animal 0 alligator 1 bee 2 falcon 3 lion 4 monkey 5 parrot 6 shark 7 whale 8 zebra Viewing the first 5 lines >>> df.head() animal 0 alligator 1 bee 2 falcon 3 lion 4 monkey Viewing the first `n` lines (three in this case) >>> df.head(3) animal 0 alligator 1 bee 2 falcon """ifn<0:n=len(self)+nifn<=0:returnDataFrame(self._internal.with_filter(F.lit(False)))else:sdf=self._internal.resolved_copy.spark_frameifget_option("compute.ordered_head"):sdf=sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)returnDataFrame(self._internal.with_new_sdf(sdf.limit(n)))
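# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet showing the "compute.ordered_head" option referenced in head()
# above; enabling it orders by the natural-order column first, making the
# returned rows deterministic at the cost of extra work.  Assumes pyspark is
# installed and a local Spark session can be created.
import pyspark.pandas as ps

psdf = ps.DataFrame({"animal": ["alligator", "bee", "falcon", "lion"]})
with ps.option_context("compute.ordered_head", True):
    print(psdf.head(2))
# ---------------------------------------------------------------------------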
[docs]deflast(self,offset:Union[str,DateOffset])->"DataFrame":""" Select final periods of time series data based on a date offset. When having a DataFrame with dates as index, this function can select the last few rows based on a date offset. .. deprecated:: 4.0.0 Parameters ---------- offset : str or DateOffset The offset length of the data that will be selected. For instance, '3D' will display all the rows having their index within the last 3 days. Returns ------- DataFrame A subset of the caller. Raises ------ TypeError If the index is not a :class:`DatetimeIndex` Examples -------- >>> index = pd.date_range('2018-04-09', periods=4, freq='2D') >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index) >>> psdf A 2018-04-09 1 2018-04-11 2 2018-04-13 3 2018-04-15 4 Get the rows for the last 3 days: >>> psdf.last('3D') A 2018-04-13 3 2018-04-15 4 Notice the data for 3 last calendar days were returned, not the last 3 observed days in the dataset, and therefore data for 2018-04-11 was not returned. """warnings.warn("last is deprecated and will be removed in a future version. ""Please create a mask and filter using `.loc` instead",FutureWarning,)# Check index type should be format DateTimeifnotisinstance(self.index,ps.DatetimeIndex):raiseTypeError("'last' only supports a DatetimeIndex")from_date=cast(int,cast(datetime.datetime,self.index.max())-cast(datetime.timedelta,to_offset(offset)),)returncast(DataFrame,self.loc[from_date:])
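# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet showing the `.loc` filtering that the deprecation warning above
# recommends instead of last(); it mirrors what last() does internally
# (index.max() minus the offset, then a label slice).  Assumes pyspark is
# installed and a local Spark session can be created.
import pandas as pd
import pyspark.pandas as ps

index = pd.date_range("2018-04-09", periods=4, freq="2D")
psdf = ps.DataFrame({"A": [1, 2, 3, 4]}, index=index)
cutoff = psdf.index.max() - pd.Timedelta(days=3)
print(psdf.loc[cutoff:])  # same rows as the deprecated psdf.last("3D")
# ---------------------------------------------------------------------------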
[docs]deffirst(self,offset:Union[str,DateOffset])->"DataFrame":""" Select first periods of time series data based on a date offset. When having a DataFrame with dates as index, this function can select the first few rows based on a date offset. .. deprecated:: 4.0.0 Parameters ---------- offset : str or DateOffset The offset length of the data that will be selected. For instance, '3D' will display all the rows having their index within the first 3 days. Returns ------- DataFrame A subset of the caller. Raises ------ TypeError If the index is not a :class:`DatetimeIndex` Examples -------- >>> index = pd.date_range('2018-04-09', periods=4, freq='2D') >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index) >>> psdf A 2018-04-09 1 2018-04-11 2 2018-04-13 3 2018-04-15 4 Get the rows for the last 3 days: >>> psdf.first('3D') A 2018-04-09 1 2018-04-11 2 Notice the data for 3 first calendar days were returned, not the first 3 observed days in the dataset, and therefore data for 2018-04-13 was not returned. """warnings.warn("first is deprecated and will be removed in a future version. ""Please create a mask and filter using `.loc` instead",FutureWarning,)# Check index type should be format DatetimeIndexifnotisinstance(self.index,ps.DatetimeIndex):raiseTypeError("'first' only supports a DatetimeIndex")to_date=cast(int,cast(datetime.datetime,self.index.min())+cast(datetime.timedelta,to_offset(offset)),)returncast(DataFrame,self.loc[:to_date])
[docs]defpivot_table(self,values:Optional[Union[Name,List[Name]]]=None,index:Optional[List[Name]]=None,columns:Optional[Name]=None,aggfunc:Union[str,Dict[Name,str]]="mean",fill_value:Optional[Any]=None,)->"DataFrame":""" Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame. Parameters ---------- values : column to aggregate. They should be either a list less than three or a string. index : column (string) or list of columns If an array is passed, it must be the same length as the data. The list should contain string. columns : column Columns used in the pivot operation. Only one column is supported and it should be a string. aggfunc : function (string), dict, default mean If dict is passed, the key is column to aggregate and value is function or list of functions. fill_value : scalar, default None Value to replace missing values with. Returns ------- table : DataFrame Examples -------- >>> df = ps.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", ... "bar", "bar", "bar", "bar"], ... "B": ["one", "one", "one", "two", "two", ... "one", "one", "two", "two"], ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}, ... columns=['A', 'B', 'C', 'D', 'E']) >>> df A B C D E 0 foo one small 1 2 1 foo one large 2 4 2 foo one large 2 5 3 foo two small 3 5 4 foo two small 3 6 5 bar one large 4 6 6 bar one small 5 8 7 bar two small 6 9 8 bar two large 7 9 This first example aggregates values by taking the sum. >>> table = df.pivot_table(values='D', index=['A', 'B'], ... columns='C', aggfunc='sum') >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE C large small A B bar one 4.0 5 two 7.0 6 foo one 4.0 1 two NaN 6 We can also fill missing values using the `fill_value` parameter. >>> table = df.pivot_table(values='D', index=['A', 'B'], ... columns='C', aggfunc='sum', fill_value=0) >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE C large small A B bar one 4 5 two 7 6 foo one 4 1 two 0 6 We can also calculate multiple types of aggregations for any given value column. >>> table = df.pivot_table(values=['D'], index =['C'], ... columns="A", aggfunc={'D': 'mean'}) >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE D A bar foo C large 5.5 2.000000 small 5.5 2.333333 The next example aggregates on multiple values. >>> table = df.pivot_table(index=['C'], columns="A", values=['D', 'E'], ... 
aggfunc={'D': 'mean', 'E': 'sum'}) >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE D E A bar foo bar foo C large 5.5 2.000000 15 9 small 5.5 2.333333 17 13 """ifnotis_name_like_value(columns):raiseTypeError("columns should be one column name.")ifnotis_name_like_value(values)andnot(isinstance(values,list)andall(is_name_like_value(v)forvinvalues)):raiseTypeError("values should be one column or list of columns.")ifnotisinstance(aggfunc,str)and(notisinstance(aggfunc,dict)ornotall(is_name_like_value(key)andisinstance(value,str)forkey,valueinaggfunc.items())):raiseTypeError("aggfunc must be a dict mapping from column name ""to aggregate functions (string).")ifisinstance(aggfunc,dict)andindexisNone:raiseNotImplementedError("pivot_table doesn't support aggfunc"" as dict and without index.")ifisinstance(values,list)andindexisNone:raiseNotImplementedError("values can't be a list without index.")ifcolumnsnotinself.columns:raiseValueError("Wrong columns {}.".format(name_like_string(columns)))ifnotis_name_like_tuple(columns):columns=(columns,)ifisinstance(values,list):values=[colifis_name_like_tuple(col)else(col,)forcolinvalues]ifnotall(isinstance(self._internal.spark_type_for(col),NumericType)forcolinvalues):raiseTypeError("values should be a numeric type.")else:values=valuesifis_name_like_tuple(values)else(values,)ifnotisinstance(self._internal.spark_type_for(values),NumericType):raiseTypeError("values should be a numeric type.")ifisinstance(aggfunc,str):ifisinstance(values,list):agg_cols=[F.expr("{1}(`{0}`) as `{0}`".format(self._internal.spark_column_name_for(value),aggfunc))forvalueinvalues]else:agg_cols=[F.expr("{1}(`{0}`) as `{0}`".format(self._internal.spark_column_name_for(values),aggfunc))]elifisinstance(aggfunc,dict):aggfunc={keyifis_name_like_tuple(key)else(key,):valueforkey,valueinaggfunc.items()}agg_cols=[F.expr("{1}(`{0}`) as `{0}`".format(self._internal.spark_column_name_for(key),value))forkey,valueinaggfunc.items()]agg_columns=[keyforkey,_inaggfunc.items()]ifset(agg_columns)!=set(values):raiseValueError("Columns in aggfunc must be the same as values.")sdf=self._internal.resolved_copy.spark_frameifindexisNone:sdf=(sdf.groupBy().pivot(pivot_col=self._internal.spark_column_name_for(columns)).agg(*agg_cols))elifisinstance(index,list):index=[labelifis_name_like_tuple(label)else(label,)forlabelinindex]sdf=(sdf.groupBy([self._internal.spark_column_name_for(label)forlabelinindex]).pivot(pivot_col=self._internal.spark_column_name_for(columns)).agg(*agg_cols))else:raiseTypeError("index should be a None or a list of columns.")iffill_valueisnotNoneandisinstance(fill_value,(int,float)):sdf=sdf.fillna(fill_value)psdf:DataFrameifindexisnotNone:index_columns=[self._internal.spark_column_name_for(label)forlabelinindex]index_fields=[self._internal.field_for(label)forlabelinindex]ifisinstance(values,list):data_columns=[columnforcolumninsdf.columnsifcolumnnotinindex_columns]iflen(values)>1:# If we have two values, Spark will return column's name# in this format: column_values, where column contains# their values in the DataFrame and values is# the column list passed to the pivot_table().# E.g. 
if column is b and values is ['b','e'],# then ['2_b', '2_e', '3_b', '3_e'].# We sort the columns of Spark DataFrame by values.data_columns.sort(key=lambdax:x.split("_",1)[1])sdf=sdf.select(index_columns+data_columns)column_name_to_index=dict(zip(self._internal.data_spark_column_names,self._internal.column_labels))column_labels=[tuple(list(column_name_to_index[name.split("_")[1]])+[name.split("_")[0]])fornameindata_columns]column_label_names=([cast(Optional[Name],None)]*column_labels_level(values))+[columns]internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinindex_columns],index_names=index,index_fields=index_fields,column_labels=column_labels,data_spark_columns=[scol_for(sdf,col)forcolindata_columns],column_label_names=column_label_names,)psdf=DataFrame(internal)else:column_labels=[tuple(list(values[0])+[column])forcolumnindata_columns]column_label_names=([cast(Optional[Name],None)]*len(values[0]))+[columns]internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinindex_columns],index_names=index,index_fields=index_fields,column_labels=column_labels,data_spark_columns=[scol_for(sdf,col)forcolindata_columns],column_label_names=column_label_names,)psdf=DataFrame(internal)else:internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinindex_columns],index_names=index,index_fields=index_fields,column_label_names=[columns],)psdf=DataFrame(internal)else:index_values=valuesindex_map:Dict[str,Optional[Label]]={}fori,index_valueinenumerate(index_values):colname=SPARK_INDEX_NAME_FORMAT(i)sdf=sdf.withColumn(colname,F.lit(index_value))index_map[colname]=Noneinternal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinindex_map.keys()],index_names=list(index_map.values()),column_label_names=[columns],)psdf=DataFrame(internal)psdf_columns=psdf.columnsifisinstance(psdf_columns,pd.MultiIndex):psdf.columns=psdf_columns.set_levels(psdf_columns.levels[-1].astype(# type: ignore[index]spark_type_to_pandas_dtype(self._psser_for(columns).spark.data_type)),level=-1,)else:psdf.columns=psdf_columns.astype(spark_type_to_pandas_dtype(self._psser_for(columns).spark.data_type))returnpsdf
[docs]defpivot(self,index:Optional[Name]=None,columns:Optional[Name]=None,values:Optional[Name]=None,)->"DataFrame":""" Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses unique values from specified `index` / `columns` to form axes of the resulting DataFrame. This function does not support data aggregation. Parameters ---------- index : string, optional Column to use to make new frame's index. If None, uses existing index. columns : string Column to use to make new frame's columns. values : string, object or a list of the previous Column(s) to use for populating new frame's values. Returns ------- DataFrame Returns reshaped DataFrame. See Also -------- DataFrame.pivot_table : Generalization of pivot that can handle duplicate values for one index/column pair. Examples -------- >>> df = ps.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', ... 'two'], ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], ... 'baz': [1, 2, 3, 4, 5, 6], ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}, ... columns=['foo', 'bar', 'baz', 'zoo']) >>> df foo bar baz zoo 0 one A 1 x 1 one B 2 y 2 one C 3 z 3 two A 4 q 4 two B 5 w 5 two C 6 t >>> df.pivot(index='foo', columns='bar', values='baz').sort_index() ... # doctest: +NORMALIZE_WHITESPACE bar A B C foo one 1 2 3 two 4 5 6 >>> df.pivot(columns='bar', values='baz').sort_index() # doctest: +NORMALIZE_WHITESPACE bar A B C 0 1.0 NaN NaN 1 NaN 2.0 NaN 2 NaN NaN 3.0 3 4.0 NaN NaN 4 NaN 5.0 NaN 5 NaN NaN 6.0 Notice that, unlike pandas raises an ValueError when duplicated values are found. Pandas-on-Spark's pivot still works with its first value it meets during operation because pivot is an expensive operation, and it is preferred to permissively execute over failing fast when processing large data. >>> df = ps.DataFrame({"foo": ['one', 'one', 'two', 'two'], ... "bar": ['A', 'A', 'B', 'C'], ... "baz": [1, 2, 3, 4]}, columns=['foo', 'bar', 'baz']) >>> df foo bar baz 0 one A 1 1 one A 2 2 two B 3 3 two C 4 >>> df.pivot(index='foo', columns='bar', values='baz').sort_index() ... # doctest: +NORMALIZE_WHITESPACE bar A B C foo one 1.0 NaN NaN two NaN 3.0 4.0 It also supports multi-index and multi-index column. >>> df.columns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'baz')]) >>> df = df.set_index(('a', 'bar'), append=True) >>> df # doctest: +NORMALIZE_WHITESPACE a b foo baz (a, bar) 0 A one 1 1 A one 2 2 B two 3 3 C two 4 >>> df.pivot(columns=('a', 'foo'), values=('b', 'baz')).sort_index() ... # doctest: +NORMALIZE_WHITESPACE ('a', 'foo') one two (a, bar) 0 A 1.0 NaN 1 A 2.0 NaN 2 B NaN 3.0 3 C NaN 4.0 """ifcolumnsisNone:raiseValueError("columns should be set.")ifvaluesisNone:raiseValueError("values should be set.")should_use_existing_index=indexisnotNoneifshould_use_existing_index:df=selfindex_labels=[index]else:# The index after `reset_index()` will never be used, so use "distributed" index# as a dummy to avoid overhead.withoption_context("compute.default_index_type","distributed"):df=self.reset_index()index_labels=df._internal.column_labels[:self._internal.index_level]df=df.pivot_table(index=index_labels,columns=columns,values=values,aggfunc="first")ifshould_use_existing_index:returndfelse:internal=df._internal.copy(index_names=self._internal.index_names)returnDataFrame(internal)
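# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pyspark source): standalone
# snippet making the relationship explicit: pivot() is implemented on top of
# pivot_table() with aggfunc="first", so for data without duplicated
# index/column pairs the two calls below produce the same reshaped frame.
# Assumes pyspark is installed and a local Spark session can be created.
import pyspark.pandas as ps

psdf = ps.DataFrame(
    {"foo": ["one", "one", "two"], "bar": ["A", "B", "A"], "baz": [1, 2, 3]}
)
p1 = psdf.pivot(index="foo", columns="bar", values="baz")
p2 = psdf.pivot_table(index=["foo"], columns="bar", values="baz", aggfunc="first")
print(p1.sort_index().to_pandas())
print(p2.sort_index().to_pandas())
# ---------------------------------------------------------------------------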
@propertydefcolumns(self)->pd.Index:"""The column labels of the DataFrame."""names=[nameifnameisNoneorlen(name)>1elsename[0]fornameinself._internal.column_label_names]ifself._internal.column_labels_level>1:columns=pd.MultiIndex.from_tuples(self._internal.column_labels,names=names)else:columns=pd.Index([label[0]forlabelinself._internal.column_labels],name=names[0])returncolumns@columns.setterdefcolumns(self,columns:Union[pd.Index,List[Name]])->None:ifisinstance(columns,pd.MultiIndex):column_labels=columns.tolist()else:column_labels=[colifis_name_like_tuple(col,allow_none=False)else(col,)forcolincolumns]iflen(self._internal.column_labels)!=len(column_labels):raiseValueError("Length mismatch: Expected axis has {} elements, ""new values have {} elements".format(len(self._internal.column_labels),len(column_labels)))column_label_names:Optional[List]ifisinstance(columns,pd.Index):column_label_names=[nameifis_name_like_tuple(name)else(name,)fornameincolumns.names]else:column_label_names=Nonepssers=[self._psser_for(label).rename(name)forlabel,nameinzip(self._internal.column_labels,column_labels)]self._update_internal_frame(self._internal.with_new_columns(pssers,column_label_names=column_label_names))@propertydefdtypes(self)->pd.Series:"""Return the dtypes in the DataFrame. This returns a Series with the data type of each column. The result's index is the original DataFrame's columns. Columns with mixed types are stored with the object dtype. Returns ------- pd.Series The data type of each column. Examples -------- >>> df = ps.DataFrame({'a': list('abc'), ... 'b': list(range(1, 4)), ... 'c': np.arange(3, 6).astype('i1'), ... 'd': np.arange(4.0, 7.0, dtype='float64'), ... 'e': [True, False, True], ... 'f': pd.date_range('20130101', periods=3)}, ... columns=['a', 'b', 'c', 'd', 'e', 'f']) >>> df.dtypes a object b int64 c int8 d float64 e bool f datetime64[ns] dtype: object """returnpd.Series([self._psser_for(label).dtypeforlabelinself._internal.column_labels],index=pd.Index([labeliflen(label)>1elselabel[0]forlabelinself._internal.column_labels]),)
[docs]defselect_dtypes(self,include:Optional[Union[str,List[str]]]=None,exclude:Optional[Union[str,List[str]]]=None,)->"DataFrame":""" Return a subset of the DataFrame's columns based on the column dtypes. Parameters ---------- include, exclude : scalar or list-like A selection of dtypes or strings to be included/excluded. At least one of these parameters must be supplied. It also takes Spark SQL DDL type strings, for instance, 'string' and 'date'. Returns ------- DataFrame The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``. Raises ------ ValueError * If both of ``include`` and ``exclude`` are empty >>> df = ps.DataFrame({'a': [1, 2] * 3, ... 'b': [True, False] * 3, ... 'c': [1.0, 2.0] * 3}) >>> df.select_dtypes() Traceback (most recent call last): ... ValueError: at least one of include or exclude must be nonempty * If ``include`` and ``exclude`` have overlapping elements >>> df = ps.DataFrame({'a': [1, 2] * 3, ... 'b': [True, False] * 3, ... 'c': [1.0, 2.0] * 3}) >>> df.select_dtypes(include='a', exclude='a') Traceback (most recent call last): ... ValueError: include and exclude overlap on {'a'} Notes ----- * To select datetimes, use ``np.datetime64``, ``'datetime'`` or ``'datetime64'`` Examples -------- >>> df = ps.DataFrame({'a': [1, 2] * 3, ... 'b': [True, False] * 3, ... 'c': [1.0, 2.0] * 3, ... 'd': ['a', 'b'] * 3}, columns=['a', 'b', 'c', 'd']) >>> df a b c d 0 1 True 1.0 a 1 2 False 2.0 b 2 1 True 1.0 a 3 2 False 2.0 b 4 1 True 1.0 a 5 2 False 2.0 b >>> df.select_dtypes(include='bool') b 0 True 1 False 2 True 3 False 4 True 5 False >>> df.select_dtypes(include=['float64'], exclude=['int']) c 0 1.0 1 2.0 2 1.0 3 2.0 4 1.0 5 2.0 >>> df.select_dtypes(include=['int'], exclude=['float64']) a 0 1 1 2 2 1 3 2 4 1 5 2 >>> df.select_dtypes(exclude=['int']) b c d 0 True 1.0 a 1 False 2.0 b 2 True 1.0 a 3 False 2.0 b 4 True 1.0 a 5 False 2.0 b Spark SQL DDL type strings can be used as well. 
>>> df.select_dtypes(exclude=['string']) a b c 0 1 True 1.0 1 2 False 2.0 2 1 True 1.0 3 2 False 2.0 4 1 True 1.0 5 2 False 2.0 """frompyspark.sql.typesimport_parse_datatype_stringinclude_list:List[str]ifnotis_list_like(include):include_list=[cast(str,include)]ifincludeisnotNoneelse[]else:include_list=list(include)exclude_list:List[str]ifnotis_list_like(exclude):exclude_list=[cast(str,exclude)]ifexcludeisnotNoneelse[]else:exclude_list=list(exclude)ifnotany((include_list,exclude_list)):raiseValueError("at least one of include or exclude must be ""nonempty")# can't both include AND exclude!ifset(include_list).intersection(set(exclude_list)):raiseValueError("include and exclude overlap on {inc_ex}".format(inc_ex=set(include_list).intersection(set(exclude_list))))# Handle Spark typesinclude_spark_type=[]forincininclude_list:try:include_spark_type.append(_parse_datatype_string(inc))exceptBaseException:passexclude_spark_type=[]forexcinexclude_list:try:exclude_spark_type.append(_parse_datatype_string(exc))exceptBaseException:pass# Handle pandas typesinclude_numpy_type=[]forincininclude_list:try:include_numpy_type.append(infer_dtype_from_object(inc))exceptBaseException:passexclude_numpy_type=[]forexcinexclude_list:try:exclude_numpy_type.append(infer_dtype_from_object(exc))exceptBaseException:passcolumn_labels=[]forlabelinself._internal.column_labels:iflen(include_list)>0:should_include=(infer_dtype_from_object(self._psser_for(label).dtype.name)ininclude_numpy_typeorself._internal.spark_type_for(label)ininclude_spark_type)else:should_include=not(infer_dtype_from_object(self._psser_for(label).dtype.name)inexclude_numpy_typeorself._internal.spark_type_for(label)inexclude_spark_type)ifshould_include:column_labels.append(label)returnDataFrame(self._internal.with_new_columns([self._psser_for(label)forlabelincolumn_labels]))
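# Illustrative sketch: `select_dtypes` accepts both numpy/pandas dtype names and
# Spark SQL DDL type strings, matching the `_parse_datatype_string` /
# `infer_dtype_from_object` branches above. Hypothetical frame; assumes a running
# Spark session.
import pyspark.pandas as ps

dtype_demo = ps.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["x", "y"]})
numeric_only = dtype_demo.select_dtypes(include=["int64", "float64"])  # pandas-style names
no_strings = dtype_demo.select_dtypes(exclude=["string"])              # Spark DDL type string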
[docs]defdroplevel(self,level:Union[int,Name,List[Union[int,Name]]],axis:Axis=0)->"DataFrame":""" Return DataFrame with requested index / column level(s) removed. Parameters ---------- level: int, str, or list-like If a string is given, must be the name of a level If list-like, elements must be names or positional indexes of levels. axis: {0 or ‘index’, 1 or ‘columns’}, default 0 Returns ------- DataFrame with requested index / column level(s) removed. Examples -------- >>> df = ps.DataFrame( ... [[3, 4], [7, 8], [11, 12]], ... index=pd.MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"]), ... ) >>> df.columns = pd.MultiIndex.from_tuples([ ... ('c', 'e'), ('d', 'f') ... ], names=['level_1', 'level_2']) >>> df # doctest: +NORMALIZE_WHITESPACE level_1 c d level_2 e f a b 1 2 3 4 5 6 7 8 9 10 11 12 >>> df.droplevel('a') # doctest: +NORMALIZE_WHITESPACE level_1 c d level_2 e f b 2 3 4 6 7 8 10 11 12 >>> df.droplevel('level_2', axis=1) # doctest: +NORMALIZE_WHITESPACE level_1 c d a b 1 2 3 4 5 6 7 8 9 10 11 12 """axis=validate_axis(axis)ifaxis==0:ifnotisinstance(level,(tuple,list)):# huh?level=[level]names=self.index.namesnlevels=self._internal.index_levelint_level=set()forninlevel:ifisinstance(n,int):ifn<0:n=n+nlevelsifn<0:raiseIndexError("Too many levels: Index has only {} levels, ""{} is not a valid level number".format(nlevels,(n-nlevels)))ifn>=nlevels:raiseIndexError("Too many levels: Index has only {} levels, not {}".format(nlevels,(n+1)))else:ifnnotinnames:raiseKeyError("Level {} not found".format(n))n=names.index(n)int_level.add(n)iflen(level)>=nlevels:raiseValueError("Cannot remove {} levels from an index with {} levels: ""at least one level must be left.".format(len(level),nlevels))index_spark_columns,index_names,index_fields=zip(*[itemfori,iteminenumerate(zip(self._internal.index_spark_columns,self._internal.index_names,self._internal.index_fields,))ifinotinint_level])internal=self._internal.copy(index_spark_columns=list(index_spark_columns),index_names=list(index_names),index_fields=list(index_fields),)returnDataFrame(internal)else:psdf=self.copy()psdf.columns=psdf.columns.droplevel(level)# type: ignore[arg-type]returnpsdf
[docs]defdrop(self,labels:Optional[Union[Name,List[Name]]]=None,axis:Optional[Axis]=0,index:Union[Name,List[Name]]=None,columns:Union[Name,List[Name]]=None,)->"DataFrame":""" Drop specified labels from columns. Remove rows and/or columns by specifying label names and corresponding axis, or by specifying directly index and/or column names. Drop rows of a MultiIndex DataFrame is not supported yet. Parameters ---------- labels : single label or list-like Column labels to drop. axis : {0 or 'index', 1 or 'columns'}, default 0 .. versionchanged:: 3.3 Set dropping by index is default. index : single label or list-like Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=columns``). .. versionchanged:: 3.3 Added dropping rows by 'index'. columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). Returns ------- dropped : DataFrame See Also -------- Series.dropna Examples -------- >>> df = ps.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D']) >>> df A B C D 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 Drop columns >>> df.drop(['B', 'C'], axis=1) A D 0 0 3 1 4 7 2 8 11 >>> df.drop(columns=['B', 'C']) A D 0 0 3 1 4 7 2 8 11 Drop a row by index >>> df.drop([0, 1]) A B C D 2 8 9 10 11 >>> df.drop(index=[0, 1], columns='A') B C D 2 9 10 11 Also support dropping columns for MultiIndex >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]}, ... columns=['x', 'y', 'z', 'w']) >>> columns = [('a', 'x'), ('a', 'y'), ('b', 'z'), ('b', 'w')] >>> df.columns = pd.MultiIndex.from_tuples(columns) >>> df # doctest: +NORMALIZE_WHITESPACE a b x y z w 0 1 3 5 7 1 2 4 6 8 >>> df.drop(labels='a', axis=1) # doctest: +NORMALIZE_WHITESPACE b z w 0 5 7 1 6 8 Notes ----- Currently, dropping rows of a MultiIndex DataFrame is not supported yet. 
"""iflabelsisnotNone:ifindexisnotNoneorcolumnsisnotNone:raiseValueError("Cannot specify both 'labels' and 'index'/'columns'")axis=validate_axis(axis)ifaxis==1:returnself.drop(index=index,columns=labels)else:returnself.drop(index=labels,columns=columns)else:ifindexisNoneandcolumnsisNone:raiseValueError("Need to specify at least one of 'labels' or 'columns' or 'index'")internal=self._internalifindexisnotNone:ifis_name_like_tuple(index)oris_name_like_value(index):index=[index]iflen(index)>0:ifinternal.index_level==1:internal=internal.resolved_copyiflen(index)<=ps.get_option("compute.isin_limit"):self_index_type=self.index.spark.data_typecond=~internal.index_spark_columns[0].isin([F.lit(label).cast(self_index_type)forlabelinindex])internal=internal.with_filter(cond)else:index_sdf_col="__index"index_sdf=default_session().createDataFrame(pd.DataFrame({index_sdf_col:index}))joined_sdf=internal.spark_frame.join(other=F.broadcast(index_sdf),on=(internal.index_spark_columns[0]==scol_for(index_sdf,index_sdf_col)),how="anti",)internal=internal.with_new_sdf(joined_sdf)else:raiseNotImplementedError("Drop rows of MultiIndex DataFrame is not supported yet")ifcolumnsisnotNone:ifis_name_like_tuple(columns):columns=[columns]elifis_name_like_value(columns):columns=[(columns,)]else:columns=[colifis_name_like_tuple(col)else(col,)forcolincolumns]iflen(columns)>0:drop_column_labels=set(labelforlabelininternal.column_labelsforcolincolumnsiflabel[:len(col)]==col)iflen(drop_column_labels)==0:raiseKeyError(columns)keep_columns_and_labels=[(column,label)forcolumn,labelinzip(self._internal.data_spark_column_names,self._internal.column_labels)iflabelnotindrop_column_labels]cols,labels=(zip(*keep_columns_and_labels)iflen(keep_columns_and_labels)>0else([],[]))internal=internal.with_new_columns([self._psser_for(label)forlabelinlabels])returnDataFrame(internal)
def _prepare_sort_by_scols(self, by: Union[Name, List[Name]]) -> List[PySparkColumn]:
    if is_name_like_value(by):
        by = [by]
    else:
        assert is_list_like(by), type(by)
    new_by = []
    for colname in by:
        ser = self[colname]
        if not isinstance(ser, ps.Series):
            raise ValueError(
                "The column %s is not unique. For a multi-index, the label must be a tuple "
                "with elements corresponding to each level." % name_like_string(colname)
            )
        new_by.append(ser.spark.column)
    return new_by

def _sort(
    self,
    by: Sequence[PySparkColumn],
    ascending: Union[bool, List[bool]],
    na_position: str,
    keep: str = "first",
) -> "DataFrame":
    if isinstance(ascending, bool):
        ascending = [ascending] * len(by)
    if len(ascending) != len(by):
        raise ValueError(
            "Length of ascending ({}) != length of by ({})".format(len(ascending), len(by))
        )
    if na_position not in ("first", "last"):
        raise ValueError("invalid na_position: '{}'".format(na_position))

    Column = get_column_class()
    # Mapper: Get a Spark Column function for each (ascending, na_position) combination
    mapper = {
        (True, "first"): Column.asc_nulls_first,
        (True, "last"): Column.asc_nulls_last,
        (False, "first"): Column.desc_nulls_first,
        (False, "last"): Column.desc_nulls_last,
    }
    by = [mapper[(asc, na_position)](scol) for scol, asc in zip(by, ascending)]

    natural_order_scol = F.col(NATURAL_ORDER_COLUMN_NAME)
    if keep == "last":
        natural_order_scol = Column.desc(natural_order_scol)
    elif keep == "all":
        raise NotImplementedError("`keep`=all is not implemented yet.")
    elif keep != "first":
        raise ValueError('keep must be either "first", "last" or "all".')
    sdf = self._internal.resolved_copy.spark_frame.sort(*by, natural_order_scol)
    return DataFrame(self._internal.with_new_sdf(sdf))
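# Illustrative sketch: the `mapper` in `_sort` above picks a pyspark Column ordering
# helper per (ascending, na_position) pair. The expressions below spell out the same
# four orderings on a hypothetical column "x"; assumes an active Spark session.
from pyspark.sql import functions as F

orderings = {
    (True, "first"): F.col("x").asc_nulls_first(),
    (True, "last"): F.col("x").asc_nulls_last(),
    (False, "first"): F.col("x").desc_nulls_first(),
    (False, "last"): F.col("x").desc_nulls_last(),
}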
[docs]defsort_values(self,by:Union[Name,List[Name]],ascending:Union[bool,List[bool]]=True,inplace:bool=False,na_position:str="last",ignore_index:bool=False,)->Optional["DataFrame"]:""" Sort by the values along either axis. Parameters ---------- by : str or list of str ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. inplace : bool, default False if True, perform operation in-place na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- sorted_obj : DataFrame Examples -------- >>> df = ps.DataFrame({ ... 'col1': ['A', 'B', None, 'D', 'C'], ... 'col2': [2, 9, 8, 7, 4], ... 'col3': [0, 9, 4, 2, 3], ... }, ... columns=['col1', 'col2', 'col3'], ... index=['a', 'b', 'c', 'd', 'e']) >>> df col1 col2 col3 a A 2 0 b B 9 9 c None 8 4 d D 7 2 e C 4 3 Sort by col1 >>> df.sort_values(by=['col1']) col1 col2 col3 a A 2 0 b B 9 9 e C 4 3 d D 7 2 c None 8 4 Ignore index for the resulting axis >>> df.sort_values(by=['col1'], ignore_index=True) col1 col2 col3 0 A 2 0 1 B 9 9 2 C 4 3 3 D 7 2 4 None 8 4 Sort Descending >>> df.sort_values(by='col1', ascending=False) col1 col2 col3 d D 7 2 e C 4 3 b B 9 9 a A 2 0 c None 8 4 Sort by multiple columns >>> df = ps.DataFrame({ ... 'col1': ['A', 'A', 'B', None, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], ... }, ... columns=['col1', 'col2', 'col3']) >>> df.sort_values(by=['col1', 'col2']) col1 col2 col3 1 A 1 1 0 A 2 0 2 B 9 9 5 C 4 3 4 D 7 2 3 None 8 4 """inplace=validate_bool_kwarg(inplace,"inplace")new_by=self._prepare_sort_by_scols(by)psdf=self._sort(by=new_by,ascending=ascending,na_position=na_position)ifinplace:ifignore_index:psdf.reset_index(drop=True,inplace=inplace)self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf.reset_index(drop=True)ifignore_indexelsepsdf
[docs]defsort_index(self,axis:Axis=0,level:Optional[Union[int,List[int]]]=None,ascending:bool=True,inplace:bool=False,kind:str=None,na_position:str="last",ignore_index:bool=False,)->Optional["DataFrame"]:""" Sort object by labels (along an axis) Parameters ---------- axis : index, columns to direct sorting. Currently, only axis = 0 is supported. level : int or level name or list of ints or list of level names if not None, sort on values in specified index level(s) ascending : boolean, default True Sort ascending vs. descending inplace : bool, default False if True, perform operation in-place kind : str, default None pandas-on-Spark does not allow specifying the sorting algorithm now, default None na_position : {‘first’, ‘last’}, default ‘last’ first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for MultiIndex. ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. .. versionadded:: 3.4.0 Returns ------- sorted_obj : DataFrame Examples -------- >>> df = ps.DataFrame({'A': [2, 1, np.nan]}, index=['b', 'a', np.nan]) >>> df.sort_index() # doctest: +SKIP A a 1.0 b 2.0 None NaN >>> df.sort_index(ascending=False) # doctest: +SKIP A b 2.0 a 1.0 None NaN >>> df.sort_index(na_position='first') # doctest: +SKIP A None NaN a 1.0 b 2.0 >>> df.sort_index(ignore_index=True) A 0 1.0 1 2.0 2 NaN >>> df.sort_index(inplace=True) >>> df # doctest: +SKIP A a 1.0 b 2.0 None NaN >>> df = ps.DataFrame({'A': range(4), 'B': range(4)[::-1]}, ... index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], ... columns=['A', 'B']) >>> df.sort_index() A B a 0 3 0 1 2 1 b 0 1 2 1 0 3 >>> df.sort_index(level=1) A B b 0 1 2 a 0 3 0 b 1 0 3 a 1 2 1 >>> df.sort_index(level=[1, 0]) A B a 0 3 0 b 0 1 2 a 1 2 1 b 1 0 3 >>> df.sort_index(ignore_index=True) A B 0 3 0 1 2 1 2 1 2 3 0 3 """inplace=validate_bool_kwarg(inplace,"inplace")axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError("No other axis than 0 are supported now")ifkindisnotNone:raiseNotImplementedError("Specifying the sorting algorithm is not supported now.")iflevelisNoneor(is_list_like(level)andlen(level)==0):# type: ignore[arg-type]by=self._internal.index_spark_columnselifis_list_like(level):by=[self._internal.index_spark_columns[lvl]forlvlinlevel# type: ignore[union-attr]]else:by=[self._internal.index_spark_columns[level]]# type: ignore[index]psdf=self._sort(by=by,ascending=ascending,na_position=na_position)ifinplace:ifignore_index:psdf.reset_index(drop=True,inplace=inplace)self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf.reset_index(drop=True)ifignore_indexelsepsdf
[docs]defswaplevel(self,i:Union[int,Name]=-2,j:Union[int,Name]=-1,axis:Axis=0)->"DataFrame":""" Swap levels i and j in a MultiIndex on a particular axis. Parameters ---------- i, j : int or str Levels of the indices to be swapped. Can pass level name as string. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to swap levels on. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. Returns ------- DataFrame DataFrame with levels swapped in MultiIndex. Examples -------- >>> midx = pd.MultiIndex.from_arrays( ... [['red', 'blue'], [1, 2], ['s', 'm']], names = ['color', 'number', 'size']) >>> midx # doctest: +SKIP MultiIndex([( 'red', 1, 's'), ('blue', 2, 'm')], names=['color', 'number', 'size']) Swap levels in a MultiIndex on index. >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]}, index=midx) >>> psdf # doctest: +NORMALIZE_WHITESPACE x y color number size red 1 s 5 5 blue 2 m 6 6 >>> psdf.swaplevel() # doctest: +NORMALIZE_WHITESPACE x y color size number red s 1 5 5 blue m 2 6 6 >>> psdf.swaplevel(0, 1) # doctest: +NORMALIZE_WHITESPACE x y number color size 1 red s 5 5 2 blue m 6 6 >>> psdf.swaplevel('number', 'size') # doctest: +NORMALIZE_WHITESPACE x y color size number red s 1 5 5 blue m 2 6 6 Swap levels in a MultiIndex on columns. >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]}) >>> psdf.columns = midx >>> psdf color red blue number 1 2 size s m 0 5 5 1 6 6 >>> psdf.swaplevel(axis=1) color red blue size s m number 1 2 0 5 5 1 6 6 >>> psdf.swaplevel(axis=1) color red blue size s m number 1 2 0 5 5 1 6 6 >>> psdf.swaplevel(0, 1, axis=1) number 1 2 color red blue size s m 0 5 5 1 6 6 >>> psdf.swaplevel('number', 'color', axis=1) number 1 2 color red blue size s m 0 5 5 1 6 6 """axis=validate_axis(axis)ifaxis==0:internal=self._swaplevel_index(i,j)else:assertaxis==1internal=self._swaplevel_columns(i,j)returnDataFrame(internal)
[docs]defswapaxes(self,i:Axis,j:Axis,copy:bool=True)->"DataFrame":""" Interchange axes and swap values axes appropriately. .. note:: This method is based on an expensive operation due to the nature of big data. Internally it needs to generate each row for each value, and then group twice - it is a huge operation. To prevent misuse, this method has the 'compute.max_rows' default limit of input length and raises a ValueError. >>> from pyspark.pandas.config import option_context >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE ... ps.DataFrame({'a': range(1001)}).swapaxes(i=0, j=1) Traceback (most recent call last): ... ValueError: Current DataFrame's length exceeds the given limit of 1000 rows. Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' to retrieve more than 1000 rows. Note that, before changing the 'compute.max_rows', this operation is considerably expensive. Parameters ---------- i: {0 or 'index', 1 or 'columns'}. The axis to swap. j: {0 or 'index', 1 or 'columns'}. The axis to swap. copy : bool, default True. Returns ------- DataFrame Examples -------- >>> psdf = ps.DataFrame( ... [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['x', 'y', 'z'], columns=['a', 'b', 'c'] ... ) >>> psdf a b c x 1 2 3 y 4 5 6 z 7 8 9 >>> psdf.swapaxes(i=1, j=0) x y z a 1 4 7 b 2 5 8 c 3 6 9 >>> psdf.swapaxes(i=1, j=1) a b c x 1 2 3 y 4 5 6 z 7 8 9 """assertcopyisTruei=validate_axis(i)j=validate_axis(j)returnself.copy()ifi==jelseself.transpose()
def _swaplevel_columns(self, i: Union[int, Name], j: Union[int, Name]) -> InternalFrame:
    assert isinstance(self.columns, pd.MultiIndex)
    for index in (i, j):
        if not isinstance(index, int) and index not in self.columns.names:
            raise KeyError("Level %s not found" % index)

    i = i if isinstance(i, int) else self.columns.names.index(i)
    j = j if isinstance(j, int) else self.columns.names.index(j)
    for index in (i, j):
        if index >= len(self.columns) or index < -len(self.columns):
            raise IndexError(
                "Too many levels: Columns have only %s levels, "
                "%s is not a valid level number" % (self._internal.index_level, index)
            )

    column_label_names = self._internal.column_label_names.copy()
    (
        column_label_names[i],
        column_label_names[j],
    ) = (
        column_label_names[j],
        column_label_names[i],
    )
    column_labels = self._internal._column_labels
    column_label_list = [list(label) for label in column_labels]
    for label_list in column_label_list:
        label_list[i], label_list[j] = label_list[j], label_list[i]
    column_labels = [tuple(x) for x in column_label_list]
    internal = self._internal.copy(
        column_label_names=list(column_label_names), column_labels=list(column_labels)
    )
    return internal

def _swaplevel_index(self, i: Union[int, Name], j: Union[int, Name]) -> InternalFrame:
    assert isinstance(self.index, ps.MultiIndex)
    for index in (i, j):
        if not isinstance(index, int) and index not in self.index.names:
            raise KeyError("Level %s not found" % index)

    i = i if isinstance(i, int) else self.index.names.index(i)
    j = j if isinstance(j, int) else self.index.names.index(j)
    for index in (i, j):
        if index >= self._internal.index_level or index < -self._internal.index_level:
            raise IndexError(
                "Too many levels: Index has only %s levels, "
                "%s is not a valid level number" % (self._internal.index_level, index)
            )

    index_map = list(
        zip(
            self._internal.index_spark_columns,
            self._internal.index_names,
            self._internal.index_fields,
        )
    )
    index_map[i], index_map[j] = index_map[j], index_map[i]
    index_spark_columns, index_names, index_fields = zip(*index_map)
    internal = self._internal.copy(
        index_spark_columns=list(index_spark_columns),
        index_names=list(index_names),
        index_fields=list(index_fields),
    )
    return internal
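# Illustrative sketch: both helpers above swap positions i and j inside every label
# tuple (or inside the index map). A plain-Python version of that per-label swap,
# using hypothetical labels:
swap_labels = [("a", "x", 1), ("b", "y", 2)]
swap_i, swap_j = 0, 2
swapped_labels = []
for _label in swap_labels:
    _as_list = list(_label)
    _as_list[swap_i], _as_list[swap_j] = _as_list[swap_j], _as_list[swap_i]
    swapped_labels.append(tuple(_as_list))
# swapped_labels == [(1, "x", "a"), (2, "y", "b")]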
[docs]defnlargest(self,n:int,columns:Union[Name,List[Name]],keep:str="first")->"DataFrame":""" Return the first `n` rows ordered by `columns` in descending order. Return the first `n` rows with the largest values in `columns`, in descending order. The columns that are not specified are returned as well, but not used for ordering. This method is equivalent to ``df.sort_values(columns, ascending=False).head(n)``, but more performant in pandas. In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer, the two would have same performance. Parameters ---------- n : int Number of rows to return. columns : label or list of labels Column label(s) to order by. keep : {'first', 'last'}, default 'first'. 'all' is not implemented yet. Determines which duplicates (if any) to keep. - ``first`` : Keep the first occurrence. - ``last`` : Keep the last occurrence. Returns ------- DataFrame The first `n` rows ordered by the given columns in descending order. See Also -------- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in ascending order. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. Notes ----- This function cannot be used with all column types. For example, when specifying columns with `object` or `category` dtypes, ``TypeError`` is raised. Examples -------- >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan], ... 'Y': [6, 7, 8, 9, 10, 11, 12]}) >>> df X Y 0 1.0 6 1 2.0 7 2 3.0 8 3 5.0 9 4 6.0 10 5 7.0 11 6 NaN 12 In the following example, we will use ``nlargest`` to select the three rows having the largest values in column "X". >>> df.nlargest(n=3, columns='X') X Y 5 7.0 11 4 6.0 10 3 5.0 9 To order by the largest values in column "Y" and then "X", we can specify multiple columns like in the next example. >>> df.nlargest(n=3, columns=['Y', 'X']) X Y 6 NaN 12 5 7.0 11 4 6.0 10 The examples below show how ties are resolved, which is decided by `keep`. >>> tied_df = ps.DataFrame({'X': [1, 2, 2, 3, 3]}, index=['a', 'b', 'c', 'd', 'e']) >>> tied_df X a 1 b 2 c 2 d 3 e 3 When using keep='first' (default), ties are resolved in order: >>> tied_df.nlargest(3, 'X') X d 3 e 3 b 2 >>> tied_df.nlargest(3, 'X', keep='first') X d 3 e 3 b 2 When using keep='last', ties are resolved in reverse order: >>> tied_df.nlargest(3, 'X', keep='last') X e 3 d 3 c 2 """by_scols=self._prepare_sort_by_scols(columns)returnself._sort(by=by_scols,ascending=False,na_position="last",keep=keep).head(n=n)
[docs]defnsmallest(self,n:int,columns:Union[Name,List[Name]],keep:str="first")->"DataFrame":""" Return the first `n` rows ordered by `columns` in ascending order. Return the first `n` rows with the smallest values in `columns`, in ascending order. The columns that are not specified are returned as well, but not used for ordering. This method is equivalent to ``df.sort_values(columns, ascending=True).head(n)``, but more performant. In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer, the two would have same performance. Parameters ---------- n : int Number of items to retrieve. columns : list or str Column name or names to order by. keep : {'first', 'last'}, default 'first'. 'all' is not implemented yet. Determines which duplicates (if any) to keep. - ``first`` : Keep the first occurrence. - ``last`` : Keep the last occurrence. Returns ------- DataFrame See Also -------- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in descending order. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. Examples -------- >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan], ... 'Y': [6, 7, 8, 9, 10, 11, 12]}) >>> df X Y 0 1.0 6 1 2.0 7 2 3.0 8 3 5.0 9 4 6.0 10 5 7.0 11 6 NaN 12 In the following example, we will use ``nsmallest`` to select the three rows having the smallest values in column "X". >>> df.nsmallest(n=3, columns='X') # doctest: +NORMALIZE_WHITESPACE X Y 0 1.0 6 1 2.0 7 2 3.0 8 To order by the smallest values in column "Y" and then "X", we can specify multiple columns like in the next example. >>> df.nsmallest(n=3, columns=['Y', 'X']) # doctest: +NORMALIZE_WHITESPACE X Y 0 1.0 6 1 2.0 7 2 3.0 8 The examples below show how ties are resolved, which is decided by `keep`. >>> tied_df = ps.DataFrame({'X': [1, 1, 2, 2, 3]}, index=['a', 'b', 'c', 'd', 'e']) >>> tied_df X a 1 b 1 c 2 d 2 e 3 When using keep='first' (default), ties are resolved in order: >>> tied_df.nsmallest(3, 'X') X a 1 b 1 c 2 >>> tied_df.nsmallest(3, 'X', keep='first') X a 1 b 1 c 2 When using keep='last', ties are resolved in reverse order: >>> tied_df.nsmallest(3, 'X', keep='last') X b 1 a 1 d 2 """by_scols=self._prepare_sort_by_scols(columns)returnself._sort(by=by_scols,ascending=True,na_position="last",keep=keep).head(n=n)
[docs]defisin(self,values:Union[List,Dict])->"DataFrame":""" Whether each element in the DataFrame is contained in values. Parameters ---------- values : iterable or dict The sequence of values to test. If values are a dict, the keys must be the column names, which must match. Series and DataFrame are not supported. Returns ------- DataFrame DataFrame of booleans showing whether each element in the DataFrame is contained in values. Examples -------- >>> df = ps.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog'], ... columns=['num_legs', 'num_wings']) >>> df num_legs num_wings falcon 2 2 dog 4 0 When ``values`` is a list check whether every value in the DataFrame is present in the list (which animals have 0 or 2 legs or wings) >>> df.isin([0, 2]) num_legs num_wings falcon True True dog False True When ``values`` is a dict, we can pass values to check for each column separately: >>> df.isin({'num_wings': [0, 3]}) num_legs num_wings falcon False False dog False True """ifisinstance(values,(pd.DataFrame,pd.Series)):raiseNotImplementedError("DataFrame and Series are not supported")ifisinstance(values,dict)andnotset(values.keys()).issubset(self.columns):raiseAttributeError("'DataFrame' object has no attribute %s"%(set(values.keys()).difference(self.columns)))data_spark_columns=[]ifisinstance(values,dict):fori,colinenumerate(self.columns):ifcolinvalues:item=values[col]item=item.tolist()ifisinstance(item,np.ndarray)elselist(item)scol=self._internal.spark_column_for(self._internal.column_labels[i]).isin([F.lit(v)forvinitem])scol=F.coalesce(scol,F.lit(False))else:scol=F.lit(False)data_spark_columns.append(scol.alias(self._internal.data_spark_column_names[i]))elifis_list_like(values):values=(cast(np.ndarray,values).tolist()ifisinstance(values,np.ndarray)elselist(values))forlabelinself._internal.column_labels:scol=self._internal.spark_column_for(label).isin([F.lit(v)forvinvalues])scol=F.coalesce(scol,F.lit(False))data_spark_columns.append(scol.alias(self._internal.spark_column_name_for(label)))else:raiseTypeError("Values should be iterable, Series, DataFrame or dict.")returnDataFrame(self._internal.with_new_columns(data_spark_columns,data_fields=[field.copy(dtype=np.dtype("bool"),spark_type=BooleanType(),nullable=False)forfieldinself._internal.data_fields],))
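# Illustrative sketch: the `F.coalesce(..., F.lit(False))` wrapping above means a
# null cell never produces a null boolean -- it is reported as False. Hypothetical
# frame; assumes a running Spark session.
import pyspark.pandas as ps

isin_demo = ps.DataFrame({"x": [1.0, None, 3.0]})
isin_mask = isin_demo.isin([1.0, 3.0])  # the None row comes back as False, not NaN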
[docs]defmerge(self,right:"DataFrame",how:str="inner",on:Optional[Union[Name,List[Name]]]=None,left_on:Optional[Union[Name,List[Name]]]=None,right_on:Optional[Union[Name,List[Name]]]=None,left_index:bool=False,right_index:bool=False,suffixes:Tuple[str,str]=("_x","_y"),)->"DataFrame":""" Merge DataFrame objects with a database-style join. The index of the resulting DataFrame will be one of the following: - 0...n if no index is used for merging - Index of the left DataFrame if merged only on the index of the right DataFrame - Index of the right DataFrame if merged only on the index of the left DataFrame - All involved indices if merged using the indices of both DataFrames e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will be an index (x, a, b) Parameters ---------- right: Object to merge with. how: Type of merge to be performed. {'left', 'right', 'outer', 'inner'}, default 'inner' left: use only keys from left frame, like a SQL left outer join; not preserve key order unlike pandas. right: use only keys from right frame, like a SQL right outer join; not preserve key order unlike pandas. outer: use union of keys from both frames, like a SQL full outer join; sort keys lexicographically. inner: use intersection of keys from both frames, like a SQL inner join; not preserve the order of the left keys unlike pandas. on: Column or index level names to join on. These must be found in both DataFrames. If on is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. left_on: Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns. right_on: Column or index level names to join on in the right DataFrame. Can also be an array or list of arrays of the length of the right DataFrame. These arrays are treated as if they are columns. left_index: Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index or a number of columns) must match the number of levels. right_index: Use the index from the right DataFrame as the join key. Same caveats as left_index. suffixes: Suffix to apply to overlapping column names in the left and right side, respectively. Returns ------- DataFrame A DataFrame of the two merged objects. See Also -------- DataFrame.join : Join columns of another DataFrame. DataFrame.update : Modify in place using non-NA values from another DataFrame. DataFrame.hint : Specifies some hint on the current DataFrame. broadcast : Marks a DataFrame as small enough for use in broadcast joins. Examples -------- >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], ... 'value': [1, 2, 3, 5]}, ... columns=['lkey', 'value']) >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], ... 'value': [5, 6, 7, 8]}, ... columns=['rkey', 'value']) >>> df1 lkey value 0 foo 1 1 bar 2 2 baz 3 3 foo 5 >>> df2 rkey value 0 foo 5 1 bar 6 2 baz 7 3 foo 8 Merge df1 and df2 on the lkey and rkey columns. The value columns have the default suffixes, _x and _y, appended. 
>>> merged = df1.merge(df2, left_on='lkey', right_on='rkey') >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y']) # doctest: +ELLIPSIS lkey value_x rkey value_y ...bar 2 bar 6 ...baz 3 baz 7 ...foo 1 foo 5 ...foo 1 foo 8 ...foo 5 foo 5 ...foo 5 foo 8 >>> left_psdf = ps.DataFrame({'A': [1, 2]}) >>> right_psdf = ps.DataFrame({'B': ['x', 'y']}, index=[1, 2]) >>> left_psdf.merge(right_psdf, left_index=True, right_index=True).sort_index() A B 1 2 x >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='left').sort_index() A B 0 1 None 1 2 x >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='right').sort_index() A B 1 2.0 x 2 NaN y >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='outer').sort_index() A B 0 1.0 None 1 2.0 x 2 NaN y Notes ----- As described in #263, joining string columns currently returns None for missing values instead of NaN. """defto_list(os:Optional[Union[Name,List[Name]]])->List[Label]:ifosisNone:return[]elifis_name_like_tuple(os):return[cast(Label,os)]elifis_name_like_value(os):return[(os,)]else:return[oifis_name_like_tuple(o)else(o,)foroinos]ifisinstance(right,ps.Series):right=right.to_frame()ifon:ifleft_onorright_on:raiseValueError('Can only pass argument "on" OR "left_on" and "right_on", '"not a combination of both.")left_key_names=list(map(self._internal.spark_column_name_for,to_list(on)))right_key_names=list(map(right._internal.spark_column_name_for,to_list(on)))else:# TODO: need special handling for multi-index.ifleft_index:left_key_names=self._internal.index_spark_column_nameselse:left_key_names=list(map(self._internal.spark_column_name_for,to_list(left_on)))ifright_index:right_key_names=right._internal.index_spark_column_nameselse:right_key_names=list(map(right._internal.spark_column_name_for,to_list(right_on)))ifleft_key_namesandnotright_key_names:raiseValueError("Must pass right_on or right_index=True")ifright_key_namesandnotleft_key_names:raiseValueError("Must pass left_on or left_index=True")ifnotleft_key_namesandnotright_key_names:common=list(self.columns.intersection(right.columns))iflen(common)==0:raiseValueError("No common columns to perform merge on. 
Merge options: ""left_on=None, right_on=None, left_index=False, right_index=False")left_key_names=list(map(self._internal.spark_column_name_for,to_list(common)))right_key_names=list(map(right._internal.spark_column_name_for,to_list(common)))iflen(left_key_names)!=len(right_key_names):raiseValueError("len(left_keys) must equal len(right_keys)")# We should distinguish the name to avoid ambiguous column name after merging.right_prefix="__right_"right_key_names=[right_prefix+right_key_nameforright_key_nameinright_key_names]how=validate_how(how)defresolve(internal:InternalFrame,side:str)->InternalFrame:defrename(col:str)->str:return"__{}_{}".format(side,col)internal=internal.resolved_copysdf=internal.spark_framesdf=sdf.select(*[scol_for(sdf,col).alias(rename(col))forcolinsdf.columnsifcolnotinHIDDEN_COLUMNS],*HIDDEN_COLUMNS,)returninternal.copy(spark_frame=sdf,index_spark_columns=[scol_for(sdf,rename(col))forcolininternal.index_spark_column_names],index_fields=[field.copy(name=rename(field.name))forfieldininternal.index_fields],data_spark_columns=[scol_for(sdf,rename(col))forcolininternal.data_spark_column_names],data_fields=[field.copy(name=rename(field.name))forfieldininternal.data_fields],)left_internal=self._internal.resolved_copyright_internal=resolve(right._internal,"right")left_table=left_internal.spark_frame.alias("left_table")right_table=right_internal.spark_frame.alias("right_table")left_key_columns=[scol_for(left_table,label)forlabelinleft_key_names]right_key_columns=[scol_for(right_table,label)forlabelinright_key_names]join_condition=reduce(lambdax,y:x&y,[lkey==rkeyforlkey,rkeyinzip(left_key_columns,right_key_columns)],)joined_table=left_table.join(right_table,join_condition,how=how)# Unpack suffixes tuple for convenienceleft_suffix=suffixes[0]right_suffix=suffixes[1]# Append suffixes to columns with the same name to avoid conflicts laterduplicate_columns=set(left_internal.column_labels)&set(right_internal.column_labels)exprs=[]data_columns=[]column_labels=[]defleft_scol_for(label:Label)->PySparkColumn:returnscol_for(left_table,left_internal.spark_column_name_for(label))defright_scol_for(label:Label)->PySparkColumn:returnscol_for(right_table,right_internal.spark_column_name_for(label))forlabelinleft_internal.column_labels:col=left_internal.spark_column_name_for(label)scol=left_scol_for(label)iflabelinduplicate_columns:spark_column_name=left_internal.spark_column_name_for(label)if(spark_column_nameinleft_key_namesand(right_prefix+spark_column_name)inright_key_names):right_scol=right_scol_for(label)ifhow=="right":scol=right_scol.alias(col)elifhow=="full":scol=F.when(scol.isNotNull(),scol).otherwise(right_scol).alias(col)else:passelse:col=col+left_suffixscol=scol.alias(col)label=tuple([str(label[0])+left_suffix]+list(label[1:]))exprs.append(scol)data_columns.append(col)column_labels.append(label)forlabelinright_internal.column_labels:# recover `right_prefix` here.col=right_internal.spark_column_name_for(label)[len(right_prefix):]scol=right_scol_for(label).alias(col)iflabelinduplicate_columns:spark_column_name=left_internal.spark_column_name_for(label)if(spark_column_nameinleft_key_namesand(right_prefix+spark_column_name)inright_key_names):continueelse:col=col+right_suffixscol=scol.alias(col)label=tuple([str(label[0])+right_suffix]+list(label[1:]))exprs.append(scol)data_columns.append(col)column_labels.append(label)left_index_scols=left_internal.index_spark_columnsright_index_scols=right_internal.index_spark_columns# Retain indices if they are used for 
joiningifleft_index:ifright_index:ifhowin("inner","left"):exprs.extend(left_index_scols)index_spark_column_names=left_internal.index_spark_column_namesindex_names=left_internal.index_nameselifhow=="right":exprs.extend(right_index_scols)index_spark_column_names=right_internal.index_spark_column_namesindex_names=right_internal.index_nameselse:index_spark_column_names=left_internal.index_spark_column_namesindex_names=left_internal.index_namesforcol,left_scol,right_scolinzip(index_spark_column_names,left_index_scols,right_index_scols):scol=F.when(left_scol.isNotNull(),left_scol).otherwise(right_scol)exprs.append(scol.alias(col))else:exprs.extend(right_index_scols)index_spark_column_names=right_internal.index_spark_column_namesindex_names=right_internal.index_nameselifright_index:exprs.extend(left_index_scols)index_spark_column_names=left_internal.index_spark_column_namesindex_names=left_internal.index_nameselse:index_spark_column_names=[]index_names=[]selected_columns=joined_table.select(*exprs)internal=InternalFrame(spark_frame=selected_columns,index_spark_columns=[scol_for(selected_columns,col)forcolinindex_spark_column_names],index_names=index_names,column_labels=column_labels,data_spark_columns=[scol_for(selected_columns,col)forcolindata_columns],)returnDataFrame(internal)
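# Illustrative sketch: non-key columns present on both sides are disambiguated with
# the `suffixes` pair, as in the `duplicate_columns` handling above. Hypothetical
# frames; assumes a running Spark session.
import pyspark.pandas as ps

merge_left = ps.DataFrame({"key": ["a", "b"], "value": [1, 2]})
merge_right = ps.DataFrame({"key": ["a", "b"], "value": [3, 4]})
merged = merge_left.merge(merge_right, on="key", suffixes=("_left", "_right"))
# Resulting columns: key, value_left, value_right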
[docs]defjoin(self,right:"DataFrame",on:Optional[Union[Name,List[Name]]]=None,how:str="left",lsuffix:str="",rsuffix:str="",)->"DataFrame":""" Join columns of another DataFrame. Join columns with `right` DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list. Parameters ---------- right: DataFrame, Series on: str, list of str, or array-like, optional Column or index level name(s) in the caller to join on the index in `right`, otherwise joins index-on-index. If multiple values given, the `right` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. how: {'left', 'right', 'outer', 'inner'}, default 'left' How to handle the operation of the two objects. * left: use `left` frame’s index (or column if on is specified). * right: use `right`’s index. * outer: form union of `left` frame’s index (or column if on is specified) with right’s index, and sort it. lexicographically. * inner: form intersection of `left` frame’s index (or column if on is specified) with `right`’s index, preserving the order of the `left`’s one. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' Suffix to use from `right` frame's overlapping columns. Returns ------- DataFrame A dataframe containing columns from both the `left` and `right`. See Also -------- DataFrame.merge: For column(s)-on-columns(s) operations. DataFrame.update : Modify in place using non-NA values from another DataFrame. DataFrame.hint : Specifies some hint on the current DataFrame. broadcast : Marks a DataFrame as small enough for use in broadcast joins. Notes ----- Parameters on, lsuffix, and rsuffix are not supported when passing a list of DataFrame objects. Examples -------- >>> psdf1 = ps.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], ... 'A': ['A0', 'A1', 'A2', 'A3']}, ... columns=['key', 'A']) >>> psdf2 = ps.DataFrame({'key': ['K0', 'K1', 'K2'], ... 'B': ['B0', 'B1', 'B2']}, ... columns=['key', 'B']) >>> psdf1 key A 0 K0 A0 1 K1 A1 2 K2 A2 3 K3 A3 >>> psdf2 key B 0 K0 B0 1 K1 B1 2 K2 B2 Join DataFrames using their indexes. >>> join_psdf = psdf1.join(psdf2, lsuffix='_left', rsuffix='_right') >>> join_psdf.sort_values(by=join_psdf.columns) key_left A key_right B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 2 K2 A2 K2 B2 3 K3 A3 None None If we want to join using the key columns, we need to set key to be the index in both df and right. The joined DataFrame will have key as its index. >>> join_psdf = psdf1.set_index('key').join(psdf2.set_index('key')) >>> join_psdf.sort_values(by=join_psdf.columns) # doctest: +NORMALIZE_WHITESPACE A B key K0 A0 B0 K1 A1 B1 K2 A2 B2 K3 A3 None Another option to join using the key columns is to use the on parameter. DataFrame.join always uses right’s index but we can use any column in df. This method does not preserve the original DataFrame’s index in the result unlike pandas. 
    >>> join_psdf = psdf1.join(psdf2.set_index('key'), on='key')
    >>> join_psdf.index
    Index([0, 1, 2, 3], dtype='int64')
    """
    if isinstance(right, ps.Series):
        common = list(self.columns.intersection([right.name]))
    else:
        common = list(self.columns.intersection(right.columns))
    if len(common) > 0 and not lsuffix and not rsuffix:
        raise ValueError(
            "columns overlap but no suffix specified: "
            "{rename}".format(rename=common)
        )

    need_set_index = False
    if on:
        if not is_list_like(on):
            on = [on]
        if len(on) != right._internal.index_level:
            raise ValueError(
                'len(left_on) must equal the number of levels in the index of "right"'
            )

        need_set_index = len(set(on) & set(self.index.names)) == 0
    if need_set_index:
        self = self.set_index(on)
    join_psdf = self.merge(
        right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix)
    )
    return join_psdf.reset_index() if need_set_index else join_psdf
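# Illustrative sketch: when `on` is given, `join` moves those columns into the index,
# merges on the indexes, and then calls `reset_index()`, which is why the original
# index is not preserved (unlike pandas). Hypothetical frames; assumes a running
# Spark session.
import pyspark.pandas as ps

join_left = ps.DataFrame({"key": ["K0", "K1"], "A": ["A0", "A1"]})
join_right = ps.DataFrame({"B": ["B0", "B1"]}, index=["K0", "K1"])
joined = join_left.join(join_right, on="key")
# Roughly equivalent to:
also_joined = join_left.set_index("key").join(join_right).reset_index()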
[docs]defcombine_first(self,other:"DataFrame")->"DataFrame":""" Update null elements with value in the same location in `other`. Combine two DataFrame objects by filling null values in one DataFrame with non-null values from other DataFrame. The row and column indexes of the resulting DataFrame will be the union of the two. .. versionadded:: 3.3.0 Parameters ---------- other : DataFrame Provided DataFrame to use to fill null values. Returns ------- DataFrame Examples -------- >>> ps.set_option("compute.ops_on_diff_frames", True) >>> df1 = ps.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = ps.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2).sort_index() A B 0 1.0 3.0 1 0.0 4.0 Null values persist if the location of that null value does not exist in other >>> df1 = ps.DataFrame({'A': [None, 0], 'B': [4, None]}) >>> df2 = ps.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2).sort_index() A B C 0 NaN 4.0 NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0 >>> ps.reset_option("compute.ops_on_diff_frames") """ifnotisinstance(other,DataFrame):raiseTypeError("`combine_first` only allows `DataFrame` for parameter `other`")ifsame_anchor(self,other):combined=selfthis=selfthat=otherelse:combined=combine_frames(self,other)this=combined["this"]that=combined["that"]intersect_column_labels=set(self._internal.column_labels).intersection(set(other._internal.column_labels))column_labels,data_spark_columns=[],[]forcolumn_labelinthis._internal.column_labels:this_scol=this._internal.spark_column_for(column_label)ifcolumn_labelinintersect_column_labels:that_scol=that._internal.spark_column_for(column_label)this_scol_name=this._internal.spark_column_name_for(column_label)combined_scol=(F.when(this_scol.isNull(),that_scol).otherwise(this_scol).alias(this_scol_name))data_spark_columns.append(combined_scol)else:data_spark_columns.append(this_scol)column_labels.append(column_label)forcolumn_labelinthat._internal.column_labels:ifcolumn_labelnotinintersect_column_labels:that_scol=that._internal.spark_column_for(column_label)data_spark_columns.append(that_scol)column_labels.append(column_label)internal=combined._internal.copy(column_labels=column_labels,data_spark_columns=data_spark_columns,data_fields=None,# TODO: dtype?column_label_names=self._internal.column_label_names,)returnDataFrame(internal)
# TODO(SPARK-46163): add 'filter_func' and 'errors' parameter
[docs]defupdate(self,other:"DataFrame",join:str="left",overwrite:bool=True)->None:""" Modify in place using non-NA values from another DataFrame. Aligns on indices. There is no return value. Parameters ---------- other : DataFrame, or Series join : 'left', default 'left' Only left join is implemented, keeping the index and columns of the original object. overwrite : bool, default True How to handle non-NA values for overlapping keys: * True: overwrite original DataFrame's values with values from `other`. * False: only update values that are NA in the original DataFrame. Returns ------- None : method directly changes calling object See Also -------- DataFrame.merge : For column(s)-on-columns(s) operations. DataFrame.join : Join columns of another DataFrame. DataFrame.hint : Specifies some hint on the current DataFrame. broadcast : Marks a DataFrame as small enough for use in broadcast joins. Examples -------- >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B']) >>> new_df = ps.DataFrame({'B': [4, 5, 6], 'C': [7, 8, 9]}, columns=['B', 'C']) >>> df.update(new_df) >>> df.sort_index() A B 0 1 4 1 2 5 2 3 6 The DataFrame's length does not increase because of the update, only values at matching index/column labels are updated. >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B']) >>> new_df = ps.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}, columns=['B']) >>> df.update(new_df) >>> df.sort_index() A B 0 a d 1 b e 2 c f For Series, its name attribute must be set. >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B']) >>> new_column = ps.Series(['d', 'e'], name='B', index=[0, 2]) >>> df.update(new_column) >>> df.sort_index() A B 0 a d 1 b y 2 c e If `other` contains None the corresponding values are not updated in the original dataframe. >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B']) >>> new_df = ps.DataFrame({'B': [4, None, 6]}, columns=['B']) >>> df.update(new_df) >>> df.sort_index() A B 0 1 4.0 1 2 500.0 2 3 6.0 """ifjoin!="left":raiseNotImplementedError("Only left join is supported")ifisinstance(other,ps.Series):other=other.to_frame()update_columns=list(set(self._internal.column_labels).intersection(set(other._internal.column_labels)))update_sdf=self.join(other[update_columns],rsuffix="_new")._internal.resolved_copy.spark_framedata_fields=self._internal.data_fields.copy()forcolumn_labelsinupdate_columns:column_name=self._internal.spark_column_name_for(column_labels)old_col=scol_for(update_sdf,column_name)new_col=scol_for(update_sdf,other._internal.spark_column_name_for(column_labels)+"_new")ifoverwrite:update_sdf=update_sdf.withColumn(column_name,F.when(new_col.isNull(),old_col).otherwise(new_col))else:update_sdf=update_sdf.withColumn(column_name,F.when(old_col.isNull(),new_col).otherwise(old_col))data_fields[self._internal.column_labels.index(column_labels)]=Nonesdf=update_sdf.select(*[scol_for(update_sdf,col)forcolinself._internal.spark_column_names],*HIDDEN_COLUMNS,)internal=self._internal.with_new_sdf(sdf,data_fields=data_fields)self._update_internal_frame(internal,check_same_anchor=False)
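# Illustrative sketch: `overwrite` selects which side wins in the F.when(...)
# expressions above -- with overwrite=False only nulls in the original frame are
# filled. Hypothetical frames; assumes a running Spark session.
import pyspark.pandas as ps

update_base = ps.DataFrame({"B": [1.0, None, 3.0]})
update_new = ps.DataFrame({"B": [10.0, 20.0, 30.0]})
update_base.update(update_new, overwrite=False)
# Only the missing value is filled: column B becomes [1.0, 20.0, 3.0]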
[docs]defcov(self,min_periods:Optional[int]=None,ddof:int=1)->"DataFrame":""" Compute pairwise covariance of columns, excluding NA/null values. Compute the pairwise covariance among the series of a DataFrame. The returned data frame is the `covariance matrix <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns of the DataFrame. Both NA and null values are automatically excluded from the calculation. (See the note below about bias from missing values.) A threshold can be set for the minimum number of observations for each value created. Comparisons with observations below this threshold will be returned as ``NaN``. This method is generally used for the analysis of time series data to understand the relationship between different measures across time. .. versionadded:: 3.3.0 Parameters ---------- min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. ddof : int, default 1 Delta degrees of freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. .. versionadded:: 3.4.0 Returns ------- DataFrame The covariance matrix of the series of the DataFrame. See Also -------- Series.cov : Compute covariance with another Series. Examples -------- >>> df = ps.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], ... columns=['dogs', 'cats']) >>> df.cov() dogs cats dogs 0.666667 -1.000000 cats -1.000000 1.666667 >>> np.random.seed(42) >>> df = ps.DataFrame(np.random.randn(1000, 5), ... columns=['a', 'b', 'c', 'd', 'e']) >>> df.cov() a b c d e a 0.998438 -0.020161 0.059277 -0.008943 0.014144 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 >>> df.cov(ddof=2) a b c d e a 0.999439 -0.020181 0.059336 -0.008952 0.014159 b -0.020181 1.060413 -0.008551 -0.024762 0.009836 c 0.059336 -0.008551 1.011683 -0.001487 -0.000271 d -0.008952 -0.024762 -0.001487 0.922220 -0.013705 e 0.014159 0.009836 -0.000271 -0.013705 0.978775 >>> df.cov(ddof=-1) a b c d e a 0.996444 -0.020121 0.059158 -0.008926 0.014116 b -0.020121 1.057235 -0.008526 -0.024688 0.009807 c 0.059158 -0.008526 1.008650 -0.001483 -0.000270 d -0.008926 -0.024688 -0.001483 0.919456 -0.013664 e 0.014116 0.009807 -0.000270 -0.013664 0.975842 **Minimum number of periods** This method also supports an optional ``min_periods`` keyword that specifies the required minimum number of non-NA observations for each column pair to have a valid result: >>> np.random.seed(42) >>> df = pd.DataFrame(np.random.randn(20, 3), ... 
columns=['a', 'b', 'c']) >>> df.loc[df.index[:5], 'a'] = np.nan >>> df.loc[df.index[5:10], 'b'] = np.nan >>> sdf = ps.from_pandas(df) >>> sdf.cov(min_periods=12) a b c a 0.316741 NaN -0.150812 b NaN 1.248003 0.191417 c -0.150812 0.191417 0.895202 """ifnotisinstance(ddof,int):raiseTypeError("ddof must be integer")min_periods=1ifmin_periodsisNoneelsemin_periods# Only compute covariance for Boolean and Numeric except Decimalpsdf=self[[colforcolinself.columnsifisinstance(self[col].spark.data_type,BooleanType)or(isinstance(self[col].spark.data_type,NumericType)andnotisinstance(self[col].spark.data_type,DecimalType))]]num_cols=len(psdf.columns)cov=np.zeros([num_cols,num_cols])ifnum_cols==0:returnDataFrame()iflen(psdf)<min_periods:cov.fill(np.nan)returnDataFrame(cov,columns=psdf.columns,index=psdf.columns)data_cols=psdf._internal.data_spark_column_namescov_scols=[]count_not_null_scols=[]# Count number of null row between two columns# Example:# a b c# 0 1 1 1# 1 NaN 2 2# 2 3 NaN 3# 3 4 4 4## a b c# a count(a, a) count(a, b) count(a, c)# b count(b, b) count(b, c)# c count(c, c)## count_not_null_scols =# [F.count(a, a), F.count(a, b), F.count(a, c), F.count(b, b), F.count(b, c), F.count(c, c)]forrinrange(0,num_cols):forcinrange(r,num_cols):count_not_null_scols.append(F.count(F.when(F.col(data_cols[r]).isNotNull()&F.col(data_cols[c]).isNotNull(),1)))count_not_null=(psdf._internal.spark_frame.replace(float("nan"),None).select(*count_not_null_scols).head(1)[0])# Calculate covariance between two columns# Example:# with min_periods = 3# a b c# 0 1 1 1# 1 NaN 2 2# 2 3 NaN 3# 3 4 4 4## a b c# a cov(a, a) None cov(a, c)# b cov(b, b) cov(b, c)# c cov(c, c)## cov_scols = [F.cov(a, a), None, F.cov(a, c), F.cov(b, b), F.cov(b, c), F.cov(c, c)]step=0forrinrange(0,num_cols):step+=rforcinrange(r,num_cols):cov_scols.append(SF.covar(F.col(data_cols[r]).cast("double"),F.col(data_cols[c]).cast("double"),ddof)ifcount_not_null[r*num_cols+c-step]>=min_periodselseF.lit(None))pair_cov=psdf._internal.spark_frame.select(*cov_scols).head(1)[0]# Convert from row to 2D array# Example:# pair_cov = [cov(a, a), None, cov(a, c), cov(b, b), cov(b, c), cov(c, c)]## cov =## a b c# a cov(a, a) None cov(a, c)# b cov(b, b) cov(b, c)# c cov(c, c)step=0forrinrange(0,num_cols):step+=rforcinrange(r,num_cols):cov[r][c]=pair_cov[r*num_cols+c-step]# Copy values# Example:# cov =# a b c# a cov(a, a) None cov(a, c)# b None cov(b, b) cov(b, c)# c cov(a, c) cov(b, c) cov(c, c)cov=cov+cov.T-np.diag(np.diag(cov))returnDataFrame(cov,columns=psdf.columns,index=psdf.columns)
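# Illustrative sketch: `cov` only computes the upper triangle of the matrix and packs
# it into a flat list; the offset ``r * num_cols + c - step`` (with `step`
# accumulating r) is the flat position of the pair (r, c). A plain-Python check of
# that indexing for a hypothetical 3-column case:
demo_num_cols = 3
flat_positions = []
demo_step = 0
for demo_r in range(demo_num_cols):
    demo_step += demo_r
    for demo_c in range(demo_r, demo_num_cols):
        flat_positions.append(demo_r * demo_num_cols + demo_c - demo_step)
# flat_positions == [0, 1, 2, 3, 4, 5]: the six upper-triangle pairs in row order.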
[docs]defsample(self,n:Optional[int]=None,frac:Optional[float]=None,replace:bool=False,random_state:Optional[int]=None,ignore_index:bool=False,)->"DataFrame":""" Return a random sample of items from an axis of object. Please call this function using named argument by specifying the ``frac`` argument. You can use `random_state` for reproducibility. However, note that different from pandas, specifying a seed in pandas-on-Spark/Spark does not guarantee the sampled rows will be fixed. The result set depends on not only the seed, but also how the data is distributed across machines and to some extent network randomness when shuffle operations are involved. Even in the simplest case, the result set will depend on the system's CPU core count. Parameters ---------- n : int, optional Number of items to return. This is currently NOT supported. Use frac instead. frac : float, optional Fraction of axis items to return. replace : bool, default False Sample with or without replacement. random_state : int, optional Seed for the random number generator (if int). ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. .. versionadded:: 3.4.0 Returns ------- Series or DataFrame A new object of same type as caller containing the sampled items. Examples -------- >>> df = ps.DataFrame({'num_legs': [2, 4, 8, 0], ... 'num_wings': [2, 0, 0, 0], ... 'num_specimen_seen': [10, 2, 1, 8]}, ... index=['falcon', 'dog', 'spider', 'fish'], ... columns=['num_legs', 'num_wings', 'num_specimen_seen']) >>> df # doctest: +SKIP num_legs num_wings num_specimen_seen falcon 2 2 10 dog 4 0 2 spider 8 0 1 fish 0 0 8 A random 25% sample of the ``DataFrame``. Note that we use `random_state` to ensure the reproducibility of the examples. >>> df.sample(frac=0.25, random_state=1) # doctest: +SKIP num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 A random 50% sample of the ``DataFrame``, while ignoring the index. >>> df.sample(frac=0.5, random_state=1, ignore_index=True) # doctest: +SKIP num_legs num_wings num_specimen_seen 0 4 0 2 1 8 0 1 2 0 0 8 Extract 25% random elements from the ``Series`` ``df['num_legs']`` with replacement so, the same items could appear more than once. >>> df['num_legs'].sample(frac=0.4, replace=True, random_state=1) # doctest: +SKIP falcon 2 spider 8 spider 8 Name: num_legs, dtype: int64 Specifying the exact number of items to return is not supported now. >>> df.sample(n=5) # doctest: +ELLIPSIS Traceback (most recent call last): ... NotImplementedError: Function sample currently does not support specifying ... """# Note: we don't run any of the doctests because the result can change depending on the# system's core count.ifnisnotNone:raiseNotImplementedError("Function sample currently does not support specifying ""exact number of items to return. Use frac instead.")iffracisNone:raiseValueError("frac must be specified.")sdf=self._internal.resolved_copy.spark_frame.sample(withReplacement=replace,fraction=frac,seed=random_state)ifignore_index:returnDataFrame(sdf.drop(*self._internal.index_spark_column_names))else:returnDataFrame(self._internal.with_new_sdf(sdf))
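# Illustrative sketch: `sample` is a thin wrapper around Spark's DataFrame.sample, so
# only a fraction (`frac`) is supported and the seed does not pin an exact row set.
# Hypothetical frame; assumes a running Spark session.
import pyspark.pandas as ps

sample_demo = ps.DataFrame({"x": range(100)})
sample_subset = sample_demo.sample(frac=0.1, random_state=42)  # roughly 10 rows, not exactly 10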
[docs]defastype(self,dtype:Union[str,Dtype,Dict[Name,Union[str,Dtype]]])->"DataFrame":""" Cast a pandas-on-Spark object to a specified dtype ``dtype``. Parameters ---------- dtype : data type, or dict of column name -> data type Use a numpy.dtype or Python type to cast entire pandas-on-Spark object to the same type. Alternatively, use {col: dtype, ...}, where col is a column label and dtype is a numpy.dtype or Python type to cast one or more of the DataFrame's columns to column-specific types. Returns ------- casted : same type as caller See Also -------- to_datetime : Convert argument to datetime. Examples -------- >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}, dtype='int64') >>> df a b 0 1 1 1 2 2 2 3 3 Convert to float type: >>> df.astype('float') a b 0 1.0 1.0 1 2.0 2.0 2 3.0 3.0 Convert to int64 type back: >>> df.astype('int64') a b 0 1 1 1 2 2 2 3 3 Convert column a to float type: >>> df.astype({'a': float}) a b 0 1.0 1 1 2.0 2 2 3.0 3 """applied=[]ifis_dict_like(dtype):dtype_dict=cast(Dict[Name,Union[str,Dtype]],dtype)forcol_nameindtype_dict.keys():ifcol_namenotinself.columns:raiseKeyError("Only a column name can be used for the ""key in a dtype mappings argument.")forcol_name,colinself.items():ifcol_nameindtype_dict:applied.append(col.astype(dtype=dtype_dict[col_name]))else:applied.append(col)else:forcol_name,colinself.items():applied.append(col.astype(dtype=cast(Union[str,Dtype],dtype)))returnDataFrame(self._internal.with_new_columns(applied))
    def add_prefix(self, prefix: str) -> "DataFrame":
        """
        Prefix labels with string `prefix`.

        For Series, the row labels are prefixed.
        For DataFrame, the column labels are prefixed.

        Parameters
        ----------
        prefix : str
            The string to add before each label.

        Returns
        -------
        DataFrame
            New DataFrame with updated labels.

        See Also
        --------
        Series.add_prefix: Prefix row labels with string `prefix`.
        Series.add_suffix: Suffix row labels with string `suffix`.
        DataFrame.add_suffix: Suffix column labels with string `suffix`.

        Examples
        --------
        >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B'])
        >>> df
           A  B
        0  1  3
        1  2  4
        2  3  5
        3  4  6

        >>> df.add_prefix('col_')
           col_A  col_B
        0      1      3
        1      2      4
        2      3      5
        3      4      6
        """
        assert isinstance(prefix, str)
        return self._apply_series_op(
            lambda psser: psser.rename(tuple([prefix + i for i in psser._column_label]))
        )
    def add_suffix(self, suffix: str) -> "DataFrame":
        """
        Suffix labels with string `suffix`.

        For Series, the row labels are suffixed.
        For DataFrame, the column labels are suffixed.

        Parameters
        ----------
        suffix : str
            The string to add after each label.

        Returns
        -------
        DataFrame
            New DataFrame with updated labels.

        See Also
        --------
        Series.add_prefix: Prefix row labels with string `prefix`.
        Series.add_suffix: Suffix row labels with string `suffix`.
        DataFrame.add_prefix: Prefix column labels with string `prefix`.

        Examples
        --------
        >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B'])
        >>> df
           A  B
        0  1  3
        1  2  4
        2  3  5
        3  4  6

        >>> df.add_suffix('_col')
           A_col  B_col
        0      1      3
        1      2      4
        2      3      5
        3      4      6
        """
        assert isinstance(suffix, str)
        return self._apply_series_op(
            lambda psser: psser.rename(tuple([i + suffix for i in psser._column_label]))
        )
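    # Illustrative sketch: both helpers only rename column labels via
    # `_apply_series_op`, so they can be chained without touching the data.
    # ``psdf`` here is a placeholder DataFrame:
    #
    #     >>> psdf = ps.DataFrame({'A': [1, 2], 'B': [3, 4]})
    #     >>> list(psdf.add_prefix('col_').add_suffix('_v').columns)
    #     ['col_A_v', 'col_B_v']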
# TODO(SPARK-46164): include and exclude should be implemented.
[docs]defdescribe(self,percentiles:Optional[List[float]]=None)->"DataFrame":""" Generate descriptive statistics that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding ``NaN`` values. Analyzes both numeric and object series, as well as ``DataFrame`` column sets of mixed data types. The output will vary depending on what is provided. Refer to the notes below for more detail. Parameters ---------- percentiles : list of ``float`` in range [0.0, 1.0], default [0.25, 0.5, 0.75] A list of percentiles to be computed. Returns ------- DataFrame Summary statistics of the Dataframe provided. See Also -------- DataFrame.count: Count number of non-NA/null observations. DataFrame.max: Maximum of the values in the object. DataFrame.min: Minimum of the values in the object. DataFrame.mean: Mean of the values. DataFrame.std: Standard deviation of the observations. Notes ----- For numeric data, the result's index will include ``count``, ``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max``. For object data (e.g. strings or timestamps), the result’s index will include ``count``, ``unique``, ``top``, and ``freq``. The ``top`` is the most common value. The ``freq`` is the most common value’s frequency. Timestamps also include the ``first`` and ``last`` items. Examples -------- Describing a numeric ``Series``. >>> s = ps.Series([1, 2, 3]) >>> s.describe() count 3.0 mean 2.0 std 1.0 min 1.0 25% 1.0 50% 2.0 75% 3.0 max 3.0 dtype: float64 Describing a ``DataFrame``. Only numeric fields are returned. >>> df = ps.DataFrame({'numeric1': [1, 2, 3], ... 'numeric2': [4.0, 5.0, 6.0], ... 'object': ['a', 'b', 'c'] ... }, ... columns=['numeric1', 'numeric2', 'object']) >>> df.describe() numeric1 numeric2 count 3.0 3.0 mean 2.0 5.0 std 1.0 1.0 min 1.0 4.0 25% 1.0 4.0 50% 2.0 5.0 75% 3.0 6.0 max 3.0 6.0 For multi-index columns: >>> df.columns = [('num', 'a'), ('num', 'b'), ('obj', 'c')] >>> df.describe() # doctest: +NORMALIZE_WHITESPACE num a b count 3.0 3.0 mean 2.0 5.0 std 1.0 1.0 min 1.0 4.0 25% 1.0 4.0 50% 2.0 5.0 75% 3.0 6.0 max 3.0 6.0 >>> df[('num', 'b')].describe() count 3.0 mean 5.0 std 1.0 min 4.0 25% 4.0 50% 5.0 75% 6.0 max 6.0 Name: (num, b), dtype: float64 Describing a ``DataFrame`` and selecting custom percentiles. >>> df = ps.DataFrame({'numeric1': [1, 2, 3], ... 'numeric2': [4.0, 5.0, 6.0] ... }, ... columns=['numeric1', 'numeric2']) >>> df.describe(percentiles = [0.85, 0.15]) numeric1 numeric2 count 3.0 3.0 mean 2.0 5.0 std 1.0 1.0 min 1.0 4.0 15% 1.0 4.0 50% 2.0 5.0 85% 3.0 6.0 max 3.0 6.0 Describing a column from a ``DataFrame`` by accessing it as an attribute. >>> df.numeric1.describe() count 3.0 mean 2.0 std 1.0 min 1.0 25% 1.0 50% 2.0 75% 3.0 max 3.0 Name: numeric1, dtype: float64 Describing a column from a ``DataFrame`` by accessing it as an attribute and selecting custom percentiles. 
>>> df.numeric1.describe(percentiles = [0.85, 0.15]) count 3.0 mean 2.0 std 1.0 min 1.0 15% 1.0 50% 2.0 85% 3.0 max 3.0 Name: numeric1, dtype: float64 """psser_numeric:List[Series]=[]psser_string:List[Series]=[]psser_timestamp:List[Series]=[]spark_data_types:List[DataType]=[]column_labels:Optional[List[Label]]=[]column_names:List[str]=[]forlabelinself._internal.column_labels:psser=self._psser_for(label)spark_data_type=psser.spark.data_typeifisinstance(spark_data_type,NumericType):psser_numeric.append(psser)column_labels.append(label)spark_data_types.append(spark_data_type)elifisinstance(spark_data_type,(TimestampType,TimestampNTZType)):psser_timestamp.append(psser)column_labels.append(label)spark_data_types.append(spark_data_type)else:psser_string.append(psser)column_names.append(self._internal.spark_column_name_for(label))ifpercentilesisnotNone:ifany((p<0.0)or(p>1.0)forpinpercentiles):raiseValueError("Percentiles should all be in the interval [0, 1]")# appending 50% if not in percentiles alreadypercentiles=(percentiles+[0.5])if0.5notinpercentileselsepercentileselse:percentiles=[0.25,0.5,0.75]# Identify the casesis_all_string_type=(len(psser_numeric)==0andlen(psser_timestamp)==0andlen(psser_string)>0)is_all_numeric_type=len(psser_numeric)>0andlen(psser_timestamp)==0has_timestamp_type=len(psser_timestamp)>0has_numeric_type=len(psser_numeric)>0ifis_all_string_type:# Handling string type columns# We will retrive the `count`, `unique`, `top` and `freq`.internal=self._internal.resolved_copyexprs_string=[internal.spark_column_for(psser._column_label)forpsserinpsser_string]sdf=internal.spark_frame.select(*exprs_string)# Get `count` & `unique` for each columnscounts,uniques=map(lambdax:x[1:],sdf.summary("count","count_distinct").take(2))# Handling Empty DataFrameiflen(counts)==0orcounts[0]=="0":data=dict()forpsserinpsser_string:data[psser.name]=[0,0,np.nan,np.nan]returnDataFrame(data,index=["count","unique","top","freq"])# Get `top` & `freq` for each columnstops=[]freqs=[]# TODO(SPARK-37711): We should do it in single pass since invoking Spark job# for every columns is too expensive.forcolumninexprs_string:top,freq=sdf.groupby(column).count().sort("count",ascending=False).first()tops.append(str(top))freqs.append(str(freq))stats=[counts,uniques,tops,freqs]stats_names=["count","unique","top","freq"]result:DataFrame=DataFrame(data=stats,index=stats_names,columns=column_names,)elifis_all_numeric_type:# Handling numeric columnsexprs_numeric=[psser._dtype_op.nan_to_null(psser).spark.columnforpsserinpsser_numeric]formatted_perc=["{:.0%}".format(p)forpinsorted(percentiles)]stats=["count","mean","stddev","min",*formatted_perc,"max"]# In this case, we can simply use `summary` to calculate the stats.sdf=self._internal.spark_frame.select(*exprs_numeric).summary(*stats)sdf=sdf.replace("stddev","std",subset=["summary"])internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,"summary")],column_labels=column_labels,data_spark_columns=[scol_for(sdf,self._internal.spark_column_name_for(label))forlabelincolumn_labels],)result=DataFrame(internal).astype("float64")elifhas_timestamp_type:internal=self._internal.resolved_copycolumn_names=[internal.spark_column_name_for(column_label)forcolumn_labelincolumn_labels]column_length=len(column_labels)# Apply stat functions for each column.count_exprs=map(F.count,column_names)min_exprs=map(F.min,column_names)# Here we try to flat the multiple maps into single list that contains each calculated# percentile using `chain`.# e.g. 
flat the `[<map object at 0x7fc1907dc280>, <map object at 0x7fc1907dcc70>]`# to `[Column<'percentile_approx(A, 0.2, 10000)'>,# Column<'percentile_approx(B, 0.2, 10000)'>,# Column<'percentile_approx(A, 0.5, 10000)'>,# Column<'percentile_approx(B, 0.5, 10000)'>]`perc_exprs=chain(*[map(F.percentile_approx,column_names,[percentile]*column_length)forpercentileinpercentiles])max_exprs=map(F.max,column_names)mean_exprs=[]forcolumn_name,spark_data_typeinzip(column_names,spark_data_types):mean_exprs.append(F.mean(column_name).astype(spark_data_type))exprs=[*count_exprs,*mean_exprs,*min_exprs,*perc_exprs,*max_exprs]formatted_perc=["{:.0%}".format(p)forpinsorted(percentiles)]stats_names=["count","mean","min",*formatted_perc,"max"]# If not all columns are timestamp type,# we also need to calculate the `std` for numeric columnsifhas_numeric_type:std_exprs=[]forlabel,spark_data_typeinzip(column_labels,spark_data_types):column_name=label[0]ifisinstance(spark_data_type,(TimestampType,TimestampNTZType)):std_exprs.append(F.lit(None).alias("stddev_samp({})".format(column_name)))else:std_exprs.append(F.stddev(column_name))exprs.extend(std_exprs)stats_names.append("std")# Select stats for all columns at once.sdf=internal.spark_frame.select(exprs)stat_values=sdf.first()num_stats=int(len(exprs)/column_length)# `column_name_stats_kv` is key-value store that has column name as key, and# the stats as values e.g. {"A": [{count_value}, {min_value}, ...],# "B": [{count_value}, {min_value} ...]}column_name_stats_kv:Dict[str,List[str]]=defaultdict(list)fori,column_nameinenumerate(column_names):forfirst_stat_idxinrange(num_stats):column_name_stats_kv[column_name].append(stat_values[(first_stat_idx*column_length)+i])# For timestamp type columns, we should cast the column type to string.forkey,spark_data_typeinzip(column_name_stats_kv,spark_data_types):ifisinstance(spark_data_type,(TimestampType,TimestampNTZType)):column_name_stats_kv[key]=[str(value)forvalueincolumn_name_stats_kv[key]]result:DataFrame=DataFrame(# type: ignore[no-redef]data=column_name_stats_kv,index=stats_names,columns=column_names,)else:# Empty DataFrame without columnraiseValueError("Cannot describe a DataFrame without columns")returnresult
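    # Illustrative sketch: for the all-numeric path above, the statistics come from
    # `pyspark.sql.DataFrame.summary`; ``psdf`` stands for a numeric pandas-on-Spark
    # DataFrame and the percentile labels shown are the defaults:
    #
    #     >>> sdf = psdf.to_spark()
    #     >>> sdf.summary("count", "mean", "stddev", "min", "25%", "50%", "75%", "max").show()
    #
    # (`stddev` is then relabeled to `std` and the result cast to float64, as done in
    # the code above.)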
[docs]defdrop_duplicates(self,subset:Optional[Union[Name,List[Name]]]=None,keep:Union[bool,str]="first",inplace:bool=False,ignore_index:bool=False,)->Optional["DataFrame"]:""" Return DataFrame with duplicate rows removed, optionally only considering certain columns. Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all the columns. keep : {'first', 'last', False}, default 'first' Determines which duplicates (if any) to keep. - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. inplace : boolean, default False Whether to drop duplicates in place or to return a copy. ignore_index : boolean, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- DataFrame DataFrame with duplicates removed or None if ``inplace=True``. >>> df = ps.DataFrame( ... {'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, columns = ['a', 'b']) >>> df a b 0 1 a 1 2 a 2 2 a 3 2 c 4 3 d >>> df.drop_duplicates().sort_index() a b 0 1 a 1 2 a 3 2 c 4 3 d >>> df.drop_duplicates(ignore_index=True).sort_index() a b 0 1 a 1 2 a 2 2 c 3 3 d >>> df.drop_duplicates('a').sort_index() a b 0 1 a 1 2 a 4 3 d >>> df.drop_duplicates(['a', 'b']).sort_index() a b 0 1 a 1 2 a 3 2 c 4 3 d >>> df.drop_duplicates(keep='last').sort_index() a b 0 1 a 2 2 a 3 2 c 4 3 d >>> df.drop_duplicates(keep=False).sort_index() a b 0 1 a 3 2 c 4 3 d """inplace=validate_bool_kwarg(inplace,"inplace")sdf,column=self._mark_duplicates(subset,keep)sdf=sdf.where(~scol_for(sdf,column)).drop(column)internal=self._internal.with_new_sdf(sdf)psdf:DataFrame=DataFrame(internal)ifinplace:ifignore_index:psdf.reset_index(drop=True,inplace=inplace)self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf.reset_index(drop=True)ifignore_indexelsepsdf
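    # Illustrative sketch: duplicates are flagged by the internal helper
    # `_mark_duplicates` (referenced above) and then filtered out, which is why all
    # three ``keep`` modes are supported, unlike a bare
    # `pyspark.sql.DataFrame.dropDuplicates`, which keeps one arbitrary occurrence
    # per key. ``psdf`` is a placeholder DataFrame:
    #
    #     >>> psdf = ps.DataFrame({'a': [1, 2, 2], 'b': ['x', 'y', 'y']})
    #     >>> psdf.drop_duplicates(subset=['a'], keep='last', ignore_index=True).sort_index()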
[docs]defreindex(self,labels:Optional[Sequence[Any]]=None,index:Optional[Union["Index",Sequence[Any]]]=None,columns:Optional[Union[pd.Index,Sequence[Any]]]=None,axis:Optional[Axis]=None,copy:Optional[bool]=True,fill_value:Optional[Any]=None,)->"DataFrame":""" Conform DataFrame to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and ``copy=False``. Parameters ---------- labels: array-like, optional New labels / index to conform the axis specified by ‘axis’ to. index, columns: array-like, optional New labels / index to conform to, should be specified using keywords. Preferably an Index object to avoid duplicating data axis: int or str, optional Axis to target. Can be either the axis name (‘index’, ‘columns’) or number (0, 1). copy : bool, default True Return a new object, even if the passed indexes are the same. fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any "compatible" value. Returns ------- DataFrame with changed index. See Also -------- DataFrame.set_index : Set row labels. DataFrame.reset_index : Remove row labels or move them to new columns. Examples -------- ``DataFrame.reindex`` supports two calling conventions * ``(index=index_labels, columns=column_labels, ...)`` * ``(labels, axis={'index', 'columns'}, ...)`` We *highly* recommend using keyword arguments to clarify your intent. Create a dataframe with some fictional data. >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] >>> df = ps.DataFrame({ ... 'http_status': [200, 200, 404, 404, 301], ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, ... index=index, ... columns=['http_status', 'response_time']) >>> df http_status response_time Firefox 200 0.04 Chrome 200 0.02 Safari 404 0.07 IE10 404 0.08 Konqueror 301 1.00 Create a new index and reindex the dataframe. By default values in the new index that do not have corresponding records in the dataframe are assigned ``NaN``. >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', ... 'Chrome'] >>> df.reindex(new_index).sort_index() http_status response_time Chrome 200.0 0.02 Comodo Dragon NaN NaN IE10 404.0 0.08 Iceweasel NaN NaN Safari 404.0 0.07 We can fill in the missing values by passing a value to the keyword ``fill_value``. >>> df.reindex(new_index, fill_value=0, copy=False).sort_index() http_status response_time Chrome 200 0.02 Comodo Dragon 0 0.00 IE10 404 0.08 Iceweasel 0 0.00 Safari 404 0.07 We can also reindex the columns. >>> df.reindex(columns=['http_status', 'user_agent']).sort_index() http_status user_agent Chrome 200 NaN Firefox 200 NaN IE10 404 NaN Konqueror 301 NaN Safari 404 NaN Or we can use "axis-style" keyword arguments >>> df.reindex(['http_status', 'user_agent'], axis="columns").sort_index() http_status user_agent Chrome 200 NaN Firefox 200 NaN IE10 404 NaN Konqueror 301 NaN Safari 404 NaN To further illustrate the filling functionality in ``reindex``, we will create a dataframe with a monotonically increasing index (for example, a sequence of dates). >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') >>> df2 = ps.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, ... index=date_index) >>> df2.sort_index() prices 2010-01-01 100.0 2010-01-02 101.0 2010-01-03 NaN 2010-01-04 100.0 2010-01-05 89.0 2010-01-06 88.0 Suppose we decide to expand the dataframe to cover a wider date range. 
>>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') >>> df2.reindex(date_index2).sort_index() prices 2009-12-29 NaN 2009-12-30 NaN 2009-12-31 NaN 2010-01-01 100.0 2010-01-02 101.0 2010-01-03 NaN 2010-01-04 100.0 2010-01-05 89.0 2010-01-06 88.0 2010-01-07 NaN """ifaxisisnotNoneand(indexisnotNoneorcolumnsisnotNone):raiseTypeError("Cannot specify both 'axis' and any of 'index' or 'columns'.")iflabelsisnotNone:axis=validate_axis(axis)ifaxis==0:index=labelselifaxis==1:columns=labelsifindexisnotNoneandnotis_list_like(index):raiseTypeError("Index must be called with a collection of some kind, ""%s was passed"%type(index))ifcolumnsisnotNoneandnotis_list_like(columns):raiseTypeError("Columns must be called with a collection of some kind, ""%s was passed"%type(columns))df=selfifindexisnotNone:df=df._reindex_index(index,fill_value)ifcolumnsisnotNone:df=df._reindex_columns(columns,fill_value)# Copyifcopyanddfisself:returndf.copy()else:returndf
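    # Illustrative sketch: row reindexing is implemented as a right outer join against
    # the new labels (see `_reindex_index` below), while column reindexing simply adds
    # literal ``fill_value`` columns for labels that do not exist yet. The labels and
    # ``psdf`` below are placeholders:
    #
    #     >>> psdf.reindex(index=['x', 'y'], columns=['a', 'new'], fill_value=0)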
def_reindex_index(self,index:Optional[Union["Index",Sequence[Any]]],fill_value:Optional[Any])->"DataFrame":# When axis is index, we can mimic pandas by a right outer join.nlevels=self._internal.index_levelassertnlevels<=1or(isinstance(index,ps.MultiIndex)andnlevels==index.nlevels),"MultiIndex DataFrame can only be reindexed with a similar pandas-on-Spark MultiIndex."index_columns=self._internal.index_spark_column_namesframe=self._internal.resolved_copy.spark_frame.drop(NATURAL_ORDER_COLUMN_NAME)ifisinstance(index,ps.Index):ifnlevels!=index.nlevels:returnDataFrame(index._internal.with_new_columns([])).reindex(columns=self.columns,fill_value=fill_value)index_names=index._internal.index_namesscols=index._internal.index_spark_columnslabels=index._internal.spark_frame.select([scol.alias(index_column)forscol,index_columninzip(scols,index_columns)])else:index=ps.Index(list(index))labels=index._internal.spark_frame.select(index.spark.column.alias(index_columns[0]))index_names=self._internal.index_namesiffill_valueisnotNone:frame_index_columns=[verify_temp_column_name(frame,"__frame_index_column_{}__".format(i))foriinrange(nlevels)]index_scols=[scol_for(frame,index_col).alias(frame_index_col)forindex_col,frame_index_colinzip(index_columns,frame_index_columns)]scols=self._internal.resolved_copy.data_spark_columnsframe=frame.select(index_scols+scols)temp_fill_value=verify_temp_column_name(frame,"__fill_value__")labels=labels.withColumn(temp_fill_value,F.lit(fill_value))frame_index_scols=[scol_for(frame,col)forcolinframe_index_columns]labels_index_scols=[scol_for(labels,col)forcolinindex_columns]joined_df=frame.join(labels,on=[fcol==lcolforfcol,lcolinzip(frame_index_scols,labels_index_scols)],how="right",)joined_df=joined_df.select(*labels_index_scols,*[F.when(reduce(lambdac1,c2:c1&c2,[fcol.isNull()&lcol.isNotNull()forfcol,lcolinzip(frame_index_scols,labels_index_scols)],),scol_for(joined_df,temp_fill_value),).otherwise(scol_for(joined_df,col)).alias(col)forcolinself._internal.data_spark_column_names],)data_fields=Noneelse:joined_df=frame.join(labels,on=index_columns,how="right")data_fields=[field.copy(nullable=True)forfieldinself._internal.data_fields]sdf=joined_df.drop(NATURAL_ORDER_COLUMN_NAME)internal=self._internal.copy(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolinself._internal.index_spark_column_names],index_names=index_names,index_fields=[field.copy(name=name)forfield,nameinzip(index._internal.index_fields,self._internal.index_spark_column_names)],data_spark_columns=[scol_for(sdf,col)forcolinself._internal.data_spark_column_names],data_fields=data_fields,)returnDataFrame(internal)def_reindex_columns(self,columns:Optional[Union[pd.Index,Sequence[Any]]],fill_value:Optional[Any])->"DataFrame":level=self._internal.column_labels_leveliflevel>1:label_columns=list(columns)forcolinlabel_columns:ifnotisinstance(col,tuple):raiseTypeError("Expected tuple, got {}".format(type(col).__name__))else:label_columns=[(col,)forcolincolumns]forcolinlabel_columns:iflen(col)!=level:raiseValueError("shape (1,{}) doesn't match the shape 
(1,{})".format(len(col),level))fill_value=np.naniffill_valueisNoneelsefill_valuescols_or_pssers:List[Union[PySparkColumn,"Series"]]=[]labels=[]forlabelinlabel_columns:iflabelinself._internal.column_labels:scols_or_pssers.append(self._psser_for(label))else:scols_or_pssers.append(F.lit(fill_value).alias(name_like_string(label)))labels.append(label)ifisinstance(columns,pd.Index):column_label_names=[nameifis_name_like_tuple(name)else(name,)fornameincolumns.names]internal=self._internal.with_new_columns(scols_or_pssers,column_labels=labels,column_label_names=column_label_names)else:internal=self._internal.with_new_columns(scols_or_pssers,column_labels=labels)returnDataFrame(internal)
[docs]defreindex_like(self,other:"DataFrame",copy:bool=True)->"DataFrame":""" Return a DataFrame with matching indices as other object. Conform the object to the same index on all axes. Places NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and copy=False. Parameters ---------- other : DataFrame Its row and column indices are used to define the new indices of this object. copy : bool, default True Return a new object, even if the passed indexes are the same. Returns ------- DataFrame DataFrame with changed indices on each axis. See Also -------- DataFrame.set_index : Set row labels. DataFrame.reset_index : Remove row labels or move them to new columns. DataFrame.reindex : Change to new indices or expand indices. Notes ----- Same as calling ``.reindex(index=other.index, columns=other.columns,...)``. Examples -------- >>> df1 = ps.DataFrame([[24.3, 75.7, 'high'], ... [31, 87.8, 'high'], ... [22, 71.6, 'medium'], ... [35, 95, 'medium']], ... columns=['temp_celsius', 'temp_fahrenheit', ... 'windspeed'], ... index=pd.date_range(start='2014-02-12', ... end='2014-02-15', freq='D')) >>> df1 temp_celsius temp_fahrenheit windspeed 2014-02-12 24.3 75.7 high 2014-02-13 31.0 87.8 high 2014-02-14 22.0 71.6 medium 2014-02-15 35.0 95.0 medium >>> df2 = ps.DataFrame([[28, 'low'], ... [30, 'low'], ... [35.1, 'medium']], ... columns=['temp_celsius', 'windspeed'], ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', ... '2014-02-15'])) >>> df2 temp_celsius windspeed 2014-02-12 28.0 low 2014-02-13 30.0 low 2014-02-15 35.1 medium >>> df2.reindex_like(df1).sort_index() # doctest: +NORMALIZE_WHITESPACE temp_celsius temp_fahrenheit windspeed 2014-02-12 28.0 NaN low 2014-02-13 30.0 NaN low 2014-02-14 NaN NaN None 2014-02-15 35.1 NaN medium """ifisinstance(other,DataFrame):returnself.reindex(index=other.index,columns=other.columns,copy=copy)else:raiseTypeError("other must be a pandas-on-Spark DataFrame")
[docs]defmelt(self,id_vars:Optional[Union[Name,List[Name]]]=None,value_vars:Optional[Union[Name,List[Name]]]=None,var_name:Optional[Union[str,List[str]]]=None,value_name:str="value",)->"DataFrame":""" Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set. This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other columns, considered measured variables (`value_vars`), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. Parameters ---------- frame : DataFrame id_vars : tuple, list, or ndarray, optional Column(s) to use as identifier variables. value_vars : tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. var_name : scalar, default 'variable' Name to use for the 'variable' column. If None it uses `frame.columns.name` or ‘variable’. value_name : scalar, default 'value' Name to use for the 'value' column. Returns ------- DataFrame Unpivoted DataFrame. Examples -------- >>> df = ps.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, ... 'B': {0: 1, 1: 3, 2: 5}, ... 'C': {0: 2, 1: 4, 2: 6}}, ... columns=['A', 'B', 'C']) >>> df A B C 0 a 1 2 1 b 3 4 2 c 5 6 >>> ps.melt(df) variable value 0 A a 1 B 1 2 C 2 3 A b 4 B 3 5 C 4 6 A c 7 B 5 8 C 6 >>> df.melt(id_vars='A') A variable value 0 a B 1 1 a C 2 2 b B 3 3 b C 4 4 c B 5 5 c C 6 >>> df.melt(value_vars='A') variable value 0 A a 1 A b 2 A c >>> ps.melt(df, id_vars=['A', 'B']) A B variable value 0 a 1 C 2 1 b 3 C 4 2 c 5 C 6 >>> df.melt(id_vars=['A'], value_vars=['C']) A variable value 0 a C 2 1 b C 4 2 c C 6 The names of 'variable' and 'value' columns can be customized: >>> ps.melt(df, id_vars=['A'], value_vars=['B'], ... 
var_name='myVarname', value_name='myValname') A myVarname myValname 0 a B 1 1 b B 3 2 c B 5 """column_labels=self._internal.column_labelsifid_varsisNone:id_vars=[]else:ifisinstance(id_vars,tuple):ifself._internal.column_labels_level==1:id_vars=[idvifis_name_like_tuple(idv)else(idv,)foridvinid_vars]else:raiseValueError("id_vars must be a list of tuples"" when columns are a MultiIndex")elifis_name_like_value(id_vars):id_vars=[(id_vars,)]else:id_vars=[idvifis_name_like_tuple(idv)else(idv,)foridvinid_vars]non_existence_col=[idvforidvinid_varsifidvnotincolumn_labels]iflen(non_existence_col)!=0:raveled_column_labels:np.ndarray[Any,np.dtype[Any]]=np.ravel(column_labels)missing=[necfornecinnp.ravel(non_existence_col)ifnecnotinraveled_column_labels]iflen(missing)!=0:raiseKeyError("The following 'id_vars' are not present"" in the DataFrame: {}".format(missing))else:raiseKeyError("None of {} are in the {}".format(non_existence_col,column_labels))ifvalue_varsisNone:value_vars=[]else:ifisinstance(value_vars,tuple):ifself._internal.column_labels_level==1:value_vars=[valvifis_name_like_tuple(valv)else(valv,)forvalvinvalue_vars]else:raiseValueError("value_vars must be a list of tuples"" when columns are a MultiIndex")elifis_name_like_value(value_vars):value_vars=[(value_vars,)]else:value_vars=[valvifis_name_like_tuple(valv)else(valv,)forvalvinvalue_vars]non_existence_col=[valvforvalvinvalue_varsifvalvnotincolumn_labels]iflen(non_existence_col)!=0:raveled_column_labels=np.ravel(column_labels)missing=[necfornecinnp.ravel(non_existence_col)ifnecnotinraveled_column_labels]iflen(missing)!=0:raiseKeyError("The following 'value_vars' are not present"" in the DataFrame: {}".format(missing))else:raiseKeyError("None of {} are in the {}".format(non_existence_col,column_labels))iflen(value_vars)==0:value_vars=column_labelscolumn_labels=[labelforlabelincolumn_labelsiflabelnotinid_vars]sdf=self._internal.spark_frameifvar_nameisNone:if(self._internal.column_labels_level==1andself._internal.column_label_names[0]isNone):var_name=["variable"]else:var_name=[name_like_string(name)ifnameisnotNoneelse"variable_{}".format(i)fori,nameinenumerate(self._internal.column_label_names)]elifis_list_like(var_name):raiseValueError(f"{var_name=} must be a scalar.")else:var_name=[var_name]# type: ignore[list-item]pairs=F.explode(F.array(*[F.struct(*[F.lit(c).alias(name)forc,nameinzip(label,var_name)],*[self._internal.spark_column_for(label).alias(value_name)],)forlabelincolumn_labelsiflabelinvalue_vars]))columns=([self._internal.spark_column_for(label).alias(name_like_string(label))forlabelinid_vars]+[F.col("pairs.`%s`"%name)fornameinvar_name]+[F.col("pairs.`%s`"%value_name)])exploded_df=sdf.withColumn("pairs",pairs).select(columns)returnDataFrame(InternalFrame(spark_frame=exploded_df,index_spark_columns=None,column_labels=([labeliflen(label)==1else(name_like_string(label),)forlabelinid_vars]+[(name,)fornameinvar_name]+[(value_name,)]),))
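    # Illustrative sketch: `melt` is built on exploding an array of structs, one
    # struct per unpivoted column. A rough plain-Spark analogue for id column ``A``
    # and value columns ``B`` and ``C`` (assuming ``B`` and ``C`` share a type, and
    # ``sdf`` is the underlying Spark DataFrame):
    #
    #     >>> sdf.select(
    #     ...     "A",
    #     ...     F.explode(F.array(
    #     ...         F.struct(F.lit("B").alias("variable"), F.col("B").alias("value")),
    #     ...         F.struct(F.lit("C").alias("variable"), F.col("C").alias("value")),
    #     ...     )).alias("pairs"),
    #     ... ).select("A", "pairs.*")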
[docs]defstack(self)->DataFrameOrSeries:""" Stack the prescribed level(s) from columns to index. Return a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame. The new inner-most levels are created by pivoting the columns of the current dataframe: - if the columns have a single level, the output is a Series - if the columns have multiple levels, the new index level(s) is (are) taken from the prescribed level(s) and the output is a DataFrame. The new index levels are sorted. Returns ------- DataFrame or Series Stacked dataframe or series. See Also -------- DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis. DataFrame.pivot : Reshape dataframe from long format to wide format. DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame. Notes ----- The function is named by analogy with a collection of books being reorganized from being side by side on a horizontal position (the columns of the dataframe) to being stacked vertically on top of each other (in the index of the dataframe). Examples -------- **Single level columns** >>> df_single_level_cols = ps.DataFrame([[0, 1], [2, 3]], ... index=['cat', 'dog'], ... columns=['weight', 'height']) Stacking a dataframe with a single level column axis returns a Series: >>> df_single_level_cols weight height cat 0 1 dog 2 3 >>> df_single_level_cols.stack().sort_index() cat height 1 weight 0 dog height 3 weight 2 dtype: int64 **Multi level columns: simple case** >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), ... ('weight', 'pounds')]) >>> df_multi_level_cols1 = ps.DataFrame([[1, 2], [2, 4]], ... index=['cat', 'dog'], ... columns=multicol1) Stacking a dataframe with a multi-level column axis: >>> df_multi_level_cols1 # doctest: +NORMALIZE_WHITESPACE weight kg pounds cat 1 2 dog 2 4 >>> df_multi_level_cols1.stack().sort_index() weight cat kg 1 pounds 2 dog kg 2 pounds 4 **Missing values** >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), ... ('height', 'm')]) >>> df_multi_level_cols2 = ps.DataFrame([[1.0, 2.0], [3.0, 4.0]], ... index=['cat', 'dog'], ... columns=multicol2) It is common to have missing values when stacking a dataframe with multi-level columns, as the stacked dataframe typically has more values than the original dataframe. 
Missing values are filled with NaNs: >>> df_multi_level_cols2 weight height kg m cat 1.0 2.0 dog 3.0 4.0 >>> df_multi_level_cols2.stack().sort_index() weight height cat kg 1.0 NaN m NaN 2.0 dog kg 3.0 NaN m NaN 4.0 """frompyspark.pandas.seriesimportfirst_seriesiflen(self._internal.column_labels)==0:returnDataFrame(self._internal.copy(column_label_names=self._internal.column_label_names[:-1]).with_filter(F.lit(False)))column_labels:Dict[Label,Dict[Any,PySparkColumn]]=defaultdict(dict)index_values=set()should_returns_series=Falseforlabelinself._internal.column_labels:new_label=label[:-1]iflen(new_label)==0:new_label=Noneshould_returns_series=Truevalue=label[-1]scol=self._internal.spark_column_for(label)column_labels[new_label][value]=scolindex_values.add(value)index_name=self._internal.column_label_names[-1]column_label_names=self._internal.column_label_names[:-1]iflen(column_label_names)==0:column_label_names=[None]index_column=SPARK_INDEX_NAME_FORMAT(self._internal.index_level)data_columns=[name_like_string(label)forlabelincolumn_labels]structs=[F.struct(*[F.lit(value).alias(index_column)],*[(column_labels[label][value]ifvalueincolumn_labels[label]elseF.lit(None)).alias(name)forlabel,nameinzip(column_labels,data_columns)],).alias(value)forvalueinindex_values]pairs=F.explode(F.array(*structs))sdf=self._internal.spark_frame.withColumn("pairs",pairs)sdf=sdf.select(self._internal.index_spark_columns+[sdf["pairs"][index_column].alias(index_column)]+[sdf["pairs"][name].alias(name)fornameindata_columns])internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,col)forcolin(self._internal.index_spark_column_names+[index_column])],index_names=self._internal.index_names+[index_name],index_fields=self._internal.index_fields+[None],column_labels=list(column_labels),data_spark_columns=[scol_for(sdf,col)forcolindata_columns],column_label_names=column_label_names,)psdf:DataFrame=DataFrame(internal)ifshould_returns_series:returnfirst_series(psdf)else:returnpsdf
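    # Illustrative sketch: like `melt`, `stack` explodes an array of structs, one
    # struct per value of the innermost column level, and appends the exploded key as
    # a new innermost index level; with single-level columns the result collapses to a
    # Series. Reusing the docstring example above:
    #
    #     >>> df_multi_level_cols1.stack().sort_index().index.nlevels
    #     2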
[docs]defunstack(self)->DataFrameOrSeries:""" Pivot the (necessarily hierarchical) index labels. Returns a DataFrame having a new level of column labels whose inner-most level consists of the pivoted index labels. If the index is not a MultiIndex, the output will be a Series. .. note:: If the index is a MultiIndex, the output DataFrame could be very wide, and it could cause a serious performance degradation since Spark partitions its row based. Returns ------- Series or DataFrame See Also -------- DataFrame.pivot : Pivot a table based on column values. DataFrame.stack : Pivot a level of the column labels (inverse operation from unstack). Examples -------- >>> df = ps.DataFrame({"A": {"0": "a", "1": "b", "2": "c"}, ... "B": {"0": "1", "1": "3", "2": "5"}, ... "C": {"0": "2", "1": "4", "2": "6"}}, ... columns=["A", "B", "C"]) >>> df A B C 0 a 1 2 1 b 3 4 2 c 5 6 >>> df.unstack().sort_index() A 0 a 1 b 2 c B 0 1 1 3 2 5 C 0 2 1 4 2 6 dtype: object >>> df.columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) >>> df.unstack().sort_index() X A 0 a 1 b 2 c B 0 1 1 3 2 5 Y C 0 2 1 4 2 6 dtype: object For MultiIndex case: >>> df = ps.DataFrame({"A": ["a", "b", "c"], ... "B": [1, 3, 5], ... "C": [2, 4, 6]}, ... columns=["A", "B", "C"]) >>> df = df.set_index('A', append=True) >>> df # doctest: +NORMALIZE_WHITESPACE B C A 0 a 1 2 1 b 3 4 2 c 5 6 >>> df.unstack().sort_index() # doctest: +NORMALIZE_WHITESPACE B C A a b c a b c 0 1.0 NaN NaN 2.0 NaN NaN 1 NaN 3.0 NaN NaN 4.0 NaN 2 NaN NaN 5.0 NaN NaN 6.0 """frompyspark.pandas.seriesimportfirst_seriesifself._internal.index_level>1:# The index after `reset_index()` will never be used, so use "distributed" index# as a dummy to avoid overhead.withoption_context("compute.default_index_type","distributed"):df=self.reset_index()index=df._internal.column_labels[:self._internal.index_level-1]columns=df.columns[self._internal.index_level-1]df=df.pivot_table(index=index,columns=columns,values=self._internal.column_labels,aggfunc="first")internal=df._internal.copy(index_names=self._internal.index_names[:-1],index_fields=df._internal.index_fields[:self._internal.index_level-1],column_label_names=(df._internal.column_label_names[:-1]+[Noneifself._internal.index_names[-1]isNoneelsedf._internal.column_label_names[-1]]),)returnDataFrame(internal)# TODO: Codes here are similar with melt. 
Should we deduplicate?column_labels=self._internal.column_labelsser_name=SPARK_DEFAULT_SERIES_NAMEsdf=self._internal.spark_framenew_index_columns=[SPARK_INDEX_NAME_FORMAT(i)foriinrange(self._internal.column_labels_level)]new_index_map=list(zip_longest(new_index_columns,self._internal.column_label_names,[]))pairs=F.explode(F.array(*[F.struct(*[F.lit(c).alias(name)forc,nameinzip(idx,new_index_columns)],*[self._internal.spark_column_for(idx).alias(ser_name)],)foridxincolumn_labels]))columns=[F.col("pairs.%s"%name)fornameinnew_index_columns[:self._internal.column_labels_level]]+[F.col("pairs.%s"%ser_name)]new_index_len=len(new_index_columns)existing_index_columns=[]fori,(index_name,index_field)inenumerate(zip(self._internal.index_names,self._internal.index_fields)):name=SPARK_INDEX_NAME_FORMAT(i+new_index_len)new_index_map.append((name,index_name,index_field.copy(name=name)))existing_index_columns.append(self._internal.index_spark_columns[i].alias(name))exploded_df=sdf.withColumn("pairs",pairs).select(existing_index_columns+columns)index_spark_column_names,index_names,index_fields=zip(*new_index_map)returnfirst_series(DataFrame(InternalFrame(exploded_df,index_spark_columns=[scol_for(exploded_df,col)forcolinindex_spark_column_names],index_names=list(index_names),index_fields=list(index_fields),column_labels=[None],)))
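    # Illustrative sketch: for a MultiIndex, the code above reduces `unstack` to
    # `pivot_table(..., aggfunc="first")` over the innermost index level; for a flat
    # index it explodes the column labels into a new index level and returns a Series.
    # Roughly, for the MultiIndex example in the docstring (column names here are
    # illustrative):
    #
    #     >>> df.unstack()
    #     >>> # is roughly equivalent to
    #     >>> df.reset_index().pivot_table(index=['level_0'], columns='A',
    #     ...                              values=['B', 'C'], aggfunc='first')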
# TODO(SPARK-46165): axis and **kwargs should be implemented.
[docs]defall(self,axis:Axis=0,bool_only:Optional[bool]=None,skipna:bool=True)->"Series":""" Return whether all elements are True. Returns True unless there is at least one element within a series that is False or equivalent (e.g. zero or empty) Parameters ---------- axis : {0 or 'index'}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the original column labels. bool_only : bool, default None Include only boolean columns. If None, will attempt to use everything, then use only boolean data. skipna : boolean, default True Exclude NA values, such as None or numpy.NaN. If an entire row/column is NA values and `skipna` is True, then the result will be True, as for an empty row/column. If `skipna` is False, numpy.NaNs are treated as True because these are not equal to zero, Nones are treated as False. Returns ------- Series Examples -------- Create a dataframe from a dictionary. >>> df = ps.DataFrame({ ... 'col1': [True, True, True], ... 'col2': [True, False, False], ... 'col3': [0, 0, 0], ... 'col4': [1, 2, 3], ... 'col5': [True, True, None], ... 'col6': [True, False, None]}, ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6']) Default behavior checks if column-wise values all return True. >>> df.all() col1 True col2 False col3 False col4 True col5 True col6 False dtype: bool Include NA values when set `skipna=False`. >>> df[['col5', 'col6']].all(skipna=False) col5 False col6 False dtype: bool Include only boolean columns when set `bool_only=True`. >>> df.all(bool_only=True) col1 True col2 False dtype: bool """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')column_labels=self._internal.column_labelsifbool_only:column_labels=self._bool_column_labels(column_labels)iflen(column_labels)==0:returnps.Series([],dtype=bool)applied:List[PySparkColumn]=[]forlabelincolumn_labels:scol=self._internal.spark_column_for(label)ifisinstance(self._internal.spark_type_for(label),NumericType)orskipna:# np.nan takes no effect to the result; None takes no effect if `skipna`all_col=F.min(F.coalesce(scol.cast("boolean"),F.lit(True)))else:# Take None as False when not `skipna`all_col=F.min(F.when(scol.isNull(),F.lit(False)).otherwise(scol.cast("boolean")))applied.append(F.when(all_col.isNull(),True).otherwise(all_col))returnself._result_aggregated(column_labels,applied)
# TODO(SPARK-46166): axis, skipna and **kwargs should be implemented.
[docs]defany(self,axis:Axis=0,bool_only:Optional[bool]=None)->"Series":""" Return whether any element is True. Returns False unless there is at least one element within a series that is True or equivalent (e.g. non-zero or non-empty). Parameters ---------- axis : {0 or 'index'}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the original column labels. bool_only : bool, default None Include only boolean columns. If None, will attempt to use everything, then use only boolean data. Returns ------- Series Examples -------- Create a dataframe from a dictionary. >>> df = ps.DataFrame({ ... 'col1': [False, False, False], ... 'col2': [True, False, False], ... 'col3': [0, 0, 1], ... 'col4': [0, 1, 2], ... 'col5': [False, False, None], ... 'col6': [True, False, None]}, ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6']) Default behavior checks if column-wise values all return True. >>> df.any() col1 False col2 True col3 True col4 True col5 False col6 True dtype: bool Include only boolean columns when set `bool_only=True`. >>> df.any(bool_only=True) col1 False col2 True dtype: bool Returns empty Series when the DataFrame is empty. >>> df[[]].any() Series([], dtype: bool) """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')column_labels=self._internal.column_labelsifbool_only:column_labels=self._bool_column_labels(column_labels)iflen(column_labels)==0:returnps.Series([],dtype=bool)applied:List[PySparkColumn]=[]forlabelincolumn_labels:scol=self._internal.spark_column_for(label)any_col=F.max(F.coalesce(scol.cast("boolean"),F.lit(False)))applied.append(F.when(any_col.isNull(),False).otherwise(any_col))returnself._result_aggregated(column_labels,applied)
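    # Illustrative sketch: `all` and `any` reduce each column with a single
    # aggregation, `F.min` / `F.max` over the values cast to boolean (NULLs coalesced
    # away), and the per-column results are then reshaped into the returned Series by
    # `_result_aggregated` below. Roughly, per column (``sdf``/``col1`` are
    # placeholders):
    #
    #     >>> sdf.select(
    #     ...     F.min(F.coalesce(F.col("col1").cast("boolean"), F.lit(True))).alias("all_col1"),
    #     ...     F.max(F.coalesce(F.col("col1").cast("boolean"), F.lit(False))).alias("any_col1"),
    #     ... )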
def_bool_column_labels(self,column_labels:List[Label])->List[Label]:""" Filter column labels of boolean columns (without None). """bool_column_labels=[]forlabelincolumn_labels:psser=self._psser_for(label)ifis_bool_dtype(psser):# Rely on dtype rather than spark type because# columns that consist of bools and Nones should be excluded# if bool_only is Truebool_column_labels.append(label)returnbool_column_labelsdef_result_aggregated(self,column_labels:List[Label],scols:Sequence[PySparkColumn])->"Series":""" Given aggregated Spark columns and respective column labels from the original pandas-on-Spark DataFrame, construct the result Series. """frompyspark.pandas.seriesimportfirst_seriescols=[]result_scol_name="value"forlabel,applied_colinzip(column_labels,scols):cols.append(F.struct(*[F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i))fori,colinenumerate(label)],*[applied_col.alias(result_scol_name)],))# Statements under this comment implement spark frame transformations as below:# From:# +-------------------------------------------------------------------------------------+# |arrays |# +-------------------------------------------------------------------------------------+# |[{col1, true}, {col2, true}, {col3, false}, {col4, true}]|# +-------------------------------------------------------------------------------------+# To:# +-------------+# |col |# +-------------+# |{col1, true} |# |{col2, true} |# |{col3, false}|# |{col4, true} |# +-------------+# To:# +-----------------+-----+# |__index_level_0__|value|# +-----------------+-----+# |col1 |true |# |col2 |true |# |col3 |false|# |col4 |true |# +-----------------+-----+sdf=self._internal.spark_frame.select(F.array(*cols).alias("arrays")).select(F.explode(F.col("arrays")))sdf=sdf.selectExpr("col.*")internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,SPARK_INDEX_NAME_FORMAT(i))foriinrange(self._internal.column_labels_level)],index_names=self._internal.column_label_names,column_labels=[None],data_spark_columns=[scol_for(sdf,result_scol_name)],)# (cont.) The result Series should look as below:# col1 False# col2 True# col3 True# col4 True# dtype: boolreturnfirst_series(DataFrame(internal))# TODO(SPARK-46167): add axis, pct, na_option parameter
[docs]defrank(self,method:str="average",ascending:bool=True,numeric_only:bool=False)->"DataFrame":""" Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the average of the ranks of those values. .. note:: the current implementation of rank uses Spark's Window without specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. Parameters ---------- method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups ascending : boolean, default True False for ranks by high (1) to low (N) numeric_only : bool, default False For DataFrame objects, rank only numeric columns if set to True. .. versionchanged:: 4.0.0 The default value of ``numeric_only`` is now ``False``. Returns ------- ranks : same type as caller Examples -------- >>> df = ps.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 3, 2, 1]}, columns=['A', 'B']) >>> df A B 0 1 4 1 2 3 2 2 2 3 3 1 >>> df.rank().sort_index() A B 0 1.0 4.0 1 2.5 3.0 2 2.5 2.0 3 4.0 1.0 If method is set to 'min', it uses lowest rank in group. >>> df.rank(method='min').sort_index() A B 0 1.0 4.0 1 2.0 3.0 2 2.0 2.0 3 4.0 1.0 If method is set to 'max', it uses highest rank in group. >>> df.rank(method='max').sort_index() A B 0 1.0 4.0 1 3.0 3.0 2 3.0 2.0 3 4.0 1.0 If method is set to 'dense', it leaves no gaps in group. >>> df.rank(method='dense').sort_index() A B 0 1.0 4.0 1 2.0 3.0 2 2.0 2.0 3 3.0 1.0 If numeric_only is set to 'True', rank only numeric columns. >>> df = ps.DataFrame({'A': [1, 2, 2, 3], 'B': ['a', 'b', 'd', 'c']}, columns= ['A', 'B']) >>> df A B 0 1 a 1 2 b 2 2 d 3 3 c >>> df.rank(numeric_only=True) A 0 1.0 1 2.5 2 2.5 3 4.0 """ifnumeric_only:numeric_col_names=[]forlabelinself._internal.column_labels:psser=self._psser_for(label)ifisinstance(psser.spark.data_type,(NumericType,BooleanType)):numeric_col_names.append(psser.name)psdf=self[numeric_col_names]ifnumeric_onlyelseselfreturnpsdf._apply_series_op(lambdapsser:psser._rank(method=method,ascending=ascending),should_resolve=True)
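    # Illustrative sketch: `Series._rank` is backed by Spark window functions over a
    # global (unpartitioned) ordering, which is why the note above warns about
    # single-partition execution. As a rough correspondence: method='min' ~ F.rank(),
    # 'dense' ~ F.dense_rank(), 'first' ~ F.row_number(). For example (``sdf`` is a
    # placeholder Spark DataFrame):
    #
    #     >>> w = Window.orderBy("A")
    #     >>> sdf.select("A", F.rank().over(w).alias("rank_min"))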
[docs]deffilter(self,items:Optional[Sequence[Any]]=None,like:Optional[str]=None,regex:Optional[str]=None,axis:Optional[Axis]=None,)->"DataFrame":""" Subset rows or columns of dataframe according to labels in the specified index. Note that this routine does not filter a dataframe on its contents. The filter is applied to the labels of the index. Parameters ---------- items : list-like Keep labels from axis which are in items. like : string Keep labels from axis for which "like in label == True". regex : string (regular expression) Keep labels from axis for which re.search(regex, label) == True. axis : int or string axis name The axis to filter on. By default this is the info axis, 'index' for Series, 'columns' for DataFrame. Returns ------- same type as input object See Also -------- DataFrame.loc Notes ----- The ``items``, ``like``, and ``regex`` parameters are enforced to be mutually exclusive. ``axis`` defaults to the info axis that is used when indexing with ``[]``. Examples -------- >>> df = ps.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), ... index=['mouse', 'rabbit'], ... columns=['one', 'two', 'three']) >>> # select columns by name >>> df.filter(items=['one', 'three']) one three mouse 1 3 rabbit 4 6 >>> # select columns by regular expression >>> df.filter(regex='e$', axis=1) one three mouse 1 3 rabbit 4 6 >>> # select rows containing 'bbi' >>> df.filter(like='bbi', axis=0) one two three rabbit 4 5 6 For a Series, >>> # select rows by name >>> df.one.filter(items=['rabbit']) rabbit 4 Name: one, dtype: int64 >>> # select rows by regular expression >>> df.one.filter(regex='e$') mouse 1 Name: one, dtype: int64 >>> # select rows containing 'bbi' >>> df.one.filter(like='bbi') rabbit 4 Name: one, dtype: int64 """ifsum(xisnotNoneforxin(items,like,regex))>1:raiseTypeError("Keyword arguments `items`, `like`, or `regex` ""are mutually exclusive")axis=validate_axis(axis,none_axis=1)index_scols=self._internal.index_spark_columnsifitemsisnotNone:ifis_list_like(items):items=list(items)else:raiseValueError("items should be a list-like object.")ifaxis==0:iflen(index_scols)==1:iflen(items)<=ps.get_option("compute.isin_limit"):col=index_scols[0].isin([F.lit(item)foriteminitems])result:DataFrame=DataFrame(self._internal.with_filter(col))else:item_sdf_col=verify_temp_column_name(self._internal.spark_frame,"__item__")item_sdf=default_session().createDataFrame(pd.DataFrame({item_sdf_col:items}))joined_sdf=self._internal.spark_frame.join(other=F.broadcast(item_sdf),on=(index_scols[0]==scol_for(item_sdf,item_sdf_col)),how="semi",)result=DataFrame(self._internal.with_new_sdf(joined_sdf))result.index.name=Nonereturnresultelse:# for multi-indexcol=Noneforiteminitems:ifnotisinstance(item,tuple):raiseTypeError("Unsupported type {}".format(type(item).__name__))ifnotitem:raiseValueError("The item should not be 
empty.")midx_col=Nonefori,elementinenumerate(item):ifmidx_colisNone:midx_col=index_scols[i]==F.lit(element)else:midx_col=midx_col&(index_scols[i]==F.lit(element))ifcolisNone:col=midx_colelse:col=col|midx_colresult=DataFrame(self._internal.with_filter(col))result.index.names=[None]*result.index.nlevelsreturnresultelse:returnself[items]eliflikeisnotNone:ifaxis==0:col=Noneforindex_scolinindex_scols:ifcolisNone:col=index_scol.contains(like)else:col=col|index_scol.contains(like)returnDataFrame(self._internal.with_filter(col))else:column_labels=self._internal.column_labelsoutput_labels=[labelforlabelincolumn_labelsifany(likeiniforiinlabel)]returnself[output_labels]elifregexisnotNone:ifaxis==0:col=Noneforindex_scolinindex_scols:ifcolisNone:col=index_scol.rlike(regex)else:col=col|index_scol.rlike(regex)returnDataFrame(self._internal.with_filter(col))else:column_labels=self._internal.column_labelsmatcher=re.compile(regex)output_labels=[labelforlabelincolumn_labelsifany(matcher.search(i)isnotNoneforiinlabel)]returnself[output_labels]else:raiseTypeError("Must pass either `items`, `like`, or `regex`")
[docs]defrename(self,mapper:Optional[Union[Dict,Callable[[Any],Any]]]=None,index:Optional[Union[Dict,Callable[[Any],Any]]]=None,columns:Optional[Union[Dict,Callable[[Any],Any]]]=None,axis:Axis="index",inplace:bool=False,level:Optional[int]=None,errors:str="ignore",)->Optional["DataFrame"]:""" Alter axes labels. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don’t throw an error. Parameters ---------- mapper : dict-like or function Dict-like or functions transformations to apply to that axis’ values. Use either `mapper` and `axis` to specify the axis to target with `mapper`, or `index` and `columns`. index : dict-like or function Alternative to specifying axis ("mapper, axis=0" is equivalent to "index=mapper"). columns : dict-like or function Alternative to specifying axis ("mapper, axis=1" is equivalent to "columns=mapper"). axis : int or str, default 'index' Axis to target with mapper. Can be either the axis name ('index', 'columns') or number (0, 1). inplace : bool, default False Whether to return a new DataFrame. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified level. errors : {'ignore', 'raise'}, default 'ignore' If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, or `columns` contains labels that are not present in the Index being transformed. If 'ignore', existing keys will be renamed, and extra keys will be ignored. Returns ------- DataFrame with the renamed axis labels. Raises ------ `KeyError` If any of the labels is not found in the selected axis and "errors='raise'". Examples -------- >>> psdf1 = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) >>> psdf1.rename(columns={"A": "a", "B": "c"}) # doctest: +NORMALIZE_WHITESPACE a c 0 1 4 1 2 5 2 3 6 >>> psdf1.rename(index={1: 10, 2: 20}) # doctest: +NORMALIZE_WHITESPACE A B 0 1 4 10 2 5 20 3 6 >>> psdf1.rename(columns={"A": "a", "C": "c"}, errors="raise") Traceback (most recent call last): ... KeyError: 'Index include value which is not in the `mapper`' >>> def str_lower(s) -> str: ... return str.lower(s) >>> psdf1.rename(str_lower, axis='columns') # doctest: +NORMALIZE_WHITESPACE a b 0 1 4 1 2 5 2 3 6 >>> def mul10(x) -> int: ... 
return x * 10 >>> psdf1.rename(mul10, axis='index') # doctest: +NORMALIZE_WHITESPACE A B 0 1 4 10 2 5 20 3 6 >>> idx = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Y', 'D')]) >>> psdf2 = ps.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx) >>> psdf2.rename(columns=str_lower, level=0) # doctest: +NORMALIZE_WHITESPACE x y A B C D 0 1 2 3 4 1 5 6 7 8 >>> psdf3 = ps.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list('ab')) >>> psdf3.rename(index=str_lower) # doctest: +NORMALIZE_WHITESPACE a b x a 1 2 b 3 4 y c 5 6 d 7 8 """defgen_mapper_fn(mapper:Union[Dict,Callable[[Any],Any]],skip_return_type:bool=False)->Tuple[Callable[[Any],Any],Dtype,DataType]:ifisinstance(mapper,dict):mapper_dict=mappertype_set=set(map(lambdax:type(x),mapper_dict.values()))iflen(type_set)>1:raiseValueError("Mapper dict should have the same value type.")dtype,spark_return_type=pandas_on_spark_type(list(type_set)[0])defmapper_fn(x:Any)->Any:ifxinmapper_dict:returnmapper_dict[x]else:iferrors=="raise":raiseKeyError("Index include value which is not in the `mapper`")returnxreturnmapper_fn,dtype,spark_return_typeelifcallable(mapper):mapper_callable=cast(Callable,mapper)defmapper_fn(x:Any)->Any:returnmapper_callable(x)ifskip_return_type:returnmapper_fn,None,Noneelse:return_type=cast(ScalarType,infer_return_type(mapper))dtype=return_type.dtypespark_return_type=return_type.spark_typereturnmapper_fn,dtype,spark_return_typeelse:raiseValueError("`mapper` or `index` or `columns` should be ""either dict-like or function type.")index_mapper_fn=Noneindex_mapper_ret_stype=Nonecolumns_mapper_fn=Noneinplace=validate_bool_kwarg(inplace,"inplace")ifmapper:axis=validate_axis(axis)ifaxis==0:index_mapper_fn,index_mapper_ret_dtype,index_mapper_ret_stype=gen_mapper_fn(mapper)elifaxis==1:columns_mapper_fn,_,_=gen_mapper_fn(mapper)else:ifindex:index_mapper_fn,index_mapper_ret_dtype,index_mapper_ret_stype=gen_mapper_fn(index)ifcolumns:columns_mapper_fn,_,_=gen_mapper_fn(columns,skip_return_type=True)ifnotindexandnotcolumns:raiseValueError("Either `index` or `columns` should be provided.")psdf=self.copy()ifindex_mapper_fn:# rename index labels, if `level` is None, rename all index columns, otherwise only# rename the corresponding level index.# implement this by transform the underlying spark dataframe,# Example:# suppose the psdf index column in underlying spark dataframe is "index_0", "index_1",# if rename level 0 index labels, will do:# ``psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0"))``# if rename all index labels (`level` is None), then will do:# ```# psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0"))# .withColumn("index_1", mapper_fn_udf(col("index_1"))# ```index_columns=psdf._internal.index_spark_column_namesnum_indices=len(index_columns)iflevelisnotNoneand(level<0orlevel>=num_indices):raiseValueError("level should be an integer between [0, %s)"%num_indices)@pandas_udf(returnType=index_mapper_ret_stype)# type: 
ignore[call-overload]defindex_mapper_udf(s:pd.Series)->pd.Series:returns.map(index_mapper_fn)index_spark_columns=psdf._internal.index_spark_columns.copy()index_fields=psdf._internal.index_fields.copy()iflevelisNone:foriinrange(num_indices):index_spark_columns[i]=index_mapper_udf(index_spark_columns[i]).alias(index_columns[i])index_fields[i]=index_fields[i].copy(dtype=index_mapper_ret_dtype,spark_type=index_mapper_ret_stype,nullable=True,)else:index_spark_columns[level]=index_mapper_udf(index_spark_columns[level]).alias(index_columns[level])index_fields[level]=index_fields[level].copy(dtype=index_mapper_ret_dtype,spark_type=index_mapper_ret_stype,nullable=True,)psdf=DataFrame(psdf._internal.copy(index_spark_columns=index_spark_columns,index_fields=index_fields))ifcolumns_mapper_fn:# rename column name.# Will modify the `_internal._column_labels` and transform underlying spark dataframe# to the same column name with `_internal._column_labels`.iflevel:iflevel<0orlevel>=psdf._internal.column_labels_level:raiseValueError("level should be an integer between [0, column_labels_level)")defgen_new_column_labels_entry(column_labels_entry:Label)->Label:iflevelisNone:# rename all level columnsreturntuple(map(columns_mapper_fn,column_labels_entry))else:# only rename specified level columnentry_list=list(column_labels_entry)entry_list[level]=columns_mapper_fn(entry_list[level])returntuple(entry_list)new_column_labels=list(map(gen_new_column_labels_entry,psdf._internal.column_labels))new_data_pssers=[psdf._psser_for(old_label).rename(new_label)forold_label,new_labelinzip(psdf._internal.column_labels,new_column_labels)]psdf=DataFrame(psdf._internal.with_new_columns(new_data_pssers))ifinplace:self._update_internal_frame(psdf._internal)returnNoneelse:returnpsdf
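    # Illustrative sketch: column renames are label-only and handled on the driver,
    # while index renames are applied to the index column(s) of the Spark frame
    # through a pandas UDF built from the mapper. The internal index column name and
    # ``sdf`` below are illustrative:
    #
    #     >>> @pandas_udf("long")
    #     ... def mul10(s: pd.Series) -> pd.Series:
    #     ...     return s * 10
    #     >>> sdf.withColumn("__index_level_0__", mul10("__index_level_0__"))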
[docs]defrename_axis(self,mapper:Union[Any,Sequence[Any],Dict[Name,Any],Callable[[Name],Any]]=None,index:Union[Any,Sequence[Any],Dict[Name,Any],Callable[[Name],Any]]=None,columns:Union[Any,Sequence[Any],Dict[Name,Any],Callable[[Name],Any]]=None,axis:Optional[Axis]=0,inplace:Optional[bool]=False,)->Optional["DataFrame"]:""" Set the name of the axis for the index or columns. Parameters ---------- mapper : scalar, list-like, optional A scalar, list-like, dict-like or functions transformations to apply to the axis name attribute. index, columns : scalar, list-like, dict-like or function, optional A scalar, list-like, dict-like or functions transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and/or ``columns``. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to rename. inplace : bool, default False Modifies the object directly, instead of creating a new DataFrame. Returns ------- DataFrame, or None if `inplace` is True. See Also -------- Series.rename : Alter Series index labels or name. DataFrame.rename : Alter DataFrame index labels or name. Index.rename : Set new names on index. Notes ----- ``DataFrame.rename_axis`` supports two calling conventions * ``(index=index_mapper, columns=columns_mapper, ...)`` * ``(mapper, axis={'index', 'columns'}, ...)`` The first calling convention will only modify the names of the index and/or the names of the Index object that is the columns. The second calling convention will modify the names of the corresponding index specified by axis. We *highly* recommend using keyword arguments to clarify your intent. Examples -------- >>> df = ps.DataFrame({"num_legs": [4, 4, 2], ... "num_arms": [0, 0, 2]}, ... index=["dog", "cat", "monkey"], ... columns=["num_legs", "num_arms"]) >>> df num_legs num_arms dog 4 0 cat 4 0 monkey 2 2 >>> df = df.rename_axis("animal").sort_index() >>> df # doctest: +NORMALIZE_WHITESPACE num_legs num_arms animal cat 4 0 dog 4 0 monkey 2 2 >>> df = df.rename_axis("limbs", axis="columns").sort_index() >>> df # doctest: +NORMALIZE_WHITESPACE limbs num_legs num_arms animal cat 4 0 dog 4 0 monkey 2 2 **MultiIndex** >>> index = pd.MultiIndex.from_product([['mammal'], ... ['dog', 'cat', 'monkey']], ... names=['type', 'name']) >>> df = ps.DataFrame({"num_legs": [4, 4, 2], ... "num_arms": [0, 0, 2]}, ... index=index, ... 
columns=["num_legs", "num_arms"]) >>> df # doctest: +NORMALIZE_WHITESPACE num_legs num_arms type name mammal dog 4 0 cat 4 0 monkey 2 2 >>> df.rename_axis(index={'type': 'class'}).sort_index() # doctest: +NORMALIZE_WHITESPACE num_legs num_arms class name mammal cat 4 0 dog 4 0 monkey 2 2 >>> df.rename_axis(index=str.upper).sort_index() # doctest: +NORMALIZE_WHITESPACE num_legs num_arms TYPE NAME mammal cat 4 0 dog 4 0 monkey 2 2 """defgen_names(v:Union[Any,Sequence[Any],Dict[Name,Any],Callable[[Name],Any]],curnames:List[Name],)->List[Label]:newnames:List[Name]ifis_scalar(v):newnames=[cast(Name,v)]elifis_list_like(v)andnotis_dict_like(v):newnames=list(cast(Sequence[Name],v))elifis_dict_like(v):v_dict=cast(Dict[Name,Name],v)newnames=[v_dict[name]ifnameinv_dictelsenamefornameincurnames]elifcallable(v):v_callable=cast(Callable[[Name],Name],v)newnames=[v_callable(name)fornameincurnames]else:raiseValueError("`mapper` or `index` or `columns` should be ""either dict-like or function type.")iflen(newnames)!=len(curnames):raiseValueError("Length of new names must be {}, got {}".format(len(curnames),len(newnames)))return[nameifis_name_like_tuple(name)else(name,)fornameinnewnames]ifmapperisnotNoneand(indexisnotNoneorcolumnsisnotNone):raiseTypeError("Cannot specify both 'mapper' and any of 'index' or 'columns'.")ifmapperisnotNone:axis=validate_axis(axis)ifaxis==0:index=mapperelifaxis==1:columns=mappercolumn_label_names=(gen_names(columns,self.columns.names)ifcolumnsisnotNoneelseself._internal.column_label_names)index_names=(gen_names(index,self.index.names)ifindexisnotNoneelseself._internal.index_names)internal=self._internal.copy(index_names=index_names,column_label_names=column_label_names)ifinplace:self._update_internal_frame(internal)returnNoneelse:returnDataFrame(internal)
    def keys(self) -> pd.Index:
        """
        Return alias for columns.

        Returns
        -------
        Index
            Columns of the DataFrame.

        Examples
        --------
        >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],
        ...                   index=['cobra', 'viper', 'sidewinder'],
        ...                   columns=['max_speed', 'shield'])
        >>> df
                    max_speed  shield
        cobra               1       2
        viper               4       5
        sidewinder          7       8

        >>> df.keys()
        Index(['max_speed', 'shield'], dtype='object')
        """
        return self.columns
[docs]defpct_change(self,periods:int=1)->"DataFrame":""" Percentage change between the current and a prior element. .. note:: the current implementation of this API uses Spark's Window without specifying partition specification. This leads to moving all data into a single partition in a single machine and could cause serious performance degradation. Avoid this method with very large datasets. Parameters ---------- periods : int, default 1 Periods to shift for forming percent change. Returns ------- DataFrame Examples -------- Percentage change in French franc, Deutsche Mark, and Italian lira from 1980-01-01 to 1980-03-01. >>> df = ps.DataFrame({ ... 'FR': [4.0405, 4.0963, 4.3149], ... 'GR': [1.7246, 1.7482, 1.8519], ... 'IT': [804.74, 810.01, 860.13]}, ... index=['1980-01-01', '1980-02-01', '1980-03-01']) >>> df FR GR IT 1980-01-01 4.0405 1.7246 804.74 1980-02-01 4.0963 1.7482 810.01 1980-03-01 4.3149 1.8519 860.13 >>> df.pct_change() FR GR IT 1980-01-01 NaN NaN NaN 1980-02-01 0.013810 0.013684 0.006549 1980-03-01 0.053365 0.059318 0.061876 You can set periods to shift for forming percent change >>> df.pct_change(2) FR GR IT 1980-01-01 NaN NaN NaN 1980-02-01 NaN NaN NaN 1980-03-01 0.067912 0.073814 0.06883 """window=Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-periods,-periods)defop(psser:ps.Series)->PySparkColumn:prev_row=F.lag(psser.spark.column,periods).over(window)return((psser.spark.column-prev_row)/prev_row).alias(psser._internal.data_spark_column_names[0])returnself._apply_series_op(op,should_resolve=True)
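    # Doctest-style sketch for `pct_change` (illustrative). The first `periods` rows are NaN
    # because there is no prior row to compare against, and the whole frame is ordered through
    # a single window partition as noted above.
    #
    #   >>> psdf = ps.DataFrame({"FR": [4.0405, 4.0963, 4.3149]})
    #   >>> changed = psdf.pct_change()             # (x_t - x_{t-1}) / x_{t-1}
    #   >>> changed_two = psdf.pct_change(periods=2)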
# TODO(SPARK-46168): axis = 1
[docs]defidxmax(self,axis:Axis=0)->"Series":""" Return index of first occurrence of maximum over requested axis. NA/null values are excluded. .. note:: This API collect all rows with maximum value using `to_pandas()` because we suppose the number of rows with max values are usually small in general. Parameters ---------- axis : 0 or 'index' Can only be set to 0 now. Returns ------- Series See Also -------- Series.idxmax Examples -------- >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], ... 'b': [4.0, 2.0, 3.0, 1.0], ... 'c': [300, 200, 400, 200]}) >>> psdf a b c 0 1 4.0 300 1 2 2.0 200 2 3 3.0 400 3 2 1.0 200 >>> psdf.idxmax() a 2 b 0 c 2 dtype: int64 For Multi-column Index >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], ... 'b': [4.0, 2.0, 3.0, 1.0], ... 'c': [300, 200, 400, 200]}) >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) >>> psdf a b c x y z 0 1 4.0 300 1 2 2.0 200 2 3 3.0 400 3 2 1.0 200 >>> psdf.idxmax() a x 2 b y 0 c z 2 dtype: int64 """max_cols=map(lambdascol:F.max(scol),self._internal.data_spark_columns)sdf_max=self._internal.spark_frame.select(*max_cols).head()# `sdf_max` looks like below# +------+------+------+# |(a, x)|(b, y)|(c, z)|# +------+------+------+# | 3| 4.0| 400|# +------+------+------+conds=(scol==max_valforscol,max_valinzip(self._internal.data_spark_columns,sdf_max))cond=reduce(lambdax,y:x|y,conds)psdf:DataFrame=DataFrame(self._internal.with_filter(cond))returncast(ps.Series,ps.from_pandas(psdf._to_internal_pandas().idxmax()))
# TODO(SPARK-46168): axis = 1
[docs]defidxmin(self,axis:Axis=0)->"Series":""" Return index of first occurrence of minimum over requested axis. NA/null values are excluded. .. note:: This API collect all rows with minimum value using `to_pandas()` because we suppose the number of rows with min values are usually small in general. Parameters ---------- axis : 0 or 'index' Can only be set to 0 now. Returns ------- Series See Also -------- Series.idxmin Examples -------- >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], ... 'b': [4.0, 2.0, 3.0, 1.0], ... 'c': [300, 200, 400, 200]}) >>> psdf a b c 0 1 4.0 300 1 2 2.0 200 2 3 3.0 400 3 2 1.0 200 >>> psdf.idxmin() a 0 b 3 c 1 dtype: int64 For Multi-column Index >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], ... 'b': [4.0, 2.0, 3.0, 1.0], ... 'c': [300, 200, 400, 200]}) >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) >>> psdf a b c x y z 0 1 4.0 300 1 2 2.0 200 2 3 3.0 400 3 2 1.0 200 >>> psdf.idxmin() a x 0 b y 3 c z 1 dtype: int64 """min_cols=map(lambdascol:F.min(scol),self._internal.data_spark_columns)sdf_min=self._internal.spark_frame.select(*min_cols).head()conds=(scol==min_valforscol,min_valinzip(self._internal.data_spark_columns,sdf_min))cond=reduce(lambdax,y:x|y,conds)psdf:DataFrame=DataFrame(self._internal.with_filter(cond))returncast(ps.Series,ps.from_pandas(psdf._to_internal_pandas().idxmin()))
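    # Doctest-style sketch for `idxmax`/`idxmin` (illustrative): both filter the rows holding
    # the extreme values in Spark and finish the index lookup with pandas on the driver.
    #
    #   >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]})
    #   >>> psdf.idxmax()
    #   a    2
    #   b    0
    #   dtype: int64
    #   >>> psdf.idxmin()
    #   a    0
    #   b    3
    #   dtype: int64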
[docs]definfo(self,verbose:Optional[bool]=None,buf:Optional[IO[str]]=None,max_cols:Optional[int]=None,show_counts:Optional[bool]=None,)->None:""" Print a concise summary of a DataFrame. This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage. Parameters ---------- verbose : bool, optional Whether to print the full summary. buf : writable buffer, defaults to sys.stdout Where to send the output. By default the output is printed to sys.stdout. Pass a writable buffer if you need to further process the output. max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. show_counts : bool, optional Whether to show the non-null counts. .. versionadded:: 4.0.0 Returns ------- None This method prints a summary of a DataFrame and returns None. See Also -------- DataFrame.describe: Generate descriptive statistics of DataFrame columns. Examples -------- >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] >>> df = ps.DataFrame( ... {"int_col": int_values, "text_col": text_values, "float_col": float_values}, ... columns=['int_col', 'text_col', 'float_col']) >>> df int_col text_col float_col 0 1 alpha 0.00 1 2 beta 0.25 2 3 gamma 0.50 3 4 delta 0.75 4 5 epsilon 1.00 Prints information of all columns: >>> df.info(verbose=True) # doctest: +SKIP <class 'pyspark.pandas.frame.DataFrame'> Index: 5 entries, 0 to 4 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 int_col 5 non-null int64 1 text_col 5 non-null object 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) Prints a summary of columns count and its dtypes but not per column information: >>> df.info(verbose=False) # doctest: +SKIP <class 'pyspark.pandas.frame.DataFrame'> Index: 5 entries, 0 to 4 Columns: 3 entries, int_col to float_col dtypes: float64(1), int64(1), object(1) Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content and writes to a text file: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) >>> s = buffer.getvalue() >>> with open('%s/info.txt' % path, "w", ... encoding="utf-8") as f: ... _ = f.write(s) >>> with open('%s/info.txt' % path) as f: ... f.readlines() # doctest: +SKIP ["<class 'pyspark.pandas.frame.DataFrame'>\\n", 'Index: 5 entries, 0 to 4\\n', 'Data columns (total 3 columns):\\n', ' # Column Non-Null Count Dtype \\n', '--- ------ -------------- ----- \\n', ' 0 int_col 5 non-null int64 \\n', ' 1 text_col 5 non-null object \\n', ' 2 float_col 5 non-null float64\\n', 'dtypes: float64(1), int64(1), object(1)'] """# To avoid pandas' existing config affects pandas-on-Spark.# TODO: should we have corresponding pandas-on-Spark configs?withpd.option_context("display.max_info_columns",sys.maxsize,"display.max_info_rows",sys.maxsize):try:# hack to use pandas' info as is.object.__setattr__(self,"_data",self)count_func=self.countself.count=(# type: ignore[method-assign]lambda:count_func()._to_pandas()# type: ignore[assignment, misc, union-attr])returnpd.DataFrame.info(self,# type: ignore[arg-type]verbose=verbose,buf=buf,max_cols=max_cols,memory_usage=False,show_counts=show_counts,# type: ignore)finally:delself._dataself.count=count_func# type: ignore[method-assign]
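    # Doctest-style sketch for `info` (illustrative): the summary can be redirected to any
    # writable buffer instead of sys.stdout.
    #
    #   >>> import io
    #   >>> buffer = io.StringIO()
    #   >>> ps.DataFrame({"int_col": [1, 2, 3]}).info(buf=buffer, verbose=True)
    #   >>> summary = buffer.getvalue()  # begins with "<class 'pyspark.pandas.frame.DataFrame'>"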
# TODO: fix parameter 'axis' and 'numeric_only' to work same as pandas'
[docs]defquantile(self,q:Union[float,Iterable[float]]=0.5,axis:Axis=0,numeric_only:bool=False,accuracy:int=10000,)->DataFrameOrSeries:""" Return value at the given quantile. .. note:: Unlike pandas', the quantile in pandas-on-Spark is an approximated quantile based upon approximate percentile computation because computing quantile across a large dataset is extremely expensive. Parameters ---------- q : float or array-like, default 0.5 (50% quantile) 0 <= q <= 1, the quantile(s) to compute. axis : int or str, default 0 or 'index' Can only be set to 0 now. numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionchanged:: 4.0.0 The default value of ``numeric_only`` is now ``False``. accuracy : int, optional Default accuracy of approximation. Larger value means better accuracy. The relative error can be deduced by 1.0 / accuracy. Returns ------- Series or DataFrame If q is an array, a DataFrame will be returned where the index is q, the columns are the columns of self, and the values are the quantiles. If q is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. Examples -------- >>> psdf = ps.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]}) >>> psdf a b 0 1 6 1 2 7 2 3 8 3 4 9 4 5 0 >>> psdf.quantile(.5) a 3.0 b 7.0 Name: 0.5, dtype: float64 >>> psdf.quantile([.25, .5, .75]) a b 0.25 2.0 6.0 0.50 3.0 7.0 0.75 4.0 8.0 """axis=validate_axis(axis)ifaxis!=0:raiseNotImplementedError('axis should be either 0 or "index" currently.')ifnotisinstance(accuracy,int):raiseTypeError("accuracy must be an integer; however, got [%s]"%type(accuracy).__name__)qq:Union[float,List[float]]=list(q)ifisinstance(q,Iterable)elseqforvinqqifisinstance(qq,list)else[qq]:ifnotisinstance(v,float):raiseTypeError("q must be a float or an array of floats; however, [%s] found."%type(v))ifv<0.0orv>1.0:raiseValueError("percentiles should all be in the interval [0, 1].")defquantile(psser:"Series")->PySparkColumn:spark_type=psser.spark.data_typespark_column=psser.spark.columnifisinstance(spark_type,(BooleanType,NumericType,NullType)):returnF.percentile_approx(spark_column.cast(DoubleType()),qq,accuracy)else:raiseTypeError("Could not convert {} ({}) to numeric".format(spark_type_to_pandas_dtype(spark_type),spark_type.simpleString()))ifisinstance(qq,list):# First calculate the percentiles from all columns and map it to each `quantiles`# by creating each entry as a struct. 
So, it becomes an array of structs as below:## +-----------------------------------------+# | arrays|# +-----------------------------------------+# |[[0.25, 2, 6], [0.5, 3, 7], [0.75, 4, 8]]|# +-----------------------------------------+percentile_cols:List[PySparkColumn]=[]percentile_col_names:List[str]=[]column_labels:List[Label]=[]forlabel,columninzip(self._internal.column_labels,self._internal.data_spark_column_names):psser=self._psser_for(label)is_numeric_or_boolean=isinstance(psser.spark.data_type,(NumericType,BooleanType))keep_column=notnumeric_onlyoris_numeric_or_booleanifkeep_column:percentile_col=quantile(psser)percentile_cols.append(percentile_col.alias(column))percentile_col_names.append(column)column_labels.append(label)iflen(percentile_cols)==0:returnDataFrame(index=qq)sdf=self._internal.spark_frame.select(percentile_cols)# Here, after select percentile cols, a spark_frame looks like below:# +---------+---------+# | a| b|# +---------+---------+# |[2, 3, 4]|[6, 7, 8]|# +---------+---------+cols_dict:Dict[str,List[PySparkColumn]]={}forcolumninpercentile_col_names:cols_dict[column]=list()foriinrange(len(qq)):cols_dict[column].append(scol_for(sdf,column)[i].alias(column))internal_index_column=SPARK_DEFAULT_INDEX_NAMEcols=[]fori,colinenumerate(zip(*cols_dict.values())):cols.append(F.struct(F.lit(qq[i]).alias(internal_index_column),*col))sdf=sdf.select(F.array(*cols).alias("arrays"))# And then, explode it and manually set the index.# +-----------------+---+---+# |__index_level_0__| a| b|# +-----------------+---+---+# | 0.25| 2| 6|# | 0.5| 3| 7|# | 0.75| 4| 8|# +-----------------+---+---+sdf=sdf.select(F.explode(F.col("arrays"))).selectExpr("col.*")internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,internal_index_column)],column_labels=column_labels,data_spark_columns=[scol_for(sdf,col)forcolinpercentile_col_names],)returnDataFrame(internal)else:returnself._reduce_for_stat_function(quantile,name="quantile",numeric_only=numeric_only).rename(qq)
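    # Doctest-style sketch for `quantile` (illustrative): a float returns a Series, a list of
    # floats returns a DataFrame indexed by the requested quantiles; results are approximate
    # (percentile_approx) and tunable via `accuracy`.
    #
    #   >>> psdf = ps.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]})
    #   >>> psdf.quantile(.5)
    #   a    3.0
    #   b    7.0
    #   Name: 0.5, dtype: float64
    #   >>> quartiles = psdf.quantile([.25, .5, .75], accuracy=100000)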
[docs]defquery(self,expr:str,inplace:bool=False)->Optional["DataFrame"]:""" Query the columns of a DataFrame with a boolean expression. .. note:: Internal columns that starting with a '__' prefix are able to access, however, they are not supposed to be accessed. .. note:: This API delegates to Spark SQL so the syntax follows Spark SQL. Therefore, the pandas specific syntax such as `@` is not supported. If you want the pandas syntax, you can work around with :meth:`DataFrame.pandas_on_spark.apply_batch`, but you should be aware that `query_func` will be executed at different nodes in a distributed manner. So, for example to use `@` syntax, make sure the variable is serialized by putting it within the closure as below. >>> df = ps.DataFrame({'A': range(2000), 'B': range(2000)}) >>> def query_func(pdf): ... num = 1995 ... return pdf.query('A > @num') >>> df.pandas_on_spark.apply_batch(query_func) A B 1996 1996 1996 1997 1997 1997 1998 1998 1998 1999 1999 1999 Parameters ---------- expr : str The query string to evaluate. You can refer to column names that contain spaces by surrounding them in backticks. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. inplace : bool Whether the query should modify the data in place or return a modified copy. Returns ------- DataFrame DataFrame resulting from the provided query expression. Examples -------- >>> df = ps.DataFrame({'A': range(1, 6), ... 'B': range(10, 0, -2), ... 'C C': range(10, 5, -1)}) >>> df A B C C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query('A > B') A B C C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] A B C C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. >>> df.query('B == `C C`') A B C C 0 1 10 10 The previous expression is equivalent to >>> df[df.B == df['C C']] A B C C 0 1 10 10 """ifisinstance(self.columns,pd.MultiIndex):raiseTypeError("Doesn't support for MultiIndex columns")ifnotisinstance(expr,str):raiseTypeError("expr must be a string to be evaluated, {} given".format(type(expr).__name__))inplace=validate_bool_kwarg(inplace,"inplace")data_columns=[label[0]forlabelinself._internal.column_labels]sdf=self._internal.spark_frame.select(self._internal.index_spark_columns+[scol.alias(col)forscol,colinzip(self._internal.data_spark_columns,data_columns)]).filter(expr)internal=self._internal.with_new_sdf(sdf,data_columns=data_columns)ifinplace:self._update_internal_frame(internal)returnNoneelse:returnDataFrame(internal)
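    # Doctest-style sketch for `query` (illustrative): the expression is evaluated by Spark SQL,
    # so backticks quote column names with spaces and pandas' `@variable` syntax is unavailable.
    #
    #   >>> psdf = ps.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C C': range(10, 5, -1)})
    #   >>> faster_rows = psdf.query('A > B')
    #   >>> equal_rows = psdf.query('B == `C C`')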
[docs]deftake(self,indices:List[int],axis:Axis=0,**kwargs:Any)->"DataFrame":""" Return the elements in the given *positional* indices along an axis. This means that we are not indexing according to actual values in the index attribute of the object. We are indexing according to the actual position of the element in the object. Parameters ---------- indices : array-like An array of ints indicating which positions to take. axis : {0 or 'index', 1 or 'columns', None}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. **kwargs For compatibility with :meth:`numpy.take`. Has no effect on the output. Returns ------- taken : same type as caller An array-like containing the elements taken from the object. See Also -------- DataFrame.loc : Select a subset of a DataFrame by labels. DataFrame.iloc : Select a subset of a DataFrame by positions. numpy.take : Take elements from an array along an axis. Examples -------- >>> df = ps.DataFrame([('falcon', 'bird', 389.0), ... ('parrot', 'bird', 24.0), ... ('lion', 'mammal', 80.5), ... ('monkey', 'mammal', np.nan)], ... columns=['name', 'class', 'max_speed'], ... index=[0, 2, 3, 1]) >>> df name class max_speed 0 falcon bird 389.0 2 parrot bird 24.0 3 lion mammal 80.5 1 monkey mammal NaN Take elements at positions 0 and 3 along the axis 0 (default). Note how the actual indices selected (0 and 1) do not correspond to our selected indices 0 and 3. That's because we are selecting the 0th and 3rd rows, not rows whose indices equal 0 and 3. >>> df.take([0, 3]).sort_index() name class max_speed 0 falcon bird 389.0 1 monkey mammal NaN Take elements at indices 1 and 2 along the axis 1 (column selection). >>> df.take([1, 2], axis=1) class max_speed 0 bird 389.0 2 bird 24.0 3 mammal 80.5 1 mammal NaN We may take elements using negative integers for positive indices, starting from the end of the object, just like with Python lists. >>> df.take([-1, -2]).sort_index() name class max_speed 1 monkey mammal NaN 3 lion mammal 80.5 """axis=validate_axis(axis)ifnotis_list_like(indices)orisinstance(indices,(dict,set)):raiseTypeError("`indices` must be a list-like except dict or set")ifaxis==0:returncast(DataFrame,self.iloc[indices,:])else:returncast(DataFrame,self.iloc[:,indices])
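    # Doctest-style sketch for `take` (illustrative): purely positional selection along either
    # axis, delegated to `iloc`.
    #
    #   >>> psdf = ps.DataFrame({'name': ['falcon', 'parrot', 'lion'], 'max_speed': [389.0, 24.0, 80.5]})
    #   >>> first_and_last = psdf.take([0, -1])   # rows by position, not by index label
    #   >>> speed_only = psdf.take([1], axis=1)   # columns by position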
[docs]defeval(self,expr:str,inplace:bool=False)->Optional[DataFrameOrSeries]:""" Evaluate a string describing operations on DataFrame columns. Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. Parameters ---------- expr : str The expression string to evaluate. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, a new DataFrame is returned. Returns ------- The result of the evaluation. See Also -------- DataFrame.query : Evaluates a boolean expression to query the columns of a frame. DataFrame.assign : Can evaluate an expression or function to create new values for a column. eval : Evaluate a Python expression as a string using various backends. Examples -------- >>> df = ps.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) >>> df A B 0 1 10 1 2 8 2 3 6 3 4 4 4 5 2 >>> df.eval('A + B') 0 11 1 10 2 9 3 8 4 7 dtype: int64 Assignment is allowed though by default the original DataFrame is not modified. >>> df.eval('C = A + B') A B C 0 1 10 11 1 2 8 10 2 3 6 9 3 4 4 8 4 5 2 7 >>> df A B 0 1 10 1 2 8 2 3 6 3 4 4 4 5 2 Use ``inplace=True`` to modify the original DataFrame. >>> df.eval('C = A + B', inplace=True) >>> df A B C 0 1 10 11 1 2 8 10 2 3 6 9 3 4 4 8 4 5 2 7 """frompyspark.pandas.seriesimportfirst_seriesifisinstance(self.columns,pd.MultiIndex):raiseTypeError("`eval` is not supported for multi-index columns")inplace=validate_bool_kwarg(inplace,"inplace")should_return_series=Falseseries_name=Noneshould_return_scalar=False# Since `eval_func` doesn't have a type hint, inferring the schema is always preformed# in the `apply_batch`. Hence, the variables `should_return_series`, `series_name`,# and `should_return_scalar` can be updated.defeval_func(pdf):# type: ignore[no-untyped-def]nonlocalshould_return_seriesnonlocalseries_namenonlocalshould_return_scalarresult_inner=pdf.eval(expr,inplace=inplace)ifinplace:result_inner=pdfifisinstance(result_inner,pd.Series):should_return_series=Trueseries_name=result_inner.nameresult_inner=result_inner.to_frame()elifis_scalar(result_inner):should_return_scalar=Trueresult_inner=pd.Series(result_inner).to_frame()returnresult_innerresult=self.pandas_on_spark.apply_batch(eval_func)ifinplace:# Here, the result is always a frame because the error is thrown during schema inference# from pandas.self._update_internal_frame(result._internal,check_same_anchor=False)returnNoneelifshould_return_series:returnfirst_series(result).rename(series_name)elifshould_return_scalar:returnfirst_series(result)[0]else:# Returns a framereturnresult
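    # Doctest-style sketch for `eval` (illustrative): plain expressions return a Series,
    # assignment expressions return a new frame, and `inplace=True` mutates the caller.
    #
    #   >>> psdf = ps.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
    #   >>> summed = psdf.eval('A + B')
    #   >>> with_c = psdf.eval('C = A + B')
    #   >>> psdf.eval('C = A + B', inplace=True)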
[docs]defexplode(self,column:Name,ignore_index:bool=False)->"DataFrame":""" Transform each element of a list-like to a row, replicating index values. Parameters ---------- column : str or tuple Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. Returns ------- DataFrame Exploded lists to rows of the subset columns; index will be duplicated for these rows. See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) index labels. DataFrame.melt : Unpivot a DataFrame from wide format to long format. Examples -------- >>> df = ps.DataFrame({'A': [[1, 2, 3], [], [3, 4]], 'B': 1}) >>> df A B 0 [1, 2, 3] 1 1 [] 1 2 [3, 4] 1 >>> df.explode('A') A B 0 1.0 1 0 2.0 1 0 3.0 1 1 NaN 1 2 3.0 1 2 4.0 1 >>> df.explode('A', ignore_index=True) A B 0 1.0 1 1 2.0 1 2 3.0 1 3 NaN 1 4 3.0 1 5 4.0 1 """frompyspark.pandas.seriesimportSeriesifnotis_name_like_value(column):raiseTypeError("column must be a scalar")psdf:DataFrame=DataFrame(self._internal.resolved_copy)psser=psdf[column]ifnotisinstance(psser,Series):raiseValueError("The column %s is not unique. For a multi-index, the label must be a tuple ""with elements corresponding to each level."%name_like_string(column))ifnotisinstance(psser.spark.data_type,ArrayType):returnself.copy()sdf=psdf._internal.spark_frame.withColumn(psser._internal.data_spark_column_names[0],F.explode_outer(psser.spark.column))data_fields=psdf._internal.data_fields.copy()idx=psdf._internal.column_labels.index(psser._column_label)field=data_fields[idx]spark_type=cast(ArrayType,field.spark_type).elementTypedtype=spark_type_to_pandas_dtype(spark_type)data_fields[idx]=field.copy(dtype=dtype,spark_type=spark_type,nullable=True)internal=psdf._internal.with_new_sdf(sdf,data_fields=data_fields)result_df:DataFrame=DataFrame(internal)returnresult_df.reset_index(drop=True)ifignore_indexelseresult_df
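    # Doctest-style sketch for `explode` (illustrative): each element of an ArrayType column
    # becomes its own row, while empty lists yield a NaN row; the index is duplicated unless
    # `ignore_index=True`.
    #
    #   >>> psdf = ps.DataFrame({'A': [[1, 2, 3], [], [3, 4]], 'B': 1})
    #   >>> exploded = psdf.explode('A')
    #   >>> renumbered = psdf.explode('A', ignore_index=True)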
[docs]defmode(self,axis:Axis=0,numeric_only:bool=False,dropna:bool=True)->"DataFrame":""" Get the mode(s) of each element along the selected axis. The mode of a set of values is the value that appears most often. It can be multiple values. .. versionadded:: 3.4.0 Parameters ---------- axis : {0 or 'index'}, default 0 Axis for the function to be applied on. numeric_only : bool, default False If True, only apply to numeric columns. dropna : bool, default True Don't consider counts of NaN/NaT. Returns ------- DataFrame The modes of each column or row. See Also -------- Series.mode : Return the highest frequency value in a Series. Series.value_counts : Return the counts of values in a Series. Examples -------- >>> df = ps.DataFrame([('bird', 2, 2), ... ('mammal', 4, np.nan), ... ('arthropod', 8, 0), ... ('bird', 2, np.nan)], ... index=('falcon', 'horse', 'spider', 'ostrich'), ... columns=('species', 'legs', 'wings')) >>> df species legs wings falcon bird 2 2.0 horse mammal 4 NaN spider arthropod 8 0.0 ostrich bird 2 NaN By default missing values are not considered, and the mode of wings are both 0 and 2. Because the resulting DataFrame has two rows, the second row of ``species`` and ``legs`` contains ``NaN``. >>> df.mode() species legs wings 0 bird 2.0 0.0 1 None NaN 2.0 Setting ``dropna=False`` ``NaN`` values are considered and they can be the mode (like for wings). >>> df.mode(dropna=False) species legs wings 0 bird 2 NaN Setting ``numeric_only=True``, only the mode of numeric columns is computed, and columns of other types are ignored. >>> df.mode(numeric_only=True) legs wings 0 2.0 0.0 1 NaN 2.0 """axis=validate_axis(axis,none_axis=0)ifaxis!=0:raiseValueError('axis should be either 0 or "index" currently.')ifnumeric_onlyisNoneandaxis==0:numeric_only=Truemode_scols:List[PySparkColumn]=[]mode_col_names:List[str]=[]mode_labels:List[Label]=[]forlabel,col_nameinzip(self._internal.column_labels,self._internal.data_spark_column_names):psser=self._psser_for(label)is_numeric=isinstance(psser.spark.data_type,(NumericType,BooleanType))ifnotnumeric_onlyoris_numeric:scol=psser.spark.columnmode_scol=SF.mode(scol,dropna).alias(col_name)mode_scols.append(mode_scol)mode_col_names.append(col_name)mode_labels.append(label)# Here, after aggregation, a spark_frame looks like below:# +-------+----+----------+# |species|legs| wings|# +-------+----+----------+# | [bird]| [2]|[0.0, 2.0]|# +-------+----+----------+sdf=self._internal.spark_frame.select(mode_scols)sdf=sdf.select(*[F.array_sort(F.col(name)).alias(name)fornameinmode_col_names])zip_col_name=verify_temp_column_name(sdf,"__mode_zip_tmp_col__")explode_col_name=verify_temp_column_name(sdf,"__mode_explode_tmp_col__")# After this transformation, sdf turns out to be:# +-------+----+-----+# |species|legs|wings|# +-------+----+-----+# | bird| 2| 0.0|# | NULL|NULL| 2.0|# +-------+----+-----+sdf=(sdf.select(F.arrays_zip(*[F.col(name)fornameinmode_col_names]).alias(zip_col_name)).select(F.explode(F.col(zip_col_name)).alias(explode_col_name)).select(*[F.col("{0}.{1}".format(explode_col_name,name)).alias(name)fornameinmode_col_names]))sdf=sdf.withColumn(SPARK_DEFAULT_INDEX_NAME,F.monotonically_increasing_id())internal=InternalFrame(spark_frame=sdf,index_spark_columns=[scol_for(sdf,SPARK_DEFAULT_INDEX_NAME)],column_labels=mode_labels,data_spark_columns=[scol_for(sdf,col)forcolinmode_col_names],)returnDataFrame(internal)
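    # Doctest-style sketch for `mode` (illustrative): one row per distinct mode, with columns
    # that have fewer modes padded by NaN/None.
    #
    #   >>> psdf = ps.DataFrame({'species': ['bird', 'mammal', 'arthropod', 'bird'], 'legs': [2, 4, 8, 2]})
    #   >>> modes = psdf.mode()                        # 'bird' and 2 appear most often
    #   >>> numeric_modes = psdf.mode(numeric_only=True)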
[docs]deftail(self,n:int=5)->"DataFrame":""" Return the last `n` rows. This function returns last `n` rows from the object based on position. It is useful for quickly verifying data, for example, after sorting or appending rows. For negative values of `n`, this function returns all rows except the first `n` rows, equivalent to ``df[n:]``. Parameters ---------- n : int, default 5 Number of rows to select. Returns ------- type of caller The last `n` rows of the caller object. See Also -------- DataFrame.head : The first `n` rows of the caller object. Examples -------- >>> df = ps.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df animal 0 alligator 1 bee 2 falcon 3 lion 4 monkey 5 parrot 6 shark 7 whale 8 zebra Viewing the last 5 lines >>> df.tail() # doctest: +SKIP animal 4 monkey 5 parrot 6 shark 7 whale 8 zebra Viewing the last `n` lines (three in this case) >>> df.tail(3) # doctest: +SKIP animal 6 shark 7 whale 8 zebra For negative values of `n` >>> df.tail(-3) # doctest: +SKIP animal 3 lion 4 monkey 5 parrot 6 shark 7 whale 8 zebra """ifnotisinstance(n,int):raiseTypeError("bad operand type for unary -: '{}'".format(type(n).__name__))ifn<0:n=len(self)+nifn<=0:returnps.DataFrame(self._internal.with_filter(F.lit(False)))# Should use `resolved_copy` here for the case like `(psdf + 1).tail()`sdf=self._internal.resolved_copy.spark_framerows=sdf.tail(n)new_sdf=default_session().createDataFrame(rows,sdf.schema)returnDataFrame(self._internal.with_new_sdf(new_sdf))
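    # Doctest-style sketch for `tail` (illustrative): backed by Spark's `DataFrame.tail`, so the
    # selected rows are collected and re-parallelized into a new frame.
    #
    #   >>> psdf = ps.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', 'monkey']})
    #   >>> last_two = psdf.tail(2)
    #   >>> all_but_first_two = psdf.tail(-2)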
[docs]defalign(self,other:DataFrameOrSeries,join:str="outer",axis:Optional[Axis]=None,copy:bool=True,)->Tuple["DataFrame",DataFrameOrSeries]:""" Align two objects on their axes with the specified join method. Join method is specified for each axis Index. Parameters ---------- other : DataFrame or Series join : {{'outer', 'inner', 'left', 'right'}}, default 'outer' axis : allowed axis of the other object, default None Align on index (0), columns (1), or both (None). copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. Returns ------- (left, right) : (DataFrame, type of other) Aligned objects. Examples -------- >>> ps.set_option("compute.ops_on_diff_frames", True) >>> df1 = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30]) >>> df2 = ps.DataFrame({"a": [4, 5, 6], "c": ["d", "e", "f"]}, index=[10, 11, 12]) Align both axis: >>> aligned_l, aligned_r = df1.align(df2) >>> aligned_l.sort_index() a b c 10 1.0 a NaN 11 NaN None NaN 12 NaN None NaN 20 2.0 b NaN 30 3.0 c NaN >>> aligned_r.sort_index() a b c 10 4.0 NaN d 11 5.0 NaN e 12 6.0 NaN f 20 NaN NaN None 30 NaN NaN None Align only axis=0 (index): >>> aligned_l, aligned_r = df1.align(df2, axis=0) >>> aligned_l.sort_index() a b 10 1.0 a 11 NaN None 12 NaN None 20 2.0 b 30 3.0 c >>> aligned_r.sort_index() a c 10 4.0 d 11 5.0 e 12 6.0 f 20 NaN None 30 NaN None Align only axis=1 (column): >>> aligned_l, aligned_r = df1.align(df2, axis=1) >>> aligned_l.sort_index() a b c 10 1 a NaN 20 2 b NaN 30 3 c NaN >>> aligned_r.sort_index() a b c 10 4 NaN d 11 5 NaN e 12 6 NaN f Align with the join type "inner": >>> aligned_l, aligned_r = df1.align(df2, join="inner") >>> aligned_l.sort_index() a 10 1 >>> aligned_r.sort_index() a 10 4 Align with a Series: >>> s = ps.Series([7, 8, 9], index=[10, 11, 12]) >>> aligned_l, aligned_r = df1.align(s, axis=0) >>> aligned_l.sort_index() a b 10 1.0 a 11 NaN None 12 NaN None 20 2.0 b 30 3.0 c >>> aligned_r.sort_index() 10 7.0 11 8.0 12 9.0 20 NaN 30 NaN dtype: float64 >>> ps.reset_option("compute.ops_on_diff_frames") """frompyspark.pandas.seriesimportSeries,first_seriesifnotisinstance(other,(DataFrame,Series)):raiseTypeError("unsupported type: {}".format(type(other).__name__))how=validate_how(join)axis=validate_axis(axis,None)right_is_series=isinstance(other,Series)ifright_is_series:ifaxisisNone:raiseValueError("Must specify axis=0 or 1")elifaxis!=0:raiseNotImplementedError("align currently only works for axis=0 when right is Series")left=selfright=otherif(axisisNoneoraxis==0)andnotsame_anchor(left,right):combined=combine_frames(left,right,how=how)left=combined["this"]right=combined["that"]ifright_is_series:right=first_series(cast(DataFrame[Any],right)).rename(other.name)if(axisisNoneoraxis==1)andleft._internal.column_labels!=right._internal.column_labels:ifleft._internal.column_labels_level!=right._internal.column_labels_level:raiseValueError("cannot join with no overlapping index 
names")left=left.copy()right=right.copy()ifhow=="full":column_labels=sorted(list(set(left._internal.column_labels)|set(right._internal.column_labels)))elifhow=="inner":column_labels=sorted(list(set(left._internal.column_labels)&set(right._internal.column_labels)))elifhow=="left":column_labels=left._internal.column_labelselse:column_labels=right._internal.column_labelsforlabelincolumn_labels:iflabelnotinleft._internal.column_labels:left[label]=F.lit(None).cast(DoubleType())left=left[column_labels]forlabelincolumn_labels:iflabelnotinright._internal.column_labels:right[label]=F.lit(None).cast(DoubleType())right=right[column_labels]return(left.copy(),right.copy())ifcopyelse(left,right)
[docs]@staticmethoddeffrom_dict(data:Dict[Name,Sequence[Any]],orient:str="columns",dtype:Union[str,Dtype]=None,columns:Optional[List[Name]]=None,)->"DataFrame":""" Construct DataFrame from dict of array-like or dicts. Creates DataFrame object from dictionary by columns or by index allowing dtype specification. Parameters ---------- data : dict Of the form {field : array-like} or {field : dict}. orient : {'columns', 'index'}, default 'columns' The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise, if the keys should be rows, pass 'index'. dtype : dtype, default None Data type to force, otherwise infer. columns : list, default None Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'``. Returns ------- DataFrame See Also -------- DataFrame.from_records : DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame. DataFrame : DataFrame object creation using constructor. Examples -------- By default the keys of the dict become the DataFrame columns: >>> data = {'col_1': [3, 2, 1, 0], 'col_2': [10, 20, 30, 40]} >>> ps.DataFrame.from_dict(data) col_1 col_2 0 3 10 1 2 20 2 1 30 3 0 40 Specify ``orient='index'`` to create the DataFrame using dictionary keys as rows: >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 20, 30, 40]} >>> ps.DataFrame.from_dict(data, orient='index').sort_index() 0 1 2 3 row_1 3 2 1 0 row_2 10 20 30 40 When using the 'index' orientation, the column names can be specified manually: >>> ps.DataFrame.from_dict(data, orient='index', ... columns=['A', 'B', 'C', 'D']).sort_index() A B C D row_1 3 2 1 0 row_2 10 20 30 40 """returnDataFrame(pd.DataFrame.from_dict(data,orient=orient,dtype=dtype,columns=columns# type: ignore[arg-type]))
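    # Doctest-style sketch for `from_dict` (illustrative): dict keys become columns by default,
    # or rows when `orient='index'`.
    #
    #   >>> data = {'col_1': [3, 2, 1, 0], 'col_2': [10, 20, 30, 40]}
    #   >>> by_columns = ps.DataFrame.from_dict(data)
    #   >>> by_rows = ps.DataFrame.from_dict(data, orient='index', columns=['A', 'B', 'C', 'D'])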
# Override the `groupby` to specify the actual return type annotation.
[docs]defresample(self,rule:str,closed:Optional[str]=None,label:Optional[str]=None,on:Optional["Series"]=None,)->"DataFrameResampler":""" Resample time-series data. Convenience method for frequency conversion and resampling of time series. The object must have a datetime-like index (only support `DatetimeIndex` for now), or the caller must pass the label of a datetime-like series/index to the ``on`` keyword parameter. .. versionadded:: 3.4.0 Parameters ---------- rule : str The offset string or object representing target conversion. Currently, supported units are {'Y', 'A', 'M', 'D', 'H', 'T', 'MIN', 'S'}. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'A', 'Y' and 'M' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' for all frequency offsets except for 'A', 'Y' and 'M' which all have a default of 'right'. on : Series, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. Returns ------- DataFrameResampler See Also -------- Series.resample : Resample a Series. groupby : Group by mapping, function, label, or list of labels. """frompyspark.pandas.indexesimportDatetimeIndexfrompyspark.pandas.resampleimportDataFrameResamplerifonisNoneandnotisinstance(self.index,DatetimeIndex):raiseNotImplementedError("resample currently works only for DatetimeIndex")ifonisnotNoneandnotisinstance(as_spark_type(on.dtype),(TimestampType,TimestampNTZType)):raiseNotImplementedError("`on` currently works only for TimestampType")agg_columns:List[ps.Series]=[]forcolumn_labelinself._internal.column_labels:ifisinstance(self._internal.spark_type_for(column_label),(NumericType,BooleanType)):agg_columns.append(self._psser_for(column_label))iflen(agg_columns)==0:raiseValueError("No available aggregation columns!")returnDataFrameResampler(psdf=self,resamplekey=on,rule=rule,closed=closed,label=label,agg_columns=agg_columns,)
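    # Doctest-style sketch for `resample` (illustrative; assumes a DatetimeIndex and that the
    # rule '2D' is accepted, since 'D' is a supported unit). The returned DataFrameResampler
    # supports the usual reducers such as sum/mean/min/max.
    #
    #   >>> pidx = pd.date_range('2022-01-01', periods=6, freq='D')
    #   >>> psdf = ps.DataFrame({'v': range(6)}, index=pidx)
    #   >>> two_day_sums = psdf.resample('2D').sum()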
def_to_internal_pandas(self)->pd.DataFrame:""" Return a pandas DataFrame directly from _internal to avoid overhead of copy. This method is for internal use only. """returnself._internal.to_pandas_framedef_get_or_create_repr_pandas_cache(self,n:int)->Union[pd.DataFrame,pd.Series]:ifnothasattr(self,"_repr_pandas_cache")ornnotinself._repr_pandas_cache:object.__setattr__(self,"_repr_pandas_cache",{n:self.head(n+1)._to_internal_pandas()})returnself._repr_pandas_cache[n]def__repr__(self)->str:max_display_count=get_option("display.max_rows")ifmax_display_countisNone:returnself._to_internal_pandas().to_string()pdf=cast("DataFrame",self._get_or_create_repr_pandas_cache(max_display_count))pdf_length=len(pdf)pdf=cast("DataFrame",pdf.iloc[:max_display_count])ifpdf_length>max_display_count:repr_string=pdf.to_string(show_dimensions=True)match=REPR_PATTERN.search(repr_string)ifmatchisnotNone:nrows=match.group("rows")ncols=match.group("columns")footer="\n\n[Showing only the first {nrows} rows x {ncols} columns]".format(nrows=nrows,ncols=ncols)returnREPR_PATTERN.sub(footer,repr_string)returnpdf.to_string()def_repr_html_(self)->str:max_display_count=get_option("display.max_rows")ifmax_display_countisNone:returnself._to_internal_pandas().to_html(notebook=True)pdf=self._get_or_create_repr_pandas_cache(max_display_count)pdf_length=len(pdf)pdf=pdf.iloc[:max_display_count]ifpdf_length>max_display_count:repr_html=pdf.to_html(show_dimensions=True,notebook=True)match=REPR_HTML_PATTERN.search(repr_html)ifmatchisnotNone:nrows=match.group("rows")ncols=match.group("columns")by=chr(215)footer=("\n<p>Showing only the first {rows} rows ""{by}{cols} columns</p>\n</div>".format(rows=nrows,by=by,cols=ncols))returnREPR_HTML_PATTERN.sub(footer,repr_html)returnpdf.to_html(notebook=True)def__getitem__(self,key:Any)->Any:frompyspark.pandas.seriesimportSeriesifkeyisNone:raiseKeyError("none key")elifisinstance(key,Series):returnself.loc[key.astype(bool)]elifisinstance(key,slice):ifany(type(n)==intorNonefornin[key.start,key.stop]):# Seems like pandas Frame always uses int as positional search when slicing# with ints.returnself.iloc[key]returnself.loc[key]elifis_name_like_value(key):returnself.loc[:,key]elifis_list_like(key):returnself.loc[:,list(key)]def__setitem__(self,key:Any,value:Any)->None:frompyspark.pandas.seriesimportSeriesifisinstance(value,(DataFrame,Series))andnotsame_anchor(value,self):# Different Series or DataFrameslevel=self._internal.column_labels_levelkey=DataFrame._index_normalized_label(level,key)value=DataFrame._index_normalized_frame(level,value)defassign_columns(psdf:DataFrame,this_column_labels:List[Label],that_column_labels:List[Label])->Iterator[Tuple["Series",Label]]:assertlen(key)==len(that_column_labels)# Note that here intentionally uses `zip_longest` that combine# that_columns.fork,this_label,that_labelinzip_longest(key,this_column_labels,that_column_labels):yield(psdf._psser_for(that_label),tuple(["that",*k]))ifthis_labelisnotNoneandthis_label[1:]!=k:yield(psdf._psser_for(this_label),this_label)psdf=align_diff_frames(assign_columns,self,value,fillna=False,how="left")elifisinstance(value,list):iflen(self)!=len(value):raiseValueError("Length of values does not match length of index")# TODO: avoid using default 
index?withoption_context("compute.default_index_type","distributed-sequence","compute.ops_on_diff_frames",True,):psdf=self.reset_index()psdf[key]=ps.DataFrame(value)psdf=psdf.set_index(psdf.columns[:self._internal.index_level])psdf.index.names=self.index.nameselifisinstance(key,list):assertisinstance(value,DataFrame)# Same DataFrames.field_names=value.columnspsdf=self._assign({k:value[c]fork,cinzip(key,field_names)})else:# Same Series.psdf=self._assign({key:value})# Since Spark 3.4, df.__setitem__ generates a new dataframe instead of operating# in-place to follow pandas v1.4 behavior, see also SPARK-38946.self._update_internal_frame(psdf._internal,anchor_force_disconnect=True)@staticmethoddef_index_normalized_label(level:int,labels:Union[Name,Sequence[Name]])->List[Label]:""" Returns a label that is normalized against the current column index level. For example, the key "abc" can be ("abc", "", "") if the current Frame has a multi-index for its column """ifis_name_like_tuple(labels):labels=[labels]elifis_name_like_value(labels):labels=[(labels,)]else:labels=[kifis_name_like_tuple(k)else(k,)forkinlabels]ifany(len(label)>levelforlabelinlabels):raiseKeyError("Key length ({}) exceeds index depth ({})".format(max(len(label)forlabelinlabels),level))return[tuple(list(label)+([""]*(level-len(label))))forlabelinlabels]@staticmethoddef_index_normalized_frame(level:int,psser_or_psdf:DataFrameOrSeries)->"DataFrame":""" Returns a frame that is normalized against the current column index level. For example, the name in `pd.Series([...], name="abc")` can be can be ("abc", "", "") if the current DataFrame has a multi-index for its column """frompyspark.pandas.seriesimportSeriesifisinstance(psser_or_psdf,Series):psdf=psser_or_psdf.to_frame()else:assertisinstance(psser_or_psdf,DataFrame),type(psser_or_psdf)psdf=psser_or_psdf.copy()psdf.columns=pd.MultiIndex.from_tuples([tuple([name_like_string(label)]+([""]*(level-1)))forlabelinpsdf._internal.column_labels],)returnpsdfdef_build_fallback_method(self,method:str)->Callable:def_internal_fallback_function(*args:Any,**kwargs:Any)->"DataFrame":log_advice(f"`{method}` is executed in fallback mode. It loads partial data into the "f"driver's memory to infer the schema, and loads all data into one executor's "f"memory to compute. 
It should only be used if the pandas DataFrame is expected "f"to be small.")input_df=self.copy()index_names=input_df.index.namessdf=input_df._internal.spark_frametmp_agg_column_name=verify_temp_column_name(sdf,f"__tmp_aggregate_col_for_frame_{method}__")input_df[tmp_agg_column_name]=0tmp_idx_column_name=verify_temp_column_name(sdf,f"__tmp_index_col_for_frame_{method}__")input_df[tmp_idx_column_name]=input_df.index# TODO(SPARK-46859): specify the return type if possibledefcompute_function(pdf:pd.DataFrame):# type: ignore[no-untyped-def]pdf=pdf.drop(columns=[tmp_agg_column_name])pdf=pdf.set_index(tmp_idx_column_name,drop=True)pdf=pdf.sort_index()pdf=getattr(pdf,method)(*args,**kwargs)pdf[tmp_idx_column_name]=pdf.indexreturnpdf.reset_index(drop=True)output_df=input_df.groupby(tmp_agg_column_name).apply(compute_function)output_df=output_df.set_index(tmp_idx_column_name)output_df.index.names=index_namesreturnoutput_dfreturn_internal_fallback_functiondef_asfreq_fallback(self,*args:Any,**kwargs:Any)->"DataFrame":_f=self._build_fallback_method("asfreq")return_f(*args,**kwargs)def_asof_fallback(self,*args:Any,**kwargs:Any)->"DataFrame":_f=self._build_fallback_method("asof")return_f(*args,**kwargs)def_convert_dtypes_fallback(self,*args:Any,**kwargs:Any)->"DataFrame":_f=self._build_fallback_method("convert_dtypes")return_f(*args,**kwargs)def_infer_objects_fallback(self,*args:Any,**kwargs:Any)->"DataFrame":_f=self._build_fallback_method("infer_objects")return_f(*args,**kwargs)def_set_axis_fallback(self,*args:Any,**kwargs:Any)->"DataFrame":_f=self._build_fallback_method("set_axis")return_f(*args,**kwargs)def_to_feather_fallback(self,*args:Any,**kwargs:Any)->None:_f=self._build_fallback_driver_method("to_feather")return_f(*args,**kwargs)def_to_stata_fallback(self,*args:Any,**kwargs:Any)->None:_f=self._build_fallback_driver_method("to_stata")return_f(*args,**kwargs)def__getattr__(self,key:str)->Any:ifkey.startswith("__"):raiseAttributeError(key)ifhasattr(MissingPandasLikeDataFrame,key):ifget_option("compute.pandas_fallback"):new_key=f"_{key}_fallback"ifhasattr(self,new_key):returngetattr(self,new_key)property_or_func=getattr(MissingPandasLikeDataFrame,key)ifisinstance(property_or_func,property):returnproperty_or_func.fget(self)else:returnpartial(property_or_func,self)try:returnself.loc[:,key]exceptKeyError:raiseAttributeError("'%s' object has no attribute '%s'"%(self.__class__.__name__,key))def__setattr__(self,key:str,value:Any)->None:try:object.__getattribute__(self,key)returnobject.__setattr__(self,key,value)exceptAttributeError:passif(key,)inself._internal.column_labels:self[key]=valueelse:msg="pandas-on-Spark doesn't allow columns to be created via a new attribute name"ifis_testing():raiseAssertionError(msg)else:warnings.warn(msg,UserWarning)def__len__(self)->int:returnself._internal.resolved_copy.spark_frame.count()def__dir__(self)->Iterable[str]:fields=[fforfinself._internal.resolved_copy.spark_frame.schema.fieldNames()if" "notinf]returnlist(super().__dir__())+fieldsdef__iter__(self)->Iterator[Name]:returniter(self.columns)# NDArray Compatdef__array_ufunc__(self,ufunc:Callable,method:str,*inputs:Any,**kwargs:Any)->"DataFrame":# TODO: is it possible to deduplicate it with '_map_series_op'?ifall(isinstance(inp,DataFrame)forinpininputs)andany(notsame_anchor(inp,inputs[0])forinpininputs):# binary onlyassertlen(inputs)==2this=inputs[0]that=inputs[1]ifthis._internal.column_labels_level!=that._internal.column_labels_level:raiseValueError("cannot join with no overlapping index names")# Different 
DataFramesdefapply_op(psdf:DataFrame,this_column_labels:List[Label],that_column_labels:List[Label])->Iterator[Tuple["Series",Label]]:forthis_label,that_labelinzip(this_column_labels,that_column_labels):yield(ufunc(psdf._psser_for(this_label),psdf._psser_for(that_label),**kwargs).rename(this_label),this_label,)returnalign_diff_frames(apply_op,this,that,fillna=True,how="full")else:# DataFrame and Seriesapplied=[]this=inputs[0]assertall(inpisthisforinpininputsifisinstance(inp,DataFrame))forlabelinthis._internal.column_labels:arguments=[]forinpininputs:arguments.append(inp[label]ifisinstance(inp,DataFrame)elseinp)# both binary and unary.applied.append(ufunc(*arguments,**kwargs).rename(label))internal=this._internal.with_new_columns(applied)returnDataFrame(internal)def__class_getitem__(cls,params:Any)->object:# See https://github.com/python/typing/issues/193# we always wraps the given type hints by a tuple to mimic the variadic generic.returncreate_tuple_for_frame_type(params)
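    # Doctest-style sketch of the NDArray compatibility above (illustrative): NumPy ufuncs
    # dispatch through `__array_ufunc__` and are applied column by column.
    #
    #   >>> psdf = ps.DataFrame({'a': [1.0, 4.0, 9.0]})
    #   >>> roots = np.sqrt(psdf)           # unary ufunc, result stays a pandas-on-Spark DataFrame
    #   >>> doubled = np.add(psdf, psdf)    # binary ufunc on the same anchor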
def _reduce_spark_multi(sdf: PySparkDataFrame, aggs: List[PySparkColumn]) -> Any:
    """
    Performs a reduction on a spark DataFrame, the functions being known SQL aggregate functions.
    """
    SparkDataFrame = get_dataframe_class()
    assert isinstance(sdf, SparkDataFrame)
    sdf0 = sdf.agg(*aggs)
    lst = sdf0.limit(2).toPandas()
    assert len(lst) == 1, (sdf, lst)
    row = lst.iloc[0]
    lst2 = list(row)
    assert len(lst2) == len(aggs), (row, lst2)
    return lst2


class CachedDataFrame(DataFrame):
    """
    Cached pandas-on-Spark DataFrame, which corresponds to pandas DataFrame logically, but
    internally it caches the corresponding Spark DataFrame.
    """

    def __init__(self, internal: InternalFrame, storage_level: Optional[StorageLevel] = None):
        if storage_level is None:
            object.__setattr__(self, "_cached", internal.spark_frame.cache())
        elif isinstance(storage_level, StorageLevel):
            object.__setattr__(self, "_cached", internal.spark_frame.persist(storage_level))
        else:
            raise TypeError(
                "Only a valid pyspark.StorageLevel type is acceptable for the `storage_level`"
            )
        super().__init__(internal)

    def __enter__(self) -> "CachedDataFrame":
        return self

    def __exit__(
        self,
        exception_type: Optional[Type[BaseException]],
        exception_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> Optional[bool]:
        self.spark.unpersist()
        return None

    # create accessor for Spark related methods.
    spark = CachedAccessor("spark", CachedSparkFrameMethods)


def _test() -> None:
    import os
    import doctest
    import shutil
    import sys
    import tempfile
    import uuid
    from pyspark.sql import SparkSession
    import pyspark.pandas.frame

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.frame.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]").appName("pyspark.pandas.frame tests").getOrCreate()
    )
    globs["spark"] = spark

    db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
    spark.sql("CREATE DATABASE %s" % db_name)
    globs["db"] = db_name

    path = tempfile.mkdtemp()
    globs["path"] = path

    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.frame,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    shutil.rmtree(path, ignore_errors=True)
    spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
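# Doctest-style sketch of `CachedDataFrame` (illustrative): it is normally obtained through the
# `spark.cache()` / `spark.persist()` accessor and unpersisted automatically when used as a
# context manager.
#
#   >>> psdf = ps.DataFrame({'a': [1, 2, 3]})
#   >>> with psdf.spark.cache() as cached:
#   ...     row_count = len(cached)   # reuses the cached Spark plan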