
    Cd|                       d Z ddlmZ ddlZddlZddlZddlmZmZ ddl	Z
ddlZddlmZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZm Z m!Z!m"Z" dd
l#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z; d Z<d Z=d;dZ>dgdgdgdgddgg dZ?dZ@dZAd ZBd<dZCe+ZD	 	 	 	 	 	 d=dZEd ZFd  ZG eejH                  	 	 	 	 	 	 	 	 	 	 	 	 d>d!            ZHd" ZId;d#ZJd;d$ZKd% ZLd;d&ZMd;d'ZNd( ZOd?d)ZPd* ZQd+ ZR eejS                  	 	 	 	 	 	 	 	 	 	 	 	 d@d-            ZSdAd.ZTdAd/ZUdBd1ZVdCd2ZW	 	 	 	 	 dDd3ZXd4 ZYd5 ZZd6 Z[d7 Z\d8 Z]	 	 	 	 	 	 d=d9Z^d: Z_dS )Ea'	  
Algorithms that Involve Multiple DataFrames
===========================================

The pandas operations ``concat``, ``join``, and ``merge`` combine multiple
DataFrames.  This module contains analogous algorithms in the parallel case.

There are two important cases:

1.  We combine along a partitioned index
2.  We combine along an unpartitioned index or other column

In the first case we know which partitions of each dataframe interact with
which others.  This lets us be significantly more clever and efficient.

In the second case each partition from one dataset interacts with all
partitions from the other.  We handle this through a shuffle operation.

Partitioned Joins
-----------------

In the first case where we join along a partitioned index we proceed in the
following stages.

1.  Align the partitions of all inputs to be the same.  This involves a call
    to ``dd.repartition`` which will split up and concat existing partitions as
    necessary.  After this step all inputs have partitions that align with
    each other.  This step is relatively cheap.
    See the function ``align_partitions``.
2.  Remove unnecessary partitions based on the type of join we perform (left,
    right, inner, outer).  We can do this at the partition level before any
    computation happens.  We'll do it again on each partition when we call the
    in-memory function.  See the function ``require``.
3.  Embarrassingly parallel calls to ``pd.concat``, ``pd.join``, or
    ``pd.merge``.  Now that the data is aligned and unnecessary blocks have
    been removed we can rely on the fast in-memory Pandas join machinery to
    execute joins per-partition.  We know that all intersecting records exist
    within the same partition


Hash Joins via Shuffle
----------------------

When we join along an unpartitioned index or along an arbitrary column any
partition from one input might interact with any partition in another.  In
this case we perform a hash-join by shuffling data in each input by that
column.  This results in new inputs with the same partition structure cleanly
separated along that column.

We proceed with hash joins in the following stages:

1.  Shuffle each input on the specified column.  See the function
    ``dask.dataframe.shuffle.shuffle``.
2.  Perform embarrassingly parallel join across shuffled inputs.
    )annotationsN)partialwraps)is_dtype_equal)merge_sortedunique)is_dask_collectiontokenize)methods)	DataFrameIndexSeries_concat_Frame_maybe_from_pandasis_broadcastablemap_partitionsnew_dd_objectprefix_reductionsuffix_reduction)group_split_dispatchhash_object_dispatch)from_pandas)partitioning_indexrearrange_by_divisionsshuffleshuffle_group)
asciitable
check_metais_dataframe_likeis_series_like	make_metastrip_unknown_categories)HighLevelGraph)BroadcastJoinLayer)Mapplyget_default_shuffle_algorithmc                    t          t          |           fd| D             }t          |           dk    rt          d          t	          d |D                       st          d          t          t          t          d |D                                  t                    dk    rd         d         ffd| D             }t                      }d	 | D             }d
d         D ]}t                      }t          |          D ]\  }}t          |t                    rz||         }	|j        }
|	t          |
          dz
  k     r?|
|	         |k    r3|                    |j        ||         f           ||xx         dz  cc<   ~|                    d
           |                    d
           |                    |           |t                    |fS )aq  Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join, concat,
    or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        These must have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which data exist on which
        divisions
    c                T    g | ]$}t          |t                     |          "|%S  
isinstancer   .0df_is_broadcastables     4lib/python3.11/site-packages/dask/dataframe/multi.py
<listcomp>z$align_partitions.<locals>.<listcomp>~   s:    UUU2
2v 6 6U?P?PQS?T?TUBUUU    r   z$dfs contains no DataFrame and Seriesc              3  $   K   | ]}|j         V  d S Nknown_divisionsr/   r0   s     r2   	<genexpr>z#align_partitions.<locals>.<genexpr>   s%      11br!111111r4   z]Not all divisions are known, can't align partitions. Please use `set_index` to set the index.c                    g | ]	}|j         
S r+   	divisionsr9   s     r2   r3   z$align_partitions.<locals>.<listcomp>   s    *G*G*GB2<*G*G*Gr4      c                l    g | ]0}t          |t                    r|                    d           n|1S )T)force)r-   r   repartitionr/   r0   r=   s     r2   r3   z$align_partitions.<locals>.<listcomp>   sM        2<B1G1GOy---R  r4   c                    g | ]}d S )r   r+   r9   s     r2   r3   z$align_partitions.<locals>.<listcomp>   s    "Ar4   N)r   r   len
ValueErroralllistr   r   	enumerater-   r   r=   append_nametuple)dfsdfs1dfs2resultindsdLir0   jdivsr1   r=   s              @@r2   align_partitionsrW   h   s   *   0#66UUUUUUUD
3xx1}}?@@@11D11111 
 
 
 	
 VL*G*G$*G*G*GHIIJJI
9~~q\9Q<0	     D
 VVFDss^  FFt__ 
	 
	EAr"f%% 	G|s4yy1}$$aAHHbhQ0111GGGqLGGGGHHTNNNNay!!6))r4   c                    t          t          |           fd| D             }|s| S |d         j        t          fd|D                       s*t	          t          | d                   fd| D             S | S )a  Align DataFrame blocks if divisions are different.

    Note that if all divisions are unknown, but have equal npartitions, then
    they will be passed through unchanged. This is different than
    `align_partitions`, which will fail if divisions aren't all knownc                T    g | ]$}t          |t                     |          "|%S r+   r,   r.   s     r2   r3   z+_maybe_align_partitions.<locals>.<listcomp>   s:    
U
U
U"
2v 6 6
U?P?PQS?T?T
U2
U
U
Ur4   r   c              3  .   K   | ]}|j         k    V  d S r6   r<   rB   s     r2   r:   z*_maybe_align_partitions.<locals>.<genexpr>   s*      77Rr|y(777777r4   c                \    g | ](}t          |t                    s|nt                    )S r+   )r-   r   next)r/   arO   s     r2   r3   z+_maybe_align_partitions.<locals>.<listcomp>   s2    MMM1Av..>DJJMMMr4   )r   r   r=   rG   iterrW   )argsrM   r1   rO   r=   s     @@@r2   _maybe_align_partitionsr`      s       0$77
U
U
U
U
U
U
UC A I7777377777 N$c*1-..MMMMMMMMKr4   c                (   |s| |fS |D ]fdt          |          D             }t          | t          |          t          |          dz                      } t          |t          |          t          |          dz                      }| |fS )aH  Clear out divisions where required components are not present

    In left, right, or inner joins we exclude portions of the dataset if one
    side or the other is not present.  We can achieve this at the partition
    level as well

    >>> divisions = [1, 3, 5, 7, 9]
    >>> parts = [(('a', 0), None),
    ...          (('a', 1), ('b', 0)),
    ...          (('a', 2), ('b', 1)),
    ...          (None, ('b', 2))]

    >>> divisions2, parts2 = require(divisions, parts, required=[0])
    >>> divisions2
    (1, 3, 5, 7)
    >>> parts2  # doctest: +NORMALIZE_WHITESPACE
    ((('a', 0), None),
     (('a', 1), ('b', 0)),
     (('a', 2), ('b', 1)))

    >>> divisions2, parts2 = require(divisions, parts, required=[1])
    >>> divisions2
    (3, 5, 7, 9)
    >>> parts2  # doctest: +NORMALIZE_WHITESPACE
    ((('a', 1), ('b', 0)),
     (('a', 2), ('b', 1)),
     (None, ('b', 2)))

    >>> divisions2, parts2 = require(divisions, parts, required=[0, 1])
    >>> divisions2
    (3, 5, 7)
    >>> parts2  # doctest: +NORMALIZE_WHITESPACE
    ((('a', 1), ('b', 0)),
     (('a', 2), ('b', 1)))
    c                *    g | ]\  }}|         |S r6   r+   )r/   rU   prT   s      r2   r3   zrequire.<locals>.<listcomp>   s&    FFFAQqT5E15E5E5Er4      r>   )rI   rL   minmax)r=   partsrequiredpresentrT   s       @r2   requirerj      s    H   % > >FFFF5!1!1FFF)CLL3w<<!3C$CDEE	eCLL3w<<!+;;<==er4   r>   )leftleftsemileftantirightinnerouter)ro   rk   rl   rm   )ro   rn   c          	        |^}}|                     dd          }|                     dd          }|j        j        }|                    d          j        }||D ]}	d }
d }|	| v r	| |	         }
nG|	|                     dd           k    r-|r+t          | j        j        t          j                  r| j        }
|	|v r	||	         }nG|	|                     dd           k    r-|r+t          |j        j        t          j                  r|j        }d}|
C|At          j	        |

                    d          |
                    d          g          j        }|
Wt          |
t          j                  r|

                    |          | _        n" | j        d	i |	|

                    |          i} |Xt          |t          j                  r|
                    |          |_         |j        d	i |	|
                    |          i} | j        |g|R i |}t          |           dk    r||j                 }t          |          dk    r!||j        
                    |          |_        |S )
N
left_indexFright_indexcategory)includeright_onleft_onr   r+   )getindexdtypeselect_dtypescolumnsr-   pdCategoricalDtyper   union_categoricalsastyper   assignmergerE   )lhsresult_metar_   kwargsrhsrr   rs   empty_index_dtypecategorical_columnscolrk   rn   rz   outs                 r2   merge_chunkr      s    JC$L%00J**]E22K#)/%33J3GGO&& 	C 	CCDEczz3x

:t44444cior/BCC %9DczzC

9d33333cior/BCC &IEEE$52[[,,ell:.F.FG   dBH-- B $E 2 2CII$#*AAT[[-?-?'@AAC eRX.. C %U 3 3CII$#*BBU\\%-@-@'ABBC
#)C
)$
)
)
)&
)
)C
 3xx1}}+%&
 3xx1}}*6I$$%677	Jr4   Tc                   |                     dd          }||d<   ||d<   t          | |          \  \  } }}}t          ||t          |                   \  }}dt	          | |fi |z   } | j        j        |j        fi |}	|	|d<   t                      }
t          |          D ]\  }\  }}t          t          ||g|f|
||f<    t          j        ||
| |g          }t          |||	|          S )z1Join two partitioned dataframes along their indexhowrk   rr   rs   zjoin-indexed-r   dependencies)rx   rW   rj   rh   r
   _meta_nonemptyr   dictrI   r'   r   r$   from_collectionsr   )r   r   rr   rs   r   r   r=   rg   namemetadskrT   r]   bgraphs                  r2   merge_indexed_dataframesr   1  s   
**UF
#
#C%F<'F=#3C#=#= JS#	5y%#??IuXc399&999D#3#C$6AA&AAD F=
&&Cu%% > >	6Aqq!ff=T1I+D#S#JOOOEdI666r4   ro   _x_yFc
           
        |t                      }|dk    rddlm}
  |
| |||||||          S |t          | j        |j                  }t          | ||||	          }t          |||||	          }t          |t                    rd}d}nd}t          |t                    rd}d}nd}t          |||||||	          }t          | j
                  r| j        n| j        }t          |j
                  r|j        n|j        } |j        |fi |}t          |t                    rt          t          |          f}t          |t                    rt          t          |          f}||d
<   t!          t"          ||f|dddd|}|S )a  Join two DataFrames on particular columns with hash join

    This shuffles both datasets on the joined column and then performs an
    embarrassingly parallel join partition-by-partition

    >>> hash_join(lhs, 'id', rhs, 'id', how='left', npartitions=10)  # doctest: +SKIP
    Np2pr   )hash_join_p2p)r   rw   r   rv   r   npartitionssuffixes	indicator)r   r   
max_branchTFr   rw   rv   rr   rs   r   r   r   r   enforce_metadatatransform_divisionsalign_dataframes)r(   distributed.shuffler   rf   r   shuffle_funcr-   r   r   rE   r|   r   _metar   rH   rL   r   r   )r   rw   r   rv   r   r   r   r   r   r   r   lhs2rhs2rr   rs   r   	_lhs_meta	_rhs_metar   joineds                       r2   	hash_joinr   J  s   & /11%555555}#	
 	
 	
 		
 #/3?;;W+w:  D X;J  D '5!! 


(E""   F '*#+&6&6E""CII&)#+&6&6E""CII9?9////D'4   )w((D!! +%//* F=	 !	 	 	 	F Mr4   c                h    | j         j        |j         fi |}|                    d          p'|                    |                    d                    }|                    d          p'|                     |                    d                    }t	          |          dk    rw|r*|j                            | j        j                  |_        nK|r*|j                            |j        j                  |_        n|j                            d          |_        ||d<   |j        dk    r_|d	         t          v rP|r| j
        }n|r2t	          |j
                  t	          | j
                  k    r|j
        }nd
 | j
        D             }ny| j        dk    r_|d	         t          v rP|r|j
        }nU|r2t	          | j
                  t	          |j
                  k    r| j
        }n!d |j
        D             }nt          d          t          t          | |f|dddd|}t          |          |_
        |S )Nrs   rv   rr   rw   r   int64r   r>   r   c                    g | ]}d S r6   r+   r/   _s     r2   r3   z)single_partition_join.<locals>.<listcomp>  s    666!666r4   c                    g | ]}d S r6   r+   r   s     r2   r3   z)single_partition_join.<locals>.<listcomp>  s    777!777r4   z7single_partition_join has no fallback for invalid callsFr   )r   r   rx   _contains_index_namerE   ry   r   rz   r   allowed_leftr=   allowed_rightNotImplementedErrorr   r   rL   )rk   rn   r   r   use_left	use_rightr=   r   s           r2   single_partition_joinr     s`    %4$U%9DDVDDDzz-(( E,F,F

:- -H 

<(( D,E,E

9- -I 4yyA~~ 	4**4:+;<<DJJ 	4**5;+<==DJJ**733DJ F=A&-<"?"? 	7II 	73u//3t~3F3FFFII66t~666II		Q		6%=M#A#A 	8II 	8#dn--U_1E1EEEII77u777II!E
 
 	
 	 !	 	 	 	F Y''FMr4   c                    t          |t                    s|g}t          |t                    s|g}t           fd|D                       rst          fd|D                       rZ fdt          ||          D             }|r=t	          d|          }t          j        d                    |                     dS dS dS dS )zEChecks for merge column dtype mismatches and throws a warning (#4574)c              3  *   K   | ]}|j         v V  d S r6   r|   )r/   r   rk   s     r2   r:   z&warn_dtype_mismatch.<locals>.<genexpr>  s*      
2
233$,
2
2
2
2
2
2r4   c              3  *   K   | ]}|j         v V  d S r6   r   )r/   r   rn   s     r2   r:   z&warn_dtype_mismatch.<locals>.<genexpr>  s;       ; ;!$u}; ; ; ; ; ;r4   c                    g | ]H\  }}t          j        |         j        |                   +||fj        |         j        |         fIS r+   )r   dtypes)r/   lorork   rn   s      r2   r3   z'warn_dtype_mismatch.<locals>.<listcomp>  sb     
 
 
B!$+b/5<3CDD
"Xt{2R(89
 
 
r4   )zMerge columnsz
left dtypezright dtypezrMerging dataframes with merge column data type mismatches: 
{}
Cast dtypes explicitly to avoid unexpected results.N)r-   rH   rG   zipr   warningswarnformat)rk   rn   rw   rv   
dtype_mismcol_tbs   ``    r2   warn_dtype_mismatchr     s9    gt$$ )h%% :

2
2
2
2'
2
2
222 s ; ; ; ;(0; ; ; 8 8 
 
 
 
 
gx00
 
 

  	>
 F M0 &..       	 	r4   c                $   |||fD ]&}t          |t                    rt          d          '|s!|s|s|s|sfd| j        D             }|sdx}}|r
|s|s|x}}d }d}||vrt	          d| d| d          t          | t
          j        t
          j        f          rDt          t
          j        t
          j        f          rt          j        | ||||||||	
  
        S t          |           s4|r!|r| 
                    | |                   } d }d}t          | d	
          } t                    s4|r!|r
                    |                   d }d}t          d	
          |p|                     |          o| j        }|p                    |          oj        }|r|rt          | |||	||||	  	        S | j        d	k    r	|t           v sj        d	k    r!|t"          v rt%          | |||||||		  	        S |r	| j        r|r|rÉj        r|s| j        }j        }|                    |||||||||		  	        }|r5| j        r.t)          || j        ||          |                                 } n6|r4j        r-t)          | |j        ||          }                                 t/          t0          | |||||||||	|          S |r|rt3          | ||           d}t          |t4                    rt5          |          }d }n7t          |t6                    s"| t	          dt9          |           d          | j        j        k     rdnd}t;          | j        j                  }t=          | j        j                  }|dv rT|dv rP||k    rJ|durF|s|t?          j         |          |z  k     r)tC          | |r| j"        n||rj"        n|||
||	          S tG          | |r| j"        n||rj"        n|||
|||	|
  
        S )N7Dask collections not currently allowed in merge columnsc                &    g | ]}|j         v |S r+   r   )r/   crn   s     r2   r3   zmerge.<locals>.<listcomp>  s%    <<<Aem););a);););r4   T)rk   rn   rp   ro   rm   rl   z+dask.dataframe.merge does not support how='z'. Options are: zA. Note that 'leftanti' and 'leftsemi' are only dask_cudf options.)r   onrw   rv   rr   rs   r   r   r>   r   )r   r   r   rw   rv   rr   rs   )r   rv   rw   rr   rs   r   r   r   )
r   r   r   rw   rv   rr   rs   r   r   r   g      ?z9Optional `broadcast` argument must be float or bool.Type=z is not supported.rk   rn   )tasksr   Nro   rk   rn   F)r   )r   r   r   )$r-   r   r   r|   rF   r}   r   r   r   r	   	set_indexr   r   r8   r   r   r   r   r   r   r   r=   clear_divisionsr   r   r   floatbooltypere   rf   mathlog2broadcast_joinry   r   )rk   rn   r   r   rw   rv   rr   rs   r   r   r   r   r   	broadcastosupported_howmerge_indexed_leftmerge_indexed_right
left_emptyright_emptyr   broadcast_bias
bcast_siden_smalln_bigs    `                       r2   r   r     s;   " '8$  a   	%I  	  ,g ,h ,z ,+ ,<<<<<<< 	,'++J	 ' ( (OM
-P# P P} P P P
 
 	

 $BL122 
z	2<(8 8 
 x!#
 
 
 	
 d## 0 	7 	>>$w-00DGJ4Q///e$$ 2 	( 	OOE(O44EHKEq111 	8d//88

 
 	;u11(;; 

 
  S
1 S
'!#

 

 

 
	
 	A=  !!<$!#

 

 

 
	
 	q
 q
 q
 	q

 !q
 q
 (
*!#   

 

  		,$"6 		,*xW  E ''))DD  	,U%: 	,)gu
G  D ))++E!#
 
 
 	
"  	@x 	@eWh??? i'' 		"9--NIIIt,, 	1F <Y< < <    $/%2CCCVV
d&(9::D$e&788---111z!!&&$  
Wty'7'7.'HHH%",9DJJ'#.<EKKH'	 	 	 	 $1DJJ'&4EKKH!
 
 
 	
r4   c                `    t          |j                  dk    r| S |                    d          S Nr   r>   )rE   ry   tailrk   rn   s     r2   most_recent_tailr     s,    
5;1::a==r4   c                X    t          j        | |g                              |d          S )Nlastsubsetkeepr}   concatdrop_duplicatesrk   rn   bys      r2   most_recent_tail_summaryr     s)    9dE]##332F3KKKr4   c                    | j         j        dd         }|t          t          | |          S d|i}t          t          | |fi |S )zXFor each partition, returns the last row of the most recent nonempty
    partition.
    r   Nr   )r   ilocr   r   r   ddfr   emptyr   s       r2   compute_tailsr     R     IN1Q3E	z 0#u=== 8#uOOOOOr4   c                `    t          | j                  dk    r|S |                     d          S r   )rE   ry   headr   s     r2   most_recent_headr     s*    
4:!99Q<<r4   c                X    t          j        | |g                              |d          S )Nfirstr   r   r   s      r2   most_recent_head_summaryr    s)    9dE]##332G3LLLr4   c                    | j         j        dd         }|t          t          | |          S d|i}t          t          | |fi |S )zRFor each partition, returns the first row of the next nonempty
    partition.
    r   Nr   )r   r   r   r  r  r   s       r2   compute_headsr  
  r   r4   c                   g }t          |           dz
  t          |          dz
  }}d\  }}|dz   |k     r8||dz            | |         k    r#|dz  }|dz   |k     r||dz            | |         k    #g }||k     rft          dt          |dz
  |                    }|dk    r||         | |         k    r||         nd}	|dz   |k     rD||dz            | |dz            k     s!||dz            | |dz            k    r||dz
  k    r||dz            nd}
|                    ||	|
f           |dz   |k    s!|dz   |k     r||dz            | |dz            k    r|dz   n|}|dz   |k    s!|dz   |k     r| |dz            ||dz            k    r|dz   n|}||k    r|                    |           g }n1||dz
  k    r(||         | |         k    r|                    |           n||}}||k     f|S )zoReturns which partitions to pair for the merge_asof algorithm and the
    bounds on which to split them up
    r>   )r   rD   r   N)rE   rf   re   rJ   )rS   RrP   nmrT   rU   J	partitionlowerupperi1j1s                r2   pair_partitionsr    s?    Fq66A:s1vvzqADAq
a%!))!a%AaD((	Q a%!))!a%AaD((
A
a%%3q1ua==))	Q1Q4!A$;;!D 1uqyy1q5Aa!eH$$!a%Aa!eH(<(<a!e a!eHH 	 	 	
)UE*+++!eqjjQUQYY1QU8qQx3G3GQUUa!eqjjQUQYY1QU8qQx3G3GQUUa66MM!AA!a%ZZAbEAaDLLMM!21) a%%, Mr4   c                >   g }||                     |           |                     |           ||                     |           t          j        |          }t          j        | |fi |}|j        j        | j        j        k    r| j        j        |j        _        |S )zDmerge_asof but potentially adding rows to the beginning/end of right)rJ   r}   r   
merge_asofry   r   )rk   rn   prevr\   r   framesframerP   s           r2   merge_asof_paddedr  ;  s    Fd
MM%dIfE]411&11F|DJO++ JOMr4   c                .   t          j        d | D                       j        }g }| D ]/}|                    |                    |j                             0t          j        |          }t          j        |          }|                    |          }|S )zp
    Determine the unsorted column order.

    This should match the output of concat([frames], sort=False)
    c                    g | ]	}|j         
S r+   r   )r/   r  s     r2   r3   z(get_unsorted_columns.<locals>.<listcomp>R  s    ===UU[===r4   )	r}   r   r|   rJ   get_indexer_fornpconcatenater   take)r  new_columnsorderr  s       r2   get_unsorted_columnsr"  L  s     )==f===>>FKE A A[00??@@@@N5!!EIeEU##ELr4   c           
     f   t                      }dt          | |fi |z   }t          j        | j        |j        fi |}t          t          t          j        | j                            r0t          |j
        t          |          d          | j                  S t          t          t          j        |j                            rt          t          j        | |dd|          S | |g}d x}}|d         dv r,t          ||d                   }|                    |           |d         d	v r,t!          ||d                   }|                    |           t#          t%          | j        |j                            D ]\  }	}
g }|
D ]f\  }}}t&          j        | j        |	f||d
f}|	|j        |fnd }|	|j        |fnd }|                    t,          t.          ||j        |f||g|f           gt&          j        |f|||	f<   t3          j        |||          }t7          |||| j                  }|S )Nzasof-join-indexed-r   T)rn   rr   rs   r   	direction)backwardnearestright_by)r   )forwardr&  Fr   )r   r
   r}   r  r   rG   mapisnullr=   r   r   rE   r   r   r   rJ   r  rI   r  r   boundary_slicerK   r'   r  r   r$   r   r   )rk   rn   r   r   r   r   r   tailsheadsrT   r  r  rU   r  r  slicer   r  r   rP   s                       r2   merge_asof_indexedr/  ]  sq   
&&C(4"A"A&"A"AAD=,e.BMMfMMD
3ry$.))** Q49SYY[[1t?OPPPP
3ry%/**++ 

 M
 
 
 	
 %=LEEk555ez(:;;;E"""k444ez(:;;;E"""/$.%/JJKK 2 21  	 	OAue+dj!_eUERE','8EK##dD','8EK##dDMM%U[!,dD9	    ".&1T1I+D#LQQQE5$dn==FMr4   r%  c                   |dvrt          d          ||||||||	|
|||d}| |t          d          t          | t          j                  r-t          |t          j                  rt          j        | |fi |S |||t          d          |x}}||fD ]&}t          |t
                    rt          d          't          |           st          | d          } d x}x}}|V|r=| j	        r| j
        nd }| j        j        }|                                 } | j        d	         }|                     |d
          } t          |          st          |d          }||                    |||k    d
          }|||	t          d          |x|d<   |d<   ||	t          d          ||	t          d          |d= |d= |d= |d= d
x|d<   |d<   | j	        r|j	        st          d          t!          | |fi |}|s|rq|                                }|[||                    |d
|          }n |                    t$          j        |          }|                    t$          j        |          }|S )N)r%  r(  r&  zLInvalid merge_asof direction. Choose from 'backward' 'forward', or 'nearest')r   rw   rv   rr   rs   r   left_byr'  r   	toleranceallow_exact_matchesr$  zCannot merge_asof on NonezSCan only pass argument 'on' OR 'left_on' and 'right_on', not a combination of both.r   r>   r   r   T)sorted)dropr4  zSCan only pass argument 'by' OR 'left_by' and 'right_by', not a combination of both.r1  r'  z;Must specify both left_on and right_on if one is specified.r   rw   rv   r   rr   rs   z merge_asof input must be sorted!)r4  r=   )rF   r-   r}   r   r  r   r   r	   r   r8   r=   ry   r   reset_indexr|   r   r/  r   r&   rename_axis)rk   rn   r   rw   rv   rr   rs   r   r1  r'  r   r2  r3  r$  r   r   ixnameixcolrV   rP   s                       r2   r  r    sP   " :::'
 
 	
  "2 F |u}4555 $%% 4*UBL*I*I 4}T533F333	~("6'    (x   a   	%I  	
 d## 04Q///  F UT 	$%)%9C4>>tDZ_F##%%DLOE~~gd~33e$$ 2Eq11180CTRR	~("6e   243yF:.8/VWWWx/VWWWtfY'
);VD\377F<6-0 =u'< =;<<<e66v66F B( B##%%))%)MM..q{EBB**1=&AAFMr4   c                    t          t          t          t           |                               dk    rt          d          t	          j        | d|          S )Nr>   z,Concatenated DataFrames of different lengths)axisignore_order)rE   setr)  rF   r   r   )rM   r<  s     r2   concat_and_checkr>    sK    
3s3}}!##GHHH>#ALAAAAr4   c                B    dt            z    fdt           d         j                  D             }|                    di           t	          j        d  D             fddi|}t          j        |           }t          || d         j	                  S )	Nconcat-c                D    i | ]ft           fd D             fS )c                "    g | ]}|j         fS r+   )rK   )r/   r0   rT   s     r2   r3   z:concat_unindexed_dataframes.<locals>.<dictcomp>.<listcomp>  s    &C&C&C!}&C&C&Cr4   )r>  )r/   rT   rM   r<  r   s    @r2   
<dictcomp>z/concat_unindexed_dataframes.<locals>.<dictcomp>  sM        
q	$&C&C&C&Cs&C&C&C\R  r4   r   r<  c                    g | ]	}|j         
S r+   r  r9   s     r2   r3   z/concat_unindexed_dataframes.<locals>.<listcomp>  s    22228222r4   r;  r>   r   )
r
   ranger   updater   r   r$   r   r   r=   )rM   r<  r   r   r   r   r   s   ``    @r2   concat_unindexed_dataframesrG    s    x~%D     s1v)**  C MM><0111>22c222EEEfEED+D#CHHHEdCF,<===r4   rp   c                   dk    }                     d|i           t          j        d | D             f|d}d | D             t          |  \  }}}	dt	          g| R  z   fd|	D             }
dd	fd
t          |
          D             }|D ]}|                     |j                   t          |||          S )z7Concatenate indexed dataframes together along the indexr   r<  c                    g | ]	}|j         
S r+   r  r9   s     r2   r3   z-concat_indexed_dataframes.<locals>.<listcomp>  s       b   r4   )r;  joinfilter_warningc                6    g | ]}t          |j                  S r+   )r#   r   r9   s     r2   r3   z-concat_indexed_dataframes.<locals>.<listcomp>  s#    @@@b'11@@@r4   zconcat-indexed-c                D    g | ]}d  t          |          D             S )c                     g | ]\  }}||n|S r6   r+   )r/   r0   r   s      r2   r3   z8concat_indexed_dataframes.<locals>.<listcomp>.<listcomp>$  s$    MMMYRr~5MMMr4   )r   )r/   partemptiess     r2   r3   z-concat_indexed_dataframes.<locals>.<listcomp>#  sB        	NM#dG:L:LMMM  r4   TFc           
     B    i | ]\  }}|ft           j        |fS r+   )r   r   )	r/   rT   rO  r;  rK  rJ  r   r   uniforms	      r2   rC  z-concat_indexed_dataframes.<locals>.<dictcomp>+  sD       At 
q	GND$g~vV  r4   )rF  r   r   rW   r
   rI   daskr   )rM   r;  rJ  r<  r   r   r   rO   r=   rg   parts2r   r0   rP  rK  r   rR  s    `` `        @@@@r2   concat_indexed_dataframesrU    sb   19D
MM><0111>  C   	 
  D A@C@@@G-s3D)Ux3s3333D     F
 NG         ((  C   

27dD)444r4   c           	        |                     d|i           t          t          j        d | D             f|dd|          t	                    }dt          |   }i }d}g }	| D ]zt                    rmj                            j                  }
fd|
D             }|r=	                                |         
                    |         j                  |<   t                    rYt                    rJj        j        k    s9t          j        t          j                  s
                    j                  n	 |	                               	 t%          j                   d}n# t(          t*          f$ r d}Y nw xY wd}d}                                D ]2}|r||||f<   n!t.          t          j        ||gd|||g|f|||f<   |d	z  }3|t1          j        |||	
          }t5          |||          S )z8Concatenate partitions on axis=0 by doing a simple stackr<  c                    g | ]	}|j         
S r+   )r   r9   s     r2   r3   z$stack_partitions.<locals>.<listcomp>>  s    ---2R---r4   F)rJ  rK  r@  r   c                    g | ]E}|         j         |         j         k    t          |         j         t          j                  C|FS r+   )rz   r-   r}   r~   )r/   r   r0   r   s     r2   r3   z$stack_partitions.<locals>.<listcomp>P  sU       c7=DIO33"2c7="2EFF 4 333r4   Tr>   r   )rF  r"   r   r   r#   r
   r    r|   intersectioncopyr   r   r!   rz   r-   r}   r~   rJ   r   r   rF   	TypeError__dask_keys__r'   r$   r   r   )rM   r=   rJ  r<  r   r   r   r   rT   astyped_dfsshared_columnsneeds_astypematchrK  rR  keyr   r0   r   s                    @@r2   stack_partitionsrb  5  s   
 MM><0111-----	
 	
 	
 		
 	
 D %T**E%Xs^%%D
C	AK 2 2 R   	VZ44T\BBN    )  L  VWWYY#%l#3#:#:4;M;T#U#U< " 	."6"6 	8tz))*"-3 3) YYtz**2
	rx&&&EEI& 	 	 	EEE	 ##%% 
	 
	C !$T1I NS\1dG^D	"T1I FAA
	 +D#KPPPEdI666s   'E??FFc                ~    t           t                    st          d          t                     dk    rt	          d          t                     dk    rC|dk    r5t           d         t
                    r d                                         S  d         S |dvrt	          d          t          j        |          }	 d  D              n# t          $ r Y nw xY wd  D             }t                      |dk    rt          d	 |D                       rt           f|||d
|S t          |          t                     k    r[t          d  D                       rBt          d |D                       dk    r%|st          j        d           t           fd|i|S t	          d          t          d |D                       rt           fdt!          t                     dz
            D                       rBg } dd         D ]}	||	j        dd         z  }| d         j        z  }t%           |f||d|S |rt           f||d|S dgt'          d  D                       dz   z  }t%           |f||d|S dgt'          d  D                       dz   z  }t%           |f||d|S )a'  Concatenate DataFrames along rows.

    - When axis=0 (default), concatenate DataFrames row-wise:

      - If all divisions are known and ordered, concatenate DataFrames keeping
        divisions. When divisions are not ordered, specifying
        interleave_partition=True allows concatenate divisions each by each.

      - If any of division is unknown, concatenate DataFrames resetting its
        division to unknown (None)

    - When axis=1, concatenate DataFrames column-wise:

      - Allowed if all divisions are known.

      - If any of division is unknown, it raises ValueError.

    Parameters
    ----------
    dfs : list
        List of dask.DataFrames to be concatenated
    axis : {0, 1, 'index', 'columns'}, default 0
        The axis to concatenate along
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on other axis
    interleave_partitions : bool, default False
        Whether to concatenate DataFrames ignoring its order. If True, every
        divisions are concatenated each by each.
    ignore_unknown_divisions : bool, default False
        By default a warning is raised if any input has unknown divisions.
        Set to True to disable this warning.
    ignore_order : bool, default False
        Whether to ignore order when doing the union of categoricals.

    Notes
    -----
    This differs in from ``pd.concat`` in the when concatenating Categoricals
    with different categories. Pandas currently coerces those to objects
    before concatenating. Coercing to objects is very expensive for large
    arrays, so dask preserves the Categoricals by taking the union of
    the categories.

    Examples
    --------
    If all divisions are known and ordered, divisions are kept.

    >>> import dask.dataframe as dd
    >>> a                                               # doctest: +SKIP
    dd.DataFrame<x, divisions=(1, 3, 5)>
    >>> b                                               # doctest: +SKIP
    dd.DataFrame<y, divisions=(6, 8, 10)>
    >>> dd.concat([a, b])                               # doctest: +SKIP
    dd.DataFrame<concat-..., divisions=(1, 3, 6, 8, 10)>

    Unable to concatenate if divisions are not ordered.

    >>> a                                               # doctest: +SKIP
    dd.DataFrame<x, divisions=(1, 3, 5)>
    >>> b                                               # doctest: +SKIP
    dd.DataFrame<y, divisions=(2, 3, 6)>
    >>> dd.concat([a, b])                               # doctest: +SKIP
    ValueError: All inputs have known divisions which cannot be concatenated
    in order. Specify interleave_partitions=True to ignore order

    Specify interleave_partitions=True to ignore the division order.

    >>> dd.concat([a, b], interleave_partitions=True)   # doctest: +SKIP
    dd.DataFrame<concat-..., divisions=(1, 2, 3, 5, 6)>

    If any of division is unknown, the result division will be unknown

    >>> a                                               # doctest: +SKIP
    dd.DataFrame<x, divisions=(None, None)>
    >>> b                                               # doctest: +SKIP
    dd.DataFrame<y, divisions=(1, 4, 10)>
    >>> dd.concat([a, b])                               # doctest: +SKIP
    dd.DataFrame<concat-..., divisions=(None, None, None, None)>

    By default concatenating with unknown divisions will raise a warning.
    Set ``ignore_unknown_divisions=True`` to disable this:

    >>> dd.concat([a, b], ignore_unknown_divisions=True)# doctest: +SKIP
    dd.DataFrame<concat-..., divisions=(None, None, None, None)>

    Different categoricals are unioned

    >>> dd.concat([
    ...     dd.from_pandas(pd.Series(['a', 'b'], dtype='category'), 1),
    ...     dd.from_pandas(pd.Series(['a', 'c'], dtype='category'), 1),
    ... ], interleave_partitions=True).dtype
    CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)
    z/dfs must be a list of DataFrames/Series objectsr   zNo objects to concatenater>   )ro   rp   z!'join' must be 'inner' or 'outer'c                T    g | ]%}t          t          |j                            #|&S r+   )r   rE   r|   r9   s     r2   r3   zconcat.<locals>.<listcomp>  s-    999b4BJ#8#89r999r4   c                <    g | ]}t          |t                    |S r+   r,   r9   s     r2   r3   zconcat.<locals>.<listcomp>  s'    888BB!7!78R888r4   c              3  $   K   | ]}|j         V  d S r6   r7   r9   s     r2   r:   zconcat.<locals>.<genexpr>  %      22br!222222r4   )r;  rJ  r<  c              3  &   K   | ]}|j          V  d S r6   r7   r9   s     r2   r:   zconcat.<locals>.<genexpr>  s(      99r**999999r4   c                    h | ]	}|j         
S r+   r   r9   s     r2   	<setcomp>zconcat.<locals>.<setcomp>  s    444R^444r4   zConcatenating dataframes with unknown divisions.
We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.r<  zGUnable to concatenate DataFrame with unknown division specifying axis=1c              3  $   K   | ]}|j         V  d S r6   r7   r9   s     r2   r:   zconcat.<locals>.<genexpr>  rg  r4   c              3  n   K   | ]/}|         j         d          |dz            j         d         k     V  0dS )rD   r>   r   Nr<   )r/   rT   rM   s     r2   r:   zconcat.<locals>.<genexpr>  sX         A $s1q5z';A'>>     r4   NrD   )rJ  r<  c              3  $   K   | ]}|j         V  d S r6   r   r9   s     r2   r:   zconcat.<locals>.<genexpr>+  s$      )G)GR".)G)G)G)G)G)Gr4   c              3  $   K   | ]}|j         V  d S r6   r   r9   s     r2   r:   zconcat.<locals>.<genexpr>0  s$      %C%Cbn%C%C%C%C%C%Cr4   )r-   rH   r[  rE   rF   r   to_framer   _validate_axisAttributeErrorr   rG   rU  r   r   rG  rE  r=   rb  sum)
rM   r;  rJ  interleave_partitionsignore_unknown_divisionsr<  r   dasksr=   r0   s
   `         r2   r   r     s   L c4   KIJJJ
3xx1}}4555
3xx1}}199CFF339q6??$$$q6M%%%<===#D))D99C999    98#888E
S
!
!Cqyy22E22222 	,T HN   JJ#c((""99S99999 #44e44455::+    /sXXXQWXXX-  
 22E22222 	    s3xx!|,,      	crc( 3 3Bcrc!22IISW..	')-L LR   ' 0" AG   "Fc)G)G3)G)G)G&G&G!&KL	')-L LR   #%C%Cs%C%C%C"C"Ca"GHI#Y%) HN  s   C 
C C c                     d t          |t                    rt           fd|D                       S   |          S )z
    Test whether ``columns_or_index`` contains a reference
    to the index of ``df

    This is the local (non-collection) version of
    ``dask.core.DataFrame._contains_index_name``.
    c                    | j         j        d uoKt          j        |          pt	          |t
                    o"|| j         j        k    o|t          | dd          vS )Nr|   r+   )ry   r   r  isscalarr-   rL   getattr)xra  s     r2   _is_index_level_referencez7_contains_index_name.<locals>._is_index_level_reference?  s`    GL$ 5S!!;ZU%;%;5qw|#5 71i444		
r4   c              3  0   K   | ]} |          V  d S r6   r+   )r/   r
  r{  r0   s     r2   r:   z'_contains_index_name.<locals>.<genexpr>H  s1      NN,,R33NNNNNNr4   )r-   rH   any)r0   columns_or_indexr{  s   ` @r2   r   r   6  sc    
 
 
 "D)) ?NNNNN=MNNNNNN((-=>>>r4   c                     d t          |t                    r|n|g} fd|D             } |         }t           |          r|                     j                  }|S )a'  
    Returns a DataFrame with columns corresponding to each
    column or index level in columns_or_index.  If included,
    the column corresponding to the index level is named _index.

    This is the local (non-collection) version of
    ``dask.core.DataFrame._select_columns_or_index``.
    c                f    t          j        |          st          |t                    o|| j        v S r6   )r  rx  r-   rL   r|   )r0   ra  s     r2   _is_column_label_referencez<_select_columns_or_index.<locals>._is_column_label_referenceW  s-    C  :JsE$:$:Qrz@QQr4   c                ,    g | ]} |          |S r+   r+   )r/   r
  r  r0   s     r2   r3   z,_select_columns_or_index.<locals>.<listcomp>_  s-    UUU!3M3MbRS3T3TUAUUUr4   )_index)r-   rH   r   r   ry   )r0   r~  column_namesselected_dfr  s   `   @r2   _select_columns_or_indexr  M  s    R R R
 ''7>>VEUDV  VUUUU/UUUL\"KB 011 :!(((99r4   c           	        t          |t                    rt          j        |          }t          |t                    s$t
          j        j                            |          rt          |t                    r|gnt          |          }t          |          }|                    t          | j                            |k    r/t          | |         d          }||z  }t          | ||d          S t          |t                    st!          | |          }t#          ||          }|                     |          }t'          |dgd||d|          S )z
    Split-by-hash a DataFrame into `nsplits` groups.

    Hashing will be performed on the columns or index specified by `on`.
    F)ry   )ignore_index)_partitionsr  r   )r-   bytespickleloadsstrr}   apitypesis_list_likerH   r=  rY  r|   r   r   r   r  r   r   r   )r0   r   nsplitsnsetind
partitionsdf2s          r2   _split_partitionr  i  sD    "e \""c Nbfl77;; N  C((6bTTd2hh2wwS__--55&r"vU;;;C-C'CuMMMM b&!! .%b"--#B00J
))
)
+
+C		  r4   c                >    t          | d          }d|j        v r|d= |S )z0Concat and remove temporary "_partitions" columnFr  )r   r|   )rM   r0   s     r2   _concat_wrapperr    s+    	e		B
""}Ir4   c                 R    t          | i d |                                D             S )Nc                l    i | ]1\  }}|t          |t                    rt          j        |          n|2S r+   )r-   r  r  r  )r/   kvs      r2   rC  z(_merge_chunk_wrapper.<locals>.<dictcomp>  sF     
 
 
BF!QA*Q"6"6=v|AA
 
 
r4   )r   items)r_   r   s     r2   _merge_chunk_wrapperr    s@    	
 
JP,,..
 
 
  r4   c
           	        |r=| j         |j         k     r|                    |          }n|                     |          } |dvrt          d          |dk    r| j         |j         k     rt          d          |dk    r|j         | j         k    rt          d          |dk    rZ| j         |j         k     r%t          | |d	
          }
|
j        }|
}|j        }|}n7t          ||d	
          }| j        }| }|j        }|}n| j        }| }|j        }|}t          |t                    rd}d}nd}t          |t                    rd}d}nd}t          |||||||          } | j        j	        |j        fi |}||d<   | j         |j         k     r-|j         }|j
        }t          |j        j        j                  }n,| j         }| j
        }t          | j        j        j                  }|t          |j        j                  k    r	dg|dz   z  }t          | ||fi |}d|z   }t          |||| j         ||j         fd|	i|}t!          j        ||||g          }t%          ||||          S )a  Join two DataFrames on particular columns by broadcasting

    This broadcasts the partitions of the smaller DataFrame to each
    partition of the larger DataFrame, joins each partition pair,
    and then concatenates the new data for each output partition.
    r   r   z?Only 'inner', 'left' and 'right' broadcast joins are supported.rk   z-'left' broadcast join requires rhs broadcast.rn   z.'right' broadcast join requires lhs broadcast.ro   r   r   NTFr   r   r>   zbcast-join-	parts_outr   )r   rA   rF   r   rK   r-   r   r   r   r   r=   r=  ry   namesr
   r%   r$   r   r   )r   rw   r   rv   r   r   r   r   r   r  r   lhs_namelhs_deprhs_namerhs_depr   rr   rs   merge_kwargsr   r=   _index_namestokenr   broadcast_join_layerr   s                             r2   r   r     s   &  ;?S_,,//k/::CC//k/::C
,,,M
 
 	
 f}}3?::HIII
g~~#/S_<<IJJJ g~~ ?S_,,  D
 zHGyHGG  D
 yHGzHGG99'5!! 


(E""   L $3#C$6GG,GGD"&L
 ((oM	3-39::oM	3-39:: s4:+,,,,FkAo.	S#{;;l;;E5 D-	 	 	 	 	 +w'  E dI666r4   c                   t          |           }|||||d}|dk    r| d         S |dk    r | d         j        | d         fddi|}|S |dz  }	t          t          | d|	         fi |t          | |	d         fi |gfi |}|S )am  
    Schedule the merging of a list of dataframes in a pairwise method. This is a recursive function that results
    in a much more efficient scheduling of merges than a simple loop
    from:
    [A] [B] [C] [D] -> [AB] [C] [D] -> [ABC] [D] -> [ABCD]
    to:
    [A] [B] [C] [D] -> [AB] [CD] -> [ABCD]
    Note that either way, n-1 merges are still required, but using a pairwise reduction it can be completed in parallel.
    :param dataframes_to_merge: A list of Dask dataframes to be merged together on their index
    :return: A single Dask Dataframe, comprised of the pairwise-merges of all provided dataframes
    )r   lsuffixrsuffixr   r   r>   r   rd   r   rp   N)rE   rJ  _recursive_pairwise_outer_join)
dataframes_to_merger   r  r  r   r   number_of_dataframes_to_mergemerge_options
merged_ddfmiddle_indexs
             r2   r  r  -  s    %((;$<$<! " M %))"1%% %))0(+0"
 
(/
3@
 

  593.'6 :G  /'6 :G 	

 

 

 


 r4   r6   )TT)ro   Nr   NFN)ro   NNNFFr   FNNNN)NN)NNNFFNNNr   NTr%  )F)r   rp   F)rp   F)r   rp   FFF)`__doc__
__future__r   r   r  r   	functoolsr   r   numpyr  pandasr}   pandas.api.typesr   tlzr   r   	dask.baser	   r
   dask.dataframer   dask.dataframe.corer   r   r   r   r   r   r   r   r   r   r   dask.dataframe.dispatchr   r   dask.dataframe.ior   dask.dataframe.shuffler   r   r   r   dask.dataframe.utilsr   r   r    r!   r"   r#   dask.highlevelgraphr$   dask.layersr%   
dask.utilsr&   r'   r(   rW   r`   rj   rh   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r"  r/  r  r>  rG  rU  rb  r   r   r  r  r  r  r   r  r+   r4   r2   <module>r     s  6 6n # " " " " "    $ $ $ $ $ $ $ $         + + + + + + $ $ $ $ $ $ $ $ 2 2 2 2 2 2 2 2 " " " " " "                          O N N N N N N N ) ) ) ) ) )                           / . . . . . * * * * * * > > > > > > > > > >8* 8* 8*v  $* * * *f CSV  9"; ; ;|7 7 7 7,  	Z Z Z Zz6 6 6r  > rx 	a
 a
 a
 a
R  L L L L
P 
P 
P 
P  M M M M
P 
P 
P 
P! ! !H   "  "0 0 0f r} e e e eZB B B B> > > > 5  5  5  5FK7 K7 K7 K7` 
	"p p p pf? ? ?.  8" " "J     	K7 K7 K7 K7\1 1 1 1 1r4   