import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)

from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X`, and
        sorting has an `n log(n)` time complexity, it is recommended to use
        subsampling on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_ + 1,)``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int_
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

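    For example, with ``bin_edges_[i] = [-2., -1., 0., 1.]`` (three bins), only
    the interior edges ``[-1., 0.]`` act as thresholds during ``transform``:
    a value of ``-5`` still falls into the first bin and a value of ``7`` into
    the last one.
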
    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

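    For instance, a minimal sketch (the column indices ``[0, 1]`` below are
    arbitrary) that bins only the first two columns and passes the remaining
    ones through unchanged::

        from sklearn.compose import ColumnTransformer

        ct = ColumnTransformer(
            [("bins", KBinsDiscretizer(n_bins=3, encode="ordinal"), [0, 1])],
            remainder="passthrough",
        )
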
    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

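    A minimal sketch of dropping such constant one-hot columns in a pipeline::

        from sklearn.feature_selection import VarianceThreshold
        from sklearn.pipeline import make_pipeline

        pipe = make_pipeline(
            KBinsDiscretizer(encode="onehot-dense"), VarianceThreshold()
        )
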
    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.
            Only possible when `strategy` is set to `"quantile"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be "
                "used with strategy='uniform'. Got strategy="
                f"{self.strategy!r} instead."
            )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                "In version 1.5 onwards, subsample=200_000 "
                "will be used by default. Set subsample explicitly to "
                "silence this warning in the mean time. Set "
                "subsample=None to disable subsampling explicitly.",
                FutureWarning,
            )

        subsample = self.subsample
        if subsample == "warn":
            subsample = 200_000 if self.strategy == "quantile" else None

        if subsample is not None and n_samples > subsample:
            # Subsample the training data to keep the quantile computation tractable.
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
            X = _safe_indexing(X, subsample_idx)

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width are too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width are too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # Validate the input and copy it so that binning can be done in place.
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            # Only the interior edges act as thresholds; values outside the
            # fitted range fall into the first or last bin.
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            # Each bin identifier is mapped back to the center of its bin.
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features