
    
f                     :    d dl mZ d dlmZ d dlZd dlmZ ddZdS )    )cuda)driverN)numpy_supportc           	         t          | dd          }|sT| j        \  }}| j        j        |z  | j        j        f}t          j        j                            ||f|| j        |          }t          j	        | j                  t          j                    j        }t          t          j        dt          j        |d          dz                      }t          ||z            }||dz   ft          j        fd            }	t          |j        d         |z  dz             t          |j        d         |z  dz             f}
||f} |	|
||f         | |           |S )a  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    streamr   )dtyper         c                 8   t           j                            
	          }t           j        j        }t           j        j        }t           j        j        t           j        j        z  }t           j        j        t           j        j        z  }||z   }||z   }||z   | j        d         k     r)||z   | j        d         k     r| ||z   ||z   f         |||f<   t          j	                     ||j        d         k     r"||j        d         k     r|||f         |||f<   d S d S d S )N)shaper   r   r
   )
r   sharedarray	threadIdxxyblockIdxblockDimr   syncthreads)inputoutputtiletxtybxbyr   r   dt
tile_shapes            <lib/python3.11/site-packages/numba/cuda/kernels/transpose.pykernelztranspose.<locals>.kernel)   s    {  z <<^^]_t}.]_t}.GG7U[^##R%+a.(@(@ b"r'!12DRLv|A1v|A#6#6B<F1a4LLL #6#6    )getattrr   r   itemsizer   cudadrvdevicearrayDeviceNDArraynps
from_dtyper   
get_deviceMAX_THREADS_PER_BLOCKintmathpowlogjit)abr   colsrowsstridestpb
tile_widthtile_heightr   blocksthreadsr   r   s               @@r   	transposer9      su    Q!$$F W
d'"T)17+;;L$224L'	 3   
	 	 B




3CTXa#q!1!1A!56677JcJ&''KzA~.J	X( ( ( ( ( X($ k)A-..AGAJ4Ka4O0P0PPF:%G#F67F"#Aq)))Hr    )N)	numbar   numba.cuda.cudadrv.driverr   r+   numba.npr   r&   r9    r    r   <module>r>      sb          , , , , , ,  ) ) ) ) ) ): : : : : :r    