
    +gd                     F   d dl Zd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlZddlmZ er*ddlmZ 	 d dlmZ n# e$ r Y nw xY w	 d dlZn# e$ r Y nw xY wej                            d          duZej                            d	          duZ ej        e          Z G d
 de          Z  G d de
          Z! G d de
          Z" G d de
          Z# G d de
          Z$ G d d          Z% G d de%          Z& G d de%          Z' G d d          Z(dS )    N)PurePath)TYPE_CHECKINGDictList
NamedTupleOptionalUnion   )logging)DatasetElasticsearchelasticsearchfaissc                       e Zd ZdS )MissingIndexN)__name__
__module____qualname__     /lib/python3.11/site-packages/datasets/search.pyr   r   "   s        Dr   r   c                   <    e Zd ZU ee         ed<   ee         ed<   dS )SearchResultsscoresindicesNr   r   r   r   float__annotations__intr   r   r   r   r   &   s2         K#Yr   r   c                   T    e Zd ZU eee                  ed<   eee                  ed<   dS )BatchedSearchResultstotal_scorestotal_indicesNr   r   r   r   r"   r"   +   s;         tE{####S	?"""""r   r"   c                   0    e Zd ZU ee         ed<   eed<   dS )NearestExamplesResultsr   examplesNr   r   r   r   r   r   dictr   r   r   r&   r&   0   s)         KNNNNNr   r&   c                   H    e Zd ZU eee                  ed<   ee         ed<   dS )BatchedNearestExamplesResultsr#   total_examplesNr(   r   r   r   r+   r+   5   s7         tE{####Jr   r+   c                       e Zd ZdZddedefdZddedefdZde	e
ef         fdZede	e
ef         dd fd	            Zd
S )	BaseIndexzBase class for indexing
   kreturnc                     t           )z
        To implement.
        This method has to return the scores and the indices of the retrieved examples given a certain query.
        NotImplementedError)selfqueryr0   kwargss       r   searchzBaseIndex.search=   s
    
 "!r   c                     g g }}|D ]E}|                      ||          \  }}|                    |           |                    |           Ft          ||          S )a  Find the nearest examples indices to the query.

        Args:
            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
            k (`int`): The number of examples to retrieve per query.

        Ouput:
            total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        )r8   appendr"   )	r5   queriesr0   r7   r#   r$   r6   r   r   s	            r   search_batchzBaseIndex.search_batchD   sn     ')"m 	* 	*E"kk%33OFG'''  ))))#L-@@@r   filec                     t           )zSerialize the index on diskr3   )r5   r=   s     r   savezBaseIndex.saveV   s    !!r   c                     t           )zDeserialize the index from diskr3   )clsr=   s     r   loadzBaseIndex.loadZ   s
     "!r   Nr/   )r   r   r   __doc__r    r   r8   r"   r<   r	   strr   r?   classmethodrB   r   r   r   r.   r.   :   s        !!" "s "m " " " "A As A>R A A A A$"sH}- " " " " "c8m, " " " " [" " "r   r.   c                       e Zd ZdZ	 	 	 	 	 ddee         dee         ded         dee         dee         f
d	Zdd
e	e
e         df         dee         fdZddedefdZddedefdZdS )ElasticSearchIndexa/  
    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
    An Elasticsearch server needs to be accessible, and a python client is declared with
    ```
    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
    ```
    for example.
    Nhostport	es_clientr   es_index_namees_index_configc                    t           st          d          |||t          d          |pd}|pd}dd l}ddlm} ||n ||t          |          dg          | _        ||n7dt          j	        
                    t          j                    j                  z   | _        ||nd	d
ddddiidddddddiid| _        d S )Nz}You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`zBPlease specify either `es_client` or `(host, port)`, but not both.	localhosti#  r   r   )rI   rJ   huggingface_datasets_r
   analyzerstop_standardstandard	_english_)typez
 stopwords)number_of_shardsanalysis
propertiestextBM25)rU   rQ   
similarity)settingsmappings)_has_elasticsearchImportError
ValueErrorelasticsearch.helpersr   r   rE   rK   ospathbasenametempfileNamedTemporaryFilenamerL   rM   )r5   rI   rJ   rK   rL   rM   r   r   s           r   __init__zElasticSearchIndex.__init__j   sH    " 	 P    d&6$:Jabbb"{|t$$$$//////&/&;Y]gjkogpgpPqPqOrAsAs ( M(27+;+;H<W<Y<Y<^+_+__ 	 * O )*!+o
bm?n?n-o p  *FVQ[kq4r4r+st  	r   	documentsr   columnc                 b   | j         }| j        }| j        j                            ||           t                    }t          j        d|t          j                               }d}fd}ddl	}	|	j
                            | j        | |                      D ]\  }
}|                    d           ||
z  } |t                    k    r=t                              d	t                    |z
   d
t                                t                              d|dd           dS )z
        Add documents to the index.
        If the documents are inside a certain column, you can specify it using the `column` argument.
        indexbodydocs)unittotaldisabler   c               3      K   $t                    D ]\  } }|         | dV  d S t                    D ]\  } }|| dV  d S )N)rY   _id)	enumerate)iexamplerj   ri   s     r   passage_generatorz;ElasticSearchIndex.add_documents.<locals>.passage_generator   s      !"+I"6"6 > >JAw#*6?1======> > #,I"6"6 6 6JAw#*15555556 6r   N)clientrm   actionsr
   z>Some documents failed to be added to ElasticSearch. Failures: /zIndexed dz
 documents)rL   rM   rK   r   createlenr   tqdmis_progress_bar_enabledr   helpersstreaming_bulkupdateloggerwarninginfo)r5   ri   rj   
index_nameindex_confignumber_of_docsprogress	successesrx   esokactions    ``         r   add_documentsz ElasticSearchIndex.add_documents   sl   
 '
+%%J\%JJJY<V>wOnOpOpKpqqq		6 	6 	6 	6 	6 	6 	#"""*33>%%'' 4 
 
 	 	JB
 OOAOIII&&NN|QTU^Q_Q_`iQi||lopylzlz||   	6y666677777r   r/   r6   r1   c           	           | j         j        d
| j        d|dgddi|dd|}|d         d         }t          d |D             d	 |D                       S )am  Find the nearest examples indices to the query.

        Args:
            query (`str`): The query as a string.
            k (`int`): The number of examples to retrieve.

        Ouput:
            scores (`List[List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[List[int]]`): The indices of the retrieved examples.
        multi_matchrY   cross_fields)r6   fieldsrU   )r6   sizerl   hitsc                     g | ]
}|d          S )_scorer   .0hits     r   
<listcomp>z-ElasticSearchIndex.search.<locals>.<listcomp>   s    <<<c(m<<<r   c                 8    g | ]}t          |d                    S )rt   )r    r   s     r   r   z-ElasticSearchIndex.search.<locals>.<listcomp>   s#    >_>_>_SVs3u:>_>_>_r   r   )rK   r8   rL   r   )r5   r6   r0   r7   responser   s         r   r8   zElasticSearchIndex.search   s     )4>( 
$)UvhXf+g+ghrstt
 
 
 

 '<<t<<<>_>_Z^>_>_>_```r   r0   c                     dd l }d gt          |          z  d gt          |          z  }}|j                            |          5  fdt	          |          D             }|j                            |          D ]2}	||	         }
|	                                }|j        ||
<   |j        ||
<   3	 d d d            n# 1 swxY w Y   t          ||          S )Nr   )max_workersc                 B    i | ]\  }} j         j        |fi |S r   )submitr8   )r   rv   r6   executorr0   r7   r5   s      r   
<dictcomp>z3ElasticSearchIndex.search_batch.<locals>.<dictcomp>   s=    vvvW_WXZ_xt{E1OOOOQRvvvr   )r$   r#   )
concurrent.futuresr~   futuresThreadPoolExecutorru   as_completedresultr   r   r"   )r5   r;   r0   r   r7   
concurrentr#   r$   future_to_indexfuturerm   resultsr   s   ` ` `       @r   r<   zElasticSearchIndex.search_batch   s9   !!!!'+fs7||&;dVc'll=Rm22{2KK 	7xvvvvvvvclmtcucuvvvO$,99/JJ 7 7'/)/&-nU#'.e$$	7	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 $-l[[[[s   
A,CC
C)NNNNNNrC   )r/   r/   )r   r   r   rD   r   rE   r    r)   rh   r	   r   r   r   r8   r"   r<   r   r   r   rH   rH   `   s(         #"/3'+*.$
 $
sm$
 sm$
 O,	$

  }$
 "$$
 $
 $
 $
L"8 "8uT#Y	-A'B "8HUXM "8 "8 "8 "8Ha aC aM a a a a&\ \s \Nb \ \ \ \ \ \r   rH   c                   
   e Zd ZdZ	 	 	 	 ddeeeee         f                  dee         dee         ded         fdZ		 	 	 	 dd
ee
j        df         dee         dedee         dee         f
dZeddddeeeee         f                  ddfd            Zd de
j        defdZd de
j        defdZddeeef         dee         fdZe	 	 d!deeef         deeeee         f                  dee         dd fd            ZdS )"
FaissIndexa  
    Dense index using Faiss. It is used to index vectors.
    Faiss is a library for efficient similarity search and clustering of dense vectors.
    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
    You can find more information about Faiss here:
    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
    Ndevicestring_factorymetric_typecustom_indexfaiss.Indexc                     ||t          d          ||t          d          || _        || _        || _        || _        t
          st          d          dS )a$  
        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
        NzFPlease specify either `string_factory` or `custom_index` but not both.zsCannot pass both 'custom_index' and 'device'. Pass 'custom_index' already transferred to the target device instead.a{  You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available.)r`   r   r   r   faiss_index
_has_faissr_   )r5   r   r   r   r   s        r   rh   zFaissIndex.__init__   s     %,*Befff,":X   ,&' 	T  	 	r     vectorsr   rj   
batch_size
train_sizefaiss_verbosec                    ddl }| j        |t          |d                   nt          |d         |                   }| j        ;| j         |j        || j                  }nK |j        || j        | j                  }n.| j         |j        |          }n |j        || j                  }|                     || j                  | _        t          
                    dt          | j                              ||| j        _        t          | j        d          r| j        j        || j        j        _        t          | j        d          r| j        j        || j        j        _        t          | j        d          r| j        j        || j        j        _        |b|
|d|         n|d|         |         }	t          
                    dt          |	           d           | j                            |	           nt          
                    d	           t          
                    d
t          |           d           t%          j        t)          dt          |          |          t%          j                               D ]>}
|||
|
|z            n||
|
|z            |         }| j                            |           ?dS )z
        Add vectors to the index.
        If the arrays are inside a certain column, you can specify it using the `column` argument.
        r   NzCreated faiss index of type rm   	quantizerclustering_indexz"Training the index with the first z vectorszEIgnored the training step of the faiss index as `train_size` is None.zAdding z vectors to the faiss index)rr   )r   r   r~   r   r   index_factory	IndexFlat_faiss_index_to_devicer   r   r   rU   verbosehasattrrm   r   r   trainr   r   ranger   add)r5   r   rj   r   r   r   r   r   rm   
train_vecsrv   vecss               r   add_vectorszFaissIndex.add_vectors   s    	 #&,n3wqz???#gaj>P:Q:QD".#+/E/d6IJJEE/E/d6I4K[\\EE#++EOD11EE+EOD$2BCCE#::5$+NNDKKOtD<L7M7MOOPPP $'4D$t'11 ?d6F6L6X1> &.t'55 C$:J:T:`5B *2t');<< JAQAbAn<I 19 !17*--W[j[EYZ`EaJKKVS__VVVWWW"":....KK_``` 	Gc'llGGGHHHeAs7||Z@@gNmNoNoJoppp 	' 	'A28.71q:~-..gaRSV`R`N`FabhFiD  &&&&	' 	'r   rm   r1   c                 p   || S ddl }t          |t                    r9|dk    r" |j                    } |j        |||           } np |j        |           } n_t          |t          t          f          r  |j        | t          |                    } n#t          dt          |           ddz             | S )z
        Sends a faiss index to a device.
        A device can either be a positive integer (GPU id), a negative integer (all GPUs),
            or a list of positive integers (select GPUs to use), or `None` for CPU.
        Nr   )gpuszThe argument type: z is not expected. zZPlease pass in either nothing, a positive int, a negative int, or a list of positive ints.)r   
isinstancer    StandardGpuResourcesindex_cpu_to_gpuindex_cpu_to_all_gpuslisttupleindex_cpu_to_gpus_list	TypeErrorrU   )rm   r   r   	faiss_ress       r   r   z!FaissIndex._faiss_index_to_device4  s     >L fc"" 	{{6E688	..y&%HH 43E::u.. 	0E0T&\\JJJEEFd6llFFFno  
 r   r/   r6   c                    t          |j                  dk    r8t          |j                  dk    s|j        d         dk    rt          d          |                    dd          }|j        j        st          j        |d          } | j        j	        ||fi |\  }}t          |d         |d                             t                              S )aw  Find the nearest examples indices to the query.

        Args:
            query (`np.array`): The query as a numpy array.
            k (`int`): The number of examples to retrieve.

        Ouput:
            scores (`List[List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[List[int]]`): The indices of the retrieved examples.
        r
      r   zHShape of query is incorrect, it has to be either a 1D array or 2D (1, N)r   Corder)r~   shaper`   reshapeflagsc_contiguousnpasarrayr   r8   r   astyper    )r5   r6   r0   r7   r;   r   r   s          r   r8   zFaissIndex.searchV  s     u{q  c%+&6&6!&;&;u{1~QR?R?Rghhh--2&&}) 	5j444G1$*1'1GGGGVAY
(9(9#(>(>???r   r;   c                    t          |j                  dk    rt          d          |j        j        st          j        |d          } | j        j        ||fi |\  }}t          ||
                    t                              S )a  Find the nearest examples indices to the queries.

        Args:
            queries (`np.array`): The queries as a numpy array.
            k (`int`): The number of examples to retrieve.

        Ouput:
            total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        r   zShape of query must be 2Dr   r   )r~   r   r`   r   r   r   r   r   r8   r"   r   r    )r5   r;   r0   r7   r   r   s         r   r<   zFaissIndex.search_batchj  s     w}""8999}) 	5j444G1$*1'1GGGG#FGNN3,?,?@@@r   r=   storage_optionsc           	      ~   ddl }| j        =t          | j        t          t          t
          f          r |j        | j                  }n| j        }t          j	        t          |          dfi |pi 5 } |j        | |j         |j        |j                                       ddd           dS # 1 swxY w Y   dS )z Serialize the FaissIndex on diskr   Nwb)r   r   r   r    r   r   index_gpu_to_cpur   fsspecopenrE   write_indexBufferedIOWriterPyCallbackIOWriterwrite)r5   r=   r   r   rm   fs         r   r?   zFaissIndex.save|  s   ;"z$+T5?Q'R'R"*E*4+;<<EE$E[TDDD_-BDD 	`Ee%;U%;<TE<TUVU\<]<]%^%^___	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	` 	`s   23B22B69B6c                 <   ddl } | |          }t          j        t          |          dfi |pi 5 } |j         |j         |j        |j                                      }ddd           n# 1 swxY w Y   |                    ||j	                  |_
        |S )z$Deserialize the FaissIndex from diskr   N)r   rb)r   r   r   rE   
read_indexBufferedIOReaderPyCallbackIOReaderreadr   r   r   )rA   r=   r   r   r   r   r   rm   s           r   rB   zFaissIndex.load  s     	 c((([TDDD_-BDD 	_$E$%;U%;<TE<TUVU[<\<\%]%]^^E	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_ 	_"-"D"DUKL^"_"_s   2A11A58A5NNNN)Nr   NNr   rC   NN)r   r   r   rD   r   r	   r    r   rE   rh   r   arrayboolr   staticmethodr   r   r8   r"   r<   r   r   r?   rF   rB   r   r   r   r   r      sZ         37(,%)04 sDI~./ ! c]	
 }-   B !%$((,5' 5'rx*+5' 5' 	5'
 SM5'  ~5' 5' 5' 5'n  m XeCQUVYQZNF[=\ hu    \B@ @BH @ @ @ @ @(A ABH AAU A A A A$
` 
`sH}- 
` 
` 
` 
` 
`  37*.	 CM" sDI~./ "$	
 
   [  r   r   c                   
   e Zd ZdZd Zd Zd ZdedefdZ	defdZ
dee         fd	Zdedefd
Z	 	 	 	 	 	 	 	 d/dedee         deeeee         f                  dee         dee         ded         dedee         defdZ	 	 	 	 	 	 	 d0dej        dedeeeee         f                  dee         dee         ded         dedee         defdZd1dedeeef         dee         fdZ	 	 d2dedeeef         deeeee         f                  dee         fdZ	 	 	 	 	 	 d3dedee         dee         dee         d ed!         d"ee         d#ee         fd$Z	 	 	 	 d4ded"edee         dee         d ed!         d#ee         fd%Zdefd&Zd5ded(eeej        f         d)edefd*Z	 d5ded+eee         ej        f         d)ede fd,Z!	 d5ded(eeej        f         d)ede"fd-Z#	 d5ded+eee         ej        f         d)ede$fd.Z%dS )6IndexableMixinz+Add indexing features to `datasets.Dataset`c                     i | _         d S r   _indexesr5   s    r   rh   zIndexableMixin.__init__  s    .0r   c                     t           r   r3   r   s    r   __len__zIndexableMixin.__len__      !!r   c                     t           r   r3   )r5   keys     r   __getitem__zIndexableMixin.__getitem__  r  r   r   r1   c                     || j         v S r   r   r5   r   s     r   is_index_initializedz#IndexableMixin.is_index_initialized  s    T]**r   c                 V    |                      |          st          d| d          d S )NzIndex with index_name 'zk' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first.)r  r   r  s     r   _check_index_is_initializedz*IndexableMixin._check_index_is_initialized  sN    ((44 	 b*  b  b  b  	 	r   c                 *    t          | j                  S )zEList the `colindex_nameumns`/identifiers of all the attached indexes.)r   r   r   s    r   list_indexeszIndexableMixin.list_indexes  s    DM"""r   c                 F    |                      |           | j        |         S )zList the `index_name`/identifiers of all the attached indexes.

        Args:
            index_name (`str`): Index name.

        Returns:
            [`BaseIndex`]
        )r
  r   r  s     r   	get_indexzIndexableMixin.get_index  s%     	((444}Z((r   Nr   Frj   r   r   r   r   r   r   r   r   c
                     ||n|}t          ||||          }
|
                    | ||||	           |
| j        |<   dS )a  Add a dense index using Faiss for fast retrieval.
        The index is created using the vectors of the specified column.
        You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Args:
            column (`str`): The column of the vectors to add to the index.
            index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
                By default it corresponds to `column`.
            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.
            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
        Nr   r   r   r   rj   r   r   r   r   r   r   )r5   rj   r   r   r   r   r   r   r   r   r   s              r   add_faiss_indexzIndexableMixin.add_faiss_index  sp    @ $.#9ZZv
 .k`l
 
 
 	J:]j 	  	
 	
 	
 %0j!!!r   external_arraysc
                 t    t          ||||          }
|
                    |d|||	           |
| j        |<   dS )a8  Add a dense index using Faiss for fast retrieval.
        The index is created using the vectors of `external_arrays`.
        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Args:
            external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`.
                It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.
            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.
            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
        r  Nr  r  )r5   r  r   r   r   r   r   r   r   r   r   s              r   $add_faiss_index_from_external_arraysz3IndexableMixin.add_faiss_index_from_external_arrays  sb    @ !.k`l
 
 
 	DZJfs 	  	
 	
 	
 %0j!!!r   r=   r   c                    |                      |          }t          |t                    s#t          d| dt	          |           d          |                    ||           t                              d| d|            dS )a  Save a FaissIndex on disk.

        Args:
            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.11.0"/>

        zIndex 'z' is not a FaissIndex but a '')r   zSaved FaissIndex z at N)r  r   r   r`   rU   r?   r   r   )r5   r   r=   r   rm   s        r   save_faiss_indexzIndexableMixin.save_faiss_index  s     z**%,, 	`^z^^PTUZP[P[^^^___

4
999>
>>>>?????r   c                 4   t                               |||          }|j        j        t	          |           k    r3t          d| d| d|j        j         dt	          |            d	          || j        |<   t                              d| d|            d	S )
a  Load a FaissIndex from disk.

        If you want to do additional configurations, you can have access to the faiss index object by doing
        `.get_index(index_name).faiss_index` to make it fit your needs.

        Args:
            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to
                call `.get_nearest` or `.search`.
            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.11.0"/>

        )r   r   z1Index size should match Dataset size, but Index 'z' at z has z  elements while the dataset has z
 examples.zLoaded FaissIndex z from N)	r   rB   r   ntotalr~   r`   r   r   r   )r5   r   r=   r   r   rm   s         r   load_faiss_indexzIndexableMixin.load_faiss_index"  s    0 V_UU#s4yy00 pJ  p  pUY  p  p`e`q`x  p  p  [^  _c  [d  [d  p  p  p   %*j!AAA4AABBBBBr   rI   rJ   rK   r   rL   rM   c                 |    ||n|}t          |||||          }|                    | |           || j        |<   dS )a  Add a text index using ElasticSearch for fast retrieval.

        Args:
            column (`str`): The column of the documents to add to the index.
            index_name (Optional `str`): The index_name/identifier of the index. This is the index name that is used to call `.get_nearest` or `.search`.
                By default it corresponds to `column`.
            host (Optional `str`, defaults to localhost):
                host of where ElasticSearch is running
            port (Optional `str`, defaults to 9200):
                port of where ElasticSearch is running
            es_client (Optional `elasticsearch.Elasticsearch`):
                The elasticsearch client used to create the index if host and port are None.
            es_index_name (Optional `str`): The elasticsearch index name used to create the index.
            es_index_config (Optional `dict`):
                The configuration of the elasticsearch index.
                Default config is:

        Config::

            {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {
                    "properties": {
                        "text": {
                            "type": "text",
                            "analyzer": "standard",
                            "similarity": "BM25"
                        },
                    }
                },
            }
        NrI   rJ   rK   rL   rM   )rj   )rH   r   r   )	r5   rj   r   rI   rJ   rK   rL   rM   es_indexs	            r   add_elasticsearch_indexz&IndexableMixin.add_elasticsearch_indexB  s`    Z $.#9ZZv
%DI]ds
 
 
 	tF333$,j!!!r   c                 >    t          |||||          | j        |<   dS )aA  Load an existing text index using ElasticSearch for fast retrieval.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`.
            es_index_name (`str`):
                The name of elasticsearch index to load.
            host (`str`, *optional*, defaults to `localhost`):
                Host of where ElasticSearch is running.
            port (`str`, *optional*, defaults to `9200`):
                Port of where ElasticSearch is running.
            es_client (`elasticsearch.Elasticsearch`, *optional*):
                The elasticsearch client used to create the index if host and port are `None`.
            es_index_config (`dict`, *optional*):
                The configuration of the elasticsearch index.
                Default config is:
                    ```
                    {
                        "settings": {
                            "number_of_shards": 1,
                            "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                        },
                        "mappings": {
                            "properties": {
                                "text": {
                                    "type": "text",
                                    "analyzer": "standard",
                                    "similarity": "BM25"
                                },
                            }
                        },
                    }
                    ```
        r  N)rH   r   )r5   r   rL   rI   rJ   rK   rM   s          r   load_elasticsearch_indexz'IndexableMixin.load_elasticsearch_indexv  s2    V %7DI]ds%
 %
 %
j!!!r   c                     | j         |= dS )zDrop the index with the specified column.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
        Nr   r  s     r   
drop_indexzIndexableMixin.drop_index  s     M*%%%r   r/   r6   r0   c                 `    |                      |            | j        |         j        ||fi |S )a]  Find the nearest examples indices in the dataset to the query.

        Args:
            index_name (`str`):
                The name/identifier of the index.
            query (`Union[str, np.ndarray]`):
                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve.

        Returns:
            - scores (`List[List[float]`): The retrieval scores of the retrieved examples.
            - indices (`List[List[int]]`): The indices of the retrieved examples.
        )r
  r   r8   )r5   r   r6   r0   r7   s        r   r8   zIndexableMixin.search  s<     	((444/t}Z(/qCCFCCCr   r;   c                 `    |                      |            | j        |         j        ||fi |S )a  Find the nearest examples indices in the dataset to the query.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
            queries (`Union[List[str], np.ndarray]`):
                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve per query.

        Returns:
            - total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
            - total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        )r
  r   r<   )r5   r   r;   r0   r7   s        r   r<   zIndexableMixin.search_batch  s<    " 	((4445t}Z(5gqKKFKKKr   c                     |                      |            | j        |||fi |\  }}d |D             }t          |dt          |                   | |                   S )a=  Find the nearest examples in the dataset to the query.

        Args:
            index_name (`str`):
                The index_name/identifier of the index.
            query (`Union[str, np.ndarray]`):
                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve.

        Returns:
            - scores (`List[float]`): The retrieval scores of the retrieved examples.
            - examples (`dict`): The retrieved examples.
        c                     g | ]
}|d k    |S r   r   r   rv   s     r   r   z7IndexableMixin.get_nearest_examples.<locals>.<listcomp>  s    444QQ!VVqVVVr   N)r
  r8   r&   r~   )r5   r   r6   r0   r7   r   r   top_indicess           r   get_nearest_examplesz#IndexableMixin.get_nearest_examples  st    " 	((444%$+j%EEfEE44'444%f-?s;/?/?-?&@${BSTTTr   c                                            |             j        |||fi |\  }}d t          ||          D             } fd|D             }t          ||          S )a  Find the nearest examples in the dataset to the query.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
            queries (`Union[List[str], np.ndarray]`):
                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve per query.

        Returns:
            - total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
            - total_examples (`List[dict]`): The retrieved examples per query.
        c                 V    g | ]&\  }}|d t          d |D                                'S )Nc                     g | ]
}|d k    |S r)  r   r*  s     r   r   zHIndexableMixin.get_nearest_examples_batch.<locals>.<listcomp>.<listcomp>  s    ;;;!AFFAFFFr   )r~   )r   scores_i	indices_is      r   r   z=IndexableMixin.get_nearest_examples_batch.<locals>.<listcomp>  sN     
 
 
#) <s;;y;;;<<<=
 
 
r   c                 4    g | ]}d  |D                      S )c                     g | ]
}|d k    |S r)  r   r*  s     r   r   zHIndexableMixin.get_nearest_examples_batch.<locals>.<listcomp>.<listcomp>  s    <<<QQ!VVqVVVr   r   )r   r   r5   s     r   r   z=IndexableMixin.get_nearest_examples_batch.<locals>.<listcomp>  s-    [[['<<'<<<=[[[r   )r
  r<   zipr+   )r5   r   r;   r0   r7   r#   r$   total_sampless   `       r   get_nearest_examples_batchz)IndexableMixin.get_nearest_examples_batch  s    " 	((444&7d&7
GQ&Y&YRX&Y&Y#m
 
'*<'G'G
 
 
 \[[[][[[,\=IIIr   )NNNNNr   NF)NNNNr   NFr   r   )NNNNNNr   rC   )&r   r   r   rD   rh   r  r  rE   r   r  r
  r   r  r.   r  r   r	   r    r  r   r   r  r   r   r  r  r)   r   r"  r$  r   r8   r"   r<   r&   r,  r+   r6  r   r   r   r   r     sG       551 1 1" " "" " "+s +t + + + +c    #d3i # # # #
)C 
)I 
) 
) 
) 
) %)26(,%)04$(#'0 '0'0 SM'0 sDI~./	'0
 !'0 c]'0 }-'0 '0 SM'0 '0 '0 '0 '0Z 37(,%)04$(#&0 &0&0 &0 sDI~./	&0
 !&0 c]&0 }-&0 &0 SM&0 &0 &0 &0 &0P@ @3 @eCM6J @]efj]k @ @ @ @, 37*.C CC CM"C sDI~./	C
 "$C C C CF %)""/3'+*.2- 2-2- SM2- sm	2-
 sm2- O,2-  }2- "$2- 2- 2- 2-p #"/3*.-
 -
-
 -
 sm	-

 sm-
 O,-
 "$-
 -
 -
 -
^&S & & & &D D DU3=-A Dc D]j D D D D& NPL LL(-d3i.A(BLGJL	L L L L* FHU UU&+CM&:U?BU	U U U U. NPJ JJ(-d3i.A(BJGJJ	&J J J J J Jr   r   ))importlib.util	importlibrb   re   pathlibr   typingr   r   r   r   r   r	   r   numpyr   utilsr   arrow_datasetr   r   r   r_   r   util	find_specr^   r   
get_loggerr   r   	Exceptionr   r   r"   r&   r+   r.   rH   r   r   r   r   r   <module>rB     s       				        I I I I I I I I I I I I I I I I             &&&&&&///////       ^--o>>dJ ^%%g..d:
 
	H	%	%	 	 	 	 	9 	 	 	    J   
# # # # #: # # #
    Z   
    J   
#" #" #" #" #" #" #" #"Lr\ r\ r\ r\ r\ r\ r\ r\jB B B B B B B BJhJ hJ hJ hJ hJ hJ hJ hJ hJ hJs#   A A	A	A AA