
    чoeY                     B   d dl Zd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZ d dlZd dlmZ dZdZg dZd	Zd
ZdZdZeeeeeedZdZddhZddhZh dZh dZd Zd Z d Z!d Z"d Z#d Z$d Z%d Z&d Z'd,d Z(d! Z) G d" d#e*          Z+d$ Z,d% Z-d-d'Z.	 	 d.d(Z/d) Z0d/d+Z1dS )0    N)BiomParseExceptionUnknownAxisError)Table)	biom_open__version__)defaultdictJustin Kuczynskiz5Copyright 2011-2020, The BIOM Format Development Team)r	   Daniel McDonaldzGreg CaporasozJose Carlos Clemente LitranzAdam Robbins-PiankazJose Antonio Navas MolinaBSDzhttp://biom-format.orgr
   zdaniel.mcdonald@colorado.edu)intfloatunicoder   r   r   "[{]}>   	
 ,>   0123456789r   r   r   c                    |                      d|z            }|dk    rdS |t          |          z   dz   }|}| |         t          vr|dz  }| |         t          v| |         t          vr| |         dvr|dz  }| |         dvn| |         g}|dz  }|r| |         }|t          k    r<|d         t          k    r|                                 nh|                    |           nR|t          v r+	 |                                 n3# t          $ r |dz  }Y n)w xY w|t          v r|                    |           |dz  }|| ||         S )z|Returns key:value from the biom string, or ""

    This method pulls an arbitrary key/value pair out from a BIOM string
    z"%s":       )r   r   r   )	findlen
JSON_START	JSON_OPENQUOTEpopappend
JSON_CLOSE
IndexError)biom_strkeybase_idx	start_idxcur_idxstackcur_chars          _/mounts/lovelace/software/anaconda3/envs/kraken-biom/lib/python3.11/site-packages/biom/parse.pydirect_parse_keyr8   9   s   
 }}Ws]++H2~~rs3xx'!+	 G
7
:
-
-1 7
:
-
- 	))w66qLG w66
 '"#1 	(H5  9%%IIKKKKLL****Z''IIKKKK!   qLGE Y&&X&&&qLG!  	$ HW$%%s   -D DDc                 &   |dvrt          d          t          | d          }|dk    rt          d          t          | d          }|dk    rt          d          t          | d          }|dk    rt          d          |                    d          d	                             d
d                              dd          }t          t          t          |                    d                              \  }}|                    d
          dz   }	||	t          |          dz
           }t          |          dk     rt          d          d}
|dk    r7t          |          |k    rt          d          |
t          |          |fz  }
n<|dk    r6t          |          |k    rt          d          |
|t          |          fz  }
t          |          }g }|dk    rt          ||          }n|dk    rt          ||          }d| d|
 S )a0  Pull out specific slices from a BIOM string

    biom_str : JSON-formatted BIOM string
    to_keep  : indices to keep
    axis     : either 'samples' or 'observations'

    Will raise IndexError if the inices are out of bounds. Fully zerod rows
    or columns are possible and this is _not_ checked.
    observationsamplezUnknown axis typeshaper$   .biom_str does not appear to be in BIOM format!datamatrix_type:r#   r   r   r   r&   r   z'Observations to keep are out of bounds!z[%d, %d]r;   r<   z"Samples to keep are out of bounds!z"data": z, "shape": )r/   r8   
ValueErrorsplitreplacelistmapr   r'   r(   minmaxset_direct_slice_data_sparse_obs_direct_slice_data_sparse_samp)r0   to_keepaxisshape_kv_pairdata_fieldsmatrix_type_kv_pair	raw_shapen_rowsn_cols
data_start	new_shapenew_datas               r7   direct_slice_datarW   g   s;    ,,,,--- %Xw77MIJJJ"8V44KbIJJJ*8]CCb  IJJJ ##C((,44S"==EEc2NNI#c9??3#7#78899NFF !!#&&*Jj[)9)9A)==>K 7||aBCCC I}w<<6!!FGGGWv 66					w<<6!!ABBBW 66	'llGH}0gFF			1+wGG6h669666    c                 ,    |                      d          S )Nz[] 
	stripxs    r7   strip_fr^      s    779rX   c                     t          t          t          |                     d                              \  }}}||          d| d| S )zRemap a sparse observation axisr   rE   rF   r^   rC   rcvlookuprowcolvalues        r7   _remap_axis_sparse_obsrg      sI    3w		#7788OCeSk))C))%)))rX   c                     t          t          t          |                     d                              \  }}}| d||          d| S )zRemap a sparse sample axisr   r`   ra   s        r7   _remap_axis_sparse_sampri      sI    3w		#7788OCe))F3K))%)))rX   c                 L   g }d t          t          |                    D             }|                     d          D ]O}t          |                              d          \  }}}||v r#|                    t          ||                     Pdd                    |          z  S )zislice observations from data

    data : raw data string from a biom file
    to_keep : rows to keep
    c                 4    i | ]\  }}t          |          |S  str.0ivs      r7   
<dictcomp>z1_direct_slice_data_sparse_obs.<locals>.<dictcomp>   $    EEE$!QCFFAEEErX   ],r   [[%s]]],[)	enumeratesortedrC   r^   r-   rg   joinr?   rL   rV   remap_lookuprb   rcrr   s           r7   rJ   rJ      s     HEE)F7OO*D*DEEELzz$ G G#,,$$S))1aOO23EEFFFejj****rX   c                 2   g }d t          t          |                    D             }|                     d          D ]B}|                    d          \  }}}||v r#|                    t	          ||                     Cdd                    |          z  S )zgslice samples from data

    data : raw data string from a biom file
    to_keep : columns to keep
    c                 4    i | ]\  }}t          |          |S rl   rm   ro   s      r7   rs   z2_direct_slice_data_sparse_samp.<locals>.<dictcomp>   rt   rX   ru   r   rv   rw   )rx   ry   rC   r-   ri   rz   r{   s           r7   rK   rK      s     HEE)F7OO*D*DEEELzz$ H H))C..1aOO3CFFGGGejj****rX   c                    t                    |dk    rd}t          | |          }n(|dk    rd}t          | |          }nt          d          |dk    rt          d          t          j        d|z            }d	 ||         D             }                    |          st          d
          fdt          ||                   D             }t          |          }|g i}t          ||                   D ]$\  }	}
|	|v r||                             |
           %|t          j	        |          dd         fS )zReturns the indices for the associated ids to keep

    biom_str : a BIOM formatted JSON string
    to_keep  : a list of IDs to get indices for
    axis     : either 'samples' or 'observations'

    Raises KeyError if unknown key is specified
    r;   rowsr<   columnszUnknown axis!r$   r>   z{%s}c                     h | ]
}|d          S idrl   )rp   rr   s     r7   	<setcomp>z#get_axis_indices.<locals>.<setcomp>   s    4441qw444rX   z+Not all of the to_keep ids are in biom_str!c                 .    g | ]\  }}|d          v |S r   rl   )rp   rq   rr   rL   s      r7   
<listcomp>z$get_axis_indices.<locals>.<listcomp>   s+    OOO$!QAdGw<N<NA<N<N<NrX   r&   r#   )
rI   r8   rB   jsonloadsissubsetKeyErrorrx   r-   dumps)r0   rL   rM   axis_key	axis_dataall_idsidxsidxs_lookupsubsetrq   rr   s    `         r7   get_axis_indicesr      sg    'llG}$Xx88					$Xx88		)))BIJJJ
6I-..I44	( 3444GG$$ FDEEEOOOO)Ih$788OOODd))K^F)H-.. ' '18##A&&&F##AbD)))rX   c                     t          t                    }i }g }i }g }t          d          }| D ]N}|                                }|s|                    d          }|d         }	|	|vr<|d                                         d         }
|d                                         d         }|
dk    r|}
|
|v r	||
         }n)t          |          }|                    |
           |||
<   |	dk    s|	dk    r	 |                    d	          }n# t          $ r t          d
          w xY w|d|         }||v r	||         }n)t          |          }|||<   |                    |           |||fxx         dz  cc<   NPt          |||          S )a   Create a Table object from a uclust/usearch/vsearch uc file.

        Parameters
        ----------
        fh : file handle
            The ``.uc`` file to be parsed.

        Returns
        -------
        biom.Table : The resulting BIOM table.

        Raises
        ------
        ValueError
            If a sequence identifier is encountered that doesn't have at least
            one underscore in it (see Notes).

        Notes
        -----
        This function assumes sequence identifiers in this file are in QIIME's
        "post-split-libraries" format, where the identifiers are of the form
        ``<sample-id>_<sequence-id>``. Everything before the first underscore
        will be used as the sample identifier in the resulting ``Table``.
        The information after the first underscore is not used directly, though
        the full identifiers of seeds will be used as the observation
        identifier in the resulting ``Table``.

    HSLr   r   	      *HS_zA query sequence was encountered that does not have an underscore. An underscore is required in all query sequence identifiers to indicate the sample identifier.Nr&   )observation_ids
sample_ids)
r   r   rI   r[   rC   r(   r-   rindexrB   r   )fhr?   sample_idxsr   observation_idxsr   
line_typeslinefields	line_typeobservation_idquery_idobservation_idxunderscore_index	sample_id
sample_idxs                   r7   parse_ucr      s   : sDKJO UJ 3 3zz|| 	D!!1I	J&&  **1-!9??$$Q'S  %N ---.~>OO!/22O"">222/>^,yC//L#+??3#7#7   L L L KL L LL !!2"2!23IK''(3

 __
)3I&!!),,, /:.///14//// :NNNNs   5DD%r<   Fc                 ,   |dvrt          |           	 t          j        | |          S # t          $ r Y nt          $ r Y nw xY wt          | d          r|                                 }|                     d          }|                                r)|                     d          }|                                )|dk    r>| 	                    |           t          j
        t          j        |           |          }n| 	                    |           t          j        | ddd           }nt          | t                    re	 t          j
        t          j        d	                    |                     |          }nP# t          $ r t          j        | ddd
           }Y n,w xY wt          j
        t          j        |           |          }fd}d }8|                    ||           |dk    rdnd}|                    ||           |S )a  Parses the biom table stored in `file_obj`

    Parameters
    ----------
    file_obj : file-like object, or list
        file-like object storing the BIOM table (tab-delimited or JSON), or
        a list of lines of the BIOM table in tab-delimited or JSON format
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at file_obj

    Raises
    ------
    ValueError
        If `samples` and `observations` are provided.

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations
    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    r:   )idsrM   readr&   r   )input_is_denseNc                     | S Nrl   r\   s    r7   <lambda>z"parse_biom_table.<locals>.<lambda>      q rX   r$   c                     | S r   rl   r\   s    r7   r   z"parse_biom_table.<locals>.<lambda>  r   rX   c                     |v S r   rl   )r?   id_mdr   s      r7   
subset_idsz$parse_biom_table.<locals>.subset_ids  s    czrX   c                 *    t          j        |           S r   )npany)valsr   r   s      r7   gt_zeroz!parse_biom_table.<locals>.gt_zero  s    vd||rX   rM   r<   r;   )r   r   	from_hdf5rB   RuntimeErrorhasattrtellr   isspaceseek	from_jsonr   loadfrom_tsv
isinstancerE   r   rz   filter)	file_objr   rM   r   old_posr~   tr   r   s	    `       r7   parse_biom_tabler   Y  sf   ^ ,,,xSt<<<<      x   ;--// MM!iikk 	!a  A iikk 	!88MM'"""	( 3 3/=? ? ?AA MM'"""xt[[AAAA	Hd	#	# ;	B
2778+<+< = =/=? ? ?AA 	B 	B 	Bxt[[AAAAA	B ODJx00+9; ; ;       	$''' $ 0 0}}h	t$$$Hs&   - 
A	AA;F "F'&F'c                     g }|                      d          D ]X}g }|                     d          D ])}|                    |                                           *|                    |           Y|S )N|;)rC   r-   r[   )r]   complex_metadataysimple_metadataes        r7   sc_pipe_separatedr     s{    WWS\\ 1 1 	. 	.A""17799----0000rX   c                   :     e Zd Ze	 	 dd            Z fdZ xZS )MetadataMapTFNc           	         t          |d          r.	 t          |          }n# t          $ r t          d          w xY w|r
|rd }nd }n	|rd }nd }|i }g }|pg }g }|D ]!}	 ||	          }	|	r|r|	                                s'|	                    d          rJ|	d	d         }	|s(|	                                                    d
          }p|                    |	           t          t          ||	                    d
                              }
t          |
          t          |          k     r6|
                    dgt          |          t          |
          z
  z             |                    |
           #|st          d          |st          d          d |D             }t          |          t          t          |                    k    rt          d          i }|D ]_}i }t          |d	d         |d	d                   D ]/\  }}	  ||         |          ||<   # t          $ r |||<   Y ,w xY w|||d         <   ` | |          S )a,  Parse mapping file that relates samples or observations to metadata.

        Format: header line with fields
                optionally other comment lines starting with #
                tab-delimited fields

        process_fns: a dictionary of functions to apply to metadata categories.
         the keys should be the column headings, and the values should be
         functions which take a single value. For example, if the values in a
         column called "taxonomy" should be split on semi-colons before being
         added as metadata, and all other columns should be left as-is,
         process_fns should be:
          {'taxonomy': lambda x: x.split(';')}

        Assumes the first column in the mapping file is the id.

        This method is ported from QIIME (http://www.qiime.org), previously
        named parse_mapping_file/parse_mapping_file_to_dict. QIIME is a GPL
        project, but we obtained permission from the authors of this method
        to port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        upperzAA string was passed that doesn't refer to an accessible filepath.c                 .    |                      dd          S Nr   r$   )rD   r\   s    r7   r^   z&MetadataMap.from_file.<locals>.strip_f  s    99S"---rX   c                 R    |                      dd                                          S r   rD   r[   r\   s    r7   r^   z&MetadataMap.from_file.<locals>.strip_f  s"    99S"--33555rX   c                     | S r   rl   r\   s    r7   r^   z&MetadataMap.from_file.<locals>.strip_f  s    HrX   c                 *    |                                  S r   rZ   r\   s    r7   r^   z&MetadataMap.from_file.<locals>.strip_f  s    7799$rX   N#r&   r   r$   z)No header line was found in mapping file.zNo data found in mapping file.c                     g | ]
}|d          S )r   rl   )rp   rq   s     r7   r   z)MetadataMap.from_file.<locals>.<listcomp>  s    000aQqT000rX   z2First column values are not unique! Cannot be ids.r   )r   openOSErrorr   r[   
startswithrC   r-   rE   rF   r(   extendrI   zipr   )clslinesstrip_quotessuppress_strippingheaderprocess_fnsr^   mapping_datacommentsr   tmp_line	first_colmappingr   	current_dkrr   s                    r7   	from_filezMetadataMap.from_file  s2   2 5'"" 	MMU M M M( *L M M MM  	%! 6. . . .6 6 6 6 " %   % % % K 2  	. 	.D74==D . tzz|| s## .ABBx *!ZZ\\//55FFOOD))))  GTZZ-=-= > >??x==3v;;..OORDCKK#h--,G$HIII##H---- 	.$ &- . . . 	G$%EFFF00<000	y>>SY0000$ &6 7 7 7   	) 	)DIF122JQRR11 % %1%#1;q>!#4#4IaLL % % %#$IaLLL%(GDGs7||s   " <H22IIc                 J    t                                          |           dS )zAccepts dictionary mapping IDs to metadata.

        ``mapping`` should be a dictionary mapping an ID to a dictionary of
        metadata. For example:

        {'Sample1': {'Treatment': 'Fast'}, 'Sample2': {'Treatment': 'Control'}}
        N)super__init__)selfr   	__class__s     r7   r   zMetadataMap.__init__,  s#     	!!!!!rX   )TFNN)__name__
__module____qualname__classmethodr   r   __classcell__)r   s   @r7   r   r     sa        DI+/f f f [fP" " " " " " " " "rX   r   c                      dt           z  S )zReturns a generated by stringzBIOM-Format %s)r   rl   rX   r7   generatedbyr  7  s    k))rX   c                 n    t          j        | |||fi |}|                    t                                S )a   Convert a contigency table to a biom table

    sample_mapping : dict of {'sample_id':metadata} or None
    obs_mapping : dict of {'obs_id':metadata} or None
    process_func: a function to transform observation metadata
    dtype : type of table data
    )r   r   to_jsonr  )table_fsample_mappingobs_mappingprocess_funckwargs	otu_tables         r7   convert_table_to_biomr  <  sC     w^+7 7/57 7I[]]+++rX   rA   c                    t          | t                    r|                     d          S t          | t                    rt                              dd                    g                    t          | d         t                    rRg }| D ]8}fd|D             }|                    d                    |                     9d                    |          S d                    fd| D                       S d	S )
zCDetermine which format the metadata is and then convert to a stringr   z;|r$   r   c                 ^    g | ])}|                                                               *S rl   )r[   trans)rp   r   transtabs     r7   r   z'biom_meta_to_string.<locals>.<listcomp>Y  s-    ???!''))//(33???rX   z; r   c              3   h   K   | ],}|                     d                                           V  -dS )r   Nr   )rp   r]   replace_strs     r7   	<genexpr>z&biom_meta_to_string.<locals>.<genexpr>`  sQ       - - ))C55;;== - - - - - -rX   N)r   rn   rD   rE   bytes	maketransrz   r-   )metadatar  new_metadatar]   valuesr  s    `   @r7   biom_meta_to_stringr  J  s$    (C   [111	Hd	#	# ??4+{1K)L)LMMhqk4(( 	L 7 7????Q???##DIIf$5$5666688L)))
 		 - - - -#+- - - - - rX   c                    t          |           }|t          }|                    d          |                                S ||                    d          d         v r|                    |||          S |                                S )z*Convert a biom table to a contigency tableNr;   r   r   )
header_keyheader_valuemetadata_formatter)
load_tabler  r  delimited_self)biom_fr  r  	md_formattables        r7   convert_biom_to_tabler#  e  s     vE'	~~=~))1##%%%U^^^77:::##z1=7@ $ B B 	B ##%%%rX   c                    t          | t          j        t          j        f          r8	 t          |           }n# t          t          f$ r t          d| z            w xY wt          |           5 }	 t          |          }n'# t          t          f$ r t          d| z            w xY w	 ddd           n# 1 swxY w Y   |S )a\  Load a `Table` from a path

    Parameters
    ----------
    f : str or file-like object
        The entity to parse

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the fhe file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    z%%s does not appear to be a BIOM file!N)	r   ioIOBaseh5pyFiler   r/   	TypeErrorr   )fr"  fps      r7   r  r  w  s6   : !bi+,, 
M	I$Q''EEI& 	I 	I 	ICaGHHH	I q\\ 	MRM(,,	* M M M G! KLLLM 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M
 Ls3   8 $A.B40B ?B4 $B$$B44B8;B82.1.0c                 .   |dk    rt          d          |sdt                      i}t          |t          j        t          j        f          r | j        |fi | dS t          |d          5 } | j        |fi | ddd           dS # 1 swxY w Y   dS )aU  Save a `Table` to a path

    Parameters
    ----------
    t : biom.Table
        The table to save
    f : str or file-like object
        Where to save
    format_ : str
        The format to write. Currently only supports 2.1.0

    Returns
    -------
    None

    Raises
    ------
    IOError
        If the path is not writable
    r,  z%Only 2.1.0 is supported at this time.generated_bywN)rB   r  r   r'  r(  Groupto_hdf5r   )r   r*  format_r
  outs        r7   
save_tabler4    s    * '@AAA 1 +--0!di,-- %	!vq# 	%#AIc$$V$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   .B

BB)Nr<   F)rA   )NNN)r,  )2numpyr   r%  r'  biom.exceptionr   r   
biom.tabler   	biom.utilr   r   r   collectionsr   
__author____copyright____credits____license____url____maintainer__	__email__r   r   rn   MATRIX_ELEMENT_TYPEr+   r*   r.   	JSON_SKIPr)   r8   rW   r^   rg   ri   rJ   rK   r   r   r   r   dictr   r  r  r  r#  r  r4  rl   rX   r7   <module>rD     sq       				  ? ? ? ? ? ? ? ?       , , , , , , , ,  # # # # # #  
G, , , 
""*	!Ec!EcC C  	#J	3Z
"""	  
"+& +& +&\;7 ;7 ;7|  * * ** * *+ + + + + +"$* $* $*NYO YO YOx[ [ [ [|  s" s" s" s" s"$ s" s" s"l* * *
, , ,   6 AE$(& & & &$( ( (V% % % % % %rX   