U
    WcW                     @   sn  d dl Zd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZ d dlZd dlmZmZ dZdZddd	d
ddgZdZdZdZdZeeeeeedZdZddhZddhZddddhZddddddd d!d"d#dddhZd$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d.d/ Z%d0d1 Z&d2d3 Z'd4d5 Z(dId8d9Z)d:d; Z*G d<d= d=e+Z,d>d? Z-d@dA Z.dJdCdDZ/dKdEdFZ0dGdH Z1dS )L    N)BiomParseExceptionUnknownAxisError)Table)	biom_open__version__)defaultdictOrderedDictzJustin Kuczynskiz5Copyright 2011-2020, The BIOM Format Development TeamzDaniel McDonaldzGreg CaporasozJose Carlos Clemente LitranzAdam Robbins-PiankazJose Antonio Navas MolinaZBSDzhttp://biom-format.orgzdaniel.mcdonald@colorado.edu)intfloatunicoder	   r
   r   "[{]} 	
,0123456789c                 C   s  |  d| }|dkrdS |t| d }|}| | tkrD|d7 }q.| | tkrh| | dkrf|d7 }qPn| | g}|d7 }|r
| | }|tkr|d tkr|  n
|| nN|tkrz|  W n" tk
r   |d8 }Y q
Y nX n|tkr || |d7 }qz| || S )z|Returns key:value from the biom string, or ""

    This method pulls an arbitrary key/value pair out from a BIOM string
    z"%s":       )r   r   r   )	findlen
JSON_START	JSON_OPENQUOTEpopappend
JSON_CLOSE
IndexError)biom_strkeyZbase_idxZ	start_idxZcur_idxstackZcur_char r/   )lib/python3.8/site-packages/biom/parse.pydirect_parse_key9   s8    





r1   c                 C   sr  |dkrt dt| d}|dkr*tdt| d}|dkrDtdt| d}|dkr^td|dd	 d
ddd}ttt|d\}}|d
d }	||	t	|d  }t
|dk rt dd}
|dkrt||krt d|
t	||f }
n0|dkr,t||krt d|
|t	|f }
t|}g }|dkrNt||}n|dkrbt||}d| d|
 S )a0  Pull out specific slices from a BIOM string

    biom_str : JSON-formatted BIOM string
    to_keep  : indices to keep
    axis     : either 'samples' or 'observations'

    Will raise IndexError if the inices are out of bounds. Fully zerod rows
    or columns are possible and this is _not_ checked.
    observationsamplezUnknown axis typeshaper    .biom_str does not appear to be in BIOM format!dataZmatrix_type:r   r   r   r   r"   r   z'Observations to keep are out of bounds!z[%d, %d]r3   r4   z"Samples to keep are out of bounds!z"data": z, "shape": )r+   r1   
ValueErrorsplitreplacelistmapr	   r#   r$   minmaxset_direct_slice_data_sparse_obs_direct_slice_data_sparse_samp)r,   to_keepaxisZshape_kv_pairZdata_fieldsZmatrix_type_kv_pairZ	raw_shapeZn_rowsZn_colsZ
data_startZ	new_shapenew_datar/   r/   r0   direct_slice_datag   sB    







rF   c                 C   s
   |  dS )Nz[] 
	stripxr/   r/   r0   strip_f   s    rK   c                 C   s2   t tt| d\}}}||  d| d| S )zRemap a sparse observation axisr   r<   r=   rK   r:   rcvlookuprowcolvaluer/   r/   r0   _remap_axis_sparse_obs   s    rS   c                 C   s2   t tt| d\}}}| d||  d| S )zRemap a sparse sample axisr   rL   rM   r/   r/   r0   _remap_axis_sparse_samp   s    rT   c                 C   sd   g }dd t t|D }| dD ]0}t|d\}}}||kr$|t|| q$dd| S )zislice observations from data

    data : raw data string from a biom file
    to_keep : rows to keep
    c                 S   s   i | ]\}}t ||qS r/   str.0ivr/   r/   r0   
<dictcomp>   s      z1_direct_slice_data_sparse_obs.<locals>.<dictcomp>],r   [[%s]]],[)	enumeratesortedr:   rK   r)   rS   joinr7   rC   rE   Zremap_lookuprN   rcrZ   r/   r/   r0   rA      s    rA   c                 C   s`   g }dd t t|D }| dD ],}|d\}}}||kr$|t|| q$dd| S )zgslice samples from data

    data : raw data string from a biom file
    to_keep : columns to keep
    c                 S   s   i | ]\}}t ||qS r/   rU   rW   r/   r/   r0   r[      s      z2_direct_slice_data_sparse_samp.<locals>.<dictcomp>r\   r   r]   r^   )r_   r`   r:   r)   rT   ra   rb   r/   r/   r0   rB      s    rB   c                    s   t   |dkr d}t| |}n |dkr8d}t| |}ntd|dkrPtdtd| }d	d
 || D } |std fddt|| D }t |}|g i}t|| D ]\}	}
|	|kr|| |
 q|t	|dd fS )zReturns the indices for the associated ids to keep

    biom_str : a BIOM formatted JSON string
    to_keep  : a list of IDs to get indices for
    axis     : either 'samples' or 'observations'

    Raises KeyError if unknown key is specified
    r3   Zrowsr4   columnszUnknown axis!r    r6   z{%s}c                 S   s   h | ]}|d  qS idr/   )rX   rZ   r/   r/   r0   	<setcomp>   s     z#get_axis_indices.<locals>.<setcomp>z+Not all of the to_keep ids are in biom_str!c                    s    g | ]\}}|d   kr|qS rf   r/   rW   rC   r/   r0   
<listcomp>   s      z$get_axis_indices.<locals>.<listcomp>r"   r   )
r@   r1   r9   jsonloadsissubsetKeyErrorr_   r)   dumps)r,   rC   rD   Zaxis_keyZ	axis_dataZall_idsZidxsZidxs_lookupZsubsetrY   rZ   r/   ri   r0   get_axis_indices   s*    	
rp   c              	   C   sH  t t}i }g }i }g }td}| D ]}| }|s8q$|d}|d }	|	|krTq$|d  d }
|d  d }|
dkr|}
|
|kr||
 }nt|}||
 |||
< |	dks|	dkr$z|d	}W n tk
r   td
Y nX |d| }||kr|| }nt|}|||< || |||f  d7  < q$q$t	|||dS )a   Create a Table object from a uclust/usearch/vsearch uc file.

        Parameters
        ----------
        fh : file handle
            The ``.uc`` file to be parsed.

        Returns
        -------
        biom.Table : The resulting BIOM table.

        Raises
        ------
        ValueError
            If a sequence identifier is encountered that doesn't have at least
            one underscore in it (see Notes).

        Notes
        -----
        This function assumes sequence identifiers in this file are in QIIME's
        "post-split-libraries" format, where the identifiers are of the form
        ``<sample-id>_<sequence-id>``. Everything before the first underscore
        will be used as the sample identifier in the resulting ``Table``.
        The information after the first underscore is not used directly, though
        the full identifiers of seeds will be used as the observation
        identifier in the resulting ``Table``.

    ZHSLr   r   	      *HS_zA query sequence was encountered that does not have an underscore. An underscore is required in all query sequence identifiers to indicate the sample identifier.Nr"   )observation_ids
sample_ids)
r   r	   r@   rH   r:   r$   r)   rindexr9   r   )Zfhr7   Zsample_idxsrx   Zobservation_idxsrw   Z
line_typeslineZfieldsZ	line_typeZobservation_idZquery_idZobservation_idxZunderscore_indexZ	sample_idZ
sample_idxr/   r/   r0   parse_uc   sN    







r{   r4   Fc           	   
      s  |dkrt | ztj|  |dW S  tk
r6   Y n tk
rH   Y nX t| dr|  }| d}| rz| d}qf|dkr| 	| tj
tj| td|d}n| 	| t| ddd	d
 }ntt| tr"z"tj
tjd| td|d}W n* tk
r   t| dddd
 }Y nX ntj
tj| td|d} fdd}dd } dk	r|j||d |dkrtdnd}|j||d |S )a  Parses the biom table stored in `file_obj`

    Parameters
    ----------
    file_obj : file-like object, or list
        file-like object storing the BIOM table (tab-delimited or JSON), or
        a list of lines of the BIOM table in tab-delimited or JSON format
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at file_obj

    Raises
    ------
    ValueError
        If `samples` and `observations` are provided.

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations
    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    r2   )idsrD   readr"   r   )Zobject_pairs_hook)input_is_denseNc                 S   s   | S Nr/   rI   r/   r/   r0   <lambda>      z"parse_biom_table.<locals>.<lambda>r    c                 S   s   | S r   r/   rI   r/   r/   r0   r     r   c                    s   | kS r   r/   )r7   id_mdr|   r/   r0   
subset_ids  s    z$parse_biom_table.<locals>.subset_idsc                 S   s
   t | S r   )npany)valsr   r   r/   r/   r0   gt_zero  s    z!parse_biom_table.<locals>.gt_zerorD   r4   r3   )r   r   Z	from_hdf5r9   RuntimeErrorhasattrtellr}   isspaceseekZ	from_jsonrk   loadr   from_tsv
isinstancer<   rl   ra   filter)	Zfile_objr|   rD   r~   Zold_posrd   tr   r   r/   r   r0   parse_biom_tableY  sV    /







r   c                 C   sD   g }|  dD ]0}g }| dD ]}||  q || q|S )N|;)r:   r)   rH   )rJ   Zcomplex_metadatayZsimple_metadataer/   r/   r0   sc_pipe_separated  s    r   c                       s*   e Zd ZedddZ fddZ  ZS )	MetadataMapTFNc              
   C   s  t |dr4zt|}W n tk
r2   tdY nX |rP|rFdd }qfdd }n|r^dd }ndd }|dkrri }g }|p|g }g }|D ]}	||	}	|	r|r|	 sq|	d	r|	d
d }	|s|	 d}n
||	 qtt	||	d}
t
|
t
|k r|
dgt
|t
|
   ||
 q|s6td|sDtddd |D }t
|t
t|krptdi }|D ]p}i }t|d
d |d
d D ]>\}}z|| |||< W n tk
r   |||< Y nX q|||d < qx| |S )a,  Parse mapping file that relates samples or observations to metadata.

        Format: header line with fields
                optionally other comment lines starting with #
                tab-delimited fields

        process_fns: a dictionary of functions to apply to metadata categories.
         the keys should be the column headings, and the values should be
         functions which take a single value. For example, if the values in a
         column called "taxonomy" should be split on semi-colons before being
         added as metadata, and all other columns should be left as-is,
         process_fns should be:
          {'taxonomy': lambda x: x.split(';')}

        Assumes the first column in the mapping file is the id.

        This method is ported from QIIME (http://www.qiime.org), previously
        named parse_mapping_file/parse_mapping_file_to_dict. QIIME is a GPL
        project, but we obtained permission from the authors of this method
        to port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        upperzAA string was passed that doesn't refer to an accessible filepath.c                 S   s   |  ddS Nr   r    )r;   rI   r/   r/   r0   rK     s    z&MetadataMap.from_file.<locals>.strip_fc                 S   s   |  dd S r   r;   rH   rI   r/   r/   r0   rK     s    c                 S   s   | S r   r/   rI   r/   r/   r0   rK     s    c                 S   s   |   S r   rG   rI   r/   r/   r0   rK     s    N#r"   r   r    z)No header line was found in mapping file.zNo data found in mapping file.c                 S   s   g | ]}|d  qS )r   r/   )rX   rY   r/   r/   r0   rj     s     z)MetadataMap.from_file.<locals>.<listcomp>z2First column values are not unique! Cannot be ids.r   )r   openOSErrorr   rH   
startswithr:   r)   r<   r=   r$   extendr@   ziprn   )clslinesZstrip_quotesZsuppress_strippingheaderZprocess_fnsrK   Zmapping_dataZcommentsrz   Ztmp_lineZ	first_colmappingr   Z	current_dkrZ   r/   r/   r0   	from_file  s^    




"zMetadataMap.from_filec                    s   t  | dS )zAccepts dictionary mapping IDs to metadata.

        ``mapping`` should be a dictionary mapping an ID to a dictionary of
        metadata. For example:

        {'Sample1': {'Treatment': 'Fast'}, 'Sample2': {'Treatment': 'Control'}}
        N)super__init__)selfr   	__class__r/   r0   r   /  s    zMetadataMap.__init__)TFNN)__name__
__module____qualname__classmethodr   r   __classcell__r/   r/   r   r0   r     s       hr   c                   C   s   dt  S )zReturns a generated by stringzBIOM-Format %s)r   r/   r/   r/   r0   generatedby:  s    r   c                 K   s    t j| |||f|}|t S )a   Convert a contigency table to a biom table

    sample_mapping : dict of {'sample_id':metadata} or None
    obs_mapping : dict of {'obs_id':metadata} or None
    process_func: a function to transform observation metadata
    dtype : type of table data
    )r   r   Zto_jsonr   )Ztable_fZsample_mappingZobs_mappingZprocess_funckwargsZ	otu_tabler/   r/   r0   convert_table_to_biom?  s    	
r   r8   c                    s   t | tr| d S t | trtdd  gt | d tr~g }| D ]&}fdd|D }|d| qLd|S d fd	d
| D S dS )zCDetermine which format the metadata is and then convert to a stringr   z;|r    r   c                    s   g | ]}|   qS r/   )rH   Ztrans)rX   r   )transtabr/   r0   rj   \  s     z'biom_meta_to_string.<locals>.<listcomp>z; r   c                 3   s   | ]}| d   V  qdS )r   Nr   )rX   rJ   )replace_strr/   r0   	<genexpr>c  s   z&biom_meta_to_string.<locals>.<genexpr>N)r   rV   r;   r<   bytes	maketransra   r)   )metadatar   Znew_metadatarJ   valuesr/   )r   r   r0   biom_meta_to_stringM  s    


r   c                 C   s\   t | }|dkrt}|jdddkr,| S ||jddd krP|j|||dS | S dS )z*Convert a biom table to a contigency tableNr3   r   r   )
header_keyheader_valueZmetadata_formatter)
load_tabler   r   Zdelimited_self)Zbiom_fr   r   Z	md_formattabler/   r/   r0   convert_biom_to_tableh  s    r   c              
   C   s   t | tjtjfrFzt| }W q ttfk
rB   td|  Y qX nFt| 8}zt|}W n$ ttfk
r   td|  Y nX W 5 Q R X |S )a\  Load a `Table` from a path

    Parameters
    ----------
    f : str or file-like object
        The entity to parse

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the fhe file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    z%%s does not appear to be a BIOM file!)	r   ioIOBaseh5pyZFiler   r+   	TypeErrorr   )fr   fpr/   r/   r0   r   z  s    
r   )Nr4   F)r8   )NNN)2Znumpyr   r   r   Zbiom.exceptionr   r   Z
biom.tabler   Z	biom.utilr   r   rk   collectionsr   r   
__author__Z__copyright____credits__Z__license__Z__url__Z__maintainer__Z	__email__r	   r
   rV   ZMATRIX_ELEMENT_TYPEr'   r&   r*   Z	JSON_SKIPr%   r1   rF   rK   rS   rT   rA   rB   rp   r{   r   r   dictr   r   r   r   r   r   r/   r/   r/   r0   <module>   sx      .>'\
a
v
  
