
    чoe*=                     :   d Z ddlmZmZmZ ddlZddlmZ ddlZddl	m	Z
 ddlmZ ddlmZ ddlZddlmZ ddlmZ ddlZ	 ddlZd	Zn# e$ r d
ZY nw xY wdZdZg dZdZdZ dZ!dZ"dZ#g dZ$g dZ%d Z&d Z'd Z(d Z)ddZ*d Z+d Z,d Z-e.dk    r e-             dS dS )zo
Create BIOM-format tables (http://biom-format.org) from 
Kraken output (http://ccb.jhu.edu/software/kraken/).
    )absolute_importdivisionprint_functionN)OrderedDict)datetime)open)dedent)TableTFShareef M. Dabdoubz"Copyright 2016, Shareef M. Dabdoub)r   zAkshay ParopkarizSukirth GanesanzPurnima KumarMITz'http://github.com/smdabdoub/kraken-biomzdabdoub.2@osu.eduz1.0.1)	pct_readsclade_readstaxon_readsrankncbi_taxsci_name)DPCOFGSSSc                 L    d v rNd v rJ d                               d                   r) d         t           d                   dz   d          d<   d v rNd v rJ d                               d                   r) d         t           d                   dz   d          d<    fdt          d|dz            D             }|                    d t          |dz   d         D                        |d	                              d
          rd|d	         dd         z   |d	<   |S )al  
    Create a string representation of a taxonomic hierarchy (QIIME for now).

    :type tax_lvl: dict
    :param tax_lvl: Keyed on the entries in ranks
    :type end: int
    :param end: The end rank index (0-based indexing). If end == 3
                then the returned string will contain K, P, C, and O.
    
    >>> tax_fmt({"K": "Bacteria", "P": "Firmicutes", "C": "Negativicutes", 
    ... "O": "Selenomonadales", "F": "Veillonellaceae", "G": "Veillonella", 
    ... "S": "Veillonella parvula", "SS": "Veillonella parvula DSM 2008"}, 4)
    'k__Bacteria; p__Firmicutes; c__Negativicutes; o__Selenomonadales'
    r   r      Nr   c                 x    g | ]6}d                      |                                |v r|         nd          7S )z{}__{} formatlower.0rtax_lvls     `/mounts/lovelace/software/anaconda3/envs/kraken-biom/lib/python3.11/site-packages/kraken_biom.py
<listcomp>ztax_fmt.<locals>.<listcomp>C   sO     % % % ??17799ALLgajjbII % % %    c                 \    g | ])}d                      |                                          *S )z{}__r   )r#   r$   s     r&   r'   ztax_fmt.<locals>.<listcomp>G   s,    BBBQaggii((BBBr(   r   dk)
startswithlenranksextend)r%   endtaxs   `  r&   tax_fmtr3   ,   sN    g~~'>>gcl55gclCC>"3<GCL(9(9!(;(<(<=GCLw'>>gdm66ws|DD>#DM#gcl*;*;A*=*>*>?GDM% % % %FSUFm% % %C JJBB%Ab/BBBCCC 1v  SVABBZA Jr(   c                    t                      }t                      }i d}t                              |          }t                              |          }| D ]o}|d                                         }	|	t          v rSt                              |	          |k     r5t                              |	          }fdt          d|         D             |	t          v r7|d                                         |	<   t                              |	          }|	t          v r|t                              |d                   cxk    r|k    rn t	          |d                   }
t	          |d                   }|
dk    s|dk    rI|d         |k    r=t          |          ||d         <   |d         |k    r|||d         <   d|
||d         <   q||fS )	a2  
    Parse a single output file from the kraken-report tool. Return a list
    of counts at each of the acceptable taxonomic levels, and a list of 
    NCBI IDs and a formatted string representing their taxonomic hierarchies.

    :type kdata: str
    :param kdata: Contents of the kraken report file.
    r   r   c                 .    i | ]}||v r|         nd S )r    r"   s     r&   
<dictcomp>z'parse_kraken_report.<locals>.<dictcomp>o   s+    PPPqW'!**"PPPr(   Nr   r   r   r   )r   r/   indexstripintr3   )kdatamax_rankmin_ranktaxacountsr$   max_rank_idxmin_rank_idxentryerankr   r   r%   s               @r&   parse_kraken_reportrD   T   s    ==D]]FG	A;;x((L;;x((L < <f##%% E>>ekk%00144E""APPPPeBQBiPPPG E>>":.4466GENE""A E>>lekk%-.H.HXXXXLXXXXXeM233KeM233KQ;??uV}7P7P*1'1*=*=U:&'=H,,0;F5,--0;F5,- 4<r(   c                    t                      }t                      }| D ])}t          j        |          s"t          d                    |                    t          j        t          j        |          d                   d         }t          |d          5 }	 t          j	        |t          d          }d |D             dd         }	n4# t          $ r'}
t          d	                    |
                    d}
~
ww xY w	 ddd           n# 1 swxY w Y   t          |	||
          \  }}|                    |           |||<   +||fS )zs
    Parse all kraken-report data files into sample counts dict
    and store global taxon id -> taxonomy data
    zERROR: File '{}' not found.r   r   rt	)
fieldnames	delimiterc                     g | ]}|S r6   r6   )r#   rB   s     r&   r'   z#process_samples.<locals>.<listcomp>   s    0005000r(   Nz	ERROR: {}r<   r=   )r   ospisfileRuntimeErrorr    splitextsplitr   csv
DictReaderfield_namesOSErrorrD   update)kraken_reports_fpr<   r=   r>   sample_countskrep_fp	sample_idkfkdrr;   oescountsstaxas                r&   process_samplesr_      s   
 ==DMMM$ + +z'"" 	N<CCGLLMMM L7!3!3A!677:	'4   	;B;nRK/35 5 500C0004 ; ; ;";#5#5b#9#9:::; 		; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; -UX6>@ @ @ 	E#*i  $s6   D0CD
C>"C99C>>DD	D	c                 v     fdD             }t          j        |t                    }fdD             }d                    t          t
                    }t          |t                    t                     |dt          t          j
                                                              |d          S )a  
    Create a BIOM table from sample counts and taxonomy metadata.

    :type sample_counts: dict
    :param sample_counts: A dictionary of dictionaries with the first level
                          keyed on sample ID, and the second level keyed on
                          taxon ID with counts as values.
    :type taxa: dict
    :param taxa: A mapping between the taxon IDs from sample_counts to the
                 full representation of the taxonomy string. The values in
                 this dict will be used as metadata in the BIOM table.
    :rtype: biom.Table
    :return: A BIOM table containing the per-sample taxon counts and full
             taxonomy identifiers as metadata for each taxon.
    c                 0    g | ]fd D             S )c                 D    g | ]}|         vrd n|                  S )r   r6   )r#   sidrW   taxids     r&   r'   z0create_biom_table.<locals>.<listcomp>.<listcomp>   sF     ( ( ( }S111QQ}S7I%7P ( ( (r(   r6   )r#   rd   rW   s    @r&   r'   z%create_biom_table.<locals>.<listcomp>   sP     # # #( ( ( ( (&( ( ( # # #r(   )dtypec                 $    g | ]}d |         iS )taxonomyr6   )r#   rd   r>   s     r&   r'   z%create_biom_table.<locals>.<listcomp>   s"    <<<eT%[)<<<r(   zkraken-biom v{} ({})z	OTU tableT)typecreate_dategenerated_byinput_is_dense)nparrayr:   r    __version____url__r
   liststrdtnow	isoformat)rW   r>   datatax_metagen_strs   ``   r&   create_biom_tablerx      s     # # # #!# # #D 8D$$$D<<<<t<<<H$++KAAGtDzz4#6#6!s26883E3E3G3G/H/H%d< < < <r(   hdf5c                    t           }d}|r)|dk    r#|                    d          s|dz  }t          }d}|dk    rt          j        } |||          5 }|dk    r|                     | j        |           nI|dk    r(|                    |                                            n| 	                    || j                   ddd           n# 1 swxY w Y   |S )	a  
    Write the BIOM table to a file.

    :type biomT: biom.table.Table
    :param biomT: A BIOM table containing the per-sample OTU counts and metadata
                  to be written out to file.
    :type output_fp str
    :param output_fp: Path to the BIOM-format file that will be written.
    :type fmt: str
    :param fmt: One of: hdf5, json, tsv. The BIOM version the table will be
                output (2.x, 1.0, 'classic').
    wry   z.gzwtjson)	direct_iotsvN)
r   endswith	gzip_openh5pyFileto_jsonrj   writeto_tsvto_hdf5)biomT	output_fpfmtgzipopenermodebiom_fs          r&   
write_biomr      s(    FD v!!%(( 	I f}}		4	 	  6F&==MM%,M????E\\LL((((MM&%"45556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 s   A-CCCc                 T   t          j        |          d         }|dk    s6t          j        |          s"t          d                    |                    t          |d          5 }|                    d                    |                      ddd           dS # 1 swxY w Y   dS )a  
    Write out a file containing only the list of OTU IDs from the kraken data.
    One line per ID.

    :type otu_ids: list or iterable
    :param otu_ids: The OTU identifiers that will be written to file.
    :type fp: str
    :param fp: The path to the output file.
    r   r   z!Specified path does not exist: {}r|   
N)rL   rP   isdirrN   r    r   r   join)otu_idsfpfpdiroutfs       r&   write_otu_filer      s     IbMM!EB;;sy//;>EEeLLMMM	b$ '4

499W%%&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 's   ')BB!$B!c                     d} t          j        t          |           t           j                  }|                    ddd           |                    ddt
          d d	         d
           |                    ddt
          d d	         d           |                    dddd           |                    dd           |                    ddg dd           |                    ddd           |                    dd d!                    t          t                    "           |                    d#d$dd%           |	                                S )&Na3	      Create BIOM-format tables (http://biom-format.org) from Kraken output 
    (http://ccb.jhu.edu/software/kraken/).

    The program takes as input, one or more files output from the kraken-report
    tool. Each file is parsed and the counts for each OTU (operational taxonomic
    unit) are recorded, along with database ID (e.g. NCBI), and lineage. The
    extracted data are then stored in a BIOM table where each count is linked
    to the Sample and OTU it belongs to. Sample IDs are extracted from the input
    filenames (everything up to the '.').

    OTUs are defined by the --max and --min arguments. By default these are
    set to Order and Species respectively. This means that counts assigned
    directly to an Order, Family, or Genus are recorded under the associated
    OTU ID, and counts assigned at or below the Species level are assigned to
    the OTU ID for the species. Setting a minimum rank below Species is not yet
    available.

    The BIOM format currently has two major versions. Version 1.0 uses the 
    JSON (JavaScript Object Notation) format as a base. Version 2.x uses the
    HDF5 (Hierarchical Data Format v5) as a base. The output format can be
    specified with the --fmt option. Note that a tab-separated (tsv) output
    format is also available. The resulting file will not contain most of the
    metadata, but can be opened by spreadsheet programs.

    Version 2 of the BIOM format is used by default for output, but requires the
    Python library 'h5py'. If the library is not installed, kraken-biom will 
    automatically switch to using version 1.0. Note that the output can 
    optionally be compressed with gzip (--gzip) for version 1.0 and TSV files. 
    Version 2 files are automatically compressed.

    Usage examples
    --------------
    1. Basic usage with default parameters:

    $ kraken-biom.py S1.txt S2.txt

      This produces a compressed BIOM 2.1 file: table.biom

    2. BIOM v1.0 output:

    $ kraken-biom.py S1.txt S2.txt --fmt json

      Produces a BIOM 1.0 file: table.biom

    3. Compressed TSV output:

    $ kraken-biom.py S1.txt S2.txt --fmt tsv --gzip -o table.tsv

      Produces a TSV file: table.tsv.gz

    4. Change the max and min OTU levels to Class and Genus:

    $ kraken-biom.py S1.txt S2.txt --max C --min G

    Program arguments
    -----------------)descriptionformatter_classkraken_reports+z*Results files from the kraken-report tool.)nargshelpz--maxr   r*   zpAssigned reads will be recorded only if                               they are at or below max rank. Default: O.)defaultchoicesr   z--minr   zReads assigned at and below min rank                               will be recorded as being assigned to the                               min rank level. Default: S.z-oz--output_fpz
table.biomaI  Path to the BIOM-format file. By default, the                        table will be in the HDF5 BIOM 2.x format. Users can                        output to a different format using the --fmt option.                        The output can also be gzipped using the --gzip                        option. Default path is: ./table.biom)r   r   z--otu_fpaU  Create a file containing just the (NCBI) OTU IDs                        for use with a service such as phyloT                         (http://phylot.biobyte.de/) to generate a phylogenetic                        tree for use in downstream analysis such as UniFrac,                         iTol (itol.embl.de), or PhyloToAST (phylotoast.org).)r   z--fmtry   )ry   r}   r   zVSet the output format of the BIOM table.                              Default is HDF5.z--gzip
store_truezCompress the output BIOM table with gzip.                              HDF5 BIOM (v2.x) files are internally                              compressed by default, so this option                              is not needed when specifying --fmt hdf5.)actionr   z	--versionversionzkraken-biom version {}, {})r   r   z-vz	--verbosezNPrints status messages during program                               execution.)
argparseArgumentParsertwddRawDescriptionHelpFormatteradd_argumentr/   r    rn   ro   
parse_args)descrparsers     r&   handle_program_optionsr      s   8Et $e+3+OQ Q QF
(I  K K K
eCRCjJ  K K K eCRCj;  < < < m\?  @ @ @ 
N  O O O  7 7 70  1 1 1 I  J J J I188gNN  P P P
k,*  + + + r(   c                     t                      } | j        dk    r,t          s%d| _        d}t          t	          |                     t
                              | j                  t
                              | j                  k    r4d}t          j
        |                    | j        | j                             t          | j        | j        | j                  \  }}t          ||          }t          || j        | j        | j                  }| j        rd	 t'          t)          |          | j                   n@# t*          $ r3}d}t          j
        |                    |                     Y d }~nd }~ww xY w| j        rt          d                    |                     d                    ||j        d	         |j        d
         |                                          }t          t	          |                     d S d S )Nry   r}   zo        Library 'h5py' not found, unable to write BIOM 2.x (HDF5) files.
        Defaulting to BIOM 1.0 (JSON).z2ERROR: Max and Min ranks are out of order: {} < {}rK   zERROR creating OTU file: 
	{}r   z        BIOM-format table written to: {out_fp}
        Table contains {rows} rows (OTUs) and {cols} columns (Samples)
        and is {density:.1%} dense.r   r   )out_fprowscolsdensity)r   r   	HAVE_H5PYprintr   r/   r8   maxminsysexitr    r_   r   rx   r   r   r   otu_fpr   rp   rN   verboseshapeget_table_density)argsmsgrW   r>   r   r   re	table_strs           r&   mainr   d  s   !##Dx6)* 	d3ii{{48DH 5 555BDHdh//000 *$*=378378= = =M4 mT22Et~txCCF{ %	%4::t{3333 	% 	% 	%3CHSZZ^^$$$$$$$$	% | 	bii   ' (.vV38;q>38;q>6;6M6M6O6O (. (Q (Q 	 	d9oo	 	s   "D: :
E7)E22E7__main__)ry   F)/__doc__
__future__r   r   r   r   collectionsr   rQ   r   rr   r   r   r   os.pathpathrL   r   textwrapr	   r   
biom.tabler
   numpyrl   r   r   ImportError
__author____copyright____credits____license__ro   __maintainer__	__email__rn   rS   r/   r3   rD   r_   rx   r   r   r   r   __name__r6   r(   r&   <module>r      s    A @ @ @ @ @ @ @ @ @  # # # # # # 



 # # # # # # " " " " " "       



 # # # # # #          KKKII   III "
43 3 3
3%	/ / /111% % %P3 3 3l  @< < <:! ! ! !H' ' '&c c cL) ) )X zDFFFFF s   A	 	AA