U
    e+                  	   @   s:  d dl mZ d dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlmZ d dlmZmZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlZd	d
lmZ e ddZ!d)ddZ"d*ddZ#dd Z$dd Z%dd Z&dd Z'dd Z(d+e)ee*e*e+e)dddd Z,d!d" Z-d,d#d$Z.d%d& Z/d'd( Z0dS )-    )	cpu_countNjoin)Paralleldelayed)DNAIterator)HashingVectorizer)Counter)zip_longest   _taxon_to_listZrescriptZassetsc                 C   s   t ||}g }t||D ]0\}}t|tj|}t||d< || qt|	 }tj
dd tdt|jd D dd|_t|}| dd}	|	|d	dd
d\}
|
S )NZDatasetc                 S   s   g | ]}t |qS  str.0ir   r   Llib/python3.8/site-packages/rescript-2024.2.0-py3.8.egg/rescript/evaluate.py
<listcomp>1   s     z%evaluate_taxonomy.<locals>.<listcomp>r   id)nameZlongitudinal
volatilityLevelTaxonomic Entropy)ZmetadataZstate_columnZdefault_group_columnZdefault_metric)_process_labelszip_evaluate_taxonomyZviewpdZSeriesr   appendconcatZreset_indexZIndexrangelenindexq2ZMetadataZ
get_action)Zctx
taxonomieslabelsrank_handle_regexZ	summariesr   taxonomysummaryresultsr   Zplotsr   r   r   evaluate_taxonomy    s,    

  
r+   r%   c                 C   sb   | d krg } t | t | }}||k rJt| dd tt| |dD } n||kr^| d | } | S )Nc                 S   s$   g | ]\}\}}|d k	r|n|qS Nr   )r   nr   tr   r   r   r   B   s    
 z#_process_labels.<locals>.<listcomp>r   )r"   _warn_uneven_length	enumerater
   )r&   r%   descriptionZn_labelsZn_taxonomiesr   r   r   r   ;   s    r   c                 C   s   d | }t|t d S )NzThe lists of input {0} and labels are different lengths. Additional {0} will be labeled numerically by their order in the inputs. Note that if these numbers match existing labels, those data will be grouped in the visualization.)formatwarningswarnUserWarning)r1   msgr   r   r   r/   J   s    r/   c                 C   sL   |d krd}t | |d}|j }t| ||d}|j|ddd}d|j_|S )N r'   )r'   	max_depthT)Z
left_indexZright_indexr   )summarize_taxonomic_depthr#   max_taxonomic_entropymerger   )r(   r'   r)   r9   entropyr   r   r   r   R   s     
  r   c                    s  t | |d}| }t|   }|  }tj||gdd}ddg|_|j}g }g }td| d D ]D}||kr~ddg|j	|< |
| |
 |  ||j	|df 8 }qd| } fdd|D }	 fd	d|D }
||d
< |	|d< ||d< |
|d< d|j_|dS )Nr8   r   Zaxisz'Number of Features Terminating at Depthz+Proportion of Features Terminating at Depthr   c                    s   g | ]}|  qS r   r   )r   cZtotalr   r   r   x   s     z-summarize_taxonomic_depth.<locals>.<listcomp>c                    s   g | ]}|  qS r   r   )r   urA   r   r   r   y   s     z&Number of Features Classified at Depthz*Proportion of Features Classified at Depthz(Number of Features Unclassified at Depthz,Proportion of Features Unclassified at Depthr   )_taxonomic_depthZvalue_countsr"   r   r    columnsr#   r!   r;   locr   Z
sort_indexr   Zdrop)r(   r'   ZdepthsZ	remainingZproportionsZ
depths_idxZ
classifiedZunclassifieddZclassified_propZunclassified_propr   rA   r   r:   a   s4    
r:   c                    sn   fdd| j D } fdd|D }dd t| D }dd |D }dd t|dD }tj|d	d
gdjS )Nc                    s"   g | ]}d d t | dD qS )c                 S   s   g | ]}|d kr|qS )Nr7   r   )r   r.   r   r   r   r      s    1_taxonomic_entropy.<locals>.<listcomp>.<listcomp>Zrank_handler   )r   vr8   r   r   r      s   z&_taxonomic_entropy.<locals>.<listcomp>c                    s(   g | ]   fd dt dd D qS )c                    s   g | ]}d   d| qS );Nr   r   r.   r   r   r      s     rH   r   )r!   )r   )r9   rL   r   r      s   c                 S   s   g | ]}|qS r   r   r   r   r   r   r      s     c                 S   s"   g | ]}d d t | D qS )c                 S   s   g | ]\}}|qS r   r   )r   krJ   r   r   r   r      s     rH   )r	   items)r   rr   r   r   r      s   c                 S   s&   i | ]\}}|t |tj|gqS r   )r"   scipystatsr>   )r   r-   rO   r   r   r   
<dictcomp>   s    z&_taxonomic_entropy.<locals>.<dictcomp>r   zUnique Labelsr   r#   )valuesr
   r0   r   	DataFrameT)r(   r'   r9   Z
taxa_listsZranksZunique_countsr>   r   )r9   r'   r   r<      s     

r<   c                    s   |   fddS )Nc                    s   t dd t|  dD S )Nc                 S   s   g | ]}|d kr|qS rG   r   r   r   r   r   r      s    z6_taxonomic_depth.<locals>.<lambda>.<locals>.<listcomp>rI   )r"   r   )xr8   r   r   <lambda>   s   
 z"_taxonomic_depth.<locals>.<lambda>)Zapply)r(   r'   r   r8   r   rC      s    rC         ?viridis)
output_dir	sequencesr&   kmer_lengthssubsample_kmerspalettereturnc           	      C   sF   t ||dd}t||||\}}t|||}t| || td d S )Nr\   )r1   all)r   _evaluate_seqs_plot_eval_seqs
_visualizepltclose)	r[   r\   r&   r]   r^   r_   r*   lengthsfigr   r   r   evaluate_seqs   s       ri   c                 C   sf   |dk r*t |t|  }tjj| |dd} tdd||gd}|| }|jdd}tj	j
|ddd S )Nr   F)replacechar)Zalternate_signZanalyzerZngram_ranger   r?   )intr"   npZrandomZchoicer   Zfit_transformsumrP   rQ   r>   )seqsZkmer_lenr^   Zsubsample_sizeZ
vectorizerXZ	kmer_freqr   r   r   _process_kmers   s    
rq   c                    s,  |sg }dd dD }|ddg7 }|dd |D 7 }g }t j|d}t|| D ]\} dd  D  td	d  D }|| t|d
ddddddg}	t  }
t	j
t|
}t|	t|
|g }tt|t }|d
krt|dd}| fdd|D }|| |||< qN|d|fS )Nc                 S   s   g | ]}d | qS )zLength r   )r   r-   r   r   r   r      s     z"_evaluate_seqs.<locals>.<listcomp>)minz1%z25%Zmedianz75%z99%r;   	N uniquesSequence Entropyc                 S   s   g | ]}d | qS )z%smer Entropyr   r   rM   r   r   r   r      s     rS   c                 S   s   g | ]}t |qS r   r   r   sr   r   r   r      s     c                 S   s   g | ]}t |qS r   )r"   rv   r   r   r   r      s     r   g{Gz?g      ?      ?g      ?gGz?r   Zloky)n_jobsZbackendc                 3   s   | ]}t t |V  qd S r,   )r   rq   ru   ro   r^   r   r   	<genexpr>   s     z!_evaluate_seqs.<locals>.<genexpr>   )r   rU   r   rm   Zarrayr   Znanquantiler	   rT   rP   rQ   r>   listr"   rr   r   r   extendround)r\   r&   r]   r^   Zrownamesrg   r*   r-   Zlength_arrayZlen_quantilesZuniqsZseq_entropyresry   ZparallelZ
kmer_freqsr   rz   r   rb      s6    
 


rb   c                 C   s  t |}t||}d| d }tjd|dfd}|jdddddd	}||d d
d d f }t||D ]\}	}
tj|	d|
|dd qj|	d |
d |d || jd  | jd   ||d
d df }tj| jdg ||d |	d |
d |j| ddd ||d
dd f }t | dkr| dd  }dd |jD |_tj|d||d tjd d!d" n,tj| jd#g ||d |j| ddd |	d$ |
d% |S )&N   T   )Zconstrained_layoutZfigsizer|      g?gQ?)ZnrowsZncolsleftrightZwspacerx   )ZshadecoloraxZalphazSequence Length DistributionZ
ProportionzLength (nt)z	Length 1%z
Length 99%rs   )datar_   r   zN Unique SequencesZCountZ   r   )ZrotationZha	      c                 S   s   g | ]}| d d qS ) r   )splitr   r   r   r   r      s     z#_plot_eval_seqs.<locals>.<listcomp>F)r   sortr_   r   )g?r   z
upper left)Zbbox_to_anchorrE   rt   ZEntropyH)r"   snsZcolor_palettere   ZfigureZadd_gridspecZadd_subplotr   ZkdeplotZ	set_titleZ
set_ylabelZ
set_xlabelZset_xlimrE   rr   r;   ZbarplotZset_xticklabelsZget_xticklabelsr#   ZlineplotZlegend)r*   rg   r_   Zn_groupsZcmapZ	fig_widthrh   Zgs1r   Zdatr   r   r   r   r   rc      sN        




 

rc   c                 C   s|   t dd |jt| ddd tj|dd}|jt| dd	d
 |jt| dd	d
 ttd}tj|| dd|dd d S )Nzdisplay.max_colwidthr   zevaluate_seqs_results.tsv	)sepTrS   zevaluate_seqs.pngZtight)Zbbox_incheszevaluate_seqs.pdfz
index.htmlzSequence Evaluation Resultsri   )titleZrunning_titler*   )context)	r   Z
set_optionZto_csvr   q2templatesZ
df_to_htmlZsavefig	TEMPLATESZrender)r[   r*   Zplotr#   r   r   r   rd     s    
rd   )NN)r%   )NNrY   rZ   )NrY   )1Zmultiprocessingr   Zpandasr   Znumpyrm   rP   r3   Zqiime2r$   r   os.pathr   Zjoblibr   r   Zq2_types.feature_datar   Zsklearn.feature_extraction.textr   Zseabornr   Zmatplotlib.pyplotZpyplotre   collectionsr	   	itertoolsr
   Zpkg_resourcesZ
_utilitiesr   Zresource_filenamer   r+   r   r/   r   r:   r<   rC   r   r}   floatri   rq   rb   rc   rd   r   r   r   r   <module>   sP     

$       
 -