U
    e                  
   @   s   d dl Z d dlZd dlZd dlZd dlmZ ddlmZm	Z	m
Z
mZmZmZ ddlmZ ddddd	d
ddddgdfeejeeeeeeejfdddZdd Zdd Zdd Zdd Zdd ZdS )    N)DNAFASTAFormat   )run_command	_find_lca	_majority_find_super_lca_sort_rank_handles _return_stripped_taxon_rank_list)_allowed_ranksuniq      ?ZdomainZphylumclassorderZfamilyZgenusZspeciesF)	sequencestaxamodeperc_identitythreadsrank_handlesderep_prefixreturnc              
      s`  t  H}t  2}tt| |j|jt|| |d t|j}	t }
|dk rtt|jt|t|
|jt| |d t|j	d|	d 
fdd|	d< nt|jt|
 d |jd d df 
 fd	d|jd d df< t|| |
|	 |d
\}}d|krDt|t}|d j
t| gd|jd d df< W 5 Q R X W 5 Q R X ||fS )Nr   r   seqID
centroidIDc                    s    j | df S )Nr   locx)uc_clust Olib/python3.8/site-packages/rescript-2024.2.0-py3.8.egg/rescript/dereplicate.py<lambda>5       zdereplicate.<locals>.<lambda>;Taxonc                    s     t| S N)joinr	   r   sc_delimr   r   r    ?   s   )r   disable)args)tempfileZNamedTemporaryFile_vsearch_derepstrnameseek	_parse_ucr   _vsearch_cluster_sizeZ	set_indexapplyshutilZcopyfiler   _dereplicate_taxar   r
   _backfill_taxonomy)r   r   r   r   r   r   r   	out_fastaZout_ucucZclustered_seqs
derep_taxaseqs_outZsorted_rank_handlesr   )r'   r   r   dereplicate   sX     

 



    

 (r9   c                 C   s:   t | }t|}|t|kr | S ||||d   S d S r$   )r	   lenr%   )Ztaxonr   r'   Zformatted_taxonZtax_lenr   r   r   r4   P   s
    r4   c              
   C   s0   dd| d|d|dd|g
}|r$d|d< t | d S )	Nvsearchz--derep_fulllengthz--output--uc--xsize	--threadsz--derep_prefixr   r   )sequences_fpout_fasta_fp	out_uc_fpr   r   cmdr   r   r   r+   Y   s        r+   c                 C   s,   dd| d|d|d|dddd	|g}t | d S )
Nr;   z--cluster_sizez--idz--centroidsr<   z--qmaskZnoner=   r>   r?   )r@   r   rA   rB   r   rC   r   r   r   r0   f   s           r0   c                 C   sX   t j| dd td}||d ddg ddg }|d |j|d dkdf< d	d
g|_|S )N	)sepheaderZdtyper   HS   	   *r   r   )pdZread_csvobjectZisinr   columns)Zuc_fpr6   r   r   r   r/   s   s
    
r/   c              	      s  |dkr,t |d  }||d |d k }|d fdd|d< |d fdd|d< |dkr||d |d k }|ddg}||d  }t }	|	 >}
t|}tj	|d	tj
d
D ]}|jd |kr||
 qW 5 Q R X |}n||dd dd }|dkr4| fdd }n>|dkrV| fdd }n|dkrr|dd  }|}	d|j_||	fS )Nr   r   r   c                    s
    j |  S r$   r   r   r   r   r   r       r!   z#_dereplicate_taxa.<locals>.<lambda>r#   c                    s
    j |  S r$   r   r   rO   r   r   r       r!   ZcentroidtaxaZfasta)formatconstructoridc                 S   s   t | S r$   )listr   r   r   r   r       r!   Zlcac                    s     tdd | D S )Nc                 S   s   g | ]}t |qS r   r	   .0yr   r   r   
<listcomp>   s   7_dereplicate_taxa.<locals>.<lambda>.<locals>.<listcomp>)r%   r   r   r&   r   r   r       s   superc                    s     tdd | D S )Nc                 S   s   g | ]}t |qS r   rT   rU   r   r   r   rX      s   rY   )r%   r   r   r&   r   r   r       s   Zmajorityc                 S   s   t | S r$   )r   r   r   r   r   r       r!   z
Feature ID)setuniquer1   Zdrop_duplicatesunionr   openr,   skbioreadZDNAZmetadatawriteZreindexgroupbyZto_frameindexr-   )r   Zraw_seqsZ
derep_seqsr6   r'   r   Zcentroid_idsZrereplicatesZrereplicate_idsr8   r5   Zseq_fpsr7   r   )r'   r   r   r3   }   s4    



r3   )r*   ZpandasrL   r_   r2   Zq2_types.feature_datar   Z
_utilitiesr   r   r   r   r   r	   Zncbir
   Z	DataFramer,   floatintrS   boolr9   r4   r+   r0   r/   r3   r   r   r   r   <module>	   s:       9	
