U
    QÁ™bþ  ã                   @   sF   d dl mZ d dlZd dlZd dlmZ dd„ Zdd„ Zddd„Z	dS )é    )Ú
attrgetterN©ÚSequencec                  C   s    ddddddddddddg} | S )zSReturn the default error profile for deblurring
    based on illumina run data
    é   g¸…ëQ¸®?g{®Gáz”?ç{®Gáz„?ç{®Gázt?gü©ñÒMbP?gü©ñÒMb@?© )Ú
error_distr   r   ú0lib/python3.8/site-packages/deblur/deblurring.pyÚget_default_error_profile   s    
     þr   c                 C   sÊ   zdd„ | D ƒ}W n t k
r*   g }Y nX t|ƒdkrPt t¡}| d¡ dS tdd„ |D ƒƒ}tdd„ |D ƒƒ}t|ƒd	ksŒt|ƒd	kr´td
d t	t
|ƒ¡d t	t
|ƒ¡f ƒ‚t|tdƒdd}|S )a  Returns a list of Sequences

    Parameters
    ----------
    input_seqs : iterable of (str, str)
        The list of input sequences in (label, sequence) format

    Returns
    -------
    list of Sequence

    Raises
    ------
    ValueError
        If no sequences where found in `input_seqs`
        If all the sequences do not have the same length either aligned or
        unaligned.
    c                 S   s   g | ]\}}t ||ƒ‘qS r   r   )Ú.0ÚidÚseqr   r   r
   Ú
<listcomp>/   s     z!get_sequences.<locals>.<listcomp>r   z!No sequences found in fasta file!Nc                 s   s   | ]}|j V  qd S ©N)Úlength©r   Úsr   r   r
   Ú	<genexpr>9   s     z get_sequences.<locals>.<genexpr>c                 s   s   | ]}|j V  qd S r   )Úunaligned_lengthr   r   r   r
   r   :   s     r   zPNot all sequence have the same length. Aligned lengths: %s, sequence lengths: %sz, Ú	frequencyT)ÚkeyÚreverse)Ú	ExceptionÚlenÚloggingÚ	getLoggerÚ__name__ÚwarnÚsetÚ
ValueErrorÚjoinÚmapÚstrÚsortedr   )Ú
input_seqsÚseqsÚloggerZaligned_lengthsZunaligned_lengthsr   r   r
   Úget_sequences   s(    


ÿþÿr(   r   r   é   c              
   C   sê  t  t¡}|dkrtƒ }| d| ¡ t| ƒ}|dkrD| d¡ dS | dt|ƒ ¡ t	d| |d j
ƒ}t |¡| }t|ƒd }|D ]:}	|	jdkršqˆ||	j }
|
d dk r²qˆt|	j d¡ƒ}|D ]ü}|	|krÔqÆt t |	j|j¡¡}||krôqÆt|t|j d¡ƒƒ}|	jd|… }|jd|… }||k}t || d	k|| d	k¡}| ¡ }|dkr‚t t |	jd|… |jd|… ¡¡}|| }|
| }||kr¢d}n|dkr´|| }| j|8  _qÆqˆd
d„ |D ƒ}| dt|ƒ ¡ |S )a  Deblur the reads

    Parameters
    ----------
    input_seqs : iterable of (str, str)
        The list of input sequences in (label, sequence) format. The label
        should include the sequence count in the 'size=X' format.
    mean_error : float, optional
        The mean illumina error, used for original sequence estimate.
        Default: 0.005
    error_dist : list of float, optional
        A list of error probabilities. The length of the list determines the
        amount of hamming distances taken into account. Default: None, use
        the default error profile (from get_default_error_profile() )
    indel_prob : float, optional
        Indel probability (same for N indels). Default: 0.01
    indel_max : int, optional
        The maximal number of indels expected by errors. Default: 3

    Results
    -------
    list of Sequence
        The deblurred sequences

    Notes
    -----
    mean_error is used only for normalizing the peak height before deblurring.
    The array 'error_dist' represents the error distribution, where
    Xi = max frequency of error hamming. The length of this array - 1 limits
    the hamming distance taken into account, i.e. if the length if `error_dist`
    is 10, sequences up to 10 - 1 = 9 hamming distance will be taken into
    account
    NzUsing error profile %szno sequences deblurredzdeblurring %d sequencesr   r   gš™™™™™¹?ú-é   c                 S   s   g | ]}t |jƒd kr|‘qS )r   )Úroundr   r   r   r   r
   r   ½   s      zdeblur.<locals>.<listcomp>z-%d unique sequences left following deblurring)r   r   r   r   Údebugr(   r   Úinfor   Úpowr   ÚnpZarrayr   ZsequenceÚrstripZcount_nonzeroZ	not_equalZnp_sequenceÚminZ
logical_orÚsum)r%   Z
mean_errorr	   Z
indel_probZ	indel_maxr'   r&   Z
mod_factorZ
max_h_distZseq_iZnum_errZ	seq_i_lenZseq_jZh_distr   Z	sub_seq_iZ	sub_seq_jÚmaskZmut_is_indelZ
num_indelsZnum_substitutionsZcorrection_valueÚresultr   r   r
   ÚdeblurG   sf    $




ÿ
ÿ
ÿÿ

r6   )r   Nr   r)   )
Úoperatorr   Znumpyr0   r   Zdeblur.sequencer   r   r(   r6   r   r   r   r
   Ú<module>	   s   
,     þ