U
    H$xe                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZ ddlZddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZ e ZG dd dZ G dd de!Z"G dd deZ#dd Z$G dd dZ%G dd deZ&G dd de&eZ'G dd de'Z(G dd de'Z)G dd de&Z*G dd  d e&Z+G d!d" d"eZ,G d#d$ d$e,eZ-G d%d& d&e-Z.G d'd( d(e-Z/dgfe0d)d*d+Z1G d,d- d-eZ2G d.d/ d/e2eZ3G d0d1 d1e3eZ4G d2d3 d3e4Z5G d4d5 d5e5Z6G d6d7 d7e4Z7G d8d9 d9e4Z8G d:d; d;e5Z9G d<d= d=e7Z:G d>d? d?e9Z;G d@dA dAe:Z<G dBdC dCe,Z=G dDdE dEe3Z>G dFdG dGe2Z?G dHdI dIe2eZ@G dJdK dKe@ZAG dLdM dMe@ZBdNdO ZCe	e, eeDeDf dPdQdRZEdS )Sz
Adapter finding and trimming classes

The ...Adapter classes are responsible for finding adapters.
The ...Match classes trim the reads.
    N)IntFlag)defaultdict)OptionalTupleSequenceDictAnyListUnion)ABCabstractmethod   )
KmerFinder)EndSkipAlignerPrefixComparerSuffixCompareredit_environmenthamming_sphere)create_positions_and_kmerskmer_probability_analysisc                   @   s   e Zd ZedddZdS )MockKmerFindersequencec                 C   s   dS )NT selfr   r   r   0lib/python3.8/site-packages/cutadapt/adapters.pykmers_present   s    zMockKmerFinder.kmers_presentN)__name__
__module____qualname__strr   r   r   r   r   r      s   r   c                   @   s   e Zd ZdS )InvalidCharacterN)r   r    r!   r   r   r   r   r#   !   s   r#   c                   @   s^   e Zd ZdZejejB ejB ZejejB ej	B Z
ejZejZej	ejB ZejejB ZejZdS )Wherez
    Aligner flag combinations for all adapter types.

    "REFERENCE" is the adapter sequence, "QUERY" is the read sequence
    N)r   r    r!   __doc__r   ZQUERY_STARTZ
QUERY_STOPZREFERENCE_ENDBACKZREFERENCE_STARTFRONTPREFIXSUFFIXFRONT_NOT_INTERNALBACK_NOT_INTERNALZ
SEMIGLOBALANYWHEREr   r   r   r   r$   &   s   r$   c                   C   s   t tS N)r   intr   r   r   r   returns_defaultdict_int7   s    r/   c                   @   sT   e Zd ZdZddddZdd Zedd	d
Zedd Z	e
ee
 dddZdS )EndStatisticsz!Statistics about the 5' or 3' endSingleAdapteradapterc                 C   sf   |j | _ |j| _|j| _|j| _|j| _| | _|j| _t	t
| _dddddd| _t|t| _d S )Nr   ACGT )max_error_rater   effective_lengthadapter_wildcardshas_wildcardsindelsdescriptive_identifierZadapter_typeallows_partial_matchesr   r/   errorsadjacent_bases
isinstanceFrontAdapter_remove_prefixr   r3   r   r   r   __init__A   s    

zEndStatistics.__init__c                 C   s&   dd | j  D }d| j|| jS )Nc                 S   s   i | ]\}}|t |qS r   )dict).0kvr   r   r   
<dictcomp>P   s      z*EndStatistics.__repr__.<locals>.<dictcomp>z>EndStatistics(max_error_rate={}, errors={}, adjacent_bases={}))rA   itemsformatr:   rB   )r   rA   r   r   r   __repr__O   s    zEndStatistics.__repr__otherc                 C   s   t || jstd| j|jksD| j|jksD| j|jksD| j|jkrLtddD ]}| j|  |j| 7  < qP|j	
 D ]2\}}|D ]$}| j	| |  |j	| | 7  < qqx| S )NzCannot comparez+Incompatible EndStatistics, cannot be addedr4   )rC   	__class__
ValueErrorr:   r   r;   r>   RuntimeErrorrB   rA   rM   )r   rQ   baselengthZ
error_dictrA   r   r   r   __iadd__W   s"    



$zEndStatistics.__iadd__c                 C   s   dd | j  D }|S )Nc                 S   s   i | ]\}}|t | qS r   )sumvalues)rI   rV   rA   r   r   r   rL   j   s      z)EndStatistics.lengths.<locals>.<dictcomp>)rA   rM   )r   dr   r   r   lengthsh   s    zEndStatistics.lengths)
gc_contentreturnc                 C   s   d|  krdksn t | j}| jr4|ddd }| jr>dnd}d}|g}t|D ]8\}}||krr||d 9 }n|d| d 9 }|| qT|S )a  
        Estimate probabilities that this adapter end matches a
        random sequence. Indels are not taken into account.

        Returns a list p, where p[i] is the probability that
        i bases of this adapter match a random sequence with
        GC content gc_content.
        g        g      ?NZCGRYSKMBDHVNZGCg       @)AssertionErrorr   rE   r=   	enumerateappend)r   r\   seqZallowed_basespZprobabilitiesicr   r   r   random_match_probabilitiesm   s    	z(EndStatistics.random_match_probabilitiesN)r   r    r!   r%   rG   rO   r   rW   propertyr[   floatr	   rf   r   r   r   r   r0   >   s   
r0   c                   @   sj   e Zd ZU dZeed< eed< ded< edd Zee	e
e e
e f dd	d
ZeddddZdS )AdapterStatisticsr   reverse_complementednameAdapterr3   c                 C   s   d S r-   r   r   rQ   r   r   r   rW      s    zAdapterStatistics.__iadd__r]   c                 C   s   d S r-   r   r   r   r   r   end_statistics   s    z AdapterStatistics.end_statisticsNc                 C   s   d S r-   r   r   matchr   r   r   	add_match   s    zAdapterStatistics.add_match)r   r    r!   rj   r.   __annotations__r"   r   rW   r   r   r0   rp   rs   r   r   r   r   ri      s   

 ri   c                   @   s4   e Zd ZdZddddZdd Zd dd	d
ZdS )SingleAdapterStatisticszx
    Statistics about a 5' or 3' adapter, where we only need to keep track of sequences
    removed from one "end".
    r1   r2   c                 C   s   |j | _ || _t|| _d S r-   )rk   r3   r0   endrF   r   r   r   rG      s    z SingleAdapterStatistics.__init__c                 C   s   d| j  d| j dS )NzSingleAdapterStatistics(name=z, end=))rk   rv   ro   r   r   r   rO      s    z SingleAdapterStatistics.__repr__rP   c                 C   s8   t || jstd|  j|j7  _|  j|j7  _| S NzCannot iadd)rC   rR   rS   rv   rj   rm   r   r   r   rW      s
    z SingleAdapterStatistics.__iadd__N)r   r    r!   r%   rG   rO   rW   r   r   r   r   ru      s   ru   c                   @   s8   e Zd ZddddZeee ee f dddZdS )	FrontAdapterStatisticsRemoveBeforeMatchrr   c                 C   s"   | j j|  |j  d7  < d S Nr   )rv   rA   removed_sequence_lengthrq   r   r   r   rs      s    z FrontAdapterStatistics.add_matchrn   c                 C   s
   | j d fS r-   rv   ro   r   r   r   rp      s    z%FrontAdapterStatistics.end_statisticsNr   r    r!   rs   r   r   r0   rp   r   r   r   r   ry      s   ry   c                   @   s8   e Zd ZddddZeee ee f dddZdS )	BackAdapterStatisticsRemoveAfterMatchr{   c                 C   sl   |  }| jj|  |j  d7  < z| jj|  d7  < W n( tk
rf   | jjd  d7  < Y nX d S Nr   r9   )adjacent_baserv   rA   r}   rB   KeyErrorr   rr   r   r   r   r   rs      s    zBackAdapterStatistics.add_matchrn   c                 C   s
   d | j fS r-   r~   ro   r   r   r   rp      s    z$BackAdapterStatistics.end_statisticsNr   r   r   r   r   r      s   r   c                   @   sd   e Zd ZdZddddddZdd Zd d	d
dZddddZee	e
 e	e
 f dddZdS )LinkedAdapterStatistics@
    Statistics about sequences removed by a lined adapter.
    LinkedAdapterr1   )r3   frontbackc                 C   s,   |j | _ || _t|| _t|| _d| _d S Nr   rk   r3   r0   r   r   rj   )r   r3   r   r   r   r   r   rG      s
    

z LinkedAdapterStatistics.__init__c                 C   s   d| j  d| j d| j dS )NzLinkedAdapterStatistics(name=, front=, back=rw   rk   r   r   ro   r   r   r   rO      s    z LinkedAdapterStatistics.__repr__rP   c                 C   sH   t || jstd|  j|j7  _|  j|j7  _|  j|j7  _| S rx   )rC   rR   rS   r   r   rj   rm   r   r   r   rW      s    z LinkedAdapterStatistics.__iadd__LinkedMatchr{   c                 C   s   |j r(| jj|j   |j j  d7  < |jr|j }| jj|j  |jj  d7  < z| jj|  d7  < W n( tk
r   | jjd  d7  < Y nX d S r   )	front_matchr   rA   r}   
back_matchr   r   rB   r   r   r   r   r   rs      s"    
z!LinkedAdapterStatistics.add_matchrn   c                 C   s   | j | jfS r-   r   r   ro   r   r   r   rp      s    z&LinkedAdapterStatistics.end_statisticsN)r   r    r!   r%   rG   rO   rW   rs   r   r   r0   rp   r   r   r   r   r      s   r   c                   @   sf   e Zd ZdZddddZdd Zd dd	d
Zed ddddZe	e
e e
e f dddZdS )AnywhereAdapterStatisticsr   AnywhereAdapterr2   c                 C   s,   |j | _ || _t|| _t|| _d| _d S r   r   rF   r   r   r   rG      s
    

z"AnywhereAdapterStatistics.__init__c                 C   s   d| j  d| j d| j dS )NzAnywhereAdapterStatistics(name=r   r   rw   r   ro   r   r   r   rO      s    z"AnywhereAdapterStatistics.__repr__rP   c                 C   sF   t |tstd|  j|j7  _|  j|j7  _|  j|j7  _| S )Nz
Cannot add)rC   r   rS   r   r   rj   rm   r   r   r   rW     s    
z"AnywhereAdapterStatistics.__iadd__)rz   r   N)rr   r]   c                 C   s   t |tr*| jj|  |j  d7  < nh| }| jj|  |j  d7  < z| jj|  d7  < W n( tk
r   | jjd  d7  < Y nX d S r   )	rC   rz   r   rA   r}   r   r   rB   r   r   r   r   r   rs     s    
 z#AnywhereAdapterStatistics.add_matchrn   c                 C   s   | j | jfS r-   r   ro   r   r   r   rp     s    z(AnywhereAdapterStatistics.end_statistics)r   r    r!   r%   rG   rO   rW   r
   rs   r   r   r0   rp   r   r   r   r   r      s   r   c                   @   sx   e Zd ZU ded< eeeef dddZeeeef dddZee	e	 ddd	Z
ed
d Zedd ZdS )Matchrl   r3   rn   c                 C   s   d S r-   r   ro   r   r   r   remainder_interval  s    zMatch.remainder_intervalc                 C   s   d S r-   r   ro   r   r   r   retained_adapter_interval"  s    zMatch.retained_adapter_intervalc                 C   s   d S r-   r   r   readr   r   r   get_info_records&  s    zMatch.get_info_recordsc                 C   s   d S r-   r   r   r   r   r   trimmed*  s    zMatch.trimmedc                 C   s   d S r-   r   ro   r   r   r   match_sequence.  s    zMatch.match_sequenceN)r   r    r!   rt   r   r   r.   r   r   r	   r   r   r   r   r   r   r   r     s   

r   c                
   @   s   e Zd ZdZdddddddd	d
dg
ZeeeeeededddZdd Ze	dddZ
d eedddZee dddZdd ZeedddZdS )!SingleMatchzG
    Representation of a single adapter matched to a single string
    astartastoprstartrstopscorerA   r3   r   rV   r   r1   r   r   r   r   r   rA   r3   r   c	           	      C   s>   || _ || _|| _|| _|| _|| _|| _|| _|| | _d S r-   )	r   r   r   r   r   rA   r3   r   rV   )	r   r   r   r   r   r   rA   r3   r   r   r   r   rG   E  s    zSingleMatch.__init__c                 C   s>   | j j d| j d| j d| j d| j d| j d| j dS )Nz(astart=z, astop=z	, rstart=z, rstop=z, score=z	, errors=rw   )rR   r   r   r   r   r   r   rA   ro   r   r   r   rO   ]  s    <zSingleMatch.__repr__rn   c                 C   sl   |j | j koj| j|jkoj| j|jkoj| j|jkoj| j|jkoj| j|jkoj| j|jkoj| j|jkoj| j|jkS r-   )	rR   r   r   r   r   r   rA   r3   r   rm   r   r   r   __eq__d  s"    







zSingleMatch.__eq__N)wildcard_charr]   c                    s$    fddt  jD }d|S )a4  
        Return a string that contains, for each wildcard character,
        the character that it matches. For example, if the adapter
        ATNGNA matches ATCGTA, then the string 'CT' is returned.

        If there are indels, this is not reliable as the full alignment
        is not available.
        c                    sF   g | ]>} j j j|  kr j| t jk r j j|  qS r   )r3   r   r   r   len)rI   rd   r   r   r   r   
<listcomp>z  s   z)SingleMatch.wildcards.<locals>.<listcomp>r9   )rangerV   join)r   r   	wildcardsr   r   r   r   q  s    	zSingleMatch.wildcardsc              	   C   s   |j }|j}d| j| j| j|d| j || j| j || jd  | jjg}|r~||d| j || j| j || jd  g7 }n|dddg7 }|gS )Nr9   r   )r   	qualitiesrA   r   r   r3   rk   )r   r   rb   r   infor   r   r   r     s&    
zSingleMatch.get_info_recordsc                 C   s   | j | j| j S r-   )r   r   r   ro   r   r   r   r     s    zSingleMatch.match_sequencec                 C   s   d S r-   r   ro   r   r   r   r}     s    z#SingleMatch.removed_sequence_lengthN)r   )r   r    r!   r%   	__slots__r.   r"   rG   rO   boolr   r   r	   r   r   r   r}   r   r   r   r   r   3  s8   r   c                   @   sh   e Zd ZdZedddZeeef dddZeeef dddZ	d	d
 Z
dd ZedddZdS )rz   z.A match that removes sequence before the matchrn   c                 C   s   | j d| j S z
        Return the part of the read before this match if this is a
        'front' (5') adapter,
        return the part after the match if this is not a 'front' adapter (3').
        This can be an empty string.
        Nr   r   ro   r   r   r   rest  s    zRemoveBeforeMatch.restc                 C   s   | j t| jfS )
        Return an interval (start, stop) that describes the part of the read that would
        remain after trimming
        )r   r   r   ro   r   r   r   r     s    z$RemoveBeforeMatch.remainder_intervalc                 C   s   | j t| jfS r-   )r   r   r   ro   r   r   r   r     s    z+RemoveBeforeMatch.retained_adapter_intervalc                 C   s   t | jd S r-   )slicer   ro   r   r   r   
trim_slice  s    zRemoveBeforeMatch.trim_slicec                 C   s   || j d  S r-   r   r   r   r   r   r     s    zRemoveBeforeMatch.trimmedc                 C   s   | j S r-   r   ro   r   r   r   r}     s    z)RemoveBeforeMatch.removed_sequence_lengthN)r   r    r!   r%   r"   r   r   r.   r   r   r   r   r}   r   r   r   r   rz     s   	rz   c                   @   sv   e Zd ZdZedddZeeef dddZeeef dddZ	d	d
 Z
dd ZedddZedddZdS )r   z-A match that removes sequence after the matchrn   c                 C   s   | j | jd S r   )r   r   ro   r   r   r   r     s    zRemoveAfterMatch.restc                 C   s
   d| j fS )r   r   r   ro   r   r   r   r     s    z#RemoveAfterMatch.remainder_intervalc                 C   s
   d| j fS r   r   ro   r   r   r   r     s    z*RemoveAfterMatch.retained_adapter_intervalc                 C   s   t d | jS r-   )r   r   ro   r   r   r   r     s    zRemoveAfterMatch.trim_slicec                 C   s   |d | j  S r-   r   r   r   r   r   r     s    zRemoveAfterMatch.trimmedc                 C   s   | j | jd | j S r|   r   ro   r   r   r   r     s    zRemoveAfterMatch.adjacent_basec                 C   s   t | j| j S r-   )r   r   r   ro   r   r   r   r}     s    z(RemoveAfterMatch.removed_sequence_lengthN)r   r    r!   r%   r"   r   r   r.   r   r   r   r   r   r}   r   r   r   r   r     s   	r   rn   c                 C   s    t | d }| d  d7  < |S )Nr   r   )r"   )Z_startrk   r   r   r   _generate_adapter_name  s    r   c                   @   s@   e Zd ZdZee dddZedd Zeeddd	Z	d
S )	Matchablez'Something that has a match_to() method.rk   c                 O   s
   || _ d S r-   r   )r   rk   argskwargsr   r   r   rG     s    zMatchable.__init__c                 C   s   d S r-   r   ro   r   r   r   enable_debug  s    zMatchable.enable_debugr   c                 C   s   d S r-   r   r   r   r   r   match_to  s    zMatchable.match_toN)
r   r    r!   r%   r   r"   rG   r   r   r   r   r   r   r   r     s   
r   c                   @   sF   e Zd ZdZeedddZeedddZeedddZ	d	S )
rl   zadapter with one componentrn   c                 C   s   dS )z,Return string representation of this adapterNr   ro   r   r   r   spec  s    zAdapter.specc                 C   s   d S r-   r   ro   r   r   r   create_statistics  s    zAdapter.create_statisticsc                 C   s   d S r-   r   ro   r   r   r   r?     s    zAdapter.descriptive_identifierN)
r   r    r!   descriptionr   r"   r   ri   r   r?   r   r   r   r   rl     s   rl   c                	       s   e Zd ZU dZdZeed< d!eee	eee
e ed fd	d
Zee	edddZd"eeeeedddZdd Zee	dddZddddZedd Zedd ZeedddZe	ddd Z  ZS )#r1   a  
    This class is used to find a single adapter characterized by sequence, error rate,
    type etc. within reads.

    Arguments:
        sequence (str): The adapter sequence. Will be converted to uppercase.
            Also, Us will be converted to Ts.

        max_errors: Maximum allowed errors (non-negative float). If the values is less than 1, this
            is interpreted as a rate and passed to the aligner. If it is 1 or greater, the value
            is converted to a rate by dividing it by the number of non-N characters in the sequence.

        The error rate is the number of errors in the alignment divided by the length
        of the part of the alignment that matches the adapter.

        min_overlap: Report a match only if at least this number of bases of the adapter are
            aligned to the read.

        read_wildcards: Whether IUPAC wildcards in the read are allowed.

        adapter_wildcards: Whether IUPAC wildcards in the adapter are allowed.

        name: Optional name of the adapter. If not provided, the name is set to a
            unique number.

        indels: Whether indels are allowed in the alignment.
    Tr@   皙?   FN)r   
max_errorsmin_overlapread_wildcardsr<   rk   r>   c           
         s$  |d krt  n|| _t | j d| _| dddd| _| jsPtd|dkr| j	dt
| jkr|t
| j| j	d  }|| _t|t
| j| _td}|rt| j|ks| jD ]$}	|	|krtd	|	 d
| j dq|ot| jtdk | _|| _|| _|  | _|  | _d S )NFUr8   Ir   zAdapter sequence is emptyr   ZABCDGHKMNRSTUVWXYzCharacter 'z' in adapter sequence 'zF' is not a valid IUPAC code. Use only characters 'ABCDGHIKMNRSTUVWXY'.ZACGT)r   rk   superrG   _debugupperreplacer   rS   countr   r:   minr   	frozensetsetr#   r<   r   r>   _aligneraligner_kmer_finderkmer_finder)
r   r   r   r   r   r<   rk   r>   Ziupacre   rR   r   r   rG   +  s4    


zSingleAdapter.__init__)r   flagsr]   c              	   C   s,   | j r
dnd}t|| j|| j| j|| jdS )Nr   i )r   wildcard_refwildcard_query
indel_costr   )r>   r   r:   r<   r   r   )r   r   r   r   r   r   r   _make_alignerP  s    zSingleAdapter._make_aligner)r   back_adapterfront_adapterinternalr]   c                 C   s8   t || j| j|||}| jr(tt| t|| j| jS r-   )	r   r   r:   r   printr   r   r<   r   )r   r   r   r   r   Zpositions_and_kmersr   r   r   _make_kmer_finder_  s      zSingleAdapter._make_kmer_finderc                 C   s   dj f d| jjit| S )Nz<{cls}(name={name!r}, sequence={sequence!r}, max_error_rate={max_error_rate}, min_overlap={min_overlap}, read_wildcards={read_wildcards}, adapter_wildcards={adapter_wildcards}, indels={indels})>cls)rN   rR   r   varsro   r   r   r   rO   t  s    zSingleAdapter.__repr__rn   c                 C   s   | j jS r-   )r   r;   ro   r   r   r   r;   }  s    zSingleAdapter.effective_lengthc                 C   s   d| _ | j  dS )zg
        Print out the dynamic programming matrix after matching a read to an
        adapter.
        TN)r   r   r   ro   r   r   r   r     s    zSingleAdapter.enable_debugc                 C   s   d S r-   r   ro   r   r   r   r     s    zSingleAdapter._alignerc                 C   s   d S r-   r   ro   r   r   r   r     s    zSingleAdapter._kmer_finderr   c                 C   s   dS )
        Attempt to match this adapter to the given string.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        Nr   r   r   r   r   r     s    zSingleAdapter.match_toc                 C   s
   t | jS r-   )r   r   ro   r   r   r   __len__  s    zSingleAdapter.__len__)r   r   FTNT)T)r   r    r!   r%   r@   r   rt   r"   rh   r.   r   rG   r   r   r   r   rO   rg   r;   r   r   r   r   r   r   __classcell__r   r   r   r   r1     sJ   
      % 	

	r1   c                       sr   e Zd ZdZdZ fddZedddZeddd	Z	d
d Z
edddZedddZedddZ  ZS )rD   zA 5' adapterz
regular 5'c                    s    | dd| _t j|| d S NZforce_anywhereFpop_force_anywherer   rG   r   r   r   r   r   r   rG     s    zFrontAdapter.__init__rn   c                 C   s   dS )NZregular_five_primer   ro   r   r   r   r?     s    z#FrontAdapter.descriptive_identifierc                 C   s    |  | j| jrtjjntjjS r-   )r   r   r   r$   r,   valuer'   ro   r   r   r   r     s    zFrontAdapter._alignerc                 C   s   | j | j| jddS NTr   r   r   r   r   ro   r   r   r   r     s
      zFrontAdapter._kmer_finderr   c                 C   sJ   | j |sdS | j|}| jr.t| jj |dkr:dS t|| |dS 
        Attempt to match this adapter to the given read.

        Return a Match instance if a match was found;
        return None if no match was found given the matching criteria (minimum
        overlap length, maximum error rate).
        Nr3   r   )r   r   r   locater   r   dpmatrixrz   r   r   	alignmentr   r   r   r     s    zFrontAdapter.match_toc                 C   s   | j  dS N...r   ro   r   r   r   r     s    zFrontAdapter.specc                 C   s   t | S r-   )ry   ro   r   r   r   r     s    zFrontAdapter.create_statistics)r   r    r!   r%   r   rG   r"   r?   r   r   r   r   r   ry   r   r   r   r   r   r   rD     s   rD   c                   @   sT   e Zd ZdZdZedddZedddZdd	 Z	ed
ddZ
edddZdS )RightmostFrontAdapterz+A 5' adapter that prefers rightmost matcheszrightmost 5'rn   c                 C   s   dS )NZrightmost_five_primer   ro   r   r   r   r?     s    z,RightmostFrontAdapter.descriptive_identifierc                 C   s.   |  | jd d d | jr tjjntjj}|S )Nr^   r   r   r   r$   r,   r   r&   )r   r   r   r   r   r     s
    zRightmostFrontAdapter._alignerc                 C   s"   | j | jd d d d| jd}|S )Nr^   Tr   r   )r   r   r   r   r   r     s      z"RightmostFrontAdapter._kmer_finderr   c           
      C   s   |ddd }| j |sdS | j|}| jr<t| jj |dkrHdS |\}}}}}}	t| j| t| j| t|| t|| ||	f}t	|| |dS )r   Nr^   r   )
r   r   r   r   r   r   r   r   r   rz   )
r   r   Zreversed_sequencer   Z	ref_startZref_endZquery_startZ	query_endr   rA   r   r   r   r     s&    

zRightmostFrontAdapter.match_toc                 C   s   | j  dS )Nz...;rightmostr   ro   r   r   r   r     s    zRightmostFrontAdapter.specN)r   r    r!   r%   r   r"   r?   r   r   r   r   r   r   r   r   r   r     s   r   c                       sl   e Zd ZdZdZ fddZedddZdd	 Zd
d Z	edddZ
edddZedddZ  ZS )BackAdapterzA 3' adapterz
regular 3'c                    s    | dd| _t j|| d S r   r   r   r   r   r   rG     s    zBackAdapter.__init__rn   c                 C   s   dS )NZregular_three_primer   ro   r   r   r   r?     s    z"BackAdapter.descriptive_identifierc                 C   s    |  | j| jrtjjntjjS r-   r   ro   r   r   r   r     s    zBackAdapter._alignerc                 C   s   | j | jd| jdS r   r   ro   r   r   r   r     s
      zBackAdapter._kmer_finderr   c                 C   sJ   | j |sdS | j|}| jr.t| jj |dkr:dS t|| |dS r   )r   r   r   r   r   r   r   r   r   r   r   r   r   "  s    zBackAdapter.match_toc                 C   s   | j  S r-   r   ro   r   r   r   r   5  s    zBackAdapter.specc                 C   s   t | S r-   )r   ro   r   r   r   r   8  s    zBackAdapter.create_statistics)r   r    r!   r%   r   rG   r"   r?   r   r   r   r   r   r   r   r   r   r   r   r     s   r   c                   @   s\   e Zd ZdZdZedddZdd Zdd	 Zed
ddZ	edddZ
edddZdS )r   z
    An adapter that can be 5' or 3'. If a match involves the first base of
    the read, it is assumed to be a 5' adapter and a 3' otherwise.
    zvariable 5'/3'rn   c                 C   s   dS )NZanywherer   ro   r   r   r   r?   D  s    z&AnywhereAdapter.descriptive_identifierc                 C   s   |  | jtjjS r-   )r   r   r$   r,   r   ro   r   r   r   r   G  s    zAnywhereAdapter._alignerc                 C   s   | j | jdddS r   )r   r   ro   r   r   r   r   J  s
      zAnywhereAdapter._kmer_finderr   c                 C   sp   | j |sdS | j| }| jr2t| jj |dkr>dS |d dkr\t|| |d}nt	|| |d}|S )r   N   r   r   )
r   r   r   r   r   r   r   r   rz   r   )r   r   r   rr   r   r   r   r   O  s    zAnywhereAdapter.match_toc                 C   s   d| j  dS r   r   ro   r   r   r   r   e  s    zAnywhereAdapter.specc                 C   s   t | S r-   )r   ro   r   r   r   r   h  s    z!AnywhereAdapter.create_statisticsN)r   r    r!   r%   r   r"   r?   r   r   r   r   r   r   r   r   r   r   r   <  s   r   c                   @   sN   e Zd ZdZdZedddZdd Zdd	 Zed
ddZ	edddZ
dS )NonInternalFrontAdapterzA non-internal 5' adapterznon-internal 5'rn   c                 C   s   dS )NZnoninternal_five_primer   ro   r   r   r   r?   q  s    z.NonInternalFrontAdapter.descriptive_identifierc                 C   s   |  | jtjjS r-   )r   r   r$   r*   r   ro   r   r   r   r   t  s    z NonInternalFrontAdapter._alignerc                 C   s   | j | jd| jddS )NTF)r   r   r   r   ro   r   r   r   r   w  s    z$NonInternalFrontAdapter._kmer_finderr   c                 C   sd   | j |sd S | j|}| jrHzt| jj W n tk
rF   Y nX |d krTd S t|| |dS Nr   )	r   r   r   r   r   r   r   AttributeErrorrz   r   r   r   r   r     s    z NonInternalFrontAdapter.match_toc                 C   s   d| j  dS )NXr   r   ro   r   r   r   r     s    zNonInternalFrontAdapter.specNr   r    r!   r%   r   r"   r?   r   r   r   r   r   r   r   r   r   l  s   r   c                   @   sN   e Zd ZdZdZedddZdd Zdd	 Zed
ddZ	edddZ
dS )NonInternalBackAdapterzA non-internal 3' adapterznon-internal 3'rn   c                 C   s   dS )NZnoninternal_three_primer   ro   r   r   r   r?     s    z-NonInternalBackAdapter.descriptive_identifierc                 C   s   |  | jtjjS r-   )r   r   r$   r+   r   ro   r   r   r   r     s    zNonInternalBackAdapter._alignerc                 C   s   | j | jd| jddS )NTF)r   r   r   r   ro   r   r   r   r     s    z#NonInternalBackAdapter._kmer_finderr   c                 C   sd   | j |sd S | j|}| jrHzt| jj W n tk
rF   Y nX |d krTd S t|| |dS r   )	r   r   r   r   r   r   r   r   r   r   r   r   r   r     s    zNonInternalBackAdapter.match_toc                 C   s   | j  dS )Nr   r   ro   r   r   r   r     s    zNonInternalBackAdapter.specNr  r   r   r   r   r    s   r  c                       s^   e Zd ZdZdZdZed fddZeddd	Zd
d Z	 fddZ
edddZ  ZS )PrefixAdapterzAn anchored 5' adapterzanchored 5'Fr   c                    s$   t ||d< t j|f|| d S Nr   r   r   rG   r   r   r   r   r   r   r   rG     s    zPrefixAdapter.__init__rn   c                 C   s   dS )NZanchored_five_primer   ro   r   r   r   r?     s    z$PrefixAdapter.descriptive_identifierc                 C   s8   | j s"t| j| j| j| j| jdS | | jtj	j
S d S N)r   r   r   )r>   r   r   r:   r<   r   r   r   r$   r(   r   ro   r   r   r   r     s    zPrefixAdapter._alignerc                    s    t | jtrt S t  S d S r-   )rC   r   r   r   r   r   ro   r   r   r   r     s    zPrefixAdapter._kmer_finderc                 C   s   d| j  dS )N^r   r   ro   r   r   r   r     s    zPrefixAdapter.specr   r    r!   r%   r   r@   r"   rG   r?   r   r   r   r   r   r   r   r   r    s   r  c                       s^   e Zd ZdZdZdZed fddZeddd	Zd
d Z	 fddZ
edddZ  ZS )SuffixAdapterzAn anchored 3' adapterzanchored 3'Fr   c                    s$   t ||d< t j|f|| d S r  r  r  r   r   r   rG     s    zSuffixAdapter.__init__rn   c                 C   s   dS )NZanchored_three_primer   ro   r   r   r   r?     s    z$SuffixAdapter.descriptive_identifierc                 C   s8   | j s"t| j| j| j| j| jdS | | jtj	j
S d S r  )r>   r   r   r:   r<   r   r   r   r$   r)   r   ro   r   r   r   r     s    zSuffixAdapter._alignerc                    s    t | jtrt S t  S d S r-   )rC   r   r   r   r   r   ro   r   r   r   r     s    zSuffixAdapter._kmer_finderc                 C   s   | j  dS )N$r   ro   r   r   r   r     s    zSuffixAdapter.specr	  r   r   r   r   r
    s   r
  c                   @   s   e Zd ZdZeeddddZdd Zedd	 Z	ed
d Z
dd Zeeef dddZeeef dddZee dddZdd ZdS )r   z.
    Represent a match of a LinkedAdapter
    r   )r   r   r3   c                 C   s*   |d k	s|d k	st || _|| _|| _d S r-   )r_   r   r   r3   )r   r   r   r3   r   r   r   rG     s    zLinkedMatch.__init__c                 C   s   d | j| j| jS )Nz:<LinkedMatch(front_match={!r}, back_match={}, adapter={})>)rN   r   r   r3   ro   r   r   r   rO     s
      zLinkedMatch.__repr__c                 C   s4   d}| j dk	r|| j j7 }| jdk	r0|| jj7 }|S )zNumber of matching basesr   N)r   r   r   )r   sr   r   r   r     s    

zLinkedMatch.scorec                 C   s4   d}| j d k	r|| j j7 }| jd k	r0|| jj7 }|S r   )r   rA   r   )r   er   r   r   rA     s    

zLinkedMatch.errorsc                 C   s(   | j r| j |}| jr$| j|}|S r-   )r   r   r   r   r   r   r   r   (  s
    zLinkedMatch.trimmedrn   c                 C   s   dd | j | jfD }t|S )Nc                 S   s   g | ]}|d k	r|qS r-   r   )rI   rr   r   r   r   r   0  s     z2LinkedMatch.remainder_interval.<locals>.<listcomp>)r   r   	remainder)r   matchesr   r   r   r   /  s    
zLinkedMatch.remainder_intervalc                 C   sH   | j r| j j}| j j}nd }}| jr4| jj| }nt| j j}||fS r   )r   r   r   r   r   r   )r   startoffsetrv   r   r   r   r   5  s    
z%LinkedMatch.retained_adapter_intervalc                 C   sr   g }| j df| jdffD ]T\}}|d kr*q||d }| jjd krHdn| jj| |d< || ||}q|S )Nz;1z;2r   Znone   )r   r   r   r3   rk   ra   r   )r   r   Zrecordsrr   Z
namesuffixrecordr   r   r   r   A  s    
zLinkedMatch.get_info_recordsc                 C   s,   | j r| j  ndd | jr&| j nd S )Nr9   ,)r   r   r   ro   r   r   r   r   Q  s    zLinkedMatch.match_sequenceN)r   r    r!   r%   rz   r   rG   rO   rg   r   rA   r   r   r.   r   r   r	   r   r   r   r   r   r   r      s   
	
r   c                       s   e Zd ZdZdZeeeeee d fddZ	edddZ
d	d
 Zeee dddZedddZedd Zedd ZedddZ  ZS )r   z'A 5' adapter combined with a 3' adapterlinked)r   r   front_requiredback_requiredrk   c                    sL   t  | || _|| _d| _|d kr,t n|| _|| _| j| j_|| _d S Nr  )	r   rG   r  r  wherer   rk   r   r   )r   r   r   r  r  rk   r   r   r   rG   ^  s    
zLinkedAdapter.__init__rn   c                 C   s   dS r  r   ro   r   r   r   r?   q  s    z$LinkedAdapter.descriptive_identifierc                 C   s   | j   | j  d S r-   )r   r   r   ro   r   r   r   r   t  s    
zLinkedAdapter.enable_debugr   r]   c                 C   sd   | j |}| jr|dkrdS |dk	r2||  }| j|}|dkrX| jsT|dkrXdS t||| S )z@
        Match the two linked adapters against a string
        N)r   r   r  r   r   r  r   )r   r   r   r   r   r   r   r   x  s    zLinkedAdapter.match_toc                 C   s   t | | j| jdS )Nr   )r   r   r   ro   r   r   r   r     s
      zLinkedAdapter.create_statisticsc                 C   s   | j jd | jj S r   )r   r   r   ro   r   r   r   r     s    zLinkedAdapter.sequencec                 C   s   d S r-   r   ro   r   r   r   remove  s    zLinkedAdapter.removec                 C   s   | j   d| j  S r   )r   r   r   ro   r   r   r   r     s    zLinkedAdapter.spec)r   r    r!   r%   r   r1   r   r   r"   rG   r?   r   r   r   r   r   rg   r   r  r   r   r   r   r   r   r   Y  s"   

r   c                       sV   e Zd ZdZee d fddZdd Zdd Zd	d
 Z	e
ee dddZ  ZS )MultipleAdaptersz-
    Represent multiple adapters at once
    )adaptersc                    s   t  jdd || _d S )NZmultiple_adaptersr   )r   rG   	_adapters)r   r  r   r   r   rG     s    zMultipleAdapters.__init__c                 C   s   | j D ]}|  qd S r-   )r  r   )r   ar   r   r   r     s    
zMultipleAdapters.enable_debugc                 C   s
   | j | S r-   )r  )r   itemr   r   r   __getitem__  s    zMultipleAdapters.__getitem__c                 C   s
   t | jS r-   )r   r  ro   r   r   r   r     s    zMultipleAdapters.__len__r  c                 C   sX   d}| j D ]H}||}|dkr"q
|dksN|j|jksN|j|jkr
|j|jk r
|}q
|S )z
        Find the adapter that best matches the sequence.

        Return either a Match instance or None if there are no matches.
        N)r  r   r   rA   )r   r   Z
best_matchr3   rr   r   r   r   r     s    




zMultipleAdapters.match_to)r   r    r!   r%   r   r   rG   r   r!  r   r"   r   r   r   r   r   r   r   r   r    s   r  c                       s   e Zd ZdZeeeeeef f Z	 fddZ
dd ZedddZed	d
 ZeedddZedd Zedd Zeee df dddZedd ZedddZedddZdd Zdd Z  ZS )IndexedAdaptersa  
    Represent multiple adapters of the same type at once and use an index data structure
    to speed up matching. This acts like a "normal" Adapter as it provides a match_to
    method, but is faster with lots of adapters.

    There are quite a few restrictions:
    - the error rate allows at most 2 mismatches
    - wildcards in the adapter are not allowed
    - wildcards in the read are not allowed

    Use the is_acceptable() method to check individual adapters.
    c                    s   t  jdd |std|D ]}| | q|| _t|| _|  \| _| _	t
dt| jdd t| jdkr| jd | _| j| _n| j| _|  | _d	S )
z+All given adapters must be of the same typeZindexed_adaptersr   zAdapter list is emptyzString lengths in the index: %sTreverser   r   N)r   rG   rS   _acceptr  r  Z_multiple_adapters_make_index_lengths_indexloggerdebugsortedr   _length_match_to_one_lengthr   _match_to_multiple_lengths_get_make_affix_make_affix)r   r  r3   r   r   r   rG     s"    
 
zIndexedAdapters.__init__c                 C   s   | j j d| jdS )Nz
(adapters=rw   )rR   r   r  ro   r   r   r   rO     s    zIndexedAdapters.__repr__r   c                 C   s   dS )z4Never called because it gets overwritten in __init__Nr   r   r   r   r   r     s    zIndexedAdapters.match_toc                 C   s   d S r-   r   ro   r   r   r   r/    s    zIndexedAdapters._get_make_affixrn   c                 C   s   d S r-   r   )r   r3   rV   r  rA   r   r   r   r   _make_match  s    zIndexedAdapters._make_matchc                 C   sB   |j rtd|jrtdtt||j }|dkr>tddS )z3Raise a ValueError if the adapter is not acceptablez#Wildcards in the read not supportedz&Wildcards in the adapter not supportedr   zError rate too highN)r   rS   r<   r.   r   r:   )r   r3   rJ   r   r   r   r%    s    zIndexedAdapters._acceptc                 C   s*   z|  | W n tk
r$   Y dS X dS )z
        Return whether this adapter is acceptable for being used in an index

        Adapters are not acceptable if they allow wildcards, allow too many errors,
        or would lead to a very large index.
        FT)r%  rS   r   r3   r   r   r   is_acceptable  s
    zIndexedAdapters.is_acceptableAdapterIndexc              
   C   s  t   }tdt| j t }t }d}| jD ],}|j}t|j	t| }|j
rt||D ]h\}}	}
||kr|| \}}}|
|k rq^||
kr|s| |||||
 d}||	|
f||< |t| q^q0t|}t|d D ]t}	t||	D ]d}||	 }
||krB|| \}}}|
|k rq||
krB|sB| |||||
 d}||	|
f||< qq|| q0t   | }tdt|| t|dd|fS )Nz!Building index of %s adapters ...FTr   z/Built an index containing %s strings in %.1f s.r#  )timer)  r   r   r  rH   r   r   r.   r:   r>   r   _warn_similaraddr   r   r+  )r   Z
start_timeindexr[   Z
has_warnedr3   r   rJ   r  rA   r  other_adapterZother_errorsZother_matchesnelapsedr   r   r   r&    sZ    

      zIndexedAdapters._make_indexc              
   C   s$   t d|j|j| j| j||| d S )NzAdapters %s %r and %s %r are very similar. At %s allowed errors, the sequence %r cannot be assigned uniquely because the number of matches is %s compared to both adapters.)r)  warningrk   r   )r3   r9  rJ   r  r  r   r   r   r6  9  s    zIndexedAdapters._warn_similarc                 C   s|   |  | | j}d|kr<| |}|dkr0dS |\}}}n,z| j| \}}}W n tk
rf   Y dS X | || j|||S )z
        Match a query string against all adapters and return a Match that represents
        the best match or None if no match was found
        r   N)r0  r   r,  _lookup_with_nr(  r   r1  )r   r   affixresultr3   r  mr   r   r   r-  H  s    
z$IndexedAdapters._match_to_one_lengthc              	   C   s   |  }d}d}d}d}| jD ]}||k r. q| ||}d|krb| |}|dkrVq|\}	}
}n.z| j| \}	}
}W n tk
r   Y qY nX ||ks||kr|
|k r|	}|
}|}|}q|dkrdS | |||||S dS )z
        Match the adapters against a string and return a Match that represents
        the best match or None if no match was found
        Nr   r^   i  r   )r   r'  r0  r=  r(  r   r1  )r   r   r>  Zbest_adapterZbest_lengthZbest_mZbest_erV   r?  r3   r  r@  r   r   r   r.  Z  s4    


z*IndexedAdapters._match_to_multiple_lengthsc                 C   s^   | dd}z| j| }W n tk
r0   Y d S X |d }||}|d krPd S ||j|jfS )Nr   r5   r   )r   r(  r   r   rA   r   )r   r>  Zaffix_without_nr?  r3   rr   r   r   r   r=    s    
zIndexedAdapters._lookup_with_nc                 C   s   d S r-   r   ro   r   r   r   r     s    zIndexedAdapters.enable_debug)r   r    r!   r%   r   r"   r   r1   r.   r4  rG   rO   r   r   r/  r   r1  classmethodr%  r3  r	   r&  staticmethodr6  r-  r.  r=  r   r   r   r   r   r   r"    s(   



,
+r"  c                       s<   e Zd Ze fddZdd Zdd Zedd Z  Z	S )	IndexedPrefixAdaptersc                    s   t |tstdt |S )Nz%Only 5' anchored adapters are allowed)rC   r  rS   r   r%  r2  r   r   r   r%    s    
zIndexedPrefixAdapters._acceptc              
   C   s   t dt|jd|||||dS Nr   r   )rz   r   r   r   r3   rV   r   rA   r   r   r   r   r1    s    z!IndexedPrefixAdapters._make_matchc                 C   s   | j S r-   )_make_prefixro   r   r   r   r/    s    z%IndexedPrefixAdapters._get_make_affixc                 C   s   | d | S r-   r   r  r:  r   r   r   rF    s    z"IndexedPrefixAdapters._make_prefix)
r   r    r!   rA  r%  r1  r/  rB  rF  r   r   r   r   r   rC    s   rC  c                       s<   e Zd Ze fddZdd Zdd Zedd Z  Z	S )	IndexedSuffixAdaptersc                    s   t |tstdt |S )Nz%Only anchored 3' adapters are allowed)rC   r
  rS   r   r%  r2  r   r   r   r%    s    
zIndexedSuffixAdapters._acceptc              
   C   s*   t dt|jt|| t|||||dS rD  )r   r   r   rE  r   r   r   r1    s    
z!IndexedSuffixAdapters._make_matchc                 C   s   | j S r-   )_make_suffixro   r   r   r   r/    s    z%IndexedSuffixAdapters._get_make_affixc                 C   s   | | d  S r-   r   rG  r   r   r   rI    s    z"IndexedSuffixAdapters._make_suffix)
r   r    r!   rA  r%  r1  r/  rB  rI  r   r   r   r   r   rH    s   rH  c                 C   sD   t  }| D ]4}|j|jf}||kr4td|j|j |j||< q
d S )NzZAdapter %r (%s) was specified multiple times! Please make sure that this is what you want.)rH   rR   r   r)  r<  r   rk   )r  rZ   r3   keyr   r   r   warn_duplicate_adapters  s    rK  )r  r]   c                 C   sB   | st dd}| D ]}| \}}||7 }q|| }||| fS )z
    Determine which section of the read would not be trimmed. Return a tuple (start, stop)
    that gives the interval of the untrimmed part relative to the original read.

    matches must be non-empty
    zmatches must not be emptyr   )rS   r   )r  r  rr   Zmatch_startZ
match_stoprV   r   r   r   r    s    
r  )Fr%   Zloggingenumr   collectionsr   typingr   r   r   r   r   r	   r
   abcr   r   r5  r   r   Zalignr   r   r   r   r   r   Zkmer_heuristicr   r   Z	getLoggerr)  r   	Exceptionr#   r$   r/   r0   ri   ru   ry   r   r   r   r   r   rz   r   r"   r   r   rl   r1   rD   r   r   r   r   r  r  r
  r   r   r  r"  rC  rH  rK  r.   r  r   r   r   r   <module>   s^   $ I0'o!$ 1;10%%%%Y>+ Z