a
    VA(f"                     @   s   d Z ddlZddlZeddZeddZdd Zd	d
ddd	d
ddddddddddddddddZddddddd d d!d!d"d"d#d#d$d$d%Zd&d' Z	d(d) Z
d*d+ Zd,d- Zd.d/ ZdS )0z>Tools for working with files in the samtools pileup -c format.    NPileupSubstitution)

chromosomeposreference_basegenotypeconsensus_qualitysnp_qualitymapping_qualitycoverage
read_basesbase_qualitiesPileupIndel)r   r   r   r   r   r   r	   r
   Zfirst_alleleZsecond_alleleZreads_firstZreads_secondZ
reads_diffc                 c   s   t dd t t ttttt t f
}t dd t t ttttt t tttf}| D ]}|dd  }|d dkrztdd	 t||D  V  W q ty   td
| Y q0 qBztdd	 t||D  V  W qB ty   td
| Y qB0 qBdS )a/  iterate over ``samtools pileup -c`` formatted file.

    *infile* can be any iterator over a lines.

    The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
    or :class:`pysam.Pileup.PileupIndel`.

    .. note::

       The parser converts to 0-based coordinates
    c                 S   s   t | d S N   intx r   +lib/python3.9/site-packages/pysam/Pileup.py<lambda>0       ziterate.<locals>.<lambda>c                 S   s   t | d S r   r   r   r   r   r   r   2   r   N   *c                 S   s   g | ]\}}||qS r   r   .0r   yr   r   r   
<listcomp>9   r   ziterate.<locals>.<listcomp>zparsing error in line: `%s`c                 S   s   g | ]\}}||qS r   r   r   r   r   r   r   >   r   )	strr   splitr   zip	TypeErrorpysamZSamtoolsErrorr   )infileZ
conv_substZ
conv_indellinedr   r   r   iterate#   s"    
r'   ACGTUrRr   YmMkKsSwW)r(   r)   r*   r+   AACCGGTTZUUAGZGACTZTCACZCAGTZTGCGZGCATZTAr8   r9   r:   r;   r<   r=   r>   r?   r@   rA   )r(   r)   r*   r+   r-   r.   r   r/   r0   r1   r2   r3   r4   r5   r6   r7   c                 C   s   t |   S )zencode genotypes like GG, GA into a one-letter code.
    The returned code is lower case if code[0] < code[1], otherwise
    it is uppercase.
    )ENCODE_GENOTYPEuppercoder   r   r   encodeGenotype^   s    rF   c                 C   s   t |  S )z|decode single letter genotypes like m, M into two letters.
    This is the reverse operation to :meth:`encodeGenotype`.
    )DECODE_GENOTYPErD   r   r   r   decodeGenotypef   s    rH   c           	   	      s   dd  dd  fdd}g g  }}d}| D ]L}z|||\}}W n t y^   Y  qY n0 || |dkr0|| q0d	}|rt  tt|d
ksJ d|d }d|}||fS )z*translate indel from vcf to pileup format.c                 S   sL   t t| t|}t|D ]$}| | || kr| d|   S q| d| S )z'get common prefix of strings s1 and s2.Nminlenranges1s2nr   r   r   r   	getPrefixq   s
    z0translateIndelGenotypeFromVCF.<locals>.getPrefixc                 S   sp   t t| t|}| d |d kr&dS td| d dD ](}| | || kr8| |d d   S q8| | d S )z&get common sufix of strings s1 and s2.r    r   NrI   rM   r   r   r   	getSuffixy   s    z0translateIndelGenotypeFromVCF.<locals>.getSuffixc                    s  | |krdS t |t | kr|| rFd|t | d   t | d fS || rjd|d t |    dfS  || }|| }t |t | t |  }|dk rt d|t |t ||    t |d fS nt |t | k r| |rd| t |d   t |d fS | |r2d| d t |  dfS  || }|| }t |t | t | }|dk rnt d| t |t ||    t |fS ndsJ dd S )N)r   r   z-%sr   r   r   z+%szsnp?)rK   
startswithendswith
ValueError)variantrefprefixsuffixZsharedrQ   rT   r   r   getGenotype   s2    
 


, 


(z2translateIndelGenotypeFromVCF.<locals>.getGenotypeTr   Fr   zmultiple offsets for indelr   /)rW   appendrK   setjoin)	Zvcf_genotypesrY   r]   	genotypesZoffsetsZis_errorrX   goffsetr   r\   r   translateIndelGenotypeFromVCFm   s(    
)


re   c                    s>  | j }| j}| j}|g| j  | | }|d }t|dkrJtdt|  |d }|d dkrbdS  fdd|D }|d	dgd  }}| jd
dgd }	|dd}
t|dkst	dd | jD dkr
t
||\}}t||| d||||	|
|dt| dddS td|}d}d}t|||||||	|
||
S dS )z$convert vcf record to pileup record.r?   r   z%only single genotype per position, %sr   .Nc                    s    g | ]}|d kr t | qS )r^   r   r   r   Zallellesr   r   r      r   zvcf2pileup.<locals>.<listcomp>ZGQZMQZDPc                 S   s   g | ]}t |qS r   )rK   rg   r   r   r   r      r   r   <rR   )Zcontigr   rY   ZaltrK   rW   r   getinfomaxre   r   rF   ra   r   )vcfsampler   r   Z	referencedatarb   r   r   r	   r
   r   rd   r   r   r   rh   r   
vcf2pileup   sP    &
rp   c                 c   sL   t  }||  || vr&td| D ]}t||}|r.|V  q.dS )a  iterate over a vcf-formatted file.

    *infile* can be any iterator over a lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution` or
    :class:`pysam.Pileup.PileupIndel`.

    Positions without a snp will be skipped.

    This method is wasteful and written to support same legacy code
    that expects samtools pileup output.

    Better use the vcf parser directly.

    zsample %s not vcf fileN)r#   ZVCFZconnectZ
getsamplesKeyErrorZfetchrp   )r$   rn   rm   rowresultr   r   r   iterate_from_vcf   s    

rt   )__doc__collectionsr#   
namedtupler   r   r'   rB   rG   rF   rH   re   rp   rt   r   r   r   r   <module>   sD   !
Y: