U
    7gb;                     @   s4   d Z ddlmZ ddlZdd Zdd Zd	d
 ZdS )z
Utilities for VCF files.
   )
BgzfBlocks    Nc                 C   s   | d krdS t | d}| d d }|dkrN|dd tj|dddS |d	krv|d td
d t|D S t | d}|dtj |	 S d S )Nr   rb   s      little)	byteorders   c                 S   s   g | ]}|d  qS )    .0ir   r   (lib/python3.8/site-packages/vcf/utils.py
<listcomp>   s     z)get_uncompressed_size.<locals>.<listcomp>)
openreadseekint
from_bytessumr   osSEEK_ENDtell)filepathZdeviceZmagic_4bytesr   r   r   get_uncompressed_size   s    


r   c               	   /   sF  d|kr|d  ndd  g | D ]6}z t| W q" tk
rV    d Y q"X q"dtdd D rBt fdd	tD }fd
dt| D }t|rt|nt| t	fddt|
 D fddttD V  D ]:}zt| | |< W n tk
r:   d|< Y nX qq^dS )a  
    Simultaneously iteratate over two or more VCF readers. For each 
    genomic position with a variant, return a list of size equal to the number 
    of VCF readers. This list contains the VCF record from readers that have
    this variant, and None for readers that don't have it. 
    The caller must make sure that inputs are sorted in the same way and use the 
    same reference otherwise behaviour is undefined.

    Args:
        vcf_record_sort_key: function that takes a VCF record and returns a 
            tuple that can be used as a key for comparing and sorting VCF 
            records across all readers. This tuple defines what it means for two 
            variants to be equal (eg. whether it's only their position or also 
            their allele values), and implicitly determines the chromosome 
            ordering since the tuple's 1st element is typically the chromosome 
            name (or calculated from it).
    Zvcf_record_sort_keyc                 S   s   | j | jfS N)ZCHROMZPOS)rr   r   r   <lambda>2       zwalk_together.<locals>.<lambda>Nr   c                 S   s   g | ]}|d k	qS r   r   )r   r   r   r   r   r   <   s     z!walk_together.<locals>.<listcomp>c                 3   s&   | ]\}}|d k	r| |fV  qd S r   r   )r   r   r   )get_keyr   r   	<genexpr>=   s     z walk_together.<locals>.<genexpr>c                    s    g | ]}|d   d  kr|qS )r   r   )r   kmin_kr   r   r   ?   s     c                    s   g | ]\}}| kr|qS r   r   )r   r   r"   r#   r   r   r   G   s      c                    s    g | ]}| kr| nd qS r   r   r   )
min_k_idxsnextsr   r   r   H   s     )appendnextStopIterationanydict	enumeratelistvaluesminsetitemsrangelen)ZreaderskwargsreaderZnext_idx_to_kZkeys_with_prev_contigr   r   )r    r$   r%   r&   r   walk_together   s6    



r6   c                     s   | sg S dd | D }t |}t|}t|dk r6| S t|dd D ]:\ }||  krF dkrj|   S  fdd| D   S qF fdd| D S )	a_  
    Trim a list of sequences by removing the longest common suffix while
    leaving all of them at least one character in length.

    Standard convention with VCF is to place an indel at the left-most
    position, but some tools add additional context to the right of the
    sequences (e.g. samtools). These common suffixes are undesirable when
    comparing variants, for example in variant databases.

        >>> trim_common_suffix('TATATATA', 'TATATA')
        ['TAT', 'T']

        >>> trim_common_suffix('ACCCCC', 'ACCCCCCCC', 'ACCCCCCC', 'ACCCCCCCCC')
        ['A', 'ACCC', 'ACC', 'ACCCC']

    c                 S   s   g | ]}|d d d qS )Nr   r   seqr   r   r   r   d   s     z&trim_common_suffix.<locals>.<listcomp>r   Nr7   r   c                    s   g | ]}|d    qS r   r   r8   r   r   r   r   m   s     c                    s   g | ]}|d  d   qS )Nr   r   r8   r:   r   r   r   n   s     )r/   maxr3   r,   )	sequencesZreversesZrev_minZrev_maxcr   r:   r   trim_common_suffixQ   s    r>   )__doc__Zbgzfr   r   r   r6   r>   r   r   r   r   <module>   s
   4