U
    7gbF_                     @   s   d dl mZmZ d dlZd dlZd dlZzd dlmZ W n  ek
rX   d dlmZ Y nX e	dZ
G dd deZG dd deZG d	d
 d
eedZG dd deZG dd deZG dd deZG dd deZdd ZdS )    )ABCMetaabstractmethodN)Counterz[|/]c                   @   s   e Zd ZdZdddddddgZd	d
 Zdd Zdd Zdd Zdd Z	dd Z
edd Zedd Zedd Zdd Zedd Zedd  Zed!d" Zd#S )$_Callz, A genotype call, a cell entry in a VCF filesitesampledatagt_nums
gt_allelescalledploidityc                 C   s   || _ || _|| _t| jdd d k	rvdd t| jjD | _t| j| _	t
dd | jD | _| jrn| jjnd | _nd | _d | _	d | _d | _d S )NGTc                 S   s   g | ]}|d kr|ndqS ).N .0Zalr   r   (lib/python3.8/site-packages/vcf/model.py
<listcomp>   s     z"_Call.__init__.<locals>.<listcomp>c                 s   s   | ]}|d k	V  qd S Nr   r   r   r   r   	<genexpr>   s     z!_Call.__init__.<locals>.<genexpr>)r   r   r   getattrallele_delimitersplitr   r
   lenr   anyr   r	   )selfr   r   r   r   r   r   __init__   s    z_Call.__init__c                 C   s   d| j t| jf S )NzCall(sample=%s, %s))r   strr   r   r   r   r   __repr__&   s    z_Call.__repr__c                 C   s6   | j t|ddko4| jt|ddko4| jt|ddkS )zt Two _Calls are equal if their _Records are equal
            and the samples and ``gt_type``s are the same
        r   Nr   gt_type)r   r   r   r    r   otherr   r   r   __eq__)   s
    z_Call.__eq__c                    s   t  fdd jD S )Nc                 3   s   | ]}|t  |fV  qd S r   )r   )r   attrr   r   r   r   2   s     z%_Call.__getstate__.<locals>.<genexpr>)dict	__slots__r   r   r   r   __getstate__1   s    z_Call.__getstate__c                 C   s"   | j D ]}t| ||| qd S r   )r&   setattrget)r   stater$   r   r   r   __setstate__4   s    
z_Call.__setstate__c                 C   s   | j s
dS dS )N/|)phasedr   r   r   r   gt_phase_char8   s    z_Call.gt_phase_charc                    sJ    j rBz    fdd jD W S    tjd Y qFX ndS dS )zXThe actual genotype alleles.
           E.g. if VCF genotype is 0/1, return A/G
        c                 3   s.   | ]&}t |d k	r  jjt| ndV  qd S )Nr   )r   r   allelesintr   Xr   r   r   r   D   s     z!_Call.gt_bases.<locals>.<genexpr>z+Allele number not found in list of alleles
N)r   r/   joinr
   sysstderrwriter   r   r   r   gt_bases;   s     z_Call.gt_basesc                    sN   | j rF| j t fdd dd D r@ d dkr:dS dS qJdS ndS dS )zThe type of genotype.
           hom_ref  = 0
           het      = 1
           hom_alt  = 2  (we don;t track _which+ ALT)
           uncalled = None
        c                 3   s   | ]}| d  kV  qdS )r   Nr   r2   r0   r   r   r   U   s     z _Call.gt_type.<locals>.<genexpr>   Nr   0   )r   r
   allr   r   r9   r   r    J   s    	z_Call.gt_typec                 C   s   | j dk	o| j ddkS )z^A boolean indicating whether or not
           the genotype is phased for this sample
        Nr-   r   )r	   findr   r   r   r   r.   _   s    z_Call.phasedc                 C   s   t | j|S )z' Lookup value, backwards compatibility )r   r   )r   keyr   r   r   __getitem__f   s    z_Call.__getitem__c                 C   s   | j s
dS | jdkS )z% Return True if not a reference call Nr   r   r    r   r   r   r   
is_variantj   s    z_Call.is_variantc                 C   s   | j s
dS | jdkS )z$ Return True for heterozygous calls Nr:   rA   r   r   r   r   is_hetq   s    z_Call.is_hetc                 C   sD   z| j j}W n tk
r"   Y dS X |dks8t|dkr<dS dS dS )z  Return True for filtered calls FNr   T)r   ZFTAttributeErrorr   r   Zfiltr   r   r   is_filteredx   s    z_Call.is_filteredN)__name__
__module____qualname____doc__r&   r   r   r#   r'   r+   r/   propertyr8   r    r.   r@   rB   rC   rF   r   r   r   r   r      s*   




r   c                   @   s  e Zd ZdZdRddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd ZdSddZd d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Z ed<d= Z!ed>d? Z"ed@dA Z#edBdC Z$edDdE Z%edFdG Z&edHdI Z'edJdK Z(edLdM Z)edNdO Z*edPdQ Z+dS )T_Recordah   A set of calls at a site.  Equivalent to a row in a VCF file.

        The standard VCF fields CHROM, POS, ID, REF, ALT, QUAL, FILTER,
        INFO and FORMAT are available as properties.

        The list of genotype calls is in the ``samples`` property.

        Regarding the coordinates associated with each instance:

        - ``POS``, per VCF specification, is the one-based index
          (the first base of the contig has an index of 1) of the first
          base of the ``REF`` sequence.
        - The ``start`` and ``end`` denote the coordinates of the entire
          ``REF`` sequence in the zero-based, half-open coordinate
          system (see
          http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms),
          where the first base of the contig has an index of 0, and the
          interval runs up to, but does not include, the base at the
          ``end`` index. This indexing scheme is analagous to Python
          slice notation.
        - The ``affected_start`` and ``affected_end`` coordinates are
          also in the zero-based, half-open coordinate system. These
          coordinates indicate the precise region of the reference
          genome actually affected by the events denoted in ``ALT``
          (i.e., the minimum ``affected_start`` and maximum
          ``affected_end``).

          - For SNPs and structural variants, the affected region
            includes all bases of ``REF``, including the first base
            (i.e., ``affected_start = start = POS - 1``).
          - For deletions, the region includes all bases of ``REF``
            except the first base, which flanks upstream the actual
            deletion event, per VCF specification.
          - For insertions, the ``affected_start`` and ``affected_end``
            coordinates represent a 0 bp-length region between the two
            flanking bases (i.e., ``affected_start`` =
            ``affected_end``). This is analagous to Python slice
            notation (see http://stackoverflow.com/a/2947881/38140).
            Neither the upstream nor downstream flanking bases are
            included in the region.
    Nc                 C   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _| jd | _	| j	t
| j | _| jg| _| j| j |prg | _|
| _d | _d | _|   d S Nr:   )CHROMPOSIDREFALTQUALFILTERINFOFORMATstartr   endr0   extendsamples_sample_indexesaffected_startaffected_end_set_start_and_end)r   rN   rO   rP   rQ   rR   rS   rT   rU   rV   Zsample_indexesrZ   r   r   r   r      s$    

z_Record.__init__c                 C   s   | j  | _| _| jD ]r}|d kr.|  \}}n<|jdkrF|  \}}n$|jdkr^|  \}}n|  \}}t	| j|| _t
| j|| _qd S )NSNVMNV)rO   r\   r]   rR   !_compute_coordinates_for_none_alttype_compute_coordinates_for_snp_compute_coordinates_for_indel_compute_coordinates_for_svminmax)r   altrW   rX   r   r   r   r^      s    


z_Record._set_start_and_endc                 C   s    | j d }|t| j }||fS rM   rO   r   rQ   r   rW   rX   r   r   r   ra      s    
z)_Record._compute_coordinates_for_none_altc                 C   s@   t | jdkr(| j}|t | jd  }n| jd }| j}||fS rM   r   rQ   rO   rj   r   r   r   rc      s    
z$_Record._compute_coordinates_for_snpc                 C   s:   t | jdkr(| j}|t | jd  }n
| j }}||fS rM   rk   rj   r   r   r   rd      s
    
z&_Record._compute_coordinates_for_indelc                 C   s    | j d }|t| j }||fS rM   ri   rj   r   r   r   re      s    
z#_Record._compute_coordinates_for_svc                 C   s&   t | j| jft|dd t|dd fS NrN   rO   )ZcmprN   rO   r   r!   r   r   r   __cmp__   s    z_Record.__cmp__c                 C   sH   | j t|ddkoF| jt|ddkoF| jt|ddkoF| jt|ddkS )zO _Records are equal if they describe the same variant (same position, alleles) rN   NrO   rQ   rR   )rN   r   rO   rQ   rR   r!   r   r   r   r#     s    z_Record.__eq__c                 C   s$   | j | jft|dd t|dd fk S rl   )rN   rO   r   r!   r   r   r   __lt__
  s    z_Record.__lt__c                 C   s
   t | jS r   )iterrZ   r   r   r   r   __iter__  s    z_Record.__iter__c                 C   s
   d| j  S )Nz>Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s))__dict__r   r   r   r   __str__  s    z_Record.__str__c                 C   s   | j d | | _ d S )N:)rV   )r   Zfmtr   r   r   
add_format  s    z_Record.add_formatc                 C   s$   | j d kr|g| _ n| j | d S r   )rT   append)r   Zfltr   r   r   
add_filter  s    

z_Record.add_filterTc                 C   s   || j |< d S r   )rU   )r   infovaluer   r   r   add_info  s    z_Record.add_infoc                 C   s   | j | j|  S )z5 Lookup a ``_Call`` for the sample given in ``name`` )rZ   r[   )r   namer   r   r   genotype  s    z_Record.genotypec                 C   s   t dd | jD S )z The number of called samplesc                 s   s   | ]}|j rd V  qdS )r:   N)r   r   sr   r   r   r   &  s      z%_Record.num_called.<locals>.<genexpr>)sumrZ   r   r   r   r   
num_called#  s    z_Record.num_calledc                 C   s   t | jt t| j S )z6 The fraction of genotypes that were actually called. )floatr   r   rZ   r   r   r   r   	call_rate(  s    z_Record.call_ratec                 C   s   t dd | jD S )z2 The number of homozygous for ref allele genotypesc                 S   s   g | ]}|j d kr|qS r   r    r|   r   r   r   r   0  s     
 z'_Record.num_hom_ref.<locals>.<listcomp>r   rZ   r   r   r   r   num_hom_ref-  s    z_Record.num_hom_refc                 C   s   t dd | jD S )z2 The number of homozygous for alt allele genotypesc                 S   s   g | ]}|j d kr|qS r<   r   r|   r   r   r   r   5  s     
 z'_Record.num_hom_alt.<locals>.<listcomp>r   r   r   r   r   num_hom_alt2  s    z_Record.num_hom_altc                 C   s   t dd | jD S )z% The number of heterozygous genotypesc                 S   s   g | ]}|j d kr|qS r:   r   r|   r   r   r   r   :  s     
 z#_Record.num_het.<locals>.<listcomp>r   r   r   r   r   num_het7  s    z_Record.num_hetc                 C   s   t dd | jD S )z  The number of unknown genotypesc                 S   s   g | ]}|j d kr|qS r   r   r|   r   r   r   r   ?  s     
 z'_Record.num_unknown.<locals>.<listcomp>r   r   r   r   r   num_unknown<  s    z_Record.num_unknownc                    sd   dt   | jD ].}|jdk	r|jD ]} |g d7 q$q fddtdt| jd D S )zz A list of allele frequencies of alternate alleles.
           NOTE: Denominator calc'ed from _called_ genotypes.
        g        Nr:   c                    s   g | ]} t |  qS r   r   )r   iZallele_counts
num_chromsr   r   r   M  s     z_Record.aaf.<locals>.<listcomp>)r   rZ   r    r
   updateranger   rR   )r   r}   ar   r   r   aafA  s    


z_Record.aafc                 C   sN   t | jdkrdS | jd }d| }td| j }t||d  d| |  S )ak  
        pi_hat (estimation of nucleotide diversity) for the site.
        This metric can be summed across multiple sites to compute regional
        nucleotide diversity estimates.  For example, pi_hat for all variants
        in a given gene.

        Derived from:
        "Population Genetics: A Concise Guide, 2nd ed., p.45"
        John Gillespie.
        r:   Nr   g      ?g       @)r   rR   r   r   r   )r   pqr   r   r   r   nucl_diversityO  s    
z_Record.nucl_diversityc                 C   s,   dt | j g| j }dt dd |D  S )a6  
        Heterozygosity of a site. Heterozygosity gives the probability that
        two randomly chosen chromosomes from the population have different
        alleles, giving a measure of the degree of polymorphism in a population.

        If there are i alleles with frequency p_i, H=1-sum_i(p_i^2)
        r:   c                 S   s   g | ]}|d  qS r   r   )r   xr   r   r   r   m  s     z*_Record.heterozygosity.<locals>.<listcomp>)r~   r   )r   Zallele_freqsr   r   r   heterozygosityc  s    	z_Record.heterozygosityc                 C   s   dd | j D S )z The list of hom ref genotypesc                 S   s   g | ]}|j d kr|qS r   r   r|   r   r   r   r   q  s     
 z(_Record.get_hom_refs.<locals>.<listcomp>rZ   r   r   r   r   get_hom_refso  s    z_Record.get_hom_refsc                 C   s   dd | j D S )z The list of hom alt genotypesc                 S   s   g | ]}|j d kr|qS r   r   r|   r   r   r   r   u  s     
 z(_Record.get_hom_alts.<locals>.<listcomp>r   r   r   r   r   get_hom_altss  s    z_Record.get_hom_altsc                 C   s   dd | j D S )z The list of het genotypesc                 S   s   g | ]}|j d kr|qS r   r   r|   r   r   r   r   y  s     
 z$_Record.get_hets.<locals>.<listcomp>r   r   r   r   r   get_hetsw  s    z_Record.get_hetsc                 C   s   dd | j D S )z The list of unknown genotypesc                 S   s   g | ]}|j d kr|qS r   r   r|   r   r   r   r   }  s     
 z(_Record.get_unknowns.<locals>.<listcomp>r   r   r   r   r   get_unknowns{  s    z_Record.get_unknownsc                 C   sH   t | jdkrdS | jD ]*}|dks.|jdkr4 dS |dkr dS qdS )z, Return whether or not the variant is a SNP r:   FNr_   )ACGTN*T)r   rQ   rR   rb   )r   rh   r   r   r   is_snp  s    
z_Record.is_snpc                 C   sv   | j }t| jdkr|sdS | jD ]N}|dkr4 dS |jdkrN|jdkrN dS t|t| jkr"|sj dS  dS q"dS )z/ Return whether or not the variant is an INDEL r:   TNFr_   r`   )is_svr   rQ   rR   rb   )r   r   rh   r   r   r   is_indel  s    
z_Record.is_indelc                 C   s   | j ddkrdS dS )z; Return whether or not the variant is a structural variant SVTYPENFT)rU   r)   r   r   r   r   r     s    z_Record.is_svc                 C   s|   t | jdkrdS | jrt| jd }| jdkr4|dksj| jdkrF|dksj| jdkrX|dksj| jdkrn|dkrndS dS ndS d	S )
z/ Return whether or not the SNP is a transition r:   Fr   r   r   r   r   TN)r   rR   r   rQ   r   Z
alt_alleler   r   r   is_transition  s(    
z_Record.is_transitionc                 C   sR   t | jdkrdS | jrJ| jd }|dkr.dS t | jt |krDdS dS ndS dS )z/ Return whether or not the INDEL is a deletion r:   Fr   NT)r   rR   r   rQ   r   r   r   r   is_deletion  s    
z_Record.is_deletionc                 C   s&   | j r
dS | jrdS | jrdS dS dS )z]
        Return the type of variant [snp, indel, unknown]
        TO DO: support SVs
        ZsnpZindelsvunknownN)r   r   r   r   r   r   r   var_type  s    z_Record.var_typec                 C   s   | j r(| jrdS t| jdkr"dS dS nb| jrP| jr8dS t| jdkrJdS dS n:| jr| jd dkrhd	S | jrx| jd S | jd
 j	S ndS dS )a  
        Return the subtype of variant.

        - For SNPs and INDELs, yeild one of: [ts, tv, ins, del]
        - For SVs yield either "complex" or the SV type defined in the ALT
          fields (removing the brackets). E.g.::

              <DEL>       -> DEL
              <INS:ME:L1> -> INS:ME:L1
              <DUP>       -> DUP

        The logic is meant to follow the rules outlined in the following
        paragraph at:

        http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41

        "For precisely known variants, the REF and ALT fields should contain
        the full sequences for the alleles, following the usual VCF conventions.
        For imprecise variants, the REF field may contain a single base and the
        ALT fields should contain symbolic alleles (e.g. <ID>), described in more
        detail below. Imprecise variants should also be marked by the presence
        of an IMPRECISE flag in the INFO field."
        tsr:   Ztvr   delZinsr   BNDcomplexr   N)
r   r   r   rR   r   r   r   rU   is_sv_preciserb   r   r   r   r   var_subtype  s&    
z_Record.var_subtypec                 C   s   | j r| jd S dS )z$ Return the end position for the SV ZENDN)r   rU   r   r   r   r   sv_end  s    
z_Record.sv_endc                 C   sR   | j ddkr| jsdS | j ddk	r4| jr4dS | j ddkrN| jrNdS dS )zW Return whether the SV cordinates are mapped
            to 1 b.p. resolution.
        Z	IMPRECISENFT)rU   r)   r   r   r   r   r   r     s    z_Record.is_sv_precisec                 C   s   t | jdko| jd dkS )z! Return True for reference calls r:   r   N)r   rR   r   r   r   r   is_monomorphic"  s    z_Record.is_monomorphicc                 C   s&   | j }|dkst|dkrdS dS dS )z, Return True if a variant has been filtered Nr   FT)rT   r   rE   r   r   r   rF   '  s    z_Record.is_filtered)N)T),rG   rH   rI   rJ   r   r^   ra   rc   rd   re   rm   r#   rn   rp   rr   rt   rv   ry   r{   rK   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rF   r   r   r   r   rL      sz   * 
 
	
















0


rL   c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )
_AltRecordzTAn alternative allele record: either replacement string, SV placeholder, or breakendc                    s   t t| jf | || _d S r   )superr   r   rb   r   rb   kwargs	__class__r   r   r   4  s    z_AltRecord.__init__c                 C   s   t d S r   )NotImplementedErrorr   r   r   r   rr   9  s    z_AltRecord.__str__c                 C   s   | j t|dd kS )Nrb   )rb   r   r!   r   r   r   r#   =  s    z_AltRecord.__eq__)	rG   rH   rI   rJ   r   r   rr   r#   __classcell__r   r   r   r   r   1  s
   
r   )	metaclassc                       sD   e Zd ZdZ fddZdd Zdd Zdd	 Z fd
dZ  Z	S )_SubstitutionzGA basic ALT record, where a REF sequence is replaced by an ALT sequencec                    sP   t |dkr(tt| jf ddi| ntt| jf ddi| t|| _d S )Nr:   rb   r_   r`   )r   r   r   r   r   sequence)r   Znucleotidesr   r   r   r   r   D  s    z_Substitution.__init__c                 C   s   | j S r   )r   r   r   r   r   rr   L  s    z_Substitution.__str__c                 C   s   t | S r   r   r   r   r   r   r   O  s    z_Substitution.__repr__c                 C   s
   t | jS r   )r   r   r   r   r   r   __len__R  s    z_Substitution.__len__c                    s@   t |tr| j|kS t || js$dS tt| |o>| j|jkS )NF)
isinstancer   r   r   r   r   r#   r!   r   r   r   r#   U  s
    

z_Substitution.__eq__)
rG   rH   rI   rJ   r   rr   r   r   r#   r   r   r   r   r   r   A  s   r   c                       s<   e Zd ZdZ fddZdd Zdd Z fdd	Z  ZS )
	_BreakendzDA breakend which is paired to a remote location on or off the genomec                    sj   t t| jf ddi| |d k	r.t|| _nd | _|d k	rHt|| _nd | _|| _|| _|| _	|| _
d S )Nrb   r   )r   r   r   r   chrr1   posremoteOrientationwithinMainAssemblyorientationconnectingSequence)r   r   r   r   r   r   r   r   r   r   r   r   `  s    z_Breakend.__init__c                 C   s   t | S r   r   r   r   r   r   r   u  s    z_Breakend.__repr__c                 C   s   | j d krd}nX| jr| j }nd| j  d }| jrNd| d t| j d }nd| d t| j d }| jrx|| j S | j| S d S )Nr   <>[rs   ])r   r   r   r   r   r   r   )r   Z	remoteTagZ	remoteChrr   r   r   rr   x  s    

z_Breakend.__str__c                    s   t || jsdS tt| |o| jt|dd ko| jt|dd ko| jt|dd ko| j	t|dd ko| j
t|dd ko| jt|dd kS )NFr   r   r   r   r   r   )r   r   r   r   r#   r   r   r   r   r   r   r   r!   r   r   r   r#     s    z_Breakend.__eq__)	rG   rH   rI   rJ   r   r   rr   r#   r   r   r   r   r   r   ]  s
   r   c                       s    e Zd ZdZ fddZ  ZS )_SingleBreakendzA single breakendc                    s"   t t| jd d |d |d f| d S r   )r   r   r   )r   r   r   r   r   r   r   r     s    z_SingleBreakend.__init__)rG   rH   rI   rJ   r   r   r   r   r   r   r     s   r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )_SVzAn SV placeholderc                    s   t t| j|f| d S r   )r   r   r   r   r   r   r   r     s    z_SV.__init__c                 C   s   d| j  d S )Nr   r   )rb   r   r   r   r   rr     s    z_SV.__str__c                 C   s   t | S r   r   r   r   r   r   r     s    z_SV.__repr__)rG   rH   rI   rJ   r   rr   r   r   r   r   r   r   r     s   r   c                    s"   G  fdddt d  S )z- Return a namedtuple for a given call format c                       s4   e Zd ZdZg Zg Zdd Z fddZ  ZS )z%make_calldata_tuple.<locals>.CallDatar   c                 S   s(   d dd t| j| D }d| d S )Nz, c                 S   s   g | ]\}}d ||f qS )z%s=%sr   )r   r   yr   r   r   r     s   zAmake_calldata_tuple.<locals>.CallData.__str__.<locals>.<listcomp>z	CallData())r4   zip_fields)r   Zdatr   r   r   rr     s    

z-make_calldata_tuple.<locals>.CallData.__str__c                    s   t  |  }tffS r   )r   
__reduce__make_calldata_tuple)r   args)CallDatar   fieldsr   r   r     s    z0make_calldata_tuple.<locals>.CallData.__reduce__)	rG   rH   rI   r&   Z_typesZ_numsrr   r   r   r   r   r   r   r   r     s
   r   Zcalldata)collections
namedtuple)r   r   r   r   r     s    r   )abcr   r   r   r5   rer   ImportErrorZcountercompiler   objectr   rL   r   r   r   r   r   r   r   r   r   r   <module>   s&   
x   /9