U
    7gbu                  *   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZddlm	Z	 zd dlm
Z
 W n  ek
r|   d dlm
Z
 Y nX zd dlZW n ek
r   dZY nX zd dlZW n ek
r   dZY nX ddlmZmZmZ ddlmZmZmZmZ ddd	dd	dd
ddd
d
d	dddd
d
d
d
d
dddddddddddddddddddddd)Zdddd	ddd	dddddddd	d	ddddZd ZdZdZdZdd Zdd e D Zdd e D Z dddgZ!dddddZ"e#dddddd d!d"gZ$e#d#ddgZ%e#d$ddgZ&e#d%ddddd"gZ'e#d&d'd(d)d*gZ(e#d+dd,gZ)G d-d. d.e*Z+G d/d0 d0e*Z,G d1d2 d2e*Z-d3d4 Z.e,Z/e-Z0dS )5    N   )get_uncompressed_size)OrderedDict)_Call_Recordmake_calldata_tuple)_Substitution	_Breakend_SingleBreakend_SVStringIntegerFloatFlag))ZAAZACZAFZANZBQZCIGARZDBDPZENDZH2ZH3MQZMQ0ZNSZSBZSOMATICZ	VALIDATEDZ1000GZ	IMPRECISEZNOVELZSVTYPEZSVLENZCIPOSZCIENDZHOMLENZHOMSEQZBKPTIDZMEINFOZMETRANSZDGVIDZDBVARIDZDBRIPIDZMATEIDZPARIDZEVENTZCILENZDPADJCNZCNADJZCICNZCICNADJ)GTr   FTZGLZGLEZPLZGPZGQZHQZPSZPQZECr   r   ZCNQZCNLZNQZHAPZAHAP      c                 C   s   t tttttd|  S )N)r   r   Z	Characterr   ZNumericr   )INTEGERSTRINGFLOATFLAG)Z
field_type r   )lib/python3.8/site-packages/vcf/parser.py_encode_typej   s    r   c                 C   s   i | ]\}}|t |qS r   r   .0kvr   r   r   
<dictcomp>u   s      r#   c                 C   s   i | ]\}}|t |qS r   r   r   r   r   r   r#   v   s      Z
fileformatZfileDateZ	reference).AGRZInfoidnumtypedescsourceversion	type_codeZFilterZAltZFormatZ
SampleInfosamplesZgt_basesZgt_typesZ	gt_phasesZContiglengthc                       s`   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Z  ZS )_vcf_metadata_parserz/Parse the metadata in the header of a VCF file.c                    sn   t t|   tdtj| _tdtj| _tdtj| _tdtj| _	tdtj| _
td| _d S )Na=  \#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AGR])?,\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            (?:,\s*Source="(?P<source>[^"]*)")?
            (?:,\s*Version="?(?P<version>[^"]*)"?)?
            >ze\#\#FILTER=<
            ID=(?P<id>[^,]+),\s*
            Description="(?P<desc>[^"]*)"
            >zb\#\#ALT=<
            ID=(?P<id>[^,]+),\s*
            Description="(?P<desc>[^"]*)"
            >z\#\#FORMAT=<
            ID=(?P<id>.+),\s*
            Number=(?P<number>-?\d+|\.|[AGR]),\s*
            Type=(?P<type>.+),\s*
            Description="(?P<desc>.*)"
            >zr\#\#contig=<
            ID=(?P<id>[^>,]+)
            (,.*length=(?P<length>-?\d+))?
            .*
            >z##(?P<key>.+?)=(?P<val>.+))superr4   __init__recompileVERBOSEinfo_patternfilter_patternalt_patternformat_patterncontig_patternmeta_patternself	__class__r   r   r6      s,    	z_vcf_metadata_parser.__init__c                 C   s(   |dkrdS |t krt|S t | S dS )z*Cast vcf header numbers to integer or NoneN)field_countsintrA   Znum_strr   r   r   vcf_field_count   s
    z$_vcf_metadata_parser.vcf_field_countc                 C   sv   | j |}|std| | |d}t|d||d|d|d|dt|d}|d|fS )z"Read a meta-information INFO line.z&One of the INFO lines is malformed: %snumberr+   r-   r.   r/   r0   )r:   matchSyntaxErrorrG   group_Infor   )rA   Zinfo_stringrI   r,   infor   r   r   	read_info   s    
z_vcf_metadata_parser.read_infoc                 C   s@   | j |}|std| t|d|d}|d|fS )z$Read a meta-information FILTER line.z(One of the FILTER lines is malformed: %sr+   r.   )r;   rI   rJ   _FilterrK   )rA   Zfilter_stringrI   filtr   r   r   read_filter   s    z _vcf_metadata_parser.read_filterc                 C   s@   | j |}|std| t|d|d}|d|fS )z Read a meta-information ALTline.z%One of the ALT lines is malformed: %sr+   r.   )r<   rI   rJ   _AltrK   )rA   Z
alt_stringrI   altr   r   r   read_alt   s
    z_vcf_metadata_parser.read_altc              	   C   sf   | j |}|std| | |d}t|d||d|dt|d}|d|fS )z$Read a meta-information FORMAT line.z(One of the FORMAT lines is malformed: %srH   r+   r-   r.   )r=   rI   rJ   rG   rK   _Formatr   )rA   format_stringrI   r,   Zformr   r   r   read_format   s    z _vcf_metadata_parser.read_formatc                 C   sJ   | j |}|std| | |d}t|d|}|d|fS )z$Read a meta-contigrmation INFO line.z(One of the contig lines is malformed: %sr3   r+   )r>   rI   rJ   rG   rK   _Contig)rA   Zcontig_stringrI   r3   Zcontigr   r   r   read_contig
  s    z _vcf_metadata_parser.read_contigc           	      C   s   | dd}|d d}t }d}d}d}|d dD ]}|dkr^|dkrTd}q||7 }q:|dkr|dkr|dkr||7 }d}q|d	kr|||< d}d}d}q||7 }q:|dkr:|dkr||7 }d}q:||7 }q:|dkr|||< ||fS )
N=r   r   # z[<>]"r   ,)splitlstripr   strip)	rA   meta_stringitemskeyvalstater!   r"   cr   r   r   read_meta_hash  s:    


z#_vcf_metadata_parser.read_meta_hashc                 C   sH   t d|r| |S | j|}|s4|ddfS |d|dfS )Nz##.+=<r[   nonerd   re   )r7   rI   rh   r?   r`   rK   )rA   rb   rI   r   r   r   	read_meta;  s    
z_vcf_metadata_parser.read_meta)__name__
__module____qualname____doc__r6   rG   rN   rQ   rT   rW   rY   rh   rj   __classcell__r   r   rB   r   r4      s   .

&r4   c                       s   e Zd ZdZd" fdd	Zdd Zd	d
 Zdd Zdd ZdddgfddZ	dd Z
dd Zdd Zdd Zdd Zdd Zd#d d!Z  ZS )$ReaderzFReader for a VCF v 4.0 file, an iterator returning ``_Record objects``NFasciic                    s^  t t|   |s|std|rT|| _|dkr~t|dr~|j}|dkr~|d}n*|r~|dkrj|d}t||rvdnd| _|| _	d| _
d| _|rtj| jd| _tjd	krt|| j| _|rd
| _nd| _t| j| _td| _dd | jD | _d| _d| _d| _d| _d| _d| _d| _d| _g | _ g | _!d| _"|| _#| $  i | _%|| _&dS )a  Create a new Reader for a VCF file.

        You must specify either fsock (stream) or filename.  Gzipped streams
        or files are attempted to be recogized by the file extension, or gzipped
        can be forced with ``compressed=True``

        'prepend_chr=True' will put 'chr' before all the CHROM values, useful
        for different sources.

        'strict_whitespace=True' will split records on tabs only (as with VCF
        spec) which allows you to parse files with spaces in the sample names.
        z+You must provide at least fsock or filenameNnamez.gzrbZrtr   )Zfileobj3	z	| +z[\[\]]c                 s   s   | ]}|  r|  V  qd S N)ra   )r    liner   r   r   	<genexpr>~  s      z"Reader.__init__.<locals>.<genexpr>)'r5   rp   r6   	ExceptionZ_readerhasattrrr   endswithopenfilename_total_bytes_read_bytesgzipZGzipFilesysr0   codecs	getreaderZ
_separatorr7   r8   _row_pattern_alt_patternreadermetadatainfosfiltersaltsformatscontigsr2   _sample_indexes_header_lines_column_headers_tabix_prepend_chr_parse_metainfo_format_cacheencoding)rA   Zfsockr}   
compressedZprepend_chrZstrict_whitespacer   rB   r   r   r6   J  sR    

zReader.__init__c                 C   s
   d| _ | S )Nr   r   r@   r   r   r   __iter__  s    zReader.__iter__c                 C   s   | j S )zFReturn read bytes from uncompress data. Usefull to have a progress barr   r@   r   r   r   
read_bytes  s    zReader.read_bytesc                 C   s   | j dkrt| j| _ | j S )zGReturn total bytes from uncompress data. Usefull to have a progress barr   )r~   r   r}   r@   r   r   r   total_bytes  s    
zReader.total_bytesc                 C   s  dD ]}t | |t  qt }t| j}|drJ| j| |drd||\}}|| j	|< n|dr|
|\}}|| j|< n|dr||\}}|| j|< n|dr||\}}|| j|< nn|dr||\}}|| j|< nJ||\}}|tkr|| j|< n&|| jkr.g | j|< | j| | t| j}q(| j|dd	 }|d	d
 | _|d
d	 | _tdd t| jD | _d	S )zParse the information stored in the metainfo of the VCF.

        The end user shouldn't have to use this.  She can access the metainfo
        directly with ``self.metadata``.)r   r   r   r   r   r   z##z##INFOz##FILTERz##ALTz##FORMATz##contigr   N	   c                 S   s   g | ]\}}||fqS r   r   )r    ixr   r   r   
<listcomp>  s     z*Reader._parse_metainfo.<locals>.<listcomp>)setattrr   r4   nextr   
startswithr   appendrN   r   rQ   r   rT   r   rW   r   rY   r   rj   SINGULAR_METADATAr   r   r_   r   r2   dict	enumerater   )rA   attrparserrw   rd   re   Zfieldsr   r   r   r     s@    







zReader._parse_metainfor'   r\   ZNAc                    s    fdd|D S )z"``map``, but make bad values None.c                    s    g | ]}| kr|nd qS rv   r   r    r   badfuncr   r   r     s     zReader._map.<locals>.<listcomp>r   )rA   r   iterabler   r   r   r   _map  s    zReader._mapc                 C   s&   |dkrdS |dkrg S | dS dS )zParse the FILTER field of a VCF entry into a Python list

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        r'   NPASS;)r_   )rA   Zfilt_strr   r   r   _parse_filter  s
    zReader._parse_filterc           	      C   s  |dkri S | d}i }|D ]}| dd}|d }z| j| j}W nL tk
r   zt| }W n* tk
r   |dd rt}nt}Y nX Y nX |tkr|d  d}z| t	|}W n  t
k
r   | t|}Y nX n~|tkr|d  d}| t|}nX|tkrd}nH|tkr^z|d  d}| t|}W n tk
r\   t}d}Y nX z(| j| jdkr|tkr|d }W n tk
r   Y nX |||< q|S )	zXParse the INFO field of a VCF entry into a dictionary of Python
        types.

        r'   r   rZ   r   r   Nr^   T)r_   r   r1   KeyErrorRESERVED_INFO_CODESr   r   r   r   rE   
ValueErrorfloatr   str
IndexErrorr,   )	rA   Zinfo_strentriesZretdictentryID
entry_typevalsre   r   r   r   _parse_info  sV    






zReader._parse_infoc                 C   s   t |d}|jD ]x}z| j| j}| j| j}W n> tk
rr   d}zt| }W n tk
rl   t}Y nX Y nX |j	
| |j
| q|S )z-Parse the format of the calls in this _Record:N)r   r_   _fieldsr   r1   r,   r   RESERVED_FORMAT_CODESr   _typesr   _nums)rA   samp_fmtfmtr   	entry_numr   r   r   _parse_sample_format$  s    
zReader._parse_sample_formatc              
   C   s  || j kr| || j |< | j | }trBt| j|||j|j|S g }| j}t|j	}t
| j|D ]\}}dg| }	t|dD ]H\}
}|j	|
 dkr||	|
< qn6|j	|
 dkr| ||	|
< qn|r|dkrd|	|
< q|j|
 }|j|
 }|dkr^|tkr<zt||	|
< W n" tk
r8   t||	|
< Y nX q|tkrTt||	|
< q||	|
< q|d}|tkrz|t||	|
< W n$ tk
r   |t||	|
< Y nX q|tkr|t||	|
< q||	|
< qt||||	 }|| qb|S )zParse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        Nr   r   r   r'   r   r^   )r   r   cparseZparse_samplesr2   r   r   r   lenr   zipr   r_   r   r   rE   r   r   r   r   r   )rA   r2   r   ZsiteZ	samp_datar   Znfieldsrr   sampleZsampdatr   r   r   r   Zcallr   r   r   _parse_samples6  sf    	

     










zReader._parse_samplesc           
      C   s<  | j |d k	r| j |}|d d}|d }|d dkrP|dd }d}nd}|d }|d dkpr|d d	k}td
|d k	}|r|d }	n|d }	t|||||	|S |d dkrt|dkrtd|dd  S |d dkrt|dkrtd|d d S |d dkr0|d dkr0t|dd S t|S d S )Nr   r   r   <r$   FT[]z\[r   r'   >)	r   searchr_   r7   r	   r   r
   r   r   )
rA   r   rc   ZremoteCoordschrZwithinMainAssemblyposZorientationZremoteOrientationZconnectingSequencer   r   r   
_parse_alt  s:    
zReader._parse_altc                 C   s  t | j}|  jt|7  _| j| }|d }| jrBd| }t|d }|d dkrd|d }nd}|d }| 	| j
|d d	}zt|d
 }W n> tk
r   zt|d
 }W n tk
r   d}Y nX Y nX | |d }	tdk	rt|d | jt}
n| |d }
z|d }W n tk
r:   d}Y nX |dkrJd}t|||||||	|
|| j
}|dk	r| |dd ||}||_|S )z#Return the next record in the file.r   r   r   r   r'   Nr      r^               r   )r   r   r   r   r   r_   rstripr   rE   r   r   r   r   r   r   Z
parse_infor   r   r   r   r   r   r   r2   )rA   rw   rowchromr   r   refrS   ZqualrP   rM   r   recordr2   r   r   r   __next__  sZ    




         
zReader.__next__c                 C   sl   t std| jstd| js4t j| j| jd| _| jrV|dd dkrV|dd }| j|||| _| S )aO  Fetches records from a tabix-indexed VCF file and returns an
        iterable of ``_Record`` instances

        chrom must be specified.

        The start and end coordinates are in the zero-based,
        half-open coordinate system, similar to ``_Record.start`` and
        ``_Record.end``. The very first base of a chromosome is
        index 0, and the the region includes bases up to, but not
        including the base at the end coordinate. For example
        ``fetch('4', 10, 20)`` would include all variants
        overlapping a 10 base pair region from the 11th base of
        through the 20th base (which is at index 19) of chromosome
        4. It would not include the 21st base (at index 20). See
        http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms
        for more information on the zero-based, half-open coordinate
        system.

        If end is omitted, all variants from start until the end of
        the chromosome chrom will be included.

        If start and end are omitted, all variants on chrom will be
        returned.

        requires pysam

        z-pysam not available, try "pip install pysam"?z/Please provide a filename (or a "normal" fsock))r   Nr   r   )	pysamry   r}   r   Z	Tabixfiler   r   fetchr   )rA   r   startendr   r   r   r     s    zReader.fetch)NNNFFrq   )NN)rk   rl   rm   rn   r6   r   r   r   r   r   r   r   r   r   r   r   r   ro   r   r   rB   r   rp   G  s(         L	2:K#2rp   c                   @   s   e Zd ZdZedd e D Zd"ddZdd Z	d	d
 Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd#ddZd$ddZd%dd Zd!S )&Writerz7VCF Writer. On Windows Python 2, open stream with 'wb'.c                 c   s   | ]\}}||fV  qd S rv   r   r   r   r   r   rx     s     zWriter.<genexpr>
c              	      s  t j|d|dt jd| _ | _|| _t fddttt	t
 j t | _d}d}| j} j D ]l\}}|tkr|g}|D ]P}	t|	trdd	d
 t|	 D }
|d||
 q|d||	 qqn j D ]"}||j|d||jd q j D ]$}||j|d||jd q j D ]}||j|ddi qD j D ]}||j|ddi ql j D ]0}|jr|dj|  n|dj|  q|   d S )Nru   r\   )Z	delimiterlineterminatorZ	quotecharZquotingc                      s
   t  jS rv   )r   r   r   templater   r   <lambda>      z!Writer.__init__.<locals>.<lambda>z###{key}=<ID={0},Description="{1}">
z9##{key}=<ID={0},Number={num},Type={2},Description="{3}">
r^   c                 s   s   | ]\}}d  ||V  qdS )z{0}={1}N)format)r    rd   valuer   r   r   rx   !  s    z"Writer.__init__.<locals>.<genexpr>z##{0}=<{1}>
z
##{0}={1}
INFO)rd   r,   FORMATrd   FILTERALTz##contig=<ID={0},length={1}>
z##contig=<ID={0}>
) csvwriterZ
QUOTE_NONEr   streamcollectionsdefaultdictr   listr   iterr   keys	itertoolscount
info_order_fix_field_countr   rc   r   
isinstancejoinwriter   valuesr,   r   r   r   r   r3   _write_header)rA   r   r   r   ZtwoZfourZ_numrd   r   re   r   rw   r   r   r   r6     sN    



 "zWriter.__init__c                 C   s*   | j dd| jj| jj  d  d S )Nr[   ru   r   )r   r   r   r   r   r2   r@   r   r   r   r   7  s    zWriter._write_headerc                    s    t j j j jg j jp,d	 j
 jg } jrX| j  fdd jD }j||  dS )zwrite a record to the filer'   c                    s   g | ]}  j|qS r   )_format_sampler   )r    r   r   rA   r   r   r   J  s    z'Writer.write_record.<locals>.<listcomp>N)r   r   ZCHROMZPOSr   ZREF_format_altr   ZQUAL_format_filterr   _format_infor   r   r   r2   r   Zwriterow)rA   r   Zffsr2   r   r   r   write_record?  s    


zWriter.write_recordc                 C   s(   z| j   W n tk
r"   Y nX dS )zFlush the writerN)r   flushAttributeErrorr@   r   r   r   r  O  s    zWriter.flushc                 C   s(   z| j   W n tk
r"   Y nX dS )zClose the writerN)r   closer  r@   r   r   r   r  V  s    zWriter.closec                 C   s   || j kr|S | j | S dS )z'Restore header number to original stateN)countsrF   r   r   r   r   ]  s    
zWriter._fix_field_countc                 C   s   d | t|S )Nr^   )r   r   r   )rA   rS   r   r   r   r  d  s    zWriter._format_altc                 C   s   |g krdS | j |dddS )Nr   r'   r   ri   delim)
_stringify)rA   Zfltr   r   r   r  g  s    zWriter._format_filterc                    sH   t rt  jS  sdS fdd}d fddt |dD S )Nr'   c                    s    j |  | fS rv   )r   )fieldr@   r   r   	order_keys  s    z&Writer._format_info.<locals>.order_keyr   c                 3   s   | ]} | | V  qd S rv   )_stringify_pair)r    frM   rA   r   r   rx   w  s    z&Writer._format_info.<locals>.<genexpr>)rd   )r   format_infor   r   sorted)rA   rM   r  r   r  r   r  l  s    
zWriter._format_infoc                 C   s   t rt ||S t|jdr&|jj}nd|kr2dnd}|r@|gng }|jjD ]D}t|j|}|dkrfqL|dkr|| | qL|| 	| qLd
|S )Nr   z./.r\   r   r   )r   Zformat_samplerz   datar   r   getattrr   r  r  r   )rA   r   r   gtresultr  r   r   r   r   r   {  s    
zWriter._format_sampler'   r^   c                 C   s8   t |t g kr$|| t||S |d k	r4t|S |S rv   )r-   r   r   r   )rA   r   ri   r
  r   r   r   r    s    zWriter._stringifyc                 C   s6   t |tr|rt|S dS dt|| j|||df S )Nr\   z%s=%sr	  )r   boolr   r  )rA   r   yri   r
  r   r   r   r    s    
zWriter._stringify_pairc                    s    fdd|D S )z#``map``, but make None values none.c                    s    g | ]}|d k	r |nqS rv   r   r   r   ri   r   r   r     s     zWriter._map.<locals>.<listcomp>r   )rA   r   r   ri   r   r  r   r     s    zWriter._mapN)r   )r'   r^   )r'   r^   )r'   )rk   rl   rm   rn   r   rD   rc   r  r6   r   r  r  r  r   r  r  r  r   r  r  r   r   r   r   r   r     s   
0

r   c                  C   s&   dd l } dd l}tdd|j d S )Nr   z
README.rstw)r   vcffiler   rn   )r   r  r   r   r   __update_readme  s    r  )1r   r   r   r   r   osr7   r   Zutilsr   r   ImportErrorZordereddictr   r   Zmodelr   r   r   r   r	   r
   r   ZRESERVED_INFOZRESERVED_FORMATr   r   r   r   r   rc   r   r   r   rD   
namedtuplerL   rO   rR   rU   Z_SampleInforX   objectr4   rp   r   r  Z	VCFReaderZ	VCFWriterr   r   r   r   <module>   s   

.
  
 7   = 