U
    Ae<@                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
mZ d dlmZ dd ZdddZdd	 Zd
d ZdddZdddZdd Zdd ZG dd deZedkre  dS )    N)Fastawrap_sequence
FetchError
ucsc_split	bed_splitget_valid_filename)defaultdictc                    s  t j j\}}|r"|dd  }t jj} jrB fdd}t	 j j
t j j  j| j j d}t \}}|s| }d}|D ]^}||\}	}
}|	d krq jr|
d k	r|d k	r||
 }nt||	 } jd |ks jd |k rq jr0ddd	 |	|
||fD }t|}t|d
}n jr@ j}ntj}zf jr|sp jdkrp|d d}|t ||	|
| n"t ||	|
|D ]}|| qW n4 t k
r } zt t!|d W 5 d }~X Y nX  jr|"  q|#  d S )N   c                    s   t  j|  S N)recompileregexsearch)xargs *lib/python3.8/site-packages/pyfaidx/cli.py<lambda>       z write_sequence.<locals>.<lambda>)default_seqZkey_functionZstrict_bounds
split_charfilt_functionZread_long_namesZrebuildFr   .c                 s   s   | ]}|rt |V  qd S r
   )str).0er   r   r   	<genexpr>'   s      z!write_sequence.<locals>.<genexpr>w
nucleotidez name	start	end	A	T	C	G	N	others
Tz Try setting --lazy.
)$ospathsplitextfastar   r   r   r   Zinvert_matchr   r   evalZheader_functionZlazy	delimiterZ
long_namesZ
no_rebuildsplit_regionskeysZ
size_rangelenZsplit_filesjoinr   openoutsysstdout	transformwritetransform_sequencefetch_sequencer   r   close__exit__)r   _extr   r#   regions_to_fetchsplit_functionheaderregionnamestartendZsequence_lenfilenameZoutfileliner   r   r   r   write_sequence	   sR    .


"
r?   c                 c   s  zb|j j| j}| jrP||krP|d k	rP|d k	rP|| |d |d  }|jj}n|| || }W n, tk
r   tj	dj
f t  Y d S X | jr|j}| jr|j}| jrd S | jrn8|s|r| jsdd|jdgV  ndd|jdgV  t||jD ]}|V   qd S )Nr	   z"warning: {name} not found in file
 >
)faidxindexlencauto_strandreverse
complementKeyErrorr,   stderrr/   formatlocals	no_outputZno_namesZ	no_coordsr)   Z
fancy_namer:   r   seq)r   r#   r:   r;   r<   line_lenZsequencer>   r   r   r   r1   >   s,    
r1   c           	      C   s   t | jd| jd}t| \}}|D ]}||\}}}| jr|rL|rL|| }n*|sb|sbt|| }nt|| || }|| j || ||< q"| jr"|| ||  || ||< q"d S )NT)Zmutabler   )	r   r#   r%   r&   mask_with_default_seqr(   r   mask_by_caseZ	lowercase)	r   r#   r6   r7   r9   Zrnamer;   r<   spanr   r   r   mask_sequence[   s    
rS   c                 C   s$   | j r| j }t}n
| j}t}||fS r
   )bedr   regionsr   )r   r6   r7   r   r   r   r&   n   s    r&   c                    s^  |j j| j}|| || }| jr*|j}| jr6|j}| jr@d S | jdkrddj|j|j	d |j
dS | jdkrdj|jt|dS | jdkr2t|  tt}| fd	d
t D  |dd}|dd}	|dd}
|dd}|dd}ddd
 | D }djf |j|j	|j
dt S | jdkrZdj|j|j	|j
t|dS d S )NrT   z{name}	{start}	{end}
r	   )r:   r;   r<   
chromsizesz{name}	{length}
)r:   lengthr   c                    s   g | ]}|  |fqS r   )count)r   cssr   r   
<listcomp>   s     z&transform_sequence.<locals>.<listcomp>Ar   TCGN|c                 S   s"   g | ]\}}d  |t|fqS ):)r)   r   )r   kvr   r   r   r\      s     z5{sname}	{sstart}	{send}	{A}	{T}	{C}	{G}	{N}	{others}
)ZsnameZsstartsend
transposedz{name}	{start}	{end}	{seq}
)r:   r;   r<   rN   )rC   rD   rE   rH   rG   rM   r.   rK   r:   r;   r<   r(   r   upperr   intupdatesetpopr)   itemsrL   )r   r#   r:   r;   r<   rO   sZnucsr]   r^   r_   r`   ra   Zothersr   rZ   r   r0   x   s2    

 r0   c           
      C   s  ddl m} tjddd}|jdtdd |jd	td
dd |d}|d}|d}|jddtddd |jddtddd |jddtddd |jdddd d!d" |jd#d$dd d%d" |jd&d'dd d(d" |jd)d*td d+d, |	 }|jd-d.dd d/d" |jd0d1dd d2d" |jd3d4dd d5d" |jd6d7dd d8d" |jd9d:dd d;d" |jd<d=t
d d>d, |jd?d@td dAd, |jdBdCtdDdEd, |jdFdGtdHdIdJdK |dL}|jdMdNtdOdPd, |jdQdRdd dSd" |	 }|jdTdUdd dVd" |jdWdXdd dYd" |jdZdd d[d" |jd\dd d]d" |jd^d_|d`da ttjdbkrn| sn|  tdb n| r|| }	n| }	|	jr|	jrtjdc |	jrtjdd |	js|	jrt|	 nt|	 d S )eNr   )__version__zFetch sequences from FASTA. If no regions are specified, all entries in the input file are returned. Input FASTA file must be consistently line-wrapped, and line wrapping of output is based on input line lengths.zPlease cite: Shirley MD, Ma Z, Pedersen BS, Wheelan SJ. (2015) Efficient "pythonic" access to FASTA files using pyfaidx. PeerJ PrePrints 3:e1196 https://dx.doi.org/10.7287/peerj.preprints.970v1)descriptionepilogr#   z
FASTA file)typehelprU   *z=space separated regions of sequence to fetch e.g. chr1:1-1000)rr   nargsrs   zinput optionszoutput optionszheader optionsz-bz--bedrz1bed file of regions (zero-based start coordinate)z-oz--outr   z"output file name (default: stdout)z-iz--transform)rT   rV   r   rg   zItransform the requested regions into another format. default: %(default)s)rr   choicesrs   z-cz--complement
store_trueFz-complement the sequence. default: %(default)s)actiondefaultrs   z-rz	--reversez*reverse the sequence. default: %(default)sz-yz--auto-strandzQreverse complement the sequence when start > end coordinate. default: %(default)sz-az--size-rangezZselected sequences are in the size range [low, high]. example: 1,1000 default: %(default)s)rr   rz   rs   z-nz
--no-namesz5omit sequence names from output. default: %(default)sz-fz--long-nameszpoutput full (long) names from the input fasta headers. default: headers are truncated after the first whitespacez-tz--no-coordszOomit coordinates (e.g. chr:start-end) from output headers. default: %(default)sz-xz--split-fileszEwrite each region to a separate file (names are derived from regions)z-lz--lazyz>fill in --default-seq for missing ranges. default: %(default)sz-sz--default-seqzDdefault base for missing positions and masking. default: %(default)sz-dz--delimiterzjdelimiter for splitting names to multiple values (duplicate names will be discarded). default: %(default)sz-ez--header-functionzlambda x: x.split()[0]z]python function to modify header lines e.g: "lambda x: x.split("|")[0]". default: %(default)sz-uz--duplicates-actionstop)r{   firstZlastZlongestZshortestzQentry to take when duplicate sequence names are encountered. default: %(default)s)rr   rz   rw   rs   zmatching argumentsz-gz--regexz.*zNselected sequences are those matching regular expression. default: %(default)sz-vz--invert-matchzRselected sequences are those not matching 'regions' argument. default: %(default)sz-mz--mask-with-default-seqz<mask the FASTA file using --default-seq default: %(default)sz-Mz--mask-by-casezBmask the FASTA file by changing to lowercase. default: %(default)sz--no-outputz0do not output any sequence. default: %(default)sz--no-rebuildzMdo not rebuild the .fai index even if it is out of date. default: %(default)sz	--versionversionzprint pyfaidx version number)ry   r}   rs   r	   zQ--auto-strand and --complement are both set. Are you sure this is what you want?
zN--auto-strand and --reverse are both set. Are you sure this is what you want?
)pyfaidxro   argparseArgumentParseradd_argumentr   add_argument_groupFileTypeparse_size_rangeadd_mutually_exclusive_groupcheck_seq_lengthr(   r,   argv
print_helpexit
parse_argsrF   rH   rJ   r/   rG   rP   rQ   rS   r?   )
Zext_argsro   parserZ_inputoutputr8   namesZmatcherZmaskingr   r   r   r   main   sb    




r   c                 C   s$   | d kr
nt | dkr td| S )Nr	   z/--default-seq value must be a single character!)r(   r   ArgumentTypeError)valuer   r   r   r      s
    
r   c              
   C   s^   | dkr| S z"|  dd ddd\}}W n tttfk
rL   tY nX t|t|fS )zK Size range argument should be in the form start,end and is end-inclusive. N r@   	,)replacesplit	TypeError
ValueError
IndexErrorri   )r   r;   r<   r   r   r   r      s    "
r   c                   @   s   e Zd ZdZdddZdd ZdddZd	d
 ZedddZ	d ddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )!CounterzDict subclass for counting hashable objects.  Sometimes called a bag
    or multiset.  Elements are stored as dictionary keys and their counts
    are stored as dictionary values.
    Nc                 K   s   | j |f| dS )zCreate a new, empty Counter object.  And if given, count elements
        from an input iterable.  Or, initialize the count from another mapping
        of elements to their counts.
        N)rj   )selfiterablekwdsr   r   r   __init__   s    zCounter.__init__c                 C   s   dS )Nr   r   )r   keyr   r   r   __missing__   s    zCounter.__missing__c                 C   s4   |dkrt |  tdddS t||  tddS )zList the n most common elements and their counts from the most
        common to the least.  If n is None, then list all element counts.
        Nr	   T)r   rG   )r   )sorted	iteritems
itemgetternlargest)r   nr   r   r   most_common   s    zCounter.most_commonc                 c   s,   |   D ]\}}td|D ]
}|V  qqdS )zIterator over elements repeating each as many times as its count.

        If an element's count has been set to zero or is a negative number,
        elements() will ignore it.

        N)r   repeat)r   elemrX   r4   r   r   r   elements   s    zCounter.elementsc                 C   s   t dd S )Nz@Counter.fromkeys() is undefined.  Use Counter(iterable) instead.)NotImplementedError)clsr   re   r   r   r   fromkeys  s    zCounter.fromkeysc                 K   s   |dk	rrt |drP| rB| j}| D ]\}}||d| | |< q$qrt| | n"| j}|D ]}||dd | |< qZ|r| | dS )zLike dict.update() but add counts instead of replacing them.

        Source can be an iterable, a dictionary, or another Counter instance.

        Nr   r   r	   )hasattrgetr   dictrj   )r   r   r   self_getr   rX   r   r   r   rj     s    
zCounter.updatec                 C   s   t | S )zBLike dict.copy() but returns a Counter instance instead of a dict.)r   )r   r   r   r   copy   s    zCounter.copyc                 C   s   || krt | | dS )zGLike dict.__delitem__() but does not raise KeyError for missing values.N)r   __delitem__)r   r   r   r   r   r   $  s    zCounter.__delitem__c                 C   s6   | sd| j j S dtdj|  }d| j j|f S )Nz%s()z, z%r: %rz%s({%s}))	__class____name__r)   map__mod__r   )r   rm   r   r   r   __repr__)  s    zCounter.__repr__c                 C   sN   t |tstS t }t| t|B D ]$}| | ||  }|dkr$|||< q$|S )z'Add counts from two counters.

        r   
isinstancer   NotImplementedrk   r   otherresultr   newcountr   r   r   __add__8  s    

zCounter.__add__c                 C   sN   t |tstS t }t| t|B D ]$}| | ||  }|dkr$|||< q$|S )zF Subtract count, but keep only results with positive counts.

        r   r   r   r   r   r   __sub__E  s    

zCounter.__sub__c                 C   sT   t |tstS t}t }t| t|B D ]&}|| | || }|dkr(|||< q(|S )zHUnion is the maximum of value in either of the input counters.

        r   )r   r   r   maxrk   )r   r   Z_maxr   r   r   r   r   r   __or__R  s    

zCounter.__or__c                 C   sj   t |tstS t}t }t| t|k r2||  } }t| j|D ]&}|| | || }|dkr>|||< q>|S )z? Intersection is the minimum of corresponding counts.

        r   )r   r   r   minr(   filter__contains__)r   r   Z_minr   r   r   r   r   r   __and__`  s    


zCounter.__and__)N)N)N)N)r   
__module____qualname____doc__r   r   r   r   classmethodr   rj   r   r   r   r   r   r   r   r   r   r   r   r      s   


r   __main__)NN)NN)N)r   r,   Zos.pathr    r   r~   r   r   r   r   r   r   collectionsr   r?   r1   rS   r&   r0   r   r   r   r   r   r   r   r   r   r   <module>   s"    5



9 