U
    _j=                     @   s<  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dlm
Z
 d dlZd dlmZ d dlmZ d dlmZ ejejeZdd Zd	d
 Zdd Zd2ddZdd Zdd Zd3ddZd4ddZd5ddZdd Zdd Zdd  Zd6d"d#Z d7d$d%Z!d&d' Z"d(d) Z#d8d+d,Z$d-d. Z%d9d0d1Z&dS ):    N)	constants)bins)	gffwriter)parser)
dict_classc                 C   s   t jtdd| S )zG
    Return the full path of a data file that ships with gffutils.
    Ztestdata)ospathjoinHERE)fn r   /lib/python3.8/site-packages/gffutils/helpers.pyexample_filename   s    r   c                 C   s(   t | tjr| g} dd | D }t|S )a!  
    Infer the dialect based on the attributes.

    Parameters
    ----------
    attributes : str or iterable
        A single attributes string from a GTF or GFF line, or an iterable of
        such strings.

    Returns
    -------
    Dictionary representing the inferred dialect
    c                 S   s   g | ]}t |d  qS )   )r   Z_split_keyvals.0ir   r   r   
<listcomp>)   s     z!infer_dialect.<locals>.<listcomp>)
isinstancesixstring_types_choose_dialect)
attributesdialectsr   r   r   infer_dialect   s    r   c                 C   sT   t | dkrtjS g }| D ]$}|d D ]}||kr&|| q&q| d }||d< |S )a  
    Given a list of dialects, choose the one to use as the "canonical" version.

    If `dialects` is an empty list, then use the default GFF3 dialect

    Parameters
    ----------
    dialects : iterable
        iterable of dialect dictionaries

    Returns
    -------
    dict
    r   order)lenr   dialectappend)r   Zfinal_orderr   or   r   r   r   -   s    r   Fc	              	   C   s  d}	t tjddddddd}
|r(||
d< |r4||
d< |
d |
d  d}t| |krbtd|  |rt|tjrd|
d	< | 	| n&d
d
dd |D  |
d	< | | |rdt|tjr|d\}}|d\}}n
|\}}}tjt|t|dd}|rd|
d< | |||g nd|
d< | |||g t|dk rd|
d  dd
tt| 7  < |r|d|
d< | 	| tjddg }g }|rt|tjr|	| n>|D ]8}||krtd||f |dkrd}|	| qd
|}|rd}nd}d||f |
d < d}d!|
d  kr0d"}d#D ]<}|
| r4|s^d$|
|  |
|< d"}nd%|
|  |
|< q4|	jf |
| fS )&a{  
    Multi-purpose, bare-bones ORM function.

    This function composes queries given some commonly-used kwargs that can be
    passed to FeatureDB methods (like .parents(), .children(), .all_features(),
    .features_of_type()).  It handles, in one place, things like restricting to
    featuretype, limiting to a genomic range, limiting to one strand, or
    returning results ordered by different criteria.

    Additional filtering/subsetting/sorting behavior should be added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    This function also provides support for additional JOINs etc (supplied via
    the `other` kwarg) and extra conditional clauses (`extra` kwarg).  See the
    `_QUERY` var below for the order in which they are used.

    For example, FeatureDB._relation uses `other` to supply the JOIN
    substatment, and that same method also uses `extra` to supply the
    "relations.level = ?" substatment (see the source for FeatureDB._relation
    for more details).

    `args` contains the arguments that will ultimately be supplied to the
    sqlite3.connection.execute function.  It may be further populated below --
    for example, if strand="+", then the query will include a strand clause,
    and the strand will be appended to the args.

    `args` can be pre-filled with args that are passed to `other` and `extra`.
    zC{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} {LIMIT} {STRAND} {ORDER_BY} )_SELECTOTHERFEATURETYPELIMITSTRANDORDER_BYEXTRAr#   r(   ?z!Not enough args (%s) for subqueryzfeatures.featuretype = ?r$   zfeatures.featuretype IN  (%s),c                 S   s   g | ]}d qS )r)   r   )r   _r   r   r   r      s     zmake_query.<locals>.<listcomp>:-FZonez@features.seqid = ? AND features.start >= ? AND features.end <= ?r%   z@features.seqid = ? AND features.start <= ? AND features.end >= ?i  z AND features.bin IN (%s)zfeatures.strand = ?r&   Z
file_orderlengthz#%s not a valid order-by value in %sz(end - start)ZDESCZASCzORDER BY %s %sr'   whereT)r(   r$   r%   r&   zWHERE zAND )dictr   r"   countr   
ValueErrorr   r   r   r   r
   extendsplitr   intmapstrZ_gffkeys_extralowerformat)argsotherlimitZstrandfeaturetypeextraZorder_byreverseZcompletely_withinZ_QUERYdZrequired_argsZseqidZ	startstopstartendZ_binsZvalid_order_byZ	_order_byk	directionr0   r   r   r   r   
make_queryN   s    "  


 




rF   c                 C   sF   z*t | d }t | d }tj||ddW S  tk
r@   Y dS X dS )zQ
    Given a dictionary yielded by the parser, return the genomic "UCSC" bin
    rB   rC   Tr.   N)r6   r   r3   )rA   rB   rC   r   r   r   _bin_from_dict   s    rG   c                 C   s(   t | trtj| jddS tj| ddS )zUse most compact form of JSON)r*   r,   )Z
separators)r   r   jsondumpsZ_dxr   r   r   _jsonify   s    
rL   c                 C   s    |rt | }t|S t | S )z.Convert JSON string to an ordered defaultdict.)rH   loadsr   )rK   Zisattributesobjr   r   r   
_unjsonify  s    
rO   Tc                 C   sH   g }t jD ]4}t| |}|r4|dkr4|t| q
|| q
t|S )z=
    Convert feature to tuple, for faster sqlite3 import
    r   r?   )r   _keysgetattrr   rL   tuple)fjsonifyrK   rD   vr   r   r   _feature_to_fields  s    

rW   c                 C   sF   g }t jD ]2}| | }|r2|dkr2|t| q
|| q
t|S )z:
    Convert dict to tuple, for faster sqlite3 import
    rP   )r   rQ   r   rL   rS   )rA   rU   rK   rD   rV   r   r   r   _dict_to_fields  s    
rX   c                 C   s   ddl }|t| dS )z>
    Converts a gffutils.Feature to a pybedtools.Interval
    r   N	)
pybedtoolsZcreate_interval_from_listr8   r5   )ZfeaturerZ   r   r   r   
asinterval+  s    r[   c                 C   s   t | }|t | | D ]\}}t|ts"|g||< q"t| D ].\}}||krJt|tsj|g}|| | qJt	dd | D S )z
    Merges two attribute dictionaries into a single dictionary.

    Parameters
    ----------
    `attr1`, `attr2` : dict

    Returns
    -------
    dict
    c                 s   s"   | ]\}}|t t|fV  qd S N)sortedset)r   rD   rV   r   r   r   	<genexpr>M  s     z#merge_attributes.<locals>.<genexpr>)
copydeepcopyupdateitemsr   listr   Z	iteritemsr4   r1   )Zattr1Zattr2Znew_drD   rV   r   r   r   merge_attributes3  s    


re   c                 C   s@   t |  }t | }ttt||tt||dS )z 
    Compares two dialects.
    )ZaddedZremoved)r^   rc   r1   rd   
difference)Zdialect1Zdialect2Zorignewr   r   r   dialect_compareP  s    rh   gidc                    s$    fdd}t j| ddd}|S )a  
    Sanitize given GFF db. Returns a sanitized GFF db.

    Sanitizing means:

    - Ensuring that start < stop for all features
    - Standardizing gene units by adding a 'gid' attribute
      that makes the file grep-able

    TODO: Do something with negative coordinates?
    c                  3   sV      D ]H} | d j}| D ]4}|j|jkr<|j|j |_|_|g|j< |V  qqd S )Nr   )Ziter_by_parent_childsidrB   stopr   )Z	gene_recsZgene_idZrecdb	gid_fieldr   r   sanitized_iteratorh  s    
z+sanitize_gff_db.<locals>.sanitized_iterator:memory:Fverbose)gffutils	create_db)rm   rn   ro   sanitized_dbr   rl   r   sanitize_gff_db\  s    
rv   c                 C   s   d}t | rt| }n|r.tj| ddd}nt| }|rJtj| |d}nttj}t	|}|j
ddD ]}|||j qj|  dS )z
    Sanitize a GFF file.
    Nrp   Frq   )in_placegene)r>   )	is_gff_dbrs   Z	FeatureDBrt   
get_gff_dbr   Z	GFFWritersysstdoutrv   Zall_featuresZwrite_gene_recsrj   close)	gff_fnameZ	in_memoryrw   rm   Zgff_outru   Zgene_recr   r   r   sanitize_gff_file{  s"    r   c                 C   s   dS )zq
    Annotate a GFF file by cross-referencing it with another GFF
    file, e.g. one containing gene models.
    Nr   )rm   r   r   r   annotate_gff_db  s    r   c                 C   s"   t j| sdS | drdS dS )zc
    Return True if the given filename is a GFF database.

    For now, rely on .db extension.
    F.dbT)r   r	   isfileendswith)db_fnamer   r   r   ry     s
    
ry   utf-8c                 C   s(   t | tjr$t | tjs$t| |} | S r\   )r   r   r   Z	text_type)rN   encodingr   r   r   
to_unicode  s    r   c                 #   s  dd l }|j|dd | dD ]}g }t| j|ddD ]`\}}d}d}t| j|dd}	|	D ]&}
t|
}|
jdkr~||7 }||7 }q`|||||	f q:t	dd	 |D dkrt
|d }nt
|d
d d }t| |d }|d } fdd|D }|d|fV  q d S )Nr   T)Zas_rawrx   r   )levelZCDSc                 s   s   | ]}|d  V  qdS )r   Nr   r   r   r   r   r_     s     z(canonical_transcripts.<locals>.<genexpr>c                 S   s   | d S )Nr   r   rJ   r   r   r   <lambda>      z'canonical_transcripts.<locals>.<lambda>c                    s   g | ]}|  qS r   )Zsequencer   Zfastar   r   r     s     z)canonical_transcripts.<locals>.<listcomp>r!   )pyfaidxZFastaZfeatures_of_type	enumerateZchildrenrd   r   r>   r   maxr]   printr
   )rm   Zfasta_filenamer   rx   Z	exon_listZtiZ
transcriptZcds_lenZ	total_lenZexonsZexonZexon_lengthZbestZcanonical_exonsZseqsr   r   r   canonical_transcripts  s,    

r   r   c                 C   s   t j| std|  d| |f }t j|r4|S tjdd}td|   t }tj	| |j
ddd}t }td||   |S )	z
    Get db for GFF file. If the database has a .db file,
    load that. Otherwise, create a named temporary file,
    serialize the db to that, and return the loaded database.
    zGFF %s does not exist.z%s.%sF)deletezCreating db for %smerge)Zmerge_strategyrr   z  - Took %.2f seconds)r   r	   r   r3   tempfileZNamedTemporaryFiler   timers   rt   name)r~   extZcandidate_db_fnamer   t1rm   t2r   r   r   rz     s    
rz   )NNNNNNFF)F)T)T)ri   )TF)r   )r   )'r`   r{   r   Z
simplejsonrH   r   r   r   rs   r   r   r   r   Zgffutils.attributesr   r	   dirnameabspath__file__r   r   r   r   rF   rG   rL   rO   rW   rX   r[   re   rh   rv   r   r   ry   r   r   rz   r   r   r   r   <module>   sR   !       
 %



   

) 