U
    _W                     @   sZ  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dl
mZ d dl
mZ d dl
mZ d dl
mZ d dl
mZ d d	l
mZ d dlZed
ZeeZe Zeej ee ee dd ZG dd deZ G dd de Z!G dd de Z"dddddddddddddej#dej$ddddddfddZ%dS )    N)dedent)	constants)version)bins)helpers)feature)	interface)	iteratorsz)%(asctime)s - %(levelname)s - %(message)sc                 C   s   t| dkrtd|  dS )zH
    As things change from version to version, deal with them here.
    r   zunhandled kwarg in %sN)
ValueErrorlen	TypeError)kwargs r   .lib/python3.8/site-packages/gffutils/create.pydeprecation_handler   s    r   c                   @   s   e Zd ZddddddddddddddejejddfddZd-d	d
Zdd Z	dd Z
dd Zd.ddZdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, ZdS )/
_DBCreatorFNerror
   zutf-8Tc                 K   s~  || _ |dkrg }|dkr^tddg|r4tdt|ddg}|D ]}td|  qJ|| _|| _|| _|| _	|dkrg }|| _
|std	 d
}d
}|| _|| _|rtj|rt| || _|| _t|tjrt|}n|}|| _tj| j_| | |dk	r2| jdkr*td|  || j_|| _ tj!| _"t#j$||||	|
|d| _%d|krn|d | _&nt'(t)| _&dS )zk
        Base class for _GFFDBCreator and _GTFDBCreator; see create_db()
        function for docs
        Nmergestartendz8Can't merge start/end fields since they must be integersframestrandz\%s field will be merged for features with the same ID; this may result in unusable features.z'infer_gene_extent' will be deprecated. For now, the following equivalent values were automatically set: 'disable_infer_genes=True', 'disable_infer_transcripts=True'. Please use these instead in the future.Tdebugzsetting text factory to %s)data
checklines	transformforce_dialect_checkfrom_stringdialect_autoincrements)*_keep_tempfilessetintersectionr
   warningswarnforce_merge_fieldspragmasmerge_strategydefault_encoding
directivesdisable_infer_genesdisable_infer_transcriptsospathexistsunlinkdbfnid_spec
isinstancesixstring_typessqlite3ZconnectconnZRowZrow_factoryset_verboseverboseloggerr   text_factory_datalevelZ_orig_logger_levelr	   DataIteratoriteratorr    collectionsdefaultdictint)selfr   r1   forcer9   r2   r(   r   r   r   r   r   r)   r+   r,   infer_gene_extentr&   r;   r'   r!   r*   r   r%   wr7   r   r   r   __init__2   sp    




   
z_DBCreator.__init__c                 C   s>   |dkrt tj n|r(t tj nt tj || _d S )Nr   )r:   setLevelloggingDEBUGINFOZERRORr9   )rC   r9   r   r   r   r8      s    z_DBCreator.set_verbosec                 C   s$   | j |  d7  < d|| j | f S )N   z%s_%s)r    )rC   keyr   r   r   _increment_featuretype_autoid   s    z(_DBCreator._increment_featuretype_autoidc              
   C   sL  t | jtjr| jg}nnt| jdr.| jg}nXt | jtrz"| j|j }t |tjrZ|g}W q tk
r|   | |j Y S X n| j}|D ]}t|dr||}|r|	dr| |dd   S |  S qt
|dkr|d dkr|d dkrt||d	d   S z|j| d W   S  ttfk
r<   Y qX q| |jS )
z
        Given a Feature from self.iterator, figure out what the ID should be.

        This uses `self.id_spec` identify the ID.
        __call__zautoincrement:   N   r   :rL   )r3   r2   r4   r5   hasattrdictfeaturetypeKeyErrorrN   
startswithr   getattr
attributes
IndexError)rC   fZid_keykZ_idr   r   r   _id_handler   s4    





*z_DBCreator._id_handlerc                 C   s:   | j  }|tjd |f }tjf d| jj	i|S )Nz WHERE id = ?r   )
r7   cursorexecuter   _SELECTfetchoner   Featurer?   r   )rC   IDcresultsr   r   r   _get_feature   s    
 z_DBCreator._get_featurec              
      s  |dkrt d |dkr6td  d|fS |dkrF |fS |dkrg }| jdkr|td	d
d |  D   ttt	j
dd | j}|  D ]z}d}|D ] }t||t |krd} qq|r|| | jdkrtd| f  q| jdkrtd| f  qt|dkr\ j}	| j dd\}
}| |	|
j |
|fS | jdkrztdt|  t j}t fdd| jD }|D ]|}| jdkrtd |f  |j D ](}||g }|||  |||< q| jD ]}|| t||g qq| D ]\}}tt|||< q(||_| D ]&\}}t||dttt | qR| jdkrtd|  ||fS n,|dkr| ! j _ |fS t d| dS )a?  
        Different merge strategies upon name conflicts.

        "error":
            Raise error

        "warning"
            Log a warning, which indicates that all future instances of the
            same ID will be ignored

        "merge":
            Combine old and new attributes -- but only if everything else
            matches; otherwise error. This can be slow, but is thorough.

        "create_unique":
            Autoincrement based on the ID, always creating a new ID.

        "replace":
            Replaces existing database feature with `f`.
        r   zDuplicate ID {0.id}warningzCDuplicate lines in file for id '{0.id}'; ignoring all but the firstNreplacer   r   zcandidates with same idspec: %sc                 S   s   g | ]
}|j qS r   )id.0ir   r   r   
<listcomp>   s     z(_DBCreator._do_merge.<locals>.<listcomp>rS   TFz2same attributes between:
existing: %s
this    : %sz7different attributes between:
existing: %s
this    : %sr   create_unique)r(   znum candidates: %sc                    s    g | ]}|t t |gfqS r   )r"   rY   rl   fieldr\   r   r   rn   6  s   z
merging

%s
%s
,z
MERGED:
%szInvalid merge strategy '%s')"r
   formatr:   rh   r9   r   _candidate_mergeslistr"   r   Z_gffkeys
differencer&   rY   appendr   rj   	_do_merge_add_duplicatecopydeepcopyrZ   rU   keys
setdefaultextendupdateitemssetattrjoinsortedmapstrrN   )rC   r\   r(   Zadd_duplicateZfeatures_to_mergeZ_gffkeys_to_checkZexisting_featureZother_attributes_samer]   Zorig_idZuniqued_featureZmerged_attributesZfinal_fieldsvrq   r   rr   r   ry      s    



 




 

z_DBCreator._do_mergec              
   C   s   | j  }z|d||f W n6 tjk
rT   |d|| j|| jf Y nX | jdkrrt	d||f  | j 
  dS )aR  
        Adds a duplicate ID (as identified by id_spec) and its new ID to the
        duplicates table so that they can be later searched for merging.

        Parameters
        ----------
        newid : str
            The primary key used in the features table

        idspecid : str
            The ID identified by id_spec
        zg
                INSERT INTO duplicates
                (idspecid, newid)
                VALUES (?, ?)r   zadded id=%s; new=%sN)r7   r_   r`   r6   ProgrammingErrordecoder)   r9   r:   r   commit)rC   ZidspecidZnewidre   r   r   r   rz   \  s     




z_DBCreator._add_duplicatec                 C   sb   |  |jg}| j }|tjd |jf}|D ]"}|tj	f d| j
ji| q2tt|S )z
        Identifies those features that originally had the same ID as `f`
        (according to the id_spec), but were modified because of duplicate
        IDs.
        zh
            JOIN duplicates ON
            duplicates.newid = features.id WHERE duplicates.idspecid = ?r   )rg   rj   r7   r_   r`   r   ra   rx   r   rc   r?   r   rv   r"   )rC   r\   Z
candidatesre   rf   rm   r   r   r   ru   ~  s    
z_DBCreator._candidate_mergesc                 C   s   t d S NNotImplementedError)rC   linesr   r   r   _populate_from_lines  s    z_DBCreator._populate_from_linesc                 C   s   t d S r   r   rC   r   r   r   _update_relations  s    z_DBCreator._update_relationsc                 C   s2   | j  }tjD ]}|d|f q| j   d S )NzDROP INDEX IF EXISTS ?)r7   r_   r   ZINDEXESr`   r   )rC   re   indexr   r   r   _drop_indexes  s    

z_DBCreator._drop_indexesc                 C   s>   || _ | j }|ddd | j  D  | j  dS )a  
        Set pragmas for the current database connection.

        Parameters
        ----------
        pragmas : dict
            Dictionary of pragmas; see constants.default_pragmas for a template
            and http://www.sqlite.org/pragma.html for a full list.
        z;
c                 S   s   g | ]}d | qS )zPRAGMA %s=%sr   rk   r   r   r   rn     s     z*_DBCreator.set_pragmas.<locals>.<listcomp>N)r'   r7   r_   executescriptr   r   r   )rC   r'   re   r   r   r   set_pragmas  s    

z_DBCreator.set_pragmasc                 C   s6   | j  }tj}| | j |tj | j 	  dS )z 
        Table creation
        N)
r7   r_   r6   Zsqlite_version_infor   r'   r   r   ZSCHEMAr   )rC   re   r   r   r   r   _init_tables  s
    
z_DBCreator._init_tablesc                 C   s&  | j  }| j| jj }|ddd |D  |dttjt	| jj
d |dt| j  td |d |d	 td
 |d |d td |d |d td |d |d td |d |d td |d | j   | jj| _dS )z
        Various last-minute stuff to perform after file has been parsed and
        imported.

        In general, if you'll be adding stuff to the meta table, do it here.
        zO
                      INSERT INTO directives VALUES (?)
                      c                 s   s   | ]}|fV  qd S r   r   rk   r   r   r   	<genexpr>  s     z'_DBCreator._finalize.<locals>.<genexpr>zX
            INSERT INTO meta (version, dialect)
            VALUES (:version, :dialect))r   r   zM
            INSERT OR REPLACE INTO autoincrements VALUES (?, ?)
             Creating relations(parent) index$DROP INDEX IF EXISTS relationsparent2CREATE INDEX relationsparent ON relations (parent)Creating relations(child) index#DROP INDEX IF EXISTS relationschild0CREATE INDEX relationschild ON relations (child)z$Creating features(featuretype) indexz DROP INDEX IF EXISTS featuretypez2CREATE INDEX featuretype ON features (featuretype)z+Creating features (seqid, start, end) indexz"DROP INDEX IF EXISTS seqidstartendz:CREATE INDEX seqidstartend ON features (seqid, start, end)z3Creating features (seqid, start, end, strand) indexz(DROP INDEX IF EXISTS seqidstartendstrandzHCREATE INDEX seqidstartendstrand ON features (seqid, start, end, strand)zRunning ANALYZE featureszANALYZE featuresN)r7   r_   r*   r?   executemanyr`   rU   r   r   _jsonifyr   rv   r    r   r:   infor   r$   )rC   re   r*   r   r   r   	_finalize  sD    


















z_DBCreator._finalizec                 C   s(   |    | | j |   |   dS )zb
        Calls various methods sequentially in order to fully build the
        database.
        N)r   r   r?   r   r   r   r   r   r   create  s    z_DBCreator.createc                 C   s   |  | |   d S r   )r   r   )rC   r?   r   r   r   r      s    
z_DBCreator.updatec                 c   s(   | j  }||}|D ]
}|V  qdS )z;
        Execute a query directly on the database.
        N)r7   r_   r`   )rC   Zqueryre   resultrm   r   r   r   r`     s    

z_DBCreator.executec              	   C   sH   z| tj|  W n, tjk
rB   | tj|| j Y nX dS z5
        Insert a feature into the database.
        N)r`   r   _INSERTastupler6   r   r)   rC   r   r_   r   r   r   _insert  s     
z_DBCreator._insertc              
   C   s`   z"| tjt| |jg  W n8 tjk
rZ   | tjt|| j	|jg  Y nX dS r   )
r`   r   Z_UPDATErv   r   rj   r6   r   r   r)   r   r   r   r   _replace  s    z_DBCreator._replace)N)F)__name__
__module____qualname__r6   OptimizedUnicoder   default_pragmasrG   r8   rN   r^   rg   ry   rz   ru   r   r   r   r   r   r   r   r   r`   r   r   r   r   r   r   r   1   sH        
Q
	6
 "
9	
r   c                       s,   e Zd Z fddZdd Zdd Z  ZS )_GFFDBCreatorc                    s   t t| j|| dS )z
        _DBCreator subclass specifically for working with GFF files.

        create_db() delegates to this class -- see that function for docs
        N)superr   rG   rC   argsr   	__class__r   r   rG   &  s    z_GFFDBCreator.__init__c              
      s  | j  }|   d}td d}d }g g  }}t|D ]P\}}	|}| jrt|d dkrttj	||  tj
  | |	|	_z| |	| W n tjk
r\   | |	| j\ }
|
dkr*|dt j jf | jrXddd	 | jD } fd
d	| jD  jg }|d| t| n.|
dkrB| |	| n|
dkrX| |	| Y nX d|	jkr:|	jd D ]}|d||	jf qrq:|d krtd| j   | jrt||  d S )Nr   zPopulating featuresAPopulating features table and first-order relations: %d features  r   y
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        , c                 S   s   g | ]}d | qS z%s = ?r   rp   r   r   r   rn   \  s   z6_GFFDBCreator._populate_from_lines.<locals>.<listcomp>c                    s   g | ]}t  |qS r   rY   rp   fixedr   r   rn   ^  s   y
                            UPDATE features SET %s
                            WHERE id = ?
                            ri   ro   ZParentzz
                        INSERT OR IGNORE INTO relations VALUES
                        (?, ?, 1)
                        .No lines parsed -- was an empty file provided?)r7   r_   r   r:   r   	enumerater9   sysstderrwriteflushr^   rj   r   r6   IntegrityErrorry   r(   r`   r   r   rZ   r&   r   tupler   r
   r   )rC   r   re   	last_percmsgZfeatures_seenZ	_featuresZ
_relationsrm   r\   final_strategy_set_clausevaluesparentr   r   r   r   .  sr    





	






z"_GFFDBCreator._populate_from_linesc           	   
      s  t d | j }| j }| j }t| jtjr>| j}nd}tj	d|dj
}t|dT |d |D ]>}|dt| |D ]$} d|d	 |d	 fd
  qqlW 5 Q R X  fdd}|d|  |d |d | j  | jst j
 d S )NzUpdating relations	.gffutilsFdeletesuffixrF   zSELECT id FROM featuresz
                           SELECT child FROM relations WHERE parent IN
                           (SELECT child FROM relations WHERE parent = ?)
                           	r   
c               	   3   sF   t  j2} | D ]&}| d\}}t||ddV  qW 5 Q R X d S )Nr      )r   childr=   )opennamestripsplitrU   )finliner   r   foutr   r   relations_generator  s    z<_GFFDBCreator._update_relations.<locals>.relations_generatorzf
            INSERT OR IGNORE INTO relations VALUES
            (:parent, :child, :level)
            zDROP INDEX IF EXISTS binindexz'CREATE INDEX binindex ON features (bin))r:   r   r7   r_   r3   r!   r4   r5   tempfileNamedTemporaryFiler   r   r`   r   r   r   r   r   r-   r0   )	rC   re   c2Zc3r   tmpr   Z
grandchildr   r   r   r   r   {  s4    



	
.


z_GFFDBCreator._update_relationsr   r   r   rG   r   r   __classcell__r   r   r   r   r   %  s   Mr   c                       s,   e Zd Z fddZdd Zdd Z  ZS )_GTFDBCreatorc                    s@   | dd| _| dd| _| dd| _tt| j|| dS )zS
        create_db() delegates to this class -- see that function for docs
        transcript_keytranscript_idgene_keygene_id
subfeatureexonN)popr   r   r   r   r   rG   r   r   r   r   rG     s    z_GTFDBCreator.__init__c              
      sj  d}| j  }d}d}d}t|D ]\}}||k rj|jdkrP| jsPtd n|jdkrj| jsjtd |d }| jr|d dkrt	j
||  t	j
  | ||_z| || W n tjk
r   | || j\ }	|	d	krP|d
t j jf | jr~ddd | jD }
 fdd| jD  jg }|d|
 | n.|	dkrh| || n|	dkr~| || Y nX g }d }d }| j|jkr|j| j d }|||jdf | j|jkr|j| j }t|dkr|d }|||jdf |d k	r|||df |d| q"|dkr<t dt!"d | j #  | jrft!"||  d S )Nr   r   r   
transcriptzIt appears you have a transcript feature in your GTF file. You may want to use the `disable_infer_transcripts=True` option to speed up database creationgenezIt appears you have a gene feature in your GTF file. You may want to use the `disable_infer_genes=True` option to speed up database creationrL   r   r   r   c                 S   s   g | ]}d | qS r   r   rp   r   r   r   rn     s   z6_GTFDBCreator._populate_from_lines.<locals>.<listcomp>c                    s   g | ]}t  |qS r   r   rp   r   r   r   rn      s   r   ri   ro   r   zy
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                r   Committing changes)$r7   r_   r   rV   r,   r$   r%   r+   r9   r   r   r   r   r^   rj   r   r6   r   ry   r(   r`   r   r   rZ   r&   r   r   r   rx   r   r   r   r
   r:   r   r   )rC   r   r   re   Zgene_and_transcript_check_limitr   Z
lines_seenrm   r\   r   r   r   Z	relationsr   Zgrandparentr   r   r   r     s    










z"_GTFDBCreator._populate_from_linesc                    s  j rjrd S j }j }td |d |d td |d |d j srjsrd}njr~d}n
j rd	}td
|  tjt	j
rj}nd}tjd|dj}t|dD |_|djf d }d}|D ]\}}	js|d|jf | \}
}}}j|gj|	gi}tj|
|dd} dtt|||
||d	|t|gd  |d7 }j s|	|kr|d|	jf | \}}}}j|	gi}tj||dd} dtt|	||||d|t|gd  |	}|d7 }qW 5 Q R X  fdd}|d |d td d }t| D ]\}}t|t| d }||krtjd|||f  tj   |}z!|| W nB t"j#k
r   $|d\}}|dt|j%|j&f Y nX qPtd j'  jst() j d S ) Nr   r   r   r   r   r   zgene and transcriptr   r   z,Inferring %s extents and writing to tempfiler   Fr   rF   a  
                SELECT DISTINCT firstlevel.parent, relations.parent
                FROM (
                    SELECT DISTINCT parent
                    FROM relations
                    JOIN features ON features.id = relations.child
                    WHERE features.featuretype = ?
                    AND relations.level = 1
                )
                AS firstlevel
                JOIN relations ON firstlevel.parent = child
                WHERE relations.level = 1
                ORDER BY relations.parent
                r   a   
                        SELECT MIN(start), MAX(end), strand, seqid
                        FROM features
                        JOIN relations ON
                        features.id = relations.child
                        WHERE parent = ? AND featuretype == ?
                        T)Zoner   r   rL   a8  
                            SELECT MIN(start), MAX(end), strand, seqid
                            FROM features
                            JOIN relations ON
                            features.id = relations.child
                            WHERE parent = ? AND featuretype == ?
                            c               	   3   s   ddddddddg} t  j}|D ]z}ttt| | d	}|d d
|d< d|d< d
|d< g |d< t	|d |d< t
jf |}||_|V  q$W 5 Q R X dS )zS
            Generator of items from the file that was just created...
            r   seqidr   r   r   rV   binrZ   r   .ZscoreZgffutils_derivedsourcer   ZextraN)r   r   rU   rv   zipr   r   r   r   Z
_unjsonifyr   rc   r^   rj   )r}   r   r   dr\   r   rC   r   r   derived_feature_generator  s"    
  
zB_GTFDBCreator._update_relations.<locals>.derived_feature_generatorz#Importing inferred features into dbd   z%s of %s (%s%%)r   zm
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    r   )*r+   r,   r7   r_   r:   r   r`   r3   r!   r4   r5   r   r   r   r   Z_tmpfiler   rb   r   r   r   r   r   r   r   r   r   r   rB   floatr   r   r   r   r6   r   ry   rZ   rj   r   r-   r0   )rC   re   r   r   r   r   Zlast_gene_idZ
n_featuresr   r   Ztranscript_startZtranscript_endr   r   Ztranscript_attributesZtranscript_binZ
gene_startZgene_endZgene_attributesZgene_binr   r   rm   r\   percr   r   r   r   r   r   0  s    







    	
	







z_GTFDBCreator._update_relationsr   r   r   r   r   r     s   	rr   Fr   r   r   r   r   Tc                    s  t   t| t fddtjD }tjf |}|jf   |dkrL|j}|j	|d< |j
|d< d|d< |sx|d d	krt}|pd
}t|d}n.|d dkrt}|pddd}t||	|
|d}|jf | ||d< |f |}|  |dkrtj|j||||d}ntj|||||d}|S )a$  
    Create a database from a GFF or GTF file.

    For more details on when and how to use the kwargs below, see the examples
    in the online documentation (:ref:`examples`).

    Parameters
    ----------
    data : string or iterable

        If a string (and `from_string` is False), then `data` is the path to
        the original GFF or GTF file.

        If a string and `from_string` is True, then assume `data` is the actual
        data to use.

        Otherwise, it's an iterable of Feature objects.

    dbfn : string

        Path to the database that will be created.  Can be the special string
        ":memory:" to create an in-memory database.

    id_spec : string, list, dict, callable, or None

        This parameter guides what will be used as the primary key for the
        database, which in turn determines how you will access individual
        features by name from the database.

        If `id_spec=None`, then auto-increment primary keys based on the
        feature type (e.g., "gene_1", "gene_2").  This is also the fallback
        behavior for the other values below.

        If `id_spec` is a string, then look for this key in the attributes.  If
        it exists, then use its value as the primary key, otherwise
        autoincrement based on the feature type.  For many GFF3 files, "ID"
        usually works well.

        If `id_spec` is a list or tuple of keys, then check for each one in
        order, using the first one found.  For GFF3, this might be ["ID",
        "Name"], which would use the ID if it exists, otherwise the Name,
        otherwise autoincrement based on the feature type.

        If `id_spec` is a dictionary, then it is a mapping of feature types to
        what should be used as the ID.  For example, for GTF files, `{'gene':
        'gene_id', 'transcript': 'transcript_id'}` may be useful.  The values
        of this dictionary can also be a list, e.g., `{'gene': ['gene_id',
        'geneID']}`

        If `id_spec` is a callable object, then it accepts a dictionary from
        the iterator and returns one of the following:

            * None (in which case the feature type will be auto-incremented)
            * string (which will be used as the primary key)
            * special string starting with "autoincrement:X", where "X" is
              a string that will be used for auto-incrementing.  For example,
              if "autoincrement:chr10", then the first feature will be
              "chr10_1", the second "chr10_2", and so on.

    force : bool

        If `False` (default), then raise an exception if `dbfn` already exists.
        Use `force=True` to overwrite any existing databases.

    verbose : bool

        Report percent complete and other feedback on how the db creation is
        progressing.

        In order to report percent complete, the entire file needs to be read
        once to see how many items there are; for large files you may want to
        use `verbose=False` to avoid this.

    checklines : int

        Number of lines to check the dialect.

    merge_strategy : str
        One of {merge, create_unique, error, warning, replace}.

        This parameter specifies the behavior when two items have an identical
        primary key.

        Using `merge_strategy="merge"`, then there will be a single entry in
        the database, but the attributes of all features with the same primary
        key will be merged.

        Using `merge_strategy="create_unique"`, then the first entry will use
        the original primary key, but the second entry will have a unique,
        autoincremented primary key assigned to it

        Using `merge_strategy="error"`, a :class:`gffutils.DuplicateID`
        exception will be raised.  This means you will have to edit the file
        yourself to fix the duplicated IDs.

        Using `merge_strategy="warning"`, a warning will be printed to the
        logger, and the duplicate feature will be skipped.

        Using `merge_strategy="replace"` will replace the entire existing
        feature with the new feature.

    transform : callable

        Function (or other callable object) that accepts a `Feature` object and
        returns a (possibly modified) `Feature` object.

    gtf_transcript_key, gtf_gene_key : string

        Which attribute to use as the transcript ID and gene ID respectively
        for GTF files.  Default is `transcript_id` and `gene_id` according to
        the GTF spec.

    gtf_subfeature : string

        Feature type to use as a "gene component" when inferring gene and
        transcript extents for GTF files.  Default is `exon` according to the
        GTF spec.

    force_gff : bool
        If True, do not do automatic format detection -- only use GFF.

    force_dialect_check : bool
        If True, the dialect will be checkef for every feature (instead of just
        `checklines` features).  This can be slow, but may be necessary for
        inconsistently-formatted input files.

    from_string : bool
        If True, then treat `data` as actual data (rather than the path to
        a file).

    keep_order : bool

        If True, all features returned from this instance will have the
        order of their attributes maintained.  This can be turned on or off
        database-wide by setting the `keep_order` attribute or with this
        kwarg, or on a feature-by-feature basis by setting the `keep_order`
        attribute of an individual feature.

        Note that a single order of attributes will be used for all features.
        Specifically, the order will be determined by the order of attribute
        keys in the first `checklines` of the input data. See
        helpers._choose_dialect for more information on this.

        Default is False, since this includes a sorting step that can get
        time-consuming for many features.

    infer_gene_extent : bool
        DEPRECATED in version 0.8.4. See `disable_infer_transcripts` and
        `disable_infer_genes` for more granular control.

    disable_infer_transcripts, disable_infer_genes : bool
        Only used for GTF files. By default -- and according to the GTF spec --
        we assume that there are no transcript or gene features in the file.
        gffutils then infers the extent of each transcript based on its
        constituent exons and infers the extent of each gene bases on its
        constituent transcripts.

        This default behavior is problematic if the input file already contains
        transcript or gene features (like recent GENCODE GTF files for human),
        since 1) the work to infer extents is unnecessary, and 2)
        trying to insert an inferred feature back into the database triggers
        gffutils' feature-merging routines, which can get time consuming.

        The solution is to use `disable_infer_transcripts=True` if your GTF
        already has transcripts in it, and/or `disable_infer_genes=True` if it
        already has genes in it. This can result in dramatic (100x) speedup.

        Prior to version 0.8.4, setting `infer_gene_extents=False` would
        disable both transcript and gene inference simultaneously. As of
        version 0.8.4, these argument allow more granular control.

    force_merge_fields : list
        If merge_strategy="merge", then features will only be merged if their
        non-attribute values are identical (same chrom, source, start, stop,
        score, strand, phase).  Using `force_merge_fields`, you can override
        this behavior to allow merges even when fields are different.  This
        list can contain one or more of ['seqid', 'source', 'featuretype',
        'score', 'strand', 'frame'].  The resulting merged fields will be
        strings of comma-separated values.  Note that 'start' and 'end' are not
        available, since these fields need to be integers.

    text_factory : callable
        Text factory to use for the sqlite3 database.  See
        https://docs.python.org/2/library/                sqlite3.html#sqlite3.Connection.text_factory
        for details. The default sqlite3.OptimizedUnicode will return Unicode
        objects only for non-ASCII data, and bytestrings otherwise.

    pragmas : dict
        Dictionary of pragmas used when creating the sqlite3 database. See
        http://www.sqlite.org/pragma.html for a list of available pragmas.  The
        defaults are stored in constants.default_pragmas, which can be used as
        a template for supplying a custom dictionary.

    sort_attribute_values : bool
        All features returned from the database will have their attribute
        values sorted.  Typically this is only useful for testing, since this
        can get time-consuming for large numbers of features.

    _keep_tempfiles : bool or string
        False by default to clean up intermediate tempfiles created during GTF
        import.  If True, then keep these tempfile for testing or debugging.
        If string, then keep the tempfile for testing, but also use the string
        as the suffix fo the tempfile. This can be useful for testing in
        parallel environments.

    Returns
    -------
    New :class:`FeatureDB` object.
    c                 3   s   | ]}| | fV  qd S r   r   rk   Z_localsr   r   r     s     zcreate_db.<locals>.<genexpr>Nr   r*   r   r   ZfmtZgff3rd   )r2   Zgtfr   r   )r   r   )r   r   r   r2   r   z:memory:)
keep_orderr'   sort_attribute_valuesr;   )localsr   rU   r   Z_iterator_kwargsr	   r>   r   r   Z_iterr*   r   r   r   r   Z	FeatureDBr7   )r   r1   r2   rD   r9   r   r(   r   Zgtf_transcript_keyZgtf_gene_keyZgtf_subfeatureZ	force_gffr   r   r   r;   r&   r'   r   r   r!   rE   r+   r,   r   r?   clsZ
add_kwargsre   Zdbr   r   r   	create_db  sX     ]




r   )&r{   r$   r@   r   r   r-   r6   r4   textwrapr   Zgffutilsr   r   r   r   r   r   r	   rI   Z	Formatter	formatterZ	getLoggerr   r:   ZStreamHandlerZchrH   rJ   ZsetFormatterZ
addHandlerr   objectr   r   r   r   r   r   r   r   r   r   <module>   sj   



   w   I           