
    &h                     x    d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ  G d d	      Z G d
 d      Zy)a'  Load biopython objects into a BioSQL database for persistent storage.

This code makes it possible to store biopython objects in a relational
database and then retrieve them back. You shouldn't use any of the
classes in this module directly. Rather, call the load() method on
a database object.
    )gmtime)strftime)Entrez)UndefinedSequenceError)UnknownPosition)crc64c                       e Zd ZdZddZd Zd dZd!dZd Zd Z	d	 Z
d
 Z	 d"dZd Zd Zd Zd Zd Zd Zd Zd Z	 d#dZd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zy)$DatabaseLoaderz=Object used to load SeqRecord objects into a BioSQL database.c                 .    || _         || _        || _        y)a  Initialize with connection information for the database.

        Creating a DatabaseLoader object is normally handled via the
        BioSeqDatabase DBServer object, for example::

            from BioSQL import BioSeqDatabase
            server = BioSeqDatabase.open_database(driver="MySQLdb",
                                                  user="gbrowse",
                                                  passwd="biosql",
                                                  host="localhost",
                                                  db="test_biosql")
            try:
                db = server["test"]
            except KeyError:
                db = server.new_database("test",
                description="For testing GBrowse")

        N)adaptordbidfetch_NCBI_taxonomy)selfr   r   r   s       \/mounts/lovelace/software/anaconda3/envs/py312/lib/python3.12/site-packages/BioSQL/Loader.py__init__zDatabaseLoader.__init__$   s    & 	#6     c           
          | j                  |      }| j                  ||       | j                  ||       | j                  ||       | j	                  ||       |j
                  j                  dd      }t        |t        t        t        |                        D ]  \  }}| j                  |||        | j                  ||       t        t        |j                              D ]$  }|j                  |   }| j                  |||       & y)z-Load a Biopython SeqRecord into the database.
references N)_load_bioentry_table_load_bioentry_date_load_biosequence_load_comment_load_dbxrefsannotationsgetziplistrangelen_load_reference_load_annotationsfeatures_load_seqfeature)r   recordbioentry_idr   	referencerankseq_feature_numseq_features           r   load_seqrecordzDatabaseLoader.load_seqrecord;   s    //7  5v{36;/6;/''++L"=
":tE#j/4J/KL 	?OIt  D+>	?v{3$S%9: 	MO ///:K!!+L	Mr   Nc                     | j                   j                  d|f      }|r|d   S | j                   j                  d||f       | j                   j                  d      S )ax  Return identifier for the named ontology (PRIVATE).

        This looks through the onotology table for a the given entry name.
        If it is not found, a row is added for this ontology (using the
        definition if supplied).  In either case, the id corresponding to
        the provided name is returned, so that you can reference it in
        another table.
        z0SELECT ontology_id FROM ontology WHERE name = %sr   z6INSERT INTO ontology(name, definition) VALUES (%s, %s)ontology)r   execute_and_fetch_col0executelast_id)r   name
definitionoidss       r   _get_ontology_idzDatabaseLoader._get_ontology_idJ   s`     ||22>
 7NDtZFX	
 ||##J//r   c                 X   d}|g}|r|dz  }|j                  |       | j                  j                  ||      }t        |      dkD  rt	        d| d|      t        |      dk(  r|d   d   S d}| j                  j                  |||||f       | j                  j                  d      S )	a  Get the id that corresponds to a term (PRIVATE).

        This looks through the term table for a the given term. If it
        is not found, a new id corresponding to this term is created.
        In either case, the id corresponding to that term is returned, so
        that you can reference it in another table.

        The ontology_id should be used to disambiguate the term.
        z(SELECT term_id FROM term WHERE name = %sz AND ontology_id = %s   zMultiple term ids for z: r   zTINSERT INTO term (name, definition, identifier, ontology_id) VALUES (%s, %s, %s, %s)term)appendr   execute_and_fetchallr    
ValueErrorr/   r0   )r   r1   ontology_idr2   
identifiersqlfields
id_resultss           r   _get_term_idzDatabaseLoader._get_term_id]   s     9**CMM+&\\66sFC
z?Q5dV2j^LMM_!a=##+ 
 LL  tZ[&QR<<''//r   c                 v    | j                   j                  d|||f       | j                   j                  d      S )z,Insert a dbxref and return its id (PRIVATE).zBINSERT INTO dbxref(dbname, accession, version) VALUES (%s, %s, %s)dbxref)r   r/   r0   )r   dbname	accessionversions       r   _add_dbxrefzDatabaseLoader._add_dbxref|   s8    PY(	
 ||##H--r   c           	         d}d|j                   v rZt        |j                   d   t              r.t        |j                   d         dk(  r"|j                   d   d   }n|j                   d   }|sj|j                  D ][  }|j
                  dk(  rFt        |di       }d|v r5|j                  d   D ]#  }|j                  d      st        |d	d       } n |s[ n 	 |j                   d
   dd }	 |j                   d   dd }|r| j                  |||      S |s|sy|r$| j                  j                  d|f      }|r|d   S |rK| j                  j                  d|f      }t        |      dkD  rt        dt        |      |fz        |r|d   S g }	|j                   j                  dg       D ]  }
|	j!                  dd|
g        |	rd|	d   d<   |	j!                  dd|j                   d
   g       d|j                   v r!|	j!                  dd|j                   d   g       d|j                   v r!|	j!                  dd|j                   d   g       ||	d   d<   | j                  j#                  d      d   }|sd}|dz  }| j                  j#                  d      d   }|sd}|dt        |	      z  z   dz
  }d}|	D ]t  }| j                  j%                  d||d   |d   ||f       | j                  j'                  d      }| j                  j%                  d||d   dd f       |dz  }|dz  }|}v |r| j                  j%                  d|f       S # t        $ r d}Y pw xY w# t        $ r d}Y ow xY w)a!  Get the taxon id for this record (PRIVATE).

        Arguments:
         - record - a SeqRecord object

        This searches the taxon/taxon_name tables using the
        NCBI taxon ID, scientific name and common name to find
        the matching taxon table entry's id.

        If the species isn't in the taxon table, and we have at
        least the NCBI taxon ID, scientific name or common name,
        at least a minimal stub entry is created in the table.

        Returns the taxon id (database key for the taxon table,
        not an NCBI taxon ID), or None if the taxonomy information
        is missing.

        See also the BioSQL script load_ncbi_taxonomy.pl which
        will populate and update the taxon/taxon_name tables
        with the latest information from the NCBI.
        N
ncbi_taxidr6   r   source
qualifiersdb_xrefztaxon:   organism   zRSELECT taxon_id FROM taxon_name WHERE name_class = 'scientific name' AND name = %sz8SELECT DISTINCT taxon_id FROM taxon_name WHERE name = %szTaxa: %d species have name %rtaxonomygenusspecies
subspeciesvariantvarietas!SELECT MAX(left_value) FROM taxonz"SELECT MAX(right_value) FROM taxon   zqINSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank, left_value, right_value) VALUES (%s, %s, %s, %s, %s)taxonzTINSERT INTO taxon_name(taxon_id, name, name_class)VALUES (%s, %s, 'scientific name')zPINSERT INTO taxon_name(taxon_id, name, name_class)VALUES (%s, %s, 'common name'))r   
isinstancer   r    r#   typegetattrrJ   
startswithintKeyError _get_taxon_id_from_ncbi_taxon_idr   r.   r:   r   r8   execute_oner/   r0   )r   r%   ncbi_taxon_idfqualsrK   scientific_namecommon_nametaxalineagec
left_valueright_start_valueright_valueparent_taxon_idrX   taxon_ids                    r   _get_taxon_idzDatabaseLoader._get_taxon_id   s   . 6---&,,\:DAv)),78A=$*$6$6|$DQ$GM & 2 2< @__ 	66X%#A|R8E E)'(||I'> &G&11(;03GABK0@ %& !		#$00<TcBO	 ,,X6t<K  88  ?  <<66F "D
 Aw <<66JD 4y1} 3s4y+6NN  Aw ##''
B7 	,ANND$?+	,$GBKNi););J)GHI6---NND,0B0B<0PQR***NND*f.@.@.KLM&A\\--.QRSTU
Ja
 !LL440

 ! !'!c'l*::Q> 	'ELL  / !%(E!Hj+N	 ||++G4HLL  558DS>* !OJ1K&O#	'$ LL  1;' o  	#"O	#  	K	s$   M +M MMM-,M-c                     d dj                  fd|D              j                         }||j                         k7  rt        d| d      |S )aj  Map Entrez name terms to those used in taxdump (PRIVATE).

        We need to make this conversion to match the taxon_name.name_class
        values used by the BioSQL load_ncbi_taxonomy.pl script.

        e.g.::

            "ScientificName" -> "scientific name",
            "EquivalentName" -> "equivalent name",
            "Synonym" -> "synonym",

        c                 L    | j                         rd| j                         z   S | S )z$Add a space before a capital letter. )isupperlower)letters    r   	add_spacez1DatabaseLoader._fix_name_class.<locals>.add_spaceA  s"    ~~V\\^++r    c              3   .   K   | ]  } |        y wNr   ).0rt   ru   s     r   	<genexpr>z1DatabaseLoader._fix_name_class.<locals>.<genexpr>H  s     Ev6*Es   z!Expected processed entrez_name, 'z"' to only have lower case letters.)joinstriprs   r:   )r   entrez_nameanswerru   s      @r   _fix_name_classzDatabaseLoader._fix_name_class+  sV    ,	 EEEKKMV\\^#3F8;]^  r   c                    |sy| j                   j                  d||f      }g }g }|D ]L  }|d   }|d   }||k\  r|dz  }||kD  r|dz  }|j                  ||d   f       |j                  ||d   f       N t        |d d      }t        |d	 d      }| j                   j	                  d
|       | j                   j	                  d|       y)z>Update the left and right taxon values in the table (PRIVATE).Nz^SELECT left_value, right_value, taxon_id FROM taxon WHERE right_value >= %s or left_value > %sr6   r   rW   c                     | d   S Nr   r   xs    r   <lambda>z@DatabaseLoader._update_left_right_taxon_values.<locals>.<lambda>q  s
    ad r   T)keyreversec                     | d   S r   r   r   s    r   r   z@DatabaseLoader._update_left_right_taxon_values.<locals>.<lambda>r  s
    AaD r   z4UPDATE taxon SET left_value = %s WHERE taxon_id = %sz5UPDATE taxon SET right_value = %s WHERE taxon_id = %s)r   r9   r8   sortedexecutemany)r   ri   rows
right_rows	left_rowsrow	new_rightnew_lefts           r   _update_left_right_taxon_valuesz.DatabaseLoader._update_left_right_taxon_valuesO  s     ||009$
 
	 		1CAI1vHJ&Q	*$Ay#a&12hA/0		1 JNDI
9.$G	  BI	
 	  CZ	
r   c           
         |st        d      | j                  j                  dt        |      f      }|r|d   S d}d}d}d}d}	d}
d}d}g }|r|j	                  d|f       |r|j	                  d|f       | j
                  rEt        j                  d|d	
      }t        j                  |      }t        |      dk(  r
|d   d   t        |      k7  rt        d| d|d   d          | j                  |d   d         \  }}	}
|
}|
dz   }t        |d   d         }t        |d   d   d         }t        |d   d   d         }dt        |d   d         fg}	 |d   d   j                         D ]T  \  }}| j                  |      }t        |t              s|g}|D ]&  }t        |t              s|j	                  ||f       ( V n	 | j#                  |       | j                  j%                  d|||||||f       | j                  j'                  d      }|D ]'  \  }}| j                  j%                  d||dd |f       ) |S # t         $ r Y w xY w)a  Get the taxon id for record from NCBI taxon ID (PRIVATE).

        Arguments:
         - ncbi_taxon_id - string containing an NCBI taxon id
         - scientific_name - string, used if a stub entry is recorded
         - common_name - string, used if a stub entry is recorded

        This searches the taxon table using ONLY the NCBI taxon ID
        to find the matching taxon table entry's ID (database key).

        If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
        flag is true, Biopython will attempt to go online using Bio.Entrez
        to fetch the official NCBI lineage, recursing up the tree until an
        existing entry is found in the database or the full lineage has been
        fetched.

        Otherwise the NCBI taxon ID, scientific name and common name are
        recorded as a minimal stub entry in the taxon and taxon_name tables.
        Any partial information about the lineage from the SeqRecord is NOT
        recorded.  This should mean that (re)running the BioSQL script
        load_ncbi_taxonomy.pl can fill in the taxonomy lineage.

        Returns the taxon id (database key for the taxon table, not
        an NCBI taxon ID).
        z-Expected a non-empty value for ncbi_taxon_id.z3SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %sr   NrR   zscientific namezcommon namerO   XML)dbidretmoder6   TaxIdz.ncbi_taxon_id different from parent taxon id. z versus 	LineageExRankGeneticCodeGCIdMitoGeneticCodeMGCIdScientificName
OtherNameszINSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank, genetic_code, mito_genetic_code, left_value, right_value) VALUES (%s, %s, %s, %s, %s, %s, %s)rX   zFINSERT INTO taxon_name(taxon_id, name, name_class) VALUES (%s, %s, %s)rN   )r:   r   r.   r]   r8   r   r   efetchreadr    str_get_taxon_id_from_ncbi_lineageitemsr   rY   r   r^   r   r/   r0   )r   ra   rd   re   rm   rl   r(   genetic_codemito_genetic_codeparent_left_valueparent_right_valueri   rk   species_nameshandletaxonomic_record
name_classnamesr1   s                      r   r_   z/DatabaseLoader._get_taxon_id_from_ncbi_taxon_id{  s    8 LMM<<66ACDVCX
 A;
   !
  "3_!EF  -!=>##]]j]ERF%{{62#$)#A&w/3}3EE$HW_`pqr`st{`|_}~  88$Q'4	#%&
 0
014+A.v67"#3A#6}#Ef#MN$'(8(;<M(Nw(W$X! ',<Q,?@P,Q(RS!-=a-@-N-T-T-V 
I)
E%)%9%9*%E
)%6 &+GE$) ID  *$4 - 4 4j$5G H	I
I   	,,Z83  !		
 <<''0 !. 	JLL  '4:z2	 S    s   AI 1I 	IIc                    t        |d   d         }d}d}d}d}| j                  j                  d|z        }|r*t        |      dk7  rt	        dt        |             |d   S t        |      dkD  r=| j                  |dd       \  }}}|}|dz   }t        |t               s7t	        d|       d}| j                  j                  d	      d   }|sd}|dz   }| j                  |       t        |d   j                  d
            }	| j                  j                  d|||	||f       | j                  j                  d      }
|d   j                  d      }|r!| j                  j                  d|
|dd f       |
||fS )a|  Recursive method to get taxon ID from NCBI lineage (PRIVATE).

        Arguments:
         - taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez

        First dictionary in list is the taxonomy root, highest would be
        the species. Each dictionary includes:

        - TaxID (string, NCBI taxon id)
        - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
        - ScientificName (string)

        (and that is all at the time of writing)

        This method will record all the lineage given, returning the taxon id
        (database key, not NCBI taxon id) of the final entry (the species).
        rQ   r   NzJSELECT taxon_id, left_value, right_value FROM taxon WHERE ncbi_taxon_id=%sr6   zExpected 1 response, got r   z+Expected parent_taxon_id to be an int, got rV   r   zqINSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, left_value, right_value) VALUES (%s, %s, %s, %s, %s)rX   r   zUINSERT INTO taxon_name(taxon_id, name, name_class) VALUES (%s, %s, 'scientific name')rN   )r]   r   r9   r    r:   r   rY   r`   r   r   r   r/   r0   )r   taxonomic_lineagera   ri   rk   r   r   r   rl   r(   rm   rd   s               r   r   z.DatabaseLoader._get_taxon_id_from_ncbi_lineage  s   $ -b1':;
 !||00&(56
  4yA~ #<SYK!HII7N  !A% 445Fs5KL	!"+J,q0Kos3 A/ARS  #O112UVJ 
$q.K,,Z8 $R(,,V45COT:{K	
 <<''0 ,B/334DELL  5?4C01
 [00r   c                    |j                   j                  d      dk(  r+|j                   j                  d      \  }}	 t        |      }n|j                   }d}d|j
                  v r>t        |j
                  d   t              r!|j
                  d   r|j
                  d   d   }| j                  |      }d|j
                  v r|j
                  d   }n|j                   }t        |dd      }|j
                  j                  d      }d	}| j                  j                  || j                  ||j                  |||||f       | j                  j                  d
      S # t        $ r |j                   }d}Y w xY w)zFill the bioentry table with sequence information (PRIVATE).

        Arguments:
         - record - SeqRecord object to add to the database.

        .r6   r   
accessionsgidescriptionNdata_file_divisiona7  
        INSERT INTO bioentry (
         biodatabase_id,
         taxon_id,
         name,
         accession,
         identifier,
         division,
         description,
         version)
        VALUES (
         %s,
         %s,
         %s,
         %s,
         %s,
         %s,
         %s,
         %s)bioentry)r   countsplitr]   r:   r   rY   r   rn   r[   r   r   r/   r   r1   r0   )	r   r%   rD   rE   rm   r<   r   divisionr=   s	            r   r   z#DatabaseLoader._load_bioentry_table[  sr    99??31$!'!5Iwg,
 		IG F...6--l;TB""<0 **<8;I
 %%f-6%%%++D1JJ fmT:%%))*>?* 					
 ||##J//A  "II	s   E E76E7c                 .   |j                   j                  dt        dt                     j	                               }t        |t              r|d   }| j                  d      }| j                  d|      }d}| j                  j                  ||||f       y)zAdd the effective date of the entry into the database (PRIVATE).

        record - a SeqRecord object with an annotated date
        bioentry_id - corresponding database identifier
        datez%d-%b-%Yr   Annotation Tagsdate_changedzaINSERT INTO bioentry_qualifier_value (bioentry_id, term_id, value, "rank") VALUES (%s, %s, %s, 1)N)r   r   r   r   upperrY   r   r4   r@   r   r/   )r   r%   r&   r   annotation_tags_iddate_idr=   s          r   r   z"DatabaseLoader._load_bioentry_date  s     !!%%fhz68.L.R.R.TUdD!7D!223DE##N4FG& 	
 	S;">?r   c                 <   |j                   y|j                  j                  dd      }d|v rd}nd|v rd}n	d|v rd}nd	}	 t        |j                         }d
}| j
                  j                  ||t        |j                         ||f       y# t        $ r d}Y Cw xY w)zRecord SeqRecord's sequence and alphabet in DB (PRIVATE).

        Arguments:
         - record - a SeqRecord object with a seq property
         - bioentry_id - corresponding database identifier

        Nmolecule_typerv   DNAdnaRNArnaproteinunknownz`INSERT INTO biosequence (bioentry_id, version, length, seq, alphabet) VALUES (%s, 0, %s, %s, %s))seqr   r   r   r   r   r/   r    )r   r%   r&   r   alphabetseq_strr=   s          r   r   z DatabaseLoader._load_biosequence  s     :: **..CM!Hm#H-' H H	&**oG
) 	
 	S;FJJ("ST & 	G	s   B BBc                     |j                   j                  d      }|syt        |t              s|g}t	        |      D ];  \  }}|j                  dd      }d}| j                  j                  ||||dz   f       = y)zRecord a SeqRecord's annotated comment in the database (PRIVATE).

        Arguments:
         - record - a SeqRecord object with an annotated comment
         - bioentry_id - corresponding database identifier

        commentN
rq   zKINSERT INTO comment (bioentry_id, comment_text, "rank") VALUES (%s, %s, %s)r6   )r   r   rY   r   	enumeratereplacer   r/   )r   r%   r&   commentsindexr   r=   s          r   r   zDatabaseLoader._load_comment  s     %%)))4(D) zH'1 	INE7oodC0G'  LL  {GUQY&GH	Ir   c           
         d}d}| j                  d      }|j                  j                         D ]  \  }}|dv r| j                  ||      }t	        |t
        t        f      rOd}	|D ]G  }
t	        |
t        t        f      r/|	dz  }	| j                  j                  |||t        |
      |	f       HI t	        |t        t        f      r)| j                  j                  |||t        |      f        y)	a  Record a SeqRecord's misc annotations in the database (PRIVATE).

        The annotation strings are recorded in the bioentry_qualifier_value
        table, except for special cases like the reference, comment and
        taxonomy which are handled with their own tables.

        Arguments:
         - record - a SeqRecord object with an annotations dictionary
         - bioentry_id - corresponding database identifier

        zUINSERT INTO bioentry_qualifier_value(bioentry_id, term_id, value) VALUES (%s, %s, %s)zaINSERT INTO bioentry_qualifier_value(bioentry_id, term_id, value, "rank") VALUES (%s, %s, %s, %s)r   )r   r   r   rH   r   r;   r   r6   N)r4   r   r   r@   rY   r   tupler   r]   r   r/   )r   r%   r&   mono_sqlmany_sqltag_ontology_idr   valueterm_idr(   entrys              r   r"   z DatabaseLoader._load_annotations  s    # 	' 	
 //0AB ,,224 	JCVV'''IG%$/" E!%#s4	,,${GSZ&N  EC:.$$XWc%j/QR)	r   c           	      |   d}|j                   r'| j                  j                  d|j                   f      }|s3|j                  r'| j                  j                  d|j                  f      }|svg }|j                  |j
                  |j                  fD ]  }|j                  |xs d        t        dj                  |            }| j                  j                  d|f      }|s|j                   r| j                  d|j                   d      }n,|j                  r| j                  d	|j                  d      }nd}|j                  xs d}	|j
                  xs d}
|j                  xs d}| j                  j                  d
|||
|	f       | j                  j                  d      }n|d   }|j                  rZdt        t        |j                  d   j                               z   }t        t        |j                  d   j"                              }nd}d}d}| j                  j                  ||||||dz   f       y)zRecord SeqRecord's annotated references in the database (PRIVATE).

        Arguments:
         - record - a SeqRecord object with annotated references
         - bioentry_id - corresponding database identifier

        NzlSELECT reference_id FROM reference JOIN dbxref USING (dbxref_id) WHERE dbname = 'MEDLINE' AND accession = %szkSELECT reference_id FROM reference JOIN dbxref USING (dbxref_id) WHERE dbname = 'PUBMED' AND accession = %sz<undef>rv   z1SELECT reference_id FROM reference WHERE crc = %sMEDLINEr   PUBMEDz\INSERT INTO reference (dbxref_id, location, title, authors, crc) VALUES (%s, %s, %s, %s, %s)r'   r6   zrINSERT INTO bioentry_reference (bioentry_id, reference_id, start_pos, end_pos, "rank") VALUES (%s, %s, %s, %s, %s))
medline_idr   r.   	pubmed_idauthorstitlejournalr8   r   r{   rF   r/   r0   locationr]   r   startend)r   r'   r(   r&   refssrb   crc	dbxref_idr   r   r   reference_idr   r   r=   s                   r   r!   zDatabaseLoader._load_reference*  s)    <<66? %%'	D 	++<<66> $$&	D A&&	9J9JJ )i()
#C<<66CcVD ## ,,Y	8L8LaP	$$ ,,Xy7J7JAN	 	''/4GOO+tE  ''-2GLL  / GUGS9	  <<//<L7LC	 2 21 5 ; ;<==Ec),,Q/3345CECG 	 	S;eS$QR("STr   c                 @   	 |j                   d   }t        |t              r|d   }| j                  |j                  |||      }| j                  ||       | j                  |j                   |       y# t
        $ r  | j                  |j                  ||      }Y Ww xY w)z8Load a biopython SeqFeature into the database (PRIVATE).rI   r   )rI   N)rJ   rY   r   _load_seqfeature_basicrZ   r^   _load_seqfeature_locations_load_seqfeature_qualifiers)r   featurefeature_rankr&   rI   seqfeature_ids         r   r$   zDatabaseLoader._load_seqfeaturel  s    
	''1F&$' 77lK 8 M 	''?((););]K  	 77lKM	s   AA4 4&BBc                    | j                  d      }| j                  ||      }| j                  d      }| j                  ||      }d}	| j                  j                  |	||||dz   f       | j                  j	                  d      S )zLoad the first tables of a seqfeature and returns the id (PRIVATE).

        This loads the "key" of the seqfeature (ie. CDS, gene) and
        the basic seqfeature table itself.
        zSeqFeature Keysr   zSeqFeature SourceszbINSERT INTO seqfeature (bioentry_id, type_term_id, source_term_id, "rank") VALUES (%s, %s, %s, %s)r6   
seqfeature)r4   r@   r   r/   r0   )
r   feature_typer   r&   rI   r;   seqfeature_key_idsource_cat_idsource_term_idr=   s
             r   r   z%DatabaseLoader._load_seqfeature_basic  s     ++,=> --l-T--.BC**6}*M> 	 	+0.,QRBRS	
 ||##L11r   c                    	 |j                   j                  dk7  r)ddl}ddlm} |j                  d|j                  z  |       |j                   j                  }|r&|D ch c]  }|j                   c}dhk(  r|ddd   }t        |      D ]  \  }}| j                  ||dz   |        y# t        $ r Y sw xY wc c}w )a4  Load all of the locations for a SeqFeature into tables (PRIVATE).

        This adds the locations related to the SeqFeature into the
        seqfeature_location table. Fuzzies are not handled right now.
        For a simple location, ie (1..2), we have a single table row
        with seq_start = 1, seq_end = 2, location_rank = 1.

        For split locations, ie (1..2, 3..4, 5..6) we would have three
        row tables with::

            start = 1, end = 2, rank = 1
            start = 3, end = 4, rank = 2
            start = 5, end = 6, rank = 3

        r{   r   N)BiopythonWarningz-%s location operators are not fully supportedrQ   r6   )r   operatorwarningsBior   warnlocation_operatorAttributeErrorpartsstrandr   _insert_location)r   r   r   r   r   r  locr(   s           r   r   z)DatabaseLoader._load_seqfeature_locations  s    $	((F2  0C//0$   &&E2Scjj2rd:$B$KE"5) 	@ID#!!#tax?	@  		 3s   AB,  B;,	B87B8c           
         	 t        |j                        dz   }	 t        |j
                        }|j                  xs d}d}|j                  r+| j                  |j                  xs d|j                        }nd}d}	| j                  j                  |	|||||||f       y# t        $ r! t        |j                  t              rd}n Y w xY w# t        $ r! t        |j
                  t              rd}n Y w xY w)zAdd SeqFeature location to seqfeature_location table (PRIVATE).

        TODO - Add location operator to location_qualifier_value?
        r6   Nr   rv   zINSERT INTO location (seqfeature_id, dbxref_id, term_id,start_pos, end_pos, strand, "rank") VALUES (%s, %s, %s, %s, %s, %s, %s))r]   r   	TypeErrorrY   r   r   r  ref_get_dbxref_idref_dbr   r/   )
r   r   r(   r   r   r   r  loc_term_idr   r=   s
             r   r  zDatabaseLoader._insert_location  s    	'!+E	hll#C %A <<
 ++HOO,Ar8<<PII2 	
 	-KVTR	
	[  	(../: 	  	(,,8 	s"   B! C !'C
C'C87C8c           	      F   | j                  d      }|D ]  }|dk7  ro| j                  ||      }||   }t        |t              s|g}t	        t        |            D ],  }||   }d}	| j                  j                  |	|||dz   |f       . w| j                  ||   |        y)zInsert feature's (key, value) pair qualifiers (PRIVATE).

        Qualifiers should be a dictionary of the form::

            {key : [value1, value2]}

        r   rK   r   zgINSERT INTO seqfeature_qualifier_value  (seqfeature_id, term_id, "rank", value) VALUES (%s, %s, %s, %s)r6   N)	r4   r@   rY   r   r   r    r   r/   _load_seqfeature_dbxref)
r   rJ   r   r   qualifier_keyqualifier_key_identriesqual_value_rankqualifier_valuer=   s
             r   r   z*DatabaseLoader._load_seqfeature_qualifiers  s     //0AB' $	WM
 	)#'#4#4! $5 $  %]3!'40  'iG',S\': O&-o&>O, 
 LL((),+a/+	* ,,Z-FVI$	Wr   c                 :   t        |      D ]p  \  }}	 |j                  dd      j                  dd      j                  d      }|d   }|dd }|D ]*  }| j                  ||      }	| j                  ||	|dz          , r y# t        $ r t	        d| d	      dw xY w)
a  Add SeqFeature's DB cross-references to the database (PRIVATE).

        Arguments:
         - dbxrefs - List, dbxref data from the source file in the
           format <database>:<accession>
         - seqfeature_id - Int, the identifier for the seqfeature in the
           seqfeature table

        Insert dbxref qualifier data for a seqfeature into the
        seqfeature_dbxref and, if required, dbxref tables.
        The dbxref_id qualifier/value sets go into the dbxref table
        as dbname, accession, version tuples, with dbxref.dbxref_id
        being automatically assigned, and into the seqfeature_dbxref
        table as seqfeature_id, dbxref_id, and rank tuples.
        rq   rv   r   :r   r6   NzParsing of db_xref failed: '')r   r   r   	Exceptionr:   r  _get_seqfeature_dbxref)
r   dbxrefsr   r(   r   dbxref_datar   r   rD   r   s
             r   r  z&DatabaseLoader._load_seqfeature_dbxref=  s    ( %W- 	PKD%T#mmC4<<T2FLLSQ ^(_

 ( P	 //I>	++M9dQhO	P	P  T #?wa!HItSTs   ;B  Bc                 v    d}| j                   j                  |||f      }|r|d   S | j                  ||d      S )a  Get DB cross-reference for accession (PRIVATE).

        Arguments:
         - db - String, the name of the external database containing
           the accession number
         - accession - String, the accession of the dbxref data

        Finds and returns the dbxref_id for the passed data.  The method
        attempts to find an existing record first, and inserts the data
        if there is no record.
        zASELECT dbxref_id FROM dbxref WHERE dbname = %s AND accession = %sr   )r   r.   rF   )r   r   rD   r=   r   s        r   r  zDatabaseLoader._get_dbxref_idb  sH     RLL77b)_M	 Q<Iq11r   c                 p    d}| j                   j                  |||f      }|r|S | j                  |||      S )zGet DB cross-reference, creating it if needed (PRIVATE).

        Check for a pre-existing seqfeature_dbxref entry with the passed
        seqfeature_id and dbxref_id.  If one does not exist, insert new
        data.
        zbSELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref WHERE seqfeature_id = %s AND dbxref_id = %s)r   r.   _add_seqfeature_dbxref)r   r   r   r(   r=   results         r   r  z%DatabaseLoader._get_seqfeature_dbxrefw  sH    : 	 44S=):TU M**=)TJJr   c                 L    d}| j                   j                  ||||f       ||fS )zAdd DB cross-reference (PRIVATE).

        Insert a seqfeature_dbxref row and return the seqfeature_id and
        dbxref_id
        zSINSERT INTO seqfeature_dbxref (seqfeature_id, dbxref_id, "rank") VALUES(%s, %s, %s)r   r/   )r   r   r   r(   r=   s        r   r  z%DatabaseLoader._add_seqfeature_dbxref  s4     	
 	S=)T"BCy))r   c                 v   t        |j                        D ]  \  }}|j                  d      }|dk7  rt        d      	 |j	                  dd      \  }}|j                         }|j                         }| j                  ||      }| j                  |||dz           y# t        $ r t        d| d      dw xY w)	zrLoad any sequence level cross references into the database (PRIVATE).

        See table bioentry_dbxref.
        r   r   z;Expected a single line in value, got {newline_escape_count}r  r6   z!Parsing of dbxrefs list failed: 'r  N)	r   r  r   r:   r   r|   r  r  _get_bioentry_dbxref)	r   r%   r&   r(   r   newline_escape_countr   rD   r   s	            r   r   zDatabaseLoader._load_dbxrefs  s    
 %V^^4 	HKD% $);;t#4 #q( Q Y %C 3IXXZ%OO-	 ++B	:I%%k9dQhG-	H"  Y #DUG1!MNTXXYs   5BB8c                 p    d}| j                   j                  |||f      }|r|S | j                  |||      S )zGet pre-existing db-xref, or create and return it (PRIVATE).

        Check for a pre-existing bioentry_dbxref entry with the passed
        seqfeature_id and dbxref_id.  If one does not exist, insert new
        data
        z\SELECT bioentry_id, dbxref_id FROM bioentry_dbxref WHERE bioentry_id = %s AND dbxref_id = %s)r   r.   _add_bioentry_dbxref)r   r&   r   r(   r=   r  s         r   r   z#DatabaseLoader._get_bioentry_dbxref  sH    8 	 44S;	:RS M((iFFr   c                 L    d}| j                   j                  ||||f       ||fS )zlInsert a bioentry_dbxref row (PRIVATE).

        Returns the seqfeature_id and dbxref_id (PRIVATE).
        zNINSERT INTO bioentry_dbxref (bioentry_id,dbxref_id,"rank") VALUES (%s, %s, %s)r  )r   r&   r   r(   r=   s        r   r#  z#DatabaseLoader._add_bioentry_dbxref  s4     	
 	S;	4"@AY''r   )Frx   )NNN)NN)zEMBL/GenBank/SwissProt) __name__
__module____qualname____doc__r   r+   r4   r@   rF   rn   r   r   r_   r   r   r   r   r   r"   r!   r$   r   r   r  r   r  r  r  r  r   r   r#  r   r   r   r
   r
   !   s    G7.M0&0>.eN"H*
Z @DJXR1hN0`@(!UFI2+^@UDL. ?W2,(@TJX-W^#PJ2*K&*H:G&(r   r
   c                       e Zd ZdZd Zd Zy)DatabaseRemovera  Complement the Loader functionality by fully removing a database.

    This probably isn't really useful for normal purposes, since you
    can just do a::

        DROP DATABASE db_name

    and then recreate the database. But, it's really useful for testing
    purposes.
    c                      || _         || _        y)z5Initialize with a database id and adaptor connection.N)r   r   )r   r   r   s      r   r   zDatabaseRemover.__init__  s    	r   c                     d}| j                   j                  || j                  f       d}| j                   j                  || j                  f       y)z3Remove everything related to the given database id.z.DELETE FROM bioentry WHERE biodatabase_id = %sz1DELETE FROM biodatabase WHERE biodatabase_id = %sN)r   r/   r   )r   r=   s     r   removezDatabaseRemover.remove  s@    >S499,/AS499,/r   N)r%  r&  r'  r(  r   r-  r   r   r   r*  r*    s    	
0r   r*  N)r(  timer   r   r   r   Bio.Seqr   Bio.SeqFeaturer   Bio.SeqUtils.CheckSumr   r
   r*  r   r   r   <module>r2     s7       * * (r( r(j%0 0r   