
    &hCU                         d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
  G d d	e      Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z G d de
      Zy)a  Implementations of Biopython-like Seq objects on top of BioSQL.

This allows retrieval of items stored in a BioSQL database using
a biopython-like SeqRecord and Seq interface.

Note: Currently we do not support recording per-letter-annotations
(like quality scores) in BioSQL.
    )Optional)
SeqFeature)Seq)SequenceDataAbstractBaseClass)_RestrictedDict)	SeqRecordc                   4     e Zd ZdZdZd fd	Zd Zd Z xZS )_BioSQLSequenceDataz9Retrieves sequence data from a BioSQL database (PRIVATE).)
primary_idadaptor_lengthstartc                 Z    || _         || _        || _        || _        t        |           y)aU  Create a new _BioSQLSequenceData object referring to a BioSQL entry.

        You wouldn't normally create a _BioSQLSequenceData object yourself,
        this is done for you when retrieving a DBSeqRecord object from the
        database, which creates a Seq object using a _BioSQLSequenceData
        instance as the data provider.
        N)r   r   r   r   super__init__)selfr   r   r   length	__class__s        \/mounts/lovelace/software/anaconda3/envs/py312/lib/python3.12/site-packages/BioSQL/BioSeq.pyr   z_BioSQLSequenceData.__init__#   s,     %
    c                     | j                   S )z"Return the length of the sequence.)r   r   s    r   __len__z_BioSQLSequenceData.__len__1   s    ||r   c                    t        |t              r;|j                  | j                        \  }}}t	        t        |||            }|dk(  ry|}|dk  r|| j                  z  }|dk  r%t        |      || j                  k\  rt        |      | j                  j                  | j                  | j                  |z   | j                  |z   dz         }t        |      S |dk(  r|dk(  rh|| j                  k(  rY| j                  j                  | j                  | j                  | j                  | j                  z         }|j                  d      S t        | j                  | j                  | j                  |z   |      S | j                  j                  | j                  | j                  |z   | j                  |z         }	|	dd|   j                  d      S )z@Return a subsequence as a bytes or a _BioSQLSequenceData object.r   r      ASCIIN)
isinstancesliceindicesr   lenrange
IndexErrorr   get_subseq_as_stringr   r   ordencoder
   )
r   keyr   endstepsizeicsequencefulls
             r   __getitem__z_BioSQLSequenceData.__getitem__5   s   c5!"{{4<<8E3uUC./Dqy A1uT\\!q5$S/)dll" o%11aa!1CA q6M19zddll2<<<<OOTZZdll1J  w// +OOT\\4::3Et 
 <<44e!3TZZ#5ED $<&&w//r   )r   r   )	__name__
__module____qualname____doc__	__slots__r   r   r.   __classcell__)r   s   @r   r
   r
      s    C=I&0r   r
   c                     | j                  d|f      }|sy t        |      dk7  rt        dt        |       d      |d   \  }t        |      S )Nz5SELECT length FROM biosequence WHERE bioentry_id = %sr   Expected 1 response, got .r   )execute_and_fetchallr    
ValueErrorint)r   r   seqsgiven_lengths       r   _retrieve_seq_lenr=   ^   s[    ''?*D 
4yA~4SYKqABB1gO\|r   c                 @   | j                  d|f      }|sy t        |      dk7  rt        dt        |       d      |d   \  }}}	 t        |      }t        |      }||k7  rt        d| d|       d}~|rt        || d|      }t        |      S t        d |      S # t        $ ru |t        d	| d      | j                  d
|f      }t        |      dk7  rt        dt        |       d      |d   \  }}}|rt        d| d      t        |      }d}~Y w xY w)NzLSELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %sr   r6   r7   r   z''length' differs from sequence length, z, Tz$Expected 'length' to be 'None', got zDSELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %sz*Expected 'seq' to have a falsy value, got F)r   r   r   )r8   r    r9   r:   	TypeErrorr
   r   )	r   r   r;   moltyper<   r   have_seqseqdatas	            r   _retrieve_seqrE   k   so    ''V	D 
4yA~4SYKqABB$(G!G\6V<(\!9,r&R    	":waO4y4''+  CF81MNN++RM
 t9>8T1EFF%)!W"sI#aPQQ\"s   .B A;DDc                     g }| j                  d|f      }|D ]-  \  }}}|r|dk7  r| d| }n|}|j                  | d|        / |S )zBRetrieve the database cross references for the sequence (PRIVATE).z{SELECT dbname, accession, version FROM bioentry_dbxref join dbxref using (dbxref_id) WHERE bioentry_id = %s ORDER BY "rank"0r7   :)r8   append)r   r   _dbxrefsdbxrefsdbname	accessionversionvs           r   _retrieve_dbxrefsrP      sv    H**	 
G '. )"	7w#~+Qwi(AA6(!A3() Or   c                    d}| j                  ||f      }g }|D ]  \  }}}| j                  d|f      }i }	|D ]&  \  }
}|	j                  |
g       j                  |       ( | j                  d|f      }|D ]-  \  }
}|
 d| }|	j                  dg       j                  |       / | j                  d|f      }g }|D ]  \  }}}}|r|dz  }|dk(  rd }|d	vrt        d
|d|      |)|'||k  r"dd l}ddlm} |j                  d|||fz  |       |t        j                         }|t        j                         }|j                  ||||f        | j                  d|f      }i }|D ]&  \  }}}}|r|dk7  r| d| }n|}|dk(  rd }||f||<   ( t        j                  |      }||_
        |	|_        t        |      dk(  rn)t        |      dk(  r|d   \  }}}}t        | |      |_        |j                  |d      \  }}t        j                   ||      |_        ||j"                  _        ||j"                  _        ||j"                  _        ng }|D ]G  }|\  }}}}|j                  |d      \  }}|j                  t        j                   |||||             I |D ch c]  }|j$                   }}t        |      dk(  rd|v r|d d d   }t        j*                  |d      |_        |j                  |        |S c c}w )NzSELECT seqfeature_id, type.name, "rank" FROM seqfeature join term type on (type_term_id = type.term_id) WHERE bioentry_id = %s ORDER BY "rank"zvSELECT name, value FROM seqfeature_qualifier_value  join term using (term_id) WHERE seqfeature_id = %s ORDER BY "rank"zSELECT dbxref.dbname, dbxref.accession FROM dbxref join seqfeature_dbxref using (dbxref_id) WHERE seqfeature_dbxref.seqfeature_id = %s ORDER BY "rank"rH   db_xrefzeSELECT location_id, start_pos, end_pos, strand FROM location WHERE seqfeature_id = %s ORDER BY "rank"r   r   )r   NzInvalid strand z% found in database for seqfeature_id )BiopythonWarningz<Inverted location start/end (%i and %i) for seqfeature_id %szsSELECT location_id, dbname, accession, version FROM location join dbxref using (dbxref_id) WHERE seqfeature_id = %srG   r7    )type)NN)strandrefref_dbrS   join)r8   
setdefaultrI   r9   warningsBiorT   warnr   UnknownPosition_seqfeature_id
qualifiersr    "_retrieve_location_qualifier_valuelocation_operatorgetSimpleLocationlocationrW   rY   rX   CompoundLocation)r   r   sqlresultsseq_feature_listseqfeature_idseqfeature_typeseqfeature_rankqvsra   qv_nameqv_valuevalue	locationslocation_idr   r'   rW   r\   rT   remote_resultslookuprL   rM   rN   rO   featurelocsrf   _strandss                                  r   _retrieve_featuresrz      s   	  **3>G;B ~)7** 
 
!$ 	@GX!!'2.55h?	@ ** 
 "% 	?GXiq
+E!!)R077>	? .. 
 	 07 	@+KV
{^+ *0-A   S_u0'*/m)DE$ }"224{ 002k5#v>?7	@: !55( 	
 7E 		.3KG7c> k7), |#)1+F;		. ''_=!.'y>Q^q .7l+KV )K)G% %jjlCOFG)88DG&,G#&,G##*G D% 2:/UC"(**[,"G--s6wv *..Aqxx.G.7|q R7] DbDz)::4HG 	(}~)~  /s   *L
c                 R    | j                  d|f      }	 |d   S # t        $ r Y yw xY w)NzASELECT value FROM location_qualifier_value WHERE location_id = %sr   rU   )execute_and_fetch_col0r"   )r   rs   rq   s      r   rb   rb   :  s:    **K	EQx s    	&&c                    i }|j                  t        | |             |j                  t        | |             |j                  t        | |             |j                  t	        | ||             |j                  t        | |             |S N)update_retrieve_alphabet_retrieve_qualifier_value_retrieve_reference_retrieve_taxon_retrieve_comment)r   r   taxon_idannotationss       r   _retrieve_annotationsr   E  s|    K)':>?0*EF*7J?@w
HEF(*=>r   c                    | j                  d|f      }t        |      dk7  rt        dt        |       d      |d   }t        |      dk7  rt        dt        |       d      |d   }|dk(  rd}n|d	k(  rd
}n
|dk(  rd}nd }|d|iS i S )Nz7SELECT alphabet FROM biosequence WHERE bioentry_id = %sr   r6   r7   r   z%Expected 1 alphabet in response, got dnaDNArnaRNAproteinmolecule_type)r8   r    r9   )r   r   ri   	alphabetsalphabetr   s         r   r   r   O  s    **AJ=G 7|q4S\N!DEE
I
9~@Y@PPQRSS|H5	U		Y	! //	r   c                     | j                  d|f      }i }|D ]=  \  }}|dk(  rd}n|dk(  rd}n|dk(  rd}|j                  |g       j                  |       ? |S )NzqSELECT name, value FROM bioentry_qualifier_value JOIN term USING (term_id) WHERE bioentry_id = %s ORDER BY "rank"keywordkeywordsdate_changeddatesecondary_accession
accessions)r8   r[   rI   )r   r   rn   ra   namerq   s         r   r   r   g  s    

&
&	 
C J 6e9D^#D**DdB'..u56 r   c                 V   | j                  d|f      }g }|D ]  \  }}}}}}	}
t        j                         }||#||dz  }t        j                  ||      g|_        |r||_        |r||_        ||_        |	dk(  r|
|_        n|	dk(  r|
|_	        |j                  |        |rd|iS i S )NzSELECT start_pos, end_pos,  location, title, authors, dbname, accession FROM bioentry_reference JOIN reference USING (reference_id) LEFT JOIN dbxref USING (dbxref_id) WHERE bioentry_id = %s ORDER BY "rank"r   PUBMEDMEDLINE
references)r8   r   	Referencere   rf   authorstitlejournal	pubmed_id
medline_idrI   )r   r   refsr   r   r'   rf   r   r   rL   rM   	references               r   r   r   |  s     ''	 

D JCG %?sHeWfi((*	3? 
",";";E3"G!HI 'I#IO$	X"+Iy #,I )$#%$ j))	r   c                 P   i }| j                  d|f      }|r|d   |d<   | j                  d|f      }|r|d   |d<   | j                  d|f      }|r|d   r|d   dk7  r|d   |d<   g }|r4| j                  d	|f      \  }}	}
||
k(  rn|j                  d|       |
}|r4|r||d
<   |S )NzVSELECT name FROM taxon_name WHERE taxon_id = %s AND name_class = 'genbank common name'r   sourcezRSELECT name FROM taxon_name WHERE taxon_id = %s AND name_class = 'scientific name'organismz3SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %srG   
ncbi_taxidzSELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id FROM taxon, taxon_name WHERE taxon.taxon_id=taxon_name.taxon_id AND taxon_name.name_class='scientific name' AND taxon.taxon_id = %staxonomy)r|   execute_oneinsert)r   r   r   acommon_namesscientific_namesncbi_taxidsr   r   rankparent_taxon_ids              r   r   r     s   
A11	2	L
 "1o(55	.	
 (+*00={K {1~+a.C*?%a., H
&-&9&9'
 K'
#dO & 4 "! $  *Hr   c                 f    | j                  d|f      }|D cg c]  }|d   	 }}|rd|iS i S c c}w )NzESELECT comment_text FROM comment WHERE bioentry_id=%s ORDER BY "rank"r   comment)r8   )r   r   rn   commcommentss        r   r   r     sK    

&
&O	C %((DQ(H(8$$	 )s   .c                   t   e Zd ZdZd Zd Zd Zd Z eeeed      Z	ede
e   fd       Zej                  d	e
e   dd
fd       Zej                  dd       Zd Zd Zd Z eeeed      Zedej(                  fd       Zej                  d	eej(                     dd
fd       Zej                  dd       Zy
)DBSeqRecordz4BioSQL equivalent of the Biopython SeqRecord object.c           	      ,   || _         || _        | j                   j                  d| j                  f      \  | _        | _        | _        }}| _        | _        | _        |r|dk7  r| d| | _	        n|| _	        t        ||      }t        |      | _        y)a8  Create a DBSeqRecord object.

        Arguments:
         - adaptor - A BioSQL.BioSeqDatabase.Adaptor object
         - primary_id - An internal integer ID used by BioSQL

        You wouldn't normally create a DBSeqRecord object yourself,
        this is done for you when using a BioSeqDatabase object
        zSELECT biodatabase_id, taxon_id, name, accession, version, identifier, division, description FROM bioentry WHERE bioentry_id = %srG   r7   r?   N)_adaptor_primary_idr   _biodatabase_id	_taxon_idr   _identifier	_divisiondescriptionidr=   r   _per_letter_annotations)r   r   r   rM   rN   r   s         r   r   zDBSeqRecord.__init__  s      % MM%%& 
		
 NIN w#~"1WI.DGDG
 #7J7'6f'E$r   c                 |    t        | d      s%t        | j                  | j                        | _        | j                  S )N_seq)hasattrrE   r   r   r   r   s    r   	__get_seqzDBSeqRecord.__get_seq  s.    tV$%dmmT5E5EFDIyyr   c                     || _         y r~   r   )r   rC   s     r   	__set_seqzDBSeqRecord.__set_seq  s	    	r   c                     | ` y r~   r   r   s    r   	__del_seqzDBSeqRecord.__del_seq  s    Ir   z
Seq objectreturnc                 |    t        | d      s%t        | j                  | j                        | _        | j                  S )zDatabase cross references.rJ   )r   rP   r   r   rJ   r   s    r   rK   zDBSeqRecord.dbxrefs   s0     tZ(-dmmT=M=MNDM}}r   rq   Nc                     || _         y r~   rJ   r   rq   s     r   rK   zDBSeqRecord.dbxrefs'  s	    r   c                     | ` y r~   r   r   s    r   rK   zDBSeqRecord.dbxrefs+  s    Mr   c                 |    t        | d      s%t        | j                  | j                        | _        | j                  S )N	_features)r   rz   r   r   r   r   s    r   __get_featureszDBSeqRecord.__get_features/  s.    t[)/t?O?OPDN~~r   c                     || _         y r~   r   )r   featuress     r   __set_featureszDBSeqRecord.__set_features4  s	    !r   c                     | ` y r~   r   r   s    r   __del_featureszDBSeqRecord.__del_features7  s    Nr   Featuresc                 &   t        | d      szt        | j                  | j                  | j                        | _        | j                  r| j                  | j
                  d<   | j                  r| j                  | j
                  d<   | j
                  S )zAnnotations._annotationsgidata_file_division)r   r   r   r   r   r   r   r   r   s    r   r   zDBSeqRecord.annotations<  sy     t^, 5t//!D *.*:*:!!$'~~:>..!!"67   r   c                 &    |r|| _         y i | _         y r~   r   r   s     r   r   zDBSeqRecord.annotationsI  s     %D "Dr   c                     | ` y r~   r   r   s    r   r   zDBSeqRecord.annotationsP  s    r   )r   N)r/   r0   r1   r2   r   _DBSeqRecord__get_seq_DBSeqRecord__set_seq_DBSeqRecord__del_seqpropertyrC   liststrrK   setterdeleter_DBSeqRecord__get_features_DBSeqRecord__set_features_DBSeqRecord__del_featuresr   r   _AnnotationsDictr   r    r   r   r   r     s   >&FP
 9iL
ACc   ^^T#Y 4   __ 
" 
SH
!Y77 
! 
! #)*D*D!E #$ # #  r   r   N)r2   typingr   r]   r   Bio.Seqr   r   Bio.SeqRecordr   r   r
   r=   rE   rP   rz   rb   r   r   r   r   r   r   r   r   r   r   <module>r      sv       1 ) #=07 =0@
.(b&HV0*$N4n
k) kr   