
    Xf@                         d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ  G d dej                  Z G d	 d
ej                  ZdS )a  Bio.Align support for tabular output from BLAST or FASTA.

This module contains a parser for tabular output from BLAST run with the
'-outfmt 7' argument, as well as tabular output from William Pearson's
FASTA alignment tools using the '-m 8CB' or '-m 8CC' arguments.
    N)	Alignment)
interfaces)Seq)	SeqRecordc                       e Zd ZdZ ej                    Z ej                    Z ej                    Z ej                    Z	dS )Statez=Enumerate alignment states needed when parsing a BTOP string.N)
__name__
__module____qualname____doc__enumautoMATCH	QUERY_GAP
TARGET_GAPNONE     1lib/python3.11/site-packages/Bio/Align/tabular.pyr   r      sH        GGDIKKE	IJ49;;DDDr   r   c                   4    e Zd ZdZdZd Zd Zd Zd Zd Z	dS )	AlignmentIteratora<  Alignment iterator for tabular output from BLAST or FASTA.

    For reading (pairwise) alignments from tabular output generated by BLAST
    run with the '-outfmt 7' argument, as well as tabular output generated by
    William Pearson's FASTA alignment programs with the '-m 8CB' or '-m 8CC'
    output formats.
    Tabularc                     	 t          |          }n# t          $ r t          d          d w xY w|                    d          st          d          |                                }|                     ||           d S )NzEmpty file.# zMissing header.)nextStopIteration
ValueError
startswithrstrip_parse_header)selfstreamlines      r   _read_headerzAlignmentIterator._read_header-   s    	6<<DD 	6 	6 	6]++5	6t$$ 	0.///{{}}64(((((s    -c           	         i }d}	 |dd                               d d          \  }}||vrt          d          	 ||c|d<   |d<   d| _        n# t          $ rv |dd          |d<   t          |          }|                    d	          sJ |dd                                                               d d          \  |d<   |d<   d
| _        Y nw xY w|D ]}|                                }|                    d	          sJ 	 |dd                               d          \  }}nO# t          $ rB d}	|                    |	          sJ t          |dt          |	                              }
Y  nw xY w|dk    r|d         dk    rK|
                    dd          \  }}|                                 \  }}t          |          | _        |dv sJ n	|}d | _        	 |                     d d          \  | _        | _        +# t          $ r$ |                                | _        d | _        Y Xw xY w|dk    r||d<   i|dk    r|                     d          | _        |dk    r||d<   || _        d S )N)	BLASTNBLASTPBLASTXTBLASTNTBLASTX
DELTABLASTPSIBLASTRPSBLAST
RPSTBLASTN      zNot a BLAST programProgramVersionz# BLAST processed zCommand liner   z# FASTA processed z: z hits foundQueryFASTAz - )ntaaDatabaseFieldsz, RID)splitr   _final_prefixr   r   r   stripendswithintlenrsplit_query_size	_query_id_query_description_fieldsmetadata)r!   r"   r#   rE   blast_programsprogramversionprefixvaluesuffixhits
query_line
query_sizeunits                 r   r    zAlignmentIterator._parse_header7   s   

	6#ABBx~~dA66GWn,, !6777 - 8?4HY)!4!5D  	6 	6 	6'+ABBxH^$<<D??4(((((7;ABBx7H7H7N7NtUV7W7W4HY)!4!5D	6  	( 	(D::<<D??4((((( $QRRt 4 4   &}}V,,,,,4S[[L 0122	
   I&'11-2\\%-C-C*J
'1'7'7'9'9$J'*:D$</////!&J'+D$3>H>N>NtUV>W>W;DND$;$;! 3 3 3%/%5%5%7%7DN.2D+++3 :%%',$$8##${{4005"' s7   4A A=CC D##AE/.E/#G>>*H,+H,c                    |D ]~}|                                 }|                    d          rS|                    | j                  r"|                    d          r| `| `| `| `| ` d S |                     ||           ~ d }d }d }d }d }d }d }	d }
d }d }d }d }d }d }d }| j        }|	                    d          }t          |          t          | j                  k    sJ i }i }i }t          || j                  D ]~\  }}|dk    r|}| j        || j        k    sJ #|dk    r|}	,|dk    rt          |          ||<   E|dk    rt          |          }[|dk    rt          |          ||<   t|d	k    rt          |          ||<   |d
k    rt          |          }
|dk    rt          |          }|dk    rt          |          }|dk    rt          |          }|dk    rt          |          |d<   |dk    rt          |          |d<   |dk    r|                     |          }5|dk    r|                     |          }R|dk    r||d<   _|dk    r||d<   l|dk    r||d<   ||}}|dk    r*|t          |          }|t          |          k    sJ |dk    r||d<   |dk    r||d<   |dk    r||d<   |dk    r||d<   |dk    r||d <   |d!k    r||d"<   |d#k    r||d$<   |d%k    r||d&<   |d'k    r||d(<   "|d)k    r||d*<   /|d+k    r||d,<   <|d-k    r||d.<   I|d/k    r||d0<   V|d1k    rt          |          |d2<   p|d3k    r||d<   |	|}	|d4k    rt          |          }|d5k    r|}|d6k    r|}|d7k    rt          |          }|d8k    rt          |          }|||<   |d9k    rt          |          ||<   |d:k    rt          |          ||<   |d;k    rt          |          ||<   -|d<k    rt          |          ||<   G|d=k    r|||<   T|d>k    r||d?<   a|d@k    r||d?<   nt!          dA|z            | j        dB         }||||d<   |
||
|k     r|
dCz  }
n|dCz  }||||k     r|dCz  }n|dCz  }||dDv r|
|
|dE<   |||dF<   n3|1|
|k     r|dCd d fxx         |
z  cc<   n|
|dCd d f         z
  |dCd d f<   ||dGv r|||dE<   |||dF<   n||dHd d fxx         |z  cc<   ||d }nt%          d |I          }nz|                    dJdK          }|dLk    r,t          |          ||
z
  k    sJ t%          |
|i|I          }n2|dMk    r|
|dE<   ||dF<   t%          |          }nt)          dN|z            t+          ||O          }| j        | j        |_        |r||_        | j        dB         dGv r0||dP<   |d }n|                    dJdK          }t%          |          }n^||d }nWt%          d |I          }nE|                    dJdK          }|-|+t          |          ||z
  k    sJ t%          ||i|I          }t+          ||	O          }|r||_        ||g}t1          ||          }||_        |||_        |S )QNr   z queries	zquery idz
subject idz
% identityzalignment length
mismatchesz	gap openszq. startzq. endzs. startzs. endevaluez	bit scoreBTOPaln_codezquery gigiz
query acc.zacc.zquery acc.verzacc.verzquery lengthzsubject idsidsz
subject gizsubject gisgiszsubject acc.zsubject accs.zaccs.zsubject tax idsztax idszsubject sci namesz	sci nameszsubject com namesz	com nameszsubject blast nameszblast nameszsubject super kingdomszsuper kingdomszsubject titletitlezsubject titlestitleszsubject strandstrandz% subject coveragez
% coveragezsubject acc.verzsubject lengthz	query seqzsubject seqscore	identical	positivesgapsz% positivesz% hsp coveragezquery/sbjct frameszquery frameframezsbjct framezUnexpected field '%s'r1   r0   )r(   r*   startend)r)   r*   r   )length- r)   r*   zUnknown program %s)idrc   )r   r   r;   r=   rD   rB   rC   rA   r    r:   r?   zipfloatr>   
parse_btopparse_cigarr   rE   r   replace	Exceptionr   descriptionannotationsr   r\   ) r!   r"   r#   alignment_lengthr]   btopcigarr\   query_id	target_idquery_start	query_endtarget_start
target_endquery_sequencetarget_sequencetarget_lengthcoordinatesrN   columnsrn   query_annotationstarget_annotationscolumnfieldrG   	query_seqquery
target_seqtargetrecords	alignments                                    r   _read_next_alignmentz&AlignmentIterator._read_next_alignments   s	    	 	D;;==Dt$$ 
??4#566 4==;T;T /(*FF""640000			
%
**T""7||s4<000000 $,77 f	B f	BMFE
""!>-#t~5555,&&"		,&&%*6]]E"",,,#&v;;  ,&&%([[E""+%%%([[E""*$$!&kk(""KK		*$$"6{{("" [[

(""(-fH%%+%%+0==K((&"oof55*$$"..v66*$$*0!$'',&&,2!&))/))/5!),#%H.((%!$VJJ%V44444-'',2"5)),&&+1"4((-'',2"5)).((-3"6**/)).4"7+++++06"9-----28";//---28";/////4:"=112227="#344/)).4"7++***/5"8,,***/5"8,,...38=="<00+++06"9-$ &I*** #F+%%!'-''"('!!F+%%KK	%.E""+%%%([[E""&%([[E""-''%*6]]E""***%*6]]E""...%+E""-''-3!'**-''.4"7++ !85!@AAA-	*+2B./"y'<Y&&q Q	#
(>j((!a
'-B"B"B&-8!'*$+4!%($Y&&AqqqD!!![0!!!! %0+ad2C$CAqqqD!'-C"C"C'.:"7+%,6"5)$111-!! 		Z888		+33C<<N)##>**i+.EEEEEn =jQQQ		I%%-8!'*+4!%(//		 4w >???)111". $ 7E 	2 1E=#'===+8x(&!

"1"9"9#r"B"B 11

&%!%JJ!$T*!=!=!=JJ"1"9"9#r"B"B+
0F//:3LLLLL!$lO%DZ!X!X!XJ:)444 	4!3F5/g{33	 +	#IOr   c                    g }g }|                     d           |                     d           t          j        }t          j        d|          }|D ]}|                    d          rc|t          j        k    rB|                     |d                    |                     |d                    t          j        }|dxx         dz  cc<   {|                    d          rc|t          j        k    rB|                     |d                    |                     |d                    t          j        }|dxx         dz  cc<   	 t          |          }n# t          $ r d}Y nw xY w|t          j        k    r"|dxx         |z  cc<   |dxx         |z  cc<   H|                     |d         |z              |                     |d         |z              t          j        }t          j        ||g          }|S )zParse a BTOP string and return alignment coordinates.

        A BTOP (Blast trace-back operations) string is used by BLAST to
        describe a sequence alignment.
        r   z([A-Z-*]{2}|\d+)rd   r0   )appendr   r   refindallr   r   r=   r   r>   r   r   nparray)	r!   rp   target_coordinatesquery_coordinatesstatetokenstokenrc   r{   s	            r   ri   zAlignmentIterator.parse_btopO  s?     !!!$$$  ###
/66  	( 	(E$$ (EO++&--.@.DEEE%,,->r-BCCC!OE"2&&&!+&&&&$$ (E,,,&--.@.DEEE%,,->r-BCCC!,E!"%%%*%%%% ZZFF!   FFF EK''&r***f4***%b)))V3))))&--.@.Dv.MNNN%,,->r-BV-KLLL!KEEh 24EFGGs   EE%$E%c                    g }g }d}d}|                     |           |                     |           t          j        }t          j        d|          }t          |ddd         |ddd                   D ]f\  }}	t          |          }|	dk    r||z  }||z  }n|	dk    r||z  }n|	dk    r||z  }|                     |           |                     |           gt          j        ||g          }
|
S )	a  Parse a CIGAR string and return alignment coordinates.

        A CIGAR string, as defined by the SAM Sequence Alignment/Map format,
        describes a sequence alignment as a series of lengths and operation
        (alignment/insertion/deletion) codes.
        r   z(M|D|I|\d+)Nr/   r0   MID)	r   r   r   r   r   rg   r>   r   r   )r!   rq   r   r   target_coordinatequery_coordinater   r   rc   	operationr{   s              r   rj   zAlignmentIterator.parse_cigar{  s9     !!"3444  !1222
NE22 "%VCCaC[&A,!?!? 
	7 
	7FI[[FC!V+! F*  c!!!V+!!c!! F* %%&7888$$%56666h 24EFGGr   N)
r	   r
   r   r   fmtr$   r    r   ri   rj   r   r   r   r   r   "   sv          C) ) ):! :! :!xZ Z Zx* * *X    r   r   )r   r   r   numpyr   	Bio.Alignr   r   Bio.Seqr   Bio.SeqRecordr   Enumr   r   r   r   r   <module>r      s      				                             # # # # # #    DI   w w w w w
4 w w w w wr   