
    Xf5]                         d Z ddlZddlZddlZddlmZ ddlmZ  G d de	          Z
 G d d	e
          Z G d
 de
          Z G d de
          ZdS )zSupport for various forms of sequence motif matrices.

Implementation of frequency (count) matrices, position-weight matrices,
and position-specific scoring matrices.
    N)Seq   )_pwmc                       e Zd ZdZd Zd Zd Zed             Zed             Z	ed             Z
	 dd
Zed             Zd ZdS )GenericPositionMatrixz9Base class for the support of position matrix operations.c                     d| _         |D ]f}| j         t          ||                   | _         n-| j         t          ||                   k    rt          d          d ||         D             | |<   g|| _        dS )Initialize the class.Nzdata has inconsistent lengthsc                 ,    g | ]}t          |          S  )float).0_s     1lib/python3.11/site-packages/Bio/motifs/matrix.py
<listcomp>z2GenericPositionMatrix.__init__.<locals>.<listcomp>$   s    ===E!HH===    )lengthlen	Exceptionalphabet)selfr   valuesletters       r   __init__zGenericPositionMatrix.__init__   s     	> 	>F{"!&.11F6N 3 333 ?@@@==fVn===DLL r   c                 @   d t          | j                  D             }dd                    |          z   }|g}| j        D ]D}d | |         D             }d|z  d                    |          z   }|                    |           Ed                    |          dz   }|S )zPReturn a string containing nucleotides and counts of the alphabet in the Matrix.c                     g | ]}d |z  S )z%6dr   )r   is     r   r   z1GenericPositionMatrix.__str__.<locals>.<listcomp>)   s    777q777r   z    c                     g | ]}d |z  S )z%6.2fr   )r   values     r   r   z1GenericPositionMatrix.__str__.<locals>.<listcomp>-   s    ???Wu_???r   z%c: 
)ranger   joinr   append)r   wordslinelinesr   texts         r   __str__zGenericPositionMatrix.__str__'   s    77E$+$6$6777sxx&m 	 	F??$v,???EF?SXXe__4DLLyy$&r   c                     t          |t                    rt          |          dk    r|\  }}t          |t                    rM|                    t           j                            \  }}}t          |||          } fd|D             }d}	nt          |t          j                  r j        |         }
d}	ntt          |t                    r fd|D             }d}	nNt          |t                    r't          |          dk    r|}
d}	n!t          |          t          d|z            t          |t                    r2|                     j                  \  }}}t          |||          }d}n1t          |t          j                  r|}d}nt          d|z            |	dk    r'|dk    r!t                               |
          |         S |	dk    r<|dk    r6t                               |
          t          fd|D                       S |	dk    r3|dk    r-i }|D ]&}
t                               |
          |         ||
<   '|S i }|D ].}
t                               |
          fd|D             ||
<   /t          |           j        k    r                      j        |          S |S t          |          dk    r	|d         }nt          d	          t          |t                    rM|                    t           j                            \  }}}t          |||          } fd
|D             }d}nt          |t          j                  r j        |         }d}ntt          |t                    r fd|D             }d}nNt          |t                    r't          |          dk    r|}d}n!t          |          t          d|z            |dk    rt                               |          S |dk    r'i }|D ] }t                               |          ||<   !|S t!          d          )z(Return the position matrix of index key.   c                 *    g | ]}j         |         S r   r   r   r   r   s     r   r   z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>;   s     CCCQa 0CCCr   r   c                 *    g | ]}j         |         S r   r,   r-   s     r   r   z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>A   s     ???Qa 0???r   zCannot understand key %sc              3   (   K   | ]}|         V  d S Nr   )r   index2r   s     r   	<genexpr>z4GenericPositionMatrix.__getitem__.<locals>.<genexpr>X   s'       G GF G G G G G Gr   c                      g | ]
}|         S r   r   )r   r   r   s     r   r   z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>b   s    %B%B%BAfQi%B%B%Br   r   z"keys should be 1- or 2-dimensionalc                 *    g | ]}j         |         S r   r,   r-   s     r   r   z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>n   s     999At}Q'999r   c                 *    g | ]}j         |         S r   r,   r-   s     r   r   z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>t   s     555At}Q'555r   zShould not get here)
isinstancetupler   sliceindicesr   r!   numbersIntegralstrKeyErrorr   dict__getitem__sorted	__class__RuntimeError)r   keykey1key2start1stop1stride1indices1letters1dim1letter1start2stop2stride2indices2dim2r1   dstartstopstrider9   lettersdimr   r   s   `                        @r   r?   z!GenericPositionMatrix.__getitem__3   s   c5!! 5	E3xx1}} 
ddE** F-1\\#dm:L:L-M-M*FE7$VUG<<HCCCC(CCCHDDg&677 F"mD1GDDe,, 
F????$???HDDc** F4yyA~~"& &tnn,"#=#DEEEdE** F-1\\$+-F-F*FE7$VUG<<HDDg&677 F!FDD"#=#DEEE199++D'::6BBQYY4199!--dG<<F  G G G Gh G G GGGGQYY4199A#+ M M%)%5%5dG%D%DV%L'

HA#+ C C!%!1!1$!@!@%B%B%B%B%B%B%B'

h''4=88#~~dmQ??? SQ!fCDDDc5!! 	="%++c$-.@.@"A"AE4E400G9999999GCCW-.. 	=]3'FCCU## 
	=5555555GCCS!! 	=3xx1}}smm#5;<<<!88##D&111AXXA! ; ; ,,T6::&		H4555r   c                     d}t          | j                  D ]6}t          j         }| j        D ]}| |         |         }||k    r|}|}||z  }7t          |          S )zReturn the consensus sequence. r!   r   mathinfr   r   )r   sequencer   maximumr   countsequence_letters          r   	consensuszGenericPositionMatrix.consensus   su     t{## 	( 	(AxiG- - -VQ7??#G&,O'HH8}}r   c                     d}t          | j                  D ]5}t          j        }| j        D ]}| |         |         }||k     r|}|}||z  }6t          |          S )z"Return the anticonsensus sequence.rY   rZ   )r   r]   r   minimumr   r_   r`   s          r   anticonsensusz#GenericPositionMatrix.anticonsensus   ss     t{## 	( 	(AhG- - -VQ7??#G&,O'HH8}}r   c                     ddddddddd	d
dddddd}d}t           j                  D ] fd}t           |d          } fd|D             }|d         t          |dd                   k    r|d         d|d         z  k    r	|d         }ndt          |dd                   z  dt          |          z  k    r+d                    t          |dd                             }n9|d         dk    r+d                    t          |dd                             }nd}|                    ||          }||z  }t          |          S )z)Return the degenerate consensus sequence.ACGTMRWSYKVHDBN)rf   rg   rh   ri   ACAGATCGCTGTACGACTAGTCGTACGTrY   c                      |                   S r0   r   )
nucleotider   r   s    r   getz7GenericPositionMatrix.degenerate_consensus.<locals>.get   s    J'**r   T)rC   reversec                 ,    g | ]}|                  S r   r   )r   cr   r   s     r   r   z>GenericPositionMatrix.degenerate_consensus.<locals>.<listcomp>   s!    666Qd1gaj666r   r   r   Nr*         r   )r!   r   r@   sumr"   r   r   )	r   degenerate_nucleotider]   r   nucleotidescountsrC   r   r   s	   `       @r   degenerate_consensusz*GenericPositionMatrix.degenerate_consensus   s    !
 !
" t{## 	# 	#A+ + + + + + !3===K66666+666Fay3vabbz??**vay1vay=/H/H!!nS__$q3v;;66ggf[!_5566aggf[!_5566.223<<J
"HH8}}r   Nr   c                    | j         }t          |                              d          t          d          k    rd}nd}||t          d          d}t	          | j                  D ]`}d}	d}
|D ]}| |         |         }|
|z  }
||	k    r|}	|} |	||
z  k     r|}n'||
dz  }n||
z  }|	|k    r|                                }||z  }ant          d	          |S )
aY  Return the consensus sequence (as a string) for the given parameters.

        This function largely follows the conventions of the EMBOSS `cons` tool.

        Arguments:
         - substitution_matrix - the scoring matrix used when comparing
           sequences. By default, it is None, in which case we simply count the
           frequency of each letter.
           Instead of the default value, you can use the substitution matrices
           available in Bio.Align.substitution_matrices. Common choices are
           BLOSUM62 (also known as EBLOSUM62) for protein, and NUC.4.4 (also
           known as EDNAFULL) for nucleotides. NOTE: This has not yet been
           implemented.
         - plurality           - threshold value for the number of positive
           matches, divided by the total count in a column, required to reach
           consensus. If substitution_matrix is None, then this argument must
           be None, and is ignored; a ValueError is raised otherwise. If
           substitution_matrix is not None, then the default value of the
           plurality is 0.5.
         - identity            - number of identities, divided by the total
           count in a column, required to define a consensus value. If the
           number of identities is less than identity * total count in a column,
           then the undefined character ('N' for nucleotides and 'X' for amino
           acid sequences) is used in the consensus sequence. If identity is
           1.0, then only columns of identical letters contribute to the
           consensus. Default value is zero.
         - setcase             - threshold for the positive matches, divided by
           the total count in a column, above which the consensus is is
           upper-case and below which the consensus is in lower-case. By
           default, this is equal to 0.5.
        zACGTUN-rt   XNz5plurality must be None if substitution_matrix is NonerY   r   r*   zDcalculate_consensus currently only supports substitution_matrix=None)r   setunion
ValueErrorr!   r   lowerNotImplementedError)r   substitution_matrix	pluralityidentitysetcaser   	undefinedr]   r   r^   totalr   r_   consensus_lettersetcase_thresholds                  r   calculate_consensusz)GenericPositionMatrix.calculate_consensus   sE   D =x==y))S^^;;III&$ K   H4;'' - -& 2 2F LOEUNEw"'+1(X---'0$$,1AI)),3eO)"333+;+A+A+C+C(,,%-( &V   r   c                     | j         }d}d}t          | j                  D ]-}|D ](}|dv r|| |         |         z  }|| |         |         z  }).||z  S )z Compute the fraction GC content.        rx   )r   r!   r   )r   r   gc_totalr   r   r   s         r   
gc_contentz GenericPositionMatrix.gc_content  s~     =t{## 	) 	)A" ) )T>>VQ/Hfa() %r   c                 H   i }| j         dk    r)| d         ddd         |d<   | d         ddd         |d<   n(| d         ddd         |d<   | d         ddd         |d<   | d         ddd         |d<   | d         ddd         |d<   | j         }|                     ||          S )	zCompute reverse complement.ACGUUNrf   ri   rg   rh   )r   rA   )r   r   r   s      r   reverse_complementz(GenericPositionMatrix.reverse_complement%  s    =F""s)DDbD/F3Ks)DDbD/F3KKs)DDbD/F3Ks)DDbD/F3K3i"os3i"os=~~h///r   )NNr   N)__name__
__module____qualname____doc__r   r(   r?   propertyra   rd   r   r   r   r   r   r   r   r   r      s        CC
! 
! 
!
 
 
S6 S6 S6j   X   X + + X+\ MQD D D DL 
  
  X
 0 0 0 0 0r   r   c                       e Zd ZdZddZdS )FrequencyPositionMatrixzGClass for the support of frequency calculations on the Position Matrix.Nc                    i }|| j         D ]}dg| j        z  ||<   nft          |t                    r,| j         D ]#}t	          ||                   g| j        z  ||<   $n%| j         D ]}t	          |          g| j        z  ||<   t          | j                  D ].}| j         D ]$}||         |xx         | |         |         z  cc<   %/t          | j         |          S )a  Create and return a position-weight matrix by normalizing the counts matrix.

        If pseudocounts is None (default), no pseudocounts are added
        to the counts.

        If pseudocounts is a number, it is added to the counts before
        calculating the position-weight matrix.

        Alternatively, the pseudocounts can be a dictionary with a key
        for each letter in the alphabet associated with the motif.
        Nr   )r   r   r6   r>   r   r!   PositionWeightMatrix)r   pseudocountsr   r   r   s        r   	normalizez!FrequencyPositionMatrix.normalize7  s'    - 5 5"%!4v5d++ 	E- M M"'V(<"="=!>!LvM - E E"'"5"5!6!Dvt{## 	5 	5A- 5 5vq!!!T&\!_4!!!!5 $DM6:::r   r0   )r   r   r   r   r   r   r   r   r   r   4  s.        QQ; ; ; ; ; ;r   r   c                        e Zd ZdZd ZddZdS )r   zDClass for the support of weight calculations on the Position Matrix.c                     t                                ||           t           j                  D ]9t	           fd|D                       }|D ]} |         xx         |z  cc<   :|D ]}t           |                    |<   dS )r	   c              3   4   K   | ]}|                  V  d S r0   r   r   r   r   r   s     r   r2   z0PositionWeightMatrix.__init__.<locals>.<genexpr>[  s+      ??FVQ??????r   N)r   r   r!   r   r   r7   )r   r   r   r   r   r   s   `    @r   r   zPositionWeightMatrix.__init__W  s    &&tXv>>>t{## 	) 	)A?????h?????E" ) )VQ5() 	/ 	/F f..DLL	/ 	/r   Nc                 l   i }| j         }|!t                              | j         d          }nt          |          }t          |                                          }|D ]}||xx         |z  cc<   g ||<   t          | j                  D ]}|D ]}||         }|dk    r;| |         |         }|dk    rt          j        ||z  d          }	n;t          j	         }	n-| |         |         }|dk    rt          j	        }	nt          j
        }	||                             |	           t          ||          }
|
S )a/  Return the Position-Specific Scoring Matrix.

        The Position-Specific Scoring Matrix (PSSM) contains the log-odds
        scores computed from the probability matrix and the background
        probabilities. If the background is None, a uniform background
        distribution is assumed.
        N      ?r   r*   )r   r>   fromkeysr   r   r!   r   r[   logr\   nanr#   PositionSpecificScoringMatrix)r   
backgroundr   r   r   r   r   bplogoddspssms              r   log_oddszPositionWeightMatrix.log_oddsa  sX    =t}c::JJj))JJ%%''(( 	  	 Fv%'F6NNt{## 	/ 	/A" / /v&q55VQA1uu"&(1q5!"4"4#'8)VQA1uu"&("&(v%%g..../ -Xv>>r   r0   )r   r   r   r   r   r   r   r   r   r   r   T  s=        NN/ / /# # # # # #r   r   c                       e Zd ZdZd ZddZed             Zed             Zed	             Z	ddZ
ddZd Zd ZddZd
S )r   zGClass for the support of Position Specific Scoring Matrix calculations.c                     t           j                  g dk    rt          d j        z            	 t          |          }nr# t          $ rL 	 t          |d          }n7# t          $ r t          d          dt
          $ r t          d          dw xY wY nt          $ r t          d          dw xY wt          |          } j        }t          j
        ||z
  dz   t          j                  }t          j         fdt          |          D             t                    }t          j        |||           t          |          dk    r|d	         S |S )
ag  Return the PWM score for a given sequence for all positions.

        Notes:
         - the sequence can only be a DNA sequence
         - the search is performed only on one strand
         - if the sequence and the motif have the same length, a single
           number is returned
         - otherwise, the result is a one-dimensional numpy array

        )rf   rg   rh   ri   z6PSSM has wrong alphabet: %s - Use only with DNA motifsASCIIzBsequence should be a Seq, MutableSeq, string, or bytes-like objectNz-sequence should contain ASCII characters onlyr   c                 0    g | ]fd dD             S )c                 ,    g | ]}|                  S r   r   r   s     r   r   zFPositionSpecificScoringMatrix.calculate.<locals>.<listcomp>.<listcomp>  s!    333&d6l1o333r   r   r   r-   s    @r   r   z;PositionSpecificScoringMatrix.calculate.<locals>.<listcomp>  s2    FFF33333F333FFFr   r   )r@   r   r   bytes	TypeErrorUnicodeEncodeErrorr   r   r   npemptyfloat32arrayr!   r   r   	calculate)r   r]   nmscoresr   s   `     r   r   z'PositionSpecificScoringMatrix.calculate  s    $-  $8$8$888H4=X  	XHH 
	 
	 
		 733    X  &    C    	 	 	T 	
 MMK !a%!)RZ00(FFFFU1XXFFF
 
 	x&111v;;!!9Ms)   A 
B3A B3 4BB3B3r   T@B c              #   P  K   |                                 }t          |          }| j        }t          j        d||          }|r|                                 }|D ]I}	||	|	|z   |z   dz
           }
|                     |
          }||k    }t          j        |          d         |	z   }||         }|rA|                    |
          }||k    }t          j        |          d         |	z   }||         }n6t          j        dt                    }t          j        dt                    }t          j
        |||z
            }t          j
        ||          }t          j        t          j
        ||                    }||         }||         }t          ||          E d{V  KdS )zFind hits with PWM score above given threshold.

        A generator function, returning found hits in the given sequence
        with the pwm score higher than the threshold.
        r   r   )dtypeN)upperr   r   r   aranger   r   wherer   intr#   argsortzip)r   r]   	thresholdboth	chunksizeseq_lenmotif_lchunk_startsrcchunk_startsubseq
pos_scorespos_indpos_positions
neg_scoresneg_indneg_positionschunk_positionschunk_scoresorders                       r   searchz$PositionSpecificScoringMatrix.search  s      >>##h--+yGY77 	+((**B' 	: 	:KkK),Cg,MPQ,QQRF//J I-GHW--a0;>M#G,J 6\\&11
$	1 " 1 1! 4{ B'0

 "!C 8 8 8Xq555
 i}w7NOOO9Z<<LJryFFGGE-e4O'.L?L9999999999'	: 	:r   c                      d} j         }t           j                  D ]!|t           fd|D                       z  }"|S )zoMaximal possible score for this motif.

        returns the score computed for the consensus sequence.
        r   c              3   4   K   | ]}|                  V  d S r0   r   r   r   positionr   s     r   r2   z4PositionSpecificScoringMatrix.max.<locals>.<genexpr>  ,      FFFfh/FFFFFFr   )r   r!   r   maxr   scorerV   r   s   `  @r   r   z!PositionSpecificScoringMatrix.max  ^     -dk** 	G 	GHSFFFFFgFFFFFFEEr   c                      d} j         }t           j                  D ]!|t           fd|D                       z  }"|S )zsMinimal possible score for this motif.

        returns the score computed for the anticonsensus sequence.
        r   c              3   4   K   | ]}|                  V  d S r0   r   r   s     r   r2   z4PositionSpecificScoringMatrix.min.<locals>.<genexpr>  r   r   )r   r!   r   minr   s   `  @r   r   z!PositionSpecificScoringMatrix.min  r   r   c                      t          d          )zCompute the GC-ratio.z,Cannot compute the %GC composition of a PSSM)r   )r   s    r   r   z(PositionSpecificScoringMatrix.gc_content  s     FGGGr   Nc                    |!t                               | j        d          }nt          |          }t          |                                          }| j        D ]}||xx         |z  cc<   d}t          | j                  D ]n}| j        D ]d}| ||f         }t          j        |          r!t          j	        |          r|dk     r<||         }|t          j
        d|          z  }|||z  z  }eo|S )z.Return expected value of the score of a motif.Nr   r   r   r*   )r>   r   r   r   r   r!   r   r[   isnanisinfpow)	r   r   r   r   sxr   r   r   r   s	            r   meanz"PositionSpecificScoringMatrix.mean  s   t}c::JJj))JJ%%''((m 	( 	(Fv%'t{## 		" 		"A- " "vqy/:g&& :g&& 7Q;;v&G,,,a'k!" 	r   c                 f   |!t                               | j        d          }nt          |          }t          |                                          }| j        D ]}||xx         |z  cc<   d}t          | j                  D ]}d}d}| j        D ]o}| ||f         }t          j        |          r!t          j	        |          r|dk     r<||         }	|	t          j
        d|          z  }
||
|z  z  }||
|z  |z  z  }p|||z  z  }||z  }t          |d          }t          j        |          S )z2Return standard deviation of the score of a motif.Nr   r   r   r*   )r>   r   r   r   r   r!   r   r[   r   r   r   r   sqrt)r   r   r   r   variancer   r   sxxr   r   r   s              r   stdz!PositionSpecificScoringMatrix.std  sg   t}c::JJj))JJ%%''((m 	( 	(Fv%'t{## 	 	ABC- 	- 	-vqy/:g&& :g&& 7Q;;v&G,,,a'k!q7{W,,27NCOHHx##y"""r   c                    | j         |j         k    rt          d          d}t          | j         dz   |j                  D ]A}|dk     r|                     ||           }n|                    | |          }||k     r|}| }Bd|z
  |fS )zReturn the similarity score based on pearson correlation for the given motif against self.

        We use the Pearson's correlation of the respective probabilities.
        z.Cannot compare motifs with different alphabetsr   r   )r   r   r!   r   dist_pearson_at)r   othermax_poffsetr   max_os         r   dist_pearsonz*PositionSpecificScoringMatrix.dist_pearson/  s    
 =EN**MNNNT[L1,el;; 	  	 Fzz((88))$77qyy5y%r   c           	           j         }d}d}d}d}d}t           j        j        z             t          |          z  }	t	          t           j        z
  j                            D ] fd|D             }
fd|D             }|t          |
          z  }|t          |          z  }|t          d |
D                       z  }|t          d t          |
|          D                       z  }|t          d |D                       z  }||	z  }||	z  }||	z  }||	z  }||	z  }|||z  z
  }t          j	        |||z  z
  |||z  z
  z            }||z  S )zMReturn the similarity score based on pearson correlation at the given offset.r   c                 *    g | ]}|z   f         S r   r   )r   r   r
  posr   s     r   r   zAPositionSpecificScoringMatrix.dist_pearson_at.<locals>.<listcomp>L  s'    CCC$vsV|+,CCCr   c                 $    g | ]}|f         S r   r   )r   r   r  r  s     r   r   zAPositionSpecificScoringMatrix.dist_pearson_at.<locals>.<listcomp>M  s"    ;;;%$;;;r   c              3       K   | ]	}||z  V  
d S r0   r   )r   xs     r   r2   z@PositionSpecificScoringMatrix.dist_pearson_at.<locals>.<genexpr>P  &      ))q1u))))))r   c              3   &   K   | ]\  }}||z  V  d S r0   r   )r   r  ys      r   r2   z@PositionSpecificScoringMatrix.dist_pearson_at.<locals>.<genexpr>Q  s*      55Aq1u555555r   c              3       K   | ]	}||z  V  
d S r0   r   )r   r  s     r   r2   z@PositionSpecificScoringMatrix.dist_pearson_at.<locals>.<genexpr>R  r  r   )
r   r   r   r   r!   r   r   r   r[   r  )r   r  r
  rV   r   syr  sxysyynormxiyi	numeratordenominatorr  s   ```           @r   r  z-PositionSpecificScoringMatrix.dist_pearson_atB  s   -4; 566WET[615<@@AA 	* 	*CCCCCCC7CCCB;;;;;7;;;B#b''MB#b''MB3))b))))))C355R555555C3))b))))))CC
d

d
ttt"r'M	irBw3b= ABB;&&r     c                    ddl m} |!t                              | j        d          }nt          |          }t          |                                          }| j        D ]}||xx         |z  cc<    ||| |          S )z@Calculate the distribution of the scores at the given precision.r   )ScoreDistributionNr   )	precisionr   r   )
thresholdsr!  r>   r   r   r   r   )r   r   r"  r!  r   r   s         r   distributionz*PositionSpecificScoringMatrix.distribution\  s    111111t}c::JJj))JJ%%''((m 	( 	(Fv%'  94JWWWWr   )r   Tr   r0   )Nr  )r   r   r   r   r   r   r   r   r   r   r   r  r  r  r$  r   r   r   r   r     s        QQ3 3 3j: : : :B 	 	 X	 	 	 X	 H H XH   ,# # # #8     &' ' '4X X X X X Xr   r   )r   r[   r:   numpyr   Bio.Seqr   rY   r   r>   r   r   r   r   r   r   r   <module>r'     s'                     Y0 Y0 Y0 Y0 Y0D Y0 Y0 Y0x; ; ; ; ;3 ; ; ;@0 0 0 0 00 0 0 0f`X `X `X `X `X$9 `X `X `X `X `Xr   