a
    d6:                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZdd Z	dd Z
dd Zdd	d
Zdd Zdd Zdd Zdd ZdddZdddZdS )    Nc                 C   s"   t ttdtt|  d S )z@Transforming error rate to ASCII character using the Phred scalei!   )chrintroundmathZlog10abs)Zpval r   0lib/python3.9/site-packages/mapdamage/rescale.pyphred_pval_to_char   s    r
   c                 C   s   dt t| t d  d  S )zATransforming ASCII character in the Phred scale to the error rate
   r   )floatord)Zchr   r   r	   phred_char_to_pval   s    r   c           
   
   C   sD  t j| d}t j|s,td|  d  zt|}t|}i }|D ]V}|d |v rztd| |j	||d  f  qJt
|d t
|d d|t|d < qJt| D ] }|| k s||kr|| q|W  d	   W S 1 s0    Y  W nH tjy> }	 z,td
t j| d|j	|	f  W Y d	}	~	n
d	}	~	0 0 d	S )z
    Reads the damage probability correction file, returns a
    dictionary with this structure
    position (one based)  -  CT  -  probability
                          -  GA  -  probability
    zStats_out_MCMC_correct_prob.csvzRMissing file, the file 
	Stats_out_MCMC_correct_prob.csv
should be in the folder
	z2
Did you run the MCMC estimates of the parameters?ZPositionz;This file has multiple position definitions %s, line %d: %sC.TG.A)r   r   NzFile %s, line %d: %s)ospathjoinisfilesysexitopencsvZ
DictReaderZline_numr   r   listkeyspopError)
folderrescale_length_5prescale_length_3p	full_pathfiZ	fi_handle	corr_problinekeyer   r   r	   get_corr_prob   s*    

((r&   bothc           
      C   s   |dkrt |dkr"|dkr"d}n|dkr8|dkr8d}ndS || d }|| v r^| | | }nd}|| v rx| | | }	nd}	|d	kr|S |d
kr|	S |dkr|t|k r|S |	S ntddS )a  
    The position specific damaging correction, using the input
    corr_prob dictionary holding the damage correcting values
    nt_seq nucleotide in the sequence
    nt_ref nucleotide in the reference
    pos relative position from the 5' end
    length length of the sequence
    direction which end to consider the rescaling
    returns the correction probability for this particular set
    r   TCr   AGr      forwardZbackwardr'   z-Abnormal direction in the rescaling procedureN)SystemErrorr   
SystemExit)
r"   nt_seqnt_refposlength	directionsubsZback_posZp5_corrZp3_corrr   r   r	   corr_this_base3   s.    r6   c                  C   sp   t ttttdddgd } |  |  |  |  |  |  |  |  ddddddddddd}|S )zJInitialize a substitution table, to track the expected substitution countsr              )	CT-before	TC-before	GA-before	AG-beforeCT-afterTC-afterGA-afterAG-afterr*   r)   r+   r(   CT-pvalsCT-pvals_beforeTC-pvalsGA-pvalsGA-pvals_beforeAG-pvals)dictr   ziprangecopy)Zper_qualr5   r   r   r	   initialize_subsi   s*    "rK   c                 C   sl  |dkr>|dkr>d}| d  |7  < | d  dt | 7  < n|dkr||dkr|d	}| d
  |7  < | d  dt | 7  < n|dkr|dkrd}| d  dt | 7  < ||krtdnB|dkr|dkrd}| d  dt | 7  < ||krtdnd}|dkrN| |d  tt|d   d7  < | |d  tt|d   d7  < |dv rh| |  d7  < dS )zU record the expected substitution change, prob_corr is the excact version for nt_qualr(   r)   ZCTrA   rB   r,   r*   r+   ZGArD   rE   ZTCrC   z=Internal error: rescaling qualities for the wrong transitionsZAGrF   ZNNz-beforer   z-after)r*   r)   r+   r(   N)r   r.   r   r   )r5   r0   r1   nt_qualZ
nt_newqualZ	prob_corrZsub_typer   r   r	   record_subs   s0    


$$
rM   c                 C   sp   dD ]f}dD ]\}| | D ]N}||kr|d t | }|| v rV| |  | | | 7  < q| | | | |< qqqdS )z;Calculates summary statistics for the substition table subs)r9   r:   r;   r<   r=   r>   r?   r@   )r   r         (   z-QN)str)r5   ilvZqvr$   r   r   r	   qual_summary_subs   s    rT   c                 C   s  t d | d dkrJt dt| d | d   d t| d | d    nt d | d	 dkrt d
t| d | d	   d t| d | d	    nt d | d dkrt dt| d | d   d t| d | d    nt d | d dkr*t dt| d | d   d t| d | d    nt d t d t dt| d  d t| d   t dt| d  d t| d   t dt| d  d t| d   t d t| d!  d t| d"   t d#t| d$  d t| d%   t d&t| d'  d t| d(   t d)t| d*  d t| d+   t d,t| d-  d t| d.   t d/t| d0  d t| d1   t d2t| d3  d t| d4   d5S )6zPrint the substition tablezh	The expected substition frequencies before and after scaling using the scaled qualities as probalities:r)   r   z	CT	rB   z		rA   z
	CT	NA		NAr(   z	TC	rC   z
	TC	NA		NAr+   z	GA	rE   rD   z
	GA	NA		NAr*   z	AG	rF   z
	AG	NA		NAz)	Quality metrics before and after scalingz	CT-Q0 	zCT-before-Q0zCT-after-Q0z		CT-Q10 	zCT-before-Q10zCT-after-Q10z		CT-Q20 	zCT-before-Q20zCT-after-Q20z		CT-Q30 	zCT-before-Q30zCT-after-Q30z		CT-Q40 	zCT-before-Q40zCT-after-Q40z	GA-Q0 	zGA-before-Q0zGA-after-Q0z		GA-Q10 	zGA-before-Q10zGA-after-Q10z		GA-Q20 	zGA-before-Q20zGA-after-Q20z		GA-Q30 	zGA-before-Q30zGA-after-Q30z		GA-Q40 	zGA-before-Q40zGA-after-Q40N)printrQ   )r5   r   r   r	   
print_subs   s0    6666$$$$$$$$$rV   Fc              
   C   sd  |st t}|j}tj|}	| |j}
|	|
t
|	t|	 }tj|j||jd|\}}}t|}t|}|jrtj|}tj|}|ddd }dg| }d}d}tt||||D ]\}}}}|dkr|dks|dkr>|d	kr>d
t||||d
 ||d }d
t| }|| }td
| }|d
| 7 }nd
t| }|}||k r|||< t|||||| | |dkr|d
7 }q|s|d|j  qqd|}|jr|ddd }|jd d dkr|jd|jd d
  | }|jd d dkr(||j|jd d
  d  }||_td| }|drRt d| |!d|d |S )a  
    bam              a pysam bam object
    read             a pysam read object
    ref              a pysam fasta ref file
    reflengths       a dictionary holding the length of the references
    subs             a dictionary holding the corrected number of substition before and after scaling
    corr_prob dictionary from get_corr_prob
    returns a read with rescaled quality score

    Iterates through the read and reference, rescales the quality
    according to corr_prob
    iNr   r8   r(   r)   r*   r+   r,   )r4   -zCWarning: The aligment of the read is longer than the actual read %s    z%.5fZMRz,Read: %s already has a MR tag, can't rescalef)"logging	getLogger__name__query	mapdamageZalignZget_coordinatesZgetrnametidZfetchminmaxupperZalign_with_qualZcigarZqquallen
is_reverseseqZrevcomprH   rI   r6   r   r
   rM   warningqnamer   qualr   Zhas_tagr/   Zset_tag)bamreadrefr"   r5   debugr4   loggerZraw_seqZ
coordinateZchromZrefseqrg   rj   Zlength_readZlength_alignZnew_qualZpos_on_readZnumber_of_rescaled_basesrR   r0   r1   rL   ZpdamZpseqZnewpZnewqr   r   r	   rescale_qual_read   s`    




$



rp   c              
   C   s  |s$t t}|d|j|jf  t }t|j}|rBd}nd}tj|j||d}t	|j
|j|jd}t }	d}
d}|D ]}|jrn|js|s|d|j  n|jrV|
r|s|d	 d
}
|js|jr|j|jkr|j|jkrt||| ||	d|d}nL|jrL|jsL|j|jk rL|j|jkrLt||| ||	d|d}n|d7 }nt||| ||	|d}|| q~|dkr|s|d|  |	d |	d ks|	d |	d krtd t|	 |  |  |jst |	 |s|!dt | f  dS )a~  
    ref                a pysam fasta ref file
    bam_filename       name of a BAM/SAM file to read
    fi                 file containing the csv with correction probabilities
    reflengths         dictionary with the reference lengths
    options            options from the command line parsing

    Iterates through BAM file, makes a new BAM file with rescaled qualities.
    zRescaling BAM: '%s' -> '%s'Zwhwb)template)r   r   Tr   zCCannot rescale base PHRED scores for read '%s'; no scores assigned.zUWarning! Assuming the pairs are non-overlapping, facing inwards and correctly paired.Fr-   )r4   rn   r,   )rn   z9Number of non-rescaled reads due to improper pairing:  %dr:   r>   r<   r@   zeQualities for T.C and A.G transitions should not change in the rescaling, please contact the authors.z!Rescaling completed in %f secondsN)"r\   r]   r^   infofilenameZrescale_outtimepysamZSamfiler&   r   r   r   rK   Zis_unmappedrj   rh   ri   Z	is_pairedrf   Zmate_is_reverseZpnextr2   ra   Zmrnmrp   writer   r   rT   closequietrV   rn   )rm   optionsrn   ro   Z
start_timerk   Z
write_modeZbam_outr"   r5   Z
first_pairZnumber_of_non_proper_pairsZhitr   r   r	   rescale_qual  sV    



,,
$
r{   )r'   )Fr'   )F)r   r   r   r`   rv   	itertoolsr   r\   ru   r
   r   r&   r6   rK   rM   rT   rV   rp   r{   r   r   r   r	   <module>   s$   
6!
R